Character-based chunking
- Note: this means words can be split in the middle! Don't use this.
Python
def chunk_string(s, chunk_length, overlap):
    chunks = []
    start = 0
    # Slide a fixed-size window over the string, stepping by (chunk_length - overlap)
    while start + chunk_length <= len(s):
        end = start + chunk_length
        chunks.append(s[start:end])
        start += (chunk_length - overlap)
    # Handle the last chunk if the string is not perfectly divisible
    if start < len(s):
        chunks.append(s[start:])
    return chunks
# Example usage
s = "Hello, this is an example string."
chunks = chunk_string(s, 10, 3)
print(len(chunks))
>> 5
print(chunks)
>> ['Hello, thi', 'this is an', ' an exampl', 'mple strin', 'ring.']
Word-based chunking
- Note: the split is simply on whitespace
- This means sentences can be split across chunks! Probably don't use this either.
Python
def chunk_string_by_words(s, words_per_chunk, word_overlap):
    words = s.split()
    chunks = []
    start = 0
    # Slide a window of words, stepping by (words_per_chunk - word_overlap)
    while start + words_per_chunk <= len(words):
        end = start + words_per_chunk
        chunks.append(' '.join(words[start:end]))
        start += (words_per_chunk - word_overlap)
    # Handle the last chunk
    if start < len(words):
        chunks.append(' '.join(words[start:]))
    return chunks
# Example usage
s = "Hello, this is an example string for word based chunking."
chunks = chunk_string_by_words(s, 5, 2)
print(len(chunks))
>> 3
print(chunks)
>> ['Hello, this is an example', 'an example string for word', 'for word based chunking.']
Sentence-based chunking
- Note: splitting with re.split(r'(?<=[.!?]) +', s) does not cover every way a sentence can end.
- Note: this method assumes that sentences are always followed by at least one space after the punctuation, which is not always true, especially with different writing styles or formats.
- For more advanced sentence splitting, see NLTK's sent_tokenize function (nltk.org); a sketch using it follows the example below.
Python
import re

def chunk_string_by_sentences(s, sentences_per_chunk, sentence_overlap):
    # Split the string into sentences on ., !, or ? followed by whitespace
    sentences = re.split(r'(?<=[.!?]) +', s)
    chunks = []
    start = 0
    # Slide a window of sentences, stepping by (sentences_per_chunk - sentence_overlap)
    while start + sentences_per_chunk <= len(sentences):
        end = start + sentences_per_chunk
        chunks.append(' '.join(sentences[start:end]))
        start += (sentences_per_chunk - sentence_overlap)
    # Handle the last chunk
    if start < len(sentences):
        chunks.append(' '.join(sentences[start:]))
    return chunks
# Example usage
s = "Hello. This is an example string. It is used for sentence based chunking. Have a great day!"
chunks = chunk_string_by_sentences(s, 2, 1)
print(chunks)
>> ['Hello. This is an example string.',
    'This is an example string. It is used for sentence based chunking.',
    'It is used for sentence based chunking. Have a great day!',
    'Have a great day!']
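If you want more robust sentence boundaries than the regex above, the same chunking loop can be driven by NLTK's sent_tokenize. Below is a minimal sketch (the function name is just illustrative), assuming nltk is installed and the Punkt sentence model has been downloaded.
Python
import nltk
from nltk.tokenize import sent_tokenize

# Assumes: pip install nltk, plus a one-time download of the Punkt sentence model
nltk.download('punkt')

def chunk_string_by_sentences_nltk(s, sentences_per_chunk, sentence_overlap):
    # sent_tokenize handles abbreviations, quotes, etc. better than a simple regex
    sentences = sent_tokenize(s)
    chunks = []
    start = 0
    # Same sliding-window loop as before; only the sentence splitter changes
    while start + sentences_per_chunk <= len(sentences):
        end = start + sentences_per_chunk
        chunks.append(' '.join(sentences[start:end]))
        start += (sentences_per_chunk - sentence_overlap)
    # Handle the last chunk
    if start < len(sentences):
        chunks.append(' '.join(sentences[start:]))
    return chunks
Everything else about the chunking logic stays the same; only the call that produces the list of sentences is swapped out.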