Note: this library is published on PyPI as rank-bm25,
NOT as bm25.
Python
pip install rank-bm25
Official docs (github) say:
Python
from rank_bm25 import BM25Okapi
# Three-document toy corpus; each entry is one raw document string.
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]
# Naive tokenization on single spaces: case and punctuation are kept as-is.
tokenized_corpus = [doc.split(" ") for doc in corpus]
# Build the BM25 index from the tokenized documents.
bm25 = BM25Okapi(tokenized_corpus)
# <rank_bm25.BM25Okapi at 0x1047881d0>
# Ranking Documents
# The query is tokenized exactly like the corpus so its tokens can match.
query = "windy London"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
# One score per document, in corpus order:
# array([0. , 0.93729472, 0. ])
# Get top N
# The original (untokenized) corpus is passed in; results are taken from it.
bm25.get_top_n(tokenized_query, corpus, n=1)
# ['It is quite windy in London']
However, you often want to clean the corpus (lowercase, remove punctuation) before indexing it. Once you do that, you must keep the original corpus around, because the search results are looked up in the original, unedited strings.
Python
import string
from rank_bm25 import BM25Okapi
def create_clean_bm25(corpus):
    """Build a BM25Okapi index over a normalized copy of ``corpus``.

    Each document has ASCII punctuation removed, is lowercased and stripped,
    and is then split on runs of whitespace. ``corpus`` itself is not
    modified, so the caller can keep using the original strings.

    Args:
        corpus: Iterable of raw document strings.

    Returns:
        A ``BM25Okapi`` index whose documents are in the same order as
        ``corpus``.

    Raises:
        ValueError: If no document yields any token after cleaning (empty
            corpus, or only blank / punctuation-only strings).
    """
    # Build the translation table once instead of once per document.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def clean_text(text):
        """Return ``text`` without punctuation, lowercased and stripped."""
        return text.translate(strip_punctuation).lower().strip()

    # split() without an argument collapses runs of whitespace. Removing
    # punctuation can leave double spaces ("rock - paper" -> "rock  paper"),
    # and split(" ") would then put empty-string tokens into the index.
    tokenized_corpus = [clean_text(text).split() for text in corpus]

    # Fail early with a clear message rather than indexing zero terms.
    if not any(tokenized_corpus):
        raise ValueError("corpus must contain at least one non-empty document")

    return BM25Okapi(tokenized_corpus)
def search_with_bm25(corpus, query, n=2):
    """Return the ``n`` entries of ``corpus`` that best match ``query``.

    Args:
        corpus: List of raw document strings. It is cleaned and indexed via
            ``create_clean_bm25`` and also used, unmodified, as the source
            of the returned results.
        query: Raw query string.
        n: Number of top-ranked documents to return.

    Returns:
        The result of ``get_top_n``: the best-matching original documents.
    """
    # NOTE: the index is rebuilt on every call. For many queries against the
    # same corpus, build it once with create_clean_bm25 and reuse it.
    bm25 = create_clean_bm25(corpus)

    # Normalize the query the same way the corpus was normalized. The index
    # only contains lowercase, punctuation-free tokens, so a raw token such
    # as "London" or "today?" could never match.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    tokenized_query = query.translate(strip_punctuation).lower().split()

    # Rank against the cleaned index, but return the original strings.
    return bm25.get_top_n(tokenized_query, corpus, n=n)
def your_code():
    """Example usage: run one cleaned BM25 search over a small corpus."""
    documents = [
        "Hello there good man!",
        "It is quite windy in London",
        "How is the weather today?",
    ]
    search_terms = "windy London"
    # Ask for the two best matches; this example does not use them further.
    top_matches = search_with_bm25(documents, search_terms, n=2)
This works for most cases!
Note: if you run multiple searches, calling `search_with_bm25` each time re-cleans, re-tokenizes and re-indexes the whole corpus into a new BM25 index on every search.
Python
def your_code():
    """Example usage: run several queries against one prebuilt BM25 index.

    The corpus is cleaned, tokenized and indexed once; each query is then
    normalized and ranked against that same index, and its top two matches
    are printed.
    """
    corpus = [
        "Hello there good man!",
        "It is quite windy in London",
        "How is the weather today?"
    ]
    queries = [
        "windy London",
        "weather TODAY"
    ]
    # Build the index once, outside the loop, and reuse it for every query.
    bm25 = create_clean_bm25(corpus)
    strip_punctuation = str.maketrans("", "", string.punctuation)
    for query in queries:
        # Normalize the query like the indexed corpus (no punctuation,
        # lowercase) so that e.g. "TODAY" can match "today".
        tokenized_query = query.translate(strip_punctuation).lower().split()
        # Rank with the shared index; results come from the original corpus.
        bm25_results = bm25.get_top_n(tokenized_query, corpus, n=2)
        print(bm25_results)