Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Demo: score an English query against an existing BM25 index.
# NOTE(review): `bm25` and `corpus` are bound before this excerpt begins;
# confirm they describe the same set of documents.
query = "windy London"
# Tokenize the query by splitting on single space characters.
tokenized_query = query.split(" ")
# Presumably yields one relevance score per indexed document - verify
# against the BM25 implementation in use.
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)
# Request the top 2 matches (n=2); presumably returned as entries of `corpus`.
a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)
# Print a 45-asterisk separator line after this demo's output.
print("*" * 45)
# Demo: build a BM25 index over a small corpus (two Chinese sentences and one
# English sentence) and query it with a Chinese question.
corpus = ['女网红能火的只是一小部分', '当下最火的男明星为鹿晗', "How is the weather today?"]
# `segment` is defined outside this excerpt; presumably a word segmenter that
# returns a list of tokens per document - TODO confirm.
tokenized_corpus = [segment(doc) for doc in corpus]
# Bind `bm25` to an index built from the tokenized documents above.
bm25 = BM25Okapi(tokenized_corpus)
query = '当下最火的女网红是谁?'
# Tokenize the query with the same `segment` helper used for the documents.
tokenized_query = segment(query)
# Presumably yields one relevance score per document in `corpus` - verify
# against the BM25 implementation in use.
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)
# Request the top 2 matches (n=2); presumably returned as entries of `corpus`.
a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)
def init(self):
    """Build the BM25 index from ``self.corpus`` if it does not exist yet.

    Does nothing when ``self.bm25_instance`` is already truthy. Otherwise:

    * a corpus given as a single string is wrapped into a one-element list
      (and written back to ``self.corpus``);
    * every document is tokenized with ``self.tokenizer.tokenize`` and the
      document -> tokens mapping is stored on ``self.corpus_seg``;
    * a ``BM25Okapi`` built from the token lists is stored on
      ``self.bm25_instance``.

    The mapping is keyed by the document itself, so repeated documents
    share a single entry.

    Raises:
        ValueError: if ``self.corpus`` is falsy (unset or empty).
    """
    if self.bm25_instance:
        # Index already built; nothing to do.
        return

    if not self.corpus:
        logger.error('corpus is none, set corpus with docs.')
        raise ValueError("must set corpus, which is documents, list of str")

    # Accept a bare string as a one-document corpus.
    if isinstance(self.corpus, str):
        self.corpus = [self.corpus]

    segmented = {}
    for doc in self.corpus:
        segmented[doc] = self.tokenizer.tokenize(doc)
    self.corpus_seg = segmented

    token_lists = list(segmented.values())
    self.bm25_instance = BM25Okapi(corpus=token_lists)