import sys
import os
import blingfire

# Load the BERT base tokenizer model; one model handle can be shared by
# multiple threads within the same process.
h = blingfire.load_model(os.path.join(os.path.dirname(blingfire.__file__), "bert_base_tok.bin"))

for line in sys.stdin:
    line = line.strip()
    print(line)
    # Convert the text into token ids: at most 128 ids, with id 100 ([UNK]) for unknown tokens.
    ids = blingfire.text_to_ids(h, line, 128, 100)
    print(ids)

blingfire.free_model(h)
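The same calls also work without piping text through stdin; a minimal sketch, assuming text_to_ids returns a fixed-length, zero-padded id array (the sample string is illustrative):

import os
import blingfire

h = blingfire.load_model(os.path.join(os.path.dirname(blingfire.__file__), "bert_base_tok.bin"))
# 128 is the maximum number of ids; 100 is the id substituted for unknown tokens.
ids = blingfire.text_to_ids(h, "Bling Fire is fast.", 128, 100)
print(ids)  # token ids, zero-padded to length 128
blingfire.free_model(h)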
from blingfire import text_to_sentences

def convert_into_sentences(lines):
    # Accumulate lines into paragraphs (separated by blank lines), split each
    # paragraph into sentences, and return the sentence list and total count.
    stack = []
    sent_L = []
    n_sent = 0
    for chunk in lines:
        if not chunk.strip():
            # Blank line: flush the accumulated paragraph.
            if stack:
                sents = text_to_sentences(
                    " ".join(stack).strip().replace('\n', ' ')).split('\n')
                sent_L.extend(sents)
                n_sent += len(sents)
                sent_L.append('\n')
                stack = []
            continue
        stack.append(chunk.strip())
    if stack:
        # Flush the last paragraph if the input did not end with a blank line.
        sents = text_to_sentences(
            " ".join(stack).strip().replace('\n', ' ')).split('\n')
        sent_L.extend(sents)
        n_sent += len(sents)
    return sent_L, n_sent
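A small usage sketch (the sample lines are illustrative; exact sentence boundaries depend on the Bling Fire sentence model):

lines = [
    "Bling Fire is a tokenization library. It is fast.",
    "",
    "It ships several prebuilt models.",
]
sentences, n = convert_into_sentences(lines)
print(n)          # expected: 3
print(sentences)  # two sentences, a '\n' paragraph marker, then the last sentence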
import blingfire

def blingfire_tokenize(text):
    # Split a text into sentences, one per list element.
    return blingfire.text_to_sentences(text).split('\n')
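A quick check of the helper (the sample text is illustrative):

print(blingfire_tokenize("This is one sentence. This is another."))
# expected: ['This is one sentence.', 'This is another.']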
from blingfire import text_to_sentences

def text2sentences(text: str) -> str:
    lines = [line.strip() for line in text.splitlines()]
    stack = []
    sentences = []
    for line in lines:
        if line:
            stack.append(line)
        elif stack:  # blank line and non-empty stack: flush the paragraph
            sentences += text_to_sentences(' '.join(stack).strip()).splitlines()
            stack = []
    if stack:  # flush the final paragraph if the text does not end with a blank line
        sentences += text_to_sentences(' '.join(stack).strip()).splitlines()
    return '\n'.join(sentences)
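A minimal sketch (the sample text is illustrative); note that the final paragraph is only captured because of the trailing flush above:

doc = "First sentence. Second one.\n\nA new paragraph."
print(text2sentences(doc))
# expected, one sentence per line:
# First sentence.
# Second one.
# A new paragraph.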
import sys
from blingfire import text_to_words

# Word-tokenize each non-empty stdin line; keep blank lines as-is.
for l in sys.stdin:
    if l.strip():
        print(text_to_words(l.strip()))
    else:
        print('')
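text_to_words returns the input with tokens separated by single spaces, splitting punctuation off words; a quick check (the expected output is approximate):

from blingfire import text_to_words
print(text_to_words("Hello, world!"))  # expected: "Hello , world !"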