import csv
from collections import Counter

import sentencepiece as spm
import tqdm

def train_byte_pair_encoding(vocab_size):
    print("Training byte-pair encoding...")
    token_dict = Counter()
    with open(PROCESS_DATA_PATH, 'r') as fr:
        for line in tqdm.tqdm(fr):
            token_dict.update(line.lower().split())
    # Write (word, frequency) pairs as TSV, the input format declared below.
    with open(BPE_TSV_PATH, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output, delimiter='\t')
        for word in token_dict:
            tsv_output.writerow([word, token_dict[word]])
    spmcmd = ('--input={spm_input} --model_prefix={spm_model} --input_format=tsv '
              '--vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] '
              '--hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 '
              '--bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]').format(
        spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=vocab_size)
    spm.SentencePieceTrainer.train(spmcmd)
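
Once training finishes, SentencePiece writes <prefix>.model and <prefix>.vocab. A minimal loading sketch, assuming the BPE_MODEL_PATH prefix used above (the SentencePieceProcessor API shown is standard sentencepiece):

import sentencepiece as spm

# Load the trained model; [PAD]=0 and [UNK]=1 as configured in the training flags.
sp = spm.SentencePieceProcessor(model_file=BPE_MODEL_PATH + '.model')
print(sp.encode('hello world', out_type=str))  # subword pieces
print(sp.encode('hello world'))                # corresponding ids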
if not corpus_paths:
    raise ValueError('no text files found')
paths.extend(corpus_paths)
try:
    with sp_text.open('wt', encoding='utf8') as sp_text_file:
        for path in tqdm.tqdm(paths, desc='building sentencepiece input'):
            with path.open('rt', encoding='utf8') as f:
                for line in f:
                    if line.strip():
                        sp_text_file.write(line)
except Exception:
    # Don't leave a partially written concatenation behind.
    if sp_text.exists():
        sp_text.unlink()
    raise
spm.SentencePieceTrainer.train(' '.join([
    f'--input={sp_text}',
    f'--model_prefix={args.sp_model_prefix}',
    f'--vocab_size={args.vocab_size}',
    '--model_type=bpe',
    '--max_sentence_length=16384',
    '--bos_id=-1',
    '--eos_id=-1',
    f'--unk_piece={UNK}',
    f'--control_symbols={END_OF_LINE},{END_OF_TEXT}',
    f'--character_coverage={args.character_coverage}',
]))
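
Because END_OF_LINE and END_OF_TEXT are declared as control symbols, encoding raw text never produces them; their ids must be looked up explicitly when batches are assembled. A short sketch, assuming the args.sp_model_prefix prefix from above and placeholder values for the two constants (the real values live elsewhere in the repo):

import sentencepiece as spm

END_OF_LINE = '<endofline>'  # placeholder value
END_OF_TEXT = '<endoftext>'  # placeholder value

sp = spm.SentencePieceProcessor(model_file=f'{args.sp_model_prefix}.model')
eol_id = sp.piece_to_id(END_OF_LINE)
eot_id = sp.piece_to_id(END_OF_TEXT)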
import sentencepiece as spm

def train_tokenizer(filename, vocab_size=128000):
    spm.SentencePieceTrainer.train(
        ('--input={} --model_prefix={} '
         '--user_defined_symbols=<_start_>,<_delimiter_>,<_classify_> --unk_id=0 '
         '--vocab_size={} --input_sentence_size=10000000 --shuffle_input_sentence=true '
         '--max_sentence_length=10000000 --character_coverage=0.9999'
         ).format(filename, ENCODER_PATH, vocab_size))
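
In contrast to control symbols, user-defined symbols are matched directly in raw text, so markers like <_start_> survive tokenization as single pieces. A sketch, assuming ENCODER_PATH is the model prefix used above:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file=ENCODER_PATH + '.model')
# '<_start_>' and '<_delimiter_>' come through as single pieces,
# not as runs of characters.
print(sp.encode('<_start_>some text<_delimiter_>label', out_type=str))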
import sentencepiece as spm

def generate_sp_model(filename, vocab_size=20000, model_type='unigram', model_prefix='m_user'):
    r"""Train a SentencePiece tokenizer.

    The model and vocab are saved in two separate files with
    model_prefix.

    Examples:
        >>> from torchtext.data.functional import generate_sp_model
        >>> generate_sp_model('test.csv', vocab_size=23456, model_prefix='spm_user')
    """
    spm_training_string = '--input={} --vocab_size={} --model_prefix={} --model_type={}'.format(
        filename, vocab_size, model_prefix, model_type)
    spm.SentencePieceTrainer.train(spm_training_string)
    return None
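
torchtext.data.functional also ships companion helpers for consuming the files this produces; a usage sketch with the prefix from the docstring example:

from torchtext.data.functional import load_sp_model, sentencepiece_numericalizer

sp_model = load_sp_model('spm_user.model')
ids = sentencepiece_numericalizer(sp_model)
print(list(ids(['sentencepiece encode as ids'])))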
f"--vocab_size={self.__max_size}",
f"--model_type=bpe",
f"--max_sentence_length={max_sentence_length}",
f"--bos_piece={self.__bos_token}",
f"--eos_piece={self.__eos_token}",
f"--pad_piece={self.__pad_token}",
f"--pad_id=3",
f"--unk_piece={self.__unk_token}",
f"--user_defined_symbols={self.user_defined_symbols}",
f"--control_symbols={self.control_symbols}",
f"--character_coverage={character_coverage}",
"--minloglevel=1",
"--hard_vocab_limit=false",
]
spm.SentencePieceTrainer.train(
" ".join(command)
)
loaded = self.__load_model_from_filepath(model_filename+'.model')
assert loaded, 'Sentencepiece failed to load model.'
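
Because --pad_id=3 departs from SentencePiece's default id layout (pad is normally disabled), it is safer to read the special ids back from the trained model than to hard-code them. A sketch, assuming the model_filename prefix used above:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file=model_filename + '.model')
print(sp.unk_id(), sp.bos_id(), sp.eos_id(), sp.pad_id())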
def train_bsp(self, data, model_prefix, vocab_size):
    """
    :data: (str) path to the training data file, with extension
    :model_prefix: (str) model name prefix
    :vocab_size: (int) size of the training vocabulary
    """
    train_args = "--model_prefix=" + model_prefix + " --input=" + data + " --vocab_size=" + str(vocab_size)
    spm.SentencePieceTrainer.train(train_args)
    print("%s.model and %s.vocab are saved in your current directory" % (model_prefix, model_prefix))