How to use the sentencepiece.SentencePieceTrainer.train function in sentencepiece

To help you get started, we’ve selected a few sentencepiece examples, based on popular ways it is used in public projects.
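All of the project snippets below funnel a set of training options into spm.SentencePieceTrainer.train(). The simplest form, shown here as a minimal sketch with hypothetical file names, passes the trainer's command-line flags as a single string:

import sentencepiece as spm

# Minimal sketch (hypothetical file names): train a tokenizer from a plain-text
# corpus with one sentence per line; writes my_model.model and my_model.vocab.
spm.SentencePieceTrainer.train(
    '--input=corpus.txt --model_prefix=my_model --vocab_size=8000'
)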


github akanyaani / gpt-2-tensorflow2.0 / pre_process.py
import csv
from collections import Counter

import tqdm
import sentencepiece as spm

# PROCESS_DATA_PATH, BPE_TSV_PATH and BPE_MODEL_PATH are module-level constants
# in the original file.
def train_byte_pair_encoding(vocab_size):
	print("Training BytePair encoding......")
	token_dict = Counter()
	with open(PROCESS_DATA_PATH, 'r') as fr:
		for line in tqdm.tqdm(fr):
			token_dict.update(line.lower().split())

	with open(BPE_TSV_PATH, 'w', newline='') as f_output:
		tsv_output = csv.writer(f_output, delimiter='\t')
		for word in token_dict:
			tsv_output.writerow([word, token_dict[word]])

	spmcmd = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'.format(
		spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=vocab_size)
	spm.SentencePieceTrainer.train(spmcmd)
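Once training finishes, the model written under BPE_MODEL_PATH can be loaded for encoding. A hedged follow-up sketch (method names use the snake_case aliases of recent wrapper versions):

sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH + '.model')  # train() writes <model_prefix>.model and <model_prefix>.vocab
ids = sp.encode_as_ids('hello world [SEP] next sentence')
pieces = sp.encode_as_pieces('hello world')

Because [SEP], [BOS] and [EOS] were registered as user_defined_symbols, they are always kept as single pieces when they appear in the input text.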
github lopuhin / transformer-lm / lm / data.py
f'no text files found')
            paths.extend(corpus_paths)
        try:
            with sp_text.open('wt', encoding='utf8') as sp_text_file:
                for path in tqdm.tqdm(
                        paths, desc='building sentencepiece input'):
                    with path.open('rt', encoding='utf8') as f:
                        for line in f:
                            if line.strip():
                                sp_text_file.write(line)
        except Exception:
            if sp_text.exists():
                sp_text.unlink()
            raise

    spm.SentencePieceTrainer.train(' '.join([
        f'--input={sp_text}',
        f'--model_prefix={args.sp_model_prefix}',
        f'--vocab_size={args.vocab_size}',
        f'--model_type=bpe',
        f'--max_sentence_length=16384',
        f'--bos_id=-1',
        f'--eos_id=-1',
        f'--unk_piece={UNK}',
        f'--control_symbols={END_OF_LINE},{END_OF_TEXT}',
        f'--character_coverage={args.character_coverage}',
    ]))
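Joining flags into one string, as above, works across wrapper versions; recent releases of the Python wrapper also accept the same parameters as keyword arguments. A rough, hedged equivalent of the call above (not the repository's code):

spm.SentencePieceTrainer.train(
    input=str(sp_text),
    model_prefix=args.sp_model_prefix,
    vocab_size=args.vocab_size,
    model_type='bpe',
    max_sentence_length=16384,
    bos_id=-1,
    eos_id=-1,
    unk_piece=UNK,
    control_symbols=f'{END_OF_LINE},{END_OF_TEXT}',
    character_coverage=args.character_coverage,
)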
github IndicoDataSolutions / finetune / finetune / base_models / oscar / encoder.py
def train_tokenizer(filename, vocab_size=128000):
    spm.SentencePieceTrainer.train(
        (
            "--input={} --model_prefix={} --user_defined_symbols=<_start_>,<_delimiter_>,<_classify_> --unk_id=0 "
            "--vocab_size={} --input_sentence_size=10000000 --shuffle_input_sentence=true"
            " --max_sentence_length=10000000 --character_coverage=0.9999"
        ).format(filename, ENCODER_PATH, vocab_size))
github pytorch / text / torchtext / data / functional.py
The model and vocab are saved in two separate files with
            model_prefix.

    Examples:
        >>> from torchtext.data.functional import generate_sp_model
        >>> generate_sp_model('test.csv', vocab_size=23456, model_prefix='spm_user')
    """

    spm_training_string = "--input={} \
                           --vocab_size={} \
                           --model_prefix={} \
                           --model_type={}".format(filename,
                                                   vocab_size,
                                                   model_prefix,
                                                   model_type)
    spm.SentencePieceTrainer.train(spm_training_string)
    return None
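torchtext also provides companion helpers for consuming the model that generate_sp_model writes. A hedged usage sketch, reusing the spm_user prefix from the docstring example above:

from torchtext.data.functional import load_sp_model, sentencepiece_numericalizer

sp_model = load_sp_model('spm_user.model')
ids_transform = sentencepiece_numericalizer(sp_model)
ids = list(ids_transform(['sentencepiece is simple']))  # one list of ids per input sentence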
github microsoft / dpu-utils / python / dpu_utils / mlutils / bpevocabulary.py
f"--vocab_size={self.__max_size}",
                        f"--model_type=bpe",
                        f"--max_sentence_length={max_sentence_length}",
                        f"--bos_piece={self.__bos_token}",
                        f"--eos_piece={self.__eos_token}",
                        f"--pad_piece={self.__pad_token}",
                        f"--pad_id=3",
                        f"--unk_piece={self.__unk_token}",
                        f"--user_defined_symbols={self.user_defined_symbols}",
                        f"--control_symbols={self.control_symbols}",
                        f"--character_coverage={character_coverage}",
                        "--minloglevel=1",
                        "--hard_vocab_limit=false",
                    ]

            spm.SentencePieceTrainer.train(
                " ".join(command)
            )

            loaded = self.__load_model_from_filepath(model_filename+'.model')
            assert loaded, 'Sentencepiece failed to load model.'
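Because this snippet pins the pad piece to id 3 and registers extra control symbols, one way to sanity-check the reserved ids after training is to look them up on the loaded processor. A hedged sketch with a hypothetical pad piece name:

sp = spm.SentencePieceProcessor()
sp.load(model_filename + '.model')
assert sp.piece_to_id('<pad>') == 3  # hypothetical pad piece; should match --pad_id=3 above
unk_id = sp.unk_id()                 # id assigned to the unk piece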
github IndicoDataSolutions / finetune / finetune / base_models / gpc / encoder2.py
def train_tokenizer(filename, vocab_size=128000):
    spm.SentencePieceTrainer.train('--input={} --model_prefix={} --user_defined_symbols=<_start_>,<_delimiter_>,<_classify_> --unk_id=0 --vocab_size={} --input_sentence_size=10000000 --shuffle_input_sentence=true --max_sentence_length=10000000 --character_coverage=0.9999'.format(filename, ENCODER_PATH, vocab_size))
github sagorbrur / bnlp / bnlp / sentencepiece_tokenizer.py
def train_bsp(self, data, model_prefix, vocab_size):
        """
        :data: (str) data path with extension
        :model_prefix: (str) model name prefix
        :vocab_size: (int) size of train vocabulary

        """
        train_args = "--model_prefix="+model_prefix+" --input="+data+" --vocab_size="+str(vocab_size)
        bsp.SentencePieceTrainer.train(train_args)
        print("%s.model and %s.vocab is saved on your current directory"%(model_prefix, model_prefix))

sentencepiece

SentencePiece python wrapper. License: Apache-2.0.