# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _build_token2idx_from_w2v(self):
    """Resolve the word2vec model path, downloading it if needed, and load the vectors.

    When ``self.w2v_path`` is missing or does not exist on disk, the path is
    treated as a model key, mapped through ``self.model_key_map`` (falling back
    to ``'w2v-light-tencent-chinese'``), and the corresponding archive is
    fetched into the user data directory. The vectors are then loaded with
    gensim's ``KeyedVectors`` and a token-to-index table is seeded with the
    processor's special PAD/UNK/BOS/EOS tokens.
    """
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        # The caller may have passed a model key rather than a filesystem path.
        if self.w2v_path in self.model_key_map:
            self.w2v_path = self.model_key_map[self.w2v_path]
        # Unknown keys fall back to the light Tencent Chinese model.
        model_info = self.model_key_map.get(
            self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
        self.w2v_kwargs = {'binary': model_info.get('binary')}
        archive_name = model_info.get('tar_filename')
        download_url = model_info.get('url')
        extracted_name = model_info.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, extracted_name)
        if not os.path.exists(self.w2v_path):
            # Download and extract the archive into the shared user data dir.
            get_file(
                archive_name, download_url, extract=True,
                cache_dir=text2vec.USER_DIR,
                cache_subdir=text2vec.USER_DATA_DIR,
                verbose=1
            )
    start = time.time()
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    # NOTE(review): init_sims() is deprecated/removed in gensim >= 4.0 —
    # confirm the pinned gensim version before upgrading.
    w2v.init_sims(replace=True)
    logger.debug('load w2v from %s, spend %s s' % (self.w2v_path, time.time() - start))
    # Reserve the first four ids for the processor's special tokens.
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3,
    }
    # NOTE(review): the visible chunk ends here — token2idx is presumably
    # extended with the w2v vocabulary and stored on self downstream; verify
    # against the full file.
def _build_token2idx_from_bert(self):
    """Load the BERT vocab file, downloading the pretrained model if absent.

    Builds ``self.bert_token2idx`` mapping each vocab line to its position and
    initializes ``self.tokenizer`` as a ``keras_bert.Tokenizer`` over it. When
    ``vocab.txt`` is not found under ``self.model_folder``, the folder name is
    mapped to a known pretrained model (default ``chinese_L-12_H-768_A-12``),
    which is downloaded and extracted into the user data directory.
    """
    vocab_file = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(vocab_file):
        # Treat the folder as a model key; unknown keys default to Chinese BERT base.
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        get_file(
            model_name + ".zip", self.pre_trained_models.get(model_name), extract=True,
            cache_dir=text2vec.USER_DIR,
            cache_subdir=text2vec.USER_DATA_DIR,
            verbose=1
        )
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        vocab_file = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {vocab_file}')
    vocab = {}
    with codecs.open(vocab_file, 'r', encoding='utf-8') as reader:
        for raw_line in reader:
            # len(vocab) (not enumerate) keeps ids dense even if a token repeats.
            vocab[raw_line.strip()] = len(vocab)
    self.bert_token2idx = vocab
    self.tokenizer = keras_bert.Tokenizer(vocab)