Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
with open(answer_file, mode='r', encoding='utf-8') as f:
gold_q_id = [line.split(' ', 1)[0] for line in f]
print_mscc_score(gold_q_id, q_id_and_sim)
if __name__ == '__main__':
    # Command-line entry point: argv[1] is the input directory, which is
    # passed to create_mscc_dataset together with a fixed output path.
    if len(sys.argv) < 2:
        print('Please specify your input directory that contains MSCC dataset.')
        print('(Most of the case the name of the directory might be `Holmes_Training_Data`.)')
        print('sample usage: python src/eval/mscc.py ~/dataset/Holmes_Training_Data/')
        # `quit()` is injected by the `site` module for interactive sessions;
        # it is missing under `python -S` / frozen builds and exits with
        # status 0. A usage error should end the process with a non-zero code.
        sys.exit(1)
    create_mscc_dataset(sys.argv[1], 'dataset/mscc_train.txt')
gpu_id = config.gpu_id
model_path = config.model_path
emb_path = config.emb_path
# device
use_cuda = torch.cuda.is_available() and gpu_id > -1
if use_cuda:
device = torch.device('cuda:{}'.format(gpu_id))
torch.cuda.set_device(gpu_id)
else:
device = torch.device('cpu')
# load model
model, config_dict = read_model(model_path, device)
unk_token = config_dict['unk_token']
bos_token = config_dict['bos_token']
eos_token = config_dict['eos_token']
tokens[target_pos] = unk_token
tokens = [bos_token] + tokens + [eos_token]
indexed_sentence = [stoi[token] if token in stoi else stoi[unk_token] for token in tokens]
input_tokens = torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
topv, topi = model.run_inference(input_tokens, target=None, target_pos=target_pos)
for value, key in zip(topv, topi):
print(value.item(), itos[key.item()])
if __name__ == "__main__":
    # Demo run: each sample sentence is space-separated and contains a '[]'
    # token (presumably the slot to be predicted -- confirm against
    # infer_one_sentence).
    sents = [
        "而 且 我 希 望 不 再 存 在 抽 [] 的 人 。",
        "男 女 分 班 的 问 题 有 什 [] 好 处 ?",
        "由 我 开 始 [] 起 。",
    ]
    # Everything needed for inference is loaded once, from the paths and
    # GPU id given in the config module.
    inference_state = get_infer_data(config.model_path, config.emb_path, config.gpu_id)
    model, unk_token, bos_token, eos_token, itos, stoi, device = inference_state
    for sentence in sents:
        infer_one_sentence(sentence, model, unk_token, bos_token, eos_token, itos, stoi, device)
        # Blank line after the output of each sentence.
        print()
except SyntaxError:
pass
tokens[target_pos] = unk_token
tokens = [bos_token] + tokens + [eos_token]
indexed_sentence = [stoi[token] if token in stoi else stoi[unk_token] for token in tokens]
input_tokens = torch.tensor(indexed_sentence, dtype=torch.long, device=device).unsqueeze(0)
topv, topi = model.run_inference(input_tokens, target=None, target_pos=target_pos)
for value, key in zip(topv, topi):
print(value.item(), itos[key.item()])
if __name__ == "__main__":
    # Demo run: each sample sentence is space-separated and contains a '[]'
    # token (presumably the slot to be predicted -- confirm against
    # infer_one_sentence).
    sents = [
        "而 且 我 希 望 不 再 存 在 抽 [] 的 人 。",
        "男 女 分 班 的 问 题 有 什 [] 好 处 ?",
        "由 我 开 始 [] 起 。",
    ]
    # Everything needed for inference is loaded once, from the paths and
    # GPU id given in the config module.
    inference_state = get_infer_data(config.model_path, config.emb_path, config.gpu_id)
    model, unk_token, bos_token, eos_token, itos, stoi, device = inference_state
    for sentence in sents:
        infer_one_sentence(sentence, model, unk_token, bos_token, eos_token, itos, stoi, device)
        # Blank line after the output of each sentence.
        print()
word_seq = segment(text, cut_type='char', pos=False)
word_arr.append(word_seq)
return word_arr
def save_data_list(data_list, data_path):
    """Write token sequences to a text file, one space-joined sequence per line.

    Args:
        data_list: Iterable of string sequences; each item becomes one line.
        data_path: Destination path, opened in write mode (truncating any
            existing file) with UTF-8 encoding.

    Prints the number of lines written together with the destination path.
    """
    # Stays 0 when data_list is empty, because the loop never rebinds it.
    count = 0
    with open(data_path, 'w', encoding='utf-8') as f:
        # enumerate(..., start=1) tracks the running line count without a
        # hand-maintained counter.
        for count, line in enumerate(data_list, start=1):
            f.write(' '.join(line) + '\n')
        print("save line size:%d to %s" % (count, data_path))
if __name__ == '__main__':
    # Script entry point: parse every file in config.raw_train_paths and
    # write the combined result to config.train_path.
    output_dir = os.path.dirname(config.train_path)
    # dirname is '' when train_path has no directory component, and
    # os.makedirs('') would raise, so only create a directory when one is named.
    if output_dir:
        # exist_ok=True replaces the exists()-then-create check, which could
        # fail if the directory appeared between the two calls.
        os.makedirs(output_dir, exist_ok=True)
    # train data
    train_words = []
    for path in config.raw_train_paths:
        train_words.extend(parse_xml_file(path))
    save_data_list(train_words, config.train_path)
def save_data_list(data_list, data_path):
    """Write token sequences to a text file, one space-joined sequence per line.

    Args:
        data_list: Iterable of string sequences; each item becomes one line.
        data_path: Destination path, opened in write mode (truncating any
            existing file) with UTF-8 encoding.

    Prints the number of lines written together with the destination path.
    """
    # Stays 0 when data_list is empty, because the loop never rebinds it.
    count = 0
    with open(data_path, 'w', encoding='utf-8') as f:
        # enumerate(..., start=1) tracks the running line count without a
        # hand-maintained counter.
        for count, line in enumerate(data_list, start=1):
            f.write(' '.join(line) + '\n')
        print("save line size:%d to %s" % (count, data_path))
if __name__ == '__main__':
    # Script entry point: parse every file in config.raw_train_paths and
    # write the combined result to config.train_path.
    output_dir = os.path.dirname(config.train_path)
    # dirname is '' when train_path has no directory component, and
    # os.makedirs('') would raise, so only create a directory when one is named.
    if output_dir:
        # exist_ok=True replaces the exists()-then-create check, which could
        # fail if the directory appeared between the two calls.
        os.makedirs(output_dir, exist_ok=True)
    # train data
    train_words = []
    for path in config.raw_train_paths:
        train_words.extend(parse_xml_file(path))
    save_data_list(train_words, config.train_path)
if use_cuda:
device = torch.device('cuda:{}'.format(gpu_id))
torch.cuda.set_device(gpu_id)
else:
device = torch.device('cpu')
# load model
model, config_dict = read_model(model_path, device)
unk_token = config_dict['unk_token']
bos_token = config_dict['bos_token']
eos_token = config_dict['eos_token']
# read vocab from word_emb path
itos, stoi = load_vocab(emb_path)
mscc_evaluation(config.question_file,
config.answer_file,
'mscc.result',
model,
stoi,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
device=device)
write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path)
torch.save(model.state_dict(), model_path)
torch.save(optimizer.state_dict(), model_path + '_optim')
if __name__ == "__main__":
    # Training entry point: every path and hyper-parameter is read from the
    # config module and handed to train() positionally, in the order below.
    train_args = (
        config.train_path,
        config.emb_path,
        config.model_path,
        config.use_mlp,
        config.batch_size,
        config.epochs,
        config.maxlen,
        config.word_embed_size,
        config.hidden_size,
        config.learning_rate,
        config.n_layers,
        config.min_freq,
        config.dropout,
        config.gpu_id,
    )
    train(*train_args)
print('epoch:[{}/{}], total_loss:[{}], best_cur_loss:[{}]'
.format(epoch + 1, epochs, total_loss.item(), best_loss))
def save_checkpoint(epoch, model, optimizer, model_path, dataset, use_cuda, emb_path, is_best):
    """Persist embeddings, model weights and optimizer state for one epoch.

    An epoch-tagged snapshot (suffix ``.epoch_<epoch + 1>``) is always
    written. When ``is_best`` is truthy the same three artifacts are also
    written to the untagged ``emb_path``, ``model_path`` and
    ``model_path + '_optim'`` locations.
    """
    # The tagged snapshot comes first; the untagged "best" copy follows only
    # when requested. An empty suffix leaves the base paths unchanged.
    suffixes = ['.epoch_' + str(epoch + 1)]
    if is_best:
        suffixes.append('')
    for suffix in suffixes:
        write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path + suffix)
        torch.save(model.state_dict(), model_path + suffix)
        torch.save(optimizer.state_dict(), model_path + '_optim' + suffix)
if __name__ == "__main__":
    # Training entry point: every path and hyper-parameter is read from the
    # config module and handed to train() positionally, in the order below.
    train_args = (
        config.train_path,
        config.emb_path,
        config.model_path,
        config.use_mlp,
        config.batch_size,
        config.epochs,
        config.maxlen,
        config.word_embed_size,
        config.hidden_size,
        config.learning_rate,
        config.n_layers,
        config.min_freq,
        config.dropout,
        config.gpu_id,
    )
    train(*train_args)