Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _preprocess_episodes(self, episodes, dictionary, mode):
"""
Tokenize all the fields in Holl-E
"""
# `episodes` is iterated as a sequence of episodes, each of which is iterated
# as a sequence of example dicts.
# `dictionary` must provide tokenize() and convert_tokens_to_ids().
# `mode` selects the label field: "train" reads 'labels', any other value
# reads 'eval_labels'.
colorlog.info("Preprocess holle dataset")
# Encode a string as space-separated token ids, with
# data_vocab.BERT_CLS_ID prepended and data_vocab.BERT_SEP_ID appended.
tokenize = lambda x: ' '.join([str(data_vocab.BERT_CLS_ID)] +
[str(y) for y in dictionary.convert_tokens_to_ids(dictionary.tokenize(x))] + [str(data_vocab.BERT_SEP_ID)])
new_episodes = []
# tqdm wraps the outer loop only, so progress is reported per episode.
for episode_num, episode in enumerate(tqdm(episodes, ncols=70)):
new_examples = []
for example_num, example in enumerate(episode):
# Tokenize inputs and convert the tokens to ids
context = tokenize(example['text'])
# Only the first entry of the label list is used as the response.
if mode == "train":
response = tokenize(example['labels'][0])
else:
response = tokenize(example['eval_labels'][0])
chosen_topic = tokenize(example['chosen_topic'])
# Set up knowledge
def _to_wow_format(self, raw_episodes, mode):
# Converts each raw episode into a list of examples; `mode` is only
# forwarded to self._get_knowledge_sentences in the visible code.
colorlog.info("Convert holle dataset to wow format")
episodes = []
for episode_idx, raw_episode in enumerate(tqdm(raw_episodes, ncols=70)):
episode = []
# Step through raw_episode['chat'] two entries at a time: example_idx is the
# first entry of a pair and example_idx + 1 is read as the response.
for example_idx in range(0, len(raw_episode['chat']), 2):
# Guard against a trailing entry that has no following response.
if example_idx + 1 < len(raw_episode['chat']):
# The movie name is used as the chosen topic.
chosen_topic = raw_episode['movie_name']
response = raw_episode['chat'][example_idx + 1]
knowledge_sentences = self._get_knowledge_sentences(
raw_episode,
episode_idx,
example_idx,
mode
)
# The first returned sentence is taken as the checked sentence.
# NOTE(review): assumes _get_knowledge_sentences never returns an empty
# list -- confirm against its implementation.
checked_sentence = knowledge_sentences[0]
# The title falls back to the 'no_passages_used' sentinel when the checked
# sentence is that sentinel; otherwise it is the chosen topic.
title = 'no_passages_used' if checked_sentence == 'no_passages_used' else chosen_topic
formatted_knowledge = '\n'.join([
def _epoch_started(self, engine):
    """Log a separator banner followed by the current epoch number.

    The epoch is read from ``self.engine.state.epoch``; the ``engine``
    argument supplied to the handler is not used.
    """
    banner = '>' * 50
    colorlog.info(banner)
    current_epoch = self.engine.state.epoch
    colorlog.info(f'EPOCH: {current_epoch}')
def _epoch_completed(self, engine):
ckpt_fname = os.path.join('tmp/wow_pretrained', 'ckpt-46070')
elif hparams.test_mode == "holle_1":
os.makedirs('./tmp', exist_ok=True)
if not os.path.exists('tmp/holle_pretrained_1'):
fname = 'holle_pretrained_1.zip'
gd_id = '1o1-Gv5PScxlSzxW6DyZnSp3gDI5zXOhh'
colorlog.info(f"Download pretrained checkpoint {fname}")
download_from_google_drive(gd_id, os.path.join('tmp', fname))
unzip('tmp', fname)
ckpt_fname = os.path.join('tmp/holle_pretrained_1', 'ckpt-1th-best')
elif hparams.test_mode == "holle_2":
os.makedirs('./tmp', exist_ok=True)
if not os.path.exists('tmp/holle_pretrained_2'):
fname = 'holle_pretrained_2.zip'
gd_id = '13FkCjuC0aBEenlSf-NAAgOfoWVPhqFSc'
colorlog.info(f"Download pretrained checkpoint {fname}")
download_from_google_drive(gd_id, os.path.join('tmp', fname))
unzip('tmp', fname)
ckpt_fname = os.path.join('tmp/holle_pretrained_2', 'ckpt-1th-best')
else:
raise ValueError("'wow' and 'holle' is currently supported")
# Set environment variables & gpus
set_logger()
set_gpus(hparams.gpus)
set_tcmalloc()
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus, 'GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
# Set random seed
def _init_meta_param(self, model):
    """Fill every meta parameter of ``model`` from a uniform distribution.

    Walks the entries of ``model.meta_param_manager.state_dict()`` and
    passes each value to ``nn.init.uniform_`` with bounds [-0.01, 0.01],
    logging the name of each entry as it is processed.
    """
    lower_bound, upper_bound = -0.01, 0.01
    colorlog.critical("[Init Meta Parameter] >> uniform_ [-0.01, 0.01]")
    meta_state = model.meta_param_manager.state_dict()
    for entry_name, entry_value in meta_state.items():
        colorlog.info(f"{entry_name} intialized")
        nn.init.uniform_(entry_value, lower_bound, upper_bound)
# NOTE(review): `user_options={}` is a mutable default, so the same dict
# object is shared by every call that omits the argument. This is harmless
# only if it is never mutated -- confirm, or switch to a None default.
def __init__(self, parent=None, ipython=True, user_options={}):
super(SequanaGUI, self).__init__(parent=parent)
# Set the logger returned by colorlog.getLogger() to INFO before the
# welcome message is emitted.
colorlog.getLogger().setLevel("INFO")
colorlog.info("Welcome to Sequana GUI (aka Sequanix)")
self._tempdir = QTemporaryDir()
# Both shell attributes start out as empty strings.
self.shell = ""
self.shell_error = ""
# Named colours stored as QColor(r, g, b) values.
self._colors = {
'green': QtGui.QColor(0,170,0),
'red': QtGui.QColor(170,0,0),
'orange': QtGui.QColor(170,150,0),
'blue': QtGui.QColor(0,90,154),
}
# some global attributes
self._undefined_section = "Parameters in no sections/rules"
#self._config = None
# Set the regex to catch steps in the progress bar
def _preprocess_episodes(self, episodes, dictionary, mode):
"""
Tokenize all the fields in Wizard-of-Wikipedia
"""
# `episodes` is iterated as a sequence of episodes, each of which is iterated
# as a sequence of example dicts.
# `dictionary` must provide tokenize() and convert_tokens_to_ids().
# `mode` selects the label field: "train" reads 'labels', any other value
# reads 'eval_labels'.
colorlog.info("Preprocess wizard of wikipedia dataset")
# Encode a string as space-separated token ids, with
# data_vocab.BERT_CLS_ID prepended and data_vocab.BERT_SEP_ID appended.
tokenize = lambda x: ' '.join([str(data_vocab.BERT_CLS_ID)] +
[str(y) for y in dictionary.convert_tokens_to_ids(dictionary.tokenize(x))] + [str(data_vocab.BERT_SEP_ID)])
new_episodes = []
# tqdm wraps the outer loop only, so progress is reported per episode.
for episode_num, episode in enumerate(tqdm(episodes, ncols=70)):
new_examples = []
for example_num, example in enumerate(episode):
# Tokenize inputs and convert the tokens to ids
context = tokenize(example['text'])
# Only the first entry of the label list is used as the response.
if mode == "train":
response = tokenize(example['labels'][0])
else:
response = tokenize(example['eval_labels'][0])
chosen_topic = tokenize(example['chosen_topic'])
# Set up knowledge
def _engine_ready(self):
    """Build the Ignite engine and register this object's lifecycle handlers.

    The engine is created around ``self._update`` and stored on
    ``self.engine``. A progress bar is attached, then one handler is
    registered for each of the run, epoch and iteration start/complete
    events, in that order.
    """
    colorlog.info("[Ignite Engine Ready]")
    self.engine = Engine(self._update)

    # Attach the progress bar before any handler so tqdm output is supported.
    progress_bar = ProgressBar()
    progress_bar.attach(self.engine)

    # (event, handler) pairs, listed in registration order.
    lifecycle_hooks = (
        (Events.STARTED, self._started),
        (Events.COMPLETED, self._completed),
        (Events.EPOCH_STARTED, self._epoch_started),
        (Events.EPOCH_COMPLETED, self._epoch_completed),
        (Events.ITERATION_STARTED, self._iteration_started),
        (Events.ITERATION_COMPLETED, self._iteration_completed),
    )
    for event, callback in lifecycle_hooks:
        self.engine.add_event_handler(event, callback)
# Tokenize knowledge
knowledge_sentences = [tokenize(k) for k in knowledges]
new_example = {'context': context,
'response': response,
'chosen_topic': chosen_topic,
'knowledge_sentences': knowledge_sentences,
'episode_num': episode_num,
'example_num': example_num}
new_examples.append(new_example)
new_episodes.append(new_examples)
if self._datapath:
episodes_fname = self._get_preprocessed_fname(mode)
colorlog.info(f"Cache preprocessed dataset to {episodes_fname}")
with open(episodes_fname, 'w') as fp:
for episode in new_episodes:
fp.write(json.dumps(episode) + '\n')
return new_episodes, dictionary