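Python examples of dataset build() routines written against ParlAI-style build_data helpers. Each snippet follows the same idempotent recipe: check built(), wipe any stale copy, download and unpack an archive, then mark_done(). The fragments are excerpts, so they assume roughly the following imports (an assumption; the DeepPavlov-flavoured fragments use their own port of build_data):

import os
import time
import urllib.parse
from os.path import isdir, isfile, join

import parlai.core.build_data as build_data  # download(), untar(), built(), mark_done()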
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'http://parl.ai/downloads/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(
            dpath, 'train160', os.path.join(dpext, 'MCTest', 'mc160.train'), None
        )
        create_fb_format(
            dpath, 'valid160', os.path.join(dpext, 'MCTest', 'mc160.dev'), None
        )
        create_fb_format(
            dpath,
            'test160',
            os.path.join(dpext, 'MCTest', 'mc160.test'),
            os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'),
        )
        create_fb_format(
            dpath, 'train500', os.path.join(dpext, 'MCTest', 'mc500.train'), None
        )
        # The excerpt is truncated at this point; the remaining mc500 splits
        # are assumed to mirror the mc160 pattern above.
        create_fb_format(
            dpath, 'valid500', os.path.join(dpext, 'MCTest', 'mc500.dev'), None
        )
        create_fb_format(
            dpath,
            'test500',
            os.path.join(dpext, 'MCTest', 'mc500.test'),
            os.path.join(dpext, 'MCTestAnswers', 'mc500.test.ans'),
        )

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
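The DSTC7 build below follows the same skeleton but is written as a method (note the self parameter) on whatever class hosts it; since the body never touches self, it behaves identically to the module-level variants: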
def build(self, opt):
    dpath = os.path.join(opt['datapath'], 'dstc7')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dstc7.tar.gz'
        url = 'http://parl.ai/downloads/dstc7/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
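The next fragment downloads the Gareev dataset archive from a server whose address comes from the DATASETS_URL environment variable, so the variable must be set before the build runs. A hypothetical setup (the mirror URL is illustrative, not a real endpoint):

import os
os.environ['DATASETS_URL'] = 'https://files.example.com/datasets/'  # hypothetical mirror; note the trailing slash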
if not build_data.built(dpath, version_string=version):
    print('[target data path: ' + dpath + ']')
    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    ds_path = os.environ.get('DATASETS_URL')
    file_name = 'gareev.tar.gz'
    if not ds_path:
        raise RuntimeError('The `DATASETS_URL` environment variable is not set')
    print('Trying to download the dataset %s from the repository' % file_name)
    url = urllib.parse.urljoin(ds_path, file_name)
    build_data.download(url, dpath, file_name)
    build_data.untar(dpath, file_name)
    print('Downloaded the %s dataset' % file_name)

    # mark the data as built
    build_data.mark_done(dpath, version_string=version)

opt['raw_dataset_path'] = dpath
print('Using dataset from path: %s' % repr(opt['raw_dataset_path']))
create_heap_file(opt['raw_dataset_path'])
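One subtlety in the fragment above: urllib.parse.urljoin replaces the last path segment of the base URL unless the base ends with a slash, so DATASETS_URL needs a trailing '/' for the file name to be appended rather than substituted:

from urllib.parse import urljoin

urljoin('https://host/data/', 'gareev.tar.gz')  # -> 'https://host/data/gareev.tar.gz'
urljoin('https://host/data', 'gareev.tar.gz')   # -> 'https://host/gareev.tar.gz'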
if not isfile(join(dpath, 'vocab', 'char_vocab.russian.txt')):
    print('[Downloading the chars vocabulary]...')
    try:
        vocab_url = os.environ['MODELS_URL'] + 'coreference/vocabs/char_vocab.russian.txt'
        build_data.download(vocab_url, join(dpath, 'vocab'), 'char_vocab.russian.txt')
        print('[Finished downloading the chars vocabulary]...')
    except RuntimeWarning:
        raise RuntimeError(
            'To use your own char vocabulary, please put the file '
            'char_vocab.russian.txt in the folder {0}'.format(join(dpath, 'vocabs'))
        )

if opt['name'] == 'pretrained_model' and not isdir(join(dpath, 'logs', 'pretrain_model')):
    print('[Downloading the pretrained model]...')
    try:
        pretrain_url = os.environ['MODELS_URL'] + 'coreference/OpeanAI/pretrain_model.zip'
        build_data.download(pretrain_url, join(dpath, 'logs'), 'pretrain_model.zip')
        build_data.untar(join(dpath, 'logs'), 'pretrain_model.zip')
        print('[Finished downloading the pretrained model]...')
    except RuntimeWarning:
        raise RuntimeError(
            'To train your own model, please change the variable --name in '
            'build.py:train_coreference to anything other than `pretrain_model`'
        )

build_data.make_dir(join(dpath, 'reports', 'response_files'))
build_data.make_dir(join(dpath, 'reports', 'results'))
build_data.make_dir(join(dpath, 'reports', 'predictions'))
return None
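The except RuntimeWarning clauses above are less odd than they look: older ParlAI-style download() helpers raised RuntimeWarning after exhausting their retry budget, so that is the failure signal these fragments expect (an assumption about the particular build_data port in use). The original code re-raised a bare string, which is a TypeError in Python 3; it is rewritten above as RuntimeError with the same message.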
build_data.make_dir(join(dpath, 'report', 'results'))
build_data.make_dir(join(dpath, 'scorer'))
build_data.make_dir(join(dpath, 'train'))
build_data.make_dir(join(dpath, 'test'))
build_data.make_dir(join(dpath, 'valid'))

# urls
dataset_url = 'http://rucoref.maimbava.net/files/rucoref_29.10.2015.zip'
scorer_url = 'http://conll.cemantix.org/download/reference-coreference-scorers.v8.01.tar.gz'

# download the CoNLL-2012 scorer v8.01
start = time.time()
print('[Downloading the conll-2012 scorer]...')
build_data.download(scorer_url, join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
build_data.untar(join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
print('[Scorer downloaded]...')

# download the RuCor dataset
fname = 'rucoref_29.10.2015.zip'
if not os.path.isdir(join(dpath, 'rucoref_29.10.2015')):
    print('[Downloading the rucoref dataset]...')
    build_data.make_dir(join(dpath, 'rucoref_29.10.2015'))
    build_data.download(dataset_url, join(dpath, 'rucoref_29.10.2015'), fname)
    # uncompress it
    build_data.untar(join(dpath, 'rucoref_29.10.2015'), 'rucoref_29.10.2015.zip')
    print('End of downloading: took {0:.3f}s'.format(time.time() - start))

# convert the RuCor release into CoNLL files
conllpath = join(dpath, 'ru_conll')
build_data.make_dir(conllpath)
coreference_utils.RuCoref2CoNLL(
    join(dpath, 'rucoref_29.10.2015'), conllpath, language)
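Note that language in the RuCoref2CoNLL call is not defined inside the fragment; it evidently comes from the enclosing scope (the language of the corpus being converted), and the converted CoNLL files land in the ru_conll directory created just above.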
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')
    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # download the data.
    fname = 'mnist.tar.gz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/mnist/' + fname  # dataset URL
    build_data.download(url, dpath, fname)
    # uncompress it
    build_data.untar(dpath, fname)

    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
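A build function like the MNIST one above only needs a data root in opt, and a second call is a no-op thanks to the built()/mark_done() pair. A hypothetical invocation (the path is illustrative):

opt = {'datapath': '/tmp/parlai_data'}  # hypothetical data root
build(opt)  # downloads and unpacks mnist.tar.gz
build(opt)  # returns immediately: built() finds the stamp left by mark_done()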
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')
    # make a clean directory if needed
    if build_data.built(dpath):
        # an older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # download the data.
    url = 'http://paraphraser.ru/download/get?file_id='  # datasets URL
    fname = 'paraphraser.zip'
    build_data.download(url + '1', dpath, fname)
    # uncompress it
    build_data.untar(dpath, fname)
    path = os.path.join(dpath, 'paraphrases.xml')
    clean_dataset(path)

    fname = 'paraphraser_gold.zip'
    build_data.download(url + '5', dpath, fname)
    # uncompress it
    build_data.untar(dpath, fname)
    path = os.path.join(dpath, 'paraphrases_gold.xml')
    clean_dataset(path)

    # mark the data as built
    build_data.mark_done(dpath, version_string=version)
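Here one endpoint serves both archives, parameterized by file_id: the call sites complete the URL with '1' for the main set and '5' for the gold set (as the file names suggest), and clean_dataset is run over each extracted XML file.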
# fname1 and fname2 are missing from the excerpt; the names below are assumed
# from the v2_* pattern of the remaining archives.
fname1 = 'v2_Questions_Train_mscoco.zip'  # assumed: train-split question archive
fname2 = 'v2_Questions_Val_mscoco.zip'  # assumed: valid-split question archive
fname3 = 'v2_Questions_Test_mscoco.zip'
fname4 = 'v2_Annotations_Val_mscoco.zip'
fname5 = 'v2_Annotations_Train_mscoco.zip'
url = 'https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/'

build_data.download(url + fname1, dpath, fname1)
build_data.download(url + fname2, dpath, fname2)
build_data.download(url + fname3, dpath, fname3)
build_data.download(url + fname4, dpath, fname4)
build_data.download(url + fname5, dpath, fname5)

build_data.untar(dpath, fname1)
build_data.untar(dpath, fname2)
build_data.untar(dpath, fname3)
build_data.untar(dpath, fname4)
build_data.untar(dpath, fname5)

# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
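One detail worth noting: despite its name, untar is applied to .zip files here as well, so the helper evidently dispatches on archive type rather than assuming a tarball.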
if not build_data.built(dpath, version_string=version):
    if build_data.built(dpath):
        # older version exists, so remove the outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # first download the data files
    fname_data = 'data_v1.tar.gz'
    build_data.download(URL_ROOT + fname_data, dpath, fname_data)
    build_data.untar(dpath, fname_data)

    # next download the wordstats files
    fname_wordstats = 'wordstats_v1.tar.gz'
    build_data.download(URL_ROOT + fname_wordstats, dpath, fname_wordstats)
    build_data.untar(dpath, fname_wordstats)

    # next download the evaluation logs
    fname_evallogs = 'evaluationlogs_v1.tar.gz'
    build_data.download(URL_ROOT + fname_evallogs, dpath, fname_evallogs)
    build_data.untar(dpath, fname_evallogs)

    print('Data has been placed in ' + dpath)

    build_data.mark_done(dpath, version)
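Every fragment on this page leans on the same idempotency contract: mark_done() drops a stamp file that built() checks on the next run. A minimal sketch of that contract, assuming the '.built' stamp convention used by ParlAI-style helpers:

import datetime
import os

def mark_done(path, version_string=None):
    # Record that the data in `path` is ready, plus an optional version tag.
    with open(os.path.join(path, '.built'), 'w') as f:
        f.write(str(datetime.datetime.today()))
        if version_string:
            f.write('\n' + version_string)

def built(path, version_string=None):
    # The directory counts as built if the stamp exists and, when a version
    # is requested, the stamp's second line matches it.
    fname = os.path.join(path, '.built')
    if not os.path.isfile(fname):
        return False
    if version_string is None:
        return True
    with open(fname) as f:
        lines = f.read().split('\n')
    return len(lines) > 1 and lines[1] == version_string

With version = None, as in most snippets above, built() reduces to a simple existence check on the stamp file, which is why the snippets that delete and rebuild old data still terminate after one download.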