""" Initializing train, dev and test data loaders for the already loaded datasets """
if self.distributed:
sampler_train = DistributedSampler(self.data["train"])
else:
sampler_train = RandomSampler(self.data["train"])
data_loader_train = NamedDataLoader(
dataset=self.data["train"],
sampler=sampler_train,
batch_size=self.batch_size,
tensor_names=self.tensor_names,
)
if self.data["dev"] is not None:
data_loader_dev = NamedDataLoader(
dataset=self.data["dev"],
sampler=SequentialSampler(self.data["dev"]),
batch_size=self.batch_size,
tensor_names=self.tensor_names,
)
else:
data_loader_dev = None
if self.processor.test_filename:
data_loader_test = NamedDataLoader(
dataset=self.data["test"],
sampler=SequentialSampler(self.data["test"]),
batch_size=self.batch_size,
tensor_names=self.tensor_names,
)
    else:
        data_loader_test = None

    self.loaders = {
        "train": data_loader_train,
        "dev": data_loader_dev,
        "test": data_loader_test,
    }


class NamedDataLoader(DataLoader):
    """A DataLoader whose batches are dictionaries of named tensors instead of positional tuples."""

    def __init__(self, dataset, batch_size, sampler=None, tensor_names=None):
        def collate_fn(batch):
            # Every dataset entry must provide exactly one tensor per supplied name.
            assert len(batch[0]) == len(
                tensor_names
            ), "Dataset contains {} tensors while there are {} tensor names supplied: {}".format(
                len(batch[0]), len(tensor_names), tensor_names
            )
            # Group tensors by name, then stack each group into one batched tensor.
            lists_temp = [[] for _ in range(len(tensor_names))]
            ret = dict(zip(tensor_names, lists_temp))
            for example in batch:
                for name, tensor in zip(tensor_names, example):
                    ret[name].append(tensor)
            for key in ret:
                ret[key] = torch.stack(ret[key])
            return ret

        super(NamedDataLoader, self).__init__(
            dataset=dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
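

# Illustrative usage sketch (not part of the original source; the `demo_*` names
# are made up): NamedDataLoader turns a plain TensorDataset of positional tuples
# into batches that are dictionaries keyed by the supplied tensor names.
from torch.utils.data import TensorDataset

demo_dataset = TensorDataset(
    torch.randint(0, 100, (8, 4)),  # e.g. token ids, shape (num_samples, seq_len)
    torch.randint(0, 2, (8,)),      # e.g. binary labels, shape (num_samples,)
)
demo_loader = NamedDataLoader(
    dataset=demo_dataset,
    sampler=SequentialSampler(demo_dataset),
    batch_size=4,
    tensor_names=["input_ids", "label_ids"],
)
for demo_batch in demo_loader:
    print(demo_batch["input_ids"].shape, demo_batch["label_ids"].shape)  # (4, 4) and (4,)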
def _run_inference(self, dataset, tensor_names, baskets, rest_api_schema=False):
samples = [s for b in baskets for s in b.samples]
data_loader = NamedDataLoader(
dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
)
logits_all = []
preds_all = []
    # QA-style prediction heads expose aggregate_preds to merge per-passage predictions.
    aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds")
    for i, batch in enumerate(tqdm(data_loader, desc="Inferencing")):
batch = {key: batch[key].to(self.device) for key in batch}
if not aggregate_preds:
batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
with torch.no_grad():
logits = self.model.forward(**batch)[0]
if not aggregate_preds:
preds = self.model.formatted_preds(
logits=[logits],
samples=batch_samples,
tokenizer=self.processor.tokenizer,
                    rest_api_schema=rest_api_schema,
                )
                preds_all += preds
            else:
                # Heads with aggregate_preds consume raw logits; the aggregation
                # itself falls outside this snippet.
                logits_all += [lg for lg in logits]


def eval(self, document_store, device, doc_index="eval_document"):
    # Signature abbreviated to the parameters used below. The omitted start of
    # this snippet builds doc_questions_dict, mapping each doc_id to its list of
    # question dicts (assigning each question a running id).
# extract eval documents and convert data back to SQuAD-like format
documents = document_store.get_all_documents_in_index(index=doc_index)
dicts = []
for document in documents:
doc_id = document["_source"]["doc_id"]
text = document["_source"]["text"]
questions = doc_questions_dict[doc_id]
dicts.append({"qas" : questions, "context" : text})
# Create DataLoader that can be passed to the Evaluator
indices = range(len(dicts))
dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(dicts, indices=indices)
data_loader = NamedDataLoader(dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names)
evaluator = Evaluator(data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device)
eval_results = evaluator.eval(self.inferencer.model)
results = {
"EM": eval_results[0]["EM"],
"f1": eval_results[0]["f1"],
"top_n_accuracy": eval_results[0]["top_n_accuracy"]
}
return results
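

# Illustrative shape of one entry in `dicts` above (the values are made up; the
# structure follows the standard SQuAD-style format the processor expects):
example_dict = {
    "context": "Goethe wrote Faust.",
    "qas": [
        {
            "id": "0",
            "question": "Who wrote Faust?",
            "answers": [{"text": "Goethe", "answer_start": 0}],
        }
    ],
}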


class DataSiloForCrossVal:
    """Holds one train/dev/test split of the original silo for cross-validation."""

    def __init__(self, origsilo, trainset, devset, testset):
self.tensor_names = origsilo.tensor_names
self.data = {"train": trainset, "dev": devset, "test": testset}
self.processor = origsilo.processor
self.batch_size = origsilo.batch_size
        # A DistributedSampler (as used in DataSilo) should not be necessary here:
        # cross-validation makes no sense with huge datasets, so a plain
        # RandomSampler is used instead.
        # sampler_train = DistributedSampler(self.data["train"])
sampler_train = RandomSampler(trainset)
self.data_loader_train = NamedDataLoader(
dataset=trainset,
sampler=sampler_train,
batch_size=self.batch_size,
tensor_names=self.tensor_names,
)
self.data_loader_dev = NamedDataLoader(
dataset=devset,
sampler=SequentialSampler(devset),
batch_size=self.batch_size,
tensor_names=self.tensor_names,
)
self.data_loader_test = NamedDataLoader(
dataset=testset,
sampler=SequentialSampler(testset),
batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.loaders = {
            "train": self.data_loader_train,
            "dev": self.data_loader_dev,
            "test": self.data_loader_test,
        }
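

def demo_train_loop(silo, model, device):
    # Hypothetical helper, not part of the original source: sketches how the
    # silo's `loaders` dict is consumed. Every loader yields the same named-dict
    # batches, so tensors can be moved to the device by name, mirroring the
    # pattern used in _run_inference above.
    for batch in silo.loaders["train"]:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            logits = model.forward(**batch)[0]
        # ... compute metrics or predictions from `logits` here ...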