How to use the farm.data_handler.dataloader.NamedDataLoader class in farm

To help you get started, we’ve selected a few examples of how NamedDataLoader is used in popular public projects.


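Before the project snippets, here is a minimal, self-contained sketch of the basic pattern. The toy tensors, shapes, and tensor names below are invented for illustration; the point is that NamedDataLoader yields each batch as a dictionary keyed by tensor_names instead of a plain tuple of tensors.

import torch
from torch.utils.data import SequentialSampler, TensorDataset

from farm.data_handler.dataloader import NamedDataLoader

# Toy dataset: 8 samples, each an (input_ids, label) pair.
input_ids = torch.randint(0, 100, (8, 16))
labels = torch.randint(0, 2, (8,))
dataset = TensorDataset(input_ids, labels)

loader = NamedDataLoader(
    dataset=dataset,
    sampler=SequentialSampler(dataset),
    batch_size=4,
    tensor_names=["input_ids", "labels"],  # one name per tensor in each sample
)

for batch in loader:
    # Each batch is a dict of stacked tensors, e.g. batch["input_ids"] has shape (4, 16).
    print(batch["input_ids"].shape, batch["labels"].shape)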
Example from deepset-ai/FARM (farm/data_handler/data_silo.py):
""" Initializing train, dev and test data loaders for the already loaded datasets """

        if self.distributed:
            sampler_train = DistributedSampler(self.data["train"])
        else:
            sampler_train = RandomSampler(self.data["train"])

        data_loader_train = NamedDataLoader(
            dataset=self.data["train"],
            sampler=sampler_train,
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )

        if self.data["dev"] is not None:
            data_loader_dev = NamedDataLoader(
                dataset=self.data["dev"],
                sampler=SequentialSampler(self.data["dev"]),
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
            data_loader_dev = None

        if self.processor.test_filename:
            data_loader_test = NamedDataLoader(
                dataset=self.data["test"],
                sampler=SequentialSampler(self.data["test"]),
                batch_size=self.batch_size,
                tensor_names=self.tensor_names,
            )
        else:
Example from deepset-ai/FARM (farm/data_handler/dataloader.py):
), "Dataset contains {} tensors while there are {} tensor names supplied: {}".format(
                len(batch[0]), len(tensor_names), tensor_names
            )
            lists_temp = [[] for _ in range(len(tensor_names))]
            ret = dict(zip(tensor_names, lists_temp))

            for example in batch:
                for name, tensor in zip(tensor_names, example):
                    ret[name].append(tensor)

            for key in ret:
                ret[key] = torch.stack(ret[key])

            return ret

        super(NamedDataLoader, self).__init__(
            dataset=dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
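The assertion at the top of collate_fn is the main pitfall: the number of entries in tensor_names must match the number of tensors in each sample. A contrived sketch that trips it (the dataset and the single name are invented; the sampler-less call follows the haystack example further down):

import torch
from torch.utils.data import TensorDataset

from farm.data_handler.dataloader import NamedDataLoader

# Two tensors per sample, but only one tensor name supplied.
dataset = TensorDataset(torch.zeros(4, 8), torch.zeros(4))
loader = NamedDataLoader(dataset=dataset, batch_size=2, tensor_names=["input_ids"])

next(iter(loader))
# AssertionError: Dataset contains 2 tensors while there are 1 tensor names supplied: ['input_ids']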
Example from deepset-ai/FARM (farm/infer.py):
def _run_inference(self, dataset, tensor_names, baskets, rest_api_schema=False):
        # Flatten the samples of all baskets so they can be matched to batches by index.
        samples = [s for b in baskets for s in b.samples]

        data_loader = NamedDataLoader(
            dataset=dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size, tensor_names=tensor_names
        )
        logits_all = []
        preds_all = []
        aggregate_preds = hasattr(self.model.prediction_heads[0], "aggregate_preds")
        for i, batch in enumerate(tqdm(data_loader, desc="Inferencing")):
            # NamedDataLoader yields dict batches, so tensors can be moved to the device by name.
            batch = {key: batch[key].to(self.device) for key in batch}
            if not aggregate_preds:
                batch_samples = samples[i * self.batch_size : (i + 1) * self.batch_size]
            with torch.no_grad():
                logits = self.model.forward(**batch)[0]
                if not aggregate_preds:
                    preds = self.model.formatted_preds(
                        logits=[logits],
                        samples=batch_samples,
                        tokenizer=self.processor.tokenizer,
                    )
                    preds_all += preds
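A note on the aggregate_preds branch: some prediction heads (FARM's question answering head, for example) combine logits from several passages of the same document, so their predictions cannot be formatted batch by batch; for those heads formatting is deferred until all batches have been seen, which is why the loop only formats predictions when aggregate_preds is False.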
Example from deepset-ai/haystack (haystack/reader/farm.py):
        # extract eval documents and convert data back to SQuAD-like format
        documents = document_store.get_all_documents_in_index(index=doc_index)
        dicts = []
        for document in documents:
            doc_id = document["_source"]["doc_id"]
            text = document["_source"]["text"]
            questions = doc_questions_dict[doc_id]
            dicts.append({"qas": questions, "context": text})

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(dicts))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(dicts, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_accuracy": eval_results[0]["top_n_accuracy"]
        }
        return results
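Note that this NamedDataLoader is created without an explicit sampler. The sampler is simply forwarded to PyTorch's DataLoader (see the dataloader.py snippet above), which falls back to sequential sampling, so the evaluation batches keep the order of the input dicts.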
Example from deepset-ai/FARM (farm/data_handler/data_silo.py):
def __init__(self, origsilo, trainset, devset, testset):
        self.tensor_names = origsilo.tensor_names
        self.data = {"train": trainset, "dev": devset, "test": testset}
        self.processor = origsilo.processor
        self.batch_size = origsilo.batch_size
        # Distributed sampling should not be necessary here: cross-validation
        # makes no sense with huge datasets anyway.
        # sampler_train = DistributedSampler(self.data["train"])
        sampler_train = RandomSampler(trainset)

        self.data_loader_train = NamedDataLoader(
            dataset=trainset,
            sampler=sampler_train,
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_dev = NamedDataLoader(
            dataset=devset,
            sampler=SequentialSampler(devset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
        self.data_loader_test = NamedDataLoader(
            dataset=testset,
            sampler=SequentialSampler(testset),
            batch_size=self.batch_size,
            tensor_names=self.tensor_names,
        )
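This constructor comes from FARM's cross-validation data silo: each fold reuses the parent silo's processor, batch size, and tensor names, and builds its own train/dev/test NamedDataLoader, mirroring the _initialize_data_loaders method shown at the top of this page.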