How to use the farm.data_handler.processor.TextClassificationProcessor function in farm

To help you get started, we’ve selected a few farm examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github deepset-ai / FARM / test / test_doc_classification.py View on Github external
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
github deepset-ai / FARM / test / test_processor_saving_loading.py View on Github external
def test_processor_saving_loading(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            dev_filename=None,
                                            test_filename=None,
                                            dev_split=0.1,
                                            columns=["text", "label", "unused"],
                                            label_list=["OTHER", "OFFENSE"],
                                            metrics=["f1_macro"]
                                            )
    dicts = processor.file_to_dicts(file="samples/doc_class/train-sample.tsv")
    data, tensor_names = processor.dataset_from_dicts(dicts)

    save_dir = "testsave/processor"
    processor.save(save_dir)
github deepset-ai / FARM / test / test_doc_classification_roberta.py View on Github external
def test_doc_classification():
    #caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = Roberta.load(lang_model)
    prediction_head = TextClassificationHead(layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
github deepset-ai / FARM / examples / doc_classification_multilabel.py View on Github external
evaluate_every = 500
lang_model = "bert-base-uncased"
do_lower_case = True

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
                                        multilabel=True,
                                        train_filename="train.tsv",
                                        dev_filename="val.tsv",
                                        test_filename=None,
                                        dev_split=0
                                        )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
github deepset-ai / FARM / examples / doc_classification_cola.py View on Github external
n_epochs = 5
batch_size = 100
evaluate_every = 20
lang_model = "bert-base-cased"
do_lower_case = False

# 1.Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load Cola 2018 Data.

label_list = ["0", "1"]
metric = "mcc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/cola",
                                        dev_filename="dev.tsv",
                                        dev_split=None,
                                        test_filename=None,
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="label"
                                        )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)

# 4. Create an AdaptiveModel
github deepset-ai / FARM / examples / doc_classification_with_earlystopping.py View on Github external
# The evaluation on the dev-set can be done with one of the predefined metrics or with a
# metric defined as a function from (preds, labels) to a dict that contains all the actual
# metrics values. The function must get registered under a string name and the string name must
# be used.
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense, "f1_macro": f1macro, "f1_micro": f1micro}
register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label"
                                        )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
github deepset-ai / FARM / examples / doc_classification_crossvalidation.py View on Github external
"acc": acc,
        "f1_other": f1other,
        "f1_offense": f1offense,
        "f1_macro": f1macro,
        "f1_micro": f1micro,
        "mcc": mcc
    }
register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label"
                                        )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)

# Load one silo for each fold in our cross-validation
silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

# the following steps should be run for each of the folds of the cross validation, so we put them
github deepset-ai / FARM / examples / doc_classification_multilabel_roberta.py View on Github external
batch_size = 32

evaluate_every = 500
lang_model = "roberta-base"

# 1.Create a tokenizer
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load GermEval 2018 Data.

label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
metric = "acc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
                                        multilabel=True,
                                        train_filename="train.tsv",
                                        dev_filename="val.tsv",
                                        test_filename=None,
                                        dev_split=0
                                        )

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
github deepset-ai / FARM / farm / data_handler / processor.py View on Github external
:type  header: int
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in  "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        #TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs

        # Custom processor attributes
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header

        super(TextClassificationProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,

        )
        if metric and label_list:
            if multilabel:
                task_type = "multilabel_classification"
            else:
                task_type = "classification"