How to use the farm.modeling.tokenization.Tokenizer.load function in farm

To help you get started, we've selected a few farm examples based on popular ways it is used in public projects.

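Before the project snippets below, here is a minimal sketch of the call itself, using only what those snippets demonstrate: Tokenizer.load takes a pretrained_model_name_or_path and a do_lower_case flag and returns a tokenizer object. The tokenize call at the end is an assumption that the returned object exposes the usual Hugging Face tokenizer interface.

from farm.modeling.tokenization import Tokenizer

# Load the tokenizer that matches a pretrained checkpoint.
# do_lower_case must match the checkpoint: "cased" models keep capitalization.
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased", do_lower_case=False
)

# Assumption: the returned object behaves like a standard Hugging Face tokenizer,
# so plain subword tokenization works directly.
tokens = tokenizer.tokenize("FARM loads tokenizers with a single call.")
print(tokens)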

Example from deepset-ai/FARM: test/test_question_answering.py
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False
    )
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=label_list,
        metric="squad"
    )
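
One detail worth copying from this test: do_lower_case=False is paired with bert-base-cased, which distinguishes upper and lower case. For an uncased checkpoint the flag flips; the variant below is hypothetical and not part of the original test.

# Hypothetical variant for an uncased checkpoint (not in the original test).
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-uncased", do_lower_case=True
)
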
Example from deepset-ai/FARM: test/test_doc_regression.py
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = RegressionProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/doc_regr",
        train_filename="train-sample.tsv",
        dev_filename="test-sample.tsv",
        test_filename=None,
        label_column_name="label"
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
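
Note that Tokenizer.load and LanguageModel.load receive the same lang_model string, which keeps the vocabulary and the pretrained weights consistent. As a quick sanity check, and assuming the returned tokenizer exposes the standard Hugging Face interface, its type and vocabulary size can be inspected directly:

# Sanity check (assumption: standard Hugging Face tokenizer interface).
print(type(tokenizer).__name__)  # e.g. BertTokenizer for a BERT checkpoint
print(len(tokenizer))            # vocabulary size seen by the model
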
Example from deepset-ai/FARM: test/test_lm_finetuning.py
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=True
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
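
This is why the custom-vocab test loads the tokenizer before the language model: the three new tokens are registered on the tokenizer first, and n_added_tokens tells LanguageModel.load how many extra embedding rows to allocate for them. The check below only illustrates that bookkeeping and assumes added_tokens_decoder is the standard Hugging Face mapping of added tokens.

# Hedged illustration: after add_tokens, the tokenizer tracks the new entries,
# and their count is what gets passed as n_added_tokens above.
n_added = len(tokenizer.added_tokens_decoder)
assert n_added == 3  # the three custom tokens added in this test
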
Example from deepset-ai/FARM: examples/embeddings_extraction.py
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer

from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

##########################
########## Settings
##########################
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 4. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)

adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
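
Because only embeddings are extracted, the AdaptiveModel gets an empty prediction_heads list, and lm_output_types asks the language model for both per-token and pooled per-sequence outputs. In the full example the model is then wrapped in FARM's Inferencer and fed plain dicts; the snippet below only sketches that input format, with hypothetical sample sentences for the German checkpoint used above.

# Assumed input format for FARM inference: a list of dicts with a "text" key
# (hypothetical sample sentences, matching the German checkpoint above).
basic_texts = [
    {"text": "Heute scheint die Sonne in Berlin"},
    {"text": "Das Wetter ist morgen wieder schön"},
]
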
Example from deepset-ai/FARM: examples/question_answering.py
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 5
n_epochs = 2
evaluate_every = 500
base_LM_model = "bert-base-cased"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=base_LM_model, do_lower_case=False
)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
label_list = ["start_token", "end_token"]
metric = "squad"
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=256,
    label_list=label_list,
    metric=metric,
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=None,
    data_dir="../data/squad20",
)
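
The excerpt stops after the processor, but the other examples on this page show the step that usually follows: the processor is handed to a DataSilo, which loads the SQuAD files and produces batches for training and evaluation. The line below mirrors that pattern and is not part of the original excerpt.

# Mirrors the DataSilo usage shown in the other examples on this page.
data_silo = DataSilo(processor=processor, batch_size=batch_size)
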
Example from deepset-ai/FARM: examples/doc_classification_crossvalidation.py
##########################
########## Settings
##########################
xval_folds = 5
xval_stratified = True

set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 20
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False)

# The evaluation on the dev-set can be done with one of the predefined metrics or with a
# metric defined as a function from (preds, labels) to a dict that contains all the actual
# metrics values. The function must get registered under a string name and the string name must
# be used.
# For xval, we also store the actual predictions and labels in each result so we can
# calculate overall metrics over all folds later
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels).get("acc")
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    mcc = matthews_corrcoef(labels, preds)
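
The excerpt ends inside mymetrics, but the comment above the function spells out the remaining requirement: the function must be registered under a string name before that name can be used as the metric. The sketch below shows such a registration; the register_metrics import path is an assumption about FARM's evaluation module rather than something shown on this page.

from farm.evaluation.metrics import register_metrics  # assumed import path

# Register the custom function under a name, then refer to it by that name,
# e.g. as the metric argument of the processor used for this task.
register_metrics("mymetrics", mymetrics)
metric = "mymetrics"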