import logging

from farm.data_handler.processor import SquadProcessor
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False
    )
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
        label_list=label_list,
        metric="squad",
    )
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import RegressionProcessor
from farm.modeling.language_model import LanguageModel


def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = RegressionProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/doc_regr",
        train_filename="train-sample.tsv",
        dev_filename="test-sample.tsv",
        test_filename=None,
        label_column_name="label",
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
from farm.data_handler.processor import BertStyleLMProcessor


def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    # Extend the vocabulary with custom tokens so embeddings can be learned for them
    tokenizer.add_tokens(["aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"])
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=True,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)
    # n_added_tokens resizes the embedding matrix to cover the newly added vocabulary
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
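    # A sketch of the usual next step (not part of the original snippet; the API is assumed
    # from FARM's LM-finetuning examples, so verify the exact signatures): the LM prediction
    # head ties its output layer to the token embeddings, so it also needs to know about the
    # added tokens before both are combined into an AdaptiveModel.
    from farm.modeling.adaptive_model import AdaptiveModel
    from farm.modeling.prediction_head import BertLMHead, NextSentenceHead

    lm_prediction_head = BertLMHead.load(lang_model, n_added_tokens=len(tokenizer.added_tokens_decoder))
    next_sentence_head = NextSentenceHead.load(lang_model)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )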
from farm.data_handler.processor import InferenceProcessor
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

##########################
########## Settings
##########################
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 3. Create an AdaptiveModel with a pretrained language model as a basis and no prediction
#    heads, so the forward pass yields the raw language model outputs (embeddings)
language_model = LanguageModel.load(lang_model)
adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)
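# A minimal usage sketch (an assumption, not part of the original snippet): wrap the model
# and processor in FARM's Inferencer to pull embeddings for raw texts. Method and parameter
# names shifted between FARM releases, so check them against your installed version.
from farm.infer import Inferencer

basic_texts = [{"text": "Ein Beispielsatz."}, {"text": "Noch ein Satz."}]
inferencer = Inferencer(adaptive_model, processor, task_type="embeddings", gpu=use_gpu)
embeddings = inferencer.inference_from_dicts(dicts=basic_texts)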
# MLFlowLogger must be instantiated before use; the public deepset server below is the one
# used in FARM's examples - point the tracking URI at your own MLflow server if needed
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 5
n_epochs = 2
evaluate_every = 500
base_LM_model = "bert-base-cased"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=base_LM_model, do_lower_case=False
)

# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
label_list = ["start_token", "end_token"]
metric = "squad"
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=256,
    label_list=label_list,
    metric=metric,
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=None,
    data_dir="../data/squad20",
)
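# The processor only defines the conversion; the next step (as in the regression test
# above) would typically be a DataSilo, which loads the files and builds the PyTorch
# DataLoaders for the train, dev, and test sets:
data_silo = DataSilo(processor=processor, batch_size=batch_size)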
from sklearn.metrics import f1_score, matthews_corrcoef

from farm.evaluation.metrics import simple_accuracy

##########################
########## Settings
##########################
xval_folds = 5
xval_stratified = True
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 20
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# The evaluation on the dev set can be done with one of the predefined metrics or with a
# metric defined as a function from (preds, labels) to a dict that contains all the actual
# metric values. The function must be registered under a string name, and that string name
# must then be used to refer to it.
# For xval we also store the actual predictions and labels in each result, so we can
# calculate overall metrics across all folds later.
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels).get("acc")
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    mcc = matthews_corrcoef(labels, preds)
    return {
        "acc": acc,
        "f1_other": f1other,
        "f1_offense": f1offense,
        "f1_macro": f1macro,
        "f1_micro": f1micro,
        "mcc": mcc,
    }
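# Registering the custom metric under a string name, as the comment above describes. This
# sketch assumes FARM's register_metrics helper in farm.evaluation.metrics; verify it
# against your FARM version. The registered name can then be passed wherever a metric name
# is expected, e.g. as a processor's `metric` argument.
from farm.evaluation.metrics import register_metrics

register_metrics("mymetrics", mymetrics)
# e.g. TextClassificationProcessor(..., metric="mymetrics", ...)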