train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if p.requires_grad and not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if p.requires_grad and any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
warmup_steps = args.warmup_samples // args.train_batch_size
if args.lr_decay:
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
else:
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])
t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters() if not any(
        nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]},
    {"params": [p for n, p in model.named_parameters() if any(
        nd in n for nd in no_decay)], "weight_decay": 0.0}
]
warmup_steps = math.ceil(t_total * args["warmup_ratio"])
args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]
optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total)
if args["fp16"]:
try:
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])
if args["n_gpu"] > 1:
model = torch.nn.DataParallel(model)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                      output_device=args.local_rank)
params += list(model_tag.parameters())
if opt.task_sc:
    params += list(model_class.parameters())
params = list(filter(lambda p: p.requires_grad, params))
named_params = []
named_params += list(model_tag.named_parameters())
if opt.task_sc:
    named_params += list(model_class.named_parameters())
named_params = list(filter(lambda p: p[1].requires_grad, named_params))
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = len(train_feats['data']) // opt.batchSize * opt.max_epoch
optimizer = AdamW(optimizer_grouped_parameters, lr=opt.lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps), num_training_steps=num_train_optimization_steps) # PyTorch scheduler
# prepare_inputs_for_bert(sentences, word_lengths)
def decode(data_feats, data_tags, data_class, output_path):
    data_index = np.arange(len(data_feats))
    losses = []
    TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0
    TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0
    with open(output_path, 'w') as f:
        for j in range(0, len(data_index), opt.test_batchSize):
            if opt.testing:
                words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class(data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=opt.enc_dec, device=opt.device)
            else:
                words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class(data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=opt.enc_dec, device=opt.device)
def fit(self,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        evaluator: SentenceEvaluator,
        epochs: int = 1,
        steps_per_epoch=None,
        scheduler: str = 'WarmupLinear',
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = transformers.AdamW,
        optimizer_params: Dict[str, object] = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False},
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        output_path: str = None,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        fp16: bool = False,
        fp16_opt_level: str = 'O1',
        local_rank: int = -1
        ):
"""
Train the model with the given training objective
Each training objective is sampled in turn for one batch.
We sample only as many batches from each objective as there are in the smallest one
to make sure of equal training with each dataset.
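
# The docstring above describes round-robin sampling over multiple training
# objectives. The helper below is a minimal, illustrative sketch of that idea,
# under the stated assumption that each objective is a (DataLoader, loss_model)
# pair as in the train_objectives argument; it is not sentence-transformers'
# actual implementation.
def round_robin_batches(train_objectives):
    """Yield (objective_index, batch), cycling through the objectives in turn.

    One pass stops once the smallest DataLoader has been consumed, so every
    objective contributes the same number of batches.
    """
    steps_per_epoch = min(len(loader) for loader, _ in train_objectives)
    iterators = [iter(loader) for loader, _ in train_objectives]
    for _ in range(steps_per_epoch):
        for objective_idx, batch_iter in enumerate(iterators):
            yield objective_idx, next(batch_iter)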
if not args.pretrained_model:
    model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
else:
    model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
model.train()
model.to(device)
multi_gpu = False
full_len = 0
print('calculating total steps')
for i in tqdm(range(num_pieces)):
    with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
        full_len += len([int(item) for item in f.read().strip().split()])
total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
print('total steps = {}'.format(total_steps))
optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                              t_total=total_steps)
if fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = DataParallel(model)
    multi_gpu = True
print('starting training')
running_loss = 0
for epoch in range(epochs):
if training_data is not None and 'targets' in training_data and len(training_data['targets']) == 1 and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL:
    self._model_type = 'classifier'
    self._model = self._classifier_model_class.from_pretrained(self._pretrained_model_name, num_labels=len(
        set(training_data['targets'][0]['unencoded_output'])) + 1).to(self.device)
batch_size = 10
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in self._model.named_parameters() if not any(
        nd in n for nd in no_decay)], 'weight_decay': 0.000001},
    {'params': [p for n, p in self._model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=10, num_training_steps=len(priming_data) * 15 / 20)
gym = Gym(model=self._model, optimizer=optimizer, scheduler=scheduler,
          loss_criterion=None, device=self.device, name=self.name)
input = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True) for x in priming_data]
tokenized_max_len = max([len(x) for x in input])
input = torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x)) for x in input])
real = training_data['targets'][0]['encoded_output']
merged_data = list(zip(input, real))
train_data_loader = DataLoader(
    merged_data[:int(len(merged_data) * 9 / 10)], batch_size=batch_size, shuffle=True)
    grouped_params = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
                and p not in additional_params],
            'weight_decay': conf.weight_decay},
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
                and p not in additional_params],
            'weight_decay': 0.0}]
    if additional_params_dict:
        grouped_params.append(additional_params_dict)
    return AdamW(grouped_params, lr=conf.learning_rate, eps=1e-8)
elif optim_name == "sgd":
    return optim.SGD(
        params,
        lr=conf.learning_rate,
        momentum=conf.momentum,
        weight_decay=conf.weight_decay)
elif optim_name == 'bertadam':
    return get_bert_optim(conf, model, n_train_instances)
elif optim_name == 'radam':
    from .radam import RAdam
    return RAdam(params, lr=conf.learning_rate)
raise ValueError("Unknown optimizer: " + conf.optim)
writer = SummaryWriter(log_dir)
model = JointBERT(config['model'], DEVICE, dataloader.tag_dim, dataloader.intent_dim, dataloader.intent_weight)
model.to(DEVICE)
if config['model']['finetune']:
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if
                    not any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': config['model']['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config['model']['learning_rate'],
                      eps=config['model']['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config['model']['warmup_steps'],
                                                num_training_steps=config['model']['max_step'])
else:
    for n, p in model.named_parameters():
        if 'bert' in n:
            p.requires_grad = False
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=config['model']['learning_rate'])
for name, param in model.named_parameters():
    print(name, param.shape, param.device, param.requires_grad)
max_step = config['model']['max_step']
check_step = config['model']['check_step']
batch_size = config['model']['batch_size']