# NB: these snippets use the legacy torchtext API ("from torchtext import data";
# in torchtext 0.9-0.11 the same classes live under torchtext.legacy)
import random
import torch
from torch import nn
from torch.optim import Adam
from torchtext import data, datasets
from torchtext.data import Example, Field, LabelField
from torchtext.datasets import WikiText2
from nntoolbox.sequence.models import LanguageModel
from nntoolbox.sequence.learner import LanguageModelLearner
from nntoolbox.sequence.components import AdditiveContextEmbedding
from nntoolbox.sequence.utils import load_embedding
from nntoolbox.utils import get_device  # assumed location of the get_device helper
from nntoolbox.callbacks import *
from nntoolbox.metrics import *
MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
# print(tmp)
train_data, val_data, test_data = WikiText2.splits(TEXT)
# the vocab must be built before batches can be numericalized
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
train_iterator = data.BPTTIterator(
    train_data,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device(),
    bptt_len=35,
    shuffle=True
)
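# Usage sketch (an assumption, not from the source): each batch from a
# BPTTIterator exposes `.text` and `.target`, where target is text shifted
# by one token, ready for next-token prediction:
for batch in train_iterator:
    text, target = batch.text, batch.target  # each of shape (bptt_len, batch_size)
    print(text.shape, target.shape)
    break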
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
NUM_EPOCHS = 20
####################################
#          Preparing Data          #
####################################
# 1. data.Field()
TEXT = data.Field(include_lengths=True, pad_token='<pad>', unk_token='<unk>')
TAG_LABEL = data.LabelField()
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()
# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(
    path=TrustPilot_processed_dataset_path,
    train="train.csv",
    test="test.csv",
    fields=[('text', TEXT), ('tag_label', TAG_LABEL),
            ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
    format="csv"
)
# 3. Split train_data into train_data and valid_data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}\n".format(len(test_data)))
# 4. data.BucketIterator
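# The snippet ends at step 4 in the source; a minimal sketch of the likely
# continuation (vocabs must be built before batching):
TEXT.build_vocab(train_data)
TAG_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=True,
    device=get_device()
)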
def main(config):
    device = get_device()
    torch.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True  # make cuDNN deterministic so results are reproducible

    """ Prepare the SST-2 data """
    text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True, fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.float)
    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device, config.glove_word_file)

    """ Prepare the word vectors """
    pretrained_embeddings = text_field.vocab.vectors
    pad_idx = text_field.vocab.stoi[text_field.pad_token]
    unk_idx = text_field.vocab.stoi[text_field.unk_token]

    """ Build the model """
    filter_sizes = [int(val) for val in config.filter_sizes.split()]
    model = TextCNN.TextCNN(
        len(text_field.vocab), config.glove_word_dim, config.filter_num, filter_sizes,
        config.output_dim, config.dropout, pad_idx)
    # copy the pretrained vectors into the model's embedding layer
    model.embedding.weight.data.copy_(pretrained_embeddings)
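    # A common follow-up step (an assumption here; the TextRNN snippet later in
    # this file does the analogous re-initialization): reset the <unk> and <pad>
    # rows, which the pretrained GloVe vocabulary does not cover.
    model.embedding.weight.data[unk_idx] = torch.zeros(config.glove_word_dim)
    model.embedding.weight.data[pad_idx] = torch.zeros(config.glove_word_dim)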
def make_imdb(batch_size, device=-1, vectors=None):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    train, test = datasets.IMDB.splits(TEXT, LABEL)
    # build the vocabularies over both splits (IMDB has no separate validation split)
    TEXT.build_vocab(train, test, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train, test)
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), batch_size=batch_size, device=device, repeat=False)
    return (train_iter, test_iter), TEXT, LABEL
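# Usage sketch (an assumption, not from the source): build the IMDB iterators
# with 100-dimensional GloVe vectors attached to TEXT's vocab.
from torchtext.vocab import GloVe
(train_iter, test_iter), TEXT, LABEL = make_imdb(
    batch_size=32, device=get_device(), vectors=GloVe(name='6B', dim=100))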
def generate_data_loaders(self, train_data, train_target, valid_data, valid_target, batch_size):
    """
    This function generates TorchText dataloaders for the training and validation datasets.
    :param train_data: training dataset (list of comment strings)
    :param valid_data: validation dataset (list of comment strings)
    :param train_target: the training data's associated ratings (list of 'pos' and 'neg')
    :param valid_target: the validation data's associated ratings (list of 'pos' and 'neg')
    :param batch_size: the loaders' batch size
    :return: train data loader and validation data loader
    """
    # create TorchText fields
    self.comment_field = data.Field(tokenize='spacy', include_lengths=True)
    self.rating_field = data.LabelField(dtype=torch.float)

    # iterate through the datasets and generate examples with comment_field and rating_field
    train_examples = []
    valid_examples = []
    for i in range(len(train_data)):
        comment = train_data[i]
        rating = train_target[i]
        review = {'comment': comment, 'rating': rating}
        ex = Example.fromdict(data=review,
                              fields={'comment': ('comment', self.comment_field),
                                      'rating': ('rating', self.rating_field)})
        train_examples.append(ex)
    for i in range(len(valid_data)):
        comment = valid_data[i]
        rating = valid_target[i]
        review = {'comment': comment, 'rating': rating}
        ex = Example.fromdict(data=review,
                              fields={'comment': ('comment', self.comment_field),
                                      'rating': ('rating', self.rating_field)})
        valid_examples.append(ex)
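    # The source cuts off before the loaders are built; a minimal sketch of the
    # likely remainder (the names `fields`, `train_dataset`, `valid_dataset` are
    # illustrative, not from the original):
    fields = [('comment', self.comment_field), ('rating', self.rating_field)]
    train_dataset = data.Dataset(train_examples, fields)
    valid_dataset = data.Dataset(valid_examples, fields)
    self.comment_field.build_vocab(train_dataset)
    self.rating_field.build_vocab(train_dataset)
    return data.BucketIterator.splits(
        (train_dataset, valid_dataset), batch_size=batch_size,
        sort_key=lambda ex: len(ex.comment), sort_within_batch=True)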
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None, use_tree=False):
    if not use_tree:
        TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
        LABEL = data.LabelField(batch_first=True)
        TREE = None
        fields = {'sentence1': ('premise', TEXT),
                  'sentence2': ('hypothesis', TEXT),
                  'gold_label': ('label', LABEL)}
    else:
        TEXT = data.Field(batch_first=True,
                          lower=True,
                          preprocessing=lambda parse: [t for t in parse if t not in ('(', ')')],
                          include_lengths=True)
        LABEL = data.LabelField(batch_first=True)
        TREE = data.Field(preprocessing=lambda parse: ['reduce' if t == ')' else 'shift'
                                                       for t in parse if t != '('],
                          batch_first=True)
        TREE.build_vocab([['reduce'], ['shift']])
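    # The source cuts off here; a plausible continuation (an assumption based on
    # the SNLI-style field names above; the tree branch would also need its own
    # fields dict including TREE): load the JSONL splits, build vocabs, and
    # return BucketIterators.
    train, dev, test = data.TabularDataset.splits(
        path=data_path, train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl', test='snli_1.0_test.jsonl',
        format='json', fields=fields)
    TEXT.build_vocab(train, vectors=vectors)
    LABEL.build_vocab(train)
    return data.BucketIterator.splits(
        (train, dev, test), batch_size=batch_size, device=device)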
def main(config):
    device = get_device()
    torch.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True  # make cuDNN deterministic so results are reproducible

    """ Prepare the SST-2 data """
    text_field = data.Field(tokenize='spacy', lower=True,
                            include_lengths=True)
    label_field = data.LabelField(dtype=torch.float)
    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device, config.glove_word_file)

    """ Prepare the word vectors """
    pretrained_embeddings = text_field.vocab.vectors
    pad_idx = text_field.vocab.stoi[text_field.pad_token]
    unk_idx = text_field.vocab.stoi[text_field.unk_token]

    """ Build the model """
    model = TextRNN.TextRNN(len(text_field.vocab), config.glove_word_dim, config.output_dim,
                            config.hidden_size, config.num_layers, config.bidirectional,
                            config.dropout, pad_idx)
    # copy the pretrained vectors into the embedding layer, then re-initialize
    # the <unk> and <pad> rows, which the pretrained vectors do not cover
    model.embedding.weight.data.copy_(pretrained_embeddings)
    model.embedding.weight.data[unk_idx] = torch.rand(config.glove_word_dim)
    model.embedding.weight.data[pad_idx] = torch.rand(config.glove_word_dim)
def __init__(self):
    super().__init__()
    self.text_field = Field(tokenize=super().tokenize, dtype=torch.float64)
    self.label_field = LabelField(dtype=torch.float64)