How to use the torchtext.data.LabelField class

To help you get started, we’ve selected a few torchtext examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nhatsmrt / nn-toolbox / test_lm.py View on Github external
from nntoolbox.sequence.models import LanguageModel
from nntoolbox.sequence.learner import LanguageModelLearner
from nntoolbox.sequence.components import AdditiveContextEmbedding
from nntoolbox.sequence.utils import load_embedding
from torch import nn
from torch.optim import Adam
import torch
from nntoolbox.callbacks import *
from nntoolbox.metrics import *


# Hyperparameters for the language-model experiment.
MAX_VOCAB_SIZE = 25000  # NOTE(review): not referenced in the visible code -- confirm it is used later
BATCH_SIZE = 16

# TEXT tokenizes with spaCy; LABEL holds float-typed labels.
# NOTE(review): LABEL is not used below -- presumably left over from a
# classification setup; confirm before removing.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
#     print(tmp)


# Load the WikiText2 corpus splits and build a BPTT iterator that yields
# contiguous text windows of length 35 for language modelling.
# NOTE(review): `data`, `WikiText2` and `get_device` are not imported in the
# visible snippet -- they must come from imports elsewhere in the file.
train_data, val_data, test_data = WikiText2.splits(TEXT)
train_iterator = data.BPTTIterator(
    train_data,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device(),
    bptt_len=35,
    shuffle=True
)
github lrank / Robust_and_Privacy_preserving_Text_Representations / Pytorch / postag / baseline_model_torchtext.py View on Github external
# Training hyperparameters.
LEARNING_RATE = 1e-3
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROUPOUT = 0.5  # NOTE(review): typo for DROPOUT -- kept as-is because other code may reference this name
NUM_EPOCHS = 20


####################################
#          Preparing Data          #
####################################
# 1. data.Field()
# NOTE(review): pad_token and unk_token are both '' here, so padding and
# unknown words share a single vocabulary entry -- confirm this is intended.
TEXT = data.Field(include_lengths=True, pad_token='', unk_token='')
TAG_LABEL = data.LabelField()     # POS-tag target
AGE_LABEL = data.LabelField()     # age attribute target
GENDER_LABEL = data.LabelField()  # gender attribute target

# 2. data.TabularDataset -- read train/test CSVs; column order must match `fields`.
train_data, test_data = data.TabularDataset.splits(path=TrustPilot_processed_dataset_path,
                                                   train="train.csv",
                                                   test="test.csv",
                                                   fields=[('text', TEXT), ('tag_label', TAG_LABEL),
                                                           ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
                                                   format="csv")

# 3. Split train_data to train_data, valid_data
# NOTE(review): random.seed(SEED) returns None, so random_state is effectively
# None here; the seeding only happens as a side effect of the call. Confirm
# this matches the intended torchtext split(random_state=...) usage.
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}\n".format(len(test_data)))
github lrank / Robust_and_Privacy_preserving_Text_Representations / Pytorch / postag / baseline_model_torchtext.py View on Github external
# Training hyperparameters.
# NOTE(review): this fragment repeats an earlier excerpt from the same file
# (the page shows the snippet twice); the comments below mirror that section.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROUPOUT = 0.5  # NOTE(review): typo for DROPOUT -- kept as-is because other code may reference this name
NUM_EPOCHS = 20


####################################
#          Preparing Data          #
####################################
# 1. data.Field()
# NOTE(review): pad_token and unk_token are both '' here, so padding and
# unknown words share a single vocabulary entry -- confirm this is intended.
TEXT = data.Field(include_lengths=True, pad_token='', unk_token='')
TAG_LABEL = data.LabelField()     # POS-tag target
AGE_LABEL = data.LabelField()     # age attribute target
GENDER_LABEL = data.LabelField()  # gender attribute target

# 2. data.TabularDataset -- read train/test CSVs; column order must match `fields`.
train_data, test_data = data.TabularDataset.splits(path=TrustPilot_processed_dataset_path,
                                                   train="train.csv",
                                                   test="test.csv",
                                                   fields=[('text', TEXT), ('tag_label', TAG_LABEL),
                                                           ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
                                                   format="csv")

# 3. Split train_data to train_data, valid_data
# NOTE(review): random.seed(SEED) returns None, so random_state is effectively
# None here; the seeding only happens as a side effect of the call.
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}\n".format(len(test_data)))

# 4. data.BucketIterator
github songyingxin / TextClassification-Pytorch / Medium-SST / run_TextCNN.py View on Github external
def main(config):
    """Prepare data, embeddings, and a TextCNN model for SST-2 classification.

    :param config: namespace carrying seed, sequence_length, data_path,
        batch_size, glove_word_file, glove_word_dim, filter_num, filter_sizes,
        output_dim and dropout (attributes read below).

    NOTE(review): the snippet appears truncated -- the training/evaluation
    code that would follow the embedding initialization is not visible.
    """

    device = get_device()

    torch.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True  # use deterministic cuDNN algorithms so every run gives the same result

    # --- SST-2 data preparation ---
    """ sst2 数据准备 """
    text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True, fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.float)  # float labels -- presumably for a BCE-style loss; confirm

    train_iterator, dev_iterator, test_iterator = load_sst2(config.data_path, text_field, label_field, config.batch_size, device, config.glove_word_file)

    # --- pretrained word-vector lookup ---
    """ 词向量准备 """
    pretrained_embeddings = text_field.vocab.vectors
    pad_idx = text_field.vocab.stoi[text_field.pad_token]
    unk_idx = text_field.vocab.stoi[text_field.unk_token]

    # --- model construction ---
    """ 模型准备 """
    # filter_sizes is a whitespace-separated string in config, e.g. "3 4 5".
    filter_sizes = [int(val) for val in config.filter_sizes.split()]
    model = TextCNN.TextCNN(
        len(text_field.vocab), config.glove_word_dim, config.filter_num, filter_sizes,
        config.output_dim, config.dropout, pad_idx)

    # copy the pretrained vectors into the model's embedding layer
    model.embedding.weight.data.copy_(pretrained_embeddings)
github mttk / rnn-classifier / datasets.py View on Github external
def make_imdb(batch_size, device=-1, vectors=None):
  """Build IMDB sentiment train/test iterators and their torchtext fields.

  :param batch_size: number of examples per batch.
  :param device: device passed to BucketIterator (-1 means CPU in legacy torchtext).
  :param vectors: optional pretrained word vectors for TEXT's vocabulary.
  :return: ((train_iter, test_iter), TEXT, LABEL)
  """
  TEXT = data.Field(include_lengths=True, lower=True)
  LABEL = data.LabelField()
  train, test = datasets.IMDB.splits(TEXT, LABEL)

  # Fix: the original passed an undefined name `val` to both build_vocab
  # calls (NameError at runtime). IMDB.splits returns only train/test,
  # so the vocabularies are built from those two splits.
  TEXT.build_vocab(train, test, vectors=vectors, max_size=30000)
  LABEL.build_vocab(train, test)
  train_iter, test_iter = data.BucketIterator.splits(
              (train, test), batch_size=batch_size, device=device, repeat=False)

  return (train_iter, test_iter), TEXT, LABEL
github NLPatVCU / medinify / medinify / sentiment / rnn_review_classifier.py View on Github external
def generate_data_loaders(self, train_data, train_target, valid_data, valid_target, batch_size):
        """
        This function generates TorchText dataloaders for training and validation datasets
        :param train_data: training dataset (list of comment string)
        :param valid_data: validation dataset (list of comment string)
        :param train_target: training data's associated ratings (list of 'pos' and 'neg')
        :param valid_target: validation data's associated ratings (list of 'pos' and 'neg')
        :param batch_size: the loaders' batch sizes
        :return: train data loader and validation data loader
        """

        # create TorchText fields (stored on self so later code can reuse
        # their vocabularies)
        self.comment_field = data.Field(tokenize = 'spacy', include_lengths = True)
        self.rating_field = data.LabelField(dtype=torch.float)

        # iterate through dataset and generate examples with comment_field and rating_field
        train_examples = []
        valid_examples = []
        for i in range(len(train_data)):
            comment = train_data[i]
            rating = train_target[i]
            review = {'comment': comment, 'rating': rating}
            ex = Example.fromdict(data=review,
                                  fields={'comment': ('comment', self.comment_field),
                                          'rating': ('rating', self.rating_field)})
            train_examples.append(ex)

        # NOTE(review): the snippet is truncated here -- this validation loop is
        # incomplete (the Example construction, dataset/iterator creation using
        # batch_size, and the return statement are not visible).
        for i in range(len(valid_data)):
            comment = valid_data[i]
            rating = valid_target[i]
github Stark-JC / code-for-nlp-beginner / Task3-Natural Language Inference / util.py View on Github external
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None, use_tree=False):
    """Build torchtext fields for an SNLI-style NLI dataset.

    :param use_tree: when True, also build a TREE field that converts a
        binary-parse token stream into shift/reduce transitions.

    NOTE(review): the snippet is truncated -- the dataset loading and
    iterator construction implied by the name are not visible here.
    """
    if not use_tree:
        TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
        LABEL = data.LabelField(batch_first=True)
        TREE = None

        # JSON keys of the raw SNLI examples mapped to (field name, Field).
        fields = {'sentence1': ('premise', TEXT),
                  'sentence2': ('hypothesis', TEXT),
                  'gold_label': ('label', LABEL)}
    else:
        # Tree mode: strip the parentheses of the binary parse from the text...
        TEXT = data.Field(batch_first=True,
                          lower=True,
                          preprocessing=lambda parse: [t for t in parse if t not in ('(', ')')],
                          include_lengths=True)
        LABEL = data.LabelField(batch_first=True)
        # ...and turn the same parse into shift-reduce transitions:
        # ')' becomes 'reduce', every other non-'(' token becomes 'shift'.
        TREE = data.Field(preprocessing=lambda parse: ['reduce' if t == ')' else 'shift' for t in parse if t != '('],
                          batch_first=True)

        # Fix the TREE vocabulary to exactly the two transition symbols.
        TREE.build_vocab([['reduce'], ['shift']])
github songyingxin / TextClassification-Pytorch / Medium-SST / run_TextRNN.py View on Github external
def main(config):
    """Prepare data, embeddings, and a TextRNN model for SST-2 classification.

    :param config: namespace carrying seed, data_path, batch_size,
        glove_word_file, glove_word_dim, output_dim, hidden_size, num_layers,
        bidirectional and dropout (attributes read below).

    NOTE(review): the snippet appears truncated -- the training/evaluation
    code that would follow the embedding initialization is not visible.
    """

    device = get_device()

    torch.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True  # use deterministic cuDNN algorithms so every run gives the same result

    # --- SST-2 data preparation ---
    """ sst2 数据准备 """
    text_field = data.Field(tokenize='spacy', lower=True,
                            include_lengths=True)
    label_field = data.LabelField(dtype=torch.float)  # float labels -- presumably for a BCE-style loss; confirm

    train_iterator, dev_iterator, test_iterator = load_sst2(
        config.data_path, text_field, label_field, config.batch_size, device, config.glove_word_file)

    # --- pretrained word-vector lookup ---
    """ 词向量准备 """
    pretrained_embeddings = text_field.vocab.vectors
    pad_idx = text_field.vocab.stoi[text_field.pad_token]
    unk_idx = text_field.vocab.stoi[text_field.unk_token]

    # --- model construction ---
    """ 模型准备 """
    model = TextRNN.TextRNN(len(text_field.vocab), config.glove_word_dim, config.output_dim, config.hidden_size, config.num_layers, config.bidirectional, config.dropout, pad_idx)

    # copy the pretrained vectors into the embedding layer; then re-randomize
    # the <unk> and <pad> rows
    # NOTE(review): randomizing the pad row is unusual (it is often zeroed,
    # and the pad_idx passed to the model typically freezes it) -- confirm.
    model.embedding.weight.data.copy_(pretrained_embeddings)
    model.embedding.weight.data[unk_idx] = torch.rand(config.glove_word_dim)
    model.embedding.weight.data[pad_idx] = torch.rand(config.glove_word_dim)
def __init__(self):
        # Initialize the parent processor, then define the torchtext fields
        # this dataloader processor uses.
        super().__init__()
        # Text is tokenized by the parent class's tokenize method.
        # NOTE(review): dtype=torch.float64 on the *text* Field is unusual
        # (token indices are normally integral) -- confirm this is intended.
        self.text_field = Field(tokenize=super().tokenize, dtype=torch.float64)
        self.label_field = LabelField(dtype=torch.float64)