train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
train_data = data.Dataset(train_examples, datafields)

test_df = self.get_pandas_df(test_file)
test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
test_data = data.Dataset(test_examples, datafields)

# If validation file exists, load it. Otherwise get validation data from training data
if val_file:
    val_df = self.get_pandas_df(val_file)
    val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
    val_data = data.Dataset(val_examples, datafields)
else:
    train_data, val_data = train_data.split(split_ratio=0.8)

TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
self.word_embeddings = TEXT.vocab.vectors
self.vocab = TEXT.vocab

self.train_iterator = data.BucketIterator(
    train_data,
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.text),
    repeat=False,
    shuffle=True)

self.val_iterator, self.test_iterator = data.BucketIterator.splits(
    (val_data, test_data),
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.text),
    repeat=False,
    shuffle=False)
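
# Usage sketch (not from the original repo): later in the same class one would typically
# consume the iterators built above like this; the attribute names `text` and `label`
# are assumptions about the fields in `datafields`.
for batch in self.train_iterator:
    inputs = batch.text    # padded LongTensor of token ids for this bucket
    targets = batch.label  # whatever the label field produced
    # forward pass / loss / backward step would go here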
def __init__(self, w2v_file):
    """
    Initializes RNNReviewClassifier
    :param w2v_file: embedding file
    """
    vectors = Vectors(w2v_file)
    self.vectors = vectors
def load_data(self, train_file, val_file, glove_dir):
    # Load the saved examples
    train_dataset = torch.load(train_file)
    train_examples = train_dataset['examples']
    val_dataset = torch.load(val_file)
    val_examples = val_dataset['examples']

    # Build torchtext Dataset objects
    fields = [('src', self.src_field), ('trg', self.trg_field), ('feat', self.src_feat_field)]
    train_dataset = data.Dataset(fields=fields, examples=train_examples)
    val_dataset = data.Dataset(fields=fields, examples=val_examples)

    # Load GloVe vectors
    vec = vocab.Vectors(os.path.join(glove_dir, "glove.6B.{}d.txt".format(config.word_embedding_size)))

    # Build the field vocabularies
    self.src_field.build_vocab(train_dataset, vectors=vec, max_size=config.in_vocab_size)
    self.trg_field.build_vocab(train_dataset, vectors=vec, max_size=config.out_vocab_size)
    self.src_feat_field.build_vocab(train_dataset, vectors=vec, max_size=config.out_vocab_size)

    src_vocab, trg_vocab, src_feat_vocab = self.generate_vocabs()
    vocabs = {'src_vocab': src_vocab, 'trg_vocab': trg_vocab, 'src_feat_vocab': src_feat_vocab}
    return train_dataset, val_dataset, vocabs
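
# Hypothetical call site for load_data above; the instance name `loader`, the file paths,
# and the '<pad>' lookup are assumptions (assuming generate_vocabs returns the fields'
# torchtext Vocab objects).
train_ds, val_ds, vocabs = loader.load_data('train_examples.pt', 'val_examples.pt', './glove')
pad_idx = vocabs['src_vocab'].stoi['<pad>']      # token-to-index lookup on the built vocab
emb_weights = loader.src_field.vocab.vectors     # (vocab_size, word_embedding_size) tensor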
logger = logging.getLogger(__name__)


class FastText(vocab.Vectors):
    url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/'

    def __init__(self, suffix='wiki-news-300d-1M.vec.zip', **kwargs):
        url = self.url_base + suffix
        base, ext = os.path.splitext(suffix)
        name = suffix if ext == '.vec' else base
        super(FastText, self).__init__(name, url=url, **kwargs)
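
# Sketch of plugging the FastText subclass above into a Field's vocabulary; the Field
# and `train_data` are assumed, not taken from the original source.
TEXT = data.Field(lower=True, batch_first=True)
# ... build a torchtext Dataset `train_data` over TEXT ...
TEXT.build_vocab(train_data, vectors=FastText(suffix='wiki-news-300d-1M.vec.zip'))
embedding_matrix = TEXT.vocab.vectors   # rows aligned with TEXT.vocab.itos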
class FastTextBinary(vocab.Vectors):
    url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.zip'
    name_base = 'wiki.{}.bin'

    def __init__(self, language='en', cache=None):
        """
        Arguments:
            language: Language of fastText pre-trained embedding model
            cache: directory for cached model
        """
        cache = os.path.expanduser(cache)
        url = FastTextBinary.url_base.format(language)
        name = FastTextBinary.name_base.format(language)
        self.cache(name, cache, url=url)
def get_vectors(self, path: str):
    logger.info('loading vectors from {}'.format(path))
    vectors = Vectors(path)
    logger.info('successfully loaded vectors')
    return vectors
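
# Hypothetical call (the instance name and path are example values): a Vectors object
# exposes .dim, .stoi, and per-token lookup via indexing, with unk_init used for
# out-of-vocabulary tokens.
vecs = helper.get_vectors('./embeddings/glove.6B.300d.txt')
print(vecs.dim)           # embedding dimensionality
print(vecs['language'])   # 1-D tensor for a single token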
import torchtext.vocab as vocab
from os import getenv

from torch import tensor
from pyspark.sql.types import ArrayType
from pyspark.sql.types import DoubleType

# Load environment variables
MASTER_URL = 'local[*]'
APPLICATION_NAME = 'preprocessor'
MORPHL_SERVER_IP_ADDRESS = getenv('MORPHL_SERVER_IP_ADDRESS')
MORPHL_CASSANDRA_USERNAME = getenv('MORPHL_CASSANDRA_USERNAME')
MORPHL_CASSANDRA_PASSWORD = getenv('MORPHL_CASSANDRA_PASSWORD')
MORPHL_CASSANDRA_KEYSPACE = getenv('MORPHL_CASSANDRA_KEYSPACE')

# Load the word embeddings tensor
embedding = vocab.Vectors(name="/opt/glove/glove.6B.100d.txt", cache='/opt/glove')

# Function that returns a DataFrame from a Cassandra table
def fetch_from_cassandra(c_table_name, spark_session):
    load_options = {
        'keyspace': MORPHL_CASSANDRA_KEYSPACE,
        'table': c_table_name,
        'spark.cassandra.input.fetch.size_in_rows': '150'}
    df = (spark_session.read.format('org.apache.spark.sql.cassandra')
          .options(**load_options)
          .load())
    return df
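
# Assumed usage: building the SparkSession that fetch_from_cassandra expects, wired to
# the Cassandra settings loaded above. The connector options and the table name are
# illustrative, not from the original source.
from pyspark.sql import SparkSession

spark_session = (SparkSession.builder
                 .master(MASTER_URL)
                 .appName(APPLICATION_NAME)
                 .config('spark.cassandra.connection.host', MORPHL_SERVER_IP_ADDRESS)
                 .config('spark.cassandra.auth.username', MORPHL_CASSANDRA_USERNAME)
                 .config('spark.cassandra.auth.password', MORPHL_CASSANDRA_PASSWORD)
                 .getOrCreate())

some_table_df = fetch_from_cassandra('some_table', spark_session)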
def get_iters(self, train_batch_size, fold_num, vec_name, vec_cache):
    # Load data splits
    train, test = data.TabularDataset.splits(path="./data/fold_{}".format(fold_num), train="train.tsv",
                                             test="test.tsv", format="tsv",
                                             fields=[("TEXT", self.text_doc), ("ENTITY", self.entity_doc),
                                                     ("LABEL", self.label_doc),
                                                     ("OFFSET", self.offset_doc),
                                                     ("LENGTH", self.length_doc),
                                                     ("WORD_ATTN", self.word_attn_doc),
                                                     ("SENT_ATTN", self.sent_attn_doc),
                                                     ("DOC_ID", self.doc_id)])

    # First load vectors
    vector = Vectors(name=vec_name, cache=vec_cache)

    # Build vocabs
    self.text_doc.build_vocab(train, test, vectors=vector)
    self.entity_doc.build_vocab(train, test)
    self.label_doc.build_vocab(train, test)

    # Get iterators
    train_iter, test_iter = data.BucketIterator.splits((train, test),
                                                       sort=False, batch_sizes=(train_batch_size, 2),
                                                       repeat=True)
    train_iter.shuffle = True
    return train_iter, test_iter
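
# Hypothetical training-loop fragment: because the iterators were built with repeat=True,
# train_iter cycles indefinitely, so the loop is bounded by a step count. The instance
# name `corpus` and the argument values are assumptions.
train_iter, test_iter = corpus.get_iters(train_batch_size=32, fold_num=0,
                                         vec_name='glove.6B.300d.txt', vec_cache='.vector_cache')
for step, batch in enumerate(train_iter):
    if step >= 1000:
        break
    tokens = batch.TEXT    # attribute names match the field names passed to TabularDataset.splits
    labels = batch.LABEL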
# Keep only the mapped word/char id columns (plus the label) for training
self.train_df = train_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid', 'label']]

# Map the test-set question ids to their word/char id sequences via question_df
test_df['q1_wid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['wid'])
test_df['q2_wid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['wid'])
test_df['q1_cid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['cid'])
test_df['q2_cid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['cid'])
self.test_df = test_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid']]

# Load pre-trained word and character vectors into a local cache directory
self.word_embedding_path = word_path
self.char_embedding_path = char_path
cache = '../cache'
if not os.path.exists(cache):
    os.mkdir(cache)
self.word_vectors = Vectors(self.word_embedding_path, cache)
self.char_vectors = Vectors(self.char_embedding_path, cache)

# Initialize out-of-vocabulary embeddings uniformly in [-0.05, 0.05]
self.word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
self.char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)

# Fields for word-level text, char-level text, and the (non-sequential) label
self.wordTEXT = data.Field(batch_first=True)
self.charTEXT = data.Field(batch_first=True)
self.LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

# Build vocabularies over both splits and copy the pre-trained vectors in
train_dataset = self.generate_dataset()
test_dataset = self.generate_dataset(role='test')
self.wordTEXT.build_vocab(train_dataset, test_dataset, min_freq=1, vectors=self.word_vectors)
self.charTEXT.build_vocab(train_dataset, test_dataset, min_freq=1, vectors=self.char_vectors)
self.word_embedding = self.wordTEXT.vocab.vectors
self.char_embedding = self.charTEXT.vocab.vectors
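
# A common next step (assumed here, not shown in the original snippet): wrap the copied
# embedding matrices in nn.Embedding layers so the model can look tokens up by id.
import torch.nn as nn

word_emb = nn.Embedding.from_pretrained(self.word_embedding, freeze=False,
                                        padding_idx=self.wordTEXT.vocab.stoi['<pad>'])
char_emb = nn.Embedding.from_pretrained(self.char_embedding, freeze=False,
                                        padding_idx=self.charTEXT.vocab.stoi['<pad>'])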
def get_features(self, dataset):
    dataset = remove_neutral(dataset)
    vectors = Vectors(dataset.word_embeddings)
    self.get_labels(dataset)

    # Wrap the raw text/label columns in torchtext Examples
    fields = {'text': ('text', self.text_field), 'label': ('label', self.label_field)}
    text = dataset.data_table[dataset.text_column].to_numpy()
    labels = dataset.data_table['label'].to_numpy()
    examples = [Example.fromdict(
        data={'text': text[x], 'label': labels[x]}, fields=fields) for x in range(labels.shape[0])]
    torch_dataset = TorchtextDataset(examples, {'text': self.text_field, 'label': self.label_field})

    # Build the vocabularies only if they have not been built yet
    try:
        self.text_field.vocab
    except AttributeError:
        self.text_field.build_vocab(torch_dataset, vectors=vectors)
        self.label_field.build_vocab(torch_dataset)

    loader = BucketIterator(torch_dataset, batch_size=25)
    return loader
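
# Sketch of consuming the loader returned by get_features; the instance name `featurizer`
# is an assumption, and batch attribute names follow the fields dict built above.
train_loader = featurizer.get_features(dataset)
for batch in train_loader:
    text_batch = batch.text     # token-id tensor produced by text_field
    label_batch = batch.label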