How to use the reader.ptb_raw_data function in reader

To help you get started, we’ve selected a few examples showing how reader.ptb_raw_data is used in public projects.

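Before the project-specific examples, here is a minimal sketch of the basic call, assuming the reader module that ships with the TensorFlow PTB tutorial (the exact tuple returned varies between forks, as the snippets below show, and DATA_PATH is a placeholder for a directory containing ptb.train.txt, ptb.valid.txt and ptb.test.txt):

import reader  # the PTB reader module from the TensorFlow RNN tutorial

DATA_PATH = "data/ptb"  # placeholder path to the PTB text files

# The upstream tutorial's ptb_raw_data returns four values: word-id lists
# for the train/valid/test splits plus the vocabulary size.
train_data, valid_data, test_data, vocab_size = reader.ptb_raw_data(DATA_PATH)
print(len(train_data), len(valid_data), len(test_data), vocab_size)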

github widiot / tensorflow-practices / lstm / nl-modeling / train.py
def main(_):
    # Raw data
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)

    # Compute the number of training steps needed for one epoch
    train_data_len = len(train_data)  # size of the dataset
    train_batch_len = train_data_len // TRAIN_BATCH_SIZE  # number of batches
    train_epoch_size = (train_batch_len - 1) // TRAIN_NUM_STEP  # training steps in this epoch

    valid_data_len = len(valid_data)
    valid_batch_len = valid_data_len // EVAL_BATCH_SIZE
    valid_epoch_size = (valid_batch_len - 1) // EVAL_NUM_STEP

    test_data_len = len(test_data)
    test_batch_len = test_data_len // EVAL_BATCH_SIZE
    test_epoch_size = (test_batch_len - 1) // EVAL_NUM_STEP

    # Create the data queues; this must be done before starting the threads
    train_queue = reader.ptb_producer(train_data, train_model.batch_size,
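
As a sanity check on the epoch-size arithmetic in the snippet above, the same computation can be run on its own; a minimal sketch with assumed constants (TRAIN_BATCH_SIZE and TRAIN_NUM_STEP here are illustrative values, not the project's):

train_data_len = 929589          # roughly the token count of the PTB training set
TRAIN_BATCH_SIZE = 20            # assumed batch size
TRAIN_NUM_STEP = 35              # assumed truncated-BPTT length

train_batch_len = train_data_len // TRAIN_BATCH_SIZE        # length of each of the 20 rows
train_epoch_size = (train_batch_len - 1) // TRAIN_NUM_STEP  # training steps per epoch
print(train_batch_len, train_epoch_size)                    # 46479 1327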
github ujjax / fast-slow-lstm / main.py
        loss = criterion(outputs.view(-1, model.vocab_size), tt)
        costs += loss.data[0] * model.num_steps
        iters += model.num_steps

        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
            for p in model.parameters():
                p.data.add_(-lr, p.grad.data)
            if step % (epoch_size // 10) == 10:
                print("{} perplexity: {:8.2f} speed: {} wps".format(step * 1.0 / epoch_size, np.exp(costs / iters),
                                  iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)

if __name__ == "__main__":
    raw_data = reader.ptb_raw_data(data_path=args.data_path)
    train_data, valid_data, test_data, word_to_id, id_to_word = raw_data
    vocab_size = len(word_to_id)
    print('Vocabulary size: {}'.format(vocab_size))
    model = PTB_Model(embedding_dim=args.hidden_size, num_steps=args.num_steps, batch_size=args.batch_size,
                      vocab_size=vocab_size, num_layers=args.num_layers, dp_keep_prob=args.keep_prob)
    model.cuda()
    lr = args.lr_start
    # decay factor for learning rate
    lr_decay_base = args.lr_decay_rate
    # we will not touch lr for the first m_flat_lr epochs
    m_flat_lr = 14.0

    print("########## Training ##########################")

    for epoch in range(args.max_max_epoch):
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
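
The schedule above keeps the learning rate flat for the first m_flat_lr epochs and then decays it geometrically; a standalone sketch with assumed values (lr_start and the decay base are placeholders, not the project's defaults):

lr_start = 1.0        # assumed initial learning rate
lr_decay_base = 0.85  # assumed per-epoch decay factor
m_flat_lr = 14.0      # learning rate is left untouched for the first 14 epochs

for epoch in range(20):
    lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
    lr = lr_start * lr_decay
    print(epoch, round(lr, 4))  # 1.0 through epoch 14, then 0.85, 0.7225, ...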
github jingli9111 / RUM-Tensorflow / ptb_task.py
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    config       = configs.get_config(FLAGS.model)
    eval_config  = configs.get_config(FLAGS.model)
    valid_config = configs.get_config(FLAGS.model)
    print(config.batch_size)
    eval_config.batch_size = 20
    valid_config.batch_size = 20
   
    raw_data = reader.ptb_raw_data(FLAGS.data_path + config.dataset + '/')
    train_data, valid_data, test_data, _ = raw_data

    if not os.path.exists(os.path.dirname(FLAGS.save_path)):
        try:
            os.makedirs(os.path.dirname(FLAGS.save_path))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data, name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
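
The try/except errno.EEXIST block above is the classic race-safe directory-creation idiom from Python 2; on Python 3.2+ the same effect is available directly (a sketch, with a hypothetical save path):

import os

save_path = "./checkpoints/run1/model"  # hypothetical value for FLAGS.save_path
# Equivalent to the try/except errno.EEXIST pattern, on Python 3.2+:
os.makedirs(os.path.dirname(save_path), exist_ok=True)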
github JianGoForIt / YellowFin / parsing / train.py
def train():
  print('data_path: %s' % FLAGS.data_path)
  raw_data = reader.ptb_raw_data(FLAGS.data_path)
  train_data, valid_data, valid_nbest_data, vocab = raw_data
  train_data = chop(train_data, vocab['<eos>'])
  
  config = MediumConfig()
  if FLAGS.init_scale: config.init_scale = FLAGS.init_scale
  if FLAGS.learning_rate: config.learning_rate = FLAGS.learning_rate
  if FLAGS.max_grad_norm: config.max_grad_norm = FLAGS.max_grad_norm
  if FLAGS.num_layers: config.num_layers = FLAGS.num_layers
  if FLAGS.num_steps: config.num_steps = FLAGS.num_steps
  if FLAGS.hidden_size: config.hidden_size = FLAGS.hidden_size
  if FLAGS.max_epoch: config.max_epoch = FLAGS.max_epoch
  if FLAGS.max_max_epoch: config.max_max_epoch = FLAGS.max_max_epoch
  if FLAGS.keep_prob: config.keep_prob = FLAGS.keep_prob
  if FLAGS.lr_decay: config.lr_decay = FLAGS.lr_decay
  if FLAGS.batch_size: config.batch_size = FLAGS.batch_size
  if FLAGS.opt_method: config.opt_method = FLAGS.opt_method
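
The block of one-per-flag overrides above can also be expressed as a loop over attribute names; a hedged sketch (FLAGS and config are the objects from the function above, the flag names are taken from the snippet, and the truthiness test mirrors the original if FLAGS.x: checks, so a flag explicitly set to 0 would still be ignored):

OVERRIDABLE = ["init_scale", "learning_rate", "max_grad_norm", "num_layers",
               "num_steps", "hidden_size", "max_epoch", "max_max_epoch",
               "keep_prob", "lr_decay", "batch_size", "opt_method"]

for name in OVERRIDABLE:
    value = getattr(FLAGS, name, None)
    if value:  # falsy values (None, 0, 0.0) leave the config default in place
        setattr(config, name, value)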
github manuwhs / Trapyng / Examples / 4.1 BBB_LSTM / 1. main_BBB_LSTM.py
############## Load the DATA #############################################
###########################################################################

"""
We will load the data into RAM, but we will wait until we build the graph
to transform it into TensorFlow elements and divide it into batches with XXXX

"""
data_to_use = "aritificial"  #  ptb  aritificial

if (data_to_use == "ptb" ):
    model_select= "small"  # test small
    data_path = "../data"
    
    # Read the words from 3 documents and convert them to ids with a vocabulary
    raw_data = reader.ptb_raw_data(data_path)
    """
    Raw data contains 3 lists of a lot of words:
        - [0]: List of ids of the words for train
        - [1]: List of ids of the words for validation
        - [3]: List of ids of the words for validation
        - [4]: Number of words in the vocabulary.
    """
    
    train_data, valid_data, test_data, word_to_id, _ = raw_data
    # Create dictionary from ids to words.
    id_to_word = np.array(list(word_to_id.keys()))
    print (["Most common words: ", id_to_word[0:5]])

    # Create the objects with the hyperparameters that will be fed to the network
    train_config = Bconf.get_config(model_select,mixing_pi,prior_log_sigma1,prior_log_sigma2 )
    eval_config = Bconf.get_config(model_select,mixing_pi,prior_log_sigma1,prior_log_sigma2 )
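
Note that building id_to_word from word_to_id.keys() assumes the dictionary iterates in id order; an order-safe sketch that inverts the mapping explicitly (the toy word_to_id below is made up):

word_to_id = {"the": 0, "cat": 1, "sat": 2}      # toy vocabulary
id_to_word = [None] * len(word_to_id)
for word, idx in word_to_id.items():
    id_to_word[idx] = word                       # place each word at its id
print(id_to_word)                                # ['the', 'cat', 'sat']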
github Sunnydreamrain / IndRNN_Theano_Lasagne / cPTB / penntree_charlevel_rernn.py
def get_raw_data(dataset='ptb',data_path='data/'):
  raw_data = ptb_raw_data(data_path,filename=name_dataset)
  return raw_data
train_data, valid_data, test_data, _ = get_raw_data('ptb')
github lverwimp / tf-lm / scripts / word_lm_rescore_nbest.py
else:
		config = configuration.get_config(FLAGS.config)

	fout = file(config['log'],'w')
	sys.stdout = writer(sys.stdout, fout)

	print('configuration:')
	for par,value in config.iteritems():
		print('{0}\t{1}'.format(par, value))

	eval_config = config.copy() # same parameters for evaluation, except for:
	eval_config['batch_size'] = 1 # batch_size
	eval_config['num_steps'] = 1 # and number of steps

	# hypotheses = list of all hypotheses in n-best list
	all_data, id_to_word, total_length, hypotheses = reader.ptb_raw_data(config)

	# if processing per sentence
	if 'per_sentence' in config:
		# set num_steps = total length of each (padded) sentence
		config['num_steps'] = total_length
		# vocab is expanded with  and padding symbol @
		config['vocab_size'] = len(id_to_word)
		eval_config['vocab_size'] = len(id_to_word)
		debug('vocabulary size: {0}\n'.format(config['vocab_size']))

	with tf.Graph().as_default():

		with tf.name_scope("Test"):
			test_hypotheses = wordInput(config=eval_config, data=hypotheses, name="Hypotheses")
			with tf.variable_scope("Model", reuse=None):
				mtest = wordLM(is_training=False, config=eval_config, input_=test_hypotheses)
github tensorflow / models / tutorials / rnn / ptb / ptb_word_lm.py
def main(_):
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to PTB data directory")
  gpus = [
      x.name for x in device_lib.list_local_devices() if x.device_type == "GPU"
  ]
  if FLAGS.num_gpus > len(gpus):
    raise ValueError(
        "Your machine has only %d gpus "
        "which is less than the requested --num_gpus=%d."
        % (len(gpus), FLAGS.num_gpus))

  raw_data = reader.ptb_raw_data(FLAGS.data_path)
  train_data, valid_data, test_data, _ = raw_data

  config = get_config()
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)

    with tf.name_scope("Train"):
      train_input = PTBInput(config=config, data=train_data, name="TrainInput")
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = PTBModel(is_training=True, config=config, input_=train_input)
      tf.summary.scalar("Training Loss", m.cost)
github cdg720 / emnlp2016 / train.py
def train():
  print('data_path: %s' % FLAGS.data_path)
  raw_data = reader.ptb_raw_data(FLAGS.data_path)
  train_data, valid_data, valid_nbest_data, vocab = raw_data
  train_data = chop(train_data, vocab['<eos>'])
  
  config = MediumConfig()
  if FLAGS.init_scale: config.init_scale = FLAGS.init_scale
  if FLAGS.learning_rate: config.learning_rate = FLAGS.learning_rate
  if FLAGS.max_grad_norm: config.max_grad_norm = FLAGS.max_grad_norm
  if FLAGS.num_layers: config.num_layers = FLAGS.num_layers
  if FLAGS.num_steps: config.num_steps = FLAGS.num_steps
  if FLAGS.hidden_size: config.hidden_size = FLAGS.hidden_size
  if FLAGS.max_epoch: config.max_epoch = FLAGS.max_epoch
  if FLAGS.max_max_epoch: config.max_max_epoch = FLAGS.max_max_epoch
  if FLAGS.keep_prob: config.keep_prob = FLAGS.keep_prob
  if FLAGS.lr_decay: config.lr_decay = FLAGS.lr_decay
  if FLAGS.batch_size: config.batch_size = FLAGS.batch_size
  config.vocab_size = len(vocab)