Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _imagenet_preprocess(rgb):
    """Convert an RGB image in [0, 1] to mean-subtracted BGR in [0, 255].

    Args:
        rgb: Float tensor whose axis 3 holds the colour channels in R, G, B
            order (presumably a [batch, height, width, 3] batch -- TODO
            confirm against callers).

    Returns:
        A tensor with the channels reordered to B, G, R, scaled by 255 and
        with ``IMAGENET_MEAN_BGR`` subtracted.
    """
    # TensorFlow >= 1.0 takes tf.split(value, num_or_size_splits, axis) and
    # tf.concat(values, axis); the previous positional order (axis first) was
    # the pre-1.0 signature. Keywords make the intent explicit.
    red, green, blue = tf.split(value=rgb * 255.0, num_or_size_splits=3, axis=3)
    bgr = tf.concat(values=[blue, green, red], axis=3)
    return bgr - IMAGENET_MEAN_BGR
def inference(self):
"""main computation graph here:
#1.Word embedding. 2.Encoder with GRU 3.Decoder using GRU(optional with attention)."""
# Builds the encoder half of the graph and prepares the decoder inputs: embedding lookup,
# a forward and a reversed GRU pass, their per-step concatenation, the decoder's initial
# state, and the embedded/split decoder inputs.
# NOTE(review): this excerpt stops before the attention decoder is actually invoked, so the
# method's return value is not visible here.
###################################################################################################################################
# 1. Embed the input token ids.
self.embedded_words = tf.nn.embedding_lookup(self.Embedding,self.input_x) #[None, self.sequence_length, self.embed_size]
# 2. Encode with a GRU run in both directions.
# 2.1 forward pass
hidden_state_forward_list = self.gru_forward(self.embedded_words,self.gru_cell) # a list,length is sentence_length, each element is [batch_size,hidden_size]
# 2.2 backward pass: the same helper is called with reverse=True.
# NOTE(review): both passes are handed the same self.gru_cell; whether the two directions
# end up sharing weights depends on gru_forward, which is not visible here -- TODO confirm.
hidden_state_backward_list = self.gru_forward(self.embedded_words,self.gru_cell,reverse=True) # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
# 2.3 Concatenate the forward and backward hidden state of each step along axis 1.
# Result: a list of length sentence_length whose elements have the two hidden sizes side by side.
thought_vector_list=[tf.concat([h_forward,h_backward],axis=1) for h_forward,h_backward in zip(hidden_state_forward_list,hidden_state_backward_list)]#list,len:sent_len,e:[batch_size,hidden_size*2]
# 3. Decoder using GRU with attention
# Stack the per-step vectors along a new axis 1 so attention can address every encoder step.
thought_vector=tf.stack(thought_vector_list,axis=1) #shape:[batch_size,sentence_length,hidden_size*2]
#initial_state=tf.reduce_sum(thought_vector,axis=1) #[batch_size,hidden_size*2] #TODO: test which is better, summing all states or using a single hidden state.
# The decoder's initial state is a tanh-activated affine projection of element 0 of the backward list.
# NOTE(review): which time step element 0 corresponds to depends on how gru_forward orders its
# outputs when reverse=True -- TODO confirm.
initial_state=tf.nn.tanh(tf.matmul(hidden_state_backward_list[0],self.W_initial_state)+self.b_initial_state) #initial_state:[batch_size,hidden_size*2]. Per the original author this follows the paper.
cell=self.gru_cell_decoder # per the original author: a special cell that, besides the previous hidden state and the current input, also takes a context vector representing the attention result.
output_projection=(self.W_projection,self.b_projection) #W_projection:[self.hidden_size * 2, self.num_classes]; b_projection:[self.num_classes]
# A loop function is only built when self.is_training is falsy; during training it is None.
loop_function = extract_argmax_and_embed(self.Embedding_label,output_projection) if not self.is_training else None #loop function will be used only at testing, not training.
attention_states=thought_vector # same tensor as thought_vector above, i.e. [batch_size,sentence_length,hidden_size*2]
# Embed the decoder input ids, then turn the result into a Python list with one 2-D tensor per decoder step.
decoder_input_embedded=tf.nn.embedding_lookup(self.Embedding_label,self.decoder_input) #[batch_size,self.decoder_sent_length,embed_size]
decoder_input_splitted = tf.split(decoder_input_embedded, self.decoder_sent_length,axis=1) # it is a list,length is decoder_sent_length, each element is [batch_size,1,embed_size]
decoder_input_squeezed = [tf.squeeze(x, axis=1) for x in decoder_input_splitted] # it is a list,length is decoder_sent_length, each element is [batch_size,embed_size]
# Signature of the decoder presumably called next (the call itself is outside this excerpt):
#rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function,attention_states,scope=None):
#input1: decoder_inputs: the target shifted by one. For example, if the target is "X Y Z", decoder_inputs should be "START X Y Z". A list of 2D Tensors [batch_size x input_size].
# NOTE(review): excerpt begins inside a call whose opening lines are not visible; the two
# arguments below are the last entries of a shape list built from char_emb's dimensions 2 and 3.
custom_layers.shape(char_emb, 2),
custom_layers.shape(char_emb, 3)])
# [num_sentences * max_sentence_length, max_word_length, emb]
# Aggregate the characters of each word with the project's CNN helper.
flattened_aggregated_char_emb = custom_layers.cnn(flattened_char_emb, self.filter_widths, self.filter_size)
# [num_sentences * max_sentence_length, emb]
# Restore the (sentence, word) layout; the last dimension is read back from the CNN output.
aggregated_char_emb = tf.reshape(flattened_aggregated_char_emb,
[num_sentences,
max_sentence_length,
custom_layers.shape(flattened_aggregated_char_emb, 1)])
# [num_sentences, max_sentence_length, emb]
text_emb_list.append(aggregated_char_emb)
# Join every embedding collected in text_emb_list along axis 2.
text_emb = tf.concat(text_emb_list, 2)
# NOTE(review): in TF 1.x the second positional argument of tf.nn.dropout is keep_prob, so
# self.lexical_dropout / self.dropout are presumably keep probabilities -- TODO confirm.
text_emb = tf.nn.dropout(text_emb, self.lexical_dropout)
# Boolean mask of valid word positions per sentence, flattened to one dimension.
text_len_mask = tf.sequence_mask(text_len, maxlen=max_sentence_length)
text_len_mask = tf.reshape(text_len_mask, [num_sentences * max_sentence_length])
text_outputs = self.encode_sentences(text_emb, text_len, text_len_mask)
text_outputs = tf.nn.dropout(text_outputs, self.dropout) # [num_sentences * max_sentence_length, emb] (my)
# Look up the row for this document's genre in a [len(self.genres), self.feature_size] variable.
# NOTE(review): the variable is declared float64 -- confirm this matches the dtype of the
# tensors it is later combined with.
genre_emb = tf.gather(tf.get_variable("genre_embeddings",
[len(self.genres), self.feature_size],
dtype=tf.float64),
genre) # [emb]
flattened_text_emb = self.flatten_emb_by_sentence(text_emb, text_len_mask) # [num_words]
# NOTE(review): excerpt ends here; the body of this branch is not visible.
if self.train_on_gold:
# Builds positive and negative image crops plus per-crop copies of the annotations.
# NOTE(review): `features`, `image`, `orig_height`, `orig_width`, `pos_locs`, `neg_locs`,
# `landmarks`, `visibility`, `pose` and `gender` are defined outside this excerpt.
# Number of positive / negative boxes, cast to int32 so they can be used as shape entries.
n_pos_locs = tf.cast(features['n_pos_locs'], tf.int32)
n_neg_locs = tf.cast(features['n_neg_locs'], tf.int32)
# Give the image a leading batch dimension of 1 and three channels, as float32.
image_shape = tf.stack([1,orig_height,orig_width,3])
image = tf.cast(tf.reshape(image,image_shape),tf.float32)
# Reshape the box lists to one row of four values per box.
pos_locs_shape = tf.stack([n_pos_locs,4])
pos_locs = tf.reshape(pos_locs,pos_locs_shape)
neg_locs_shape = tf.stack([n_neg_locs,4])
neg_locs = tf.reshape(neg_locs,neg_locs_shape)
# Crop every box out of the single image (all box indices are 0) and resize each crop to 227x227.
positive_cropped = tf.image.crop_and_resize(image,pos_locs,tf.zeros([n_pos_locs],dtype=tf.int32),[227,227])
negative_cropped = tf.image.crop_and_resize(image,neg_locs,tf.zeros([n_neg_locs],dtype=tf.int32),[227,227])
# Positives first, then negatives, along the batch axis.
all_images = tf.concat([positive_cropped,negative_cropped],axis=0)
# Labels: 1 for each positive crop, 0 for each negative crop.
positive_labels = tf.ones([n_pos_locs])
negative_labels = tf.zeros([n_neg_locs])
# Repeat each annotation once per crop along axis 0; negative crops receive copies of the
# same annotation tensors as the positive ones.
positive_landmarks = tf.tile(landmarks,[n_pos_locs,1])
negative_landmarks = tf.tile(landmarks,[n_neg_locs,1])
positive_visibility = tf.tile(visibility,[n_pos_locs,1])
negative_visibility = tf.tile(visibility,[n_neg_locs,1])
positive_pose = tf.tile(pose,[n_pos_locs,1])
negative_pose = tf.tile(pose,[n_neg_locs,1])
positive_gender = tf.tile(gender,[n_pos_locs,1])
negative_gender = tf.tile(gender,[n_neg_locs,1])
# NOTE(review): excerpt begins on the `else` of a conditional whose `if` is not visible.
else:
batchsize = hp.B2
# Start the previous-max-attention positions as a vector of ones, one entry per batch item.
self.prev_max_attentions = tf.ones(shape=(batchsize,), dtype=tf.int32)
# Convert whatever guided_attention(hp) returns into a tensor.
self.gts = tf.convert_to_tensor(guided_attention(hp))
else: # Synthesize
# At synthesis time all inputs are fed through placeholders.
self.L = tf.placeholder(tf.int32, shape=(None, None))
self.speakers = None
if hp.multispeaker:
self.speakers = tf.placeholder(tf.int32, shape=(None, None))
self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,))
# The Text2Mel sub-graph is built when num == 1 or whenever training is falsy.
if num==1 or (not training):
with tf.variable_scope("Text2Mel"):
# Get S or decoder inputs. (B, T//r, n_mels)
# S is self.mels shifted right by one step along axis 1: an all-zero first frame followed by
# every frame except the last.
self.S = tf.concat((tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)
# Networks
with tf.variable_scope("TextEnc"):
self.K, self.V = TextEnc(hp, self.L, training=training, speaker_codes=self.speakers) # (N, Tx, e)
with tf.variable_scope("AudioEnc"):
self.Q = AudioEnc(hp, self.S, training=training, speaker_codes=self.speakers)
with tf.variable_scope("Attention"):
# R: (B, T/r, 2d)
# alignments: (B, N, T/r)
# max_attentions: (B,)
# The `mononotic_attention` keyword is enabled only when not training.
# NOTE(review): the spelling matches the keyword as written at this call site; the
# definition of Attention is not visible here.
self.R, self.alignments, self.max_attentions = Attention(hp, self.Q, self.K, self.V,
mononotic_attention=(not training),
prev_max_attentions=self.prev_max_attentions)
# NOTE(review): excerpt ends here; the body of this scope is not visible.
with tf.variable_scope("AudioDec"):
# NOTE(review): excerpt begins on the `else` of a conditional whose `if` is not visible; in this
# branch the inputs are used unchanged.
else:
noisy_inputs = inputs
#compute the high level features
hlfeat = self.encoder(
inputs=noisy_inputs,
sequence_lengths=input_seq_length,
is_training=is_training)
#prepend a sequence border label to the targets to get the encoder
#inputs, the label is the last label
# NOTE(review): int(...) on the static shape requires the batch dimension of `targets` to be
# known at graph-construction time -- TODO confirm callers always provide it.
batch_size = int(targets.get_shape()[0])
# One column per batch item filled with the label index self.output_dim-1.
s_labels = tf.constant(self.output_dim-1,
dtype=tf.int32,
shape=[batch_size, 1])
# Place that column in front of the targets along axis 1.
encoder_inputs = tf.concat([s_labels, targets], 1)
#compute the output logits
# The decoder starts from its zero state; its second return value is discarded.
logits, _ = self.decoder(
hlfeat=hlfeat,
encoder_inputs=encoder_inputs,
initial_state=self.decoder.zero_state(batch_size),
first_step=True,
is_training=is_training)
# The returned length is one more than target_seq_length, matching the prepended label.
return logits, target_seq_length + 1
# To RNN and beyond
# Feeds the CNN output through a forward and a backward LSTM stack, then dense layers, and
# defines loss, accuracy and clipped gradients.
# NOTE(review): `conv_3_output_pool` and `losses` are defined outside this excerpt.
with tf.variable_scope('cnn_output'):
self.cnn_output = conv_3_output_pool; # [batch_size,n_timesteps/8,n_features_final]
with tf.variable_scope('multi_rnn_layers'):
with tf.variable_scope('forward'):
# One BasicLSTMCell per entry of self.configure.rnn_units, stacked into a MultiRNNCell.
rnn_cells_forward = [tf.nn.rnn_cell.BasicLSTMCell(num_units=n, activation=self.configure.state_activation) for n in self.configure.rnn_units]
rnn_stack_forward = tf.nn.rnn_cell.MultiRNNCell(rnn_cells_forward)
#rnn_stack_forward = tf.contrib.rnn.DropoutWrapper(rnn_stack_forward, output_keep_prob=self.configure.keep_prob_rnn)
outputs_forward, state_forward = tf.nn.dynamic_rnn(rnn_stack_forward, self.cnn_output, dtype = tf.float32)
with tf.variable_scope('backward'):
# The backward direction is obtained by reversing the input along axis 1 and running a
# second, separately constructed stack over it.
x_backward_ = tf.reverse(self.cnn_output, axis=[1], name='cnn_output_backward_')
rnn_cells_backward = [tf.nn.rnn_cell.BasicLSTMCell(num_units=n, activation=self.configure.state_activation) for n in self.configure.rnn_units]
rnn_stack_backward = tf.nn.rnn_cell.MultiRNNCell(rnn_cells_backward)
#rnn_stack_backward = tf.contrib.rnn.DropoutWrapper(rnn_stack_backward, output_keep_prob=self.configure.keep_prob_rnn)
outputs_backward, state_backward = tf.nn.dynamic_rnn(rnn_stack_backward, x_backward_, dtype = tf.float32)
# Keep only the output at the last index of axis 1 from each direction and join them on the last axis.
self.rnn_output = tf.concat([outputs_forward[:,-1,:],outputs_backward[:,-1,:]],axis=-1) # [batch_size,2*self.configure.rnn_units[-1]]
output_ = self.rnn_output;
with tf.variable_scope('multi_dense_layers'):
# One dense layer followed by dropout per configured unit count; dropout rates are indexed in step.
for i, units in enumerate(self.configure.dense_layer_units):
output_ = tf.layers.dense(inputs=output_, units=units, activation=self.configure.dense_activation, name='dense_{}'.format(i))
output_ = tf.layers.dropout(output_, rate=self.configure.dropout_rates[i], training=self.training, name='dropout_{}'.format(i))
self.preds = tf.layers.dense(inputs=output_, units=self.configure.n_classes, activation=self.configure.last_activation, name='predictions')
with tf.variable_scope('loss_and_optimizer'):
# 1. Loss function
# The loss function is looked up by name on `losses`; its summed value is divided by the
# dynamic size of axis 0 of self.x_.
self.loss = (tf.reduce_sum(getattr(losses, self.configure.custom_loss)(self.y_,self.preds))/tf.cast(tf.shape(self.x_)[0],tf.float32))
# Accuracy: number of rows where the argmax of self.y_ and self.preds agree, divided by the same size.
self.accuracy = (tf.reduce_sum(tf.cast(tf.equal(tf.argmax(self.y_,1), tf.argmax(self.preds,1)), tf.float32), name='accuracy')/tf.cast(tf.shape(self.x_)[0],tf.float32))
# 2. Calculate and clip gradients
params = tf.trainable_variables()
gradients = tf.gradients(self.loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.configure.max_gradient_norm)
# 3. Set learning Rate: Exponential Decay or a constant value
self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step') # global_step just keeps track of the number of batches seen so far
# NOTE(review): excerpt begins inside a call whose opening lines are not visible; the arguments
# below (3, 'same' padding, ReLU, stride 2) match the conv2d call that follows.
3,
padding='same',
activation=tf.nn.relu,
strides=2)
# A further stride-2 convolution with first_depth * 1 filters and a kernel size of 3.
van_higher_level4 = tf.layers.conv2d(
van_higher_level4,
first_depth * 1,
3,
padding='same',
activation=tf.nn.relu,
strides=2)
# Layer-normalise, then flatten to [orig_x_shape[0], 2 * 2 * first_depth].
# NOTE(review): the reshape only succeeds if the feature map holds exactly 2*2*first_depth
# values per example at this point -- TODO confirm against the input resolution.
van_higher_level4 = tf.reshape(
tf.contrib.layers.layer_norm(van_higher_level4),
[orig_x_shape[0], 2 * 2 * first_depth])
# Join x with the two higher-level feature vectors along axis 1.
x = tf.concat([x, van_higher_level2, van_higher_level4], 1)
batch_size = x.get_shape().as_list()[0]
# Without a previous result, use zeros: a Python list repeating one zeros tensor of shape
# flags.enc_size batch_size times (the same tensor object is referenced by every entry).
if pre_result is None:
pre_result = [tf.zeros(shape=flags.enc_size)] * batch_size
result = tf.concat([x, pre_result], 1)
# Run the project's LSTM cell helper and store the updated state back in slot 0.
# NOTE(review): the meaning of the third positional argument depends on tf_ops.lstm_cell, which
# is not visible here.
result, lstm_states[0] = tf_ops.lstm_cell(
result,
lstm_states[0],
flags.enc_size * 4,
use_peepholes=True,
num_proj=flags.enc_size * 4)
result = tf.contrib.layers.layer_norm(result)
def embedding_layer(self):
    """Create the frozen embedding tables and the embedded encoder/decoder inputs.

    Sets ``encoder_embeddings``, ``enc_embed_input``, ``decoder_embeddings``,
    ``dec_input`` and ``dec_embed_input`` on the instance. Both embedding
    variables are initialised from the matrices stored on the instance and
    are created with ``trainable=False``.
    """
    with tf.name_scope("word_embeddings"):
        encoder_table = np.array(self.encoder_embeddings_matrix, dtype=np.float32)
        self.encoder_embeddings = tf.Variable(
            initial_value=encoder_table,
            dtype=tf.float32,
            trainable=False)
        self.enc_embed_input = tf.nn.embedding_lookup(
            self.encoder_embeddings, self.input_data)
        # Dropout on the embedded encoder input is currently disabled.

    with tf.name_scope("decoder_inputs"):
        decoder_table = np.array(self.decoder_embeddings_matrix, dtype=np.float32)
        self.decoder_embeddings = tf.Variable(
            initial_value=decoder_table,
            dtype=tf.float32,
            trainable=False)

        # Keep every target column except the last one (end index -1 on axis 1).
        targets_without_last = tf.strided_slice(
            self.target_data,
            [0, 0],
            [self.batch_size, -1],
            [1, 1],
            name='slice_input')
        # One column holding the 'GO' index, placed in front of the truncated targets.
        go_column = tf.fill([self.batch_size, 1], self.decoder_word_index['GO'])
        self.dec_input = tf.concat(
            [go_column, targets_without_last], 1, name='dec_input')
        self.dec_embed_input = tf.nn.embedding_lookup(
            self.decoder_embeddings, self.dec_input)
        # Dropout on the embedded decoder input is currently disabled.
# Encodes the input, conditions a generator RNN on a label projection plus the encoder state,
# and computes the weighted reconstruction loss.
# NOTE(review): `embedding`, `input_tensors`, `hparams`, `enc_inputs` and `switch_dropout` are
# defined outside this excerpt.
dec_inputs = tf.nn.embedding_lookup(embedding, input_tensors["dec_inputs"])
labels = input_tensors["labels"]
# Make the labels a column so the dense projection below sees one feature per example.
labels = tf.reshape(labels, [-1, 1])
rnn_hparams = utils.filter_hparams(hparams, "rnn")
# The encoder starts from an all-zero state of shape [batch_size, rnn size].
init_state = tf.zeros([hparams.batch_size, rnn_hparams.size])
cell_e = ops.get_rnn_cell(rnn_hparams)
# Only the final encoder state is kept; the per-step outputs are discarded.
_, z = tf.nn.dynamic_rnn(cell_e, enc_inputs, initial_state=init_state,
scope="encoder")
# Drop the first hparams.dim_y columns of the final state; that many columns are supplied by
# the label projection when the generator state is assembled below.
z = z[:, hparams.dim_y:]
label_proj_g = tf.layers.Dense(hparams.dim_y, name="generator")
# Generator initial states: projected label (original) or projected 1 - label (transferred),
# each concatenated with z along axis 1.
# NOTE(review): `1 - labels` presumably assumes binary 0/1 labels -- TODO confirm.
h_ori = tf.concat([label_proj_g(labels), z], 1)
h_tsf = tf.concat([label_proj_g(1 - labels), z], 1)
cell_g = ops.get_rnn_cell(rnn_hparams)
softmax_proj = tf.layers.Dense(hparams.vocab_size, name="softmax_proj")
# Run the generator over the decoder inputs starting from the original-label state.
g_outputs, _ = tf.nn.dynamic_rnn(cell_g, dec_inputs, initial_state=h_ori,
scope="generator")
g_outputs = tf.nn.dropout(
g_outputs, switch_dropout(hparams.output_keep_prob))
# Flatten to one row per step before projecting to vocabulary logits.
g_logits = softmax_proj(tf.reshape(g_outputs, [-1, rnn_hparams.size]))
# Per-position cross-entropy against the flattened targets, scaled by the flattened weights.
loss_g = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.reshape(input_tensors["targets"], [-1]), logits=g_logits)
loss_g *= tf.reshape(input_tensors["weights"], [-1])
# NOTE(review): despite its name, ppl_g is the weighted loss sum divided by the weight sum
# (plus 1e-8 to avoid division by zero); no exponential is applied in this excerpt.
ppl_g = tf.reduce_sum(loss_g) / (tf.reduce_sum(input_tensors["weights"]) \
+ 1e-8)
# The reported loss is the weighted sum divided by the batch size.
loss_g = tf.reduce_sum(loss_g) / hparams.batch_size