import numpy as np
import tensorflow as tf
# Data can be downloaded at https://github.com/harvardnlp/sent-summary
# Paths of the training corpus: one article per line and one title per line.
# NOTE(review): line i of the title file is presumably the reference summary
# of line i of the article file - confirm against the dataset README.
article_filename = 'Data/sumdata/train/train.article.txt'
title_filename = 'Data/sumdata/train/train.title.txt'

# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding.  `readlines()` keeps each line's
# trailing newline, so `articles` and `titles` are lists of raw lines.
with open(article_filename, encoding='utf-8') as article_file:
    articles = article_file.readlines()
with open(title_filename, encoding='utf-8') as title_file:
    titles = title_file.readlines()
def create_lookup_tables(text):
    """Build word <-> id lookup tables for a corpus.

    Ids 0-3 are reserved for the special tokens '<S>', '<E>', '<UNK>' and
    '<PAD>'; every other distinct whitespace-delimited token receives the
    next free id, assigned in sorted order.

    Args:
        text: Either a single string or an iterable of strings (for example
            the lines of a file).  Tokens are obtained with ``str.split()``.

    Returns:
        A ``(vocab_to_int, int_to_vocab)`` tuple of dictionaries that are
        exact inverses of each other.
    """
    vocab_to_int = {'<S>': 0, '<E>': 1, '<UNK>': 2, '<PAD>': 3}

    # Accept one big string as well as an iterable of lines; collect the
    # distinct tokens line by line so no full token list is materialized.
    lines = [text] if isinstance(text, str) else text
    vocab = set()
    for line in lines:
        vocab.update(line.split())

    # A corpus token that happens to equal a reserved token must not steal
    # its id, otherwise the reserved id would vanish from `int_to_vocab`.
    vocab.difference_update(vocab_to_int)

    # Set iteration order varies between interpreter runs (string hash
    # randomization); sorting makes the ids reproducible, which matters as
    # soon as a trained model is reloaded in another process.
    for i, v in enumerate(sorted(vocab), len(vocab_to_int)):
        vocab_to_int[v] = i

    int_to_vocab = {i: v for v, i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab
# Build one vocabulary for the articles and one for the titles.
# `create_lookup_tables` tokenizes a single string with `str.split()`, so the
# raw lines (which still carry their newlines) are joined back into one
# lowercased text instead of being passed as a list.
vocab_to_int_article, int_to_vocab_article = create_lookup_tables(''.join(articles).lower())
vocab_to_int_title, int_to_vocab_title = create_lookup_tables(''.join(titles).lower())
def _words_to_ids(sentence, vocab_to_int):
    """Map each whitespace-delimited word of *sentence* to its vocabulary id."""
    if '<UNK>' not in vocab_to_int:
        # No fallback id available: keep the strict lookup, which raises
        # KeyError for a word missing from the vocabulary.
        return [vocab_to_int[word] for word in sentence.split()]
    unk_id = vocab_to_int['<UNK>']
    return [vocab_to_int.get(word, unk_id) for word in sentence.split()]


def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """Convert newline-separated source and target texts to lists of word ids.

    Each text is split into sentences on '\\n' and each sentence into words on
    whitespace.  Every target sentence gets the id of '<E>' appended as an
    end-of-sequence marker.  Words missing from a vocabulary map to that
    vocabulary's '<UNK>' id when it defines one.

    Note that a text ending in '\\n' produces a final empty sentence, so strip
    the trailing newline before calling if that is not wanted.

    Args:
        source_text: Source sentences separated by '\\n'.
        target_text: Target sentences separated by '\\n'.
        source_vocab_to_int: Word -> id mapping for the source side.
        target_vocab_to_int: Word -> id mapping for the target side; must
            contain '<E>'.

    Returns:
        A ``(source_id_text, target_id_text)`` tuple of lists of id lists.

    Raises:
        KeyError: If '<E>' is missing from ``target_vocab_to_int``, or if a
            word is unknown and the relevant vocabulary has no '<UNK>' entry.
    """
    end_id = target_vocab_to_int['<E>']
    source_id_text = [
        _words_to_ids(sentence, source_vocab_to_int)
        for sentence in source_text.split('\n')
    ]
    target_id_text = [
        _words_to_ids(sentence, target_vocab_to_int) + [end_id]
        for sentence in target_text.split('\n')
    ]
    return source_id_text, target_id_text
# Encode the whole corpus as id sequences.  `articles` and `titles` are lists
# of raw lines, so they are joined into single lowercased texts first; the
# trailing newline is stripped because `text_to_ids` splits on '\n' and would
# otherwise emit one spurious empty sentence at the end.
X, y = text_to_ids(
    ''.join(articles).lower().rstrip('\n'),
    ''.join(titles).lower().rstrip('\n'),
    vocab_to_int_article,
    vocab_to_int_title,
)
# Hyper-parameters.
learning_rate = 0.001
hidden_units = 400      # width of every GRU cell
embedding_size = 200    # width of the word embeddings
n_layers = 1
dropout = 0.5           # fraction of cell outputs dropped (keep prob = 1 - dropout)
n_iters = 40            # upper bound used by the training loop

# The cell and graph construction refer to the GRU width as `state_size`,
# which was never defined; bind it to the configured hidden size.
state_size = hidden_units

# One GRU cell per encoder direction plus one for the decoder.
encoder_forward_cell = tf.contrib.rnn.GRUCell(state_size)
encoder_backward_cell = tf.contrib.rnn.GRUCell(state_size)
decoder_cell = tf.contrib.rnn.GRUCell(state_size)

# Apply dropout to the output of every cell.
# NOTE(review): the wrappers are applied unconditionally, so dropout would
# also be active if this same graph were used for inference - confirm.
keep_prob = 1 - dropout
encoder_forward_cell = tf.contrib.rnn.DropoutWrapper(
    encoder_forward_cell, output_keep_prob=keep_prob)
encoder_backward_cell = tf.contrib.rnn.DropoutWrapper(
    encoder_backward_cell, output_keep_prob=keep_prob)
decoder_cell = tf.contrib.rnn.DropoutWrapper(
    decoder_cell, output_keep_prob=keep_prob)
# Build the training graph: bidirectional GRU encoder -> initial decoder
# state -> Bahdanau attention -> GRU decoder with output projection ->
# masked sequence loss -> gradient-clipped Adadelta update.
#
# NOTE(review): this section reads and writes `self.*` and uses `dtype`,
# `source_vocab_size`, `target_vocab_size`, `fc_layer`, `max_gradient` and an
# embedding initializer, none of which are defined in the visible part of the
# file.  It presumably belongs inside a model class - confirm.
# NOTE(review): `DynamicAttentionWrapper`, `DynamicAttentionWrapperState` and
# the two-value result of `dynamic_decode` match the TensorFlow 1.1 contrib
# API; later 1.x releases renamed/extended them - verify the pinned version.
with tf.variable_scope("seq2seq", dtype=dtype):
    with tf.variable_scope("encoder"):
        encoder_embedding = tf.get_variable(
            "embedding", [source_vocab_size, embedding_size],
            initializer=embedding_init)
        encoder_inputs_embedding = tf.nn.embedding_lookup(
            encoder_embedding, self.encoder_inputs)
        encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(
            encoder_forward_cell, encoder_backward_cell,
            encoder_inputs_embedding,
            sequence_length=self.encoder_len, dtype=dtype)

    with tf.variable_scope("init_state"):
        # Concatenate the forward and backward final states and map them to
        # the decoder's state size.
        # NOTE(review): `fc_layer` is presumably a fully connected
        # projection - confirm against its definition.
        init_state = fc_layer(
            tf.concat(encoder_states, 1), state_size)
        # bidirectional_dynamic_rnn leaves the batch dimension as None, so
        # the static shapes are pinned explicitly.
        self.init_state = init_state
        self.init_state.set_shape([self.batch_size, state_size])
        # Attention memory: forward and backward outputs joined on the
        # feature axis, hence 2 * state_size features per time step.
        self.att_states = tf.concat(encoder_outputs, 2)
        self.att_states.set_shape([self.batch_size, None, state_size*2])

    with tf.variable_scope("attention"):
        attention = tf.contrib.seq2seq.BahdanauAttention(
            state_size, self.att_states, self.encoder_len)
        decoder_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
            decoder_cell, attention, state_size * 2)
        wrapper_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
            self.init_state, self.prev_att)

    with tf.variable_scope("decoder") as scope:
        # NOTE(review): the encoder passed `embedding_init` while this call
        # originally passed `emb_init`; neither is defined in view.  Both
        # now use `embedding_init` - make sure that name is defined.
        decoder_emb = tf.get_variable(
            "embedding", [target_vocab_size, embedding_size],
            initializer=embedding_init)
        # Project the decoder cell output to one logit per target word.
        decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(
            decoder_cell, target_vocab_size)
        decoder_inputs_emb = tf.nn.embedding_lookup(
            decoder_emb, self.decoder_inputs)
        # TrainingHelper feeds the given decoder inputs at every step.
        helper = tf.contrib.seq2seq.TrainingHelper(
            decoder_inputs_emb, self.decoder_len)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell, helper, wrapper_state)
        outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder)
        # First field of the decoder output is the cell output, i.e. the
        # projected logits.
        outputs_logits = outputs[0]
        self.outputs = outputs_logits

        # Mask out positions beyond each target's length, sum the remaining
        # per-token losses and average over the batch.
        weights = tf.sequence_mask(self.decoder_len, dtype=tf.float32)
        loss_t = tf.contrib.seq2seq.sequence_loss(
            outputs_logits, self.decoder_targets, weights,
            average_across_timesteps=False, average_across_batch=False)
        self.loss = tf.reduce_sum(loss_t) / self.batch_size

        # Adadelta step on globally norm-clipped gradients.
        # NOTE(review): the source carried no indentation; the loss and
        # optimizer statements are kept inside the "decoder" scope here.
        # Optimizer slot variable names may depend on that - confirm
        # against any existing checkpoint.
        params = tf.trainable_variables()
        opt = tf.train.AdadeltaOptimizer(self.learning_rate, epsilon=1e-6)
        gradients = tf.gradients(self.loss, params)
        clipped_gradients, norm = tf.clip_by_global_norm(
            gradients, max_gradient)
        self.updates = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)
def get_batches(int_text, batch_size, seq_length):
    """Split a flat id sequence into (input, target) batches.

    Targets are the inputs shifted one position to the left, so every input
    token is paired with the token that follows it.  The sequence is first
    laid out as ``batch_size`` contiguous rows and then cut into windows of
    ``seq_length`` columns, so row ``r`` of batch ``k + 1`` continues row
    ``r`` of batch ``k``.  Tokens that do not fill a complete batch are
    dropped.

    Args:
        int_text: Flat sequence of token ids.
        batch_size: Number of rows per batch.
        seq_length: Number of tokens per row.

    Returns:
        An array of shape ``(n_batches, 2, batch_size, seq_length)`` where
        index 0 of the second axis holds the inputs and index 1 the targets.
        When there are too few tokens for a single batch the array is empty
        (``n_batches == 0``).
    """
    tokens = np.asarray(int_text)
    tokens_per_batch = batch_size * seq_length

    # Every input token needs a successor as its target, so only
    # len(tokens) - 1 tokens are usable as inputs.  Without the "- 1" a
    # sequence whose length is an exact multiple of the batch size leaves the
    # shifted target slice one element short and the reshape fails.
    n_batches = (len(tokens) - 1) // tokens_per_batch
    if n_batches <= 0:
        return np.empty((0, 2, batch_size, seq_length), dtype=tokens.dtype)

    n_tokens = n_batches * tokens_per_batch
    inputs = tokens[:n_tokens].reshape(batch_size, -1)
    targets = tokens[1:n_tokens + 1].reshape(batch_size, -1)

    x = np.split(inputs, n_batches, 1)
    y = np.split(targets, n_batches, 1)
    return np.array(list(zip(x, y)))
# Training loop: repeatedly sample a bucket, fetch a batch from it and run one
# model step until the step counter exceeds `n_iters`.
# NOTE(review): `create_model`, `train_buckets_scale`, `train_set`,
# `train_writer` and `batch_size` are not defined in the visible part of the
# file - confirm where they come from.
with tf.Session() as sess:
    model = create_model(sess, False)

    # Running sum of the per-step loss, rescaled by batch size over the
    # number of decoder tokens in the batch.
    loss = 0.0

    # Start counting from the model's current global step.
    current_step = sess.run(model.global_step)
    while current_step <= n_iters:
        # Pick the first bucket whose scale value exceeds a uniform draw
        # from [0, 1).
        # NOTE(review): `train_buckets_scale` presumably holds cumulative
        # bucket fractions - confirm.
        rand = np.random.random_sample()
        bucket_id = min(
            index
            for index, scale in enumerate(train_buckets_scale)
            if scale > rand
        )

        batch = model.get_batches(train_set, bucket_id)
        encoder_inputs, decoder_inputs, encoder_len, decoder_len = batch

        step_loss, _ = model.step(
            sess, encoder_inputs, decoder_inputs,
            encoder_len, decoder_len, False, train_writer)

        loss += step_loss * batch_size / np.sum(decoder_len)
        current_step += 1