import numpy as np
    import tensorflow as tf


    # Data can be downloaded at https://github.com/harvardnlp/sent-summary


    # Paths of the training articles and their titles (the two files are
    # presumably line-aligned, one example per line -- confirm against the data).
    article_filename = 'Data/sumdata/train/train.article.txt'
    title_filename = 'Data/sumdata/train/train.title.txt'

    # Read each file fully into memory as a list of lines; every line keeps its
    # trailing newline.
    with open(article_filename) as article_file:
        articles = list(article_file)

    with open(title_filename) as title_file:
        titles = list(title_file)


    def create_lookup_tables(text):
        """Build word <-> id lookup tables for a corpus.

        Args:
            text: Either one string or an iterable of strings (for example a
                list of lines). Words are the whitespace-separated tokens.

        Returns:
            A ``(vocab_to_int, int_to_vocab)`` tuple of dicts. Ids 0-3 are
            reserved for the special tokens ``<S>``, ``<E>``, ``<UNK>`` and
            ``<PAD>``; corpus words receive consecutive ids starting at 4, in
            sorted order.
        """
        vocab_to_int = {'<S>': 0, '<E>': 1, '<UNK>': 2, '<PAD>': 3}

        # A list has no .split(), so tokenize line by line when the corpus is
        # not a single string.
        if isinstance(text, str):
            words = text.split()
        else:
            words = (word for line in text for word in line.split())

        # Skip words that collide with a reserved token so its id is not
        # overwritten, and sort so the mapping is identical between runs
        # (set iteration order for strings is not stable across processes).
        vocab = sorted(set(words) - set(vocab_to_int))

        for i, v in enumerate(vocab, len(vocab_to_int)):
            vocab_to_int[v] = i

        int_to_vocab = {i: v for v, i in vocab_to_int.items()}

        return vocab_to_int, int_to_vocab

    # Build each vocabulary from the whole corpus as one lower-cased string:
    # `articles` / `titles` are lists of lines, and a list has no .split().
    vocab_to_int_article, int_to_vocab_article = create_lookup_tables(
        ' '.join(line.lower() for line in articles))
    vocab_to_int_title, int_to_vocab_title = create_lookup_tables(
        ' '.join(line.lower() for line in titles))


    def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
        """Convert newline-separated source/target text into lists of word ids.

        Args:
            source_text: Source sentences as one string, one sentence per line.
            target_text: Target sentences as one string, one sentence per line.
            source_vocab_to_int: Word -> id mapping for the source side.
            target_vocab_to_int: Word -> id mapping for the target side; must
                contain ``<E>``.

        Returns:
            ``(source_id_text, target_id_text)``: one list of ids per line.
            Every target sentence has the ``<E>`` id appended.

        Raises:
            KeyError: If a word is missing from its vocabulary and that
                vocabulary has no ``<UNK>`` entry to fall back on.
        """
        def split_sentences(text):
            # A single trailing newline terminates the last sentence; it must
            # not produce an extra, empty example.
            if text.endswith('\n'):
                text = text[:-1]
            return text.split('\n')

        def encode(sentence, vocab_to_int):
            # Out-of-vocabulary words map to <UNK> when the vocabulary has one.
            unk_id = vocab_to_int.get('<UNK>')
            ids = []
            for word in sentence.split():
                if word in vocab_to_int:
                    ids.append(vocab_to_int[word])
                elif unk_id is not None:
                    ids.append(unk_id)
                else:
                    raise KeyError(word)
            return ids

        end_id = target_vocab_to_int['<E>']
        source_id_text = [encode(sentence, source_vocab_to_int)
                          for sentence in split_sentences(source_text)]
        target_id_text = [encode(sentence, target_vocab_to_int) + [end_id]
                          for sentence in split_sentences(target_text)]

        return source_id_text, target_id_text

    # `articles` / `titles` are lists of lines, so join them into the
    # newline-separated strings text_to_ids() splits on. Stripping each line
    # first avoids doubled newlines, and therefore spurious empty sentences.
    X, y = text_to_ids(
        '\n'.join(line.strip() for line in articles).lower(),
        '\n'.join(line.strip() for line in titles).lower(),
        vocab_to_int_article,
        vocab_to_int_title,
    )


    # Optimisation settings.
    learning_rate = 1e-3
    n_iters = 40  # upper bound on the global step in the training loop

    # Network dimensions.
    embedding_size = 200  # width of the word-embedding vectors
    hidden_units = 400
    n_layers = 1

    # Regularisation: the RNN cells keep outputs with probability 1 - dropout.
    dropout = 0.5


    # `state_size` is used by every cell below but is not assigned anywhere in
    # the visible script, which would raise NameError. Bind it to the
    # `hidden_units` hyperparameter, which is otherwise unused.
    # NOTE(review): confirm hidden_units is the intended RNN state size.
    state_size = hidden_units

    # One GRU cell per encoder direction plus one for the decoder.
    encoder_forward_cell = tf.contrib.rnn.GRUCell(state_size)
    encoder_backward_cell = tf.contrib.rnn.GRUCell(state_size)
    decoder_cell = tf.contrib.rnn.GRUCell(state_size)

    # Apply dropout to each cell's outputs, keeping them with probability
    # 1 - dropout.
    encoder_forward_cell = tf.contrib.rnn.DropoutWrapper(encoder_forward_cell, output_keep_prob=(1 - dropout))
    encoder_backward_cell = tf.contrib.rnn.DropoutWrapper(encoder_backward_cell, output_keep_prob=(1 - dropout))
    decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell, output_keep_prob=(1 - dropout))


    # Builds the seq2seq training graph: embedding + bidirectional RNN encoder,
    # a projected initial decoder state, Bahdanau attention over the encoder
    # outputs, a training decoder, the masked sequence loss and the
    # gradient-clipped Adadelta update op.
    # NOTE(review): `self`, `dtype`, `source_vocab_size`, `target_vocab_size`,
    # `embedding_init`, `emb_init`, `fc_layer` and `max_gradient` are not
    # defined in the visible part of this script -- this looks like the body of
    # a model class constructor; confirm where these names come from.
    with tf.variable_scope("seq2seq", dtype=dtype):
        with tf.variable_scope("encoder"):
            # Source-side embedding matrix, [source_vocab_size, embedding_size].
            encoder_embedding = tf.get_variable("embedding", [source_vocab_size, embedding_size],initializer=embedding_init)
            # Embed the encoder input ids.
            encoder_inputs_embedding = tf.nn.embedding_lookup(encoder_embedding, self.encoder_inputs)
            # Run the forward and backward cells over the embedded inputs;
            # outputs and final states each come back as a (forward, backward) pair.
            encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(encoder_forward_cell, encoder_backward_cell, encoder_inputs_embedding, sequence_length=self.encoder_len, dtype=dtype)

        with tf.variable_scope("init_state"):
            # Concatenate the two final encoder states along axis 1 and pass
            # them through fc_layer with state_size as its second argument
            # (presumably a fully connected projection to state_size units --
            # the helper is not visible here).
            init_state = fc_layer(
                tf.concat(encoder_states, 1), state_size)
            # The static shape coming out of bidirectional_dynamic_rnn has None
            # for the batch dimension, so set_shape() pins it to
            # self.batch_size below.
            self.init_state = init_state
            self.init_state.set_shape([self.batch_size, state_size])
            # Attention memory: forward and backward encoder outputs
            # concatenated along axis 2, hence the last dimension state_size*2.
            self.att_states = tf.concat(encoder_outputs, 2)
            self.att_states.set_shape([self.batch_size, None, state_size*2])

        with tf.variable_scope("attention"):
            # Bahdanau attention over the encoder outputs, given the encoder
            # lengths.
            attention = tf.contrib.seq2seq.BahdanauAttention(
                state_size, self.att_states, self.encoder_len)
            # NOTE(review): DynamicAttentionWrapper / DynamicAttentionWrapperState
            # appear to be early TF 1.x contrib names -- confirm the pinned
            # TensorFlow version provides them.
            decoder_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
                decoder_cell, attention, state_size * 2)
            # Initial wrapper state built from the projected encoder state and
            # self.prev_att.
            wrapper_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
                self.init_state, self.prev_att)

        with tf.variable_scope("decoder") as scope:
            # Target-side embedding matrix, [target_vocab_size, embedding_size].
            # NOTE(review): the initializer is named `emb_init` here but
            # `embedding_init` in the encoder -- confirm both names exist.
            decoder_emb = tf.get_variable("embedding", [target_vocab_size, embedding_size],initializer=emb_init)
            # Project the cell outputs to target_vocab_size units.
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, target_vocab_size)
            decoder_inputs_emb = tf.nn.embedding_lookup(decoder_emb, self.decoder_inputs)

            # Training-time decoding: the helper feeds the embedded decoder
            # inputs, and decoding starts from wrapper_state.
            helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs_emb, self.decoder_len)
            decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, wrapper_state)

            outputs, final_state = tf.contrib.seq2seq.dynamic_decode(decoder)

            # The first field of the decoder output is used as the logits.
            outputs_logits = outputs[0]
            self.outputs = outputs_logits

            # Float mask built from the decoder lengths; it weights the loss
            # so positions past each sequence's length do not contribute.
            weights = tf.sequence_mask(self.decoder_len, dtype=tf.float32)

            # Unaveraged loss (no averaging across timesteps or batch), then
            # summed and divided by the batch size.
            loss_t = tf.contrib.seq2seq.sequence_loss(outputs_logits, self.decoder_targets, weights, average_across_timesteps=False, average_across_batch=False)
            self.loss = tf.reduce_sum(loss_t) / self.batch_size

            # Gradients of the loss w.r.t. all trainable variables, clipped by
            # global norm to max_gradient and applied with Adadelta;
            # self.global_step is passed to apply_gradients.
            params = tf.trainable_variables()
            opt = tf.train.AdadeltaOptimizer(self.learning_rate, epsilon=1e-6)
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient)
            self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)


    def get_batches(int_text, batch_size, seq_length):
        """Split a flat id sequence into (input, target) batches.

        Targets are the inputs shifted by one position, so each input id is
        paired with the id that follows it in ``int_text``.

        Args:
            int_text: Flat sequence of ids.
            batch_size: Number of sequences per batch.
            seq_length: Number of ids per sequence.

        Returns:
            Array of shape ``(n_batches, 2, batch_size, seq_length)``; index 0
            on the second axis holds the inputs and index 1 the targets.

        Raises:
            ValueError: If ``int_text`` is too short to form a single batch.
        """
        ids_per_batch = batch_size * seq_length

        # Every input needs a following id as its target, so only
        # len(int_text) - 1 ids can be inputs. Without the "- 1" a length that
        # is an exact multiple of ids_per_batch leaves the target slice one
        # element short, which breaks the reshape/split below. All other
        # lengths yield the same batch count as plain integer division.
        n_batches = (len(int_text) - 1) // ids_per_batch
        if n_batches < 1:
            raise ValueError(
                'int_text is too short for a single batch of '
                'batch_size * seq_length + 1 ids')

        n_ids = n_batches * ids_per_batch
        inputs = np.array(int_text[:n_ids])
        outputs = np.array(int_text[1:n_ids + 1])

        # Lay the ids out as batch_size rows, then cut the rows column-wise
        # into n_batches chunks of seq_length ids each.
        x = np.split(inputs.reshape(batch_size, -1), n_batches, 1)
        y = np.split(outputs.reshape(batch_size, -1), n_batches, 1)

        return np.array(list(zip(x, y)))


    # Training loop: create the model in a fresh session, then repeatedly pick
    # a bucket, fetch a batch for it and run one step until the global step
    # exceeds n_iters.
    with tf.Session() as sess:
            model = create_model(sess, False)
            loss = 0.0
            current_step = sess.run(model.global_step)

            while current_step <= n_iters:
                # Choose the lowest bucket index whose scale exceeds a uniform
                # draw from [0, 1). train_buckets_scale is presumably a
                # cumulative distribution over buckets -- confirm.
                rand = np.random.random_sample()
                bucket_id = min(
                    i for i, scale in enumerate(train_buckets_scale)
                    if scale > rand)

                (encoder_inputs, decoder_inputs,
                 encoder_len, decoder_len) = model.get_batches(train_set, bucket_id)
                step_loss, _ = model.step(
                    sess, encoder_inputs, decoder_inputs,
                    encoder_len, decoder_len, False, train_writer)

                # Rescale the step loss by batch_size / total decoder length
                # before accumulating it.
                loss += step_loss * batch_size / np.sum(decoder_len)
                current_step += 1