seq2seq model解析

train

with tf.Session() as sess:
   # Block 1: model initialization.
   # Create model.
   print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
   # Build a seq2seq model through create_model(). The second argument
   # (False) is presumably forward_only, i.e. build the training graph --
   # TODO confirm against create_model's signature.
   model = create_model(sess, False)
   
   # Block 2: read the data.
   # Read data into buckets and compute their sizes.
   print ("Reading development and training data (limit: %d)."
          % FLAGS.max_train_data_size)
   # read_data loads the sentence pairs from the dev and train file paths and
   # groups them by bucket. Only the training set is capped by
   # FLAGS.max_train_data_size; the dev set is read in full.
   dev_set = read_data(from_dev, to_dev)
   train_set = read_data(from_train, to_train, FLAGS.max_train_data_size)
   # Number of training pairs that landed in each bucket.
   train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
   train_total_size = float(sum(train_bucket_sizes))
   # train_buckets_scale[i] is the cumulative fraction of training pairs held
   # by buckets 0..i, so the list increases from size_0 / total up to 1.0.
   train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                          for i in xrange(len(train_bucket_sizes))]
   while True:
     # Block 3: build a batch.
     # Choose a bucket according to data distribution. We pick a random number
     # in [0, 1) and use the corresponding interval in train_buckets_scale:
     # the first bucket whose cumulative fraction exceeds the number is
     # chosen, so a bucket is picked with probability proportional to its
     # size.
     random_number_01 = np.random.random_sample()
     bucket_id = min([i for i in xrange(len(train_buckets_scale))
                      if train_buckets_scale[i] > random_number_01])
     # Get a batch and make a step.
     start_time = time.time()
     encoder_inputs, decoder_inputs, target_weights = model.get_batch(
         train_set, bucket_id)
     # Block 4: training. Run one step on the batch and keep only the loss;
     # the trailing False is presumably forward_only (so parameters are
     # updated) -- TODO confirm against model.step's signature.
     _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                  target_weights, bucket_id, False)

代码解析：

create_model

def __init__(self,
             source_vocab_size, # size of the English (source) vocabulary
             target_vocab_size, # size of the French (target) vocabulary
             buckets, # list of buckets; explained in detail below
             size, # number of units in each layer of the model
             num_layers, # number of stacked RNN cells
             max_gradient_norm,  # value gradients are clipped to when training the RNN
             batch_size, # number of sentence pairs drawn per batch by get_batch
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False, # use BasicLSTMCell instead of the default GRUCell
             num_samples=512, # sampled softmax size
             forward_only=False, # False when training, True when decoding
             dtype=tf.float32):

初始化一个seq2seqModel class

bucket说明

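Bucket 是工程上常用的一種技巧。理論上 RNN 可以處理任意長度的句子，但若為每一種 (輸入長度, 輸出長度) 的組合都各自建立一個 graph，勢必會產生大量、而且多半重複的 graph。使用 Bucket 之後，只需要為少數幾組固定長度各建立一個 graph，較短的句子則以 padding 補齊。以官方範例預設的 bucket 設定 [(5, 10), (10, 15), (20, 25), (40, 50)] 為例（此處假設 _buckets 使用該預設值），若有一長度為 (6, 16) 的 (英文, 法文) 句對，法文長度超過了 (10, 15) 的上限，因此會被分配到 (20, 25) 這個 bucket；英文會被 padding 至長度 20，法文會被 padding 至長度 25。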

读入资料

def read_data(source_path, target_path, max_size=None):
  """Load parallel token-id files and group the sentence pairs by bucket.

  The two files are read line by line in lockstep. Every line is a
  whitespace-separated list of integer token ids (for example "1 2 3 4 5"
  in the source file and "99 98 97 96 95" in the target file). EOS_ID is
  appended to each target sentence before it is bucketed.

  Args:
    source_path: path of the file holding the source-language token ids.
    target_path: path of the file holding the target-language token ids.
    max_size: maximum number of line pairs to read. None or 0 means read
      until either file is exhausted.

  Returns:
    A list with one entry per bucket in _buckets. Entry i is a list of
    [source_ids, target_ids] pairs; each pair is stored in the first bucket
    (source_size, target_size) with len(source_ids) < source_size and
    len(target_ids) < target_size. Pairs that fit no bucket are dropped.
  """
  bucketed_pairs = [[] for _ in _buckets]
  with tf.gfile.GFile(source_path, mode="r") as source_file, \
       tf.gfile.GFile(target_path, mode="r") as target_file:
    pairs_read = 0
    while True:
      # Stop once the optional cap on the number of pairs has been reached.
      if max_size and pairs_read >= max_size:
        break
      source_line = source_file.readline()
      target_line = target_file.readline()
      # readline() returns an empty string at end of file.
      if not source_line or not target_line:
        break
      pairs_read += 1
      if pairs_read % 100000 == 0:
        print("  reading data line %d" % pairs_read)
        sys.stdout.flush()
      # split() also discards the trailing newline.
      source_ids = [int(token) for token in source_line.split()]
      target_ids = [int(token) for token in target_line.split()]
      target_ids.append(data_utils.EOS_ID)
      # Find the first bucket that is strictly larger than both sentences.
      first_fit = next(
          (idx for idx, (source_size, target_size) in enumerate(_buckets)
           if len(source_ids) < source_size and len(target_ids) < target_size),
          None)
      if first_fit is not None:
        bucketed_pairs[first_fit].append([source_ids, target_ids])
  return bucketed_pairs

建立batch

# Draw a random number and pick the first bucket whose cumulative share of
# the training data (train_buckets_scale[i]) exceeds it.
random_number_01 = np.random.random_sample()
bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])
# Ask the model for one batch drawn from the chosen bucket.
encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)

除了传入的参数之外，get_batch 还需要用到 class 本身的属性：self.batch_size 以及 self.buckets

def get_batch(self, data, bucket_id):
  """Draw a random batch from one bucket and lay it out per time step.

  Args:
    data: list indexed by bucket id; data[bucket_id] is a list of
      (encoder_input, decoder_input) pairs of token-id lists whose lengths
      fit that bucket.
    bucket_id: index into self.buckets selecting the (encoder_size,
      decoder_size) used for this batch, e.g. (5, 10).

  Returns:
    A triple (batch_encoder_inputs, batch_decoder_inputs, batch_weights):
    batch_encoder_inputs is a list of encoder_size int32 arrays and
    batch_decoder_inputs a list of decoder_size int32 arrays, each of length
    self.batch_size (entry t holds the t-th token of every sample);
    batch_weights is a list of decoder_size float32 arrays that are 1 where
    the step has a real target and 0 otherwise.
  """
  encoder_size, decoder_size = self.buckets[bucket_id]
  padded_sources, padded_targets = [], []

  # Sample batch_size pairs (with replacement) from the chosen bucket.
  for _ in xrange(self.batch_size):
    source_ids, target_ids = random.choice(data[bucket_id])

    # Source side: pad up to encoder_size, then reverse the whole sequence.
    source_padding = [data_utils.PAD_ID] * (encoder_size - len(source_ids))
    padded_sources.append((source_ids + source_padding)[::-1])

    # Target side: prepend GO, then pad up to decoder_size.
    target_padding = [data_utils.PAD_ID] * (decoder_size - len(target_ids) - 1)
    padded_targets.append([data_utils.GO_ID] + target_ids + target_padding)

  # Re-index from one list per sample to one array per time step.
  batch_encoder_inputs = [
      np.array([sample[step] for sample in padded_sources], dtype=np.int32)
      for step in xrange(encoder_size)]

  batch_decoder_inputs, batch_weights = [], []
  for step in xrange(decoder_size):
    batch_decoder_inputs.append(
        np.array([sample[step] for sample in padded_targets], dtype=np.int32))

    # The target of a step is the decoder input shifted forward by one, so
    # the final step has no target, and steps whose target is PAD carry no
    # training signal. Both get weight 0; everything else keeps weight 1.
    step_weights = np.ones(self.batch_size, dtype=np.float32)
    is_last_step = step == decoder_size - 1
    for row, sample in enumerate(padded_targets):
      if is_last_step or sample[step + 1] == data_utils.PAD_ID:
        step_weights[row] = 0.0
    batch_weights.append(step_weights)

  return batch_encoder_inputs, batch_decoder_inputs, batch_weights

训练

# Build the outputs and losses for the configured buckets. The lambda fixes
# the third argument of seq2seq_f (do_decode) to False, which seq2seq_f passes
# on as feed_previous=False, so the decoder is fed the provided decoder
# inputs. softmax_loss_function is the sampled-softmax loss when one was
# defined, otherwise None.
self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
    self.encoder_inputs, self.decoder_inputs, targets,
    self.target_weights, buckets,
    seq2seq=lambda x, y: seq2seq_f(x, y, False),
    softmax_loss_function=softmax_loss_function)

上面這段程式碼中的 seq2seq 參數 seq2seq_f(x, y, False) 也是定義在 seq2seq_model.py 裡面的。指的是將 x: encoder_input 與 y: decoder_input 輸入，回傳的就是這個 seq2seq model 的 output 與 state。False 這個參數則是 seq2seq_f() 裡面自行定義作為 do_decode or not 的 Boolean。我們把相關的程式碼列出來如下，可以看到多為 tensorflow 之中對於 RNN 的設定。其中比較特別的是 sampled_softmax_loss 以及 seq2seq_f

# Without sampled softmax there is neither an output projection nor a custom
# loss function.
output_projection = None
softmax_loss_function = None

# Sampled softmax only makes sense if we sample less than vocabulary size.
if 0 < num_samples < self.target_vocab_size:
  # The projection weights are stored transposed ([vocab, size]), which is
  # the layout handed to sampled_softmax_loss below; the output projection
  # uses the [size, vocab] transpose.
  w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
  b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
  w = tf.transpose(w_t)
  output_projection = (w, b)

  def sampled_loss(labels, inputs):
    """Sampled softmax loss, computed in float32 and cast back to dtype."""
    # The loss is evaluated with 32-bit floats to avoid numerical
    # instabilities, whatever dtype the model itself uses.
    loss = tf.nn.sampled_softmax_loss(
        weights=tf.cast(w_t, tf.float32),
        biases=tf.cast(b, tf.float32),
        labels=tf.reshape(labels, [-1, 1]),
        inputs=tf.cast(inputs, tf.float32),
        num_sampled=num_samples,
        num_classes=self.target_vocab_size)
    return tf.cast(loss, dtype)

  softmax_loss_function = sampled_loss

# Create the internal multi-layer cell for our RNN.
def single_cell():
  """Return one fresh RNN cell: LSTM when use_lstm is set, GRU otherwise."""
  if use_lstm:
    return tf.contrib.rnn.BasicLSTMCell(size)
  return tf.contrib.rnn.GRUCell(size)

# A single cell is always created first; with more than one layer it is
# replaced by a stack of num_layers freshly created cells.
cell = single_cell()
if num_layers > 1:
  stacked_cells = [single_cell() for _ in range(num_layers)]
  cell = tf.contrib.rnn.MultiRNNCell(stacked_cells)

# The seq2seq function: we use embedding for the input and attention.
def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
  """Build the embedding + attention seq2seq model for the given inputs.

  do_decode is forwarded as feed_previous.
  """
  seq2seq_kwargs = dict(
      num_encoder_symbols=source_vocab_size,
      num_decoder_symbols=target_vocab_size,
      embedding_size=size,
      output_projection=output_projection,
      feed_previous=do_decode,
      dtype=dtype)
  return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
      encoder_inputs, decoder_inputs, cell, **seq2seq_kwargs)

sampled_softmax_loss 是用在當有大量的輸出類別必須被 predict 的時候，舉例來說，像是英翻法這樣的翻譯工作，法文的詞典(target_vocab_size) size 有 40000 之多，這時候我們採用 sampled_softmax_loss 可以快速有效地建立一個 softmax classifier。其中的參數num_sampled指的是 sampling 的數目，在這邊是512。num_classes指的就是實際的 class 數目，在這邊就是以法文詞典的數目來代表。要注意的是 num_sampled不可以大於 num_classes 就是了

seq2seq_f() 直接呼叫了 tf.contrib.legacy_seq2seq.embedding_attention_seq2seq()。這個 embedding_attention_seq2seq 是一個帶有 embedding + sequence to sequence 並帶有 attention 機制的模型。encoder_input 首先進入一個 embedding layer，轉為 word vector，之後進入一個 encoder RNN。這個 encoder RNN 的每一個 time step 的輸出會被記錄下來，作為 attention 機制的參考。接下來，decoder_input 會進入另一個新建立的 embedding layer，在同樣轉為 word vector 之後，進入一個 attention decoder RNN。這個 decoder 是由 encoder 的最後一個 time step 的 state 進行初始化，其後每一個輸入就是 decoder_input 經過 embedding 之後的 word vector，並且具有對 encoder output 專注的 attention 機制。

在 tf.contrib.legacy_seq2seq.embedding_attention_seq2seq() 之中的參數 feed_previous，當它為 False 的時候，decoder 會使用前面給的 decoder_input 作為輸入，也就是一般在訓練階段的作法。當值為 True 的時候，前面給的 decoder_input 只有第一個值（通常是 GO symbol，代表一個句子的開始）會作為 decoder 的輸入，而 decoder 的下一個 input，則是 decoder 的前一個 output，也就是只給 decoder 第一個 input，後面讓它自由發揮的意思。這也是一般在 decode/predict 時候的作法。

perplexity

$$
perplexity=e^{-l}, \quad l=\frac{1}{M}\sum_{i=1}^{m}\log\left(p(s_i)\right)
$$

其中 $m$ 为测试语料中的句子数，$s_i$ 为第 $i$ 个句子，$p(s_i)$ 为模型赋予该句子的概率，$M$ 为测试语料的总词数（注意 $M$ 与 $m$ 是两个不同的量）。

如同 precision、recall 等指标一样，perplexity 用来评估一个模型的好坏；perplexity 越低，代表模型对测试数据的预测越准确。