CNN과 RNN을 결합한 텍스트 분류

주요 참고 코드와 아이디어:

cnn과 rnn을 결합하여 텍스트 분류를 하는 전체 흐름은 다음과 같다.
  1. data ---> batch iter ---> cnn input ---> embedding ---> conv ---> max pool ---> rnn ---> lstm cell ---> softmax

텍스트 데이터를 batch iter 형식으로 바꾸는 방법은 앞에서 언급한 블로그 글에서 이미 다루었으므로, 여기서는 cnn-rnn 텍스트 분류에 관한 코드만 아래에 첨부한다.
기본 구성:
  class TCNNRNNConfig(object):
      """Hyperparameters for the CNN-RNN text classifier.

      All values are class attributes, so they can be read from the class
      itself or overridden per instance (the run script overrides
      ``vocab_size`` after loading the vocabulary).
      """

      embedding_dim = 64  # width of each embedding vector / convolution filter
      seq_length = 300  # number of token ids per input sequence
      num_classes = 2  # number of output classes
      num_filters = 256  # output channels of every convolution branch
      kernel_size = 5  # NOTE: not read by the model code in this article
      vocab_size = 130000  # number of rows in the embedding table
      max_pool_size = 4  # window and stride of the max-pool over the sequence axis
      hidden_dim = 128  # NOTE: not read by the model code in this article
      dropout_keep_prob = 0.8  # keep probability fed during training
      learning_rate = 1e-3  # Adam learning rate
      hidden_unit = 256  # number of units in the recurrent cell
      batch_size = 128  # examples per training batch
      num_epochs = 20  # passes over the training data
      print_per_batch = 100  # print training metrics every N batches
      multi_kernel_size = '3,4,5'  # comma-separated convolution filter heights
      l2_reg_lambda = 0.0  # weight of the L2 penalty added to the loss

모델 코드:
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import tensorflow as tf
  4. import numpy as np
  class TextCnnRnn(object):
      """CNN + RNN text classifier expressed as a TensorFlow 1.x graph.

      Constructing an instance builds the whole graph: placeholders, the
      embedding lookup, one convolution + max-pool branch per filter size,
      a GRU over the pooled features, a linear scoring layer, the loss, the
      Adam training op and the accuracy metrics.

      Placeholders to feed:
          input_x:   int32   [batch, config.seq_length] token ids.
          input_y:   float32 [batch, config.num_classes] labels.
          keep_prob: float32 dropout keep probability.
          pad:       float32 [batch, 1, config.embedding_dim, 1] padding row
                     (the run script feeds zeros).
          real_len:  int32   [batch] number of valid steps after pooling.

      Graph outputs: ``scores``, ``pred_y``, ``predictions``, ``loss``,
      ``optim``, ``acc`` and ``num_correct``.
      """

      def __init__(self, config):
          """Create the placeholders and build the graph from ``config``."""
          self.config = config
          self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name="input_x")
          # NOTE: "inpyt_y" looks like a typo for "input_y"; it is kept because
          # renaming the tensor would change a node name in exported graphs.
          self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name="inpyt_y")
          self.keep_prob = tf.placeholder(tf.float32, None, name='keep_prob')
          self.pad = tf.placeholder(tf.float32, [None, 1, self.config.embedding_dim, 1], name='pad')
          self.l2_loss = tf.constant(0.0)
          self.real_len = tf.placeholder(tf.int32, [None], name='real_len')
          self.filter_sizes = list(map(int, self.config.multi_kernel_size.split(",")))
          self.cnnrnn()

      def input_embedding(self):
          """Look up embeddings for ``input_x`` and append a channel axis."""
          with tf.device('/cpu:0'):
              embedding = tf.get_variable(
                  "embedding", [self.config.vocab_size, self.config.embedding_dim])
              _input = tf.nn.embedding_lookup(embedding, self.input_x)
          # conv2d expects a trailing channel dimension.
          _input_expanded = tf.expand_dims(_input, -1)
          return _input_expanded

      def cnnrnn(self):
          """Build the conv/pool branches, the GRU, the scores, loss and metrics."""
          emb = self.input_embedding()
          pooled_concat = []
          # Sequence length after a SAME max-pool with stride max_pool_size.
          reduced = int(np.ceil(self.config.seq_length * 1.0 / self.config.max_pool_size))

          for filter_size in self.filter_sizes:
              with tf.name_scope('conv-maxpool-%s' % filter_size):
                  # Pad (filter_size - 1) rows around the sequence so that the
                  # VALID convolution below still yields seq_length positions.
                  num_prio = (filter_size - 1) // 2
                  num_post = (filter_size - 1) - num_prio
                  # A single concat also works when one side needs no padding.
                  pieces = [self.pad] * num_prio + [emb] + [self.pad] * num_post
                  emb_pad = tf.concat(pieces, 1)

                  filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
                  W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                  b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name='b')
                  conv = tf.nn.conv2d(emb_pad, W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                  h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                  # Max-pool over the sequence axis.
                  pooled = tf.nn.max_pool(
                      h,
                      ksize=[1, self.config.max_pool_size, 1, 1],
                      strides=[1, self.config.max_pool_size, 1, 1],
                      padding='SAME',
                      name='pool')
                  pooled = tf.reshape(pooled, [-1, reduced, self.config.num_filters])
                  pooled_concat.append(pooled)

          # Concatenate the branches along the feature axis.
          pooled_concat = tf.concat(pooled_concat, 2)
          pooled_concat = tf.nn.dropout(pooled_concat, self.keep_prob)

          # The variable is called lstm_cell for historical reasons; it is a GRU.
          lstm_cell = tf.contrib.rnn.GRUCell(num_units=self.config.hidden_unit)
          lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)

          # Use the runtime batch size instead of config.batch_size so that a
          # shorter final batch or single-example inference also works.
          batch_size = tf.shape(self.input_x)[0]
          self._initial_state = lstm_cell.zero_state(batch_size, tf.float32)

          # static_rnn takes a Python list with one [batch, features] tensor per step.
          inputs = [
              tf.squeeze(input_, [1])
              for input_ in tf.split(pooled_concat, num_or_size_splits=reduced, axis=1)
          ]
          outputs, state = tf.nn.static_rnn(
              lstm_cell, inputs, initial_state=self._initial_state,
              sequence_length=self.real_len)

          # Select, per example, the output of its last valid step: keep the
          # running value once the step index passes real_len, otherwise take
          # the newer output.
          output = outputs[0]
          with tf.variable_scope('Output'):
              tf.get_variable_scope().reuse_variables()
              one = tf.ones([1, self.config.hidden_unit], tf.float32)
              for step in range(1, len(outputs)):
                  ind = self.real_len < (step + 1)
                  ind = tf.to_float(ind)
                  ind = tf.expand_dims(ind, -1)
                  mat = tf.matmul(ind, one)  # broadcast the mask over hidden units
                  output = tf.add(tf.multiply(output, mat),
                                  tf.multiply(outputs[step], 1.0 - mat))

          with tf.name_scope('score'):
              self.W = tf.Variable(
                  tf.truncated_normal([self.config.hidden_unit, self.config.num_classes], stddev=0.1),
                  name='W')
              b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name='b')
              # Regularise the output layer's own weights; a bare `W` here
              # would refer to the last convolution filter from the loop above.
              self.l2_loss += tf.nn.l2_loss(self.W)
              self.l2_loss += tf.nn.l2_loss(b)
              self.scores = tf.nn.xw_plus_b(output, self.W, b, name='scores')
              self.pred_y = tf.nn.softmax(self.scores, name="pred_y")
              tf.add_to_collection('pred_network', self.pred_y)
              self.predictions = tf.argmax(self.scores, 1, name='predictions')

          with tf.name_scope('loss'):
              # softmax_cross_entropy_with_logits only accepts named arguments.
              losses = tf.nn.softmax_cross_entropy_with_logits(
                  labels=self.input_y, logits=self.scores)
              self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda * self.l2_loss

          with tf.name_scope("optimize"):
              optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
              self.optim = optimizer.minimize(self.loss)

          with tf.name_scope('accuracy'):
              correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
              self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')

          with tf.name_scope('num_correct'):
              correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
              self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))

run 코드:
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. from cnn_rnn_model import TextCnnRnn
  4. from configuration import TCNNRNNConfig
  5. from data_utils_cut import preocess_file,batch_iter
  6. import time
  7. import tensorflow as tf
  8. import os
  9. import numpy as np
  10. from datetime import timedelta
  # Directory holding the training file ("cnn.txt"); the exported graph is written here too.
  trainpath = "/Users/shuubiasahi/Desktop/tensorflow/adx/"
  def run_epoch(cnnrnnmodel=True):
      """Train the CNN-RNN classifier and periodically export a frozen graph.

      Loads the training data from ``trainpath + "cnn.txt"``, builds a
      ``TextCnnRnn`` model, then iterates over the batches: every 5th batch a
      TensorBoard summary is written, every ``config.print_per_batch`` batches
      the training loss/accuracy are printed, and every 500 batches the graph
      is frozen and written to ``trainpath + "graphcnnrnn.model"``.

      Args:
          cnnrnnmodel: must be truthy; only the CNN-RNN model is implemented.

      Raises:
          ValueError: if ``cnnrnnmodel`` is falsy (no other model is available,
              so continuing would fail later with an undefined ``model``).
      """
      if not cnnrnnmodel:
          raise ValueError("run_epoch only supports the CNN-RNN model (cnnrnnmodel=True)")

      print('Loading data...')
      start_time = time.time()
      x_train, y_train, words = preocess_file(data_path=trainpath + "cnn.txt")

      print('Using CNNRNN model...')
      config = TCNNRNNConfig()
      config.vocab_size = len(words)
      print("vocab_size is:", config.vocab_size)
      model = TextCnnRnn(config)
      tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/boardlog'

      time_dif = timedelta(seconds=int(round(time.time() - start_time)))
      print('Time usage:', time_dif)

      print('Constructing TensorFlow Graph...')
      session = tf.Session()
      # Variables must be initialised before any other op is run.
      session.run(tf.global_variables_initializer())
      saver = tf.train.Saver()  # NOTE: not used below; no checkpoint is written

      # TensorBoard summaries.
      tf.summary.scalar("loss", model.loss)
      tf.summary.scalar("accuracy", model.acc)
      if not os.path.exists(tensorboard_dir):
          os.makedirs(tensorboard_dir)
      merged_summary = tf.summary.merge_all()
      writer = tf.summary.FileWriter(tensorboard_dir)
      writer.add_graph(session.graph)

      print('Generating batch...')
      batch_train = batch_iter(list(zip(x_train, y_train)),
                               config.batch_size, config.num_epochs)

      def real_len(batches):
          """Return, per sequence, its length after max-pooling.

          The length is the index of the first minimum value in the row,
          divided by ``config.max_pool_size`` and rounded up. Assumes token
          ids are non-negative and 0 marks trailing padding -- TODO confirm
          against ``preocess_file``.
          """
          lengths = []
          for row in batches:
              # Append a sentinel 0 so a row without padding yields its full
              # length; `row + [0]` would be an element-wise add on numpy rows.
              first_zero = int(np.argmin(np.append(np.asarray(row), 0)))
              lengths.append(int(np.ceil(first_zero * 1.0 / config.max_pool_size)))
          return lengths

      def feed_data(batch):
          """Build the feed dict for one batch; returns (feed_dict, batch_len)."""
          x_batch, y_batch = zip(*batch)
          feed_dict = {
              model.input_x: x_batch,
              model.input_y: y_batch,
              model.real_len: real_len(x_batch),
              # The padding rows are needed by every run of the model, so they
              # are fed here rather than only in the training loop.
              model.pad: np.zeros([len(x_batch), 1, config.embedding_dim, 1]),
          }
          return feed_dict, len(x_batch)

      def evaluate(x_, y_):
          """Return the example-weighted mean loss and accuracy over (x_, y_).

          The data is evaluated batch by batch to avoid running out of memory.
          """
          batch_eval = batch_iter(list(zip(x_, y_)), config.batch_size, 1)
          total_loss = 0.0
          total_acc = 0.0
          cnt = 0
          for batch in batch_eval:
              feed_dict, cur_batch_len = feed_data(batch)
              feed_dict[model.keep_prob] = 1.0  # no dropout during evaluation
              loss, acc = session.run([model.loss, model.acc],
                                      feed_dict=feed_dict)
              total_loss += loss * cur_batch_len
              total_acc += acc * cur_batch_len
              cnt += cur_batch_len
          return total_loss / cnt, total_acc / cnt

      print('Training and evaluating...')
      start_time = time.time()
      print_per_batch = config.print_per_batch
      try:
          for i, batch in enumerate(batch_train):
              feed_dict, _ = feed_data(batch)
              feed_dict[model.keep_prob] = config.dropout_keep_prob

              if i % 5 == 0:  # write a TensorBoard scalar every 5 batches
                  s = session.run(merged_summary, feed_dict=feed_dict)
                  writer.add_summary(s, i)

              if i % print_per_batch == print_per_batch - 1:
                  # Report the metrics of the current training batch.
                  loss_train, acc_train = session.run([model.loss, model.acc],
                                                      feed_dict=feed_dict)
                  time_dif = timedelta(seconds=int(round(time.time() - start_time)))
                  msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                        + ' Time: {3}'
                  print(msg.format(i + 1, loss_train, acc_train, time_dif))

              if i % 500 == 0 and i > 0:
                  # Freeze the variables and export the inference graph.
                  graph = tf.graph_util.convert_variables_to_constants(
                      session, session.graph_def,
                      ["keep_prob", "real_len", "pad", "input_x", "score/pred_y"])
                  tf.train.write_graph(graph, ".", trainpath + "graphcnnrnn.model",
                                       as_text=False)
                  print(" {0} ".format(i))

              session.run(model.optim, feed_dict=feed_dict)  # one optimisation step
      finally:
          # Flush pending summaries and release the session even on failure.
          writer.close()
          session.close()
  # Start training only when the file is executed as a script.
  if __name__ == '__main__':
      run_epoch()

간단한 결과 분석:
  1. Using CNNRNN model...
  2. vocab_size is: 160238
  3. Time usage: 0:00:35
  4. Constructing TensorFlow Graph...
  5. 2017-10-30 23:22:18.426329: W tensorflow/core/platform/] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
  6. 2017-10-30 23:22:18.426342: W tensorflow/core/platform/] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
  7. 2017-10-30 23:22:18.426346: W tensorflow/core/platform/] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
  8. 2017-10-30 23:22:18.426351: W tensorflow/core/platform/] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
  9. Generating batch...
  10. Training and evaluating...
  11. Iter: 100, Train Loss: 0.66, Train Acc: 71.09%, Time: 0:02:47
  12. Iter: 200, Train Loss: 0.65, Train Acc: 61.72%, Time: 0:05:38

수백 스텝을 반복 학습해 보았지만, 단순히 cnn이나 bi-lstm만 사용했을 때보다 실제 효과가 훨씬 떨어진다. 텍스트 자체의 특징이 이미 뚜렷해서, 이런 결합 모델을 사용하면 오히려 효과가 떨어지는 것일 수 있다. cnn은 일종의 슈퍼 n-gram에 해당하고, bi-lstm은 텍스트의 앞뒤 문맥 정보를 양방향으로 포착하여 출력한다. 예전에 GitHub에서 다른 사람이 텍스트 분류를 한 것을 보았는데, cnn과 bi-lstm 이 두 가지의 효과가 가장 좋았다. 컴퓨터 성능 때문에 많은 스텝을 반복하지 못했으므로, 나중에 gpu로 다시 시험해 볼 생각이다.

