Combining CNN and RNN for Text Classification
https://github.com/jiegzhan/multi-class-text-classification-cnn-rnn
The main idea of combining a CNN and an RNN for text classification is the following pipeline:
data ---> batch_iter ---> embedding (CNN input) ---> convolution ---> max-pooling ---> RNN (LSTM/GRU cell) ---> softmax
A previous post covered how to turn text data into the batch_iter format; the CNN-RNN text classification code is pasted below.
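The data_utils_cut module is not reproduced here. As a rough reference, a minimal batch_iter with the interface the run code expects (the shuffling behavior is my assumption) might look like this:
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Minimal sketch: yields lists of (x, y) pairs, batch_size at a time,
    # for num_epochs passes over the data.
    data = list(data)
    num_batches = int(np.ceil(len(data) / float(batch_size)))
    for epoch in range(num_epochs):
        order = np.random.permutation(len(data)) if shuffle else range(len(data))
        shuffled = [data[j] for j in order]
        for b in range(num_batches):
            yield shuffled[b * batch_size:(b + 1) * batch_size]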
Basic configuration:
class TCNNRNNConfig(object):
    # CNN-RNN model configuration
    embedding_dim = 64          # word embedding dimension
    seq_length = 300            # sequence length
    num_classes = 2             # number of classes
    num_filters = 256           # number of convolution filters
    kernel_size = 5             # convolution kernel size
    vocab_size = 130000         # vocabulary size
    max_pool_size = 4           # max-pooling window size
    hidden_dim = 128            # hidden layer size (not used by this model)
    dropout_keep_prob = 0.8     # dropout keep probability
    learning_rate = 1e-3        # learning rate
    hidden_unit = 256           # number of LSTM/GRU hidden units
    batch_size = 128            # batch size
    num_epochs = 20             # number of epochs
    print_per_batch = 100       # report progress every N batches
    multi_kernel_size = '3,4,5' # parallel convolution kernel sizes
    l2_reg_lambda = 0.0         # L2 regularization weight
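Note that seq_length and max_pool_size together fix the number of RNN time steps: after max-pooling with stride max_pool_size, each sequence of 300 positions is reduced to ceil(300 / 4) = 75 pooled steps. A quick sanity check of that arithmetic:
import numpy as np

config = TCNNRNNConfig()
# Number of time steps the RNN will see after max-pooling
reduced = int(np.ceil(config.seq_length * 1.0 / config.max_pool_size))
print(reduced)  # 75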
Model code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
class TextCnnRnn(object):
    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name="input_y")
        self.keep_prob = tf.placeholder(tf.float32, None, name='keep_prob')
        self.pad = tf.placeholder(tf.float32, [None, 1, self.config.embedding_dim, 1], name='pad')
        self.l2_loss = tf.constant(0.0)
        self.real_len = tf.placeholder(tf.int32, [None], name='real_len')
        self.filter_sizes = list(map(int, self.config.multi_kernel_size.split(",")))
        self.cnnrnn()
    def input_embedding(self):
        """Map word ids to embeddings and add a channel dimension for conv2d."""
        with tf.device('/cpu:0'):
            embedding = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
            _input = tf.nn.embedding_lookup(embedding, self.input_x)
            _input_expanded = tf.expand_dims(_input, -1)
        return _input_expanded
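    # Shape note: input_embedding() returns [batch, seq_length, embedding_dim, 1];
    # the trailing channel axis is what tf.nn.conv2d expects, and the `pad`
    # placeholder's shape [batch, 1, embedding_dim, 1] matches it so padding
    # rows can be concatenated along the sequence axis (axis 1) below.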
    def cnnrnn(self):
        emb = self.input_embedding()
        pooled_concat = []
        # Number of time steps left after max-pooling
        reduced = np.int32(np.ceil(self.config.seq_length * 1.0 / self.config.max_pool_size))
        for i, filter_size in enumerate(self.filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # Zero-pad so that the convolution output has dimension
                # batch x sequence_length x emb_size x channel
                num_prio = (filter_size - 1) // 2
                num_post = (filter_size - 1) - num_prio
                pad_prio = tf.concat([self.pad] * num_prio, 1)
                pad_post = tf.concat([self.pad] * num_post, 1)
                emb_pad = tf.concat([pad_prio, emb, pad_post], 1)
                filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name='b')
                conv = tf.nn.conv2d(emb_pad, W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                # Max-pooling over the outputs
                pooled = tf.nn.max_pool(h, ksize=[1, self.config.max_pool_size, 1, 1],
                                        strides=[1, self.config.max_pool_size, 1, 1],
                                        padding='SAME', name='pool')
                pooled = tf.reshape(pooled, [-1, reduced, self.config.num_filters])
                pooled_concat.append(pooled)
        pooled_concat = tf.concat(pooled_concat, 2)
        pooled_concat = tf.nn.dropout(pooled_concat, self.keep_prob)

        # lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.config.hidden_unit)
        lstm_cell = tf.contrib.rnn.GRUCell(num_units=self.config.hidden_unit)
        lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)
        self._initial_state = lstm_cell.zero_state(self.config.batch_size, tf.float32)

        # Split the pooled features into `reduced` time steps for the RNN
        inputs = [tf.squeeze(input_, [1])
                  for input_ in tf.split(pooled_concat, num_or_size_splits=int(reduced), axis=1)]
        outputs, state = tf.nn.static_rnn(lstm_cell, inputs, self._initial_state,
                                          sequence_length=self.real_len)

        # Collect the appropriate last output for each sequence
        # (dimension = batch x hidden_unit)
        output = outputs[0]
        with tf.variable_scope('Output'):
            tf.get_variable_scope().reuse_variables()
            one = tf.ones([1, self.config.hidden_unit], tf.float32)
            for i in range(1, len(outputs)):
                ind = self.real_len < (i + 1)
                ind = tf.to_float(ind)
                ind = tf.expand_dims(ind, -1)
                mat = tf.matmul(ind, one)
                output = tf.add(tf.multiply(output, mat), tf.multiply(outputs[i], 1.0 - mat))

        with tf.name_scope('score'):
            self.W = tf.Variable(tf.truncated_normal([self.config.hidden_unit, self.config.num_classes],
                                                     stddev=0.1), name='W')
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name='b')
            self.l2_loss += tf.nn.l2_loss(self.W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(output, self.W, b, name='scores')
            self.pred_y = tf.nn.softmax(self.scores, name="pred_y")
            tf.add_to_collection('pred_network', self.pred_y)
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                             logits=self.scores)  # only named arguments accepted
            self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda * self.l2_loss

        with tf.name_scope("optimize"):
            # Adam optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
            self.optim = optimizer.minimize(self.loss)

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')

        with tf.name_scope('num_correct'):
            correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))
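The loop in the 'Output' scope deserves a note: static_rnn zero-fills outputs past each sequence's real length, so the loop keeps, for every example, the output at its last real time step. The same masking arithmetic in plain NumPy (shapes reduced for illustration; the toy values are mine):
import numpy as np

# Two sequences with real lengths 2 and 3; hidden size 2.
outputs = [np.array([[1., 1.], [10., 10.]]),   # step 0
           np.array([[2., 2.], [20., 20.]]),   # step 1 (last real step of seq 0)
           np.array([[0., 0.], [30., 30.]])]   # step 2 (last real step of seq 1)
real_len = np.array([2, 3])

output = outputs[0]
for i in range(1, len(outputs)):
    # mat is 1 where the sequence has already ended, 0 where it is still running
    mat = (real_len < (i + 1)).astype(np.float32)[:, None] * np.ones((1, 2))
    output = output * mat + outputs[i] * (1.0 - mat)

print(output)  # [[ 2.  2.] [30. 30.]] -- the last real output of each sequence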
Run code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from cnn_rnn_model import TextCnnRnn
from configuration import TCNNRNNConfig
from data_utils_cut import preocess_file,batch_iter
import time
import tensorflow as tf
import os
import numpy as np
from datetime import timedelta
trainpath="/Users/shuubiasahi/Desktop/tensorflow/adx/"
def run_epoch(cnnrnnmodel=True):
    # Load data
    print('Loading data...')
    start_time = time.time()
    x_train, y_train, words = preocess_file(data_path=trainpath + "cnn.txt")

    if cnnrnnmodel:
        print('Using CNNRNN model...')
        config = TCNNRNNConfig()
        config.vocab_size = len(words)
        print("vocab_size is:", config.vocab_size)
        model = TextCnnRnn(config)
        tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/boardlog'

    end_time = time.time()
    time_dif = end_time - start_time
    time_dif = timedelta(seconds=int(round(time_dif)))
    print('Time usage:', time_dif)

    print('Constructing TensorFlow Graph...')
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    # Configure TensorBoard
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    writer.add_graph(session.graph)

    # Generate batches
    print('Generating batch...')
    batch_train = batch_iter(list(zip(x_train, y_train)),
                             config.batch_size, config.num_epochs)

    def feed_data(batch):
        """Build the feed dict for one batch."""
        x_batch, y_batch = zip(*batch)
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.real_len: real_len(x_batch)
        }
        return feed_dict, len(x_batch)

    def real_len(batches):
        # True (unpadded) length of each sequence, in pooled time steps:
        # argmin finds the first padding zero in the id sequence.
        return [np.ceil(np.argmin(batch + [0]) * 1.0 / config.max_pool_size) for batch in batches]

    def evaluate(x_, y_):
        """Evaluate in batches so large inputs do not cause an OOM."""
        batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
        total_loss = 0.0
        total_acc = 0.0
        cnt = 0
        for batch in batch_eval:
            feed_dict, cur_batch_len = feed_data(batch)
            feed_dict[model.keep_prob] = 1.0
            loss, acc = session.run([model.loss, model.acc],
                                    feed_dict=feed_dict)
            total_loss += loss * cur_batch_len
            total_acc += acc * cur_batch_len
            cnt += cur_batch_len
        return total_loss / cnt, total_acc / cnt

    # Train and evaluate
    print('Training and evaluating...')
    start_time = time.time()
    print_per_batch = config.print_per_batch

    for i, batch in enumerate(batch_train):
        feed_dict, lenbatch = feed_data(batch)
        feed_dict[model.keep_prob] = config.dropout_keep_prob
        feed_dict[model.pad] = np.zeros([lenbatch, 1, config.embedding_dim, 1])

        if i % 5 == 0:  # write TensorBoard scalars every 5 batches
            s = session.run(merged_summary, feed_dict=feed_dict)
            writer.add_summary(s, i)

        if i % print_per_batch == print_per_batch - 1:  # report every print_per_batch batches
            loss_train, acc_train = session.run([model.loss, model.acc],
                                                feed_dict=feed_dict)
            # loss, acc = evaluate(x_val, y_val)

            # Elapsed time
            end_time = time.time()
            time_dif = end_time - start_time
            time_dif = timedelta(seconds=int(round(time_dif)))

            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, Time: {3}'
            print(msg.format(i + 1, loss_train, acc_train, time_dif))

        if i % 500 == 0 and i > 0:  # export the frozen graph every 500 batches
            graph = tf.graph_util.convert_variables_to_constants(
                session, session.graph_def,
                ["keep_prob", "real_len", "pad", "input_x", "score/pred_y"])
            if cnnrnnmodel:
                tf.train.write_graph(graph, ".", trainpath + "graphcnnrnn.model",
                                     as_text=False)
            print("Saved frozen graph at step {0}".format(i))

        session.run(model.optim, feed_dict=feed_dict)  # run the optimizer

    # Done
    session.close()

if __name__ == '__main__':
    run_epoch()
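The frozen graph written every 500 steps can later be loaded for inference. A minimal sketch (the node names and path follow the export call above; x_test and real_lens stand in for your own padded id sequences and pooled real lengths):
import numpy as np
import tensorflow as tf

# Load the frozen graph exported by the training loop above.
with tf.gfile.GFile(trainpath + "graphcnnrnn.model", "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as g:
    tf.import_graph_def(graph_def, name="")
    with tf.Session(graph=g) as sess:
        pred_y = g.get_tensor_by_name("score/pred_y:0")
        probs = sess.run(pred_y, feed_dict={
            g.get_tensor_by_name("input_x:0"): x_test,       # padded id sequences
            g.get_tensor_by_name("keep_prob:0"): 1.0,        # no dropout at inference
            g.get_tensor_by_name("pad:0"): np.zeros([len(x_test), 1, 64, 1]),
            g.get_tensor_by_name("real_len:0"): real_lens,   # pooled real lengths
        })
One caveat: because zero_state is built with config.batch_size, the inference batch must contain exactly batch_size examples unless the model is changed to take a dynamic batch size.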
A brief look at the results:
Using CNNRNN model...
vocab_size is: 160238
Time usage: 0:00:35
Constructing TensorFlow Graph...
2017-10-30 23:22:18.426329: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426342: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426346: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426351: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
Generating batch...
Training and evaluating...
Iter: 100, Train Loss: 0.66, Train Acc: 71.09%, Time: 0:02:47
Iter: 200, Train Loss: 0.65, Train Acc: 61.72%, Time: 0:05:38
After a few hundred iterations, the combined model actually performs noticeably worse than simply using a CNN or a bi-LSTM on its own. The features of this text may already be distinctive enough that stacking the two hurts rather than helps: a CNN acts like a super n-gram, while a bi-LSTM captures contextual information in both directions. I have seen others on GitHub get their best text classification results with CNN + bi-LSTM combinations, though. Because of my machine I did not run many more iterations; I will try it on a GPU sometime.