NLP Basic Task 2: Text Classification with Deep Learning
Title: Learn PyTorch and use it to rewrite Task 1, implementing text classification with a CNN and an RNN.
Code:
Note: the code does not strictly follow the requirements (it uses Keras rather than PyTorch).
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pylab as plt
%matplotlib inline
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0" # make only GPU 0 visible
# The data files are tab-separated, hence delimiter='\t'
df_train = pd.read_csv(r'sentiment-analysis-on-movie-reviews/train.tsv',delimiter='\t')
df_test = pd.read_csv(r'sentiment-analysis-on-movie-reviews/test.tsv',delimiter='\t')
df_train.head()
# Build a bag-of-words representation with sklearn's CountVectorizer.
word_vectorizer = CountVectorizer(ngram_range = (1,1),analyzer = 'word',stop_words = 'english',min_df = 0.001) # unigrams, English stop words removed, keep words appearing in at least 0.1% of documents
sparse_matrix = word_vectorizer.fit_transform(df_train['Phrase']) # sparse document-term matrix
# print(sparse_matrix)
# Sample entries, as (document, word_index)  count:
# (0, 480) 1
# (0, 352) 1
# (0, 222) 2
# (0, 451) 1
# (1, 222) 1
# (1, 451) 1
# (2, 451) 1
# print(sum(sparse_matrix))
# Summing over rows gives each word's total count across the corpus:
# (0, 570) 161
# (0, 28) 213
# ...
# print(sparse_matrix.shape) # (156060, 587)
frequency = sum(sparse_matrix).toarray()[0] # toarray() returns [[ 179 204 176 ... ]], so take [0] to get a 1-D array
# print(len(frequency)) #587
# print(frequency)
freq = pd.DataFrame(frequency,index = word_vectorizer.get_feature_names(),columns = ['frequency'])
freq.sort_values('frequency',ascending = False)
# The frequency distribution is heavily skewed toward a few common words; a log-scale plot would show the long tail more clearly.
a = df_train.Sentiment.value_counts() # counts of each sentiment label (0-4)
# a.plot(kind = 'bar') # pandas' built-in bar plot
# print(a.index)
# print(a.values)
plt.bar(a.index,a.values)
# Alternative: a seaborn bar plot of the same distribution
# a = pd.DataFrame(a)
# a['Rating'] = a.index
# sns.set_style("darkgrid", {"axes.facecolor": ".9"})
# fig, ax = plt.subplots(figsize=(10,6))
# sns.barplot(y='Sentiment', x='Rating', data=a)
# Clean the text: lowercase everything and strip non-alphanumeric characters.
import re
df_train['Phrase'] = df_train['Phrase'].str.lower()
df_train['Phrase'] = df_train['Phrase'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x))
df_test['Phrase'] = df_test['Phrase'].str.lower()
df_test['Phrase'] = df_test['Phrase'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x)) # \s must stay in the pattern, otherwise every space is stripped from the test phrases
# print(df_train['Phrase'])
X_train = df_train.Phrase
y_train = df_train.Sentiment
# Tokenize: map each word to an integer index.
from keras.preprocessing.text import Tokenizer # https://blog.csdn.net/lovebyz/article/details/77712003
tokenizer = Tokenizer()
# print(X_train)
# Sample contents:
# 0 a series of escapades demonstrating the adage ...
# 1 a series of escapades demonstrating the adage ...
# 2 a series
# 3 a
# 4 series
tokenizer.fit_on_texts(X_train.values) # build the vocabulary from the training texts
X_train = tokenizer.texts_to_sequences(X_train) # convert each phrase into a list of word indices
# print(len(X_train)) #156060
# print(X_train[0]) #[2, 304, 3, 15110, 5906, 1, 6499, 9, 51, 8, 49, 13, 1, 3514, 8, 167, 49, 13, 1, 11381, 62, 3, 75, 615, 10453, 19, 576, 3, 75, 2003, 5, 54, 3, 2, 40]
# print(len(X_train[0])) #35
# print(len(X_train[1])) #14
# print(len(X_train[2])) #2
X_test = df_test.Phrase
X_test = tokenizer.texts_to_sequences(X_test)
# Pad all sequences to a uniform length.
from keras.preprocessing.sequence import pad_sequences
max_length = max([len(x.split()) for x in df_train['Phrase']])
# print(max_length) #48
X_train = pad_sequences(X_train,max_length)
X_test = pad_sequences(X_test,max_length)
# print(X_train.shape) #(156060, 48)
# print(X_test.shape) #(66292, 48)
# Build and train an LSTM classifier.
from keras import Sequential
from keras.layers import Embedding,LSTM,Dense
EMBEDDING_DIM = 128
dict_len = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(dict_len,EMBEDDING_DIM,input_length = max_length)) # https://blog.csdn.net/jiangpeng59/article/details/77533309
model.add(LSTM(units = 128,dropout = 0.2,recurrent_dropout = 0.2)) # dropout acts on the input-to-hidden connections, recurrent_dropout on the hidden-to-hidden connections
model.add(Dense(5,activation = 'softmax'))
model.compile(loss = 'sparse_categorical_crossentropy',optimizer= 'adam',metrics= ['accuracy'])
# print(model.summary())
# Layer (type) Output Shape Param #
# =================================================================
# embedding_4 (Embedding) (None, 48, 128) 2099712
# _________________________________________________________________
# lstm_3 (LSTM) (None, 128) 131584
# _________________________________________________________________
# dense_3 (Dense) (None, 5) 645
# =================================================================
# Total params: 2,231,941
# Trainable params: 2,231,941
# Non-trainable params: 0
# _________________________________________________________________
# None
model.fit(X_train,y_train,batch_size= 128,epochs= 7,verbose= 1)
# Epoch 6/7
# 156060/156060 [==============================] - 101s 650us/step - loss: 0.5748 - acc: 0.7544
# Epoch 7/7
# 156060/156060 [==============================] - 101s 644us/step - loss: 0.5448 - acc: 0.7645
# Predict on the test set and write the submission file.
y_test_pred = model.predict_classes(X_test)
final_pred = pd.read_csv(r'sentiment-analysis-on-movie-reviews/sampleSubmission.csv', sep=',')
final_pred.Sentiment = y_test_pred
final_pred.to_csv(r'results.csv', sep=',', index=False)
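Note that the accuracies printed above are measured on the training data. As a quick sanity check (a sketch, not part of the original code), Keras' validation_split argument can hold out part of the training set and report per-epoch validation metrics:

# Hold out 10% of the training data to estimate generalization (sketch).
model.fit(X_train, y_train, batch_size=128, epochs=7, verbose=1,
          validation_split=0.1)  # Keras prints val_loss / val_acc each epoch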
# CNN: a 1-D convolutional model for comparison
from keras.layers import Conv1D,Dropout,MaxPooling1D,Flatten
def build_model():
    model = Sequential()
    model.add(Embedding(dict_len,output_dim=32,input_length = max_length))
    model.add(Conv1D(filters = 32,kernel_size = 3,padding='same',activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(5,activation = 'softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy',optimizer= 'adam',metrics= ['accuracy'])
    model.fit(X_train,y_train,batch_size= 128,epochs= 7,verbose= 1)
    return model
model2 = build_model()
# Epoch 6/7
# 156060/156060 [==============================] - 7s 45us/step - loss: 0.6345 - acc: 0.7340
# Epoch 7/7
# 156060/156060 [==============================] - 7s 43us/step - loss: 0.6068 - acc: 0.7462
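Since the task asks for PyTorch while the code above uses Keras, here is a minimal PyTorch sketch of the same LSTM classifier. It assumes X_train, y_train, and dict_len from the preprocessing above; the class and variable names (LSTMClassifier, loader, model3) are illustrative, and the hyperparameters mirror the Keras model. A CNN variant would replace the nn.LSTM layer with nn.Conv1d plus pooling.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)  # index 0 is the pad token
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        emb = self.embedding(x)                # (batch, seq_len, embed_dim)
        _, (h_n, _) = self.lstm(emb)           # h_n: (1, batch, hidden_dim)
        return self.fc(self.dropout(h_n[-1]))  # logits, shape (batch, num_classes)

model3 = LSTMClassifier(dict_len).to(device)
criterion = nn.CrossEntropyLoss()  # takes raw logits, like sparse_categorical_crossentropy
optimizer = torch.optim.Adam(model3.parameters())

dataset = TensorDataset(torch.as_tensor(X_train, dtype=torch.long),
                        torch.as_tensor(y_train.values, dtype=torch.long))
loader = DataLoader(dataset, batch_size=128, shuffle=True)

for epoch in range(7):
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model3(xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    print(f'epoch {epoch + 1}: train loss {total_loss / len(dataset):.4f}')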