NLP Basic Task 2: Text Classification Based on Deep Learning

This post draws on an article by Professor Qiu Xipeng of the School of Computer Science and Technology, Fudan University: https://www.zhihu.com/question/324189960
Task: learn PyTorch, then use it to redo Task 1 and implement text classification with a CNN and an RNN.
  • References
  • https://pytorch.org/
  • Convolutional Neural Networks for Sentence Classification https://arxiv.org/abs/1408.5882
  • https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

  • Ways to initialize the word embeddings:
  • random embedding initialization
  • initialization with pretrained GloVe embeddings https://nlp.stanford.edu/projects/glove/ (see the sketch after this list)
  • Knowledge points:
  • feature extraction with CNN/RNN
  • word embeddings
  • Dropout
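    A minimal sketch of the GloVe-based initialization, for reference (random initialization is what the Keras Embedding layer in the code below already does by default). It assumes a local copy of glove.6B.100d.txt and reuses tokenizer, dict_len, max_length, and model from the code below; treat the path and the dimension as placeholders.

    import numpy as np

    # Load the pretrained vectors into a word -> vector dict
    GLOVE_DIM = 100  # must match the GloVe file used
    embeddings_index = {}
    with open('glove.6B.100d.txt', encoding='utf-8') as f:  # hypothetical local path
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Align the vectors with the tokenizer's word index; words missing
    # from GloVe keep the all-zero row
    embedding_matrix = np.zeros((dict_len, GLOVE_DIM))
    for word, i in tokenizer.word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

    # Use the matrix as the initial weights of the Embedding layer
    model.add(Embedding(dict_len, GLOVE_DIM,
                        weights=[embedding_matrix],
                        input_length=max_length,
                        trainable=False))  # set trainable=True to fine-tune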


  • Code:
    Note: the code does not strictly follow the task requirements (it uses Keras rather than PyTorch; a PyTorch sketch is appended at the end).
    import nltk
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    import matplotlib.pylab as plt
    %matplotlib inline
    
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"   # use GPU 0
    
    # Read the data; the files are tab-separated, so use delimiter='\t'
    df_train = pd.read_csv(r'sentiment-analysis-on-movie-reviews/train.tsv',delimiter='\t')
    df_test = pd.read_csv(r'sentiment-analysis-on-movie-reviews/test.tsv',delimiter='\t')
    df_train.head()
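    # Columns: PhraseId, SentenceId, Phrase, Sentiment (labels 0 = negative ... 4 = positive)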
    
    # Build a CountVectorizer transformer: it tokenizes the phrases and counts word occurrences, producing a sparse document-term matrix.
    word_vectorizer = CountVectorizer(ngram_range = (1,1),analyzer = 'word',stop_words = 'english',min_df = 0.001)  # unigrams only; drop English stop words and words in fewer than 0.1% of documents
    sparse_matrix = word_vectorizer.fit_transform(df_train['Phrase'])  # fit the vocabulary and transform the phrases into count vectors
    
    # print(sparse_matrix)
    # Output format: (document_index, word_index)	count
    # (0, 480)	1
    # (0, 352)	1
    # (0, 222)	2
    # (0, 451)	1
    # (1, 222)	1
    # (1, 451)	1
    # (2, 451)	1
    # print(sum(sparse_matrix))  # sum the counts over all documents
    # Output (all counts collapsed into a single row):
    # (0, 570)	161
    # (0, 28)	213
    
    # Word frequency statistics
    # print(sparse_matrix.shape)  #(156060, 587)
    frequency = sum(sparse_matrix).toarray()[0]  # toarray() gives [[ 179  204  176 ... ]], so take row [0]
    # print(len(frequency))  #587
    # print(frequency)
    freq = pd.DataFrame(frequency,index = word_vectorizer.get_feature_names(),columns = ['frequency'])
    freq.sort_values('frequency',ascending = False)
    
    # Inspect the label distribution; if the classes were severely imbalanced we would need to rebalance them (or at least use a log scale to see the rare classes), but here the skew is moderate.
    a = df_train.Sentiment.value_counts()  # number of samples in each sentiment class
    # a.plot(kind = 'bar')  # pandas' built-in bar plot would work too
    # print(a.index)
    # print(a.values)
    plt.bar(a.index,a.values)
    
    # Alternative: a seaborn bar plot (requires import seaborn as sns)
    # a = pd.DataFrame(a)
    # a['Rating'] = a.index
    # sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    # fig, ax = plt.subplots(figsize=(10,6))
    # sns.barplot(y='Sentiment', x='Rating', data=a)
    
    # Clean the text: lowercase everything and strip non-alphanumeric characters
    import re
    df_train['Phrase'] = df_train['Phrase'].str.lower()
    df_train['Phrase'] = df_train['Phrase'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x))
    df_test['Phrase'] = df_test['Phrase'].str.lower()
    df_test['Phrase'] = df_test['Phrase'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x))  # keep \s here too, otherwise the test phrases lose their spaces
    # print(df_train['Phrase'])
    
    X_train = df_train.Phrase
    y_train = df_train.Sentiment
    
    # Tokenize and build the vocabulary
    from keras.preprocessing.text import Tokenizer  # see https://blog.csdn.net/lovebyz/article/details/77712003
    tokenizer = Tokenizer()
    # print(X_train)
    # Output:
    # 0         a series of escapades demonstrating the adage ...
    # 1         a series of escapades demonstrating the adage ...
    # 2                                                  a series
    # 3                                                         a
    # 4                                                    series
    tokenizer.fit_on_texts(X_train.values)  # build the word index from the training texts; must run before texts_to_sequences
    
    X_train = tokenizer.texts_to_sequences(X_train)  # convert each phrase into a sequence of word ids
    # print(len(X_train))  #156060
    # print(X_train[0])  #[2, 304, 3, 15110, 5906, 1, 6499, 9, 51, 8, 49, 13, 1, 3514, 8, 167, 49, 13, 1, 11381, 62, 3, 75, 615, 10453, 19, 576, 3, 75, 2003, 5, 54, 3, 2, 40]
    # print(len(X_train[0]))  #35
    # print(len(X_train[1]))  #14
    # print(len(X_train[2]))  #2
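    # Ids are assigned by descending frequency starting at 1; from the printout
    # above, 'a' -> 2 and 'of' -> 3, and the sequences have varying lengths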
    X_test = df_test.Phrase
    X_test = tokenizer.texts_to_sequences(X_test)
    
    # The sequences have different lengths, so pad them to a common length
    from keras.preprocessing.sequence import pad_sequences
    max_length = max([len(x.split()) for x in df_train['Phrase']])
    # print(max_length)  #48
    X_train = pad_sequences(X_train,max_length)
    X_test = pad_sequences(X_test,max_length)
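    # pad_sequences pads (and truncates) at the front by default, e.g.
    # [2, 304, 3] -> [0, 0, ..., 0, 2, 304, 3] with total length max_length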
    # print(X_train.shape)  #(156060, 48)
    # print(X_test.shape)  #(66292, 48)
    
    # Build the LSTM model
    from keras import Sequential
    from keras.layers import Embedding,LSTM,Dense
    
    EMBEDDING_DIM = 128
    dict_len = len(tokenizer.word_index) + 1
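    # +1 because the tokenizer's ids start at 1 and id 0 is reserved for padding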
    model = Sequential()
    model.add(Embedding(dict_len,EMBEDDING_DIM,input_length = max_length))  # see https://blog.csdn.net/jiangpeng59/article/details/77533309
    model.add(LSTM(units = 128,dropout = 0.2,recurrent_dropout = 0.2))  # dropout acts on the input-to-hidden connections, recurrent_dropout on the hidden-to-hidden connections
    model.add(Dense(5,activation = 'softmax'))
    model.compile(loss = 'sparse_categorical_crossentropy',optimizer= 'adam',metrics= ['accuracy'])
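    # sparse_categorical_crossentropy takes the integer labels 0-4 directly,
    # so y_train does not need to be one-hot encoded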
    # print(model.summary())
    # Layer (type)                 Output Shape              Param #   
    # =================================================================
    # embedding_4 (Embedding)      (None, 48, 128)           2099712   
    # _________________________________________________________________
    # lstm_3 (LSTM)                (None, 128)               131584    
    # _________________________________________________________________
    # dense_3 (Dense)              (None, 5)                 645       
    # =================================================================
    # Total params: 2,231,941
    # Trainable params: 2,231,941
    # Non-trainable params: 0
    # _________________________________________________________________
    # None
    
    model.fit(X_train,y_train,batch_size= 128,epochs= 7,verbose= 1)
    # Epoch 6/7
    # 156060/156060 [==============================] - 101s 650us/step - loss: 0.5748 - acc: 0.7544
    # Epoch 7/7
    # 156060/156060 [==============================] - 101s 644us/step - loss: 0.5448 - acc: 0.7645
    
    # Predict on the test set and write the submission file
    y_test_pred = model.predict_classes(X_test)
    final_pred = pd.read_csv(r'sentiment-analysis-on-movie-reviews/sampleSubmission.csv', sep=',')
    final_pred.Sentiment = y_test_pred  # fill the Sentiment column with the predicted classes
    final_pred.to_csv(r'results.csv', sep=',', index=False)
    
    # CNN version
    from keras.layers import Conv1D,Dropout,MaxPooling1D,Flatten
    def build_model():
        model = Sequential()
        model.add(Embedding(dict_len,output_dim=32,input_length = max_length))  
        model.add(Conv1D(filters = 32,kernel_size = 3,padding='same',activation='relu'))  
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.2))
        model.add(Flatten())
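        # 'same' convolution keeps the length at 48; pooling halves it to 24,
        # so Flatten yields vectors of size 24 * 32 = 768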
        model.add(Dense(5,activation = 'softmax'))
        model.compile(loss = 'sparse_categorical_crossentropy',optimizer= 'adam',metrics= ['accuracy'])
        model.fit(X_train,y_train,batch_size= 128,epochs= 7,verbose= 1)
        return model
    
    model2 = build_model()
    # Epoch 6/7
    # 156060/156060 [==============================] - 7s 45us/step - loss: 0.6345 - acc: 0.7340
    # Epoch 7/7
    # 156060/156060 [==============================] - 7s 43us/step - loss: 0.6068 - acc: 0.7462
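    Since the task statement asks for PyTorch, here is a minimal PyTorch counterpart of the LSTM model above. It is a sketch, not a tuned implementation, and assumes the padded X_train, the y_train Series, and dict_len from the code above are available.

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    class LSTMClassifier(nn.Module):
        def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=5):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(hidden_dim, num_classes)

        def forward(self, x):                       # x: (batch, seq_len) of word ids
            emb = self.embedding(x)                 # (batch, seq_len, embed_dim)
            _, (h_n, _) = self.lstm(emb)            # h_n: (1, batch, hidden_dim)
            return self.fc(self.dropout(h_n[-1]))   # logits: (batch, num_classes)

    net = LSTMClassifier(vocab_size=dict_len)
    criterion = nn.CrossEntropyLoss()  # integer targets, like sparse_categorical_crossentropy
    optimizer = torch.optim.Adam(net.parameters())

    loader = DataLoader(TensorDataset(torch.tensor(X_train, dtype=torch.long),
                                      torch.tensor(y_train.values, dtype=torch.long)),
                        batch_size=128, shuffle=True)
    for epoch in range(7):
        for xb, yb in loader:
            optimizer.zero_grad()
            loss = criterion(net(xb), yb)
            loss.backward()
            optimizer.step()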
