nltk - Comparing Chinese Document Similarity - Complete Example

nltk can handle Chinese text as well. You only need the following changes:
  • Use a Chinese word segmenter (for example, I chose jieba)
  • Handle the encoding of Chinese characters with one universal encoding scheme
  • Unify the Python source-file encoding as gbk
  • Use a corpus that supports Chinese
  • The code follows (after a short jieba sanity check below); it needs jieba installed
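
Before the full listing, here is a quick sanity check that jieba behaves as expected -- a minimal sketch, assuming jieba is installed (easy_install jieba) and the file is saved in the encoding it declares; the sample sentence is jieba's own stock example:

    #!/usr/bin/env python
    #-*-coding=gbk-*-
    import jieba
    import jieba.analyse

    # word segmentation: jieba.cut returns a generator of unicode tokens
    print '/'.join(jieba.cut(u'我来到北京清华大学'))   # -> 我/来到/北京/清华大学

    # keyword extraction: the topK most significant terms of a piece of text;
    # the preprocessing below relies on this same call
    print '/'.join(jieba.analyse.extract_tags(u'我来到北京清华大学', 5))

The full listing: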
    #!/usr/bin/env python
    #-*-coding=gbk-*-
     
    """
             ,      
    """
    # abridged version of courses; real data has the format course_name\tcourse_intro\tcourse_details, with html and other noise already stripped
    courses = [           
                u'Writing II: Rhetorical Composing',
                u'Genetics and Society: A Course for Educators',
                u'General Game Playing',
                u'Genes and the Human Condition (From Behavior to Biotechnology)',
                u'A Brief History of Humankind',
                u'New Models of Business in Society',
                u'Analyse Numérique pour Ingénieurs',
                u'Evolution: A Course for Educators',
                u'Coding the Matrix: Linear Algebra through Computer Science Applications',
                u'The Dynamic Earth: A Course for Educators',
                u'Tiny Wings\tYou have always dreamed of flying - but your wings are tiny. Luckily the world is full of beautiful hills. Use the hills as jumps - slide down, flap your wings and fly! At least for a moment - until this annoying gravity brings you back down to earth. But the next hill is waiting for you already. Watch out for the night and fly as fast as you can. ',
                u'Angry Birds Free',
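                # (the Chinese-titled entries in this list were garbled in transcoding)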
                u'  \t    ',
                u'  \t    ',
                u'  \t    ',
                u'  \t     ',
                u'  ',
                u'    ',
                u'   ',
                u'      ',
                u'Angry Birds Stella',
                u'Flappy Wings - FREE\tFly into freedom!A parody of the #1 smash hit game!',
                u'    ',
                u'    2',
               ]
     
    # keep the course names around so results can be printed by name;
    # with full "name\tintro\tdetails" records this would be: courses_name = [course.split('\t')[0] for course in courses]
    courses_name = courses
     
     
    """
           (easy_install nltk)
    """
    def pre_process_cn(courses, low_freq_filter = True):
        """
               +      
            1.     
            2.      
            3.     
            4.     
     
        """
        import nltk
        import jieba.analyse
        from nltk.tokenize import word_tokenize
        
        texts_tokenized = []
        for document in courses:
            texts_tokenized_tmp = []
            # nltk splits each document on whitespace/punctuation; jieba then
            # extracts up to 10 keyword tags from every chunk, which also
            # segments any embedded Chinese text
            for word in word_tokenize(document):
                texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
            texts_tokenized.append(texts_tokenized_tmp)
        
        texts_filtered_stopwords = texts_tokenized   # stopword removal is skipped in this simplified version
     
        # remove punctuation (note: only ASCII punctuation; full-width Chinese
        # punctuation such as '，' or '。' would need to be added to this list)
        english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
        texts_filtered = [[word for word in document if not word in english_punctuations] for document in texts_filtered_stopwords]
     
        # stemming
        from nltk.stem.lancaster import LancasterStemmer
        st = LancasterStemmer()
        texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]
        
        # drop words that appear only once in the whole corpus
        if low_freq_filter:
            all_stems = sum(texts_stemmed, [])
            stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
            texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
        else:
            texts = texts_stemmed
        return texts
     
    lib_texts = pre_process_cn(courses)
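    # lib_texts is now one token list per course; after Lancaster stemming the
    # tokens look like 'writ' or 'rhet'. Uncomment to inspect:
    #print lib_texts[0]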
     
     
     
    """
          gensim,      (easy_install gensim)
    """
     
    def train_by_lsi(lib_texts):
        """
              LSI     
        """
        from gensim import corpora, models, similarities
     
        # enable logging (optional) to watch gensim's training progress
        #import logging
        #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
     
        dictionary = corpora.Dictionary(lib_texts)
        corpus = [dictionary.doc2bow(text) for text in lib_texts]     # doc2bow(): convert a list of words into a bag of words, i.e. (word_id, word_frequency) 2-tuples
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
     
        # train an LSI model with num_topics=10 (the topic count is a rough guess)
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
        index = similarities.MatrixSimilarity(lsi[corpus])     # index is a gensim.similarities.docsim.MatrixSimilarity instance
        
        return (index, dictionary, lsi)
     
        
    # build the library -- with real data this step is slow, so the trained model is worth persisting for reuse
    (index,dictionary,lsi) = train_by_lsi(lib_texts)
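    # optional: persist the trained pieces so later runs can skip retraining;
    # the file names are arbitrary examples, but save()/load() are standard
    # gensim API on Dictionary, LsiModel and MatrixSimilarity
    #dictionary.save('courses.dict')
    #lsi.save('courses.lsi')
    #index.save('courses.index')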
        
        
    # the target to query with enters the stage
    target_courses = [u'机器学习']   # "machine learning"
    target_text = pre_process_cn(target_courses, low_freq_filter=False)
     
     
    """
              
    """
     
    # pick the query item
    ml_course = target_text[0]
     
    # convert to bag-of-words
    ml_bow = dictionary.doc2bow(ml_course)
     
    # compute, under the trained lsi model, the similarity of every library item to the target
    ml_lsi = lsi[ml_bow]     # ml_lsi has the form (topic_id, topic_value)
    sims = index[ml_lsi]     # sims is the final result; index[xxx] invokes the built-in __getitem__() to compute ml_lsi against the library
     
    # sort, for readable output
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
     
    # inspect the results
    print sort_sims[0:10]   # top 10 most similar; the first one is the query itself
    print courses_name[sort_sims[1][0]]   # the most similar course
    print courses_name[sort_sims[2][0]]   # the second most similar course
    print courses_name[sort_sims[3][0]]   # the third most similar course
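
If the save() calls sketched above were run, a later session can reload everything and answer new queries without retraining. A minimal sketch, assuming those example file names; a toy token list stands in for a query that would normally go through pre_process_cn:

    #!/usr/bin/env python
    #-*-coding=gbk-*-
    from gensim import corpora, models, similarities

    # reload the persisted pieces (names match the save sketch above)
    dictionary = corpora.Dictionary.load('courses.dict')
    lsi = models.LsiModel.load('courses.lsi')
    index = similarities.MatrixSimilarity.load('courses.index')

    # a new query only needs the same preprocessing plus doc2bow();
    # tokens the dictionary has never seen are silently dropped
    query_bow = dictionary.doc2bow([u'game', u'play'])
    sims = index[lsi[query_bow]]
    print sorted(enumerate(sims), key=lambda item: -item[1])[0:10]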
