gensim 토픽 모델을 이용하여 비슷한 Coursera 과정을 찾습니다.

레퍼런스: http://www.52nlp.cn/ 「두 문서의 유사도를 어떻게 계산합니까? (3)」
#encoding=utf-8
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# Load the corpus: every line of 'coursera_corpus' is treated as one course
# record, and the text before the first tab is used as the course name.
# The file is opened in a `with` block so the handle is closed right after
# reading (the former `file(...)` call left it open).
with open('coursera_corpus') as corpus_file:
    courses = [line.strip() for line in corpus_file]
courses_name = [course.split('\t')[0] for course in courses]

# Tokenise every course line with NLTK and lower-case each token.
# Lines are byte strings here, so they are decoded from UTF-8 first.
texts_tokenized = []
for raw_line in courses:
    line_tokens = word_tokenize(raw_line.decode('utf-8'))
    texts_tokenized.append([token.lower() for token in line_tokens])

# Remove English stop words, using the stop-word list provided by NLTK.
english_stopwords = stopwords.words('english')
# Membership is tested once per token, so look words up in a set instead of
# scanning the list; the list itself stays available under its original name.
_stopword_lookup = frozenset(english_stopwords)
texts_filtered_stopwords = [
    [word for word in document if word not in _stopword_lookup]
    for document in texts_tokenized
]

# Drop tokens that are nothing but one of the listed punctuation marks.
english_punctuations = [
    ',', '.', ':', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
]
texts_filted = []
for document in texts_filtered_stopwords:
    kept_words = [word for word in document
                  if not (word in english_punctuations)]
    texts_filted.append(kept_words)

# Reduce every remaining word to its stem with the Lancaster stemmer.
st = LancasterStemmer()
texts_stemmed = []
for document in texts_filted:
    texts_stemmed.append(list(map(st.stem, document)))

# Remove low-frequency tokens: anything that occurs only once in the whole
# corpus is dropped before the dictionary is built.
# `defaultdict` stays imported in case other code in this file relies on it.
from collections import Counter, defaultdict

# Corpus-wide occurrence count per stemmed token. Counter replaces the
# hand-written counting loop and still supports `frequency[token]` lookups.
frequency = Counter(token for text in texts_stemmed for token in text)

texts = [[token for token in text if frequency[token] > 1]
         for text in texts_stemmed]

from gensim import corpora, models, similarities
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build the bag-of-words corpus, weight it with TF-IDF and train a 10-topic
# LSI model on the TF-IDF vectors.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
# The LSI model was trained on TF-IDF vectors, so the documents that go into
# the similarity index must pass through the same bow -> tfidf -> lsi chain.
# Feeding raw counts (`lsi[corpus]`) would index vectors from a different
# space than the one the model was fitted on.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

# Position of the query course in `courses_name` / `texts`, and how many of
# the best matches to report.
query_index = 174
top_n = 10

# Single-argument print calls produce the same output on Python 2 and also
# parse on Python 3.
print("the query course is: %s" % courses_name[query_index])
ml_course = texts[query_index]
ml_bow = dictionary.doc2bow(ml_course)
# The query goes through the same bow -> tfidf -> lsi chain as the index.
ml_lsi = lsi[tfidf[ml_bow]]
sims = index[ml_lsi]
# Rank all courses by descending similarity. The query course is itself part
# of the index, so it is included in the ranking.
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
courses_nameTop = [tup[0] for tup in sort_sims[0:top_n]]
courses_sim = [courses_name[num] for num in courses_nameTop]

print("the similarity courses are:")
for doc in courses_sim:
    print(doc)

좋은 웹페이지 즐겨찾기