호텔 리뷰 전처리 프로그램

4173 단어 텍스트 처리
def corpus_content(corpus_path, seg_path):
    """Merge a labelled corpus directory into one labelled text file.

    Every entry of ``corpus_path`` is treated as a class directory. Each file
    inside a class directory is read with ``readfile``, stripped of spaces and
    newline characters, and written to ``seg_path`` as a single line of the
    form ``"<label> <content>"``.

    Args:
        corpus_path: Directory whose entries are the class directories
            (e.g. ``neg`` and one or more others). A trailing path separator
            is accepted but not required.
        seg_path: Path of the output file. It is created or truncated and
            written as UTF-8.

    Labels:
        ``-1`` for the directory named ``"neg"``, ``1`` for every other
        directory.
    """
    catelist = os.listdir(corpus_path)  # class directory names under corpus_path
    print("catelist", catelist)
    with open(seg_path, 'w', encoding="UTF-8") as fw:
        for mydir in catelist:
            print(mydir)
            label = -1 if mydir == "neg" else 1

            # os.path.join keeps the path valid whether or not corpus_path
            # ends with a separator (plain concatenation did not).
            class_path = os.path.join(corpus_path, mydir)

            for file_name in os.listdir(class_path):
                filedir = os.path.join(class_path, file_name)
                # readfile is defined elsewhere in this file; presumably it
                # returns the file's text as str -- confirm against its
                # definition.
                raw = readfile(filedir)
                # Collapse each document onto one line so the output stays
                # strictly one sample per line.
                content = raw.replace(' ', '').replace('\n', '')
                label_content = str(label) + " " + content + "\n"
                fw.write(label_content)

좋은 웹페이지 즐겨찾기