영어로만 이루어진 순수 텍스트 파일에서 각 단어의 출현 횟수를 집계한다

7849 단어
제1판: 효율이 낮음
path = 'test.txt'


def count_words_charwise(text):
    """Count case-insensitive word frequencies by scanning *text* char by char.

    A word is a run of alphanumeric characters that is terminated by a
    whitespace character or by the end of the text. Characters that are
    neither alphanumeric nor whitespace (punctuation, symbols) are skipped
    and do NOT end the current word, so "don't" is counted as "dont".

    Args:
        text: The full text to analyse.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences,
        in order of first completion.
    """
    words_dict = {}
    word = []  # characters of the word currently being assembled

    def flush():
        # Record the pending word (if any) and start a new one.
        if word:
            key = ''.join(word).lower()
            words_dict[key] = words_dict.get(key, 0) + 1
            word.clear()

    for letter in text:
        if letter.isalnum():
            word.append(letter)
        elif letter.isspace():  # space, newline, \t, ...
            flush()

    # The text may end without trailing whitespace: count the last word too.
    flush()
    return words_dict


if __name__ == '__main__':
    # newline='' disables newline translation; '\r' and '\n' are both
    # whitespace, so they still act as word separators.
    with open(path, encoding='utf-8', newline='') as f:
        words_dict = count_words_charwise(f.read())

    for k, v in words_dict.items():
        print(k, v)

제2판:
단점: 파일 전체를 한 번에 메모리로 읽어 들이기 때문에 큰 파일에서는 성능이 좋지 않다
import re

path = 'test.txt'


def count_words_regex(data):
    """Count case-insensitive word frequencies in *data* using a regex.

    Words are the matches of ``\\w+`` (letters, digits and underscore);
    each match is lower-cased before being counted.

    Args:
        data: The full text to analyse.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences,
        in order of first appearance.
    """
    word_reg = re.compile(r'\w+')
    words_dict = {}
    # Single pass over the matches: calling list.count() once per unique
    # word would rescan the whole word list every time.
    for word in word_reg.findall(data):
        word = word.lower()
        words_dict[word] = words_dict.get(word, 0) + 1
    return words_dict


if __name__ == '__main__':
    # Reads the whole file into memory at once.
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()

    words_dict = count_words_regex(data)
    for k, v in words_dict.items():
        print(k, v)

제3판:
import re

path = 'test.txt'


def count_words_linewise(lines):
    """Count case-insensitive word frequencies, consuming input line by line.

    Words are the matches of ``\\w+`` in each line, lower-cased before
    counting, so "The", "the" and "the," all count as the same word.
    Counts are accumulated as lines are read, so only the result dict
    (not every word of the input) is held in memory.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences,
        in order of first appearance.
    """
    word_reg = re.compile(r'\w+')
    words_dict = {}
    for line in lines:
        for word in word_reg.findall(line):
            word = word.lower()
            words_dict[word] = words_dict.get(word, 0) + 1
    return words_dict


if __name__ == '__main__':
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating the file object yields one line at a time.
        words_dict = count_words_linewise(f)

    for k, v in words_dict.items():
        print(k, v)

제4판: Counter 통계 사용
import collections
import re

path = 'test.txt'


def count_words_counter(lines):
    """Count case-insensitive word frequencies with ``collections.Counter``.

    Words are the matches of ``\\w+`` in each line, lower-cased before
    counting. The counter is updated one line at a time, so the full list
    of words is never materialised.

    Args:
        lines: Any iterable of strings, e.g. an open text file.

    Returns:
        A plain dict mapping each lower-cased word to its number of
        occurrences, in order of first appearance.
    """
    word_reg = re.compile(r'\w+')
    counts = collections.Counter()
    for line in lines:
        counts.update(word.lower() for word in word_reg.findall(line))
    return dict(counts)


if __name__ == '__main__':
    with open(path, 'r', encoding='utf-8') as f:
        words_dict = count_words_counter(f)

    for k, v in words_dict.items():
        print(k, v)

 
출처: https://www.cnblogs.com/hupeng1234/p/6680491.html

좋은 웹페이지 즐겨찾기