import jieba
from collections import Counter
# Print the 10 most frequent tokens in "Dream of the Red Chamber".
# NOTE(review): indentation was lost in the original paste; the body of
# the `with` block is restored here.
with open('hongloumeng.txt', encoding='utf-8') as fp:
    text = fp.read()

# jieba.cut returns a lazy generator of segmented Chinese tokens,
# which Counter consumes directly.
seg_list = jieba.cut(text)
words = Counter(seg_list)
print(words.most_common(10))
import os
import re
from collections import Counter
def count_words(filename):
    """Count word frequencies in a UTF-8 text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        collections.Counter mapping each lowercased word to its
        number of occurrences.
    """
    with open(filename, encoding='utf-8') as fp:
        text = fp.read()
    # \b\w+\b extracts maximal runs of word characters; lower()
    # folds case so "Word" and "word" count together.
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)
if __name__ == '__main__':
    # Report the ten most common words of every file in this directory.
    path = r'E:\python\documents'
    for name in os.listdir(path):
        # Join to get a full path; os.listdir yields bare names only.
        filepath = os.path.join(path, name)
        print(count_words(filepath).most_common(10))
七、Python英文词频统计代码
下面是一个使用Python实现英文文本词频统计的示例:
def count_words(filename):
    """Count English word frequencies in a text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        collections.Counter mapping each lowercased word to its
        number of occurrences.
    """
    # Explicit encoding added: the original relied on the platform
    # default, unlike the sibling count_words which opens as UTF-8.
    with open(filename, encoding='utf-8') as fp:
        text = fp.read()
    # \b\w+\b extracts maximal runs of word characters, case-folded.
    words = re.findall(r'\b\w+\b', text.lower())
    return Counter(words)
if __name__ == '__main__':
    # Show the ten most frequent words of the sample English file.
    print(count_words('english_text.txt').most_common(10))
import jieba
from collections import Counter
def count_words(filename, stop_words_file='stop_words.txt'):
    """Count word frequencies of a Chinese text, excluding stop words.

    Args:
        filename: Path of the UTF-8 text file to analyze.
        stop_words_file: Path of a UTF-8 file with one stop word per
            line (default ``'stop_words.txt'``; previously hard-coded —
            parameterized here, backward-compatible).

    Returns:
        collections.Counter mapping each kept token to its count.
    """
    with open(filename, encoding='utf-8') as fp:
        text = fp.read()

    # Segment the text; jieba.cut yields tokens lazily.
    seg_list = jieba.cut(text)

    # Load stop words once into a set for O(1) membership tests.
    stop_words = set()
    with open(stop_words_file, encoding='utf-8') as fp:
        for line in fp:
            stop_words.add(line.strip())

    # Keep only tokens that are not stop words, then count them.
    words = [w for w in seg_list if w not in stop_words]
    return Counter(words)
if __name__ == '__main__':
    words_count = count_words('text.txt')
    # Counter.most_common() already returns (word, count) pairs sorted
    # by descending count — no need for a manual sorted(...) call.
    for w, c in words_count.most_common():
        print(w, c)