原碼
def getText():
    """Read ``hmlt.txt`` and return its text normalised for word counting.

    The text is lower-cased and every character of the punctuation set
    below is replaced by a single space, so the result can be split on
    whitespace.

    Returns:
        str: the normalised contents of ``hmlt.txt``.
    """
    punctuation = '`!@#~$%^&*()_+-=*/{}[];,./?<>'
    # The context manager closes the file even if read() raises.
    with open("hmlt.txt", "r") as src:
        txt = src.read().lower()
    # One translate() pass replaces the chain of replace() calls.
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))
# Count how often each whitespace-separated word occurs in the text.
hmltTxt = getText()
words = hmltTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing (rather than indexing over range(100)) avoids an IndexError when
# the text contains fewer than 100 distinct words.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
帶解析
def getText():
    """Load ``hmlt.txt`` and prepare its text for word counting.

    Returns:
        str: the file contents, lower-cased, with each character of the
        punctuation set below replaced by a space.
    """
    special_chars = '`!@#~$%^&*()_+-=*/{}[];,./?<>'
    # Open the file; the with-block guarantees it is closed afterwards.
    with open("hmlt.txt", "r") as src:
        txt = src.read()
    # Lower-case everything so that letter case does not split the counts.
    txt = txt.lower()
    # Replace every special character with a space, i.e. remove it.
    to_space = str.maketrans(special_chars, " " * len(special_chars))
    return txt.translate(to_space)
# Read and normalise the file.
hmltTxt = getText()
# The words are now separated only by whitespace, so split() turns the text
# into a list of words.
words = hmltTxt.split()
# Dictionary mapping each word to its number of occurrences.
counts = {}
for word in words:
    # get() returns the current count, or 0 for a word not seen before;
    # either way the count is then increased by one.
    counts[word] = counts.get(word, 0) + 1
# counts.items() yields iterable (key, value) tuples; list() turns them into
# a list so that they can be sorted.
items = list(counts.items())
# list.sort() sorts in place, i.e. the list itself is modified.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing over range(100)) avoids an IndexError when
# the text contains fewer than 100 distinct words.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
# Judging from the output, most high-frequency words are articles, pronouns, conjunctions and the like, which do not convey the meaning of the text.
# As a further step, a set can be used to build an exclusion vocabulary (excludes) whose entries are left out of the output.
excludes={"the","and","of","you","a","with","but","as","be","in","or","are"}
def getText():
    """Return the contents of ``hmlt.txt`` ready to be split into words.

    The text is converted to lower case and each character of the
    punctuation set below is turned into a space.

    Returns:
        str: the normalised text.
    """
    symbols = '`!@#~$%^&*()_+-=*/{}[];,./?<>'
    # Using a with-block so the file handle is released deterministically.
    with open("hmlt.txt", "r") as src:
        lowered = src.read().lower()
    # Map every symbol to a space in a single pass.
    return lowered.translate(str.maketrans(symbols, " " * len(symbols)))
# Count how often each whitespace-separated word occurs in the text.
hmltTxt = getText()
words = hmltTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1

# Drop the excluded words. pop() with a default does not fail when an
# excluded word never occurs in the text, whereas `del` raises KeyError.
for word in excludes:
    counts.pop(word, None)

# Order the remaining (word, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing avoids an IndexError when fewer than 10 distinct words remain.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
/**************中文文本********************/
import jieba
# Read the whole text; the with-block closes the file once it has been read.
with open("threekingdoms.txt", "r", encoding='utf-8') as src:
    txt = src.read()
# Segment the text into a list of words with jieba.
words = jieba.lcut(txt)

counts = {}
for word in words:
    # Skip segmentation results that consist of a single character.
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1

# Order the (word, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing avoids an IndexError when there are fewer than 15 distinct words.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
# In the output both "玄德" and "玄德曰" appear; they should refer to the same person, but jieba splits them into two separate words, so such cases need to be merged.
excludes={"將軍","卻說","二人","不可","荊州","不能","如此"}
import jieba
# Read the whole text; the with-block closes the file once it has been read.
with open("threekingdoms.txt", "r", encoding='utf-8') as src:
    txt = src.read()
# Segment the text into a list of words with jieba.
words = jieba.lcut(txt)

# Alternative names (key) that are counted under one merged name (value).
aliases = {
    "諸葛亮": "孔明",
    "孔明曰": "孔明",
    "關公": "關羽",
    "雲長": "關羽",
    "玄德": "劉備",
    "玄德曰": "劉備",
    "孟德": "曹操",
    "丞相": "曹操",
}

counts = {}
for word in words:
    # Skip segmentation results that consist of a single character.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    # Count under the merged name. Incrementing counts[word] here would
    # leave rword unused and the aliases would never be merged.
    counts[rword] = counts.get(rword, 0) + 1

# Drop the excluded words. pop() with a default does not fail when an
# excluded word never occurs in the text, whereas `del` raises KeyError.
for word in excludes:
    counts.pop(word, None)

# Order the remaining (word, count) pairs from most to least frequent.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing avoids an IndexError when fewer than 15 distinct words remain.
for word, count in items[:15]:
    print("{0:<10}{1:>5}".format(word, count))
其中sort()的使用
1.方法sort用於對列表就地排序。就地排序意味着對原來的列表進行修改,使其元素按順序排列,而不是返回排序後的列表的副本
# list.sort() reorders the list itself instead of returning a sorted copy.
x = [4, 6, 2, 1, 7, 9]
x.sort()
# print() is called as a function; the statement form `print x` is Python 2
# syntax and a SyntaxError on Python 3.
print(x)
# [1, 2, 4, 6, 7, 9]
如果需要一個排序好的副本,同時保持原有列表不變,怎麼實現呢
①
>>> x = [4, 6, 2, 1, 7, 9]
>>> y=x[ : ]
>>> y.sort()
>>> print(y)
[1, 2, 4, 6, 7, 9]
>>> print(x)
[4, 6, 2, 1, 7, 9]
注意:y = x[:] 通過分片操作將列表x的元素全部拷貝給y,如果簡單的把x賦值給y:y = x,y和x還是指向同一個列表,並沒有產生新的副本。
②
>>> x = [4, 6, 2, 1, 7, 9]
>>> y=x.copy()
>>> y.sort()
>>> print(y)
[1, 2, 4, 6, 7, 9]
>>> print(x)
[4, 6, 2, 1, 7, 9]
先產生一個副本賦予y,然後再對y排序。
高級排序
reverse-->只需將其指定爲一個布爾值(True或False),以指出是否要按相反的順序對列表進行排序;例如上文的 items.sort(key=lambda x:x[1],reverse=True) 會按出現次數由高到低排序