Python 英文文本字母跳轉概率統計

# 英文文本字母跳轉概率統計.py
# 文本示例:hamlet.txt
import csv
def getText():
    """Read ``hamlet.txt`` and return its text lower-cased with punctuation blanked.

    The file is opened relative to the current working directory with the
    platform's default text encoding. Each character in the punctuation set
    below is replaced by a single space (not removed), so words stay
    separated and a later ``str.split()`` yields clean tokens.

    Returns:
        str: the normalised file contents.

    Raises:
        OSError: if ``hamlet.txt`` cannot be opened or read.
    """
    # The backslash is written as "\\" so the literal no longer contains the
    # invalid "\/" escape sequence; the set of characters itself is unchanged.
    punctuation = '!"#$%()&*+-,:;<>={}[]\\/_.~`“”\'‘’?'
    # The context manager closes the file handle as soon as the read finishes.
    with open("hamlet.txt", "r") as source:
        txt = source.read().lower()
    # A single translate pass maps every punctuation character to a space.
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))
# Load the normalised text and split it into whitespace-separated words.
hamletTxt = getText()
words = hamletTxt.split()
# Placeholder left from a disabled word-frequency tally; nothing below reads it.
counts = {}
# counts_ch maps "x->y" to how many times character y directly follows
# character x inside a word; PR_counts maps the same keys to the share of
# all observed transitions.
counts_ch = {}
PR_counts = {}
for token in words:
    # Pair each character with its successor; one-character words yield nothing.
    for left, right in zip(token, token[1:]):
        transition = "{}->{}".format(left, right)
        counts_ch[transition] = counts_ch.get(transition, 0) + 1
# Total number of transitions seen, used as the normalising denominator.
amount = sum(counts_ch.values())
for transition, occurrences in counts_ch.items():
    PR_counts[transition] = occurrences / amount
# print(amount)
# for i in range (10):
#     word,count = items[i]
#     print("{0:<10}{1:>5}".format(word,count))
# print(counts_ch)
# Order the transitions from most to least probable (ties keep dict order)
# and print them as a 1-based ranking.
items_ch = sorted(PR_counts.items(), key=lambda pair: pair[1], reverse=True)
Rank = 0
for Rank, (transition, probability) in enumerate(items_ch, start=1):
    print("{0:<3}:{1:<10}{2:>5}".format(Rank, transition, probability))
# print(PR_counts)
# 26x26 grid of transition probabilities: the outer index is the first letter,
# the inner index the letter that follows it (both 'a'..'z', code points
# 97..122). Transitions never observed in the text are recorded as 0.
matrix_ch = [
    [
        PR_counts.get("{}->{}".format(chr(97 + src), chr(97 + dst)), 0)
        for dst in range(26)
    ]
    for src in range(26)
]
# print(matrix_ch)
# Labelled copy of matrix_ch for export: the first row is a blank corner cell
# followed by 'a'..'z'; each later row starts with its own letter and then
# carries that letter's 26 probabilities.
Answer_matrix = [[" "] + [chr(code) for code in range(97, 123)]]
Answer_matrix.extend(
    [chr(97 + src)] + [matrix_ch[src][dst] for dst in range(26)]
    for src in range(26)
)
# Write the labelled table in Answer_matrix to CSV. The context manager makes
# sure the output is flushed and the file handle closed when writing is done;
# newline='' leaves line-ending handling to the csv module.
with open("字母跳轉概率輸出矩陣.csv",'w',newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(Answer_matrix)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章