# python基礎 class6(基本統計值計算、文本詞頻統計)

 

# 不定長度輸入
def getNum():
    """Prompt repeatedly for numbers until an empty line is entered.

    Each non-empty line is passed to ``eval`` and the result is collected.

    Returns:
        A list of the evaluated inputs, in the order they were typed.

    Raises:
        Whatever ``eval`` raises for a line that is not a valid expression.
    """
    # NOTE(review): eval() executes any expression the user types, with access
    # to this function's locals (which is why the local names are unchanged).
    # Do not expose this prompt to untrusted input; consider a stricter parser.
    nums = []
    while True:
        iNumStr = input('輸入數字:')
        if iNumStr == '':  # an empty line ends the input
            break
        nums.append(eval(iNumStr))
    return nums


# 平均值
def mean(n):
    """Return the arithmetic mean of the real numbers in ``n`` as a float.

    Args:
        n: A non-empty sized collection of real numbers.

    Returns:
        The sum of the values divided by ``len(n)``.

    Raises:
        ZeroDivisionError: If ``n`` is empty.
    """
    import math  # local import keeps this function self-contained

    # fsum keeps exact partial sums, so the total does not pick up the
    # rounding error that a running ``+=`` accumulates (e.g. ten 0.1 values).
    return math.fsum(n) / len(n)


# 計算樣本標準差
def dev(n, mean_n):
    """Return the sample standard deviation of ``n`` around ``mean_n``.

    Args:
        n: Sized collection of numbers.
        mean_n: The mean to measure deviations from.

    Returns:
        The square root of the summed squared deviations divided by
        ``len(n) - 1``.

    Raises:
        ZeroDivisionError: If ``n`` has exactly one element.
    """
    squared_total = 0.0
    for value in n:
        squared_total += (value - mean_n) ** 2
    # Dividing by (count - 1) gives the sample, not population, variance.
    variance = squared_total / (len(n) - 1)
    return variance ** 0.5


# 計算中位數
def median(n):
    """Return the median of the values in ``n``.

    The input is sorted into a new list, so the caller's sequence is left
    untouched.

    Args:
        n: Non-empty iterable of mutually comparable numbers.

    Returns:
        The middle value for an odd count, or the average of the two middle
        values for an even count.

    Raises:
        IndexError: If ``n`` is empty.
    """
    ordered = sorted(n)
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    # Even count: average the two values straddling the centre.
    return (ordered[mid] + ordered[mid - 1]) / 2


def main():
    """Read numbers from the user and print mean, std deviation and median.

    The three statistics are printed on one tab-separated line, each
    formatted to two decimal places.
    """
    values = getNum()
    average = mean(values)
    deviation = dev(values, average)
    middle = median(values)
    report = "平均數:{:.2f}\t標準差:{:.2f}\t中位數:{:.2f}"
    print(report.format(average, deviation, middle))

# Run the statistics demo and report any failure as an input error.
# ``except Exception`` is used instead of a bare ``except:`` so that
# KeyboardInterrupt and SystemExit still propagate (Ctrl-C at the prompt
# should stop the script, not be reported as bad input).
try:
    main()
except Exception:
    print("輸入錯誤")
# 文本詞頻統計

# 英文
# def getText():
#     txt = open('hamlet.txt', 'r').read()
#     txt = txt.lower()
#     for char in '!@#$%^&*()_+-={}[]|\\;:"\'<,>.?/`~':
#         txt = txt.replace(char, ' ')
#     return txt
#
#
# hamlettxt = getText()
# words = hamlettxt.split()
# counts = {}
# for word in words:
#     counts[word] = counts.get(word, 0) + 1
# items = list(counts.items())
# items.sort(key=lambda x: x[1], reverse=True)
# for i in range(10):
#     word, count = items[i]
#     # print(type(items[i]))
#     print('{:<10}{:>5}'.format(word, count))

# 中文
import jieba


def getText():
    """Read and return the entire contents of ``threekingdoms.txt``.

    The file is looked up relative to the current working directory and
    decoded as UTF-8.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager closes the handle even if read() fails; the
    # previous open(...).read() never closed it explicitly.
    with open('threekingdoms.txt', 'r', encoding='utf-8') as source:
        return source.read()


# Tokens that are removed from the frequency table after counting.
# NOTE(review): presumably frequent words that are not character names
# (titles, places, common phrases) — confirm against the intended output.
excludes = {'將軍', '卻說', '荊州', '二人', '不可', '不能', '如此', '商議',
            '如何', '主公', '軍士', '左右', '軍馬', '引兵', '次日', '大喜',
            '天下', '東吳', '於是', '今日', '不敢', '魏兵', '陛下', '一人',
            '都督', '人馬', '不知', '漢中', '只見', '衆將', '蜀兵', '上馬',
            '大叫', '太守', '此人', '夫人', '先生', '後人', '背後', '城中',
            '一面', '何不', '大軍', '忽報', '百姓', '何故', '然後', '先鋒',
            '天子', '不如', '趕來', '原來', '令人', '江東', '下馬', '喊聲',
            '正是', '徐州', '忽然', '因此', '成都', '不見', '未知', '大敗',
            '大事', '之後', '一軍', '引軍', '起兵', '軍中', '接應', '進兵',
            '大驚', '可以', '以爲', '大怒', '不得', '心中'}
# Alternative tokens that are folded into one canonical name so that their
# occurrences are counted together.
_ALIASES = {
    '孔明': '諸葛亮', '孔明曰': '諸葛亮',
    '關公': '關羽', '雲長': '關羽',
    '丞相': '曹操', '孟德': '曹操',
    '玄德': '劉備', '玄德曰': '劉備', '先主': '劉備',
    '後主': '劉禪',
    '呂奉先': '呂布', '奉先': '呂布',
    '張翼德': '張飛', '翼德': '張飛',
}

# Segment the text and tally every token longer than one character.
words = jieba.lcut(getText())
counts = {}
for word in words:
    if len(word) == 1:
        continue
    rword = _ALIASES.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default tolerates excluded words that never occurred in the
# text; ``del counts[word]`` raised KeyError in that case.
for word in excludes:
    counts.pop(word, None)

# Print the 20 most frequent entries. Slicing (rather than indexing
# range(20)) avoids an IndexError when fewer than 20 entries remain.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for word, count in items[:20]:
    print('{:<8}{:>5}'.format(word, count))

 

# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章