python pdfplumber讀取pdf統計tfidf

import pdfplumber as plmber
import os
import jieba
import jieba.analyse


def TextByWords(page):
    """Return the page's words concatenated into a single string.

    Each item yielded by ``page.extract_words()`` is expected to be a mapping
    with a ``'text'`` entry. Leading and trailing newlines are stripped from
    every word, and the words are joined with no separator between them.

    Args:
        page: An object exposing ``extract_words()``.

    Returns:
        The concatenated word text; an empty string if there are no words.
    """
    # Join once instead of growing the string with `+` inside a loop.
    return "".join(word['text'].strip('\n') for word in page.extract_words())

def TextByText(page):
    """Return the text of ``page`` as produced by ``page.extract_text()``.

    Args:
        page: An object exposing ``extract_text()``.

    Returns:
        The extracted text. A falsy result (e.g. ``None`` for a page with no
        extractable text) is normalized to an empty string so callers can
        always treat the return value as ``str``.
    """
    return page.extract_text() or ""


if __name__ == '__main__':
    # Script entry point: take one PDF from ./reports/, compute TF-IDF keyword
    # weights for each of its first pages, and write one ranking file per page
    # into ./rankings/.

    base = "./reports/"
    out_dir = "./rankings/"
    max_pages = 10

    # os.listdir order is arbitrary and may include non-PDF files; keep only
    # PDFs and sort so that the chosen report is deterministic.
    f_paths = sorted(
        os.path.join(base, name)
        for name in os.listdir(base)
        if name.lower().endswith('.pdf')
    )
    if not f_paths:
        raise SystemExit("no PDF files found in {}".format(base))

    # The output directory may not exist on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    # Context manager guarantees the PDF handle is closed, even on error.
    with plmber.open(f_paths[0]) as trgt_f:
        for idx, page in enumerate(trgt_f.pages[:max_pages]):
            # Guard against a None result for pages without extractable text.
            content = TextByText(page) or ""
            print(content)

            # allowPOS restricts keywords to the given part-of-speech tags,
            # e.g. allowPOS=('n','nr','ns'); details: https://github.com/fxsjy/jieba
            keywords = jieba.analyse.extract_tags(
                content, topK=50, withWeight=True, allowPOS=('n', 'nr', 'vn')
            )

            # jieba's keywords can contain useless entries such as bare numbers
            # like "2018"; drop those. `ranking` maps keyword -> TF-IDF weight.
            # (TF-IDF turns each word into a number, so a sentence can be
            # represented as a vector.)
            ranking = {
                word: weight
                for word, weight in keywords
                if not word.isdigit()
            }

            trgt_p = os.path.join(out_dir, 'page{}.rnk'.format(idx + 1))

            with open(trgt_p, 'w', encoding='utf-8') as f:
                f.writelines(
                    "{} : {}\n".format(word, weight)
                    for word, weight in ranking.items()
                )
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章