import pdfplumber as plmber
import os
import jieba
import jieba.analyse
def TextByWords(page):
    """Return the text of every word on ``page`` joined into one string.

    Args:
        page: Object exposing ``extract_words()``, which must yield mappings
            that each carry a ``'text'`` entry (presumably a pdfplumber page;
            verify against callers).

    Returns:
        str: The words' texts concatenated in extraction order with no
        separator. Newlines at either end of each word are stripped first.
        An empty string is returned when no words are extracted.
    """
    # A single join avoids rebuilding the string once per word.
    return "".join(word['text'].strip('\n') for word in page.extract_words())
def TextByText(page):
    """Return the text of ``page`` as produced by its ``extract_text()``.

    Args:
        page: Object exposing ``extract_text()`` (presumably a pdfplumber
            page; verify against callers).

    Returns:
        str: The extracted text, or an empty string when ``extract_text()``
        yields ``None`` or an empty result, so callers always get a string.
    """
    # Normalise a missing result to "" so downstream text processing does not
    # receive None.
    return page.extract_text() or ""
if __name__ == '__main__':
    # Script entry point: take the first file listed under ./reports/, and for
    # each of its first ten pages print the page text and write the page's
    # keyword weights to ./rankings/page<N>.rnk (N is 1-based).
    base = "./reports/"
    f_paths = [base + path for path in os.listdir(base)]
    if not f_paths:
        # Fail with a clear message instead of an IndexError on f_paths[0].
        raise SystemExit("no input files found in {}".format(base))

    # The per-page output files are opened for writing below, so the target
    # directory has to exist first.
    os.makedirs('./rankings', exist_ok=True)

    # The context manager closes the PDF handle even if a page fails.
    with plmber.open(f_paths[0]) as trgt_f:
        for idx, page in enumerate(trgt_f.pages[:10]):
            # Guard against a page that yields no text at all.
            content = TextByText(page) or ""
            print(content)
            # allowPOS restricts which parts of speech may become keywords,
            # e.g. allowPOS=('n','nr','ns'); details: https://github.com/fxsjy/jieba
            keywords = jieba.analyse.extract_tags(
                content, topK=50, withWeight=True, allowPOS=('n', 'nr', 'vn'))
            # jieba also returns useless keywords such as plain numbers
            # (e.g. 2018), so purely numeric entries are dropped.
            # ranking maps each remaining keyword to its TF-IDF weight; the
            # weights turn the words into numbers so that a sentence can be
            # represented as a vector.
            ranking = {
                word: weight
                for word, weight in keywords
                if not word.isdigit()
            }
            trgt_p = './rankings/page{}.rnk'.format(idx + 1)
            with open(trgt_p, 'w', encoding='utf-8') as f:
                f.writelines(
                    "{} : {}\n".format(word, weight)
                    for word, weight in ranking.items()
                )
python pdfplumber讀取pdf統計tfidf
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.