Python讀取PDF、WORD、EXCEL、PPT裏文本

場景

獲取文件中的文本內容(只讀不寫)

PDF

安裝:pip install pdfminer3k

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf

def read_pdf(path_pdf):
    """Extract the text content of a PDF file.

    Args:
        path_pdf: Path of the PDF file; it is opened in binary read mode.

    Returns:
        str: Everything the text converter wrote to the in-memory buffer
        while the document was processed.
    """
    # Both the file and the in-memory output buffer are context-managed so
    # they are released even when parsing fails part-way through.
    with open(path_pdf, 'rb') as pdf, StringIO() as outfp:
        # Resource manager handed to both the converter and the processor
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults)
        laparams = LAParams()
        # Converter that writes the extracted text into ``outfp``
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdf)
            # Read the buffer before it is closed on leaving the ``with``
            return outfp.getvalue()  # <class 'str'>
        finally:
            # Always close the converter, including on a parsing error
            device.close()

if __name__ == '__main__':
    # Demo run: extract the text of a sample PDF and print it to stdout.
    print(read_pdf('P020190716349644060705.pdf'))

WORD

安裝:pip install python-docx

讀取段落

from docx import Document
# Open the Word document
d = Document('a.docx')
# Print the text of each paragraph, in document order
for text in (para.text for para in d.paragraphs):
    print(text)

讀取表格

from docx import Document
# Open the Word document
d = Document('a.docx')
# Flatten table -> row -> cell into a single lazy stream (same visiting
# order as three nested loops) and print the text of each cell.
all_cells = (
    cell
    for table in d.tables
    for row in table.rows
    for cell in row.cells
)
for cell in all_cells:
    print(cell.text)

EXCEL

安裝:pip install pandas openpyxl

from pandas import read_excel

def xlsx2df(fname, sheet_name=0):
    """Load a worksheet of an Excel workbook through pandas.

    Args:
        fname: Workbook to read; forwarded unchanged to ``read_excel``.
        sheet_name: Sheet selector forwarded to ``read_excel``; defaults to 0.

    Returns:
        Whatever ``read_excel`` returns for the given arguments.
    """
    frame = read_excel(fname, sheet_name=sheet_name)
    return frame

PPT

安裝:pip install python-pptx

import pptx
# Open the presentation
p = pptx.Presentation('a.pptx')
# Walk every slide
for slide in p.slides:
    # Walk every shape on the slide
    for shape in slide.shapes:
        # Text: ask the shape whether it carries a text frame rather than
        # testing for the placeholder class, so ordinary text boxes and
        # auto shapes are read too, not only placeholders.
        if shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                print(paragraph.text)
        # Tables: a graphic frame may hold something other than a table
        # (e.g. a chart), so only touch ``shape.table`` when the shape says
        # it has one. ``getattr`` covers shape types without this property.
        if getattr(shape, 'has_table', False):
            for cell in shape.table.iter_cells():
                print(cell.text)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章