場景
獲取文件中的文本內容(只讀不寫)
PDF
安裝:pip install pdfminer3k
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
def read_pdf(path_pdf):
    """Extract the text content of a PDF file.

    The file is opened in binary read mode only; nothing is written to it.

    Args:
        path_pdf: Path of the PDF file to read.

    Returns:
        str: The text collected from the document by ``TextConverter``.
    """
    # Both the PDF handle and the in-memory text buffer are closed by the
    # ``with`` statement, even if parsing raises.
    with open(path_pdf, 'rb') as pdf, StringIO() as outfp:
        # Resource manager passed to both the converter and the processor.
        rsrcmgr = PDFResourceManager()
        # Text converter writing into the buffer, with default LAParams
        # (the PDF parsing parameters).
        device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
            # Read the buffer before it is closed on leaving the ``with``.
            return outfp.getvalue()
        finally:
            # Release the converter whether or not processing succeeded.
            device.close()
if __name__ == '__main__':
    # Demo run: print the text extracted from a sample PDF.
    print(read_pdf('P020190716349644060705.pdf'))
WORD
安裝:pip install python-docx
讀取段落
from docx import Document
# Load the Word document.
document = Document('a.docx')
# Print the text of each paragraph.
for para_text in (para.text for para in document.paragraphs):
    print(para_text)
讀取表格
from docx import Document
# Load the Word document.
document = Document('a.docx')
# Visit every cell of every table, row by row, and print its text.
all_cells = (
    cell
    for table in document.tables
    for row in table.rows
    for cell in row.cells
)
for cell in all_cells:
    print(cell.text)
EXCEL
安裝:pip install pandas openpyxl
from pandas import read_excel
def xlsx2df(fname, sheet_name=0):
    """Read an Excel workbook with pandas ``read_excel``.

    Args:
        fname: Workbook to read; handed to ``read_excel`` unchanged.
        sheet_name: Sheet selector handed to ``read_excel``; defaults to ``0``.

    Returns:
        Whatever ``read_excel`` returns for the given arguments.
    """
    frame = read_excel(fname, sheet_name=sheet_name)
    return frame
PPT
安裝:pip install python-pptx
import pptx
# Open the presentation.
p = pptx.Presentation('a.pptx')
# Walk every shape on every slide.
for slide in p.slides:
    for shape in slide.shapes:
        # Text: ask the shape itself rather than testing for the
        # SlidePlaceholder class, so ordinary text boxes and auto shapes
        # are printed too, not only placeholders.
        if shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                print(paragraph.text)
        # Tables: a GraphicFrame can hold something other than a table
        # (e.g. a chart), so check has_table before touching shape.table.
        if shape.has_table:
            for cell in shape.table.iter_cells():
                print(cell.text)