主要使用python-docx 與pandas
因爲python-docx對表格的解析不夠友好且效率低,故需先將表格轉換成pandas的DataFrame
代碼如下
# coding:utf-8
import os, re
import docx
from docx.document import Document as dc
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor # 設置字體顏色
from docx import Document
from docx.shared import Pt # 設置字體
from docx.oxml.ns import qn # 設置中文字體
import pandas as pd
# Path of the .docx file to annotate. The "xxxx" segments look like a
# placeholder -- replace with a real path before running.
FILE_PATH = r"D:\xxxx\xxxx\xxxx\xxxx.docx"
# Open the document once at module level; the script at the bottom of the
# file iterates over this object and saves it.
obj = docx.Document(FILE_PATH)
def iter_block_items(parent):
    """Yield the paragraphs and tables directly under *parent*, in order.

    ``Document.paragraphs`` and ``Document.tables`` each return only one
    kind of block; this walks the underlying XML children so both kinds
    come back interleaved in the order they appear.

    :param parent: a document object (``docx.document.Document``, imported
        in this file as ``dc``) or a table cell (``_Cell``).
    :return: generator of ``Paragraph`` and ``Table`` objects. Children of
        any other element type are skipped.
    :raises ValueError: if *parent* is neither a document nor a cell. As
        this is a generator, the error surfaces on first iteration, not at
        call time.
    """
    if isinstance(parent, dc):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("[TypeError] Document in insuitable type.")

    for child in container.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
def table2list(table):
    """Convert a table into a list of rows, each a list of cell texts.

    :param table: object exposing ``rows``; every row exposes ``cells`` and
        every cell exposes ``text`` (e.g. a python-docx ``Table``).
    :return: ``list[list[str]]`` with one inner list per row, in row order.
        An object with no rows gives ``[]``.
    """
    return [[cell.text for cell in row.cells] for row in table.rows]
# Keyword to look for (and highlight) inside paragraph runs
word = '段落關鍵字'
# Keyword compared against the first column of each table row
table_text = '表格關鍵字'
def set_run(run, font_size, bold, color, name):
    """Apply size, weight, colour and font name to a run.

    :param run: the run object to format (modified in place)
    :param font_size: font size assigned to ``run.font.size``
    :param bold: value assigned to ``run.bold``
    :param color: value assigned to ``run.font.color.rgb``
    :param name: font name; ``None`` skips the East Asian font step
    :return: None
    """
    run.font.size = font_size
    run.bold = bold
    run.font.color.rgb = color
    run.font.name = name
    if name is not None:
        # Assigning font.name is not enough for CJK text: the w:eastAsia
        # attribute on rFonts has to be set as well for the font to apply.
        # Guarded because with name=None there is nothing to write and
        # rFonts may be absent -- NOTE(review): confirm against python-docx.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), name)
def _mark_keyword_in_run(paragraph, run, font_name=u'楷體'):
    """Split *run* around the module-level ``word`` and colour each hit red."""
    font_size = run.font.size
    bold = run.bold
    color = run.font.color.rgb

    pieces = run.text.split(word)
    # The original run is emptied and kept; its text is re-emitted below.
    run.text = ''

    # Interleave the plain pieces with the keyword: p0, word, p1, word, ..., pN
    segments = []
    for piece in pieces[:-1]:
        segments.append((piece, False))
        segments.append((word, True))
    segments.append((pieces[-1], False))

    anchor = run._element
    for text, is_keyword in segments:
        new_run = paragraph.add_run(text)
        set_run(new_run, font_size, bold, color, font_name)
        if is_keyword:
            new_run.font.color.rgb = RGBColor(255, 0, 0)
        # add_run() appends at the end of the paragraph. Move the new run so
        # it sits right after the run it replaces; otherwise the text is
        # reordered whenever the matching run is not the last one.
        anchor.addnext(new_run._element)
        anchor = new_run._element


def paragraphs_utils(obj, output_path='標註後的文檔.docx'):
    """Highlight every occurrence of ``word`` in the document's paragraphs.

    Each run whose text contains the keyword is split around it. The pieces
    are rewritten as new runs that keep the original size, bold flag and
    colour and use the 楷體 font; the keyword runs are coloured red. A
    keyword that straddles two runs is not detected, because the check is
    made per run.

    :param obj: document object exposing ``paragraphs`` and ``save``
    :param output_path: where the annotated document is written
    :return: None
    """
    for paragraph in obj.paragraphs:
        # NOTE(review): this relies on ``paragraph.runs`` being a snapshot,
        # so runs inserted during the loop are not visited -- confirm.
        for run in paragraph.runs:
            if word not in run.text:
                continue
            _mark_keyword_in_run(paragraph, run)
    obj.save(output_path)
def table_utils(obj, output_path='標註後的表格.docx'):
    """Colour red the data cells of every table row labelled ``table_text``.

    Each table is converted to a pandas DataFrame to look up cell text. The
    first row and first column are skipped. For every other row whose first
    cell equals ``table_text``, the runs in the first paragraph of each
    remaining cell are rewritten: the text is stripped and re-added as a
    new red run in the 楷體 font, keeping the original size and bold flag.
    Only the first paragraph of a cell is processed.

    :param obj: document object exposing ``tables`` and ``save``
    :param output_path: where the annotated document is written
    :return: None
    """
    for table in obj.tables:
        frame = pd.DataFrame(table2list(table))
        row_count, col_count = frame.shape
        # Row 0 is skipped (treated as the header row).
        for row_idx in range(1, row_count):
            if frame.iloc[row_idx, 0] != table_text:
                continue
            # Column 0 holds the keyword itself; only the rest is marked.
            for col_idx in range(1, col_count):
                first_par = table.cell(row_idx, col_idx).paragraphs[0]
                for old_run in first_par.runs:
                    font_size = old_run.font.size
                    bold = old_run.bold
                    color = old_run.font.color.rgb
                    name = u'楷體'
                    data = old_run.text.strip()
                    # Empty the old run, then re-add its text as a new run.
                    # Every run is handled in order, so the appended runs
                    # keep their relative order.
                    old_run.text = ''
                    new_run = first_par.add_run(data)
                    set_run(new_run, font_size, bold, color, name)
                    new_run.font.color.rgb = RGBColor(255, 0, 0)
    obj.save(output_path)
# Single pass over the document body: paragraphs get the paragraph keyword
# highlighted, tables get the matching rows highlighted, then the result is
# saved once.
for block in iter_block_items(obj):
    if isinstance(block, Paragraph):
        for r in block.runs:
            if word not in r.text:
                continue
            print(r.text)
            print(r.style.name)
            font_size = r.font.size
            bold = r.bold
            color = r.font.color.rgb
            name = u'楷體'
            # Split the run text around the keyword, then empty the run.
            rest = r.text.split(word)
            r.text = ''

            # Interleave plain pieces with the keyword: p0, word, p1, ..., pN
            segments = []
            for text in rest[:-1]:
                segments.append((text, False))
                segments.append((word, True))
            segments.append((rest[-1], False))

            anchor = r._element
            for text, is_keyword in segments:
                run = block.add_run(text)
                set_run(run, font_size, bold, color, name)
                if is_keyword:
                    run.font.color.rgb = RGBColor(255, 0, 0)
                # add_run() appends at the end of the paragraph. Move the
                # new run to just after the run it replaces so the text
                # order is kept when that run is not the last one.
                anchor.addnext(run._element)
                anchor = run._element
    else:
        # Convert the table to a DataFrame to look up cell text.
        pd_block = pd.DataFrame(table2list(block))
        row_count, col_count = pd_block.shape
        # Row 0 is skipped (treated as the header row).
        for rows in range(1, row_count):
            if table_text != pd_block.iloc[rows, 0]:
                continue
            # Column 0 holds the keyword itself; mark the other cells.
            for cols in range(1, col_count):
                rs = block.cell(rows, cols).paragraphs[0]
                for r in rs.runs:
                    font_size = r.font.size
                    bold = r.bold
                    color = r.font.color.rgb
                    name = u'楷體'
                    data = r.text.strip()
                    # Empty the old run and re-add its text as a red run.
                    r.text = ''
                    run = rs.add_run(data)
                    set_run(run, font_size, bold, color, name)
                    run.font.color.rgb = RGBColor(255, 0, 0)
obj.save('段落與表格標註後的文檔.docx')