使用python-docx通過關鍵字標註文字的字體、顏色及大小等

主要使用python-docx 與pandas

因爲python-docx對表格的解析不夠友好且效率低,故需先將表格轉換成pandas的DataFrame再處理

代碼如下

# coding:utf-8
import os, re
import docx
from docx.document import Document as dc
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor  # 設置字體顏色
from docx import Document
from docx.shared import Pt  # 設置字體
from docx.oxml.ns import qn  # 設置中文字體
import pandas as pd

# Path of the .docx file to annotate (the value looks like a placeholder —
# replace it with a real path before running).
FILE_PATH = r"D:\xxxx\xxxx\xxxx\xxxx.docx"

# Document loaded from FILE_PATH at import time; the top-level loop at the end
# of this script walks it and saves the annotated copy.
obj = docx.Document(FILE_PATH)


def iter_block_items(parent):
    """Yield the paragraphs and tables found directly under *parent*.

    Children are visited in the order ``iterchildren()`` returns them, so
    paragraphs and tables come out interleaved as they appear in the container.

    :param parent: a python-docx document object or a table cell (``_Cell``)
    :return: generator of ``Paragraph`` and ``Table`` objects bound to *parent*
    :raises ValueError: if *parent* is neither a document nor a table cell
    """
    if isinstance(parent, _Cell):
        container = parent._tc
    elif isinstance(parent, dc):
        container = parent.element.body
    else:
        raise ValueError("[TypeError] Document in insuitable type.")

    for node in container.iterchildren():
        # Any child that is neither a paragraph nor a table element is skipped.
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
        elif isinstance(node, CT_P):
            yield Paragraph(node, parent)


def table2list(table):
    """Return the text of every cell in *table* as a list of row lists.

    :param table: object whose ``rows`` is iterable; each row exposes ``cells``
        and each cell exposes ``text``
    :return: one inner list per row, holding that row's cell texts in order
    """
    return [[cell.text for cell in row.cells] for row in table.rows]

# Keyword searched for inside paragraph runs; matching text is re-written in red.
word = '段落關鍵字'
# Keyword compared against the first-column cell of each table row; the other
# cells of a matching row are re-written in red.
table_text = '表格關鍵字'


def set_run(run, font_size, bold, color, name):
    """
    Apply font formatting to a run object.

    :param run: run object to format
    :param font_size: value assigned to ``run.font.size``
    :param bold: value assigned to ``run.bold``
    :param color: value assigned to ``run.font.color.rgb``
    :param name: font name, written both as the regular font name and as the
        East Asian font attribute
    :return: None
    """
    font = run.font
    font.size = font_size
    run.bold = bold
    font.color.rgb = color
    font.name = name
    # Per the original author's note, assigning font.name alone is not enough:
    # the w:eastAsia attribute on the run's rFonts element must be set as well.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), name)


def paragraphs_utils(obj):
    """Highlight the module-level keyword ``word`` in the paragraphs of *obj*.

    Every run in ``obj.paragraphs`` whose text contains ``word`` is emptied and
    replaced by a sequence of new runs: the surrounding text keeps the original
    size, bold flag and colour, while each keyword occurrence is coloured red.
    All replacement runs get the font name '楷體'. The result is saved to
    '標註後的文檔.docx'.

    The replacement runs are placed directly after the run they replace, so the
    paragraph keeps its text order even when the match is not in the last run.

    Limitation: matching is done per run, so a keyword split across two runs is
    not detected.

    :param obj: document object exposing ``paragraphs`` and ``save``
    :return: None
    """
    for p in obj.paragraphs:
        # p.runs is read once here; runs added below are not revisited.
        for r in p.runs:
            if word not in r.text:
                continue
            font_size = r.font.size
            bold = r.bold
            color = r.font.color.rgb
            name = u'楷體'
            # Text pieces around each keyword occurrence, e.g. ['', 'xxx'] or ['xxx', ''].
            rest = r.text.split(word)
            # Empty the original run; its content is re-created piece by piece.
            r.text = ''
            # add_run() appends at the end of the paragraph, which would move
            # the text behind any later runs. Each new run element is therefore
            # relocated to sit right after the previous piece.
            anchor = r._element
            for text in rest[:-1]:
                run = p.add_run(text=text)
                set_run(run, font_size, bold, color, name)
                anchor.addnext(run._element)
                anchor = run._element
                # The keyword itself: same formatting, then forced to red.
                run = p.add_run(word)
                set_run(run, font_size, bold, color, name)
                run.font.color.rgb = RGBColor(255, 0, 0)
                anchor.addnext(run._element)
                anchor = run._element
            # Trailing text after the last keyword occurrence.
            run = p.add_run(rest[-1])
            set_run(run, font_size, bold, color, name)
            anchor.addnext(run._element)
    obj.save('標註後的文檔.docx')


def table_utils(obj):
    """Turn red the data cells of every table row whose first cell equals ``table_text``.

    Each table in ``obj.tables`` is converted to a pandas DataFrame via
    ``table2list`` so that its shape and first-column values can be read.
    Row 0 and column 0 are never rewritten. For a matching row, every run in
    the first paragraph of each remaining cell is emptied and its stripped text
    is re-added as a new red run with font name '楷體', keeping the original
    size and bold flag. The result is saved to '標註後的表格.docx'.

    :param obj: document object exposing ``tables`` and ``save``
    :return: None
    """
    for table in obj.tables:
        frame = pd.DataFrame(table2list(table))
        n_rows, n_cols = frame.shape
        # Start at 1: the first row is skipped.
        for row_idx in range(1, n_rows):
            if frame.iloc[row_idx, 0] != table_text:
                continue
            # Start at 1: the first column holds the keyword itself.
            for col_idx in range(1, n_cols):
                first_par = table.cell(row_idx, col_idx).paragraphs[0]
                for old_run in first_par.runs:
                    # Capture the formatting before the run is emptied.
                    size = old_run.font.size
                    is_bold = old_run.bold
                    old_color = old_run.font.color.rgb
                    font_name = u'楷體'
                    stripped = old_run.text.strip()
                    old_run.text = ''
                    # Re-create the text as a new run on the same paragraph.
                    new_run = first_par.add_run(stripped)
                    set_run(new_run, size, is_bold, old_color, font_name)
                    new_run.font.color.rgb = RGBColor(255, 0, 0)
    obj.save('標註後的表格.docx')


# Driver: walk the document body in order, highlighting the paragraph keyword
# (`word`) inside paragraphs and the data cells of table rows whose first cell
# equals `table_text`, then save the annotated copy.
for block in iter_block_items(obj):
    if isinstance(block, Paragraph):
        for r in block.runs:
            # Matching is per run; a keyword split across runs is not detected.
            if word not in r.text:
                continue
            print(r.text)
            print(r.style.name)
            font_size = r.font.size
            bold = r.bold
            color = r.font.color.rgb
            name = u'楷體'
            # Text pieces around each keyword occurrence.
            rest = r.text.split(word)
            # Empty the original run; its content is re-created piece by piece.
            r.text = ''
            # add_run() appends at the end of the paragraph, which would move
            # the text behind any later runs. Each new run element is therefore
            # relocated to sit right after the previous piece.
            anchor = r._element
            for text in rest[:-1]:
                run = block.add_run(text=text)
                set_run(run, font_size, bold, color, name)
                anchor.addnext(run._element)
                anchor = run._element
                # The keyword itself: same formatting, then forced to red.
                run = block.add_run(word)
                set_run(run, font_size, bold, color, name)
                run.font.color.rgb = RGBColor(255, 0, 0)
                anchor.addnext(run._element)
                anchor = run._element
            # Trailing text after the last keyword occurrence.
            run = block.add_run(rest[-1])
            set_run(run, font_size, bold, color, name)
            anchor.addnext(run._element)
    else:
        # Convert the table to a DataFrame to read its shape and first column.
        pd_block = pd.DataFrame(table2list(block))
        for rows in range(pd_block.shape[0]):
            # Row 0 is skipped.
            if rows == 0: continue
            # Only rows whose first cell equals the table keyword are rewritten.
            if table_text != pd_block.iloc[rows, 0]: continue
            for cols in range(pd_block.shape[1]):
                # Column 0 holds the keyword itself and is left untouched.
                if cols == 0: continue
                # Only the first paragraph of the cell is processed.
                rs = block.cell(rows, cols).paragraphs[0]
                for r in rs.runs:
                    font_size = r.font.size
                    bold = r.bold
                    color = r.font.color.rgb
                    name = u'楷體'
                    data = r.text.strip()
                    # Empty the original run and re-add its text as a red run.
                    r.text = ''
                    run = rs.add_run(data)
                    set_run(run, font_size, bold, color, name)
                    run.font.color.rgb = RGBColor(255, 0, 0)

obj.save('段落與表格標註後的文檔.docx')

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章