pip install python-docx
1. 讀取 sample.docx:
import docx
from docx import Document
from pprint import pprint
def getText(filename):
    """Return the text of every paragraph in a .docx file.

    Args:
        filename: Path of the document; passed straight to ``Document``.

    Returns:
        A list with one string per entry of ``doc.paragraphs``, in
        document order.
    """
    doc = Document(filename)
    return [para.text for para in doc.paragraphs]
# Pretty-print the list of paragraph strings read from sample.docx.
pprint(getText('sample.docx'))
2. 創建 demo_docx.docx:
import docx
from docx import Document
# Build a demo document (title, styled runs, lists, a picture and a table)
# and save it as demo_docx.docx.
document = Document()
document.add_heading('This is Title', 0)  # level 0 -> document title

# A paragraph followed by a bold run and an italic run.
p = document.add_paragraph('A paragraph!')
p.add_run('bold text ').bold = True
p.add_run('italic text').italic = True

# Styles are referenced by their names ('List Bullet', 'List Number');
# looking them up by style id ('ListBullet') is deprecated in python-docx.
document.add_paragraph('unordered list 1', style='List Bullet')
for i in range(3):
    document.add_paragraph('ordered list {}'.format(i), style='List Number')

# add_picture also accepts optional width/height lengths from docx.shared
# (for example Inches(...) or Cm(...)) to scale the image.
document.add_picture('1.jpg')

# ---------------- add table ----------------
table = document.add_table(rows=1, cols=2)
header_cells = table.rows[0].cells  # row 0 holds the column headers
header_cells[0].text = 'name'
# Second column header goes in cell 1; writing it to cell 0 would
# overwrite 'name' and leave this column untitled.
header_cells[1].text = 'gender'

d = [dict(name='A', gender='male'), dict(name='B', gender='female')]
for item in d:
    row_cells = table.add_row().cells  # append one data row per record
    row_cells[0].text = str(item['name'])
    row_cells[1].text = str(item['gender'])

document.add_page_break()  # start a new page after the table
document.save('demo_docx.docx')
# ------------
word批量轉pdf 及 失敗解決方案
利用下面的 code 批量將 dir_word 內的 docx、doc 文檔轉爲 dir_pdf 內的 .pdf,只是簡單利用 word 的「另存爲 pdf」功能。
但不加 sleep 時,每次只能成功轉換第一個 word 文檔,之後會報錯:
raise AttributeError("%s.%s" % (self._username_, attr)) AttributeError: <unknown>.Open
檢查代碼也沒有發現錯誤,後來想到可能是因爲上一個操作還沒完成導致的,於是加了 1s 的休眠,運行就正常了。
from win32com.client import Dispatch
import os
from time import sleep
# FileFormat value handed to Document.SaveAs to request PDF output
# (the wdFormatPDF member of Word's WdSaveFormat enumeration).
wdFormatPDF = 17
def doc2pdf(input_file, output_file):
    """Convert one Word document to PDF through Word's COM automation.

    Starts Word, opens ``input_file``, saves it to ``output_file`` with
    ``FileFormat=wdFormatPDF``, then closes the document and quits Word.
    The document and the Word instance are released even when opening or
    saving raises, so a failed conversion does not leave Word running.

    Args:
        input_file: Path of the .doc/.docx file to open.
        output_file: Path of the PDF file to write.

    Raises:
        Whatever the COM calls raise; errors are propagated after cleanup.
    """
    print(input_file)
    print(output_file)
    word = Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(input_file)
        try:
            doc.SaveAs(output_file, FileFormat=wdFormatPDF)
        finally:
            doc.Close()
    finally:
        word.Quit()
if __name__ == "__main__":
    # Batch-convert every .doc/.docx under dir_word into a PDF in dir_pdf.
    dir_word = "F:\\python\\word2pdf\\word"  # directory holding the Word files
    dir_pdf = "F:\\python\\word2pdf\\pdf"  # directory receiving the PDFs
    for root, _dirs, filenames in os.walk(dir_word):
        for file in filenames:
            stem, ext = os.path.splitext(file)
            if ext.lower() not in (".doc", ".docx"):
                continue
            # "~$name.docx" files are Word's lock files, not documents.
            if file.startswith("~$"):
                continue
            # Join with root (not dir_word) so files that os.walk finds in
            # subdirectories are opened from where they actually live; only
            # the extension is swapped, never other ".doc" text in the name.
            doc2pdf(os.path.join(root, file),
                    os.path.join(dir_pdf, stem + ".pdf"))
            # Pause 1s between conversions: without it only the first file
            # converted and later ones failed with
            # "AttributeError: <unknown>.Open".
            sleep(1)