import docx
from win32com import client as wc
import re
import os
import os.path
def getListFiles(path):
    """Return the paths of all ``.docx`` files found under *path*.

    The directory tree rooted at *path* is walked recursively with
    ``os.walk``; each matching file is returned as ``root`` joined with the
    file name, in the order ``os.walk`` reports them.

    Args:
        path: Directory to search. A non-existent directory yields an empty
            list, because ``os.walk`` ignores listing errors by default.

    Returns:
        A list of path strings, one per ``.docx`` file found.
    """
    ret = []
    for root, _dirs, files in os.walk(path):
        for filename in files:
            # Microsoft Word keeps "~$name.docx" owner/lock files next to an
            # open document; they are not real documents and cannot be
            # parsed, so they are never reported.
            if filename.startswith("~$"):
                continue
            # Compare case-insensitively so "REPORT.DOCX" is found as well.
            if filename.lower().endswith(".docx"):
                ret.append(os.path.join(root, filename))
    return ret
# Paragraph text is cut into phrases at tabs, Chinese full stops and newlines.
_PHRASE_SEPARATORS = re.compile('[\t。\n]')
# Phrases containing a URL fragment or a run of 8+ digits are treated as noise.
_NOISE_PATTERN = re.compile(r'(http|www|\d{8,})')


def _iter_chinese_phrases(text):
    """Yield the phrases of *text* that contain at least one CJK character.

    *text* is split on tab, '。' and newline. Phrases are examined in order;
    a phrase is yielded when any of its characters lies in the range
    U+4E00..U+9FFF.

    Processing of *text* stops at the first phrase that matches the noise
    pattern (``http``, ``www`` or eight or more consecutive digits), so any
    phrases after it in the same text are dropped too.
    NOTE(review): stopping (rather than skipping just that phrase) mirrors
    the original ``break``; confirm this is intended and not a ``continue``.
    """
    for phrase in _PHRASE_SEPARATORS.split(text):
        if _NOISE_PATTERN.search(phrase):
            break
        if any('\u4e00' <= letter <= '\u9fff' for letter in phrase):
            yield phrase


ret = getListFiles(r'C:\Users\Administrator\Desktop\國網遼寧電力\新建文件夾')
# Append mode: phrases from earlier runs are kept. The context manager makes
# sure the file is flushed and closed even if a document fails to load.
with open(r'C:\Users\Administrator\Desktop\國網遼寧電力\電子版知識庫20171120版本(大部分知識內容請在省中心文件夾查看)\wordsdoc.txt', 'a', encoding='utf-8', buffering=4096) as f2:
    for file in ret:
        print(file)
        doc = docx.Document(file)
        for para in doc.paragraphs:
            for phrase in _iter_chinese_phrases(para.text):
                # One phrase per line in the output file, echoed to stdout.
                f2.write(phrase + '\n')
                print(phrase)