Word清洗,docx

import docx
from win32com import client as wc
import re
import os
import os.path
def getListFiles(path):
    """Collect the paths of all ``.docx`` files located beneath *path*.

    The directory tree is traversed with ``os.walk``; every file whose name
    ends with the (case-sensitive) suffix ``.docx`` is included.

    Args:
        path: Root directory to search.

    Returns:
        A list of file paths, each built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".docx")
    ]
# Script body: read every .docx file found under the input folder, split each
# paragraph into phrases, and append the phrases that contain Chinese
# characters to a UTF-8 text file (one phrase per line).

# Phrase delimiters: tab, the Chinese full stop, and newline.
_PHRASE_SPLIT_RE = re.compile('[\t。\n]')
# Phrases containing a URL marker or a run of 8+ digits are treated as noise.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
_NOISE_RE = re.compile(r'(http|www|\d{8,})')


def _clean_phrases(text):
    """Return the phrases of one paragraph that should be written out.

    Args:
        text: The text of a single paragraph.

    Returns:
        A list of phrases, in their original order, each containing at least
        one character in the range U+4E00..U+9FFF. Processing stops at the
        first phrase that matches the noise pattern.
    """
    kept = []
    for phrase in _PHRASE_SPLIT_RE.split(text):
        if _NOISE_RE.search(phrase):
            # NOTE(review): this abandons the REST of the paragraph, not just
            # the offending phrase. Kept as-is to preserve existing output;
            # confirm whether `continue` was intended.
            break
        if any(u'\u4e00' <= letter <= u'\u9fff' for letter in phrase):
            kept.append(phrase)
    return kept


ret = getListFiles(r'C:\Users\Administrator\Desktop\國網遼寧電力\新建文件夾')
# `with` guarantees the output file is flushed and closed even if one of the
# documents cannot be opened or parsed.
with open(r'C:\Users\Administrator\Desktop\國網遼寧電力\電子版知識庫20171120版本(大部分知識內容請在省中心文件夾查看)\wordsdoc.txt', 'a', encoding='utf-8', buffering=4096) as f2:
    for file in ret:
        print(file)
        doc = docx.Document(file)
        for para in doc.paragraphs:
            for phrase in _clean_phrases(para.text):
                f2.write(phrase + '\n')
                print(phrase)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章