在線分析office(xls, xlsx, doc, docx)文檔內容 擴展可在線預覽

最近在做一個文檔分析系統,要求是在上傳的兩千份左右的 Office 文件裏,篩選出含有指定關鍵詞(約一千個)的文件。因爲以前主業開發 PHP,但是 PHP 在處理這塊的時候“力不從心”,正好研究了小半年的 Python,於是希望用 Py 和 PHP 混合開發,簡單架構如下:

(爲啥有Node,因爲後邊調用了 textract 來處理某些文件)

前臺PHP上傳那塊不做贅述,常規CRUD,Upload,重點是後邊 Py+Node處理這一塊,貼代碼

環境:CentOS 6.9 Python3.6 Node10.16 LibreOffice

框架:Thinkphp5.1 Layui

依賴都在代碼聲明裏了。

# -*- coding: UTF-8 -*-
import sys
import getopt
import pymysql
import os
import re
import subprocess
import time

# 這個文件是處理主邏輯,基本思路是 將所有文件都轉爲 xhtml, 部分 LibreOffice 轉不了的文件交給 textract 處理, 雙保險, 再處理不了的, 返回提示, 讓用戶處理一下文件, 那種文件數量極少.
# init
# Open the MySQL connection shared by every function in this script.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts
# positional connection parameters.
db = pymysql.connect(host="127.0.0.1", user="root", password="root", database="document")
# Create the cursor object used for all queries in this script.
cursor = db.cursor()


# 接受一個參數, 文件id
def main(argv):
    """Parse the command-line options into a job description.

    Args:
        argv: Argument list without the program name (``sys.argv[1:]``).

    Returns:
        A dict with two keys: ``'type'`` (0 when neither option was given,
        1 for ``-i/--id``, 2 for ``-p/--path``) and ``'args'`` (the raw
        option value). When both options are given, the last one wins.

    Exits the process with status 3 on an unknown option, and with status 0
    after printing the help text for ``-h/--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:p:h", ["id=", "path=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)
    params = {
        'type': 0,
        'args': '',
    }
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8>')
            print('if you want to deal files by path, this order will according to this path , get all files to deal')
            print('main.py -p <ex: ./pdf/>')
            sys.exit()
        # getopt reports long options without the trailing '=' used in the
        # declaration, so compare against '--id' / '--path'.
        elif opt in ('-i', '--id'):
            params = {
                'type': 1,
                'args': arg,
            }
        elif opt in ('-p', '--path'):
            params = {
                'type': 2,
                'args': arg,
            }
    return params


def getFile(params):
    """Fetch the requested rows from ``doc_file`` and process each file.

    Only id mode (``params['type'] == 1``) is handled; any other type does
    nothing. In id mode ``params['args']`` is either the literal ``'all'``
    (every row whose ``delete_time`` is null) or a comma-separated list of
    integer ids.

    Args:
        params: Dict with the keys ``'type'`` and ``'args'``.

    Raises:
        ValueError: If an entry of the id list is not an integer.
    """
    if 1 != params['type']:
        return

    if 'all' == params['args']:
        cursor.execute("SELECT id, path from doc_file where delete_time is null")
    else:
        # Convert every id to int and bind it as a query parameter instead of
        # splicing command-line text into the SQL statement.
        ids = [int(part) for part in params['args'].split(',') if part.strip()]
        if not ids:
            # "IN ()" is not valid SQL; nothing was requested.
            return
        placeholders = ', '.join(['%s'] * len(ids))
        cursor.execute(
            "SELECT id, path from doc_file where id IN (" + placeholders + ")",
            ids,
        )

    for file_id, file_path in cursor.fetchall():
        do(file_id, file_path)


def do(id, path):
    """Convert one file to XHTML and record it in ``doc_content``.

    The file is first handed to ``dealFormat``. If the path it returns does
    not exist afterwards, the ``textract`` command is run on ``path`` and its
    output is written to that path instead; the id is then also appended to
    ``bad.txt``. In both cases one ``doc_content`` row is inserted for the id.

    Args:
        id: Value stored in ``doc_content.doc_id``.
        path: File path as passed on to ``dealFormat`` and ``textract``.
    """
    x_path = dealFormat(path)
    used_fallback = not os.path.exists(x_path)

    if used_fallback:
        # NOTE(review): textract receives `path` exactly as given, while
        # dealFormat prefixes it with r_path -- confirm the working directory
        # makes both resolve to the same file.
        print('textract Waiting...')
        sub = subprocess.Popen(
            [r"textract", path],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        # communicate() drains stdout while waiting. Polling without reading
        # can hang forever once the child fills the pipe buffer.
        content, _ = sub.communicate()
        print(content)
        text = content.decode('utf-8', 'ignore')
        # Write UTF-8 explicitly so the result does not depend on the locale.
        with open(x_path, 'w', encoding='utf-8') as out:
            out.write(text)

    # Bind the id as a parameter rather than formatting it into the SQL text.
    cursor.execute("INSERT INTO doc_content(doc_id) VALUES (%s)", (id,))
    db.commit()

    if used_fallback:
        # Keep a '|'-separated list of ids that needed the textract fallback.
        with open('bad.txt', 'a+') as bad:
            bad.write(str(id))
            bad.write('|')
        print('This file is bad:', id)
    # NOTE: reading the converted file back and storing its content is not
    # implemented here; only the doc_id row is inserted.


def dealFormat(path):
    """Convert a file to XHTML inside the ``cache3/`` directory.

    ``path`` is appended to the module-level ``r_path``. The target is
    ``r_path + 'cache3/' + <base name of the file with an .xhtml extension>``.
    A ``.html`` source is copied with ``cp``; anything else is converted with
    LibreOffice's ``soffice --convert-to xhtml``. If the target already
    exists, no conversion is started.

    Args:
        path: File path relative to ``r_path``.

    Returns:
        The target XHTML path. The file may still be missing when the
        conversion failed, so callers should check for its existence.
    """
    path = r_path + path
    stem, extension = os.path.splitext(path)
    cache_name = os.path.basename(stem + '.xhtml')
    x_path = r_path + r'cache3/' + cache_name

    if os.path.exists(x_path):
        print('This file has exists, pass')
        # Return the real path, not a message: callers test this value with
        # os.path.exists() and may write to it.
        return x_path

    print('Now:', path)
    if '.html' == extension:
        sub = subprocess.Popen([r"cp", path, x_path])
    else:
        sub = subprocess.Popen([
            r"/opt/libreoffice6.1/program/soffice",
            "--convert-to", "xhtml",
            "--outdir", r_path + r"cache3/",
            path,
        ])
    # Wait until the child process has finished.
    while sub.poll() is None:
        print('soffice Waiting...')
        time.sleep(0.1)
    return x_path


# 讀html
def read(path=''):
    """Return the raw bytes of the file at *path*.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as ``bytes``; ``False`` when *path* is empty; ``None``
        (after printing ``'error'``) when the file cannot be read.
    """
    try:
        if 0 == len(path):
            return False
        # The context manager closes the handle even if read() fails.
        with open(path, "rb") as fp:
            return fp.read()
    except (OSError, TypeError, ValueError):
        # OSError: missing or unreadable file. TypeError/ValueError: a path
        # argument that is not a usable string.
        print('error')
        return None


# Directory the web site lives in. Defined at module level, not only under
# the __main__ guard, so the functions that read it also work when this file
# is imported.
r_path = r"/www_2/wwwroot/document/public/"

if __name__ == "__main__":
    params = main(sys.argv[1:])

    getFile(params)
# -*- coding: UTF-8 -*-

# 文件獲取兩個參數 -i [文件id] -c [公司id]
import sys
import getopt
import pymysql
import os

# 這個文件是搜索關鍵詞的邏輯, 直接讓 python 讀上一步處理好的 xhtml 文件, 利用 python 處理字符串快的優勢, 返回所需格式
# init
# Open the MySQL connection shared by every function in this script.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts
# positional connection parameters.
db = pymysql.connect(host="127.0.0.1", user="root", password="root", database="document")
# Create the cursor object used for all queries in this script.
cursor = db.cursor()


# 接受兩個參數 文件id  公司id
def main(argv):
    """Parse the command-line options of the keyword search script.

    Args:
        argv: Argument list without the program name (``sys.argv[1:]``).

    Returns:
        A dict that contains ``'id'`` when ``-i/--id`` was given and ``'cid'``
        when ``-c/--cid`` was given, each holding the raw option value.

    Exits the process with status 3 on an unknown option, and with status 0
    after printing the help text for ``-h/--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:c:h", ["id=", "cid=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)
    params = {}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8> -c <ex:1,2,3,4,5,6,7,8>')
            sys.exit()
        # getopt reports long options without the trailing '=' used in the
        # declaration, so compare against '--id' / '--cid'.
        elif opt in ('-i', '--id'):
            params['id'] = arg
        elif opt in ('-c', '--cid'):
            params['cid'] = arg
    return params


def got(params):
    """Print which keywords occur in which converted XHTML files.

    For every ``doc_file`` row whose id is listed in ``params['id']`` the
    matching cache file ``r_path + "/cache3/" + <base name>.xhtml`` is read.
    Every non-deleted ``doc_keyword`` row whose ``company_id`` is listed in
    ``params['cid']`` is then searched for in that text. The result is printed
    as a list of ``[file id, company id, keyword]`` entries.

    Args:
        params: Dict with the keys ``'id'`` and ``'cid'``, each a
            comma-separated list of integer ids.

    Raises:
        KeyError: If ``'id'`` or ``'cid'`` is missing from *params*.
        ValueError: If an entry of either list is not an integer.
    """
    # Convert the ids to int and bind them as query parameters instead of
    # splicing command-line text into the SQL statements.
    file_ids = [int(part) for part in params['id'].split(',') if part.strip()]
    company_ids = [int(part) for part in params['cid'].split(',') if part.strip()]

    # Map each requested file to the XHTML file generated for it.
    xhtml_file = []
    if file_ids:
        marks = ', '.join(['%s'] * len(file_ids))
        cursor.execute(
            "SELECT  path, id FROM doc_file WHERE id IN (" + marks + ")",
            file_ids,
        )
        for doc_path, doc_id in cursor.fetchall():
            stem, _ = os.path.splitext(doc_path)
            cache_name = os.path.basename(stem + '.xhtml')
            xhtml_file.append([r_path + "/cache3/" + cache_name, doc_id])

    # Load the keywords of the requested companies.
    keys = []
    if company_ids:
        marks = ', '.join(['%s'] * len(company_ids))
        cursor.execute(
            "SELECT company_id, name FROM doc_keyword WHERE delete_time is null"
            " AND company_id IN (" + marks + ")",
            company_ids,
        )
        for company_id, name in cursor.fetchall():
            keys.append([name, company_id])

    # Check every file for every keyword.
    has_key = []
    for xhtml_path, doc_id in xhtml_file:
        try:
            with open(xhtml_path, 'r', -1, 'utf-8', 'ignore') as f:
                text = f.read()
        except OSError as err:
            # One unreadable cache file must not abort the whole search.
            # Report on stderr so the list printed on stdout stays clean.
            print('Cannot read %s: %s' % (xhtml_path, err), file=sys.stderr)
            continue
        for name, company_id in keys:
            if name in text:
                has_key.append([doc_id, company_id, name])
    print(has_key)





# Directory the web site lives in. Defined at module level, not only under
# the __main__ guard, so the functions that read it also work when this file
# is imported.
r_path = r"/www_2/wwwroot/document/public/"

if __name__ == "__main__":
    params = main(sys.argv[1:])
    got(params)

至於調用,用 PHP 的 exec 調用即可,不過要注意 exec 的安全性(傳入的參數務必先轉義,例如使用 escapeshellarg)。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章