提取PDF簡歷信息

原文地址:http://www.yooongchun.com/2019/08/13/pdf-resume-extractor/

業務場景

有一個需求,就是從大批量的簡歷中篩選出需要的信息。本文所需要的信息是姓名、電話和Email地址。爲了提高效率,故使用程序來完成這個工作。

文件轉換:word轉爲pdf

原始文件包含了.doc.doc .docx.docx.pdf.pdf三種格式。經過驗證,發現使用PDF轉換後提取信息的效果更好一些,因此第一步需要把word文件轉換爲pdf文件。這裏使用python的win32comwin32com包來實現轉換,需要說明一點的是這個包需要調用windows下的word程序,因此只支持windows平臺。
第一步:安裝依賴庫

pip install pywin32

第二步:轉換文件

def word2pdf(file_dir):
    """word轉爲pdf"""
    w = Dispatch('Word.Application')
    w.Visible = 0
    w.DisplayAlerts = 0
    doc = w.Documents.Open(file_dir)
    new_path = os.path.splitext(file_dir)[0] + '.pdf'
    doc.SaveAs(new_path, FileFormat=17)
    doc.Close()
    w.Quit()

提取思路

基本的提取思路是先把PDF文件的文本內容提取出來,然後通過正則表達式去匹配值。這就導致了一個問題,姓名只能猜測而無法準確獲取。PDF文本提取使用的是pdfplumber這個庫,通過以下命令安裝:

pip install pdfplumber

然後提取PDF內容只需要簡單地幾行代碼即可搞定

pdf=pdfplumber.open(file)
for page in pdf.pages:
    text=page.extract_text()
    print(text)

代碼

思路清楚之後把完整的代碼封裝爲一個類,方便使用,如下:

import os
import re
import pdfplumber as pb
from win32com.client import Dispatch
import pandas as pd
import sys

class Extractor(object):
    """抽取單個文件的信息"""

    def __init__(self, file_dir):
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1] in [".doc", ".docx"]:
            try:
                self.__word2pdf()
            except Exception as e:
                print(e)
                return

    def __doc2docx(self):
        """doc轉爲docx"""
        w = Dispatch('Word.Application')
        w.Visible = 0
        w.DisplayAlerts = 0
        doc = w.Documents.Open(self.file_dir)
        new_path = os.path.splitext(self.file_dir)[0] + '.docx'
        doc.SaveAs(new_path, 12, False, "", True, "", False, False, False, False)
        doc.Close()
        w.Quit()
        os.remove(self.file_dir)
        self.file_dir = new_path
        return new_path

    def __word2pdf(self):
        """word轉爲pdf"""
        w = Dispatch('Word.Application')
        w.Visible = 0
        w.DisplayAlerts = 0
        doc = w.Documents.Open(self.file_dir)
        new_path = os.path.splitext(self.file_dir)[0] + '.pdf'
        doc.SaveAs(new_path, FileFormat=17)
        doc.Close()
        w.Quit()
        os.remove(self.file_dir)
        self.file_dir = new_path
        return new_path

    def __extract_text(self):
        """抽取文本內容"""
        text = ""
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            pdf = pb.open(self.file_dir)
            for page in pdf.pages:
                text += page.extract_text() if page.extract_text() else ""
        # elif os.path.splitext(self.file_dir)[1] == ".docx":
        #     doc = docx.Document(self.file_dir)
        #     for para in doc.paragraphs:
        #         text += para.text
        return text

    def __extract_words(self):
        """抽取單詞"""
        words = []
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            pdf = pb.open(self.file_dir)
            for page in pdf.pages:
                words += page.extract_words()
        # elif os.path.splitext(self.file_dir)[1] == ".docx":
        #     doc = docx.Document(self.file_dir)
        #     for para in doc.paragraphs:
        #         words.append(para.text)
        return words

    def __search_name(self):
        """搜索姓名"""
        names = []
        full_text = self.__extract_text()
        # 先通過"姓名"字段去查找”
        for line in full_text.split("\n"):
            if re.search(r"姓\s*名", line):
                name = re.findall(r"姓\s*名[::\s]*[\u4e00-\u9fa5]{2,4}", line)[0]
                names.append(re.sub(r"[姓名::\s]", "", name))
        # 在"姓名"字段中找不到結果,則按照文字長度去猜測一個
        if len(names) < 1:
            for line in re.split(r"\n|\s+", full_text):
                if re.search(r"\d", line):
                    continue
                word = ""
                for w in line:  # 去重
                    if w not in word:
                        word += w
                if 2 <= len(word) <= 4:
                    _names = re.findall(r"[\u4e00-\u9fa5]{2,4}", word)
                    names += _names
                    # break
        return names

    def __search_email(self):
        """搜索Email地址"""
        full_words = self.__extract_words()
        email = ""
        for word in full_words:
            if os.path.splitext(self.file_dir)[1] == ".pdf":
                text = word["text"]
            else:
                text = word
            if "@" in text and "." in text:
                for e in re.findall(r"[a-zA-Z0-9_\-.@]+", text):
                    if "@" in e:
                        email = e
                        break
            if email != "":
                break
        return email

    def __search_phone(self):
        """搜索電話號碼"""
        full_text = self.__extract_text()
        phone = ""
        # 直接通過文件名查找
        file_name = re.split(r"/+|\\+", self.file_dir)[-1]
        number = re.findall(r"\d{11,13}", file_name)
        if len(number) > 0 and re.search(r"^1", number[0]):
            phone = number[0]
        else:
            # 通過關鍵詞查找
            for line in re.split(r"[\n\s]+", full_text):
                if "電話" in line or "手機" in line:
                    line = re.sub(r"[()()::+\-]", "", line)
                    number = re.findall(r"\d{11,13}", line)[0]
                    phone = re.sub(r"^(86)", "", number)
                    break
            # 直接通過數字長度查找
            if phone == "":
                text = re.sub(r"[()()+\-]", "", full_text)
                phones = re.findall(r"\d{11,13}", text)
                phones = [re.sub(r"^(86)", "", p) for p in phones if re.search(r"^1", re.sub(r"^(86)", "", p))]
                phone = ",".join(set(phones))
        return phone

    def search(self):
        """入口函數,返回搜索結果"""
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        directory = sep_dir[-2]
        file_name = sep_dir[-1]
        info = {"directory": directory, "file_name": file_name, "phone": "", "user_name": "", "email": ""}

        # 查找姓名
        try:
            names = self.__search_name()
            info["user_name"] = ",".join(names)
        except Exception as e:
            print(e)

        # 查找Email
        try:
            email = self.__search_email()
            info["email"] = email
        except Exception as e:
            print(e)

        # 查找電話
        try:
            phone = self.__search_phone()
            info["phone"] = phone
        except Exception as e:
            print(e)
        return info

最後,還需要處理批量文件,寫一個迭代查找文件的函數:

def find_files(file_dir):
    """迭代查找文件"""
    file_paths = []
    for root, _, files in os.walk(file_dir):
        for file in files:
            path = os.path.join(root, file)
            rear = os.path.splitext(path)[1]
            if rear in [".doc", ".docx", ".pdf"]:
                file_paths.append(path)
    return file_paths

以及,文件入口:

if __name__ == "__main__":
    FILE_DIR = r"data"
    OUT_DIR = r"resume-data.xlsx"
    args = sys.argv

    if len(args) > 1:
        FILE_DIR = args[1]
    if len(args) > 2:
        OUT_DIR = args[2]
        FILE_DIR = args[1]
    # 文件存在,則追加序號
    cnt = 0
    while os.path.isfile(os.path.abspath(OUT_DIR)):
        OUT_DIR = os.path.splitext(OUT_DIR)[0] + "_" + str(cnt) + ".xlsx"
        cnt += 1
    writer = pd.ExcelWriter(OUT_DIR)
    for folder in os.listdir(FILE_DIR):
        file_dir = os.path.join(os.path.abspath(FILE_DIR), folder)
        paths = find_files(file_dir)
        print("Total {} file(s) in directory {}:".format(len(paths), folder))
        df = pd.DataFrame()
        for index, file_path in enumerate(paths):
            info = Extractor(file_dir=file_path).search()
            df = df.append(info, ignore_index=True)
            print(index, info["file_name"], info["email"], info["phone"], info["user_name"])
        df.to_excel(writer, folder)
    print("Save to file ", OUT_DIR)
    writer.save()
    print("All done.")

使用說明

首先,你的簡歷文件結構應該如下:

data
 - 目錄一
   - 一些pdf或者word文件
 - 目錄...
   - ...

使用時可以直接通過以下方式調用:

python extractor.py data result.xlsx

其中 data代表簡歷存放的根目錄,result.xlsx代表保存文件名,這兩個參數都是可選的,不加則代表使用默認值data 和resume-data.xlsx

完整的代碼下載地址:extractor.py

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章