原文地址:http://www.yooongchun.com/2019/08/13/pdf-resume-extractor/
業務場景
有一個需求,就是從大批量的簡歷中篩選出需要的信息。本文所需要的信息是姓名、電話和Email地址。爲了提高效率,故使用程序來完成這個工作。
文件轉換:word轉爲pdf
原始文件包含了 doc、docx 和 pdf 三種格式。經過驗證,發現把文件轉換爲PDF後提取信息的效果更好一些,因此第一步需要把word文件轉換爲pdf文件。這裏使用python的 pywin32 包來實現轉換,需要說明一點的是這個包需要調用windows下的word程序,因此只支持windows平臺。
第一步:安裝依賴庫
pip install pywin32
第二步:轉換文件
def word2pdf(file_dir):
    """Convert a Word document to PDF through Word's COM automation.

    The PDF is written next to the source document with the same base name.
    Requires Windows with Microsoft Word installed (``Dispatch`` comes from
    ``win32com.client``).

    Args:
        file_dir: Path of the ``.doc``/``.docx`` file to convert.

    Returns:
        The absolute path of the generated PDF file.
    """
    # Word runs as a separate process; a relative path would be resolved
    # against Word's own working directory rather than Python's, so make the
    # path absolute before handing it over.
    source = os.path.abspath(file_dir)
    new_path = os.path.splitext(source)[0] + '.pdf'
    w = Dispatch('Word.Application')
    try:
        w.Visible = 0
        w.DisplayAlerts = 0
        doc = w.Documents.Open(source)
        try:
            # 17 is Word's wdFormatPDF save format.
            doc.SaveAs(new_path, FileFormat=17)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # hidden WINWORD process behind.
        w.Quit()
    return new_path
提取思路
基本的提取思路是先把PDF文件的文本內容提取出來,然後通過正則表達式去匹配值。這就導致了一個問題,姓名只能猜測而無法準確獲取。PDF文本提取使用的是pdfplumber這個庫,通過以下命令安裝:
pip install pdfplumber
然後提取PDF內容只需要簡單的幾行代碼即可搞定:
# Open the PDF in a ``with`` block so the underlying file handle is closed
# once every page has been read.
with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        # Text of the current page as extracted by pdfplumber.
        text = page.extract_text()
        print(text)
代碼
思路清楚之後把完整的代碼封裝爲一個類,方便使用,如下:
import os
import re
import pdfplumber as pb
from win32com.client import Dispatch
import pandas as pd
import sys
class Extractor(object):
    """Extract name, phone number and e-mail address from one resume file.

    Word documents (``.doc``/``.docx``) are converted to PDF when the object
    is constructed, using Word's COM automation (Windows only).  After a
    successful conversion the original Word file is deleted and ``file_dir``
    points at the PDF.  The PDF text is read with pdfplumber and searched
    with regular expressions; names can only be guessed, not identified
    reliably.
    """

    # "姓名" label, optional separators (ASCII or full-width colon,
    # whitespace), then a 2-4 character Chinese name captured as group 1.
    _NAME_FIELD = re.compile(r"姓\s*名[:\uff1a\s]*([\u4e00-\u9fa5]{2,4})")
    # A run of 2-4 Chinese characters, used when guessing a name.
    _CJK_WORD = re.compile(r"[\u4e00-\u9fa5]{2,4}")
    # Characters accepted inside an e-mail candidate.
    _EMAIL_CHARS = re.compile(r"[a-zA-Z0-9_\-.@]+")
    # A run of 11-13 digits (mobile number, optionally with an "86" prefix).
    _PHONE_DIGITS = re.compile(r"\d{11,13}")
    # Punctuation removed before looking for digits; both the ASCII and the
    # full-width parentheses/colon are covered.
    _KEYWORD_PUNCT = re.compile(r"[()\uff08\uff09:\uff1a+\-]")
    _TEXT_PUNCT = re.compile(r"[()\uff08\uff09+\-]")
    # Path separators: forward slashes or backslashes.
    _PATH_SEP = re.compile(r"/+|\\+")
    # Labels that introduce a phone number (traditional and simplified).
    _PHONE_KEYWORDS = ("電話", "手機", "电话", "手机")

    def __init__(self, file_dir):
        """Remember the file path and convert Word documents to PDF.

        Args:
            file_dir: Path of the resume file (``.pdf``, ``.doc``, ``.docx``).
        """
        self.file_dir = file_dir
        # Lazily filled caches so the PDF is parsed once per kind of data.
        self._text = None
        self._words = None
        if self._extension() in (".doc", ".docx"):
            try:
                self.__word2pdf()
            except Exception as e:
                # Best effort: report the failure; the searches will then
                # simply find nothing for this file.
                print(e)

    def _extension(self):
        """Return the lower-cased extension of ``file_dir`` (with the dot)."""
        return os.path.splitext(self.file_dir)[1].lower()

    def __convert_with_word(self, suffix, *save_args, **save_kwargs):
        """Save ``file_dir`` under a new suffix via Word and switch to it.

        Args:
            suffix: Extension of the target file, e.g. ``".pdf"``.
            *save_args, **save_kwargs: Extra arguments passed to Word's
                ``Document.SaveAs`` after the target path.

        Returns:
            The path of the converted file, which also becomes ``file_dir``.
            The source file is deleted once the conversion has succeeded.
        """
        source = self.file_dir
        new_path = os.path.splitext(source)[0] + suffix
        w = Dispatch('Word.Application')
        try:
            w.Visible = 0
            w.DisplayAlerts = 0
            doc = w.Documents.Open(source)
            try:
                doc.SaveAs(new_path, *save_args, **save_kwargs)
            finally:
                doc.Close()
        finally:
            # Quit even when opening/saving failed so no hidden Word
            # process is left running.
            w.Quit()
        # Never delete the file that was just written (source and target
        # can be the same file, e.g. ".DOCX" -> ".docx" on Windows).
        if os.path.normcase(new_path) != os.path.normcase(source):
            os.remove(source)
        self.file_dir = new_path
        # The cached content belonged to the old file.
        self._text = None
        self._words = None
        return new_path

    def __doc2docx(self):
        """Convert the document to ``.docx`` (12 is wdFormatXMLDocument)."""
        return self.__convert_with_word(
            ".docx", 12, False, "", True, "", False, False, False, False)

    def __word2pdf(self):
        """Convert the document to ``.pdf`` (17 is wdFormatPDF)."""
        return self.__convert_with_word(".pdf", FileFormat=17)

    def __extract_text(self):
        """Return the text of all PDF pages, or ``""`` for non-PDF files."""
        if self._text is None:
            pages = []
            if self._extension() == ".pdf":
                with pb.open(self.file_dir) as pdf:
                    for page in pdf.pages:
                        # A page may yield no text; treat it as empty.
                        pages.append(page.extract_text() or "")
            # Join pages with a line break so the last line of one page is
            # not glued to the first line of the next.
            self._text = "\n".join(pages)
        return self._text

    def __extract_words(self):
        """Return pdfplumber's word entries of all pages (``[]`` if no PDF)."""
        if self._words is None:
            words = []
            if self._extension() == ".pdf":
                with pb.open(self.file_dir) as pdf:
                    for page in pdf.pages:
                        words += page.extract_words()
            self._words = words
        return self._words

    def __search_name(self):
        """Return candidate names found in the text.

        Lines carrying a "姓名" label are used first.  If none yields a
        name, every whitespace-separated token of 2-4 distinct characters
        without digits is scanned for Chinese character runs as a guess.
        """
        full_text = self.__extract_text()
        names = []
        # First look for an explicit "姓名" field.
        for line in full_text.split("\n"):
            match = self._NAME_FIELD.search(line)
            if match:
                names.append(match.group(1))
        if names:
            return names
        # No labelled name: guess from token length.
        for token in re.split(r"\n|\s+", full_text):
            # Skip tokens with digits and the bare label itself.
            if re.search(r"\d", token) or "姓名" in token:
                continue
            # Drop repeated characters while keeping their order.
            unique = "".join(dict.fromkeys(token))
            if 2 <= len(unique) <= 4:
                names += self._CJK_WORD.findall(unique)
        return names

    def __search_email(self):
        """Return the first e-mail-like string among the words, or ``""``."""
        for word in self.__extract_words():
            # pdfplumber words are dicts with a "text" entry; plain strings
            # are accepted as well.
            text = word["text"] if isinstance(word, dict) else word
            if "@" not in text or "." not in text:
                continue
            for candidate in self._EMAIL_CHARS.findall(text):
                if "@" in candidate:
                    return candidate
        return ""

    def __search_phone(self):
        """Return the phone number(s) of the resume, or ``""``.

        Tried in order: a number in the file name, a number in a token that
        contains a phone keyword, and finally every 11-13 digit run in the
        text that starts with ``1`` once a leading ``86`` is removed
        (several distinct hits are joined with commas in text order).
        """
        # Look at the file name first.
        file_name = self._PATH_SEP.split(self.file_dir)[-1]
        in_name = self._PHONE_DIGITS.findall(file_name)
        if in_name and in_name[0].startswith("1"):
            return in_name[0]
        full_text = self.__extract_text()
        # Then look for a keyword such as "電話" or "手機".
        for token in re.split(r"[\n\s]+", full_text):
            if not any(keyword in token for keyword in self._PHONE_KEYWORDS):
                continue
            digits = self._PHONE_DIGITS.findall(
                self._KEYWORD_PUNCT.sub("", token))
            # The label is often separated from the number by a space, in
            # which case this token carries no digits: keep looking.
            if digits:
                return re.sub(r"^(86)", "", digits[0])
        # Finally fall back to plain digit runs of the right length.
        phones = []
        for number in self._PHONE_DIGITS.findall(
                self._TEXT_PUNCT.sub("", full_text)):
            number = re.sub(r"^(86)", "", number)
            if number.startswith("1"):
                phones.append(number)
        # De-duplicate while keeping the order of appearance.
        return ",".join(dict.fromkeys(phones))

    def search(self):
        """Entry point: run all searches and return the result.

        Returns:
            A dict with the keys ``directory`` (parent folder name, ``""``
            when the path has none), ``file_name``, ``phone``, ``user_name``
            (comma-joined candidates) and ``email``.  A search that raises
            is reported with ``print`` and leaves its field empty.
        """
        parts = self._PATH_SEP.split(self.file_dir)
        info = {
            "directory": parts[-2] if len(parts) > 1 else "",
            "file_name": parts[-1],
            "phone": "",
            "user_name": "",
            "email": "",
        }
        finders = (
            ("user_name", lambda: ",".join(self.__search_name())),
            ("email", self.__search_email),
            ("phone", self.__search_phone),
        )
        for key, finder in finders:
            try:
                info[key] = finder()
            except Exception as e:
                # Best effort per field: one failing search must not
                # prevent the others.
                print(e)
        return info
最後,還需要處理批量文件,寫一個迭代查找文件的函數:
def find_files(file_dir):
    """Recursively collect the Word and PDF files below a directory.

    Args:
        file_dir: Root directory to walk.

    Returns:
        A list with the paths of every ``.doc``, ``.docx`` and ``.pdf``
        file found (extension compared case-insensitively).  The list is
        empty when the directory does not exist or holds no such file.
    """
    wanted = (".doc", ".docx", ".pdf")
    file_paths = []
    for root, _, files in os.walk(file_dir):
        for file in files:
            # "~$name.docx" files are the lock files Word keeps next to an
            # open document, not real documents.
            if file.startswith("~$"):
                continue
            # Compare in lower case so "CV.PDF" or "cv.DOCX" are not missed.
            if os.path.splitext(file)[1].lower() in wanted:
                file_paths.append(os.path.join(root, file))
    return file_paths
以及,文件入口:
if __name__ == "__main__":
    # Usage: python extractor.py [resume_root [output.xlsx]]
    # Every sub-directory of the root becomes one sheet of the workbook.
    FILE_DIR = r"data"
    OUT_DIR = r"resume-data.xlsx"
    args = sys.argv
    if len(args) > 1:
        FILE_DIR = args[1]
    if len(args) > 2:
        OUT_DIR = args[2]
    # If the output file exists, append a running number to the original
    # base name (name_0.xlsx, name_1.xlsx, ...) instead of stacking
    # suffixes on top of each other.
    base_name = os.path.splitext(OUT_DIR)[0]
    cnt = 0
    while os.path.isfile(os.path.abspath(OUT_DIR)):
        OUT_DIR = base_name + "_" + str(cnt) + ".xlsx"
        cnt += 1
    root_dir = os.path.abspath(FILE_DIR)
    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(OUT_DIR) as writer:
        for folder in os.listdir(root_dir):
            file_dir = os.path.join(root_dir, folder)
            # Only directories hold resumes; a plain file in the root
            # would just produce an empty sheet.
            if not os.path.isdir(file_dir):
                continue
            paths = find_files(file_dir)
            print("Total {} file(s) in directory {}:".format(len(paths), folder))
            # Collect the rows first and build the frame once.
            rows = []
            for index, file_path in enumerate(paths):
                info = Extractor(file_dir=file_path).search()
                rows.append(info)
                print(index, info["file_name"], info["email"], info["phone"], info["user_name"])
            pd.DataFrame(rows).to_excel(writer, sheet_name=folder)
    print("Save to file ", OUT_DIR)
    print("All done.")
使用說明
首先,你的簡歷文件結構應該如下:
data
- 目錄一
- 一些pdf或者word文件
- 目錄...
- ...
使用時可以直接通過以下方式調用:
python extractor.py data result.xlsx
其中 data代表簡歷存放的根目錄,result.xlsx代表保存文件名,這兩個參數都是可選的,不加則代表使用默認值data 和resume-data.xlsx
完整的代碼下載地址:extractor.py