解析Office文件文本内容,扫描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)
实现思路:将解析到的文本内容利用正则表达式去匹配
加入主要的依赖(以下为POI依赖;PDF解析部分使用了com.itextpdf包下的类,还需另外引入iText 5的依赖)
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
Excel扫描文件是否为涉敏文件
/**
 * Dispatches an Excel scan to the parser that matches the file extension.
 *
 * @param fileType   file extension without the dot; compared case-insensitively
 * @param filePath   path of the workbook to scan
 * @param list       sensitive-data rules to test against the workbook text
 * @param ployEntity scan policy, handed through to the format-specific scanner
 * @return the rules reported by the format-specific scanner, or an empty list when
 *         {@code fileType} is neither {@code xls} nor {@code xlsx}
 */
private static List<SensitiveEntity> excelScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // scanFile routes on a lower-cased extension, so "XLS"/"XLSX" reach this method too;
    // an exact-case comparison would silently report such files as clean.
    if ("xls".equalsIgnoreCase(fileType)) {
        return excelScanXlsFile(filePath, list, ployEntity);
    }
    if ("xlsx".equalsIgnoreCase(fileType)) {
        return excelScanXlsxFile(filePath, list, ployEntity);
    }
    return new ArrayList<>();
}
/**
 * Scans a legacy Excel workbook (.xls) and reports which sensitive-data rules match.
 *
 * <p>Every sheet is visited. Within a sheet the row window comes from
 * {@code calculationDiscoveryStrategy}, which is given the sheet's row count
 * ({@code getLastRowNum() + 1}); the resulting window is treated as the half-open
 * range {@code [start, end)}, the same convention the Word, TXT and PPT scanners use.
 * Each rule is matched against the unmodified cell text.
 *
 * @param filePath   path of the .xls file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the row window
 * @return the rules that matched at least one cell, in first-hit order; if reading or
 *         matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> excelScanXlsFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the workbook even when scanning fails midway.
    try (FileInputStream inputStream = new FileInputStream(new File(filePath));
         HSSFWorkbook sheets = new HSSFWorkbook(inputStream)) {
        for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
            // getLastRowNum() is a zero-based index, so the number of rows is one more.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum() + 1, ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                HSSFRow row = sheetAt.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // Iterating the row visits every defined cell. Bounding a column index by
                // getPhysicalNumberOfCells() would miss cells that sit after an empty column.
                for (Cell cell : row) {
                    String cellText = getCellValString(cell);
                    logger.info("表格文本内容:{}",cellText);
                    for (SensitiveEntity sensitiveEntity : list) {
                        // One matcher per rule, advanced with find(). Re-matching after
                        // String.replaceFirst(match, mask) would interpret the match as a
                        // regex and can loop forever (metacharacters, empty matches, or a
                        // rule that also matches the mask).
                        Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                        boolean hit = false;
                        while (matcher.find()) {
                            hit = true;
                            // Log the sensitive fragment that was found.
                            logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                        }
                        // Record each rule only once.
                        if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                            matchedSensitive.add(sensitiveEntity);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
/**
 * Scans an OOXML Excel workbook (.xlsx) and reports which sensitive-data rules match.
 *
 * <p>Every sheet is visited. Within a sheet the row window comes from
 * {@code calculationDiscoveryStrategy}, which is given the sheet's row count
 * ({@code getLastRowNum() + 1}); the resulting window is treated as the half-open
 * range {@code [start, end)}, the same convention the Word, TXT and PPT scanners use.
 * Each rule is matched against the unmodified cell text.
 *
 * @param filePath   path of the .xlsx file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the row window
 * @return the rules that matched at least one cell, in first-hit order; if reading or
 *         matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> excelScanXlsxFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the workbook even when scanning fails midway.
    try (FileInputStream inputStream = new FileInputStream(new File(filePath));
         XSSFWorkbook sheets = new XSSFWorkbook(inputStream)) {
        for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
            XSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
            // getLastRowNum() is a zero-based index, so the number of rows is one more.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum() + 1, ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                XSSFRow row = sheetAt.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // Iterating the row visits every defined cell. Bounding a column index by
                // getPhysicalNumberOfCells() would miss cells that sit after an empty column.
                for (Cell cell : row) {
                    String cellText = getCellValString(cell);
                    logger.info("表格文本内容:{}",cellText);
                    for (SensitiveEntity sensitiveEntity : list) {
                        // One matcher per rule, advanced with find(). Re-matching after
                        // String.replaceFirst(match, mask) would interpret the match as a
                        // regex and can loop forever (metacharacters, empty matches, or a
                        // rule that also matches the mask).
                        Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                        boolean hit = false;
                        while (matcher.find()) {
                            hit = true;
                            // Log the sensitive fragment that was found.
                            logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                        }
                        // Record each rule only once.
                        if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                            matchedSensitive.add(sensitiveEntity);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
Word扫描文件是否为涉敏文件
/**
 * Dispatches a Word scan to the parser that matches the file extension.
 *
 * @param fileType   file extension without the dot; compared case-insensitively
 * @param filePath   path of the document to scan
 * @param list       sensitive-data rules to test against the document text
 * @param ployEntity scan policy, handed through to the format-specific scanner
 * @return the rules reported by the format-specific scanner, or an empty list when
 *         {@code fileType} is neither {@code doc} nor {@code docx}
 */
private static List<SensitiveEntity> wordScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // scanFile routes on a lower-cased extension, so "DOC"/"DOCX" reach this method too;
    // an exact-case comparison would silently report such files as clean.
    if ("doc".equalsIgnoreCase(fileType)) {
        return wordScanDocFile(filePath, list, ployEntity);
    }
    if ("docx".equalsIgnoreCase(fileType)) {
        return wordScanDocxFile(filePath, list, ployEntity);
    }
    return new ArrayList<>();
}
/**
 * Scans a legacy Word document (.doc) and reports which sensitive-data rules match.
 *
 * <p>The paragraphs of the document range are treated as lines; the window
 * {@code [start, end)} comes from {@code calculationDiscoveryStrategy}. Each rule is
 * matched against the unmodified paragraph text.
 *
 * @param filePath   path of the .doc file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the paragraph window
 * @return the rules that matched at least one paragraph, in first-hit order; if reading
 *         or matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> wordScanDocFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the document even when scanning fails midway.
    try (FileInputStream inputStream = new FileInputStream(new File(filePath));
         HWPFDocument doc = new HWPFDocument(inputStream)) {
        Range range = doc.getRange();
        int rowNum = range.numParagraphs();
        // Work out which paragraphs the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, rowNum, ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            String text = range.getParagraph(rowIndex).text();
            logger.info("文本内容:{}", text);
            for (SensitiveEntity sensitiveEntity : list) {
                // One matcher per rule, advanced with find(). Re-matching after
                // String.replaceFirst(match, mask) would interpret the match as a regex
                // and can loop forever (metacharacters, empty matches, or a rule that
                // also matches the mask).
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), text);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    // Log the sensitive fragment that was found.
                    logger.info("规则:{} 涉敏信息:{}", sensitiveEntity.getRules(), matcher.group(0));
                }
                // Record each rule only once.
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
/**
 * Scans an OOXML Word document (.docx) and reports which sensitive-data rules match.
 *
 * <p>The list returned by {@code XWPFDocument.getParagraphs()} is treated as the lines of
 * the file; the window {@code [start, end)} comes from {@code calculationDiscoveryStrategy}.
 * Each rule is matched against the unmodified paragraph text.
 *
 * @param filePath   path of the .docx file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the paragraph window
 * @return the rules that matched at least one paragraph, in first-hit order; if reading
 *         or matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> wordScanDocxFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources releases the stream and the document even when scanning fails midway.
    try (FileInputStream inputStream = new FileInputStream(new File(filePath));
         XWPFDocument doc = new XWPFDocument(inputStream)) {
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        // Work out which paragraphs the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, paragraphs.size(), ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            // Extract the paragraph text once and reuse it for every rule.
            String runText = paragraphs.get(rowIndex).getParagraphText();
            logger.info("runs文本内容:{}",runText);
            for (SensitiveEntity sensitiveEntity : list) {
                // One matcher per rule, advanced with find(). Re-matching after
                // String.replaceFirst(match, mask) would interpret the match as a regex
                // and can loop forever (metacharacters, empty matches, or a rule that
                // also matches the mask).
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), runText);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    // Log the sensitive fragment that was found.
                    logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                // Record each rule only once.
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
TXT扫描文件是否为涉敏文件
/**
 * Scans a plain-text file line by line and reports which sensitive-data rules match.
 *
 * <p>All lines are read first so the total count can be handed to
 * {@code calculationDiscoveryStrategy}; only the lines in the resulting window
 * {@code [start, end)} are matched. Each rule is matched against the unmodified line.
 *
 * @param filePath   path of the text file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the line window
 * @return the rules that matched at least one line, in first-hit order; if reading or
 *         matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> txtScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    List<String> listStr = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    // try-with-resources closes the reader on every path.
    // NOTE(review): FileReader decodes with the JVM default charset - confirm that this
    // matches the encoding of the files being scanned.
    try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filePath))) {
        String lineStr;
        while ((lineStr = bufferedReader.readLine()) != null) {
            listStr.add(lineStr);
        }
        // Work out which lines the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, listStr.size(), ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            lineStr = listStr.get(rowIndex);
            logger.info("文本内容:{}",lineStr);
            for (SensitiveEntity sensitiveEntity : list) {
                // One matcher per rule, advanced with find(). Re-matching after
                // String.replaceFirst(match, mask) would interpret the match as a regex
                // and can loop forever (metacharacters, empty matches, or a rule that
                // also matches the mask).
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    // Log the sensitive fragment that was found.
                    logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                // Record each rule only once.
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // The trailing throwable keeps the stack trace in the log.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
PPT扫描文件是否为涉敏文件
/**
 * Scans a PowerPoint file (.ppt or .pptx) and reports which sensitive-data rules match.
 *
 * <p>The text returned by the POI extractor is split on {@code "\n"} and the pieces are
 * treated as lines; the window {@code [start, end)} comes from
 * {@code calculationDiscoveryStrategy}. Each rule is matched against the unmodified line.
 * For any other {@code fileType} there is no text, so nothing can match.
 *
 * @param fileType   file extension without the dot; compared case-insensitively
 * @param filePath   path of the presentation
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the line window
 * @return the rules that matched at least one line, in first-hit order; if reading or
 *         matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> pptScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    try {
        String[] texts = new String[]{};
        // scanFile routes on a lower-cased extension, so compare without regard to case.
        if ("ppt".equalsIgnoreCase(fileType)) {
            logger.info("扫描文件类型为PPT");
            // try-with-resources closes the stream and the extractor even if getText() throws.
            try (FileInputStream inputStream = new FileInputStream(new File(filePath));
                 PowerPointExtractor extractor = new PowerPointExtractor(inputStream)) {
                texts = extractor.getText().split("\n");
            }
        } else if ("pptx".equalsIgnoreCase(fileType)) {
            logger.info("扫描文件类型为PPTX");
            try (XSLFPowerPointExtractor xslfExtractor = new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath))) {
                texts = xslfExtractor.getText().split("\n");
            }
        }
        // Work out which lines the policy asks for.
        calculationDiscoveryStrategy(mapDiscoveryStrategy, texts.length, ployEntity);
        int start = mapDiscoveryStrategy.get("start");
        int end = mapDiscoveryStrategy.get("end");
        logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
        for (int rowIndex = start; rowIndex < end; rowIndex++) {
            String lineStr = texts[rowIndex];
            logger.info("文本内容:{}",lineStr);
            for (SensitiveEntity sensitiveEntity : list) {
                // One matcher per rule, advanced with find(). Re-matching after
                // String.replaceFirst(match, mask) would interpret the match as a regex
                // and can loop forever (metacharacters, empty matches, or a rule that
                // also matches the mask).
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    // Log the sensitive fragment that was found.
                    logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                // Record each rule only once.
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Logged through SLF4J with the throwable instead of printStackTrace().
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    }
    return matchedSensitive;
}
PDF扫描文件是否为涉敏文件
/**
 * Scans a PDF page by page and reports which sensitive-data rules match.
 *
 * <p>The scan policy is expressed in lines, so every page is provisionally counted as
 * 30 lines. The line window {@code [start, end)} from
 * {@code calculationDiscoveryStrategy} is converted to the 1-based pages that contain
 * those lines. Each rule is matched against the unmodified page text.
 *
 * @param filePath   path of the PDF file
 * @param list       sensitive-data rules; {@code getRules()} supplies the regular expression
 * @param ployEntity scan policy used to compute the line window
 * @return the rules that matched at least one page, in first-hit order; if reading or
 *         matching throws, the rules collected so far are returned
 */
private static List<SensitiveEntity> pdfScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
    logger.info("==============pdfScanFile==========");
    // Provisional number of text lines assumed per page.
    final int linesPerPage = 30;
    // Rules that have matched at least once.
    List<SensitiveEntity> matchedSensitive = new ArrayList<>();
    Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
    com.itextpdf.text.pdf.PdfReader pdfReader = null;
    try {
        // Open and parse the PDF.
        pdfReader = new com.itextpdf.text.pdf.PdfReader(filePath);
        com.itextpdf.text.pdf.parser.PdfReaderContentParser pdfReaderContentParser = new com.itextpdf.text.pdf.parser.PdfReaderContentParser(pdfReader);
        int pageCount = pdfReader.getNumberOfPages();
        calculationDiscoveryStrategy(mapDiscoveryStrategy, pageCount * linesPerPage, ployEntity);
        int startLine = mapDiscoveryStrategy.get("start");
        int endLine = mapDiscoveryStrategy.get("end");
        // Line offsets are 0-based and page numbers are 1-based: line k lies on page
        // k / linesPerPage + 1, and the last line scanned is endLine - 1.
        int start = Math.max(1, startLine / linesPerPage + 1);
        int end = endLine > startLine ? Math.min(pageCount, (endLine - 1) / linesPerPage + 1) : 0;
        logger.info("扫描文件从{}页开始扫描到{}页结束",start,end);
        for (int i = start; i <= end; i++) {
            com.itextpdf.text.pdf.parser.TextExtractionStrategy simpleTextExtractionStrategy = pdfReaderContentParser.processContent(i, new com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy());
            String resultantText = simpleTextExtractionStrategy.getResultantText();
            logger.info("PDF每页文本内容:{}",resultantText);
            for (SensitiveEntity sensitiveEntity : list) {
                // One matcher per rule, advanced with find(). Re-matching after
                // String.replaceFirst(match, mask) would interpret the match as a regex
                // and can loop forever (metacharacters, empty matches, or a rule that
                // also matches the mask).
                Matcher matcher = matcherTxt(sensitiveEntity.getRules(), resultantText);
                boolean hit = false;
                while (matcher.find()) {
                    hit = true;
                    // Log the sensitive fragment that was found.
                    logger.info("规则:{} 涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                }
                // Record each rule only once.
                if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                    matchedSensitive.add(sensitiveEntity);
                }
            }
        }
    } catch (Exception e) {
        // Same handling as the other scanners: keep what was found and log with the stack trace.
        logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
    } finally {
        // Release the reader whether or not scanning succeeded.
        if (pdfReader != null) {
            pdfReader.close();
        }
    }
    return matchedSensitive;
}
Main测试及补充方法
package com.zxl.demo.utiles;
import com.zxl.demo.entity.PloyEntity;
import com.zxl.demo.entity.SensitiveEntity;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @Describe: 扫描脱敏工具类
* @Author: zml
* @Date: 2020-4-27 11:25:30
*/
public class OfficeScanDesensitizationUtils {
/** Shared SLF4J logger for this utility class; final because it is never reassigned. */
private static final Logger logger = LoggerFactory.getLogger(OfficeScanDesensitizationUtils.class);
/**
 * Demo entry point: builds two sample rules (a fixed phone number and an e-mail pattern),
 * selects the "first 100 lines" policy, scans one hard-coded .docx file and logs the verdict.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Rule 1: a literal phone number.
    SensitiveEntity phoneRule = new SensitiveEntity();
    phoneRule.setRulename("策略1");
    phoneRule.setRules("17711131114");
    phoneRule.setNode("扫描包含17711131114手机号的文件");

    // Rule 2: anything shaped like an e-mail address.
    SensitiveEntity emailRule = new SensitiveEntity();
    emailRule.setRulename("策略2");
    emailRule.setRules("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*");
    emailRule.setNode("扫描Email地址");

    List<SensitiveEntity> rules = Arrays.asList(phoneRule, emailRule);

    // Scan policy: only the first 100 lines.
    PloyEntity policy = new PloyEntity();
    policy.setTop100(true);

    // Run the scan against the sample document.
    List<SensitiveEntity> hits = scanFile("docx", "D:\\liang\\office扫描\\word-docx扫描.docx", rules, policy);

    if (hits.isEmpty()) {
        logger.info("不是涉敏文件");
        return;
    }
    logger.info("该文件为涉敏文件");
    for (SensitiveEntity hit : hits) {
        logger.info("涉敏策略为:{} 描述:{}",hit.getRulename(),hit.getNode());
    }
}
/**
 * Scans a file for sensitive content, choosing the parser from the file type.
 *
 * <p>The type is lower-cased once and routed by substring: {@code doc}, {@code xls},
 * {@code txt}, {@code ppt}, {@code pdf}, in that order. The lower-cased value is what the
 * format scanners receive, so upper-case extensions are handled the same as lower-case
 * ones. An unrecognised or {@code null} type yields an empty list.
 *
 * @param fileType   file extension without the dot, in any letter case
 * @param filePath   path of the file to scan
 * @param list       sensitive-data rules to test against the file text
 * @param ployEntity scan policy (which part of the file to scan)
 * @return the rules that matched; empty when nothing matched, the type is unsupported,
 *         or the scan failed
 */
public static List<SensitiveEntity> scanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity){
    List<SensitiveEntity> sensitiveEntities = new ArrayList<>();
    if (fileType == null) {
        // Checked explicitly instead of relying on a caught NullPointerException.
        logger.error("扫描文件是否为涉敏文件异常:{}", "fileType is null");
        return sensitiveEntities;
    }
    // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
    String normalizedType = fileType.toLowerCase(Locale.ROOT);
    try {
        if (normalizedType.contains("doc")) {
            sensitiveEntities = wordScanFile(normalizedType, filePath, list, ployEntity);
        } else if (normalizedType.contains("xls")) {
            sensitiveEntities = excelScanFile(normalizedType, filePath, list, ployEntity);
        } else if (normalizedType.contains("txt")) {
            sensitiveEntities = txtScanFile(filePath, list, ployEntity);
        } else if (normalizedType.contains("ppt")) {
            sensitiveEntities = pptScanFile(normalizedType, filePath, list, ployEntity);
        } else if (normalizedType.contains("pdf")) {
            logger.info("扫描PDF类型文件");
            sensitiveEntities = pdfScanFile(filePath, list, ployEntity);
        }
    } catch (Exception e) {
        // Logged through SLF4J with the throwable instead of printStackTrace().
        logger.error("扫描文件是否为涉敏文件异常:{}",e.getMessage(),e);
    }
    return sensitiveEntities;
}
/**
 * Computes the scan window for a document of {@code size} lines and stores it in
 * {@code mapDiscoveryStrategy} under the keys {@code "start"} and {@code "end"}.
 *
 * <p>Policy flags are checked in this order; the first one set wins:
 * <ol>
 *   <li>{@code isAll()} - the whole document, {@code 0 .. size}</li>
 *   <li>{@code isCustomize()} - {@code getStart() .. getEnd()}</li>
 *   <li>{@code isTop100()} - the first 100 lines</li>
 *   <li>{@code isLast100()} - the last 100 lines</li>
 * </ol>
 * Both bounds are clamped so that {@code 0 <= start <= end <= size}. Both keys are always
 * written, so callers can unbox them safely.
 *
 * <p>NOTE(review): when no flag is set the whole document is scanned. Previously nothing
 * was written in that case and callers failed on unboxing - confirm that a full scan is
 * the desired fallback.
 *
 * @param mapDiscoveryStrategy output map that receives {@code "start"} and {@code "end"}
 * @param size                 number of lines (rows, paragraphs, ...) in the document
 * @param ployEntity           scan policy
 */
private static void calculationDiscoveryStrategy(Map<String, Integer> mapDiscoveryStrategy, int size, PloyEntity ployEntity) {
    int total = Math.max(size, 0);
    // Default window: the whole document.
    int start = 0;
    int end = total;
    if (!ployEntity.isAll()) {
        if (ployEntity.isCustomize()) {
            // Caller-defined window.
            start = ployEntity.getStart();
            end = ployEntity.getEnd();
        } else if (ployEntity.isTop100()) {
            // First 100 lines.
            end = 100;
        } else if (ployEntity.isLast100()) {
            // Last 100 lines.
            start = total - 100;
        }
    }
    // Clamp so a short document or an out-of-range custom window cannot index past either end.
    start = Math.min(Math.max(start, 0), total);
    end = Math.min(Math.max(end, start), total);
    mapDiscoveryStrategy.put("start", start);
    mapDiscoveryStrategy.put("end", end);
}
/**
 * Returns the content of a spreadsheet cell as text.
 *
 * <p>String, boolean and numeric cells are converted directly. For a formula cell the
 * cached result of the formula is converted, so computed values are scanned as well.
 * Every other cell type, and a {@code null} cell, yields an empty string.
 *
 * @param cell the cell to read; may be {@code null}
 * @return the cell content as text, never {@code null}
 */
public static String getCellValString(Cell cell){
    if (cell == null) {
        return "";
    }
    CellType cellType = cell.getCellType();
    if (CellType.FORMULA.equals(cellType)) {
        // Use the type of the cached result; the value getters below return that result.
        cellType = cell.getCachedFormulaResultType();
    }
    if (CellType.STRING.equals(cellType)) {
        return cell.getStringCellValue();
    }
    if (CellType.BOOLEAN.equals(cellType)) {
        return String.valueOf(cell.getBooleanCellValue());
    }
    if (CellType.NUMERIC.equals(cellType)) {
        return NumberToTextConverter.toText(cell.getNumericCellValue());
    }
    return "";
}
/**
 * Compiled patterns keyed by rule text. The scanners call {@code matcherTxt} once per
 * rule for every line, so caching avoids recompiling the same expression each time.
 */
private static final Map<String, Pattern> PATTERN_CACHE = new java.util.concurrent.ConcurrentHashMap<>();

/**
 * Creates a case-insensitive matcher for {@code regex} over {@code str}.
 *
 * @param regex the regular expression of a rule
 * @param str   the text to search
 * @return a fresh matcher positioned at the start of {@code str}
 * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid expression
 */
private static Matcher matcherTxt(String regex,String str) {
    // Nothing is cached when compilation throws.
    Pattern pattern = PATTERN_CACHE.computeIfAbsent(regex, rule -> Pattern.compile(rule, Pattern.CASE_INSENSITIVE));
    return pattern.matcher(str);
}