解析Office文件文本内容,扫描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)

解析Office文件文本内容,扫描文件是否涉敏(doc、docx、xls、xlsx、ppt、pptx、pdf、txt)

实现思路:将解析到的文本内容利用正则表达式去匹配

加入主要的依赖(注意:下文 PDF 扫描部分使用了 iText 5 的 com.itextpdf 包,需另外引入对应依赖)

		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>4.1.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>4.1.2</version>
		</dependency>

Excel扫描文件是否为涉敏文件

    /**
     * Dispatches an Excel scan to the parser that matches the file extension.
     *
     * @param fileType   file extension ("xls" or "xlsx"); compared case-insensitively so that
     *                   upper-case extensions are not silently skipped
     * @param filePath   path of the workbook to scan
     * @param list       rules whose regular expressions are tested against the cell text
     * @param ployEntity discovery strategy used to pick the row range
     * @return the rules that matched, or an empty list for any other extension
     */
    private static List<SensitiveEntity> excelScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        if ("xls".equalsIgnoreCase(fileType)) {
            return excelScanXlsFile(filePath, list, ployEntity);
        }
        if ("xlsx".equalsIgnoreCase(fileType)) {
            return excelScanXlsxFile(filePath, list, ployEntity);
        }
        return new ArrayList<>();
    }

	/**
     * Scans a legacy Excel workbook (.xls) and reports which rules match any cell text.
     *
     * <p>Every sheet is visited; the row range per sheet comes from
     * {@code calculationDiscoveryStrategy}. Each match is logged.
     * NOTE(review): cell text and matched values are written to the log at INFO level —
     * confirm that is acceptable for sensitive data.
     *
     * @param filePath   path of the .xls file
     * @param list       rules whose regular expressions are tested against each cell
     * @param ployEntity discovery strategy used to pick the row range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> excelScanXlsFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
        // Rules that matched at least one cell.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes the stream and the workbook even when scanning fails.
        try (FileInputStream in = new FileInputStream(new File(filePath));
             HSSFWorkbook sheets = new HSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
                // Compute the scan range for this sheet.
                calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum(), ployEntity);
                int start = mapDiscoveryStrategy.get("start");
                int end = mapDiscoveryStrategy.get("end");
                logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
                // The size passed above is the last row index, so "end" is inclusive here.
                for (int rowIndex = start; rowIndex <= end; rowIndex++) {
                    HSSFRow row = sheetAt.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() bounds the column index; getPhysicalNumberOfCells() only
                    // counts defined cells and would skip trailing cells in rows with gaps.
                    int lastCell = row.getLastCellNum();
                    for (int cellIndex = 0; cellIndex < lastCell; cellIndex++) {
                        Cell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        String cellText = getCellValString(cell);
                        logger.info("表格文本内容:{}",cellText);
                        for (SensitiveEntity sensitiveEntity : list) {
                            // One matcher walked with find(): always terminates, unlike
                            // re-matching after replaceFirst(), which treats the match as a regex.
                            Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                            boolean hit = false;
                            while (matcher.find()) {
                                hit = true;
                                logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                            }
                            // Record each rule once.
                            if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                                matchedSensitive.add(sensitiveEntity);
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

    /**
     * Scans an OOXML Excel workbook (.xlsx) and reports which rules match any cell text.
     *
     * <p>Every sheet is visited; the row range per sheet comes from
     * {@code calculationDiscoveryStrategy}. Each match is logged.
     * NOTE(review): cell text and matched values are written to the log at INFO level —
     * confirm that is acceptable for sensitive data.
     *
     * @param filePath   path of the .xlsx file
     * @param list       rules whose regular expressions are tested against each cell
     * @param ployEntity discovery strategy used to pick the row range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> excelScanXlsxFile(String filePath, List<SensitiveEntity> list,PloyEntity ployEntity){
        // Rules that matched at least one cell.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes the stream and the workbook even when scanning fails.
        try (FileInputStream in = new FileInputStream(new File(filePath));
             XSSFWorkbook sheets = new XSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < sheets.getNumberOfSheets(); sheetIndex++) {
                XSSFSheet sheetAt = sheets.getSheetAt(sheetIndex);
                // Compute the scan range for this sheet.
                calculationDiscoveryStrategy(mapDiscoveryStrategy, sheetAt.getLastRowNum(), ployEntity);
                int start = mapDiscoveryStrategy.get("start");
                int end = mapDiscoveryStrategy.get("end");
                logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
                // The size passed above is the last row index, so "end" is inclusive here.
                for (int rowIndex = start; rowIndex <= end; rowIndex++) {
                    XSSFRow row = sheetAt.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() bounds the column index; getPhysicalNumberOfCells() only
                    // counts defined cells and would skip trailing cells in rows with gaps.
                    int lastCell = row.getLastCellNum();
                    for (int cellIndex = 0; cellIndex < lastCell; cellIndex++) {
                        XSSFCell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        String cellText = getCellValString(cell);
                        logger.info("表格文本内容:{}",cellText);
                        for (SensitiveEntity sensitiveEntity : list) {
                            // One matcher walked with find(): always terminates, unlike
                            // re-matching after replaceFirst(), which treats the match as a regex.
                            Matcher matcher = matcherTxt(sensitiveEntity.getRules(), cellText);
                            boolean hit = false;
                            while (matcher.find()) {
                                hit = true;
                                logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                            }
                            // Record each rule once.
                            if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                                matchedSensitive.add(sensitiveEntity);
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

Word扫描文件是否为涉敏文件

	/**
     * Dispatches a Word scan to the parser that matches the file extension.
     *
     * @param fileType   file extension ("doc" or "docx"); compared case-insensitively so that
     *                   upper-case extensions are not silently skipped
     * @param filePath   path of the document to scan
     * @param list       rules whose regular expressions are tested against the paragraph text
     * @param ployEntity discovery strategy used to pick the paragraph range
     * @return the rules that matched, or an empty list for any other extension
     */
    private static List<SensitiveEntity> wordScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        if ("doc".equalsIgnoreCase(fileType)) {
            return wordScanDocFile(filePath, list, ployEntity);
        }
        if ("docx".equalsIgnoreCase(fileType)) {
            return wordScanDocxFile(filePath, list, ployEntity);
        }
        return new ArrayList<>();
    }
	
	/**
     * Scans a legacy Word document (.doc) paragraph by paragraph and reports which rules match.
     *
     * <p>The paragraph range comes from {@code calculationDiscoveryStrategy}, with the end
     * index exclusive. Each match is logged.
     *
     * @param filePath   path of the .doc file
     * @param list       rules whose regular expressions are tested against each paragraph
     * @param ployEntity discovery strategy used to pick the paragraph range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> wordScanDocFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        // Rules that matched at least one paragraph.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes the stream and the document even when scanning fails.
        try (FileInputStream in = new FileInputStream(new File(filePath));
             HWPFDocument doc = new HWPFDocument(in)) {
            Range range = doc.getRange();
            int rowNum = range.numParagraphs();
            // Compute the scan range.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, rowNum, ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                String text = range.getParagraph(rowIndex).text();
                logger.info("文本内容:{}", text);
                for (SensitiveEntity sensitiveEntity : list) {
                    // One matcher walked with find(): always terminates, unlike re-matching
                    // after replaceFirst(), which treats the matched text as a regex.
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), text);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("规则:{}    涉敏信息:{}", sensitiveEntity.getRules(), matcher.group(0));
                    }
                    // Record each rule once.
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

    /**
     * Scans an OOXML Word document (.docx) paragraph by paragraph and reports which rules match.
     *
     * <p>The paragraph range comes from {@code calculationDiscoveryStrategy}, with the end
     * index exclusive. Only the paragraphs returned by {@code getParagraphs()} are examined.
     * Each match is logged.
     *
     * @param filePath   path of the .docx file
     * @param list       rules whose regular expressions are tested against each paragraph
     * @param ployEntity discovery strategy used to pick the paragraph range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> wordScanDocxFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        // Rules that matched at least one paragraph.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes the stream and the document even when scanning fails.
        try (FileInputStream in = new FileInputStream(new File(filePath));
             XWPFDocument doc = new XWPFDocument(in)) {
            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            // Compute the scan range.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, paragraphs.size(), ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                // Extract the paragraph text once instead of once per rule.
                String runText = paragraphs.get(rowIndex).getParagraphText();
                logger.info("runs文本内容:{}",runText);
                for (SensitiveEntity sensitiveEntity : list) {
                    // One matcher walked with find(): always terminates, unlike re-matching
                    // after replaceFirst(), which treats the matched text as a regex.
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), runText);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    // Record each rule once.
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

TXT扫描文件是否为涉敏文件

	/**
     * Scans a plain-text file line by line and reports which rules match.
     *
     * <p>All lines are read into memory first, then the line range chosen by
     * {@code calculationDiscoveryStrategy} is scanned (end index exclusive).
     * The file is decoded by {@link FileReader}; no charset is specified here.
     *
     * @param filePath   path of the text file
     * @param list       rules whose regular expressions are tested against each line
     * @param ployEntity discovery strategy used to pick the line range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> txtScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        // Rules that matched at least one line.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        List<String> listStr = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(filePath))) {
            String lineStr;
            while ((lineStr = bufferedReader.readLine()) != null) {
                listStr.add(lineStr);
            }
            // Compute the scan range.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, listStr.size(), ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                lineStr = listStr.get(rowIndex);
                logger.info("文本内容:{}",lineStr);
                for (SensitiveEntity sensitiveEntity : list) {
                    // One matcher walked with find(): always terminates, unlike re-matching
                    // after replaceFirst(), which treats the matched text as a regex.
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    // Record each rule once.
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

PPT扫描文件是否为涉敏文件

	/**
     * Extracts the text of a PowerPoint file and reports which rules match any line of it.
     *
     * <p>The extracted text is split on {@code "\n"}; the line range comes from
     * {@code calculationDiscoveryStrategy} (end index exclusive). For any type other than
     * "ppt"/"pptx" nothing is extracted and the result is empty.
     *
     * @param fileType   "ppt" or "pptx"; compared case-insensitively
     * @param filePath   path of the presentation
     * @param list       rules whose regular expressions are tested against each line
     * @param ployEntity discovery strategy used to pick the line range
     * @return the rules that matched at least once; on any exception the rules matched so far
     */
    private static List<SensitiveEntity> pptScanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        // Rules that matched at least one line.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        try {
            String[] texts = new String[]{};
            if ("ppt".equalsIgnoreCase(fileType)) {
                logger.info("扫描文件类型为PPT");
                // Close the stream and the extractor even if text extraction throws.
                try (FileInputStream in = new FileInputStream(new File(filePath));
                     PowerPointExtractor extractor = new PowerPointExtractor(in)) {
                    texts = extractor.getText().split("\n");
                }
            } else if ("pptx".equalsIgnoreCase(fileType)) {
                logger.info("扫描文件类型为PPTX");
                try (XSLFPowerPointExtractor xslfExtractor = new XSLFPowerPointExtractor(POIXMLDocument.openPackage(filePath))) {
                    texts = xslfExtractor.getText().split("\n");
                }
            }
            // Compute the scan range.
            calculationDiscoveryStrategy(mapDiscoveryStrategy, texts.length, ployEntity);
            int start = mapDiscoveryStrategy.get("start");
            int end = mapDiscoveryStrategy.get("end");
            logger.info("扫描文件从{}行开始扫描到{}行结束",start,end);
            for (int rowIndex = start; rowIndex < end; rowIndex++) {
                String lineStr = texts[rowIndex];
                logger.info("文本内容:{}",lineStr);
                for (SensitiveEntity sensitiveEntity : list) {
                    // One matcher walked with find(): always terminates, unlike re-matching
                    // after replaceFirst(), which treats the matched text as a regex.
                    Matcher matcher = matcherTxt(sensitiveEntity.getRules(), lineStr);
                    boolean hit = false;
                    while (matcher.find()) {
                        hit = true;
                        logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                    }
                    // Record each rule once.
                    if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                        matchedSensitive.add(sensitiveEntity);
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: keep what was found so far; the logger records the stack trace.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

PDF扫描文件是否为涉敏文件

	/**
     * Extracts the text of a PDF page by page and reports which rules match.
     *
     * <p>The discovery strategy works in lines, so each page is counted as 30 lines and the
     * resulting line range is divided by 30 to get a page range. Pages are 1-based, so a
     * start of 0 is bumped to page 1; the end page is inclusive.
     *
     * @param filePath   path of the PDF file
     * @param list       rules whose regular expressions are tested against each page's text
     * @param ployEntity discovery strategy used to pick the range
     * @return the rules that matched at least once; on an {@link IOException} the rules
     *         matched so far
     */
    private static List<SensitiveEntity> pdfScanFile(String filePath, List<SensitiveEntity> list, PloyEntity ployEntity) {
        logger.info("==============pdfScanFile==========");
        // Rules that matched at least one page.
        List<SensitiveEntity> matchedSensitive = new ArrayList<>();
        Map<String, Integer> mapDiscoveryStrategy = new HashMap<>();
        try {
            com.itextpdf.text.pdf.PdfReader pdfReader = new com.itextpdf.text.pdf.PdfReader(filePath);
            try {
                com.itextpdf.text.pdf.parser.PdfReaderContentParser pdfReaderContentParser = new com.itextpdf.text.pdf.parser.PdfReaderContentParser(pdfReader);
                // Compute the scan range (a page is treated as 30 lines).
                calculationDiscoveryStrategy(mapDiscoveryStrategy, pdfReader.getNumberOfPages() * 30, ployEntity);
                int start = mapDiscoveryStrategy.get("start") / 30;
                int end = mapDiscoveryStrategy.get("end") / 30;
                logger.info("扫描文件从{}页开始扫描到{}页结束",start,end);
                for (int i = (start == 0 ? 1 : start); i <= end; i++) {
                    com.itextpdf.text.pdf.parser.TextExtractionStrategy simpleTextExtractionStrategy = pdfReaderContentParser.processContent(i, new com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy());
                    String resultantText = simpleTextExtractionStrategy.getResultantText();
                    logger.info("PDF每页文本内容:{}",resultantText);
                    for (SensitiveEntity sensitiveEntity : list) {
                        // One matcher walked with find(): always terminates, unlike re-matching
                        // after replaceFirst(), which treats the matched text as a regex.
                        Matcher matcher = matcherTxt(sensitiveEntity.getRules(), resultantText);
                        boolean hit = false;
                        while (matcher.find()) {
                            hit = true;
                            logger.info("规则:{}    涉敏信息:{}",sensitiveEntity.getRules(),matcher.group(0));
                        }
                        // Record each rule once.
                        if (hit && !matchedSensitive.contains(sensitiveEntity)) {
                            matchedSensitive.add(sensitiveEntity);
                        }
                    }
                }
            } finally {
                // Release the reader on every path, not only after a successful scan.
                pdfReader.close();
            }
        } catch (IOException e) {
            // Best effort: keep what was found so far, but keep the stack trace in the log.
            logger.error("扫描【{}】文件是否为涉敏文件异常:{}",filePath,e.getMessage(),e);
        }
        return matchedSensitive;
    }

Main测试及补充方法

package com.zxl.demo.utiles;

import com.zxl.demo.entity.PloyEntity;
import com.zxl.demo.entity.SensitiveEntity;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @Describe: 扫描脱敏工具类
 * @Author: zml
 * @Date: 2020-4-27 11:25:30
 */
public class OfficeScanDesensitizationUtils {

    // Shared class logger; final because it is never reassigned.
    private static final Logger logger = LoggerFactory.getLogger(OfficeScanDesensitizationUtils.class);
    
    /**
     * Demo entry point: scans one hard-coded .docx file against two sample rules
     * (a literal phone number and an e-mail pattern) using the "top 100" strategy,
     * then logs whether the file is considered sensitive and which rules matched.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String type = "docx";
        final String path = "D:\\liang\\office扫描\\word-docx扫描.docx";

        // Rule 1: a literal phone number.
        SensitiveEntity phoneRule = new SensitiveEntity();
        phoneRule.setRulename("策略1");
        phoneRule.setRules("17711131114");
        phoneRule.setNode("扫描包含17711131114手机号的文件");

        // Rule 2: an e-mail address pattern.
        SensitiveEntity emailRule = new SensitiveEntity();
        emailRule.setRulename("策略2");
        emailRule.setRules("\\w+([-+.]\\w+)*@\\w+([-.]\\w+)*\\.\\w+([-.]\\w+)*");
        emailRule.setNode("扫描Email地址");

        List<SensitiveEntity> rules = Arrays.asList(phoneRule, emailRule);

        // Discovery strategy: only the "top 100" flag is switched on.
        PloyEntity strategy = new PloyEntity();
        strategy.setTop100(true);

        List<SensitiveEntity> hits = scanFile(type, path, rules, strategy);
        if (hits.isEmpty()) {
            logger.info("不是涉敏文件");
            return;
        }
        logger.info("该文件为涉敏文件");
        for (SensitiveEntity hit : hits) {
            logger.info("涉敏策略为:{}  描述:{}", hit.getRulename(), hit.getNode());
        }
    }
    
    /**
     * Scans a file for sensitive content, choosing the parser from the file type.
     *
     * <p>The type is lower-cased once and that normalized value is used both for dispatch
     * and for the format-specific scanners, so "DOCX" and "docx" behave the same.
     * Dispatch is by substring ("doc", "xls", "txt", "ppt", "pdf"), checked in that order.
     *
     * @param fileType   file extension; {@code null} yields an empty result
     * @param filePath   path of the file to scan
     * @param list       rules whose regular expressions are tested against the file text
     * @param ployEntity discovery strategy used to pick the range to scan
     * @return the rules that matched; empty when nothing matched, the type is unsupported,
     *         or scanning failed
     */
    public static List<SensitiveEntity> scanFile(String fileType, String filePath, List<SensitiveEntity> list, PloyEntity ployEntity){
        List<SensitiveEntity> sensitiveEntities = new ArrayList<>();
        if (fileType == null) {
            // Handled explicitly rather than via a caught NullPointerException.
            logger.error("扫描文件是否为涉敏文件异常:{}", "fileType is null");
            return sensitiveEntities;
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String type = fileType.toLowerCase(Locale.ROOT);
        try {
            if (type.contains("doc")) {
                sensitiveEntities = wordScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("xls")) {
                sensitiveEntities = excelScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("txt")) {
                sensitiveEntities = txtScanFile(filePath, list, ployEntity);
            } else if (type.contains("ppt")) {
                sensitiveEntities = pptScanFile(type, filePath, list, ployEntity);
            } else if (type.contains("pdf")) {
                logger.info("扫描PDF类型文件");
                sensitiveEntities = pdfScanFile(filePath, list, ployEntity);
            }
        } catch (Exception e) {
            // The logger records the stack trace; printStackTrace() is not needed.
            logger.error("扫描文件是否为涉敏文件异常:{}", e.getMessage(), e);
        }
        return sensitiveEntities;
    }

 	/**
     * Computes the scan range for the selected discovery strategy and stores it in the map
     * under the keys {@code "start"} and {@code "end"}.
     *
     * <p>Strategies are checked in this order: all, customize, top 100, last 100.
     * If none is selected the whole range is used, so callers always find both keys
     * instead of failing while unboxing a missing entry.
     *
     * @param mapDiscoveryStrategy output map; both keys are overwritten on every call
     * @param size                 number of units available (rows, paragraphs, lines, ...)
     * @param ployEntity           strategy flags and, for "customize", the requested bounds
     */
    private static void calculationDiscoveryStrategy(Map<String, Integer> mapDiscoveryStrategy, int size, PloyEntity ployEntity) {
        int start;
        int end;
        if (ployEntity.isAll()) {
            // Full scan.
            start = 0;
            end = size;
        } else if (ployEntity.isCustomize()) {
            // Custom range: a negative start is raised to 0, the end is capped at size.
            start = Math.max(0, ployEntity.getStart());
            end = Math.min(size, ployEntity.getEnd());
        } else if (ployEntity.isTop100()) {
            // First 100 units, or everything when fewer exist.
            start = 0;
            end = Math.min(size, 100);
        } else if (ployEntity.isLast100()) {
            // Last 100 units, or everything when fewer exist.
            start = Math.max(0, size - 100);
            end = size;
        } else {
            // No strategy selected: fall back to a full scan.
            start = 0;
            end = size;
        }
        mapDiscoveryStrategy.put("start", start);
        mapDiscoveryStrategy.put("end", end);
    }

    /**
     * Converts a cell's value to text.
     *
     * <p>String, boolean and numeric cells are converted. A formula cell is converted
     * using its cached result type, so text produced by formulas is scanned too.
     * Any other cell type (blank, error) yields an empty string.
     *
     * @param cell the cell to read; {@code null} yields an empty string
     * @return the cell value as text, never {@code null} for the handled types
     */
    public static String getCellValString(Cell cell){
        if (cell == null) {
            return "";
        }
        CellType cellType = cell.getCellType();
        if (CellType.FORMULA.equals(cellType)) {
            // Use the type of the last calculated value instead of skipping the cell.
            cellType = cell.getCachedFormulaResultType();
        }
        String val = "";
        if (CellType.STRING.equals(cellType)) {
            val = cell.getStringCellValue();
        } else if (CellType.BOOLEAN.equals(cellType)) {
            val = String.valueOf(cell.getBooleanCellValue());
        } else if (CellType.NUMERIC.equals(cellType)) {
            val = NumberToTextConverter.toText(cell.getNumericCellValue());
        }
        return val;
    }

    /**
     * Builds a case-insensitive matcher for the given regular expression over the given text.
     *
     * @param regex regular expression to compile
     * @param str   text to match against
     * @return a new matcher positioned at the start of {@code str}
     */
    private static Matcher matcherTxt(String regex,String str) {
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(str);
    }
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章