Java 敏感词汇过滤

 

  • 声明变量
    // Mask buffer: filled with repeated copies of the replacement symbol and
    // sliced to produce the text that overwrites a matched sensitive word
    private StringBuilder replaceSymbol;
    // Replacement symbol used to mask sensitive words: *
    private String symbol = "*";
    // Name of the sensitive-word file (looked up as a classpath resource)
    private String fileName;
    // All sensitive words loaded from the file
    private List<String> arrayList ;
  •  初始化敏感词汇

    /**
     * Prepares this filter: rebuilds the replacement mask and reloads the sensitive-word list.
     *
     * <p>The mask ({@code replaceSymbol}) becomes {@code symbol} repeated once per character of
     * {@code str}. The word list ({@code arrayList}) is rebuilt from the classpath resource named
     * by {@code fileName}, decoded as UTF-8 with one word per line. Blank lines and duplicate
     * words are skipped; a blank entry would otherwise make the filter loop forever.
     *
     * <p>An {@link IOException} while reading is reported on standard error and leaves the list
     * with whatever words were read before the failure.
     *
     * @param str text whose length determines the mask size; {@code null} is treated as empty
     * @throws IllegalStateException if the resource named by {@code fileName} is not on the classpath
     */
    public void initSensitiveWord(String str){

        // One replacement symbol per input character, so any matched range can be masked.
        int maskLength = (null == str) ? 0 : str.length();
        replaceSymbol = new StringBuilder();
        for (int i = 0; i < maskLength; i++){
            replaceSymbol.append(symbol);
        }

        arrayList = new ArrayList<>();

        // getResourceAsStream returns null (it does not throw) when the resource is missing;
        // fail with a message that names the file instead of an NPE from the reader constructor.
        java.io.InputStream resource =
                SensitiveWordUtil.class.getClassLoader().getResourceAsStream(fileName);
        if (null == resource) {
            throw new IllegalStateException(
                    "Sensitive word file not found on classpath: " + fileName);
        }

        // try-with-resources closes the reader and the underlying stream on every path.
        try (java.io.InputStream stream = resource;
             BufferedReader bufferedReader =
                     new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {

            for (String txt; (txt = bufferedReader.readLine()) != null; ){
                // Skip blank lines: an empty word matches at every index.
                if (txt.trim().isEmpty()) {
                    continue;
                }
                if (!arrayList.contains(txt)) {
                    arrayList.add(txt);
                }
            }
        } catch (IOException e) {
            // Best effort, as before: keep the words read so far and report the failure.
            e.printStackTrace();
        }
    }
  •  对输入的字符串进行敏感词汇处理
     **
     * @Description: 对输入的敏感词汇进行处理
     * @Date: 2019/5/7 17:13
     **/
    public String filterSensitiveWord(String str){

//        HashMap<Integer,Integer> map = new HashMap<>();
        Map<Integer,Integer> map = new HashMap<>();
        StringBuilder builder = new StringBuilder(str);

        String sensitive ;
        // 遍历所有的敏感词汇
        for (int i = 0; i < arrayList.size(); i++){
            sensitive = arrayList.get(i);

            int startIndex = 0;
            // 查找字符串中是否包含 指定得敏感词汇,若包含返回该词汇首个词的索引值,否则返回 -1;
            for (int start = -1; ( start = builder.indexOf(sensitive,startIndex) ) > -1 ;){
                startIndex = start + sensitive.length();
               Integer mapStart =  map.get(start);
               if (null == mapStart || (mapStart != null && startIndex > mapStart))
                   map.put(start,startIndex);
            }
        }

        // 获取存入的敏感词索引值集合
       Collection<Integer> keys = map.keySet();
        for (Integer startIndex : keys){
            // 结束索引
            Integer endIndex = map.get(startIndex);
            // 把字符串中的关键字替换成*
           builder.replace(startIndex,endIndex,replaceSymbol.substring(startIndex,endIndex));
        }
        map.clear();
    return builder.toString();
    }
  • 调用方法进行测试
  /**
   * Demo entry point: loads the word list, filters one sample sentence and prints the result.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
        final String sample = "你好,毛一鲜,hello,老丁,这是一个错误的六四事件";
        // The sensitive-word file is expected under resource/file
        final SensitiveWordUtil filter = new SensitiveWordUtil("file//CensorWords.txt");
        filter.initSensitiveWord(sample);
        System.out.println(filter.filterSensitiveWord(sample));
    }
  • 测试结果

  • 敏感词汇文件位置

文件下载:https://pan.baidu.com/s/12NpqFyvJiNz98mNAePEpjg 提取码:k3nd

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章