DFA算法實現過濾多家公司自定義敏感字 原 薦

背景

因爲最近通訊業務有個需求:需要讓多家客戶公司可以各自定義敏感詞,並按照各公司自定義的規則進行過濾,所以選擇了DFA算法來實現,不過和以前傳統的DFA寫法不太一樣。

模式圖

輸入圖片說明

直接上代碼

/**
 * Sensitive-word filter for several companies sharing one trie (DFA).
 *
 * <p>All keywords of all companies are stored in a single character trie rooted at
 * {@link #currentMap}. Every trie node is a map holding two kinds of entries:
 * <ul>
 *   <li>{@code Character -> child node} for the next character of a keyword;</li>
 *   <li>{@code "companyId<N>" -> "0" | "1"} as the per-company marker, where {@code "1"}
 *       means a keyword of that company ends at this node and {@code "0"} means a keyword
 *       of that company merely passes through it.</li>
 * </ul>
 *
 * <p>Lookups only use local variables. The trie nodes themselves are plain
 * {@link HashMap}s, so saving keywords while other threads are matching is not
 * synchronized by this class.
 *
 * @author 姚旭民
 */
public class KeywordFilter {
	/** Marker value: a keyword of the company passes through the node. */
	private static final String PASS_MARK = "0";
	/** Marker value: a keyword of the company ends at the node. */
	private static final String END_MARK = "1";

	/**
	 * Root of the shared trie. Despite the declared key type, the root is keyed by the
	 * first {@link Character} of each keyword; every value is a {@link HashMap} node as
	 * described on the class.
	 */
	public static Map<String, HashMap> currentMap = new ConcurrentHashMap<String, HashMap>();

	/**
	 * Former traversal cursor. No longer read or written by this class (traversal state is
	 * kept in locals); retained only so existing references keep compiling.
	 */
	@Deprecated
	public static Map nowhash = null;

	/**
	 * Former scratch reference to the current child node. No longer read or written by this
	 * class; retained only so existing references keep compiling.
	 */
	@Deprecated
	public static Object wordMap;

	// Utility class: not instantiable.
	private KeywordFilter() {
	}

	/** Builds the marker key stored in trie nodes for the given company. */
	private static String getKey(int companyId) {
		return "companyId" + companyId;
	}

	/** Returns the trie root viewed as a node (it is keyed by Character at runtime). */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	private static Map<Object, Object> root() {
		return (Map) currentMap;
	}

	/** Returns the child node reached from {@code node} via {@code ch}, or null if there is none. */
	@SuppressWarnings("unchecked")
	private static Map<Object, Object> childOf(Map<Object, Object> node, char ch) {
		Object child = node.get(Character.valueOf(ch));
		return child instanceof Map ? (Map<Object, Object>) child : null;
	}

	/**
	 * Finds a keyword of one company that starts exactly at {@code begin}.
	 *
	 * @param txt     text to inspect
	 * @param key     company marker key as produced by {@link #getKey(int)}
	 * @param begin   index of the first character to match
	 * @param longest true to return the longest keyword starting at {@code begin},
	 *                false to stop at the first (shortest) one
	 * @return length of the matched keyword, or 0 when no complete keyword starts there
	 */
	private static int matchLength(String txt, String key, int begin, boolean longest) {
		Map<Object, Object> node = root();
		int matched = 0;
		for (int i = begin; i < txt.length(); i++) {
			node = childOf(node, txt.charAt(i));
			if (node == null) {
				break;
			}
			Object marker = node.get(key);
			if (marker == null) {
				// saveKeywords marks every node on a company's keyword paths, so an
				// unmarked node cannot lead to a keyword of this company.
				break;
			}
			if (END_MARK.equals(marker)) {
				matched = i - begin + 1;
				if (!longest) {
					break;
				}
			}
		}
		return matched;
	}

	/** Joins the values with "," and no trailing separator; empty list gives "". */
	private static String joinWithComma(List<String> values) {
		StringBuilder joined = new StringBuilder();
		for (int i = 0; i < values.size(); i++) {
			if (i > 0) {
				joined.append(',');
			}
			joined.append(values.get(i));
		}
		return joined.toString();
	}

	/**
	 * Removes every keyword of every company.
	 *
	 * Date: 2017-8-22
	 */
	public static void clear() {
		currentMap.clear();
	}

	/**
	 * Registers the sensitive words of one company.
	 *
	 * <p>Each keyword is trimmed; null and empty entries are skipped. Every node on the
	 * keyword's path receives the company's marker ("0"), and the node of the last
	 * character is marked "1". An existing "1" marker is never downgraded.
	 *
	 * Date: 2017-8-20
	 *
	 * @param companyId company owning the keywords
	 * @param keywords  keywords to add; null is treated as an empty list
	 */
	public static void saveKeywords(int companyId, List<String> keywords) {
		if (keywords == null) {
			return;
		}
		String key = getKey(companyId);
		for (String raw : keywords) {
			if (raw == null) {
				continue;
			}
			String keyword = raw.trim();
			if (keyword.length() == 0) {
				continue;
			}
			Map<Object, Object> node = root();
			for (int j = 0; j < keyword.length(); j++) {
				char ch = keyword.charAt(j);
				Map<Object, Object> child = childOf(node, ch);
				if (child == null) {
					child = new HashMap<Object, Object>();
					node.put(Character.valueOf(ch), child);
				}
				// Mark the child itself (not its parent) and never overwrite an
				// existing marker, so a shorter keyword saved earlier stays a keyword.
				if (!child.containsKey(key)) {
					child.put(key, PASS_MARK);
				}
				node = child;
			}
			node.put(key, END_MARK);
		}
	}

	/**
	 * Replaces the company's sensitive words in a text.
	 *
	 * <p>The text is scanned left to right; at each position the longest keyword of the
	 * company starting there is preferred. Every matched occurrence is replaced by a
	 * single {@code '*'}, whatever its length. Keywords are matched literally.
	 *
	 * Date: 2017-8-20
	 *
	 * @param companyId company whose keywords apply
	 * @param txt       text to filter; null is treated as the empty string
	 * @return a two-element list: index 0 is the filtered text, index 1 is the distinct
	 *         matched keywords joined by "," in order of first occurrence ("" if none)
	 */
	public static List<String> repword(int companyId, String txt) {
		String text = txt == null ? "" : txt;
		String key = getKey(companyId);
		StringBuilder filtered = new StringBuilder(text.length());
		List<String> found = new ArrayList<String>();
		int i = 0;
		while (i < text.length()) {
			int len = matchLength(text, key, i, true);
			if (len > 0) {
				String keyword = text.substring(i, i + len);
				if (!found.contains(keyword)) {
					found.add(keyword);
				}
				filtered.append('*');
				i += len;
			} else {
				// No keyword starts here; the next position is tried from the trie root,
				// so a failed partial match never hides a keyword starting inside it.
				filtered.append(text.charAt(i));
				i++;
			}
		}
		List<String> result = new ArrayList<String>();
		result.add(filtered.toString());
		result.add(joinWithComma(found));
		return result;
	}

	/**
	 * Returns the length of the shortest keyword of the company that starts at
	 * {@code begin}, or 0 when no complete keyword starts there.
	 *
	 * Date: 2017-8-20. Deliberately private.
	 */
	private static int checkKeyWords(String txt, int companyId, int begin) {
		return matchLength(txt, getKey(companyId), begin, false);
	}

	/**
	 * Lists the company's sensitive words contained in a text.
	 *
	 * <p>The text is scanned left to right; at each position the shortest keyword starting
	 * there is taken and scanning resumes after it. Each distinct keyword is reported once,
	 * in order of first occurrence.
	 *
	 * Date: 2017-8-20
	 *
	 * @param txt       text to inspect; may be null
	 * @param companyId company whose keywords apply
	 * @return the keywords joined by ",", or null when the text is null or contains none
	 */
	public static String getTxtKeyWords(String txt, int companyId) {
		if (txt == null) {
			return null;
		}
		List<String> found = new ArrayList<String>();
		int i = 0;
		while (i < txt.length()) {
			int len = checkKeyWords(txt, companyId, i);
			if (len > 0) {
				String keyword = txt.substring(i, i + len);
				if (!found.contains(keyword)) {
					found.add(keyword);
				}
				i += len;
			} else {
				i++;
			}
		}
		return found.isEmpty() ? null : joinWithComma(found);
	}

	/**
	 * Tells whether a text contains at least one sensitive word of the company.
	 *
	 * <p>Static because the class cannot be instantiated.
	 *
	 * Date: 2017-8-20
	 *
	 * @param txt       text to inspect; null yields false
	 * @param companyId company whose keywords apply
	 * @return true if any keyword of the company occurs in the text
	 */
	public static boolean isKeyWords(String txt, int companyId) {
		if (txt == null) {
			return false;
		}
		for (int i = 0; i < txt.length(); i++) {
			if (checkKeyWords(txt, companyId, i) > 0) {
				return true;
			}
		}
		return false;
	}

	/** Small demonstration: registers a few keywords for company 1 and filters a sample text. */
	public static void main(String[] arg) {
		List<String> keywords = new ArrayList<String>();
		keywords.add("傻×");
		keywords.add("漢奸");
		keywords.add("草");
		keywords.add("草泥馬");
		KeywordFilter.saveKeywords(1, keywords);
		String txt = "是傻×漢奸傻A傻B傻C傻D漢奸傻×草泥馬";
		List<String> list = repword(1, txt);
		System.out.println("文中包含的敏感字爲:" + list.get(1));
		System.out.println("原文:" + txt);
		System.out.println("敏感字過濾後:" + list.get(0));
	}
}


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章