敏感詞過濾的方法有很多,如何能更快速更大範圍的匹配敏感詞?
敏感詞庫小的情況,用正則、字符串匹配即可實現,還可以按照首字母對敏感詞分組以提高效率。
當詞庫比較大時,就需要考慮效率問題了···
找了很多實現方法,感覺不是很適合現在的需求,於是自己嘗試實現了一下,用的也是DFA算法。
1. 創建敏感詞庫
1.1 收集敏感詞
1.2.1 指定字符串分割
// 測試敏感詞庫:
敏感詞|敏感詞彙|敏感詞語|敏感單詞
1.2.2 處理敏感詞彙(DFA算法: 有窮自動機, 通過event和當前的state得到下一個state, 即 event + state = nextstate)
將拆分的字符,整理成樹狀結構
// 詞庫處理後格式(僞代碼, 僅用於展示):
{
item: "R",
isEnd: false,
children:
{
item: "敏",
isEnd: false,
children:
{
item: "感",
isEnd: false,
children:
{
item: "詞",
isEnd: true,
children:
{
"item": "彙",
"isEnd": true,
"children": null
},
{
"item": "語",
"isEnd": true,
"children": null
}
},
{
"item": "單",
"isEnd": false,
"children":
{
"item": "詞",
"isEnd": true,
"children": null
}
}
}
}
}
處理:
/// <summary>
/// One node of the sensitive-word tree (trie). Each node holds a single character;
/// a path from a first-level node down to a node flagged <see cref="isEnd"/> spells one word.
/// </summary>
public class TreeNode
{
    /// <summary>The character stored at this node.</summary>
    public char item;

    /// <summary>True when a word of the library ends at this node.</summary>
    public bool isEnd;

    /// <summary>Nodes for the characters that may follow this one; may be null when there are none.</summary>
    public List<TreeNode> children;
}
/// <summary>
/// Sensitive-word library. Loads a '|'-separated word list from a Unity text resource
/// and arranges the words into a character tree (trie) rooted at <c>treeRoot</c>.
/// </summary>
public class SensitiveWordsLibrary
{
    // DFA (finite automaton): the next state is derived from the current state plus the
    // incoming event, i.e. event + state = nextstate.
    string sensitiveWordsResourcesPath = "SensitiveWordsLibrary/SensitiveWords";

    /// <summary>Root of the word tree; its children are the first characters of all words.</summary>
    private TreeNode treeRoot;

    /// <summary>Loads the word list and builds the word tree.</summary>
    public SensitiveWordsLibrary()
    {
        // Raw word list
        string[] sensitiveWords = LoadSensitiveWords();
        // Build the word tree
        treeRoot = CreateTree(sensitiveWords);
    }

    /// <summary>
    /// Loads the word list from the text resource at <c>sensitiveWordsResourcesPath</c>.
    /// </summary>
    /// <returns>The text split on '|', or null when the resource is missing or empty.</returns>
    private string[] LoadSensitiveWords()
    {
        // TODO: validate the characters of the source file
        TextAsset textAsset = Resources.Load<TextAsset>(sensitiveWordsResourcesPath);
        if (textAsset == null)
            return null;

        string wordStr = textAsset.text;
        if (string.IsNullOrEmpty(wordStr))
            return null;

        return wordStr.Split('|');
    }

    #region --- Create Tree ---
    /// <summary>
    /// Builds the word tree from the given words.
    /// </summary>
    /// <param name="words">Word list; may be null or empty.</param>
    /// <returns>
    /// A root node with item 'R'. Its <c>children</c> list is never null, so callers can
    /// search it without a null check even when the library is empty.
    /// </returns>
    private TreeNode CreateTree(string[] words)
    {
        if (words == null || words.Length == 0)
        {
            Debug.Log("無敏感詞庫");
            // Empty (not null) child list: lookups on the root must not throw.
            return new TreeNode() { item = 'R', isEnd = true, children = new List<TreeNode>() };
        }

        List<TreeNode> treeList = new List<TreeNode>();
        foreach (string rawWord in words)
        {
            if (string.IsNullOrEmpty(rawWord))
                continue;

            // Strip surrounding whitespace/newlines left over from the resource file and
            // lower-case the entry, since checked text is lower-cased before comparison.
            string word = rawWord.Trim().ToLowerInvariant();
            if (word.Length == 0)
                continue;

            char cha = word[0];
            TreeNode treeNode = treeList.Find(e => e.item == cha);
            if (treeNode == null)
            {
                TreeNode newTreeNode = CreateSingleTree(word);
                if (newTreeNode != null)
                    treeList.Add(newTreeNode);
            }
            else
            {
                AddChildTree(treeNode, word);
            }
        }
        return new TreeNode() { item = 'R', isEnd = false, children = treeList };
    }

    /// <summary>
    /// Creates a standalone chain of nodes spelling <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Word to convert.</param>
    /// <returns>The node for the first character, or null when the word is null or empty.</returns>
    private TreeNode CreateSingleTree(string word)
    {
        if (string.IsNullOrEmpty(word))
            return null;

        TreeNode root = new TreeNode() { item = word[0], isEnd = false, children = null };
        // The remaining characters are appended exactly like a branch on an existing tree.
        AddChildTree(root, word);
        return root;
    }

    /// <summary>
    /// Merges <paramref name="word"/> into the tree whose top node is <paramref name="childTree"/>.
    /// The first character of the word is taken to be the one already held by
    /// <paramref name="childTree"/>; insertion starts from the second character.
    /// </summary>
    /// <param name="childTree">Node for the first character of the word.</param>
    /// <param name="word">Word to merge.</param>
    private void AddChildTree(TreeNode childTree, string word)
    {
        if (childTree == null || string.IsNullOrEmpty(word))
            return;

        TreeNode lastNode = childTree;
        // Start from the second character; a one-character word skips the loop entirely.
        for (int i = 1; i < word.Length; i++)
        {
            char cha = word[i];
            if (lastNode.children == null)
                lastNode.children = new List<TreeNode>();

            // Reuse the shared node when this prefix already exists, otherwise create it.
            TreeNode childNode = lastNode.children.Find(e => e.item == cha);
            if (childNode == null)
            {
                childNode = new TreeNode() { item = cha, isEnd = false, children = null };
                lastNode.children.Add(childNode);
            }
            lastNode = childNode;
        }
        // The node reached last is where this word ends.
        lastNode.isEnd = true;
    }
    #endregion
}
2. 對比字符串
按照樹結構依次比對字符
注意問題:全角/半角轉換,中文簡繁體轉換,英文大小寫轉換,特殊符號判斷跳過
#region --- Checker ---
/// <summary>
/// Replaces every character of <paramref name="word"/> that belongs to a detected
/// sensitive word with <paramref name="replaceChar"/>.
/// </summary>
/// <param name="word">Text to check; overwritten with the masked text when something was found.</param>
/// <param name="replaceChar">Character written over each flagged position.</param>
/// <returns>True when nothing was flagged (text left untouched); false when the text was masked.</returns>
public bool CheckSensitiveWord(ref string word, char replaceChar = '*')
{
    List<int> hits = CheckWord(word);
    bool nothingFound = hits == null || hits.Count == 0;
    if (nothingFound)
        return true;

    char[] buffer = word.ToCharArray();
    foreach (int position in hits)
    {
        // Ignore any position that falls outside the text.
        if (position < 0 || position >= buffer.Length)
            continue;
        buffer[position] = replaceChar;
    }

    word = new string(buffer);
    return false;
}
/// <summary>
/// Finds the positions of all characters in <paramref name="text"/> that belong to a
/// sensitive word.
/// </summary>
/// <remarks>
/// For every start position the word tree is walked from the root. Characters rejected by
/// <c>CorrectChar</c> (digits, symbols, ...) are skipped while a match is in progress, but
/// cannot begin one. The longest word found from a start position wins; scanning then
/// resumes right after it. When nothing matches, scanning resumes at the next character,
/// so words that begin inside a failed partial match are still found.
/// </remarks>
/// <param name="text">Text to scan.</param>
/// <returns>
/// Ascending, duplicate-free indices of the flagged characters (empty when none);
/// null when there is no word tree or the text is null or empty.
/// </returns>
private List<int> CheckWord(string text)
{
    if (treeRoot == null || string.IsNullOrEmpty(text))
        return null;

    // Indices of flagged characters
    List<int> checkIndexList = new List<int>();

    // Empty library: nothing can match.
    if (treeRoot.children == null || treeRoot.children.Count == 0)
        return checkIndexList;

    // Indices of the characters matched so far in the current attempt
    List<int> tempIndexList = new List<int>();

    int start = 0;
    while (start < text.Length)
    {
        tempIndexList.Clear();
        TreeNode treeNode = treeRoot;
        int matchedCount = 0;  // how many entries of tempIndexList the longest complete word covers
        int matchedEnd = -1;   // text index of the last character of that word

        for (int i = start; i < text.Length; i++)
        {
            char cha = text[i];
            // Normalise the character; ignorable ones are skipped inside a match.
            if (!CorrectChar(ref cha))
            {
                if (tempIndexList.Count == 0)
                    break; // an ignorable character cannot start a word
                continue;
            }

            if (treeNode.children == null)
                break;
            TreeNode targetNode = treeNode.children.Find(e => e.item == cha);
            if (targetNode == null)
                break;

            tempIndexList.Add(i);
            treeNode = targetNode;

            // A word ends here (end flag, or a node with nothing below it):
            // remember it, then keep going in case a longer word also matches.
            if (treeNode.isEnd || treeNode.children == null || treeNode.children.Count == 0)
            {
                matchedCount = tempIndexList.Count;
                matchedEnd = i;
            }
        }

        if (matchedCount > 0)
        {
            for (int m = 0; m < matchedCount; m++)
                checkIndexList.Add(tempIndexList[m]);
            // Continue after the matched word; positions are therefore never recorded twice.
            start = matchedEnd + 1;
        }
        else
        {
            start++;
        }
    }
    return checkIndexList;
}
/// <summary>
/// Normalises a character before it is compared against the word tree and reports
/// whether it takes part in the comparison at all.
/// </summary>
/// <param name="cha">Character to check; ASCII letters are replaced by their lower-case form.</param>
/// <returns>
/// True for ASCII letters and for characters accepted by <c>IsChinese</c>;
/// false for digits and everything else, which the caller should skip.
/// </returns>
bool CorrectChar(ref char cha)
{
    // TODO: full-width / half-width conversion, e.g. ToDBC(ref cha);

    // Digits are ignored in the comparison.
    if (IsNumber(cha))
        return false;

    // English letters are compared in lower case. The invariant conversion is used so the
    // result does not depend on the current culture (e.g. Turkish 'I' -> dotless 'ı').
    if (IsAlphabet(cha))
    {
        cha = char.ToLowerInvariant(cha);
        return true;
    }

    // Chinese characters are compared as-is.
    if (IsChinese(cha))
    {
        // TODO: traditional -> simplified conversion
        return true;
    }

    // Everything else is treated as a special symbol and ignored.
    return false;
}
// ----------
int charValue;
/// <summary>
/// Tells whether <paramref name="character"/> lies in the CJK ideograph range
/// U+4E00..U+9FA5 (inclusive).
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>True when the character is inside that range.</returns>
private bool IsChinese(char character)
{
    // Compared directly: no shared scratch field is needed for a pure range test.
    return character >= '\u4e00' && character <= '\u9fa5';
}
/// <summary>
/// Tells whether <paramref name="character"/> is an ASCII digit ('0'..'9').
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>True for code points 48..57.</returns>
private bool IsNumber(char character)
{
    // Compared directly: no shared scratch field is needed for a pure range test.
    return character >= '0' && character <= '9';
}
/// <summary>
/// Tells whether <paramref name="character"/> is an ASCII letter ('A'..'Z' or 'a'..'z').
/// </summary>
/// <param name="character">Character to test.</param>
/// <returns>True for code points 65..90 and 97..122.</returns>
private bool IsAlphabet(char character)
{
    // Compared directly: no shared scratch field is needed for a pure range test.
    bool isUpper = character >= 'A' && character <= 'Z';
    bool isLower = character >= 'a' && character <= 'z';
    return isUpper || isLower;
}
/// <summary>
/// Converts a full-width character to its half-width (DBC) counterpart, in place.
/// </summary>
/// <param name="cha">Character to convert; left unchanged when it is not full-width.</param>
/// <remarks>
/// The full-width space is 12288 and the half-width space is 32.
/// Every other full-width character (65281-65374) sits exactly 65248 above its
/// half-width counterpart (33-126).
/// </remarks>
private void ToDBC(ref char cha)
{
    int code = cha;

    // The space does not follow the fixed-offset rule, so it is mapped on its own.
    if (code == 12288)
    {
        cha = ' ';
        return;
    }

    bool isFullWidthAscii = code >= 65281 && code <= 65374;
    if (isFullWidthAscii)
    {
        cha = (char)(code - 65248);
    }
}
#region --- 簡體/繁體 ---
////需要引用庫 Microsoft.VisualBasic
///// 中文字符工具類
//private const int LOCALE_SYSTEM_DEFAULT = 0x0800;
//private const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
//private const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;
//[System.Runtime.InteropServices.DllImport("kernel32", CharSet = System.Runtime.InteropServices.CharSet.Auto, SetLastError = true)]
//private static extern int LCMapString(int Locale, int dwMapFlags, string lpSrcStr, int cchSrc, [System.Runtime.InteropServices.Out] string lpDestStr, int cchDest);
///// 將字符轉換成簡體中文
//public static string ToSimplified(string source)
//{
// String target = new String(' ', source.Length);
// int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_SIMPLIFIED_CHINESE, source, source.Length, target, source.Length);
// return target;
//}
///// 將字符轉換爲繁體中文
//public static string ToTraditional(string source)
//{
// String target = new String(' ', source.Length);
// int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_TRADITIONAL_CHINESE, source, source.Length, target, source.Length);
// return target;
//}
#endregion
#endregion