【C#】敏感詞過濾校驗

敏感詞過濾的方法有很多，如何能更快速更大範圍的匹配敏感詞？

敏感詞庫小的情況，用正則、字符串匹配即可實現，還可以按照首字母對敏感詞分組以提高效率。

當詞庫比較大時，就需要考慮效率問題了···

找了很多實現方法，感覺不是很適合現在的需求，於是自己嘗試實現了一下，用的也是DFA算法。

參考：C#敏感詞過濾算法實現、Java實現敏感詞過濾

1. 創建敏感詞庫

1.1 收集敏感詞

1.2.1 指定字符串分割

// 測試敏感詞庫:
敏感詞|敏感詞彙|敏感詞語|敏感單詞

1.2.2 處理敏感詞彙（DFA算法: 有窮自動機, 通過event和當前的state得到下一個state, 即 event + state = nextstate）

將拆分的字符，整理成樹狀結構

// 詞庫處理後格式（僞代碼, 僅用於展示）:
{
    item: "R",
    isEnd: false,
    children:
        {
            item: "敏",
            isEnd: false,
            children: 
                {
                    item: "感",
                    isEnd: false,
                    children: 
                        {
                            item: "詞",
                            isEnd: true,
                            children: 
                                {
                                    "item": "彙",
                                    "isEnd": true,
                                    "children": null 
                                },
                                {
                                    "item": "語",
                                    "isEnd": true,
                                    "children": null
                                }
                        },
                        {
                            "item": "單",
                            "isEnd": false,
                            "children":
                                {
                                    "item": "詞",
                                    "isEnd": true,
                                    "children": null
                                }
                        }
                }
        }
}

處理：

    /// Node of the lexicon tree; each node holds one character of a sensitive word.
    public class TreeNode
    {
        // Character stored at this node.
        public char item;
        // True when the path from the tree root down to this node spells a complete sensitive word.
        public bool isEnd;
        // Nodes for the characters that may follow this one; null when nothing follows.
        public List<TreeNode> children;
    }

/// <summary>
/// Sensitive-word lexicon stored as a character tree (DFA style: the current node is the
/// state, the next character is the event, and the matching child is the next state).
/// The tree is built once, in the constructor, from a '|'-separated word list.
/// </summary>
public class SensitiveWordsLibrary
{
    // Path handed to Resources.Load to fetch the word-list TextAsset.
    string sensitiveWordsResourcesPath = "SensitiveWordsLibrary/SensitiveWords";

    /// <summary>Root of the lexicon tree; its children are the first characters of every word.</summary>
    private TreeNode treeRoot;

    /// <summary>Loads the word list and builds the lexicon tree from it.</summary>
    public SensitiveWordsLibrary()
    {
        string[] sensitiveWords = LoadSensitiveWords();
        treeRoot = CreateTree(sensitiveWords);
    }

    /// <summary>
    /// Loads the '|'-separated sensitive words from the TextAsset at
    /// <see cref="sensitiveWordsResourcesPath"/>.
    /// </summary>
    /// <returns>The words found; an empty array when the asset is missing or has no text.</returns>
    private string[] LoadSensitiveWords()
    {
        //todo validate the characters of the source file
        TextAsset textAsset = Resources.Load<TextAsset>(sensitiveWordsResourcesPath);
        if (textAsset == null)
            return new string[0];

        string wordStr = textAsset.text;
        if (string.IsNullOrEmpty(wordStr))
            return new string[0];

        // RemoveEmptyEntries drops the blanks produced by "a||b" or by a trailing '|'.
        return wordStr.Split(new[] { '|' }, System.StringSplitOptions.RemoveEmptyEntries);
    }

    #region --- Create Tree ---

    /// <summary>Builds the lexicon tree from the given words.</summary>
    /// <param name="words">Sensitive words; null, empty and whitespace-only entries are tolerated.</param>
    /// <returns>
    /// The root node. Its child list is never null, so lookups on the root are safe even
    /// when there are no words at all.
    /// </returns>
    private TreeNode CreateTree(string[] words)
    {
        if (words == null || words.Length == 0)
        {
            Debug.Log("無敏感詞庫");
            // Empty (not null) child list: a null list here makes every lookup on the root throw.
            return new TreeNode() { item = 'R', isEnd = true, children = new List<TreeNode>() };
        }

        List<TreeNode> treeList = new List<TreeNode>();

        foreach (string rawWord in words)
        {
            string word = NormalizeWord(rawWord);
            if (string.IsNullOrEmpty(word))
                continue;

            char first = word[0];
            TreeNode branch = treeList.Find(e => e.item == first);

            if (branch == null)
                treeList.Add(CreateSingleTree(word));
            else
                AddChildTree(branch, word);
        }

        return new TreeNode() { item = 'R', isEnd = false, children = treeList };
    }

    /// <summary>
    /// Brings a lexicon entry into the form the matching side compares against: surrounding
    /// whitespace removed (stray line breaks or padding around '|') and English letters in
    /// lowercase, because input letters are lowercased before they are looked up in the tree.
    /// </summary>
    /// <param name="word">Raw entry from the word list; may be null.</param>
    /// <returns>The normalized word; null when <paramref name="word"/> is null.</returns>
    private static string NormalizeWord(string word)
    {
        if (word == null)
            return null;

        char[] chars = word.Trim().ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            // ASCII-only lowering, independent of the current culture.
            if (chars[i] >= 'A' && chars[i] <= 'Z')
                chars[i] = (char)(chars[i] + ('a' - 'A'));
        }
        return new string(chars);
    }

    /// <summary>Creates a stand-alone branch: one node per character of the word, chained in order.</summary>
    /// <param name="word">Word to turn into a branch.</param>
    /// <returns>The node for the first character; null when <paramref name="word"/> is null or empty.</returns>
    private TreeNode CreateSingleTree(string word)
    {
        if (string.IsNullOrEmpty(word))
            return null;

        TreeNode root = new TreeNode() { item = word[0], isEnd = false, children = null };
        // The rest of the chain is an ordinary insertion below the freshly created first node.
        AddChildTree(root, word);
        return root;
    }

    /// <summary>
    /// Inserts a word into an existing branch, reusing nodes that already exist and creating
    /// the missing ones, then marks the node of the last character as a word end.
    /// </summary>
    /// <param name="childTree">Branch node that represents the first character of the word.</param>
    /// <param name="word">Word to insert; insertion starts at its second character.</param>
    private void AddChildTree(TreeNode childTree, string word)
    {
        if (childTree == null || string.IsNullOrEmpty(word))
            return;

        TreeNode lastNode = childTree;

        // Index 0 is childTree itself, so walking starts at the second character.
        // A one-character word skips the loop and only sets the end flag below.
        for (int i = 1; i < word.Length; i++)
        {
            char cha = word[i];

            if (lastNode.children == null)
                lastNode.children = new List<TreeNode>();

            TreeNode childNode = lastNode.children.Find(e => e.item == cha);
            if (childNode == null)
            {
                childNode = new TreeNode() { item = cha, isEnd = false, children = null };
                lastNode.children.Add(childNode);
            }
            lastNode = childNode;
        }
        lastNode.isEnd = true;
    }

    #endregion
}

2. 對比字符串

按照樹結構依次比對字符

注意問題：全角/半角轉換，中文簡繁體轉換，英文大小寫轉換，特殊符號判斷跳過


    #region --- Checker ---

    /// <summary>
    /// Masks every character of <paramref name="word"/> that belongs to a sensitive word.
    /// </summary>
    /// <param name="word">Text to check; rewritten in place when something was masked.</param>
    /// <param name="replaceChar">Character written over each sensitive character.</param>
    /// <returns>true when nothing was found and the text is untouched; false when it was masked.</returns>
    public bool CheckSensitiveWord(ref string word, char replaceChar = '*')
    {
        List<int> hits = CheckWord(word);

        bool hasHits = hits != null && hits.Count > 0;
        if (!hasHits)
            return true;

        char[] buffer = word.ToCharArray();

        foreach (int position in hits)
        {
            // Ignore positions that fall outside the text.
            if (position < 0 || position >= buffer.Length)
                continue;

            buffer[position] = replaceChar;
        }

        word = new string(buffer);
        return false;
    }

    /// <summary>
    /// Finds the positions of all characters in <paramref name="text"/> that are part of a
    /// sensitive word from the lexicon tree.
    /// </summary>
    /// <remarks>
    /// Every position is tried as a possible word start and the longest word beginning there
    /// is taken, so a word that begins inside a failed or finished attempt is still found.
    /// Characters rejected by CorrectChar are skipped inside a word and are not reported
    /// themselves.
    /// </remarks>
    /// <param name="text">Text to scan.</param>
    /// <returns>
    /// Ascending, duplicate-free character positions; an empty list when nothing matches;
    /// null when there is no tree or <paramref name="text"/> is null or empty.
    /// </returns>
    private List<int> CheckWord(string text)
    {
        if (treeRoot == null || string.IsNullOrEmpty(text))
            return null;

        // Positions of sensitive characters.
        List<int> checkIndexList = new List<int>();

        // An empty lexicon may carry no child list at all; there is nothing to match then.
        if (treeRoot.children == null || treeRoot.children.Count == 0)
            return checkIndexList;

        // One flag per character keeps the result unique and ordered without list searches.
        bool[] flagged = new bool[text.Length];
        // Positions matched during the current attempt.
        List<int> tempIndexList = new List<int>();

        for (int start = 0; start < text.Length; start++)
        {
            TreeNode treeNode = treeRoot;
            // How many entries of tempIndexList are covered by the longest complete word so far.
            int confirmedCount = 0;
            tempIndexList.Clear();

            for (int i = start; i < text.Length; i++)
            {
                char cha = text[i];

                // Normalize the character; ignored characters do not take part in matching.
                if (!CorrectChar(ref cha))
                {
                    // A word cannot begin on an ignored character.
                    if (tempIndexList.Count == 0)
                        break;
                    continue;
                }

                if (treeNode.children == null)
                    break;

                TreeNode targetNode = treeNode.children.Find(e => e.item == cha);
                if (targetNode == null)
                    break;

                tempIndexList.Add(i);
                treeNode = targetNode;

                // A word ends here; a node without children is treated as an end as well.
                if (treeNode.isEnd || treeNode.children == null || treeNode.children.Count == 0)
                    confirmedCount = tempIndexList.Count;
            }

            // Characters matched beyond the last complete word belong to an unfinished
            // prefix and must not be reported.
            for (int m = 0; m < confirmedCount; m++)
                flagged[tempIndexList[m]] = true;
        }

        for (int i = 0; i < flagged.Length; i++)
        {
            if (flagged[i])
                checkIndexList.Add(i);
        }
        return checkIndexList;
    }

    /// <summary>
    /// Normalizes one input character for lexicon lookup and reports whether it takes part
    /// in matching.
    /// </summary>
    /// <param name="cha">Character to inspect; an English letter is rewritten to lowercase in place.</param>
    /// <returns>
    /// true for English letters and Chinese characters; false for digits and for every other
    /// character, which is treated as a special symbol and ignored.
    /// </returns>
    bool CorrectChar(ref char cha)
    {
        ////full-width / half-width  todo
        //ToDBC(ref cha);

        // Digits are ignored when comparing.
        if (IsNumber(cha))
            return false;

        // English letters are unified to lowercase.
        if (IsAlphabet(cha))
        {
            // Invariant casing: the culture-sensitive char.ToLower maps 'I' to the dotless
            // 'ı' under Turkish-style cultures, which would no longer match a lexicon 'i'.
            cha = char.ToLowerInvariant(cha);
            return true;
        }

        // Chinese is meant to be unified to simplified characters.
        if (IsChinese(cha))
        {
            // traditional -> simplified  todo
            return true;
        }

        // Everything else counts as a special symbol and is ignored.
        return false;
    }

    // ----------

    int charValue;

    /// <summary>Tells whether the character lies in the ideograph range U+4E00..U+9FA5.</summary>
    /// <param name="character">Character to test.</param>
    /// <returns>true when the character is inside the range (both ends included).</returns>
    private bool IsChinese(char character)
    {
        // Compared directly; no shared instance field is used as scratch space.
        return character >= '\u4e00' && character <= '\u9fa5';
    }

    /// <summary>Tells whether the character is an ASCII digit '0'..'9'.</summary>
    /// <param name="character">Character to test.</param>
    /// <returns>true for '0' through '9' only.</returns>
    private bool IsNumber(char character)
    {
        // Compared directly; no shared instance field is used as scratch space.
        return character >= '0' && character <= '9';
    }

    /// <summary>Tells whether the character is an ASCII letter 'A'..'Z' or 'a'..'z'.</summary>
    /// <param name="character">Character to test.</param>
    /// <returns>true for the 52 ASCII letters only.</returns>
    private bool IsAlphabet(char character)
    {
        // Compared directly; no shared instance field is used as scratch space.
        return (character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z');
    }

    /// <summary>
    /// Converts a full-width character to its half-width (DBC) counterpart, in place.
    /// </summary>
    /// <param name="cha">Character to convert; left unchanged when it is not full-width.</param>
    /// <remarks>
    /// The full-width space is 12288 (U+3000) and the half-width space is 32.
    /// The other full-width characters (65281-65374) sit exactly 65248 above their
    /// half-width counterparts (33-126).
    /// </remarks>
    private void ToDBC(ref char cha)
    {
        // Full-width space has its own mapping.
        if (cha == '\u3000')
        {
            cha = ' ';
            return;
        }

        // U+FF01..U+FF5E is the full-width image of '!'..'~'.
        bool isFullWidth = cha >= '\uFF01' && cha <= '\uFF5E';
        if (isFullWidth)
            cha = (char)(cha - 0xFEE0);
    }

    #region --- 簡體/繁體 ---

    ////需要引用庫 Microsoft.VisualBasic

    ///// 中文字符工具類
    //private const int LOCALE_SYSTEM_DEFAULT = 0x0800;
    //private const int LCMAP_SIMPLIFIED_CHINESE = 0x02000000;
    //private const int LCMAP_TRADITIONAL_CHINESE = 0x04000000;
    //[System.Runtime.InteropServices.DllImport("kernel32", CharSet = System.Runtime.InteropServices.CharSet.Auto, SetLastError = true)]
    //private static extern int LCMapString(int Locale, int dwMapFlags, string lpSrcStr, int cchSrc, [System.Runtime.InteropServices.Out] string lpDestStr, int cchDest);

    ///// 將字符轉換成簡體中文
    //public static string ToSimplified(string source)
    //{
    //    String target = new String(' ', source.Length);
    //    int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_SIMPLIFIED_CHINESE, source, source.Length, target, source.Length);
    //    return target;
    //}

    ///// 將字符轉換爲繁體中文
    //public static string ToTraditional(string source)
    //{
    //    String target = new String(' ', source.Length);
    //    int ret = LCMapString(LOCALE_SYSTEM_DEFAULT, LCMAP_TRADITIONAL_CHINESE, source, source.Length, target, source.Length);
    //    return target;
    //}

    #endregion

    #endregion