先建立關鍵字hash樹,以每個關鍵字的首字節做key建立一個hash映射,加快查找速度。對hash相同(首字節相同)的關鍵字建立樹結構,比如ab,aab,bc,建立的結果就是:
key : a(根節點a的子節點爲b和a,其中子節點a之下還有b) key : b(根節點b的子節點爲c)
數據結構如下:
// One node of the keyword trie. Every node holds a single byte of a keyword;
// the children of a node form a singly linked list that starts at `childs`
// and is chained through `siblings`.
typedef struct _Node{
    char value;       // byte stored at this node
    _Node* parent;    // node one level up; NULL for a root node
    _Node* childs;    // head of this node's child list; NULL when it has no children
    _Node* siblings;  // next node in the same child list; NULL at the end of the list

    // Start from a fully defined state: value is zeroed as well, so a freshly
    // created node never exposes an indeterminate character.
    _Node() : value(0), parent(NULL), childs(NULL), siblings(NULL) {}
}Node, *NodePtr;
這裏的關鍵字key是取的單個字節的char值。對於包含漢字的utf關鍵字,當然可以轉換成ansi,定下字節數(漢字佔2字節)然後以2字節做key。如此就可以建立一個字典樹。建立的過程如下:
// Insert every keyword of m_str into the trie. The first byte of a keyword
// selects (or creates) a root node in m_map; each following byte walks one
// level down, reusing an existing child with the same value or appending a
// new node to the end of the child list.
for (i = 0; i < m_str.size(); i ++){
    const int len = static_cast<int>(m_str[i].size());
    if (len == 0) continue; // an empty keyword has no first byte to hash on

    // The first byte is the hash key.
    int key = m_str[i][0];
    NodePtr pNode;
    m_it = m_map.find(key);
    if (m_it == m_map.end()){
        // Node's constructor already clears parent/childs/siblings.
        pNode = new Node;
        pNode->value = key;
        m_map[key] = pNode;
    } else {
        pNode = m_it->second;
    }

    for (int j = 1; j < len; ++j){
        key = m_str[i][j];

        // Scan the WHOLE child list, head included. Comparing only the
        // siblings of the head would miss a match on the head itself and
        // create a duplicate node that a first-match lookup can never reach.
        NodePtr pLast = NULL;
        NodePtr pChild = pNode->childs;
        while (pChild && pChild->value != key){
            pLast = pChild;
            pChild = pChild->siblings;
        }

        if (!pChild){
            // No child carries this byte yet: create one and link it in.
            pChild = new Node;
            pChild->value = key;
            pChild->parent = pNode;
            if (pLast)
                pLast->siblings = pChild; // append after the last existing child
            else
                pNode->childs = pChild;   // very first child of pNode
        }
        pNode = pChild; // descend: this child is the root for the next byte
    }
}
對於一個給定的字符串,來判斷是否包含關鍵字。這個過程就是一個查找的過程,和建立詞典樹是類似的方法:
// Reports whether `str` contains any keyword stored in the trie.
//
// The text is scanned one byte at a time. A set of "open" partial matches
// (trie nodes reached so far) is carried from byte to byte; at each byte every
// open match is advanced to the children that carry that byte, and a new match
// is opened if the byte is the first byte of some keyword (a root in m_map).
// A keyword is considered complete when a node without children is reached.
//
// @param str  NUL-terminated text to inspect; NULL is treated as empty text.
// @return     true as soon as one keyword is found, false otherwise.
//
// Side effect: m_it is overwritten with the result of the last m_map lookup.
// Note: because the end of a keyword is recognised only by a childless node, a
// keyword that is a strict prefix of a longer stored keyword is not reported
// on its own.
bool KeyWordFilter::checkWord(const char* str)
{
    if (str == NULL) return false;

    // Two work lists used alternately: pToCheck holds the matches opened by
    // earlier bytes, pTmp collects the ones that survive the current byte.
    std::stack<NodePtr> toTmp, toCheck;
    std::stack<NodePtr> *pTmp = &toTmp;
    std::stack<NodePtr> *pToCheck = &toCheck;

    for (const char* p = str; *p != '\0'; ++p){
        const int key = *p;

        // Advance every open partial match by the current byte.
        while (!pToCheck->empty()){
            NodePtr pNode = pToCheck->top();
            pToCheck->pop();

            // Walk the full child list and follow every child that carries
            // this byte, so the result does not depend on which of several
            // equal-valued children happens to come first.
            for (NodePtr pChild = pNode->childs; pChild; pChild = pChild->siblings){
                if (pChild->value != key) continue;
                if (!pChild->childs)
                    return true;      // leaf reached: a whole keyword matched
                pTmp->push(pChild);   // still inside a keyword, keep following
            }
        }

        // The current byte may also start a new keyword.
        m_it = m_map.find(key);
        if (m_it != m_map.end()){
            if (!m_it->second->childs)
                return true;          // one-byte keyword: the root itself is a leaf
            pTmp->push(m_it->second);
        }

        // Swap roles: survivors become the list to check for the next byte.
        // pTmp is empty again because the loop above drained pToCheck.
        std::stack<NodePtr> *swapTmp = pToCheck;
        pToCheck = pTmp;
        pTmp = swapTmp;
    }
    return false;
}