先建立关键字hash树:以每个关键字的首字节作为key建立hash映射,加快查找速度;对首字节(hash)相同的关键字建立树结构。比如ab、aab、bc,建立的结果就是:
key: a → 子节点 b(对应ab)、子节点 a → b(对应aab);key: b → 子节点 c(对应bc)
数据结构如下:
// One node of the keyword trie. Each node holds a single byte of a keyword
// and links to its parent, to the first of its children, and to the next
// node in its own sibling list.
typedef struct _Node{
    char value;       // byte of the keyword represented by this node
    _Node* parent;    // node one level up; NULL when there is none
    _Node* childs;    // head of this node's child list; NULL when it has no children
    _Node* siblings;  // next node in the same child list; NULL at the end of the list
    // Start every node fully initialized: zero byte and no links, so that a
    // freshly allocated node never carries an indeterminate value.
    _Node() : value(0), parent(NULL), childs(NULL), siblings(NULL) {}
}Node, *NodePtr;
这里的key取的是单个字节的char值。对于包含汉字的UTF-8关键字,可以先转换成ANSI编码,定下字节数(汉字占2字节),然后以2字节做key。如此就可以建立一棵字典树。建立的过程如下:
// Build the keyword trie from every entry of m_str.
// The first byte of a keyword selects (or creates) a root node in m_map;
// each following byte descends one level, reusing an existing child that
// already holds that byte and creating a new one only when none exists.
for (i = 0; i < m_str.size(); i ++){
    // An empty keyword has no first byte to hash on; skip it.
    if (m_str[i].size() == 0) continue;

    // The first byte is the hash key that selects the root node.
    int key = m_str[i][0];
    NodePtr pNode;
    m_it = m_map.find(key);
    if (m_it == m_map.end()){
        // First keyword starting with this byte: create its root.
        // The Node constructor already clears parent/childs/siblings.
        pNode = new Node;
        pNode->value = key;
        m_map[key] = pNode;
    } else {
        pNode = m_it->second;
    }

    // Walk the remaining bytes, descending one level per byte.
    const int length = static_cast<int>(m_str[i].size());
    for (int j = 1; j < length; ++j){
        key = m_str[i][j];

        // Search the WHOLE child list, first child included, for this byte.
        // Skipping the first child would append a duplicate sibling and
        // split keywords that share a prefix across two branches.
        NodePtr pMatch = NULL;
        NodePtr pLast = NULL; // last child visited, used to append a new one
        for (NodePtr pChild = pNode->childs; pChild; pChild = pChild->siblings){
            if (pChild->value == key){
                pMatch = pChild;
                break;
            }
            pLast = pChild;
        }

        if (!pMatch){
            // No child holds this byte yet: append a new node to the list.
            pMatch = new Node;
            pMatch->value = key;
            pMatch->parent = pNode;
            if (pLast)
                pLast->siblings = pMatch;
            else
                pNode->childs = pMatch; // first child of this parent
        }
        pNode = pMatch; // descend: this child is the parent for the next byte
    }
}
对于一个给定的字符串,判断其中是否包含关键字。这个过程就是一个查找的过程,和建立字典树的方法类似:
bool KeyWordFilter::checkWord(const char* str)
{
int i,length = strlen(str);
std::stack<NodePtr> toTmp, toCheck;
std::stack<NodePtr> *pTmp, *pToCheck;
pTmp = &toTmp;
pToCheck = &toCheck;
for (i = 0; i < length; i ++){
int key = str[i];
while (!pToCheck->empty()){
// test each one
NodePtr pNode = pToCheck->top();
pToCheck->pop();
if (pNode->childs){
if (pNode->childs->value == key){
// found
if (pNode->childs->childs)
pTmp->push(pNode->childs);
else {
//dump(pNode->childs);
return true;
}
} else {
NodePtr pChild = pNode->childs;
while(pChild->siblings){
if (pChild->siblings->value == key) break;
pChild = pChild->siblings;
}
if (pChild->siblings){
// found
if (pChild->siblings->childs)
pTmp->push(pChild->siblings);
else {
//dump(pChild->siblings);
return true;
}
}
}
}
}
// 新的单词开始
m_it = m_map.find(key);
if (m_it != m_map.end()){
pTmp->push(m_it->second);
}
// switch
std::stack<NodePtr> *tmp;
tmp = pToCheck;
pToCheck = pTmp;
pTmp = tmp;
}
return false;
}