/*****************************************************
* Boyer Moore Pattern Matching Algorithm
* 20060427 by [email protected]
*****************************************************/
這個算法的特點在於從pattern的最後一位開始向前比較,一旦不
符合,就把pattern向後移動,直到target當前比較位置上的字符
與pattern中某個相同的字符對齊爲止;如果pattern中沒有這個
字符,就直接越過該位置。
Boyer_Moore的算法說明有很多,但不是很明白,結合下面的
數據將會很容易理解這個算法。(由於字符位置比較重要,
下面的例子需要在等字符寬度的字體下顯示纔有效果)
抱歉我沒有太多的時間說明這個問題。
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
This is a test of the Boyer Moore algorithm
algorithm
87654321
>> (sf['a'] - 0) = 8
algorithm
>> (sf['f'] - 0) = 9
algorithm
>> (sf['e'] - 0) = 9
algorithm
>> (sf['a'] - 0) = 8
algorithm
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
CooBooooBooooo
Booooo
51111
>> (sf['B'] - 2) = 3
Booooo
>> (sf['B'] - 0) = 5
Booooo
|
xxxxBooooxxxx
Boooo
|
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
aaaaaaabcc.....
aaabbbbccc
777333311
>> (sf['b'] - 2) = 1
aaabbbbccc
aaaaaaabccabaaabbbbccc....
>> (sf['a'] - 0) = 7
aaabbbbccc
>> (sf['b'] - 0) = 3
aaabbbbccc
>> (sf['b'] - 2) = 1
aaabbbbccc
CooooBooooo
Booo
>> (sf[C] - 3) = 1
Booo
>> (sf[o] - 3) = -2
=============================================================
相信到這裏你已經明白這個算法了,下面給出一個只匹配字符串的
簡單例子
=============================================================
/////////////////////////////////////////////////////////
// matcher_bm.h
// Boyer Moore
// 20060427 by [email protected]
#ifndef _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
#define _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
struct st_chpat;
class matcher_bm
{
public:
matcher_bm(const char*);
~matcher_bm();
public:
void init();
const char* scan(const char*);
private:
struct st_chpat *m_chpat;
};
#endif
/////////////////////////////////////////////////////////
// matcher_bm.cpp
// Boyer Moore
// 20060427 by [email protected]
#include <iostream>
#include <map>
#include <string.h>
#include "matcher_bm.h"
using namespace std;
typedef map<char, int>::iterator sf_itor;
class chpat_sf : public map<char, int> {
public:
int operator[](map<char, int>::key_type key) {
sf_itor itor = find(key);
if (itor == end())
return dvaule;
else
return itor->second;
}
int dvaule;
};
// Internal state behind matcher_bm (the header only forward-declares it).
struct st_chpat {
// Pattern text. The constructor stores the caller's pointer as-is; no copy is made.
const char *_pat;
// strlen() of the pattern, computed once in the constructor.
int pat_len;
// Shift table: allocated in the constructor, filled by init(), deleted in the destructor.
chpat_sf *_shift;
};
matcher_bm::matcher_bm(const char *_pattern) {
m_chpat = new struct st_chpat;
m_chpat->_shift = new chpat_sf;
m_chpat->pat_len = strlen(_pattern);
m_chpat->_pat = _pattern;
}
// Frees the shift table first, then the state object that points to it.
matcher_bm::~matcher_bm()
{
    st_chpat *state = m_chpat;
    chpat_sf *table = state->_shift;
    delete table;
    delete state;
}
typedef pair <char, int> sf_pair;
void matcher_bm::init()
{
m_chpat->_shift->dvaule = m_chpat->pat_len;
m_chpat->_shift->clear();
int size = m_chpat->pat_len - 1;
pair<sf_itor, bool> pr;
for (int iLoc(0); iLoc < size; iLoc++) {
int shift = size - iLoc;
pr = m_chpat->_shift->insert(sf_pair(*(m_chpat->_pat + iLoc), shift));
if (!pr.second)
pr.first->second = shift;
}
}
const char* matcher_bm::scan(const char *_target) {
const char *_pat_tail = m_chpat->_pat + m_chpat->pat_len - 1;
const char *_suffix = _target + m_chpat->pat_len - 1;
const char *_target_end = _target + strlen(_target);
cout << " 01234567890123456789012345678901234567890123456789" << endl;
cout << "target : " << _target << endl;
cout << "pattern : " << m_chpat->_pat << endl;
while (_suffix < _target_end) {
cout << "match : " << _suffix - m_chpat->pat_len + 1 << endl;
int i = 0;
for (; i < m_chpat->pat_len; i++) {
if (*(_pat_tail - i) != *(_suffix - i)) {
int step = m_chpat->_shift->operator[](*(_suffix - i)) - i;
if (step < 1) step = 1;
//cout << ">> (sf['" << *(_suffix - i) << "'] - "
//<< i << ") = " << step << endl;
_suffix += step;
break;
}
}
if (i == m_chpat->pat_len)
return (_suffix - m_chpat->pat_len + 1);
}
return NULL;
}
/////////////////////////////////////////////////////////
// demo.cpp 這是一個演示的程序
// Boyer Moore
// 20060427 by daineng
#include <iostream>
#include "matcher_bm.h"
using namespace std;
// One demo case: a text to search and the pattern to look for in it.
struct st_target_pat {
    const char *target;
    const char *pattern;
};

// Demo cases walked by main(); the {NULL, NULL} entry marks the end.
st_target_pat target_pat[] = {
    { "This is a test of the Boyer Moore algorithm", "algorithm" },
    { "CooBooooBooooo", "Booooo" },
    { "aaaaaaabccabaaabbbbccc.", "aaabbbbccc" },
    { "CooooBooooo", "Booo" },
    { "11111111", "0" },
    { NULL, NULL }
};
int main(void)
{
for (int itp(0); target_pat[itp].target != NULL; itp++) {
cout << endl << "Target-Pattern[" << itp << "] :" << endl;
matcher_bm matcher(target_pat[itp].pattern);
matcher.init();
const char *p;
if (NULL != (p = matcher.scan(target_pat[itp].target))) {
cout << "Match @ " << (int)(p - target_pat[itp].target)
<< " Zero-Based" << endl;
}
}
return 0;
}
=============================================================
如果以上對這個算法的理解沒有錯的話,那麼從下面的例子可以看出
這個算法在某些情況下效果非常不好,記得大學的數據結構上介紹過
快速匹配(應該是指KMP算法),當時沒有仔細研究,依稀記得好像是根據已經匹配過的紀
錄來決定步進量,不需要再匹配已經匹配的部分。按這樣的原理就需
要初始化的時候用pattern匹配一次pattern,然後紀錄某種結果。
結合到BM算法中應該就可以加強BM的匹配算法,在BM步進爲1的時候
如果快速匹配的步進大於1就選擇後者。快速匹配好像就是用來處理
這種重複比較多的情況。
按BM的算法有時候步進會小於1,這個時候把步進量設置成1
=============================================================
010000000000
110 :1=1
>> (sf[0] - 2) = 1
110
>> (sf[0] - 1) = 2
110
000000000000
100 :1=2,0=1
>> (sf[0] - 2) = -1
00100000
1010 :1=1, 0=2
>> (sf[0] - 3) = -1
00110000
1010 :1=1, 0=2
>> (sf[1] - 0) = 1