Boyer Moore Pattern Matching Algorithm

/*****************************************************
 * Boyer Moore Pattern Matching Algorithm
 * 20060427 by [email protected]
 *****************************************************/

這個算法的特點在於從pattern的最後一位開始向前比較,一旦某一位
不符合,就把pattern向後(右)移動,直到target當前比較位置上的字符
與pattern中最靠右的同一字符(不含pattern的最後一位)對齊爲止;
如果pattern中沒有這個字符,就按pattern的長度計算移動量。

Boyer_Moore的算法說明有很多,但不是很明白,結合下面的
數據將會很容易理解這個算法。(由於字符位置比較重要,
下面的例子需要在等字符寬度的字體下顯示纔有效果)

抱歉我沒有太多的時間說明這個問題。

-------------------------------------------------
         1         2         3         4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
This is a test of the Boyer Moore algorithm
algorithm
87654321
        >> (sf['a'] - 0) = 8
        algorithm
                 >> (sf['f'] - 0) = 9
                 algorithm
                          >> (sf['e'] - 0) = 9
                          algorithm
                                  >> (sf['a'] - 0) = 8
                                  algorithm

-------------------------------------------------
         1         2         3         4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
CooBooooBooooo
Booooo
51111
   >> (sf['B'] - 2) = 3
   Booooo
        >> (sf['B'] - 0) = 5
        Booooo

          |
    xxxxBooooxxxx
      Boooo
          |

-------------------------------------------------
         1         2         3         4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
aaaaaaabcc.....
aaabbbbccc
777333311
 >> (sf['b'] - 2) = 1
 aaabbbbccc
aaaaaaabccabaaabbbbccc....
        >> (sf['a'] - 0) = 7
        aaabbbbccc
           >> (sf['b'] - 0) = 3
           aaabbbbccc
            >> (sf['b'] - 2) = 1
            aaabbbbccc

CooooBooooo
Booo
>> (sf[C] - 3) = 1
 Booo
 >> (sf[o] - 3) = -2

=============================================================
相信到這裏你已經明白這個算法了,下面給出一個只匹配字符串的
簡單例子
=============================================================

/////////////////////////////////////////////////////////
// matcher_bm.h
// Boyer Moore
// 20060427 by [email protected]

#ifndef _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
#define _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_

struct st_chpat;

class matcher_bm
{
    public:
        matcher_bm(const char*);
        ~matcher_bm();

    public:
        void init();
        const char* scan(const char*);

    private:
        struct st_chpat *m_chpat;
};

#endif

/////////////////////////////////////////////////////////
// matcher_bm.cpp
// Boyer Moore
// 20060427 by [email protected]

#include <iostream>
#include <map>
#include <string.h>

#include "matcher_bm.h"

using namespace std;

typedef map<char, int>::iterator sf_itor;

class chpat_sf : public map<char, int> {
    public:
        int operator[](map<char, int>::key_type key) {
            sf_itor itor = find(key);
            if (itor == end())
                return dvaule;
            else
                return itor->second;
        }
        int dvaule;
};

struct st_chpat {
    const char *_pat;
    int pat_len;
    chpat_sf *_shift;
};

matcher_bm::matcher_bm(const char *_pattern) {
    m_chpat = new struct st_chpat;
    m_chpat->_shift = new chpat_sf;
    m_chpat->pat_len = strlen(_pattern);
    m_chpat->_pat = _pattern;
}

matcher_bm::~matcher_bm() {
    delete m_chpat->_shift;
    delete m_chpat;
}

typedef pair <char, int> sf_pair;

void matcher_bm::init()
{
    m_chpat->_shift->dvaule = m_chpat->pat_len;
    m_chpat->_shift->clear();
    int size = m_chpat->pat_len - 1;
    pair<sf_itor, bool> pr;
    for (int iLoc(0); iLoc < size; iLoc++) {
        int shift = size - iLoc;
        pr = m_chpat->_shift->insert(sf_pair(*(m_chpat->_pat + iLoc), shift));
        if (!pr.second)
            pr.first->second = shift;
    }
}

const char* matcher_bm::scan(const char *_target) {
    const char *_pat_tail = m_chpat->_pat + m_chpat->pat_len - 1;
    const char *_suffix = _target + m_chpat->pat_len - 1;
    const char *_target_end = _target + strlen(_target);
    cout << "          01234567890123456789012345678901234567890123456789" << endl;
    cout << "target  : " << _target << endl;
    cout << "pattern : " << m_chpat->_pat << endl;
    while (_suffix < _target_end) {
        cout << "match   : " << _suffix - m_chpat->pat_len + 1 << endl;
        int i = 0;
        for (; i < m_chpat->pat_len; i++) {
            if (*(_pat_tail - i) != *(_suffix - i)) {
                int step = m_chpat->_shift->operator[](*(_suffix - i)) - i;
                if (step < 1) step = 1;
                //cout << ">> (sf['" << *(_suffix - i) << "'] - "
                    //<< i << ") = " << step << endl;
                _suffix += step;
                break;
            }
        }
        if (i == m_chpat->pat_len)
            return (_suffix - m_chpat->pat_len + 1);
    }
    return NULL;
}

/////////////////////////////////////////////////////////
// demo.cpp 這是一個演示的程序
// Boyer Moore
// 20060427 by daineng

#include <iostream>
#include "matcher_bm.h"

using namespace std;

struct st_target_pat {
    const char *target;
    const char *pattern;
} target_pat [] = {
    {"This is a test of the Boyer Moore algorithm", "algorithm"},
    {"CooBooooBooooo", "Booooo"},
    {"aaaaaaabccabaaabbbbccc.", "aaabbbbccc"},
    {"CooooBooooo", "Booo"},
    {"11111111", "0"},
    {NULL, NULL}
};

int main(void)
{
    for (int itp(0); target_pat[itp].target != NULL; itp++) {
        cout << endl << "Target-Pattern[" << itp << "] :" << endl;
        matcher_bm matcher(target_pat[itp].pattern);
        matcher.init();
        const char *p;
        if (NULL != (p = matcher.scan(target_pat[itp].target))) {
            cout << "Match @ " << (int)(p - target_pat[itp].target)
                << " Zero-Based" << endl;
        }
    }
    return 0;
}


=============================================================
如果以上對這個算法的理解沒有錯的話,那麼從下面的例子可以看出
這個算法在某些情況下效果非常不好,記得大學的數據結構上介紹過
快速匹配,當時沒有仔細研究,依稀記得好像是根據已經匹配過的紀
錄來決定步進量,不需要再匹配已經匹配的部分。按這樣的原理就需
要初始化的時候用pattern匹配一次pattern,然後紀錄某種結果。

結合到BM算法中應該就可以加強BM的匹配算法,在BM步進爲1的時候
如果快速匹配的步進大於1就選擇後者。快速匹配好像就是用來處理
這種重複比較多的情況。

按BM的算法有時候步進會小於1,這個時候把步進量設置成1
=============================================================

010000000000
110 :1=1
>> (sf[0] - 2) = 1
 110
 >> (sf[0] - 1) = 2
   110

000000000000
100 :1=2,0=1
>> (sf[0] - 2) = -1

00100000
1010 :1=1, 0=2
>> (sf[0] - 3) = -1

00110000
1010 :1=1, 0=2
>> (sf[1] - 0) = 1

發佈了46 篇原創文章 · 獲贊 0 · 訪問量 15萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章