AC自動機模板及基礎例題小結

AC自動機(Aho-Corasick Automation)用於解決多模式串匹配主串的問題
給所有模式串寫一個Trie,在Trie上跑KMP,其中KMP的next數組變成了AC自動機的Fail指針
計算fail和計算next一樣,用dp,只不過這裏是樹上dp
原理不再贅述,上模板

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
using namespace std;
class ACautomation {
private:
    int cnt, root;
    int **child, *sta, *fail;
    bool *flag;
public:
    ACautomation(int tot) {
        cnt = root = 1;
        sta = new int[tot+1];
        memset(sta, 0, sizeof(int)*(tot+1));
        fail = new int[tot+1];
        memset(fail, 0, sizeof(int)*(tot+1));
        flag = new bool[tot+1];
        memset(flag, 0, sizeof(bool)*(tot+1));
        child = new int*[tot+1];
        for (int i = 0; i <= tot; i++) {
            child[i] = new int[26];
            memset(child[i], 0, sizeof(int)*26);
        }
        for (int i = 0; i < 26; i++)    child[0][i] = 1;
    }
    ~ACautomation() {
        delete[] sta;
        delete[] fail;
        delete[] flag;
        for (int i = 0; i <= cnt; i++)
            delete[] child[i];
        delete[] child;
    }
    void insert(string& s) {
        int cur = 1, i;
        for (i = 0; i < s.length(); cur = child[cur][s[i++]-'a'])
            if (!child[cur][s[i]-'a'])  child[cur][s[i]-'a'] = ++cnt;
        sta[cur]++;
    }
    void setFail() {
        queue <int> que;
        que.push(1);
        while (!que.empty()) {
            int u = que.front();
            for (int i = 0; i < 26; i++) {
                if (child[u][i]) {
                    fail[child[u][i]] = child[fail[u]][i];
                    que.push(child[u][i]);
                } else {
                    child[u][i] = child[fail[u]][i];
                }
            }
            que.pop();
        }
    }
    int query(string& s) {
        int ret = 0, cur = 1, index;
        for (int i = 0; i < s.length(); i++) {
            flag[cur] = true;
            index = s[i]-'a';
            while (!child[cur][index])  cur = fail[cur];
            cur = child[cur][index];
            if (flag[cur])  continue;
            for (int j = cur; j; j = fail[j]) {
                ret += sta[j];
                sta[j] = 0;
            }
        }
        return ret;
    }
};
int main() {
    int n;
    cin >> n;
    ACautomation T(n*50);
    for (int i = 0; i < n; i++) {
        string s;
        cin >> s;
        T.insert(s);
    }
    T.setFail();
    string str;
    cin >> str;
    cout << T.query(str);
    return 0;
}

這是本人第一次寫AC自動機封裝的代碼,很拙劣。由於我寫Trie樹一直把根節點設爲1號,所以想出把0號節點所有兒子都指向1,這樣如果fail指回到0的時候,child[u][i]都會將其重新指向根節點1,這樣思考會簡單一些。但是有時候要TLE

我做的第一道AC自動機題目是HDU2222,這是裸的模板,數據也很水

【HDU2222】 Keywords Search
Time Limit: 2000/1000 MS (Java/Others)
Memory Limit: 131072/131072 K (Java/Others)

Problem Description
In the modern time, Search engine came into the life of everybody like Google, Baidu, etc.
Wiskey also wants to bring this feature to his image retrieval system.
Every image have a long description, when users type some keywords to find the image, the system will match the keywords with description of image and show the image which the most keywords be matched.
To simplify the problem, giving you a description of image, and some keywords, you should tell me how many keywords will be match.

Input
First line will contain one integer means how many cases will follow by.
Each case will contain two integers N means the number of keywords and N keywords follow. (N <= 10000)
Each keyword will only contains characters ‘a’-‘z’, and the length will be not longer than 50.
The last line is the description, and the length will be not longer than 1000000.
Output
Print how many keywords are contained in the description.

Sample Input
1
5
she
he
say
shr
her
yasherhs
Sample Output
3

AC代碼:

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#define MAX_LETTER 500000
using namespace std;
int trie[MAX_LETTER+5][26], tag[MAX_LETTER+5], cnt;
int fail[MAX_LETTER+5], flag[MAX_LETTER+5], que[MAX_LETTER+5];
char s[55], str[1000005];
void insert() {
    scanf("%s", s);
    int cur, i, l = strlen(s);
    for (cur = 1, i = 0; i < l; cur = trie[cur][s[i++]-'a'])
        if (!trie[cur][s[i]-'a'])   trie[cur][s[i]-'a'] = ++cnt;
    tag[cur]++;
}
void BFS() {
    int head = 0, tail = 1;
    que[head] = 1;
    while (head < tail) {
        int u = que[head++];
        for (int i = 0; i < 26; i++) {
            if (trie[u][i]) {
                que[tail++] = trie[u][i];
                fail[trie[u][i]] = trie[fail[u]][i];
            } else {
                trie[u][i] = trie[fail[u]][i];
            }
        }
    }
}
void query() {
    int ret = 0, cur = 1, index;
    int len = strlen(str);
    for (int i = 0; i < len; i++) {
        flag[cur] = 1;
        index = str[i]-'a';
        while (!trie[cur][index])   cur = fail[cur];
        cur = trie[cur][index];
        if (flag[cur])  continue;
        for (int j = cur; j; j = fail[j]) {
            ret += tag[j];
            tag[j] = 0;
        }
    }
    printf("%d\n", ret);
}
int main() {
    int T, n;
    for (int i = 0; i < 26; i++)    trie[0][i] = 1;
    scanf("%d", &T);
    while (T--) {
        cnt = 1;
        scanf("%d", &n);
        for (int i = 0; i < n; i++) insert();
        BFS();
        scanf("%s", str);
        query();
        for (int i = 1; i <= cnt; i++) {
            tag[i] = fail[i] = flag[i] = 0;
            for (int j = 0; j < 26; j++)
                trie[i][j] = 0;
        }
    }
    return 0;
}

HDU3065和這道題很像,也是幾乎裸的模板

【HDU3065】病毒侵襲持續中
Time Limit: 2000/1000 MS (Java/Others)
Memory Limit: 32768/32768 K (Java/Others)

Problem Description
小t非常感謝大家幫忙解決了他的上一個問題。然而病毒侵襲持續中。在小t的不懈努力下,他發現了網路中的“萬惡之源”。這是一個龐大的病毒網站,他有着好多好多的病毒,但是這個網站包含的病毒很奇怪,這些病毒的特徵碼很短,而且只包含“英文大寫字符”。當然小t好想好想爲民除害,但是小t從來不打沒有準備的戰爭。知己知彼,百戰不殆,小t首先要做的是知道這個病毒網站特徵:包含多少不同的病毒,每種病毒出現了多少次。大家能再幫幫他嗎?

Input
第一行,一個整數N(1<=N<=1000),表示病毒特徵碼的個數。
接下來N行,每行表示一個病毒特徵碼,特徵碼字符串長度在1—50之間,並且只包含“英文大寫字符”。任意兩個病毒特徵碼,不會完全相同。
在這之後一行,表示“萬惡之源”網站源碼,源碼字符串長度在2000000之內。字符串中字符都是ASCII碼可見字符(不包括回車)。
Output
按以下格式每行一個,輸出每個病毒出現次數。未出現的病毒不需要輸出。
病毒特徵碼: 出現次數
冒號後有一個空格,按病毒特徵碼的輸入順序進行輸出。

Sample Input
3
AA
BB
CC
ooxxCC%dAAAoen….END
Sample Output
AA: 2
CC: 1

Hint
題目描述中沒有被提及的所有情況都應該進行考慮。比如兩個病毒特徵碼可能有相互包含或者有重疊的特徵碼段。
計數策略也可一定程度上從Sample中推測。

AC代碼:

#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#define MAX_N 1000
#define MAX_LETTER 100000
#define DICNUM 26
using namespace std;
int n, cnt;
int child[MAX_LETTER+5][DICNUM], id[MAX_LETTER+5], fail[MAX_LETTER+5], ans[MAX_N+5];
int que[MAX_LETTER+5];
char s[2000005], dic[1005][55];
void init() {
    cnt = 1;
    memset(child, 0, sizeof(child));
    for (int i = 0; i < DICNUM; i++)    child[0][i] = 1;
    memset(id, 0, sizeof(id));
    memset(fail, 0, sizeof(fail));
    memset(ans, 0, sizeof(ans));
}
void insert() {
    for (int j = 1; j <= n; j++) {
        int cur = 1, l = strlen(dic[j]);
        for (int i = 0; i < l; cur = child[cur][dic[j][i++]-'A'])
            if (!child[cur][dic[j][i]-'A']) child[cur][dic[j][i]-'A'] = ++cnt;
        id[cur] = j;
    }
}
void setFail() {
    int head = 0, tail = 1;
    que[head] = 1;
    while (head < tail) {
        int u = que[head++];
        for (int i = 0; i < DICNUM; i++) {
            if (child[u][i]) {
                que[tail++] = child[u][i];
                int cur = fail[u];
                while (!child[cur][i])  cur = fail[cur];
                fail[child[u][i]] = child[cur][i];
            } else {
                child[u][i] = child[fail[u]][i];
            }
        }
    }
}
void query() {
    int cur = 1, index, l = strlen(s);
    for (int i = 0; i < l; i++) {
        index = s[i]-'A';
        if (index < 0 || index > 25) {
            cur = 1;
        } else {
            while (!child[cur][index])  cur = fail[cur];
            cur = child[cur][index];
        }
        for (int j = cur; j; j = fail[j])
            if (id[j])
                ans[id[j]]++;
    }
}
int main() {
    while (scanf("%d", &n) != EOF) {
        init();
        for (int i = 1; i <= n; i++)    scanf("%s", dic[i]);
        insert();
        setFail();
        scanf("%s", s);
        query();
        for (int i = 1; i <= n; i++)
            if (ans[i])
                printf("%s: %d\n", dic[i], ans[i]);
    }
    return 0;
}

這兩道題用我拙劣的寫法還可以過,但是HDU2896就過不了

【HDU2896】病毒侵襲
Time Limit: 2000/1000 MS (Java/Others)
Memory Limit: 32768/32768 K (Java/Others)

Problem Description
當太陽的光輝逐漸被月亮遮蔽,世界失去了光明,大地迎來最黑暗的時刻。。。。在這樣的時刻,人們卻異常興奮——我們能在有生之年看到500年一遇的世界奇觀,那是多麼幸福的事兒啊~~
但網路上總有那麼些網站,開始藉着民衆的好奇心,打着介紹日食的旗號,大肆傳播病毒。小t不幸成爲受害者之一。小t如此生氣,他決定要把世界上所有帶病毒的網站都找出來。當然,誰都知道這是不可能的。小t卻執意要完成這不能的任務,他說:“子子孫孫無窮匱也!”(愚公後繼有人了)。
萬事開頭難,小t收集了好多病毒的特徵碼,又收集了一批詭異網站的源碼,他想知道這些網站中哪些是有病毒的,又是帶了怎樣的病毒呢?順便還想知道他到底收集了多少帶病毒的網站。這時候他卻不知道何從下手了。所以想請大家幫幫忙。小t又是個急性子哦,所以解決問題越快越好哦~~

Input
第一行,一個整數N(1<=N<=500),表示病毒特徵碼的個數。
接下來N行,每行表示一個病毒特徵碼,特徵碼字符串長度在20—200之間。
每個病毒都有一個編號,依此爲1—N。
不同編號的病毒特徵碼不會相同。
在這之後一行,有一個整數M(1<=M<=1000),表示網站數。
接下來M行,每行表示一個網站源碼,源碼字符串長度在7000—10000之間。
每個網站都有一個編號,依此爲1—M。
以上字符串中字符都是ASCII碼可見字符(不包括回車)。
Output
依次按如下格式輸出按網站編號從小到大輸出,帶病毒的網站編號和包含病毒編號,每行一個含毒網站信息。
web 網站編號: 病毒編號 病毒編號 …
冒號後有一個空格,病毒編號按從小到大排列,兩個病毒編號之間用一個空格隔開,如果一個網站包含病毒,病毒數不會超過3個。
最後一行輸出統計信息,如下格式
total: 帶病毒網站數
冒號後有一個空格。

Sample Input
3
aaa
bbb
ccc
2
aaabbbccc
bbaacc
Sample Output
web 1: 1 2 3
total: 1

AC代碼:

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
using namespace std;
struct AC {
    int child[200*500+500][128], fail[200*500+500], end[200*500+500];
    int root, cnt;
    int newnode() {
        for (int i = 0; i < 128; i++)   child[cnt][i] = -1;
        end[cnt++] = -1;
        return cnt-1;
    }
    void init() {
        cnt = 0;
        root = newnode();
    }
    void insert(int id, char s[]) {
        int len = strlen(s), cur = root;
        for (int i = 0; i < len; cur = child[cur][s[i++]])
            if (child[cur][s[i]] == -1) child[cur][s[i]] = newnode();
        end[cur] = id;
    }
    void setFail() {
        queue <int> que;
        fail[root] = root;
        for (int i = 0; i < 128; i++) {
            if (child[root][i] == -1) {
                child[root][i] = root;
            } else {
                fail[child[root][i]] = root;
                que.push(child[root][i]);
            }
        }
        while (!que.empty()) {
            int u = que.front();
            for (int i = 0; i < 128; i++) {
                if (child[u][i] != -1) {
                    fail[child[u][i]] = child[fail[u]][i];
                    que.push(child[u][i]);
                } else {
                    child[u][i] = child[fail[u]][i];
                }
            }
            que.pop();
        }
    }
    bool flag[505];
    bool query(int n, int id, char s[]) {
        int len = strlen(s);
        int cur = root;
        bool mark = false;
        memset(flag, 0, sizeof(flag));
        for (int i = 0; i < len; i++) {
            cur = child[cur][s[i]];
            int tmp = cur;
            while (tmp != root) {
                if (end[tmp] != -1) flag[end[tmp]] = mark = true;
                tmp = fail[tmp];
            }
        }
        if (!mark)  return false;
        printf("web %d:", id);
        for (int i = 1; i <= n; i++)
            if (flag[i])
                printf(" %d", i);
        printf("\n");
        return true;
    }
} ac;
char str[10005];
int main() {
    int n, m, ans;
    while (scanf("%d", &n) != EOF) {
        ac.init();
        for (int i = 1; i <= n; i++) {
            scanf("%s", str);
            ac.insert(i, str);
        }
        ac.setFail();
        ans = 0;
        scanf("%d", &m);
        for (int i = 1; i <= m; i++) {
            scanf("%s", str);
            ans += ac.query(n, i, str);
        }
        printf("total: %d\n", ans);
    }
    return 0;
}

接下來看一道稍微有些變化的題(雖然還是基礎水題)
【ZOJ3034】 Detect the Virus
One day, Nobita found that his computer is extremely slow. After several hours’ work, he finally found that it was a virus that made his poor computer slow and the virus was activated by a misoperation of opening an attachment of an email.
Nobita did use an outstanding anti-virus software, however, for some strange reason, this software did not check email attachments. Now Nobita decide to detect viruses in emails by himself.
To detect an virus, a virus sample (several binary bytes) is needed. If these binary bytes can be found in the email attachment (binary data), then the attachment contains the virus.
Note that attachments (binary data) in emails are usually encoded in base64. To encode a binary stream in base64, first write the binary stream into bits. Then take 6 bits from the stream in turn, encode these 6 bits into a base64 character according the following table:
That is, translate every 3 bytes into 4 base64 characters. If the original binary stream contains 3k + 1 bytes, where k is an integer, fill last bits using zero when encoding and append ‘==’ as padding. If the original binary stream contains 3k + 2 bytes, fill last bits using zero when encoding and append ‘=’ as padding. No padding is needed when the original binary stream contains 3k bytes.

Value Encoding Value Encoding
0 A 32 g
1 B 33 h
2 C 34 i
3 D 35 j
4 E 36 k
5 F 37 l
6 G 38 m
7 H 39 n
8 I 40 o
9 J 41 p
10 K 42 q
11 L 43 r
12 M 44 s
13 N 45 t
14 O 46 u
15 P 47 v
16 Q 48 w
17 R 49 x
18 S 50 y
19 T 51 z
20 U 52 0
21 V 53 1
22 W 54 2
23 X 55 3
24 Y 56 4
25 Z 57 5
26 a 58 6
27 b 59 7
28 c 60 8
29 d 61 9
30 e 62 +
31 f 63 /

For example, to encode ‘hello’ into base64, first write ‘hello’ as binary bits, that is: 01101000 01100101 01101100 01101100 01101111
Then, take 6 bits in turn and fill last bits as zero as padding (zero padding bits are marked in bold): 011010 000110 010101 101100 011011 000110 111100
They are 26 6 21 44 27 6 60 in decimal. Look up the table above and use corresponding characters: aGVsbG8
Since original binary data contains 1 * 3 + 2 bytes, padding is needed, append ‘=’ and ‘hello’ is finally encoded in base64: aGVsbG8=
Section 5.2 of RFC 1521 describes how to encode a binary stream in base64 much more detailedly:
Here is a piece of ANSI C code that can encode binary data in base64. It contains a function, encode (infile, outfile), to encode binary file infile in base64 and output result to outfile.

Input
Input contains multiple cases (about 15, of which most are small ones). The first line of each case contains an integer N (0 <= N <= 512). In the next N distinct lines, each line contains a sample of a kind of virus, which is not empty, has not more than 64 bytes in binary and is encoded in base64. Then, the next line contains an integer M (1 <= M <= 128). In the following M lines, each line contains the content of a file to be detected, which is not empty, has no more than 2048 bytes in binary and is encoded in base64.
There is a blank line after each case.
Output
For each case, output M lines. The ith line contains the number of kinds of virus detected in the ith file.
Output a blank line after each case.

Sample Input
3
YmFzZTY0
dmlydXM=
dDog
1
dGVzdDogdmlydXMu

1
QA==
2
QA==
ICAgICAgICA=
Sample Output
2

1
0

Hint
In the first sample case, there are three virus samples: base64, virus and t: , the data to be checked is test: virus., which contains the second and the third, two virus samples.

思路很簡單,先解碼,然後就是模板題了

#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#define DICNUM 256
using namespace std;
struct ACautomation {
    int cnt, root;
    int child[50000+50][DICNUM], end[50000+50], fail[50000+50];
    int Map[256];
    int newnode() {
        for (int i = 0; i < DICNUM; i++)    child[cnt][i] = -1;
        end[cnt++] = -1;
        return cnt-1;
    }
    void init() {
        cnt = 0;
        root = newnode();
        for (int i = 0; i < 26; i++)    Map[i+'A'] = i;
        for (int i = 26; i < 52; i++)   Map[i-26+'a'] = i;
        for (int i = 52; i < 62; i++)   Map[i-52+'0'] = i;
        Map['+'] = 62, Map['/'] = 63;
    }
    int tmp[5000+5], tmpl;
    void ReEncode(char s[]) {
        memset(tmp, 0, sizeof(tmp));
        tmpl = 0;
        for (int i = 0, len = 0, x = 0; s[i] && s[i] != '='; i++) {
            len += 6, x = (x<<6)|Map[s[i]];
            if (len >= 8) {
                tmp[tmpl++] = (x>>(len-8))&0xff;
                len -= 8;
            }
        }
    }
    void insert(int id, char s[]) {
        ReEncode(s);
        int cur = root;
        for (int i = 0; i < tmpl; cur = child[cur][tmp[i++]])
            if (child[cur][tmp[i]] == -1)   child[cur][tmp[i]] = newnode();
        end[cur] = id;
    }
    void setFail() {
        queue <int> que;
        fail[root] = root;
        for (int i = 0; i < DICNUM; i++) {
            if (child[root][i] == -1) {
                child[root][i] = root;
            } else {
                fail[child[root][i]] = root;
                que.push(child[root][i]);
            }
        }
        while (!que.empty()) {
            int cur = que.front();
            for (int i = 0; i < DICNUM; i++) {
                if (child[cur][i] == -1) {
                    child[cur][i] = child[fail[cur]][i];
                } else {
                    fail[child[cur][i]] = child[fail[cur]][i];
                    que.push(child[cur][i]);
                }
            }
            que.pop();
        }
    }
    bool flag[50000+50];
    int query(int id, char s[]) {
        ReEncode(s);
        int cur = root, ret = 0;
        memset(flag, 0, sizeof(flag));
        for (int i = 0; i < tmpl; i++) {
            cur = child[cur][tmp[i]];
            int t = cur;
            while (t != root) {
                if (end[t] != -1 && !flag[end[t]])  ret++, flag[end[t]] = true;
                t = fail[t];
            }
        }
        return ret;
    } 
} ACauto;
int main() {
    int n, m;
    char str[5000+5];
    while (scanf("%d", &n) != EOF) {
        ACauto.init();
        for (int i = 1; i <= n; i++) {
            scanf("%s", str);
            ACauto.insert(i, str);
        }
        ACauto.setFail();
        scanf("%d", &m);
        for (int i = 1; i <= m; i++) {
            scanf("%s", str);
            printf("%d\n", ACauto.query(i, str));
        }
        printf("\n");
    }
    return 0;
}

AC自動機的玄學博大精深,還可以和dp扯上關係,還是多練題爲好

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章