字符串匹配,KMP瞭解一下

在這裏插入圖片描述
​ 如何判斷一個字符串是不是另一個字符串的子串,我們第一反應就是indexOf或includes或者用正則,雖然沒有什麼不對,但是還是需要了解一下字符串匹配是怎麼匹配的。在計算機科學中,Knuth-Morris-Pratt字符串查找算法(簡稱爲KMP算法)可在一個主文本字符串S內查找一個詞W的出現位置。此算法通過運用對這個詞在不匹配時本身就包含足夠的信息來確定下一個匹配將在哪裏開始的發現,從而避免重新檢查先前匹配的字符。(尷尬了,大一時學的算法,不用就全忘了┭┮﹏┭┮)

​ 設母串 $S = S_0 \dots S_n$,子串 $T = T_0 \dots T_m$。在傳統的字符串匹配算法中,當母串S與子串T在i和j點失配時,即 $S_i \neq T_j$,此時i回溯到i - j + 1處(i = i - j + 1),j回溯到0處(j = 0)繼續匹配,由此可見傳統匹配算法效率不高,複雜度爲O(n * m)。

​ 而在KMP算法中,當母串S與子串T在i和j點失配時,i不需要回溯,j只需要回溯到某一個特定位置即可,整體複雜度爲O(n + m)。那j需要回溯到哪呢?我們定義一個next數組,令next[j] = k表示當母串S和子串T在i和j點失配時,即 $S_i \neq T_j$,j需要回溯到k這個位置(k < j)。接下來我們來討論如何確定k點,並求出next數組。

​ 若母串S和子串T在i和j點失配,即 $S_i \neq T_j$,此時必有 $S_{i-j} \dots S_{i-1} = T_0 \dots T_{j-1}$,因此對於任意的 $0 \le k < j$,有 $S_{i-k} \dots S_{i-1} = T_{j-k} \dots T_{j-1}$。若同時有 $S_{i-k} \dots S_{i-1} = T_0 \dots T_{k-1}$,則有 $T_0 \dots T_{k-1} = T_{j-k} \dots T_{j-1}$,因此next[j] = k(取滿足該條件的最大的k)。

​ 對於next[j + 1],若 $T_k = T_j$,則必有 $T_0 \dots T_{k-1}T_k = T_{j-k} \dots T_{j-1}T_j$,因此next[j + 1] = k + 1 = next[j] + 1。若 $T_k \neq T_j$,顯然 $T_0 \dots T_{k-1}T_k \neq T_{j-k} \dots T_{j-1}T_j$,此時令母串 $S' = \dots T_{j-k} \dots T_{j-1}T_j \dots$,子串 $T' = T_0 \dots T_{k-1}T_k \dots$,就相當於母串 $S'$ 和子串 $T'$ 在j和k點失配。由next數組的定義,next[k] = k',即 $T_0 \dots T_{k'-1} = T_{k-k'} \dots T_{k-1} = T_{j-k'} \dots T_{j-1}$。此時若有 $T_{k'} = T_j$,則 $T_0 \dots T_{k'-1}T_{k'} = T_{j-k'} \dots T_{j-1}T_j$,因此next[j + 1] = k' + 1 = next[k] + 1;若 $T_{k'} \neq T_j$,則需要重複上面的步驟去尋找更小的k'' = next[k'],若 $T_{k''} = T_j$,則next[j + 1] = k'' + 1 = next[k'] + 1;若 $T_{k''} \neq T_j$,繼續尋找k''',直到到達next[0] = -1爲止,此時next[j + 1] = 0。

​ C++代碼實現

#include <cstdio>
#include <cstring>
#include <iostream>
using namespace std;

// Number of entries in the shared failure-table buffer below. make_next()
// writes indices 0..strlen(t), so a pattern used with this buffer must be
// shorter than max_size characters.
const int max_size = 100000;
// Zero-initialized scratch buffer that main() passes to kmp()/kmp_count() as
// their `next` argument.
int Next[max_size] = {};

/**
 * Builds the KMP failure table for the pattern `t`.
 *
 * On return next[0] == -1 and, for 1 <= j <= strlen(t), next[j] is the length
 * of the longest proper prefix of t[0..j-1] that is also a suffix of it, i.e.
 * the pattern index to resume from after a mismatch at pattern position j.
 *
 * @param t     NUL-terminated pattern.
 * @param next  Output array; must have room for strlen(t) + 1 entries.
 */
void make_next(char* t, int* next) {
  // Measure the pattern once. Calling strlen() in the loop condition rescans
  // the whole pattern on every iteration (quadratic time) and compares a
  // signed index against an unsigned size_t.
  const int len = static_cast<int>(std::strlen(t));

  int j = 0;   // position in the pattern whose successor entry is being filled
  int k = -1;  // candidate prefix length for position j; -1 means "none left"
  next[0] = -1;

  while (j < len) {
    if (k == -1 || t[j] == t[k]) {
      // The prefix of length k extends by one character: next[j + 1] = k + 1.
      ++j;
      ++k;
      next[j] = k;
    } else {
      // Fall back to the next shorter prefix that is also a suffix.
      k = next[k];
    }
  }
}

/**
 * Finds the first occurrence of pattern `t` in text `s` using KMP.
 *
 * @param s     NUL-terminated text to search in.
 * @param t     NUL-terminated pattern to search for.
 * @param next  Scratch array with room for strlen(t) + 1 entries; it is
 *              overwritten with the failure table of `t`.
 * @return Index in `s` where the first match starts, 0 when `t` is empty,
 *         or -1 when `t` does not occur in `s`.
 */
int kmp(char* s, char* t, int* next) {
  make_next(t, next);

  // Measure both strings once; calling strlen() on every iteration would make
  // the search quadratic instead of O(n + m).
  const int text_len = static_cast<int>(std::strlen(s));
  const int pat_len = static_cast<int>(std::strlen(t));

  // An empty pattern matches at offset 0 (the convention of strstr and
  // std::string::find). The loop below cannot report that: it would return 1
  // for non-empty text and -1 for empty text.
  if (pat_len == 0) return 0;

  int i = 0;  // position in the text; never moves backwards
  int j = 0;  // position in the pattern; -1 means "restart at the next text char"
  while (i < text_len) {
    if (j == -1 || s[i] == t[j]) {
      ++i;
      ++j;
    } else {
      j = next[j];  // mismatch at i: keep i, fall back inside the pattern
    }
    if (j == pat_len) return i - j;  // whole pattern matched, ending at i - 1
  }
  return -1;
}

/**
 * Counts the occurrences of pattern `t` in text `s` using KMP. Overlapping
 * occurrences are counted, because after a full match the search resumes
 * from next[strlen(t)] rather than from the start of the pattern.
 *
 * @param s     NUL-terminated text to search in.
 * @param t     NUL-terminated pattern; should be non-empty (the count for an
 *              empty pattern is not meaningful).
 * @param next  Scratch array with room for strlen(t) + 1 entries; it is
 *              overwritten with the failure table of `t`.
 * @return Number of positions in `s` at which `t` occurs.
 */
int kmp_count(char* s, char* t, int* next) {
  make_next(t, next);

  // Measure both strings once; calling strlen() on every iteration would make
  // the scan quadratic instead of O(n + m).
  const int text_len = static_cast<int>(std::strlen(s));
  const int pat_len = static_cast<int>(std::strlen(t));

  int i = 0;  // position in the text; never moves backwards
  int j = 0;  // position in the pattern; -1 means "restart at the next text char"
  int v = 0;  // matches found so far
  while (i < text_len) {
    if (j == -1 || s[i] == t[j]) {
      ++i;
      ++j;
    } else {
      j = next[j];  // mismatch at i: keep i, fall back inside the pattern
    }
    if (j == pat_len) {
      ++v;
      j = next[j];  // reuse the matched suffix so overlapping matches are found
    }
  }
  return v;
}

int main() { 
	memset(Next, 0, sizeof(Next));
	char* s = const_cast<char*>("abcabacabaa");
	char* t = const_cast<char*>("aba");
	int i = kmp(s, t, Next);
	cout<<i<<endl;
	memset(Next, 0, sizeof(Next));
	int v = kmp_count(s, t, Next);
	cout<<v<<endl;
}

JavaScript實現

/**
 * Builds the KMP failure table for pattern `t`.
 *
 * The result has t.length + 1 entries: entry 0 is -1, and entry j (j >= 1) is
 * the length of the longest proper prefix of t.slice(0, j) that is also a
 * suffix of it.
 *
 * @param {string} t pattern
 * @returns {number[]} failure table
 */
function makeNext(t) {
  const table = [-1];
  let prefixLen = -1; // -1 means no shorter prefix is left to try
  let pos = 0;
  while (pos < t.length) {
    if (prefixLen !== -1 && t[prefixLen] !== t[pos]) {
      // Fall back to the next shorter prefix that is also a suffix.
      prefixLen = table[prefixLen];
      continue;
    }
    // The current prefix extends by one character: table[pos + 1] = prefixLen + 1.
    pos += 1;
    prefixLen += 1;
    table.push(prefixLen);
  }
  return table;
}

/**
 * Finds the first occurrence of pattern `t` in text `s` using KMP.
 *
 * @param {string} s text to search in
 * @param {string} t pattern to search for
 * @returns {number} index in `s` where the first match starts, 0 when `t` is
 *   empty, or -1 when `t` does not occur in `s`
 */
function kmp(s, t) {
  // An empty pattern matches at offset 0 (same convention as
  // String.prototype.indexOf). The loop below cannot report that: it would
  // return 1 for non-empty text and -1 for empty text.
  if (t.length === 0) return 0;

  const next = makeNext(t);
  let i = 0; // position in the text; never moves backwards
  let j = 0; // position in the pattern; -1 means "restart at the next text char"
  while (i < s.length) {
    if (j === -1 || s[i] === t[j]) {
      i++;
      j++;
    } else {
      j = next[j]; // mismatch at i: keep i, fall back inside the pattern
    }
    if (j === t.length) return i - j; // whole pattern matched, ending at i - 1
  }
  return -1;
}

/**
 * Counts the occurrences of pattern `t` in text `s` using KMP. After a full
 * match the search resumes from the failure-table entry for the full pattern
 * length, so overlapping occurrences are counted too.
 *
 * @param {string} s text to search in
 * @param {string} t pattern to search for
 * @returns {number} number of matches found
 */
function kmpCount(s, t) {
  const table = makeNext(t);
  let textPos = 0;
  let patPos = 0;
  let matches = 0;
  while (textPos < s.length) {
    if (patPos !== -1 && s[textPos] !== t[patPos]) {
      // Mismatch: keep the text position, fall back inside the pattern.
      patPos = table[patPos];
    } else {
      patPos += 1;
      textPos += 1;
    }
    if (patPos === t.length) {
      matches += 1;
      patPos = table[patPos];
    }
  }
  return matches;
}

// Demo: search for "ba" in a sample text.
let s = "ababababacadababa";
let t = "ba";
// Index of the first match: prints 1.
console.log(kmp(s, t));
// Number of matches (at indices 1, 3, 5, 7, 13 and 15): prints 6.
console.log(kmpCount(s, t));
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章