使用增強型後綴數組(ESA)的文本匹配算法

設模式串的長度爲m,文本的長度爲n。使用後綴數組做文本匹配時,如果只用後綴表suftab和折半查找,算法的複雜度爲O(m*logn);如果使用最長公共前綴表lcptab和折半查找,複雜度可以降至O(m+logn);使用增強型後綴數組(ESA)的childtab表,複雜度爲O(m)。本文使用複雜度爲O(m)的算法,在匹配之前要求先構造SA(下面採用DC3算法構造後綴數組),然後計算出增強型後綴數組的suftab、lcptab和childtab。由於通過childtab可以在O(1)時間複雜度內找到每一個lcp-interval的所有child-interval,因此這跟後綴樹的自頂向下匹配模式串的算法複雜度相當;進一步通過lcp-interval可以很容易地得到z個匹配子串的起始位置,總複雜度爲O(m+z)。注意:下面的實現要求文本以一個小於其他所有字符的結束符(如示例中的'#'或'\u0000')結尾。


實現:

import java.util.Stack;

 

/**
 * 
 * 
 * finding all z occurrences of a pattern with suftab,lcptab and childtab
 * (m: the length of pattern, n: the length of text)
 * time complexity: O(m+z)
 * 
 * (The suffix array is constructed with DC3 algorithm)
 * 
 *  
 * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) 
 * 
 * @author ljs
 * 2011-07-28
 *
 */ 
public class ESAPatternMatch {
	/** The indexed text, including its terminating sentinel character. */
	private final String text;
	/** Suffix array of {@link #text}; assigned by {@link #enhanceSA()}. */
	private int[] suftab;
	/** lcptable[i] is the LCP of the suffixes at suftab[i-1] and suftab[i]; assigned by {@link #enhanceSA()}. */
	private int[] lcptable;
	/** Child table: each slot holds one up, down or next-l-index value, or -1 when none was stored. */
	private final int[] childtab;
	/** Length of {@link #text}. */
	private final int len;
	
	/**
	 * Creates a matcher for the given text. No index is built here;
	 * call {@link #enhanceSA()} before matching.
	 *
	 * @param text the text to index; its last character must be smaller than
	 *             every other character (see {@link #DC3(String)})
	 */
	public ESAPatternMatch(String text){
		this.text = text;
		this.len = text.length();
		
		childtab=new int[len];	
		for(int i=0;i<len;i++){
			childtab[i] = -1;
		}
	}
	
	/**
	 * Largest Latin-1 character. Kept for backward compatibility only:
	 * {@link #DC3(String)} now sizes its counting-sort buckets from the
	 * characters actually present in the text.
	 */
	public static final char MAX_CHAR = '\u00FF';

	/** A suffix array together with the rank array derived from it. */
	class Suffix{
		int[] sa;  
		//Note: the p-th suffix in sa: SA[rank[p]-1];
		//p is the index of the array "rank", starting with 0;
		//a text S's p-th suffix is S[p..n], n=S.length-1.
		int[] rank; 
		//true when every rank is distinct, i.e. sa is already a complete ordering
		boolean done;
		 
		public Suffix(int[] sa,int[] rank){
			this.sa = sa;
			this.rank = rank;
		}
	}
	

	/** A prefix of suffix[isuffix], represented as a fixed-length array of digits. */
	class Tuple{
		int isuffix; //the p-th suffix
		int[] digits;
		public Tuple(int suffix,int[] digits){
			this.isuffix = suffix;
			this.digits = digits;			
		}
		@Override
		public String toString(){
			StringBuilder sb = new StringBuilder();			
			sb.append(isuffix);
			sb.append("(");
			for(int i=0;i<digits.length;i++){
				sb.append(digits[i]);
				if(i<digits.length-1)
					sb.append("-");
			}
			sb.append(")");
			return sb.toString();
		}
	}
	
	/**
	 * Stable counting sort of tA into tB on a single digit.
	 *
	 * @param d   index of the digit to sort on
	 * @param tA  input tuples
	 * @param tB  output array, same length as tA
	 * @param max every value of digit d lies in 0..max
	 */
	private void countingSort(int d,Tuple[] tA,Tuple[] tB,int max){
		//counter array; Java zero-initialises it
		int[] C = new int[max+1];
		//count occurrences of each digit value
		for(int j=0;j<tA.length;j++){
			C[tA[j].digits[d]]++;
		}
		//prefix sums: C[v] becomes the number of elements <= v
		for(int i=1;i<=max;i++){
			C[i]+=C[i-1];
		}
		//distribute from the back so that equal keys keep their input order
		for(int j=tA.length-1;j>=0;j--){
			tB[--C[tA[j].digits[d]]]=tA[j];			
		}
	}
	
	/**
	 * LSD radix sort on the last digitsLen digits of each tuple.
	 * The sorted order ends up in tB; tA is overwritten with intermediate passes.
	 */
	private void radixSort(Tuple[] tA,Tuple[] tB,int max,int digitsLen){
		int len = tA.length;
		int digitsTotalLen = tA[0].digits.length;
			
		for(int d=digitsTotalLen-1,j=0;j<digitsLen;d--,j++){
			this.countingSort(d, tA, tB, max);
			//feed the result of this pass into the next one
			if(j<digitsLen-1){
				for(int i=0;i<len;i++){
					tA[i] = tB[i];
				}		
			}
		}
	}
	
	/**
	 * Sorts the tuples and assigns ranks starting at 1; tuples whose last
	 * digitsLen digits are equal share a rank.
	 *
	 * @param max maximum value in any digit of tA (bucket count for counting sort)
	 * @param tA  input
	 * @param tB  scratch/output array, reused between passes
	 * @return the sorted order, a rank array of length len+2 (two trailing
	 *         sentinels of value 1), and whether all ranks are distinct
	 */
	private Suffix rank(Tuple[] tA,Tuple[] tB,int max,int digitsLen){		
		int len = tA.length;		
		radixSort(tA,tB,max,digitsLen);	
		
		int digitsTotalLen = tA[0].digits.length;
		
		//calculate rank and sa	
		int[] sa = new int[len];
		sa[0] = tB[0].isuffix;	
		
		int[] rank = new int[len+2]; //add 2 for sentinel	
		rank[len]=1;rank[len+1] = 1;
		int r = 1; //rank starts with 1
		rank[tB[0].isuffix] = r;		
		for(int i=1;i<len;i++){
			sa[i] = tB[i].isuffix;	
			
			boolean equalLast = true;
			for(int j=digitsTotalLen-digitsLen;j<digitsTotalLen;j++){
				if(tB[i].digits[j]!=tB[i-1].digits[j]){
					equalLast = false;
					break;
				}
			}
			if(!equalLast){
				r++;
			}
			rank[tB[i].isuffix] = r;	
		}
				 
		Suffix suffix = new Suffix(sa,rank);
		//all ranks distinct => the order is final
		suffix.done = (r==len);
		return suffix;
		
	}
	
	/** Sorts the tuples and returns only the resulting order of their suffix indices. */
	private int[] orderSuffixes(Tuple[] tA,Tuple[] tB,int max,int digitsLen){		
		int len = tA.length;		
		radixSort(tA,tB,max,digitsLen);			
		int[] sa = new int[len];
		for(int i=0;i<len;i++){
			sa[i] = tB[i].isuffix;				
		}
		return sa;		 
	}
	
	/**
	 * Builds the triples starting at positions i mod 3 == 1 followed by those
	 * at i mod 3 == 2, and ranks them.
	 *
	 * @param rank rank array carrying two trailing sentinels (length len+2)
	 */
	private Suffix reduce(int[] rank,int max){
		int len = rank.length - 2;
		
		int n1 = (len+1)/3;
		int n2 = len/3;
		Tuple[] tA = new Tuple[n1+n2];
		Tuple[] tB = new Tuple[n1+n2];
		
		for(int i=0,j=1;i<n1;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank[j+1];
			int r3 =  rank[j+2];
			tA[i] = new Tuple(i,new int[]{r1,r2,r3});
		}
		for(int i=n1,j=2;i<n1+n2;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank[j+1];
			int r3 =  rank[j+2];	 
			tA[i] = new Tuple(i,new int[]{r1,r2,r3});
		}
		 
		return rank(tA,tB,max,3);		
	}
	
	
	/**
	 * Recursive step of the suffix-array construction: order the mod-1/mod-2
	 * suffixes (recursing while ranks are not unique), order the mod-0
	 * suffixes from them, then merge the two lists.
	 *
	 * @param rank rank array carrying two trailing sentinels (length len+2)
	 * @param max  largest value stored in rank
	 * @return the suffix array of the sequence described by rank
	 */
	private int[] skew(int[] rank,int max){
		int len = rank.length - 2;
		
		//step 1: calculate sa12
		Suffix suffixT12 = reduce(rank,max);
		 
		
		int[] sa12 = null;
		if(!suffixT12.done){
			int[] rankT12 = suffixT12.rank;
			int maxT12 = rankT12[suffixT12.sa[suffixT12.sa.length-1]];
			sa12 = skew(rankT12,maxT12);
		}else{
			sa12 = suffixT12.sa;			
		}
		
		//index conversion for sa12: reduced index -> position in this level's sequence
		int n1 = (len+1)/3;
		for(int j=0;j<sa12.length;j++){
			if(sa12[j]<n1){
				sa12[j] = 1 + 3*sa12[j];
			}else{
				sa12[j] = 2 + 3*(sa12[j]-n1);
			}				
		}
		//recalculate rank for sa12
		int[] rank12 = new int[len+2];
		rank12[len] = 1;rank12[len+1] = 1;
		for(int k=0;k<sa12.length;k++){
			rank12[sa12[k]] = k+1;
		}
		 
		  
		
		//step 2: calculate sa0		
		int n0=(len+2)/3;
		Tuple[] tA = new Tuple[n0];
		Tuple[] tB = new Tuple[n0];
		for(int i=0,j=0;i<n0;i++,j+=3){
			int r1 =  rank[j];
			int r2 =  rank12[j+1]; 
			tA[i] = new Tuple(i,new int[]{r1,r2});
		}
		int max12 = rank12[sa12[sa12.length-1]];		
		int[] sa0 = orderSuffixes(tA,tB,max<max12?max12:max,2);
		//index conversion for sa0
		for(int j=0;j<n0;j++){
			sa0[j] = 3*sa0[j];					
		}		 
		
		//step 3: merge sa12 and sa0
		int[] sa = new int[len];
		int i=0,j=0;
		int k=0;
		while(i<sa12.length && j<sa0.length){
			int p = sa12[i];
			int q = sa0[j];
			if(p%3==1){
				//case 1: compare one symbol, then the rank12 of the following suffixes
				if(rank[p]<rank[q]){
					sa[k++] = p;i++;
				}else if(rank[p]>rank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank12[p+1]<rank12[q+1]){
						sa[k++] = p;i++;
					}else{
						sa[k++] = q;j++;
					}					
				}
			}else{
				//case 2: compare two symbols, then the rank12 of the following suffixes
				if(rank[p]<rank[q]){
					sa[k++] = p;i++;
				}else if(rank[p]>rank[q]){
					sa[k++] = q;j++;
				}else{
					if(rank[p+1]<rank[q+1]){
						sa[k++] = p;i++;
					}else if(rank[p+1]>rank[q+1]){
						sa[k++] = q;j++;
					}else{
						if(rank12[p+2]<rank12[q+2]){
							sa[k++] = p;i++;
						}else{
							sa[k++] = q;j++;
						}			
					}
				}
			}			
		}
		for(int m=i;m<sa12.length;m++){
			sa[k++] = sa12[m];
		}
		for(int m=j;m<sa0.length;m++){
			sa[k++] = sa0[m];
		}		
		
		return sa;		
	}
	
	/**
	 * Builds the suffix array of text.
	 * Precondition: the last char in text must be less than other chars.
	 *
	 * @param text the text, terminated by its smallest character
	 * @return the suffix array and a 1-based rank array of length text.length(),
	 *         or null when text is null or empty
	 * @throws Exception (an IllegalArgumentException) if some character is
	 *         smaller than the last character
	 */
	public Suffix DC3(String text) throws Exception{
		if(text == null)return null;
		int len = text.length();
		if(len == 0) return null;
		
		char base = text.charAt(len-1); //the smallest char
		Tuple[] tA = new Tuple[len];
		Tuple[] tB = new Tuple[len]; //placeholder
		//Size the counting-sort buckets from the text itself so that characters
		//above U+00FF (or a sentinel above U+00FF) cannot overflow them.
		int maxDelta = 0;
		for(int i=0;i<len;i++){
			int delta=text.charAt(i)-base;
			if(delta<0) throw new IllegalArgumentException("invalid input: last char must be the smallest one.");
			if(delta>maxDelta) maxDelta = delta;
			tA[i] = new Tuple(i,new int[]{0,delta});
		}
		if(len == 1){
			//a single suffix: the recursive step would build an empty tuple set
			return new Suffix(new int[]{0},new int[]{1});
		}
		Suffix suffix = rank(tA,tB,maxDelta,1);
		 
		int max = suffix.rank[suffix.sa[len-1]];
		int[] sa  = skew(suffix.rank,max);
		
		//calculate rank for result suffix array
		int[] r = new int[len];		
		for(int k=0;k<sa.length;k++){
			r[sa[k]] = k+1;
		}
		return new Suffix(sa,r);
		
	}
	
	/**
	 * Computes the LCP table for a suffix array: entry i is the length of the
	 * longest common prefix of the suffixes at sa[i-1] and sa[i]; entry 0 is 0.
	 *
	 * @param text the text
	 * @param sa   suffix array of text
	 * @param rank inverse of sa with 1-based values (rank[sa[k]] == k+1)
	 * @return the LCP table, or null when text is null or empty
	 */
	public int[] computeLCPTable(String text,int[] sa,int[] rank){
		if(text == null)return null;
		int len = text.length();
		if(len == 0) return null;
		 
		//lcpz[p]: LCP of suffix[p] and its predecessor in suffix-array order
		int[] lcpz = new int[len];
		
		int h = 0; //LCP found for suffix[p-1]
		for(int p=0;p<len;p++){
			int r = rank[p]-1;
			if(r==0){
				//the smallest suffix has no predecessor
				lcpz[p] = 0;
				h = 0;
				continue;
			}
			int q=sa[r-1];
			//the first h-1 chars are already known to match, so skip them
			int lcp = (h>1)?h-1:0;
			//extend while the chars agree; running off the end of the text
			//simply stops the extension with the length reached so far
			while(p+lcp<len && q+lcp<len 
					&& text.charAt(p+lcp)==text.charAt(q+lcp)){
				lcp++;
			}
			lcpz[p] = lcp;
			h = lcp;
		}
		
		//reorder from text order to suffix-array order
		int[] LCP = new int[len];
		for(int i=0;i<len;i++){
			LCP[i] = lcpz[sa[i]];
		}
		return LCP;
	}
 
	
	/**
	 * Fills the child table from the LCP table using two stack-based scans:
	 * the first stores up/down values, the second stores next-l-index values.
	 *
	 * @throws IllegalStateException if the LCP table has not been computed yet
	 */
	public void buildChildtab(){
		if(lcptable == null){
			throw new IllegalStateException("the LCP table is not available; call enhanceSA() first.");
		}
		//step 1: calculate up and down value
		Stack<Integer> stack = new Stack<Integer>();
		int lastIndex = -1;
		stack.push(0);
		for(int i=1;i<len;i++){ 
			while(lcptable[i]<lcptable[stack.peek()]){
				lastIndex = stack.pop();
				int next = stack.peek();
				if(lcptable[i]<=lcptable[next] 
				        && lcptable[next] != lcptable[lastIndex]){
					childtab[next] = lastIndex; //down value of next
				}
			}
			if(lastIndex != -1){
				childtab[i-1] = lastIndex; //up value of i, stored in slot i-1
				lastIndex = -1;
			}
			stack.push(i);
		}
		//process remaining elements as if a final LCP value of 0 followed
		while(0<lcptable[stack.peek()]){
			lastIndex = stack.pop();
			int next = stack.peek();
			if(0<=lcptable[next] 
			        && lcptable[next] != lcptable[lastIndex]){
				childtab[next] = lastIndex; 
			}
		}
		
		
		//step 2: calculate nextLIndex (may overwrite a slot written in step 1)
		stack.clear();
		stack.push(0);
		for(int i=1;i<len;i++){ 
			while(lcptable[i]<lcptable[stack.peek()]){
				stack.pop();				
			}
			if(lcptable[i] == lcptable[stack.peek()]){				
				lastIndex = stack.pop();
				childtab[lastIndex] = i;
			}
			stack.push(i);
		}		
	}
	//get i's up value; otherwise return -1
	private int getUpValue(int i){
		int up = -1;
		if(i>=1 && lcptable[i-1]>lcptable[i]){
			  up = childtab[i-1];//up value
		}
		return up;
	}
	//get i's down value; otherwise return -1
	private int getDownValue(int i){
		int down = -1;
		if(childtab[i]>-1 && lcptable[childtab[i]] > lcptable[i]){			
			down = childtab[i]; //down value
		}
		return down;
	}
	
	//get i's next l-index value; otherwise return -1
	private int getNextLIndexValue(int i){
		int nextLIndex = -1;		
		if(i<len-1 && getUpValue(i+1)==-1
				&& getDownValue(i)==-1){
			//the slot holds neither a down value nor an up value
			nextLIndex = childtab[i];		
		}
		return nextLIndex;
	}	 
	
	//lcp value of the lcp-interval [i..j] 
	private int getlcp(int i,int j){ 
		if(i==j) return lcptable[i];
		
		int up = -1;
		if(j+1<len){
			up = getUpValue(j+1);
		}

		if(up>i && up<=j){
			return lcptable[up];
		}else{
			int down = getDownValue(i);
			if(down>-1)
				return lcptable[down];
			else 
				return 0;
		}
	}
	//first l-index of the lcp-interval [i..j]; -1 if none can be determined
	private int getFirstLIndex(int i,int j){
		int up = -1;
		if(j+1<len){
			up = getUpValue(j+1);
		}
		int i1 = -1;
		if(up>i && up<=j){
			i1 = up;
		}else{
			i1 = getDownValue(i);
		}		
		if(i1==-1){ //accommodate: i doesn't belong to an lcp-interval
			i1 = getNextLIndexValue(i);
		}
		return i1;
	}
	//find the child-interval or singleton interval whose next char is c;
	//return null if not found.
	//[i..j] is the parent lcp-interval; result is {lcp,start,end}
	private int[] getMatchedInterval(int i,int j,char c){
		int lcp = getlcp(i,j);
		
		//A one-suffix text has no l-index at all: test the only suffix directly
		//instead of walking a child list that does not exist.
		if(i==j){
			return getMatchedChildInterval(i,j,c,lcp);
		}
		
		int[] interval = null;
		
		int i1 = getFirstLIndex(i,j);
		//first child: i..i1-1
		interval = getMatchedChildInterval(i,i1-1,c,lcp);
		
		if(interval != null)
			return interval;
		
		//middle children: consecutive l-indices delimit them
		int nextLIndex = -1;
		while((nextLIndex = getNextLIndexValue(i1)) != -1){
			int i2 = nextLIndex;		
			interval = getMatchedChildInterval(i1,i2-1,c,lcp);			
			if(interval != null)
				return interval;
			i1 = i2;
		}
		//last child: i1..j
		interval = getMatchedChildInterval(i1,j,c,lcp);	
		return interval;
	}
	//return {lcp,i,j} if the suffixes in [i..j] have c at offset lcp, else null
	private int[] getMatchedChildInterval(int i,int j,char c,int lcp){
		int[] interval = null;
		if(i==j){
			//singleton interval
			if(text.charAt(this.suftab[i]+lcp)==c){
				interval = new int[]{getlcp(i,i),i,j};
			}
		}else{			
			if(text.charAt(this.suftab[i+1]+lcp)==c){
				interval = new int[]{getlcp(i,j),i,j};
			}
		}
		return interval;
	}
	
	/**
	 * Prepares suftab, lcptab and childtab for the text given to the constructor.
	 *
	 * @throws IllegalArgumentException if the text is empty
	 * @throws Exception if the text violates the precondition of {@link #DC3(String)}
	 */
	public void enhanceSA() throws Exception{
		Suffix suffix = this.DC3(this.text);		
		if(suffix == null){
			throw new IllegalArgumentException("invalid input: text must not be empty.");
		}
		this.suftab = suffix.sa;
		int[] sufinv = suffix.rank;				
		this.lcptable = this.computeLCPTable(this.text,this.suftab,sufinv);
		this.buildChildtab();	 
	}

	//fail with a clear message instead of a NullPointerException
	private void requireEnhanced(){
		if(this.suftab == null || this.lcptable == null){
			throw new IllegalStateException("enhanceSA() must be called before matching.");
		}
	}
	
	/**
	 * Walks the pattern top-down through the lcp-intervals.
	 *
	 * @return {i,j}, the suffix-array range of all suffixes starting with
	 *         pattern, or null if there is none; pattern must be non-empty
	 */
	private int[] locate(String pattern){
		int m = pattern.length();
		int pos=0;		
		boolean found = true;
		int i=0;
		int j=len-1;
		int[] interval = null;
		
		while(found && pos<m && (interval = getMatchedInterval(i,j,pattern.charAt(pos))) != null){			
			i=interval[1];
			j=interval[2];
			if(i!=j){
				int lcp=interval[0];
				int min=(lcp<m)?lcp:m;
				found=(pattern.substring(pos,min).equals(
						text.substring(this.suftab[j]+pos,this.suftab[j]+min)));
				pos=min;	
				//if lcp<m and found, continue 
				//if lcp>=m, exit whether found or not (pos==m)
				//if lcp<m but not found, exit (found=false)
			}else{				
				int tmp=this.suftab[i]+m;
				if(tmp>len) tmp=len;
				found=(pattern.substring(pos).equals(
						text.substring(this.suftab[i]+pos,tmp)));
				pos=m; //exit whether found or not (pos==m)
			}
		}
		if(found && interval != null){
			return new int[]{i,j};
		}
		return null;
	}
	
	/**
	 * Returns the start positions of every occurrence of pattern in the text,
	 * in suffix-array (lexicographic suffix) order.
	 * Precondition: {@link #enhanceSA()} has been called.
	 *
	 * @param pattern the pattern to look for
	 * @return the start positions; empty when pattern is empty or does not occur
	 * @throws IllegalStateException if {@link #enhanceSA()} has not been called
	 */
	public int[] findOccurrences(String pattern){
		if(pattern.length()==0) return new int[0];
		requireEnhanced();
		int[] range = locate(pattern);
		if(range == null) return new int[0];
		int[] positions = new int[range[1]-range[0]+1];
		for(int z=0;z<positions.length;z++){
			positions[z] = this.suftab[range[0]+z];
		}
		return positions;
	}

	/**
	 * Prints every occurrence of pattern to standard output, or a
	 * "not found" line; prints nothing for an empty pattern.
	 * Precondition: {@link #enhanceSA()} has been called.
	 *
	 * @throws IllegalStateException if {@link #enhanceSA()} has not been called
	 */
	public void match(String pattern){	
		int m = pattern.length();
		if(m==0) return;
		requireEnhanced();
		
		int[] range = locate(pattern);
		if(range != null){
			report(pattern,range[0],range[1]);
		}else{
			System.out.format("pattern \"%s\" not found!%n",pattern);
		}
	}
	//i,j is an interval (child-interval or a singleton interval);
	//each suffix is printed without the trailing sentinel char
	private void report(String pattern,int i,int j){
		System.out.format("matched suffix(es) with pattern \"%s\": %d%n",pattern,j-i+1);
		for(int z=i;z<=j;z++){
			System.out.format("[%d:%s]%n",this.suftab[z],this.text.substring(this.suftab[z],this.text.length()-1));
		}
	}
	public static void main(String[] args) throws Exception {	
		String text = "mississippi#";
		System.out.format("Text: %s %n",text.substring(0,text.length()-1));
		ESAPatternMatch esa = new ESAPatternMatch(text);		
		esa.enhanceSA();
		esa.match("z");	
		esa.match("i");	
		esa.match("iss");	
		esa.match("issi");	
		esa.match("j");	
		esa.match("ir");	
		esa.match("ia");
		
		System.out.format("%n********************************%n");	
		text = "acaaacatat#";		
		System.out.format("Text: %s %n",text.substring(0,text.length()-1));
		esa = new ESAPatternMatch(text);	
		esa.enhanceSA();
		esa.match("ac");	
		
		System.out.format("%n********************************%n");	
		text = "After a long text, here's a needle ZZZZZ\u0000";		
		System.out.format("Text: %s %n",text.substring(0,text.length()-1));
		esa = new ESAPatternMatch(text);	
		esa.enhanceSA();
		esa.match("ZZZZZ");	
		
		System.out.format("%n********************************%n");	
		text = "The quick brown fox jumps over the lazy dog.\u0000";		
		System.out.format("Text: %s %n",text.substring(0,text.length()-1));
		esa = new ESAPatternMatch(text);	
		esa.enhanceSA();
		esa.match("lazy");	
		
		System.out.format("%n********************************%n");	
		text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...\u0000";		
		System.out.format("Text: %s %n",text.substring(0,text.length()-1));
		esa = new ESAPatternMatch(text);	
		esa.enhanceSA();
		esa.match("tempor");	
	}

}


測試:

Text: mississippi
pattern "z" not found!
matched suffix(es) with pattern "i": 4
[10:i]
[7:ippi]
[4:issippi]
[1:ississippi]
matched suffix(es) with pattern "iss": 2
[4:issippi]
[1:ississippi]
matched suffix(es) with pattern "issi": 2
[4:issippi]
[1:ississippi]
pattern "j" not found!
pattern "ir" not found!
pattern "ia" not found!

********************************
Text: acaaacatat
matched suffix(es) with pattern "ac": 2
[0:acaaacatat]
[4:acatat]

********************************
Text: After a long text, here's a needle ZZZZZ
matched suffix(es) with pattern "ZZZZZ": 1
[35:ZZZZZ]

********************************
Text: The quick brown fox jumps over the lazy dog.
matched suffix(es) with pattern "lazy": 1
[35:lazy dog.]

********************************
Text: Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...
matched suffix(es) with pattern "tempor": 1
[73:tempor incididunt ut labore et dolore magna...]




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章