.1 fst基本概念
有限狀態轉換器(Finite State Transducer,FST),即帶輸出的有限狀態機
.1-1 節點(node)和弧(arc)
node包含了進入該node的0個或者多個arc,也包含了從該node走出的0個或者多個arc
.2 lucene中的fst測試代碼
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
 * Minimal demo of Lucene's FST: builds a byte-labelled FST that maps a few
 * terms to long outputs and then looks one term up again.
 */
public class FSTTest2 {
    /**
     * Builds the FST from the sample data and prints the output stored for "stop".
     *
     * @param args unused
     * @throws IOException if building or reading the FST fails
     */
    public static void main(String[] args) throws IOException {
        // Inputs. Lucene's Builder expects keys to be added in sorted order
        // (see the Builder.add Javadoc); this sample data already is.
        String[] inputValues = {"mop", "moth", "pop", "star", "stop", "top"};
        // Outputs. Per the original author's note: inside Lucene these would be
        // the offsets of the term suffixes in the .tim file.
        long[] outputValues = {5, 7, 12, 14, 18, 20};
        // Alternative sample data kept from the original notes:
        // String[] inputValues = { "xstop", "xstopxx", "yxxxstop" };
        // long[] outputValues = { 12, 10, 6 };

        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();

        // Insert every (input, output) pair into the FST.
        for (int i = 0; i < inputValues.length; i++) {
            // The FST is BYTE1 and is queried below with Util.get(fst, BytesRef),
            // which walks the key's bytes. The key therefore has to be added as
            // its UTF-8 bytes (Util.toIntsRef over a BytesRef), not as UTF-32
            // code points (Util.toUTF32) -- the two only coincide for ASCII.
            BytesRef key = new BytesRef(inputValues[i]);
            builder.add(Util.toIntsRef(key, scratchInts), outputValues[i]);
        }

        // Freeze the builder into the compact byte-array FST.
        FST<Long> fst = builder.finish();

        // Look up the output for an input; prints "null" when the key is absent.
        Long value = Util.get(fst, new BytesRef("stop"));
        System.out.println(value);
    }
}
.3 源碼分析
.3-1 util的get方法分析
/**
 * Looks up the output stored for {@code input} in a byte-labelled FST.
 *
 * Walks the FST one input byte at a time starting from the first arc,
 * adding up the outputs found on each matched arc.
 *
 * @param fst   the FST to search; asserted to be of input type BYTE1
 * @param input the key, read from input.bytes[input.offset .. input.offset + input.length)
 * @return the accumulated output plus the final output of the last arc, or
 *         null if some byte has no matching arc or the last arc is not final
 */
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
final BytesReader fstReader = fst.getBytesReader();
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
T output = fst.outputs.getNoOutput();// initialise the running output with the "no output" value
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {// look up one input byte (masked to 0..255), i.e. find the arc carrying that label; arc is reused as both "follow" and result
return null;
}
output = fst.outputs.add(output, arc.output);// add the output carried by the matched arc
}
if (arc.isFinal()) {
return fst.outputs.add(output, arc.nextFinalOutput);
} else {
return null;
}
}
.3-2 FST的findTargetArc方法
.3-2-1 類成員變量介紹
.3-2-2方法原型:
private Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in, boolean useRootArcCache)
其中的labelToMatch是要查找的byte,一般是一個字符,follow是前一個弧,在執行中只提供自己的target,即follow的tonode的起始地址,目標arc肯定掛在這個tonode下。arc是要查找的弧,在傳入時,和follow相同,在後續的執行中該arc的內部變量會被不斷替換,最後返回的也是這個arc。
.3-2-3源碼分析
// Excerpt from the body of findTargetArc (linear-scan path only): positions the
// reader at the node that `follow` points to, then walks that node's arcs in
// order until one with label == labelToMatch is found. Returns that arc, or
// null when the label cannot be present.
in.setPosition(getNodeAddress(follow.target));
arc.node = follow.target;// the to-node of follow is the from-node of the arc being searched for
// System.out.println("fta label=" + (char) labelToMatch);
// Linear scan
readFirstRealTargetArc(follow.target, arc, in);// load the first real arc leaving follow.target; in the author's example (figure not included in these notes), starting from the leftmost dummy node this yields the arc labelled 'm'
while(true) {
//System.out.println(" non-bs cycle");
// TODO: we should fix this code to not have to create
// object for the output of every arc we scan... only
// for the matching arc, if found
if (arc.label == labelToMatch) {
//System.out.println(" found!");
return arc;// label matches: return this arc
} else if (arc.label > labelToMatch) {
// scanned past the wanted label without a match; presumably arcs are stored in ascending label order -- confirm
return null;
} else if (arc.isLast()) {
// no more arcs on this node
return null;
} else {
readNextRealArc(arc, in);// label did not match: move on to the next arc of the same node
}
}
.3-3 readFirstRealTargetArc方法
// Reads the first arc leaving `node` into `arc` and returns it.
// This method only sets up where the node's arcs start (arc.node, arc.nextArc
// and, for the fixed-array layout, numArcs/bytesPerArc/arcIdx/posArcsStart);
// the arc's remaining fields are filled in by readNextRealArc.
// (Excerpt: the method's closing brace is not part of the quoted text.)
public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException {
// node is the to-node of the previously matched arc; the next matching arc must be one of this node's arcs, so its address is used as the starting address of this lookup
final long address = getNodeAddress(node);
in.setPosition(address);
//System.out.println(" readFirstRealTargtArc address="
//+ address);
//System.out.println(" flags=" + arc.flags);
arc.node = node;// this node is the target node of the previous arc and the from-node of the arc being read (1)
if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
if (packed || version >= VERSION_VINT_TARGET) {
arc.bytesPerArc = in.readVInt();
} else {
arc.bytesPerArc = in.readInt();
}
// -1 so that the arcIdx++ in readNextRealArc lands on entry 0
arc.arcIdx = -1;
arc.nextArc = arc.posArcsStart = in.getPosition();
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
} else {
//arc.flags = b;
arc.nextArc = address;// nextArc normally means the next arc of this arc's from-node; at this set-up stage it points at the arc itself (the node's start address)
arc.bytesPerArc = 0;
}
return readNextRealArc(arc, in);// the code above initialised the arc's from-node (see (1)) and its nextArc position; the arc's other fields are assigned inside readNextRealArc
.3-4 readNextRealArc方法
// Excerpt from the body of readNextRealArc (method header not quoted):
// positions the reader at the next arc of the current node, decodes its flags,
// label, output and final output, then works out arc.target (address of the
// arc's to-node) and arc.nextArc (position of the following arc) and returns arc.
// this is a continuing arc in a fixed array
if (arc.bytesPerArc != 0) {
// arcs are at fixed entries
arc.arcIdx++;
assert arc.arcIdx < arc.numArcs;
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.arcIdx*arc.bytesPerArc);
} else {
// arcs are packed
// (original author's open question: what does "packed" mean here? presumably variable-length arcs stored back to back, as opposed to the fixed-size entries above -- confirm)
in.setPosition(arc.nextArc);// initially an arc's nextArc holds the position of the first arc of its from-node, so it can serve as this arc's start position (the real nextArc is set further below)
}
arc.flags = in.readByte();// read the flags byte
arc.label = readLabel(in);// read the label
// does this arc carry an output?
if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
arc.output = outputs.read(in);
} else {
arc.output = outputs.getNoOutput();
}
// does this arc carry a final output?
if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
arc.nextFinalOutput = outputs.readFinalOutput(in);
} else {
arc.nextFinalOutput = outputs.getNoOutput();
}
// does this arc lead to a stop node?
if (arc.flag(BIT_STOP_NODE)) {
if (arc.flag(BIT_FINAL_ARC)) {
arc.target = FINAL_END_NODE;
} else {
arc.target = NON_FINAL_END_NODE;
}
arc.nextArc = in.getPosition();
} else if (arc.flag(BIT_TARGET_NEXT)) {// does this arc use the TARGET_NEXT optimisation? (the notes explain this optimisation separately)
arc.nextArc = in.getPosition();
// TODO: would be nice to make this lazy -- maybe
// caller doesn't need the target and is scanning arcs...
// with TARGET_NEXT the remaining arcs of this node have to be skipped; the position right after all of them is the start address of this arc's to-node
if (nodeAddress == null) {
if (!arc.flag(BIT_LAST_ARC)) {
if (arc.bytesPerArc == 0) {
// must scan
seekToNextNode(in);
} else {
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.bytesPerArc * arc.numArcs);
}
}
// having skipped all arcs of the current node, the reader is at this arc's target node; from here the arcs leaving the target node can be read (those of the to-node, not this arc's sibling "next" arc)
arc.target = in.getPosition();
} else {
arc.target = arc.node - 1;
assert arc.target > 0;
}
} else {
if (packed) {
final long pos = in.getPosition();
final long code = in.readVLong();
if (arc.flag(BIT_TARGET_DELTA)) {
// Address is delta-coded from current address:
arc.target = pos + code;
//System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target);
} else if (code < nodeRefToAddress.size()) {
// Deref
arc.target = nodeRefToAddress.get((int) code);
//System.out.println(" deref code=" + code + " target=" + arc.target);
} else {
// Absolute
arc.target = code;
//System.out.println(" abs code=" + code);
}
} else {
arc.target = readUnpackedNodeTarget(in);// without TARGET_NEXT the to-node's start address is stored explicitly and is read from the stream here
}
arc.nextArc = in.getPosition();
}
return arc;
}