lucene5.1 fst源碼分析(fst接口方法寫入和讀取測試)

.1 fst基本概念

FST(Finite State Transducer,有限狀態轉換器)是有限狀態機的一種:除了識別輸入之外,每條弧上還可以攜帶輸出。

.1-1 節點(node)和弧(arc)

node包含了進入該node的0個或者多個arc,也包含了從該node走出的0個或者多個arc

.2 lucene中的fst測試代碼

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/**
 * Minimal demo of the Lucene FST API: builds an in-memory FST that maps a few
 * terms to long outputs, then looks one term up again and prints its output.
 */
public class FSTTest2 {

    /**
     * Builds the FST from the hard-coded term/output pairs and prints the
     * output stored for {@code "stop"}.
     *
     * @param args unused
     * @throws IOException if building or reading the FST fails
     */
    public static void main(String[] args) throws IOException {

        // Input terms. They are listed in ascending order; see the Builder
        // documentation for the ordering requirement on add().
        String[] inputValues = {"mop", "moth", "pop", "star", "stop", "top"};
        // Output attached to each term. Per the original author's note, in
        // Lucene this plays the role of the term's offset in the .tim file.
        long[] outputValues = {5, 7, 12, 14, 18, 20};

        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
        IntsRefBuilder scratchInts = new IntsRefBuilder();

        // Insert every (input, output) pair. The FST is declared BYTE1 and is
        // queried below with a BytesRef, so the key must be added as its UTF-8
        // bytes. Adding code points (Util.toUTF32) only matches for pure-ASCII
        // terms and would break lookups for anything else.
        for (int i = 0; i < inputValues.length; i++) {
            BytesRef termBytes = new BytesRef(inputValues[i]);
            builder.add(Util.toIntsRef(termBytes, scratchInts), outputValues[i]);
        }

        // Freeze the builder into the compact byte-array representation.
        FST<Long> fst = builder.finish();

        // Look the output up again by input.
        Long value = Util.get(fst, new BytesRef("stop"));
        System.out.println(value);
    }
}

.3 源碼分析

.3-1 util的get方法分析

/**
 * Looks up the output stored for {@code input} in a byte-labelled FST.
 *
 * Walks the FST one input byte at a time starting from the first arc,
 * summing the output found on every arc that is followed.
 *
 * @param fst   the FST to search; asserted to use {@code BYTE1} input type
 * @param input the key, read from {@code input.bytes} starting at
 *              {@code input.offset} for {@code input.length} bytes
 * @return the accumulated output including the final output of the last arc,
 *         or {@code null} if some byte has no matching arc or the last arc
 *         reached is not final
 * @throws IOException propagated from the underlying FST reads
 */
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
  assert fst.inputType == FST.INPUT_TYPE.BYTE1;

  final BytesReader reader = fst.getBytesReader();

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());

  // Running sum of the outputs seen so far; starts at the "no output" value.
  T accumulated = fst.outputs.getNoOutput();

  int consumed = 0;
  while (consumed < input.length) {
    // Labels are unsigned bytes, hence the mask.
    final int label = input.bytes[consumed + input.offset] & 0xFF;

    // The same Arc instance is passed as both "follow" and result, so it is
    // overwritten in place with the matching outgoing arc.
    final FST.Arc<T> matched = fst.findTargetArc(label, current, current, reader);
    if (matched == null) {
      return null;
    }

    accumulated = fst.outputs.add(accumulated, current.output);
    consumed++;
  }

  // Only a final arc marks the end of an accepted input.
  if (!current.isFinal()) {
    return null;
  }
  return fst.outputs.add(accumulated, current.nextFinalOutput);
}

.3-2 FST的findTargetArc方法

.3-2-1 類成員變量介紹

這裏寫圖片描述

.3-2-2方法原型:

private Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in, boolean useRootArcCache)
其中的labelToMatch是要查找的byte,一般是一個字符,follow是前一個弧,在執行中只提供自己的target,即follow的tonode的起始地址,目標arc肯定掛在這個tonode下。arc是要查找的弧,在傳入時,和follow相同,在後續的執行中該arc的內部變量會被不斷替換,最後返回的也是這個arc。

.3-2-3源碼分析

這裏寫圖片描述

    // Excerpt from findTargetArc: scans the arcs leaving follow.target one by
    // one, looking for the arc whose label equals labelToMatch. Returns that
    // arc (the same object that was passed in as `arc`) or null if none matches.
    in.setPosition(getNodeAddress(follow.target));

    // The to-node of `follow` is the from-node of the arc being searched for.
    arc.node = follow.target;

    // System.out.println("fta label=" + (char) labelToMatch);

    // Linear scan
    // Load the first real arc leaving follow.target into `arc`. (In the
    // author's example figure this is the arc labelled 'm' leaving the
    // left-most dummy node.)
    readFirstRealTargetArc(follow.target, arc, in);

    while(true) {
      //System.out.println("  non-bs cycle");
      // TODO: we should fix this code to not have to create
      // object for the output of every arc we scan... only
      // for the matching arc, if found
      if (arc.label == labelToMatch) {
        //System.out.println("    found!");
        // Label matches: this is the arc we want.
        return arc;
      } else if (arc.label > labelToMatch) {
        // Already past the wanted label; presumably arcs are stored in
        // ascending label order, so no later arc can match -- TODO confirm.
        return null;
      } else if (arc.isLast()) {
        // No more arcs on this node.
        return null;
      } else {
        // No match yet: advance to the next arc of the same node.
        readNextRealArc(arc, in);
      }
    }

.3-3 readFirstRealTargetArc方法

/**
 * Positions the reader at {@code node} and loads the first arc leaving that
 * node into {@code arc}.
 *
 * @param node the node whose first outgoing arc should be read
 * @param arc  arc instance that is overwritten with the result
 * @param in   reader over the FST bytes; its position is changed
 * @return the value returned by {@code readNextRealArc(arc, in)}
 * @throws IOException propagated from the reader
 */
public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException {
     // `node` is the to-node of the previously matched arc. The next matching
     // arc must be one of this node's arcs, so the node's address is used as
     // the starting position for this lookup.
    final long address = getNodeAddress(node);
    in.setPosition(address);
    //System.out.println("  readFirstRealTargtArc address="
    //+ address);
    //System.out.println("   flags=" + arc.flags);
    // (1) This node is the previous arc's target node and therefore the
    // from-node of the arc being read now.
    arc.node = node;

    if (in.readByte() == ARCS_AS_FIXED_ARRAY) {
      //System.out.println("  fixedArray");
      // this is first arc in a fixed-array
      arc.numArcs = in.readVInt();
      if (packed || version >= VERSION_VINT_TARGET) {
        arc.bytesPerArc = in.readVInt();
      } else {
        arc.bytesPerArc = in.readInt();
      }
      // Index starts before the first entry; arcsStart/nextArc both point
      // just past the fixed-array header.
      arc.arcIdx = -1;
      arc.nextArc = arc.posArcsStart = in.getPosition();
      //System.out.println("  bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
    } else {
      //arc.flags = b;
      // No fixed-array header. In general an arc's nextArc is the position of
      // the next arc of the same from-node; at this initial stage it is set to
      // the node's own address, i.e. where the first arc begins.
      arc.nextArc = address;
      // bytesPerArc == 0 marks "not a fixed array".
      arc.bytesPerArc = 0;
    }

    // Everything above initialised the arc's from-node (see (1)) and its
    // nextArc position; the remaining fields of the arc are filled in by
    // readNextRealArc.
    return readNextRealArc(arc, in);

.3-4 readNextRealArc方法

// Body of readNextRealArc (method header not shown in this excerpt): reads the
// arc located at the position implied by `arc` (next fixed-array slot, or
// arc.nextArc) and overwrites arc's flags, label, output, nextFinalOutput,
// target and nextArc with what was read. Returns the same `arc` instance.
// this is a continuing arc in a fixed array
    if (arc.bytesPerArc != 0) {
      // arcs are at fixed entries
      arc.arcIdx++;
      assert arc.arcIdx < arc.numArcs;
      in.setPosition(arc.posArcsStart);
      in.skipBytes(arc.arcIdx*arc.bytesPerArc);
    } else {
      // arcs are packed, i.e. not laid out as fixed-size entries (bytesPerArc == 0)
      // Start reading at arc.nextArc. NOTE(review): right after
      // readFirstRealTargetArc this is the position of the node's first arc;
      // it is updated further down once this arc has been read.
      in.setPosition(arc.nextArc);
    }
    // Read the flags byte, then the label.
    arc.flags = in.readByte();
    arc.label = readLabel(in);

    // Does this arc carry an output?
    if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
      arc.output = outputs.read(in);
    } else {
      arc.output = outputs.getNoOutput();
    }
    // Does this arc carry a final output?
    if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
      arc.nextFinalOutput = outputs.readFinalOutput(in);
    } else {
      arc.nextFinalOutput = outputs.getNoOutput();
    }
    // Does this arc lead to a stop node?
    if (arc.flag(BIT_STOP_NODE)) {
      if (arc.flag(BIT_FINAL_ARC)) {
        arc.target = FINAL_END_NODE;
      } else {
        arc.target = NON_FINAL_END_NODE;
      }
      arc.nextArc = in.getPosition();
    } else if (arc.flag(BIT_TARGET_NEXT)) {
      // This arc uses the "target next" optimisation: no target is read from
      // the stream for it.
      arc.nextArc = in.getPosition();
      // TODO: would be nice to make this lazy -- maybe
      // caller doesn't need the target and is scanning arcs...
      // With target-next, the remaining arcs of the current node have to be
      // skipped; the position reached after skipping all of them is the start
      // address of this arc's to-node.
      if (nodeAddress == null) {
        if (!arc.flag(BIT_LAST_ARC)) {
          if (arc.bytesPerArc == 0) {
            // must scan
            seekToNextNode(in);
          } else {
            in.setPosition(arc.posArcsStart);
            in.skipBytes(arc.bytesPerArc * arc.numArcs);
          }
        }
        // All arcs of the current node have been skipped, so the reader now
        // sits at the target node of this arc. Reading from here yields the
        // arcs of the to-node (not the sibling "next" arc).
        arc.target = in.getPosition();
      } else {
        // nodeAddress is available: the target is taken as arc.node - 1.
        arc.target = arc.node - 1;
        assert arc.target > 0;
      }
    } else {
      if (packed) {
        final long pos = in.getPosition();
        final long code = in.readVLong();
        if (arc.flag(BIT_TARGET_DELTA)) {
          // Address is delta-coded from current address:
          arc.target = pos + code;
          //System.out.println("    delta pos=" + pos + " delta=" + code + " target=" + arc.target);
        } else if (code < nodeRefToAddress.size()) {
          // Deref
          arc.target = nodeRefToAddress.get((int) code);
          //System.out.println("    deref code=" + code + " target=" + arc.target);
        } else {
          // Absolute
          arc.target = code;
          //System.out.println("    abs code=" + code);
        }
      } else {
        // No target-next optimisation: the to-node address is read explicitly
        // from the stream via readUnpackedNodeTarget.
        arc.target = readUnpackedNodeTarget(in);
      }
      arc.nextArc = in.getPosition();
    }
    return arc;
  }
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章