關於分詞算法——雙數組Trie樹(Double-Array Trie)的Java實現

首先要聲明的是,這個代碼參考過一個C++的實現;不過它實在寫得過於煩瑣,充斥着模板代碼和STL的使用。幸好十年前摸過兩年C/C++,否則還真不知道它在幹什麼。可惜這個實現有一個致命的缺點:字典必須先生成後才能使用,無法做動態擴展;若要動態加入一個新詞,性能代價是致命的。程序的工作模式是:
1. 通過build()函數,把所有的詞生成數據,
2. 然後通過save()函數保存數據。
3. 使用的時候就可以用load()載入數據。

 

public class DoubelArrayTrie{

    // Node information: two parallel arrays indexed by slot position.
    // A child reached from base offset b via code c lives at slot b + c and
    // stores b in checkArray; baseArray holds either the next base offset or,
    // for terminal slots, the negative value -(index) - 1.
    private int            baseArray[];
    private int            checkArray[];
  
    // Records which base offsets have already been handed out by insert().
    private boolean        usedArray[];

    // Position from which insert() starts scanning for a free slot.
    private int            nextCheckPos;
    // One past the highest slot index written so far (updated in insert()).
    private int            writeSize = 0;

    public void build(List<char[]> wordList, PreProcess process) {
        if (wordList == null) {
            return;
        }
        int size = wordList.size();
        if (size > 0) {
            List<Element> elements = null;
            if (process != null) {
                elements = process.process(wordList);
            } else {
                elements = new ArrayList<Element>(wordList.size());
                for (char[] cs : wordList) {
                    elements.add(new GenericElement(cs));
                }
            }
            Collections.sort(elements, new CharArrayComparator<Element>());
            resize(1);
            baseArray[0] = 1;
            nextCheckPos = 0;
            Node root_node = new Node();
            root_node.left = 0;
            root_node.right = size;
            root_node.depth = 0;
            List<Node> siblings = createSiblings();
            fetch(elements, root_node, siblings);
            insert(elements, siblings);
            size = size + (1 << 8 * 2) + 1;
            if (size > usedArray.length) {
                resize(size);
            }

        }
    }

    private int insert(List<Element> elements, List<Node> siblings) {
        int begin = 0;
        int nonZeroCount = 0;
        boolean first = false;

        int pos = (siblings.get(0).code + 1 > nextCheckPos ? siblings.get(0).code + 1 : nextCheckPos) - 1;
        if (pos >= usedArray.length) {
            resize(pos + 1);
        }
        while (true) {
            pos++;

            if (pos >= usedArray.length) {
                resize(pos + 65535);
            }
            if (checkArray[pos] != 0) {
                nonZeroCount++;
                continue;
            } else if (!first) {
                nextCheckPos = pos;
                first = true;
            }
            begin = pos - siblings.get(0).code;

            int t = begin + siblings.get(siblings.size() - 1).code;
            if (t > usedArray.length) {
                resize(t + 65535);
            }

            if (usedArray[begin]) {
                continue;
            }
            boolean flag = false;
            for (int i = 1; i < siblings.size(); i++) {
                if (checkArray[begin + siblings.get(i).code] != 0) {
                    flag = true;
                    break;
                }
            }
            if (!flag) break;
        }

        if (1.0 * nonZeroCount / (pos - nextCheckPos + 1) >= 0.95) {
            nextCheckPos = pos;
        }
        usedArray[begin] = true;
        writeSize = Math.max(writeSize, begin + siblings.get(siblings.size() - 1).code + 1);
        for (Node node : siblings) {
            checkArray[begin + node.code] = begin;
        }

        for (Node node : siblings) {
            List<Node> newSiblings = createSiblings();
            if (fetch(elements, node, newSiblings) == 0) {
                baseArray[begin + node.code] = -node.left - 1;
               

            } else {
                int ins = insert(elements, newSiblings);
                baseArray[begin + node.code] = ins;
            }

        }

        return begin;
    }

    private List<Node> createSiblings() {
        return new ArrayList<Node>();
    }

    private void resize(int size) {
        // checkArray array
        int tmp[] = new int[size];
        if (baseArray != null) {
            System.arraycopy(baseArray, 0, tmp, 0, baseArray.length);
        }
        baseArray = tmp;

        // baseArray array
        int tmp1[] = new int[size];
        if (checkArray != null) {
            System.arraycopy(checkArray, 0, tmp1, 0, checkArray.length);
        }
        checkArray = tmp1;

        // usedArray array
        boolean tmp2[] = new boolean[size];
        if (usedArray != null) {
            System.arraycopy(usedArray, 0, tmp2, 0, usedArray.length);
        }
        usedArray = tmp2;

     

    }

    private int fetch(List<Element> words, Node parent, List<Node> siblings) {
        int prev = 0;
        Node preNode = null;
        for (int i = parent.left; i < parent.right; i++) {
            char word[] = words.get(i).getChars();
            int len = word.length;
            if (len < parent.depth) {
                continue;
            }
            int cur = 0;
            if (len != parent.depth) {
                cur = word[parent.depth] + 1;
            }

            if (prev > cur) {
                throw new RuntimeException("Fatal: sort dictionary first.\n");
            }
            if (cur != prev || siblings.size() == 0) {
                Node tmpNode = new Node();
                tmpNode.depth = parent.depth + 1;
                tmpNode.code = cur; // 重新計算每個字的映射?
                tmpNode.left = i;
                if (len == parent.depth + 1) {
                    tmpNode.frequence = words.get(i).getFrequence();
                }
                if (preNode != null) {
                    preNode.right = i;
                }
                preNode = tmpNode;
                siblings.add(tmpNode);
            }
            prev = cur;
        }

        if (preNode != null) {
            preNode.right = parent.right;
        }
        return siblings.size();
    }

    public void save(String file) throws IOException {
        DataOutputStream out = null;
        int dsize = checkArray.length;
        try {
            out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
            out.writeInt(dsize);
            for (int i = 0; i < dsize; i++) {
                out.writeInt(checkArray[i]);
                out.writeInt(baseArray[i]);
             
            }
            out.close();
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    public void load(String fileName) throws IOException {
        File file = new File(fileName);
        DataInputStream is = null;
        try {
            is = new DataInputStream(new BufferedInputStream(new FileInputStream(file), 1024 * 1024));
            load(is);
        } finally {
            if (is != null) is.close();
        }
    }

    public void load(InputStream in) throws IOException {
        DataInputStream is = new DataInputStream(new BufferedInputStream(in, 1024 * 1024));
        int size = is.readInt();
        checkArray = new int[size];
        baseArray = new int[size];
     
        for (int i = 0; i < size; i++) {
            checkArray[i] = is.readInt();
            baseArray[i] = is.readInt();
          
        }
     
    }

    public int search(String key) {
        return search(key.toCharArray(), 0, key.length());
    }

    public int search(char key[], int pos, int len) {
        if (len == 0) {
            len = key.length;
        }
        int b = baseArray[0];
        int p;
        for (int i = pos; i < len; i++) {

            p = b + key[i] + 1;
            if (b == checkArray[p]) {
                b = baseArray[p];
            } else {
                return -1;
            }
        }
        p = b;
        int n = baseArray[p];
        if (b == checkArray[p] && n < 0) {
            return -n - 1;
        }
        return -1;
    }

   

    public List<Word> prefixSearch(char[] key, int pos, int len) {
        int p, n, i, b = baseArray[0];
        List<Word> result = new ArrayList<Word>();
        for (i = pos; i < len; ++i) {
            p = b; // + 0;
            n = baseArray[p];
            if (b == checkArray[p] && n < 0) {
                Word w = new Word();
                w.position = -n - 1;
                w.begin = pos;
                w.length = i - pos;
             
                result.add(w);
            }
            p = b + (key[i]) + 1;
            if (b == checkArray[p]) {
                b = baseArray[p];
            } else {
                return result;
            }
        }
        p = b;
        n = baseArray[p];
        if (b == checkArray[p] && n < 0) {
            Word w = new Word();
            w.position = -n - 1;
            w.begin = pos;
            w.length = i - pos;
          
            result.add(w);
        }

        return result;
    }

    public Word prefixSearchMax(char[] key, int pos, int len) {
        int p, n, i, b = baseArray[0];
        Word w = null;
        for (i = pos; i < pos + len; ++i) {
            p = b; // + 0;
            n = baseArray[p];
            if (b == checkArray[p] && n < 0) {
                if (w == null) {
                    w = new Word();
                }
                w.position = -n - 1;
                w.begin = pos;
                w.length = i - pos;
              
            }
            p = b + (key[i]) + 1;
            if (b == checkArray[p]) {
                b = baseArray[p];
            } else {
                return w;
            }
        }
        p = b;
        n = baseArray[p];
        if (b == checkArray[p] && n < 0) {
            if (w == null) {
                w = new Word();
            }
            w.position = -n - 1;
            w.begin = pos;
            w.length = i - pos;
         
        }
        return w;
    }
 
//字符數組比較子
public class CharArrayComparator<T> implements Comparator<T> {

    public int compare(T o1, T o2) {
        char[] a = ((Element) o1).getChars();
        char[] b = ((Element) o2).getChars();
        int loop = a.length > b.length ? b.length : a.length;
        for (int i = 0; i < loop; i++) {
            int c = a[i] - b[i];
            if (c != 0) {
                return c;
            }
        }
        return a.length - b.length;
    }
}

//在生成數據前, 這個接口實現了特定的數據處理
public interface PreProcess {
    public  List<Element> process(List<char[]> lines);
}

裏面還用到一些簡單的數據結構,當然這些都不是必需的;我是爲了自己的業務需求,才實現了一些特定的數據結構。另外,這個版本已經刪除了我的業務代碼,所以可能無法直接通過編譯。但是,所有BUG已經被修正了。

 
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章