1. 通過 build() 函數,把所有的詞生成數據;
2. 然後通過 save() 函數保存數據;
3. 使用的時候,就可以用 load() 載入數據。
public class DoubelArrayTrie{
// Node information: the two parallel arrays that make up the double-array trie.
// baseArray[i] is the offset ("begin") of node i's children as returned by insert(),
// or, for a node without children, -(index of the word in the sorted list) - 1.
private int baseArray[];
// checkArray[i] is the "begin" offset of the sibling group that occupies slot i;
// insert() treats a value of 0 as "slot is free".
private int checkArray[];
// Records which "begin" offsets have already been handed out to a sibling group.
private boolean usedArray[];
// Position from which insert() starts scanning for a free slot.
private int nextCheckPos;
// One past the highest slot index written so far by insert().
private int writeSize = 0;
/**
 * Builds the trie arrays from a list of words.
 *
 * <p>The words are converted to {@code Element}s (through {@code process} when one is
 * given, otherwise by wrapping each word in a {@code GenericElement}), sorted with
 * {@code CharArrayComparator}, and inserted recursively starting from a root node that
 * spans the whole sorted list. A {@code null} or empty word list leaves the trie untouched.
 *
 * <p>Any arrays left over from a previous {@code build()} or {@code load()} are discarded
 * first, so the method can be called on an instance that already holds data.
 *
 * @param wordList the words to index; may be {@code null}
 * @param process  optional pre-processing step producing the elements; may be {@code null}
 */
public void build(List<char[]> wordList, PreProcess process) {
    if (wordList == null) {
        return;
    }
    int size = wordList.size();
    if (size <= 0) {
        return;
    }

    List<Element> elements;
    if (process != null) {
        elements = process.process(wordList);
    } else {
        elements = new ArrayList<Element>(size);
        for (char[] cs : wordList) {
            elements.add(new GenericElement(cs));
        }
    }
    Collections.sort(elements, new CharArrayComparator<Element>());
    if (elements.isEmpty()) {
        // The pre-processor filtered everything out; there is nothing to insert.
        return;
    }

    // Drop any previous state. resize() copies the old arrays into the new ones, so
    // calling resize(1) on top of larger, already-populated arrays would fail.
    baseArray = null;
    checkArray = null;
    usedArray = null;
    writeSize = 0;

    resize(1);
    baseArray[0] = 1;
    nextCheckPos = 0;

    Node rootNode = new Node();
    rootNode.left = 0;
    // The root must span the elements actually produced: a pre-processor may return
    // a different number of entries than the raw word list contains.
    rootNode.right = elements.size();
    rootNode.depth = 0;

    List<Node> siblings = createSiblings();
    fetch(elements, rootNode, siblings);
    insert(elements, siblings);

    // Keep at least (word count + 2^16 + 1) slots allocated.
    size = size + (1 << 8 * 2) + 1;
    if (size > usedArray.length) {
        resize(size);
    }
}
/**
 * Places one group of sibling nodes into the arrays and recursively inserts their children.
 *
 * <p>Scans forward for an offset {@code begin} such that {@code begin} is not yet used and
 * every slot {@code begin + sibling.code} is free ({@code checkArray} value 0), growing the
 * arrays as needed. Each sibling slot then gets {@code checkArray = begin}; its
 * {@code baseArray} entry becomes either the offset of its own children or, when it has
 * none, {@code -node.left - 1}.
 *
 * @param elements the sorted elements being indexed
 * @param siblings the non-empty sibling group to place, in ascending code order
 * @return the offset ({@code begin}) chosen for this sibling group
 */
private int insert(List<Element> elements, List<Node> siblings) {
    final int firstCode = siblings.get(0).code;
    final int lastCode = siblings.get(siblings.size() - 1).code;

    int begin = 0;
    int nonZeroCount = 0;
    boolean first = false;
    int pos = Math.max(firstCode + 1, nextCheckPos) - 1;
    if (pos >= usedArray.length) {
        resize(pos + 1);
    }
    while (true) {
        pos++;
        if (pos >= usedArray.length) {
            resize(pos + 65535);
        }
        if (checkArray[pos] != 0) {
            nonZeroCount++;
            continue;
        } else if (!first) {
            // Remember the first free slot seen in this call as the next scan start.
            nextCheckPos = pos;
            first = true;
        }
        begin = pos - firstCode;
        int t = begin + lastCode;
        // Slot t itself is read and written below, so the arrays must be strictly
        // longer than t; the previous "t > length" test missed the t == length case.
        if (t >= usedArray.length) {
            resize(t + 65535);
        }
        if (usedArray[begin]) {
            continue;
        }
        boolean flag = false;
        for (int i = 1; i < siblings.size(); i++) {
            if (checkArray[begin + siblings.get(i).code] != 0) {
                flag = true;
                break;
            }
        }
        if (!flag) {
            break;
        }
    }
    // When at least 95% of the scanned range was occupied, start the next scan from here.
    if (1.0 * nonZeroCount / (pos - nextCheckPos + 1) >= 0.95) {
        nextCheckPos = pos;
    }
    usedArray[begin] = true;
    writeSize = Math.max(writeSize, begin + lastCode + 1);
    for (Node node : siblings) {
        checkArray[begin + node.code] = begin;
    }
    for (Node node : siblings) {
        List<Node> newSiblings = createSiblings();
        if (fetch(elements, node, newSiblings) == 0) {
            baseArray[begin + node.code] = -node.left - 1;
        } else {
            // Keep the result in a local: the recursive call may resize (and thereby
            // replace) baseArray, so the array must be dereferenced only afterwards.
            int ins = insert(elements, newSiblings);
            baseArray[begin + node.code] = ins;
        }
    }
    return begin;
}
/**
 * Creates the empty, mutable list that fetch() fills with sibling nodes.
 *
 * @return a new empty list
 */
private List<Node> createSiblings() {
    List<Node> emptySiblings = new ArrayList<Node>();
    return emptySiblings;
}
/**
 * Reallocates baseArray, checkArray and usedArray to {@code size} entries, copying the
 * full previous contents of each array (when it exists) to the front of its replacement.
 *
 * @param size the new length of all three arrays
 */
private void resize(int size) {
    // baseArray
    int[] grownBase = new int[size];
    if (baseArray != null) {
        System.arraycopy(baseArray, 0, grownBase, 0, baseArray.length);
    }
    baseArray = grownBase;

    // checkArray
    int[] grownCheck = new int[size];
    if (checkArray != null) {
        System.arraycopy(checkArray, 0, grownCheck, 0, checkArray.length);
    }
    checkArray = grownCheck;

    // usedArray
    boolean[] grownUsed = new boolean[size];
    if (usedArray != null) {
        System.arraycopy(usedArray, 0, grownUsed, 0, usedArray.length);
    }
    usedArray = grownUsed;
}
/**
 * Collects the child nodes of {@code parent} into {@code siblings}.
 *
 * <p>Walks the words in {@code [parent.left, parent.right)} and groups them by the
 * character at index {@code parent.depth}. A word that ends exactly at that depth gets
 * code 0; otherwise the code is the character value plus one. Each distinct code yields
 * one node whose {@code [left, right)} range covers the words sharing it.
 *
 * @param words    the sorted elements
 * @param parent   the node whose children are collected
 * @param siblings output list receiving the child nodes in ascending code order
 * @return the number of nodes in {@code siblings}
 * @throws RuntimeException if the codes are not in ascending order (unsorted input)
 */
private int fetch(List<Element> words, Node parent, List<Node> siblings) {
    int previousCode = 0;
    Node lastSibling = null;
    for (int index = parent.left; index < parent.right; index++) {
        char[] chars = words.get(index).getChars();
        int length = chars.length;
        if (length < parent.depth) {
            // Word is too short to reach this depth.
            continue;
        }

        // Code 0 marks "word ends here"; real characters are shifted up by one.
        int code = (length == parent.depth) ? 0 : chars[parent.depth] + 1;
        if (previousCode > code) {
            throw new RuntimeException("Fatal: sort dictionary first.\n");
        }

        if (siblings.isEmpty() || code != previousCode) {
            Node sibling = new Node();
            sibling.depth = parent.depth + 1;
            sibling.code = code;
            sibling.left = index;
            if (length == parent.depth + 1) {
                sibling.frequence = words.get(index).getFrequence();
            }
            // The previous sibling's range ends where this one starts.
            if (lastSibling != null) {
                lastSibling.right = index;
            }
            lastSibling = sibling;
            siblings.add(sibling);
        }
        previousCode = code;
    }
    if (lastSibling != null) {
        lastSibling.right = parent.right;
    }
    return siblings.size();
}
/**
 * Writes the trie to a file: the array length as an int, followed by the
 * (check, base) int pair of every slot in index order.
 *
 * @param file path of the file to write
 * @throws IOException if the file cannot be opened or written
 */
public void save(String file) throws IOException {
    final int slotCount = checkArray.length;
    DataOutputStream out = null;
    try {
        out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
        out.writeInt(slotCount);
        int slot = 0;
        while (slot < slotCount) {
            out.writeInt(checkArray[slot]);
            out.writeInt(baseArray[slot]);
            slot++;
        }
        out.close();
    } finally {
        if (out != null) {
            out.close();
        }
    }
}
/**
 * Loads the trie from a file previously written by {@code save(String)}.
 *
 * <p>The raw file stream is handed to {@code load(InputStream)}, which applies its own
 * buffering; wrapping it here as well would only stack a second buffer on top.
 *
 * @param fileName path of the file to read
 * @throws IOException if the file cannot be opened or read
 */
public void load(String fileName) throws IOException {
    InputStream in = null;
    try {
        in = new FileInputStream(fileName);
        load(in);
    } finally {
        if (in != null) {
            in.close();
        }
    }
}
/**
 * Loads the trie from a stream in the format written by {@code save(String)}: an int
 * slot count followed by one (check, base) int pair per slot.
 *
 * <p>The arrays are read into locals and only installed once the whole stream has been
 * consumed, so a truncated or failing stream leaves the current trie unchanged. The
 * stream is not closed by this method.
 *
 * @param in the stream to read from
 * @throws IOException if the stream cannot be read, ends early, or declares a negative size
 */
public void load(InputStream in) throws IOException {
    DataInputStream is = new DataInputStream(new BufferedInputStream(in, 1024 * 1024));
    int size = is.readInt();
    if (size < 0) {
        throw new IOException("Invalid trie data: negative array size " + size);
    }
    int[] loadedCheck = new int[size];
    int[] loadedBase = new int[size];
    for (int i = 0; i < size; i++) {
        loadedCheck[i] = is.readInt();
        loadedBase[i] = is.readInt();
    }
    checkArray = loadedCheck;
    baseArray = loadedBase;
}
/**
 * Looks up a whole string in the trie.
 *
 * @param key the string to look up
 * @return the result of {@code search(char[], int, int)} over all characters of {@code key}
 */
public int search(String key) {
    char[] chars = key.toCharArray();
    return search(chars, 0, chars.length);
}
/**
 * Exact-match lookup of {@code key[pos .. len)} in the trie.
 *
 * <p>Note that {@code len} is used as an exclusive end index into {@code key}, not as a
 * character count; a value of 0 means "up to the end of {@code key}".
 *
 * <p>A transition that would land outside the arrays is treated as "no such transition"
 * rather than raising {@code ArrayIndexOutOfBoundsException}: the arrays are not always
 * padded far enough for every possible character value.
 *
 * @param key the characters to look up
 * @param pos index of the first character to match
 * @param len exclusive end index, or 0 for {@code key.length}
 * @return the value stored for the word ({@code -base - 1} of its terminal slot),
 *         or -1 if the word is not in the trie
 */
public int search(char key[], int pos, int len) {
    if (len == 0) {
        len = key.length;
    }
    final int limit = checkArray.length;
    int b = baseArray[0];
    for (int i = pos; i < len; i++) {
        int p = b + key[i] + 1;
        if (p < 0 || p >= limit || checkArray[p] != b) {
            return -1;
        }
        b = baseArray[p];
    }
    // The terminal marker (code 0) of a node lives at slot b itself.
    if (b < 0 || b >= limit) {
        return -1;
    }
    int n = baseArray[b];
    if (checkArray[b] == b && n < 0) {
        return -n - 1;
    }
    return -1;
}
/**
 * Finds every word in the trie that is a prefix of {@code key[pos .. len)}.
 *
 * <p>Note that {@code len} is used as an exclusive end index into {@code key}, not as a
 * character count. Each hit is reported as a {@code Word} whose {@code position} is the
 * stored value ({@code -base - 1}), {@code begin} is {@code pos}, and {@code length} is
 * the number of characters matched.
 *
 * <p>A transition that would land outside the arrays ends the search instead of raising
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @param key the characters to scan
 * @param pos index of the first character to match
 * @param len exclusive end index into {@code key}
 * @return the matches in order of increasing length; empty if there are none
 */
public List<Word> prefixSearch(char[] key, int pos, int len) {
    List<Word> result = new ArrayList<Word>();
    final int limit = checkArray.length;
    int b = baseArray[0];
    int i = pos;
    while (true) {
        // A terminal marker (code 0) at slot b means the characters consumed so far form a word.
        if (b >= 0 && b < limit) {
            int n = baseArray[b];
            if (checkArray[b] == b && n < 0) {
                Word w = new Word();
                w.position = -n - 1;
                w.begin = pos;
                w.length = i - pos;
                result.add(w);
            }
        }
        if (i >= len) {
            break;
        }
        int p = b + key[i] + 1;
        if (p < 0 || p >= limit || checkArray[p] != b) {
            break;
        }
        b = baseArray[p];
        i++;
    }
    return result;
}
/**
 * Finds the longest word in the trie that is a prefix of the {@code len} characters of
 * {@code key} starting at {@code pos}.
 *
 * <p>Here {@code len} is a character count: the scan covers {@code key[pos .. pos + len)}.
 * The returned {@code Word} has {@code position} set to the stored value
 * ({@code -base - 1}), {@code begin} set to {@code pos}, and {@code length} set to the
 * number of characters matched.
 *
 * <p>A transition that would land outside the arrays ends the search instead of raising
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @param key the characters to scan
 * @param pos index of the first character to match
 * @param len maximum number of characters to match
 * @return the longest match, or {@code null} if no word is a prefix
 */
public Word prefixSearchMax(char[] key, int pos, int len) {
    final int end = pos + len;
    final int limit = checkArray.length;
    Word w = null;
    int b = baseArray[0];
    int i = pos;
    while (true) {
        // A terminal marker (code 0) at slot b means the characters consumed so far form a
        // word; later (longer) hits overwrite earlier ones in the same Word instance.
        if (b >= 0 && b < limit) {
            int n = baseArray[b];
            if (checkArray[b] == b && n < 0) {
                if (w == null) {
                    w = new Word();
                }
                w.position = -n - 1;
                w.begin = pos;
                w.length = i - pos;
            }
        }
        if (i >= end) {
            break;
        }
        int p = b + key[i] + 1;
        if (p < 0 || p >= limit || checkArray[p] != b) {
            break;
        }
        b = baseArray[p];
        i++;
    }
    return w;
}
/**
 * Comparator over character arrays: orders two {@code Element}s by their
 * {@code getChars()} content, character by character, with the shorter array first
 * when one is a prefix of the other. Both arguments are cast to {@code Element}.
 */
public class CharArrayComparator<T> implements Comparator<T> {
    public int compare(T o1, T o2) {
        char[] left = ((Element) o1).getChars();
        char[] right = ((Element) o2).getChars();
        int common = Math.min(left.length, right.length);
        // Skip the shared prefix.
        int i = 0;
        while (i < common && left[i] == right[i]) {
            i++;
        }
        if (i < common) {
            return left[i] - right[i];
        }
        return left.length - right.length;
    }
}
/**
 * Hook for custom data processing applied to the raw word list before the trie data
 * is generated; build() uses the returned elements in place of its default wrapping.
 */
public interface PreProcess {
    /**
     * Converts the raw words into the elements to be indexed.
     *
     * @param lines the raw words passed to build()
     * @return the elements to index
     */
    List<Element> process(List<char[]> lines);
}
裏面還有一些簡單的數據結構,當然這些都不是必需的;我爲了自己的業務需求,實現了一些特定的數據結構。另外,這個版本已經刪除了我的業務代碼,可能無法直接通過編譯。但是,所有 BUG 都已經被修正了。