2019.02.24
前言
想在業餘時間做個小工具,設計是不使用數據庫,而是用文件系統來存儲數據。爲了減少文件打開次數,提高索引效率,用B樹構建內存索引。B樹的原理就不介紹了,提供如下鏈接供擴展閱讀:
- Princeton算法課程slide:https://www.cs.princeton.edu/~rs/AlgsDS07/09BalancedTrees.pdf
- Princeton版本的B樹Java實現(可直接使用,本文會添加配圖說明和添加代碼中文註釋):
https://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/BTree.java.html
- B樹可視化:
https://www.cs.usfca.edu/~galles/visualization/BTree.html
網上很多博客都直接複製了Princeton版本的代碼,但Princeton版源碼生成的B樹,葉子節點上都有哨兵,跟手繪出來的B樹不太相同,所以撰寫本文予以說明。同時,爲了構建出與可視化出來更像的B樹,我對Princeton版本源碼略加修改,實現了自己的版本,在本文會有較多附圖說明三者之間的區別。
B樹例子
下面Java源碼都會用如下插入順序構建BTree:1, 4, 7, 0, 2, 5, 8, 6, 9, 3, 10(與下文兩份源碼 main 方法中的 put 順序一致)。
Princeton版本源碼理解
哨兵
Princeton版本的BTree實現,採用了哨兵,使得任意一個結點(包含key1, …, keyi, …, keym),keyi對應子結點內的所有key值都>=keyi,並且都小於key(i+1)。採用哨兵的數據結構有效地簡化了代碼。
對於上一節的B樹例子,採用Princeton版本源碼生成的B樹如下圖所示:
源碼
/**
 * An in-memory B-tree symbol table mapping comparable keys to values.
 *
 * <p>Every node holds up to {@code M - 1} entries. Internal nodes only route
 * (they use {@code key} and {@code next}); external nodes hold the actual
 * key-value pairs (they use {@code key} and {@code val}). The first entry of
 * every internal node acts as a sentinel: its key is never compared during
 * descent, so any key smaller than the second entry is routed to the first
 * child.
 *
 * @param <Key>   the key type
 * @param <Value> the value type
 */
public class PrincetonBTree<Key extends Comparable<Key>, Value> {
    // max children per B-tree node = M-1
    // (must be even and greater than 2)
    private static final int M = 4;

    private Node root;   // root of the B-tree
    private int height;  // height of the B-tree
    private int n;       // number of key-value pairs in the B-tree

    // helper B-tree node data type
    private static final class Node {
        private int m;                            // number of children
        private Entry[] children = new Entry[M];  // the array of children

        // create a node with k children
        private Node(int k) {
            m = k;
        }
    }

    // internal nodes: only use key and next
    // external nodes: only use key and value
    // Consequently a lookup always has to descend to an external node
    // to obtain the value for a key.
    private static class Entry {
        private Comparable key;
        private final Object val;
        private Node next;  // helper field to iterate over array entries

        public Entry(Comparable key, Object val, Node next) {
            this.key = key;
            this.val = val;
            this.next = next;
        }
    }

    /**
     * Initializes an empty B-tree.
     */
    public PrincetonBTree() {
        root = new Node(0);
    }

    /**
     * Returns true if this symbol table is empty.
     * @return {@code true} if this symbol table is empty; {@code false} otherwise
     */
    public boolean isEmpty() {
        return size() == 0;
    }

    /**
     * Returns the number of key-value pairs in this symbol table.
     * @return the number of key-value pairs in this symbol table
     */
    public int size() {
        return n;
    }

    /**
     * Returns the height of this B-tree (for debugging).
     *
     * @return the height of this B-tree
     */
    public int height() {
        return height;
    }

    /**
     * Returns the value associated with the given key.
     *
     * @param key the key
     * @return the value associated with the given key if the key is in the symbol table
     *         and {@code null} if the key is not in the symbol table
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    public Value get(Key key) {
        if (key == null) throw new IllegalArgumentException("argument to get() is null");
        return search(root, key, height);
    }

    // Descends from x (which is ht levels above the external nodes) looking for key.
    @SuppressWarnings("unchecked") // val is only ever stored from a Value in put()
    private Value search(Node x, Key key, int ht) {
        Entry[] children = x.children;

        // external node: linear scan for an exact match
        if (ht == 0) {
            for (int j = 0; j < x.m; j++) {
                if (eq(key, children[j].key)) return (Value) children[j].val;
            }
        }

        // internal node: follow the last child whose successor key is greater than key
        else {
            for (int j = 0; j < x.m; j++) {
                if (j+1 == x.m || less(key, children[j+1].key))
                    return search(children[j].next, key, ht-1);
            }
        }
        return null;
    }

    /**
     * Inserts the key-value pair into the symbol table, overwriting the old value
     * with the new value if the key is already in the symbol table.
     * Storing a {@code null} value makes {@link #get} return {@code null} for the key;
     * the entry itself is kept and still counts toward {@link #size()}.
     *
     * @param key the key
     * @param val the value
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    public void put(Key key, Value val) {
        if (key == null) throw new IllegalArgumentException("argument key to put() is null");
        Node node = insert(root, key, val, height);
        if (node == null) return;

        // the root overflowed: grow the tree by one level
        Node newRoot = new Node(2);
        newRoot.children[0] = new Entry(root.children[0].key, null, root);
        newRoot.children[1] = new Entry(node.children[0].key, null, node);
        root = newRoot;
        height++;
    }

    // Inserts (key, val) below node, which is `height` levels above the external nodes.
    // Returns the new right sibling if node had to be split, otherwise null.
    private Node insert(Node node, Key key, Value val, int height) {
        int index;
        Entry entry = new Entry(key, val, null);

        // external node
        if (height == 0) {
            for (index = 0; index < node.m; index++) {
                if (eq(key, node.children[index].key)) {
                    // Key already present: swap in the new entry (val is final)
                    // instead of adding a duplicate; the pair count is unchanged.
                    node.children[index] = entry;
                    return null;
                }
                if (less(key, node.children[index].key)) break;
            }
            n++; // a genuinely new key-value pair is being added
        }

        // internal node
        else {
            for (index = 0; index < node.m; index++) {
                if ((index+1 == node.m) || less(key, node.children[index+1].key)) {
                    Node splitNode = insert(node.children[index++].next, key, val, height-1);
                    if (splitNode == null) return null;
                    // the child split: link its new sibling right after it
                    entry.key = splitNode.children[0].key;
                    entry.next = splitNode;
                    break;
                }
            }
        }

        // shift entries right to open a slot at index
        for (int i = node.m; i > index; i--)
            node.children[i] = node.children[i-1];
        node.children[index] = entry;
        node.m++;
        if (node.m < M) return null;
        else return split(node);
    }

    // split node in half: the upper M/2 entries move to a new node, which is returned
    private Node split(Node h) {
        Node t = new Node(M/2);
        h.m = M/2;
        for (int j = 0; j < M/2; j++) {
            t.children[j] = h.children[M/2+j];
            h.children[M/2+j] = null; // drop the stale reference from the left half
        }
        return t;
    }

    /**
     * Returns a string representation of this B-tree (for debugging).
     *
     * @return a string representation of this B-tree.
     */
    @Override
    public String toString() {
        return toString(root, height, "") + "\n";
    }

    private String toString(Node h, int ht, String indent) {
        StringBuilder s = new StringBuilder();
        Entry[] children = h.children;

        if (ht == 0) {
            for (int j = 0; j < h.m; j++) {
                s.append(indent + children[j].key + " " + children[j].val + "\n");
            }
        }
        else {
            for (int j = 0; j < h.m; j++) {
                if (j > 0) s.append(indent + "(" + children[j].key + ")\n");
                s.append(toString(children[j].next, ht-1, indent + " "));
            }
        }
        return s.toString();
    }

    // comparison functions - make Comparable instead of Key to avoid casts
    @SuppressWarnings({"rawtypes", "unchecked"})
    private boolean less(Comparable k1, Comparable k2) {
        return k1.compareTo(k2) < 0;
    }

    @SuppressWarnings({"rawtypes", "unchecked"})
    private boolean eq(Comparable k1, Comparable k2) {
        return k1.compareTo(k2) == 0;
    }

    /**
     * Unit tests the {@code BTree} data type.
     *
     * @param args the command-line arguments
     */
    public static void main(String[] args) {
        PrincetonBTree<Double, String> bTree = new PrincetonBTree<Double, String>();
        bTree.put(1D, "test1");
        bTree.put(4D, "test4");
        bTree.put(7D, "test7");
        bTree.put(0D, "test0");
        bTree.put(2D, "test2");
        bTree.put(5D, "test5");
        bTree.put(8D, "test8");
        bTree.put(6D, "test6");
        bTree.put(9D, "test9");
        bTree.put(3D, "test3");
        bTree.put(10D, "test10");
        System.out.println(bTree.get(-1D));
        System.out.println(bTree.get(0D));
        System.out.println(bTree.get(1D));
        System.out.println(bTree.get(2D));
        System.out.println(bTree.get(2.5D));
        System.out.println(bTree.get(3D));
        System.out.println(bTree.get(4D));
        System.out.println(bTree.get(5D));
        System.out.println(bTree.get(6D));
        System.out.println(bTree.get(7D));
        System.out.println(bTree.get(8D));
        System.out.println(bTree.get(9D));
        System.out.println(bTree.get(10D));
        System.out.println(bTree.get(11D));
    }
}
基於Princeton的修改版本
相同的哨兵,不同的結點
Princeton版本的BTree,顯然,要查找key對應的value值,都必須索引到葉子結點才能得到,原因是它區分了內部結點和外部結點。而在我修改的版本里,同樣也採用了哨兵,但結點不區分內部與外部結點,除了哨兵沒有value之外,所有非葉子結點既有value又有子結點。比如同樣是查找4的value值,Princeton的B樹要遞歸3次,而我修改的版本在根結點即可獲得。
下圖是對於同一個B樹例子,用我修改的版本所生成的B樹。對比三張圖,可以明顯地發現,Princeton版本的代碼生成的B樹跟我們手繪出來的是不一樣的;而我修改的版本所生成的B樹,和“B樹例子”裏是完全相同的,只是每個結點多了個哨兵。
源碼
/**
 * An in-memory B-tree map in which every node starts with a sentinel entry.
 *
 * <p>Slot 0 of each node is the sentinel: its key and value are never read,
 * and its {@code next} link leads to the child holding keys smaller than the
 * entry in slot 1. All other slots carry a key, its value and (in non-leaf
 * nodes) a link to the child holding the keys that follow it. Because
 * non-leaf entries carry values too, a lookup can finish before reaching a
 * leaf.
 *
 * <p>The raw {@code Comparable} bound is kept as declared so existing
 * instantiations keep compiling.
 *
 * @param <Key>   the key type
 * @param <Value> the value type
 */
public class BTreeWithSentinel<Key extends Comparable, Value> {
    // a node holds at most M-1 real entries plus the sentinel before it is split
    private static final int M = 4;
    private static final int M_WITH_SENTINEL = M + 1;

    private Node root;
    private int height;

    /** Creates an empty tree whose root contains only the sentinel entry. */
    BTreeWithSentinel() {
        this.root = new Node(1);
        this.root.keys[0] = new Entry(null, null, null);
        this.height = 0;
    }

    /** A tree node: a fixed-capacity entry array plus the number of slots in use. */
    public static final class Node {
        Entry[] keys = new Entry[M_WITH_SENTINEL];
        int size = 0; // slots in use, sentinel included

        Node(int size) {
            this.size = size;
        }
    }

    /** One slot of a node: key, value and the child that follows the key. */
    public static final class Entry {
        Comparable key;
        Object value;
        Node next;

        Entry(Comparable key, Object value, Node next) {
            this.key = key;
            this.value = value;
            this.next = next;
        }
    }

    /**
     * Associates {@code value} with {@code key}, replacing the previous value
     * if the key is already present.
     *
     * @param key   the key
     * @param value the value
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    public void put(Key key, Value value) {
        if (key == null) {
            // a null key would be stored and make every later comparison fail
            throw new IllegalArgumentException("argument key to put() is null");
        }
        Node newNode = insert(this.root, key, value, this.height);
        if (newNode == null) {
            return;
        }
        // the root overflowed: grow the tree by one level
        Node newRoot = new Node(2);
        newRoot.keys[0] = new Entry(null, null, root);
        newRoot.keys[1] = new Entry(newNode.keys[0].key, newNode.keys[0].value, newNode);
        this.root = newRoot;
        this.height++;
    }

    /**
     * Returns the value stored for {@code key}.
     *
     * @param key the key
     * @return the stored value, or {@code null} if the key is absent
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    public Value get(Key key) {
        if (key == null) {
            throw new IllegalArgumentException("argument to get() is null");
        }
        return search(this.root, key);
    }

    /**
     * Looks up {@code key} in the subtree rooted at {@code root}.
     *
     * @param root the subtree root; {@code null} means an empty subtree
     * @param key  the key to find
     * @return the stored value, or {@code null} if the key is absent
     */
    @SuppressWarnings("unchecked") // entries only ever hold Key/Value instances
    public Value search(Node root, Key key) {
        if (root == null) {
            return null;
        }
        for (int i = 0; i < root.size; i++) {
            if ((i + 1) == root.size || less(key, (Key) root.keys[i + 1].key)) {
                return search(root.keys[i].next, key);
            } else if (equal(key, (Key) root.keys[i + 1].key)) {
                return (Value) root.keys[i + 1].value;
            }
        }
        return null;
    }

    /**
     * Inserts the pair below {@code root}, which is {@code height} levels above the leaves.
     * If the key already exists on the search path its value is replaced in place.
     *
     * @param root   the subtree root
     * @param key    the key
     * @param value  the value
     * @param height the number of levels between {@code root} and the leaves
     * @return the new right sibling if {@code root} had to be split, otherwise {@code null}
     */
    @SuppressWarnings("unchecked") // entries only ever hold Key instances
    public Node insert(Node root, Key key, Value value, int height) {
        Entry entry = new Entry(key, value, null);
        int index = 1;
        if (height == 0) {
            // leaf: slot 0 is the sentinel, so real entries start at 1
            for (index = 1; index < root.size; index++) {
                Key existing = (Key) root.keys[index].key;
                if (equal(key, existing)) {
                    root.keys[index].value = value; // overwrite instead of duplicating
                    return null;
                }
                if (less(key, existing)) {
                    break;
                }
            }
        } else {
            for (index = 0; index < root.size; index++) {
                boolean last = (index + 1) == root.size;
                if (!last && equal(key, (Key) root.keys[index + 1].key)) {
                    // This is the entry search() would return for the key,
                    // so updating it here is sufficient.
                    root.keys[index + 1].value = value;
                    return null;
                }
                if (last || less(key, (Key) root.keys[index + 1].key)) {
                    Node newNode = insert(root.keys[index].next, key, value, height - 1);
                    if (newNode == null) {
                        return null;
                    }
                    // the child split: lift the first entry of its new sibling into this node
                    entry = new Entry(newNode.keys[0].key, newNode.keys[0].value, newNode);
                    index++;
                    break;
                }
            }
        }
        // shift entries right to open a slot at index
        for (int i = root.size; i > index; i--) {
            root.keys[i] = root.keys[i - 1];
        }
        root.keys[index] = entry;
        root.size++;
        if (root.size < M_WITH_SENTINEL) {
            return null;
        }
        return splitNode(root);
    }

    /**
     * Splits {@code node}: the lower half (rounded down) stays, the upper half
     * (rounded up) moves to a new node whose first slot then acts as its sentinel.
     *
     * @param node the node to split
     * @return the new node holding the upper half
     */
    public Node splitNode(Node node) {
        int total = node.size;
        Node newNode = new Node((total + 1) / 2); // ceil(total / 2)
        node.size = total / 2;                    // floor(total / 2)
        for (int i = 0; i < newNode.size; i++) {
            newNode.keys[i] = node.keys[node.size + i];
            node.keys[node.size + i] = null; // drop the stale reference from the lower half
        }
        return newNode;
    }

    @SuppressWarnings("unchecked") // raw Comparable bound on Key
    private boolean equal(Key foo, Key bar) {
        return foo.compareTo(bar) == 0;
    }

    @SuppressWarnings("unchecked") // raw Comparable bound on Key
    private boolean less(Key foo, Key bar) {
        return foo.compareTo(bar) < 0;
    }

    /**
     * Builds the article's example tree and prints a series of lookups.
     *
     * @param args the command-line arguments (unused)
     */
    public static void main(String[] args) {
        BTreeWithSentinel<Double, String> bTree = new BTreeWithSentinel<Double, String>();
        bTree.put(1D, "test1");
        bTree.put(4D, "test4");
        bTree.put(7D, "test7");
        bTree.put(0D, "test0");
        bTree.put(2D, "test2");
        bTree.put(5D, "test5");
        bTree.put(8D, "test8");
        bTree.put(6D, "test6");
        bTree.put(9D, "test9");
        bTree.put(3D, "test3");
        bTree.put(10D, "test10");
        System.out.println(bTree.get(-1D));
        System.out.println(bTree.get(0D));
        System.out.println(bTree.get(1D));
        System.out.println(bTree.get(2D));
        System.out.println(bTree.get(2.5D));
        System.out.println(bTree.get(3D));
        System.out.println(bTree.get(4D));
        System.out.println(bTree.get(5D));
        System.out.println(bTree.get(6D));
        System.out.println(bTree.get(7D));
        System.out.println(bTree.get(8D));
        System.out.println(bTree.get(9D));
        System.out.println(bTree.get(10D));
        System.out.println(bTree.get(11D));
    }
}