英文原文鏈接:https://cstack.github.io/db_tutorial/parts/part8.html
我們將把表的格式從未排序的行數組改成B-tree。這個改動很大,需要用幾篇文章來說明。在這一節,我們定義葉子節點的佈局,並支持把key/value對插入到單節點的樹中。
可選的表格式
當前的格式中,每頁只存儲rows(不包括元數據),因此從空間上講是非常高效的。插入操作也非常快,因爲我們只是在表尾追加。但是,想要查找某一行數據,只能掃描全表。如果我們想要刪除一行,需要把該行後面的所有數據向前移動,以填補刪除後留下的空洞。
如果我們按ID把表存儲爲一個有序數組,就可以基於ID使用二分法進行查詢。但這樣插入會很慢,因爲我們不得不挪動很多行來騰出空間。
因此,我們改用tree結構。樹中每個節點保存的行數不固定,所以需要在每個節點中保存額外的信息來記錄它保存了多少行。此外,所有的內部節點都不存儲任何行數據,這也是額外的空間開銷。不過,以更大的數據庫文件爲代價,我們換來了快速的插入、刪除和查找。
歸納如下:
| | Unsorted Array of rows | Sorted Array of rows | Tree of nodes |
|---|---|---|---|
| Pages contain | only data | only data | metadata, primary keys, and data |
| Rows per page | more | more | fewer |
| Insertion | O(1) | O(n) | O(log(n)) |
| Deletion | O(n) | O(n) | O(log(n)) |
| Lookup by id | O(n) | O(log(n)) | O(log(n)) |
節點頭format
葉節點和內部節點具有不同的佈局。讓我們創建一個枚舉來跟蹤節點類型:
/* Distinguishes the two page layouts a B-tree node can have. */
typedef enum NodeType_t {
  NODE_INTERNAL,
  NODE_LEAF
} NodeType;
每個節點對應一個頁面。內部節點通過存儲子節點的頁碼來指向它們。btree向Pager請求某個頁碼對應的頁面,並得到一個指向頁面緩存的指針。頁面按頁碼順序依次存儲在數據庫文件中。
節點需要在頁面頭中存儲一些元數據。包括節點類型,是否是根節點,以及指向其父節點的指針(用來查找兄弟節點)。我爲每個頭字段的大小和偏移量定義幾個常量:
/*
 * Common Node Header Layout
 *
 * Every node page starts with these fields, packed back to back:
 *   node type (1 byte) | is-root flag (1 byte) | parent pointer (4 bytes)
 * Each offset is the previous field's offset plus the previous field's size,
 * and the header size is the sum of all field sizes.
 */
const uint32_t NODE_TYPE_SIZE = sizeof(uint8_t);
const uint32_t NODE_TYPE_OFFSET = 0;
const uint32_t IS_ROOT_SIZE = sizeof(uint8_t);
const uint32_t IS_ROOT_OFFSET = NODE_TYPE_SIZE;
const uint32_t PARENT_POINTER_SIZE = sizeof(uint32_t);
const uint32_t PARENT_POINTER_OFFSET = IS_ROOT_OFFSET + IS_ROOT_SIZE;
const uint8_t COMMON_NODE_HEADER_SIZE =
    NODE_TYPE_SIZE + IS_ROOT_SIZE + PARENT_POINTER_SIZE;
葉子節點格式
除了這些公共的頭字段外,葉子節點還需要存儲它包含多少個cell,每個cell是一個鍵值對。
/*
 * Leaf Node Header Layout
 *
 * A leaf node's header is the common node header followed by one extra
 * field: the number of cells stored in the node.
 */
const uint32_t LEAF_NODE_NUM_CELLS_SIZE = sizeof(uint32_t);
const uint32_t LEAF_NODE_NUM_CELLS_OFFSET = COMMON_NODE_HEADER_SIZE;
const uint32_t LEAF_NODE_HEADER_SIZE =
    COMMON_NODE_HEADER_SIZE + LEAF_NODE_NUM_CELLS_SIZE;
歸納一下,葉子節點的格式如下:
在頭部用一整個字節來存放一個布爾值,在空間上有一點浪費,但這讓訪問這些字段的代碼更容易編寫。
注意,頁尾預留了一些空間。我們在header之後儘可能多地存儲cell,但剩餘的空間可能放不下一個完整的cell;這種情況下就不再使用這部分空間,以避免cell跨節點存放。
如何訪問葉子節點字段
+/*
+ * Returns a pointer to the leaf node's cell-count field, which is stored
+ * LEAF_NODE_NUM_CELLS_OFFSET bytes from the start of the node.
+ * The byte offset is applied through char*, then the result is cast
+ * explicitly to the declared return type.
+ */
+uint32_t* leaf_node_num_cells(void* node) {
+  return (uint32_t*)((char*)node + LEAF_NODE_NUM_CELLS_OFFSET);
+}
+/*
+ * Returns the address of cell number cell_num. Cells are
+ * LEAF_NODE_CELL_SIZE bytes each and begin LEAF_NODE_HEADER_SIZE bytes
+ * from the start of the node.
+ */
+void* leaf_node_cell(void* node, uint32_t cell_num) {
+  char* cells = (char*)node + LEAF_NODE_HEADER_SIZE;
+  return cells + cell_num * LEAF_NODE_CELL_SIZE;
+}
+/*
+ * Returns a pointer to the key of cell number cell_num. The key is
+ * located at the very start of the cell.
+ */
+uint32_t* leaf_node_key(void* node, uint32_t cell_num) {
+  void* cell = leaf_node_cell(node, cell_num);
+  return cell;
+}
+/*
+ * Returns a pointer to the value of cell number cell_num. The value
+ * follows the key, LEAF_NODE_KEY_SIZE bytes into the cell.
+ * The offset is applied through char* because arithmetic on void* is a
+ * compiler extension, not standard C.
+ */
+void* leaf_node_value(void* node, uint32_t cell_num) {
+  return (char*)leaf_node_cell(node, cell_num) + LEAF_NODE_KEY_SIZE;
+}
+/* Initializes a leaf node by setting its cell count to zero. */
+void initialize_leaf_node(void* node) {
+  uint32_t* num_cells = leaf_node_num_cells(node);
+  *num_cells = 0;
+}
+
調整Pager和Table對象的實現
-void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
+void pager_flush(Pager* pager, uint32_t page_num) {
if (pager->pages[page_num] == NULL) {
printf("Tried to flush null page\n");
exit(EXIT_FAILURE);
@@ -242,7 +337,7 @@ void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
}
ssize_t bytes_written =
- write(pager->file_descriptor, pager->pages[page_num], size);
+ write(pager->file_descriptor, pager->pages[page_num], PAGE_SIZE);
if (bytes_written == -1) {
printf("Error writing: %d\n", errno);
void db_close(Table* table) {
Pager* pager = table->pager;
- uint32_t num_full_pages = table->num_rows / ROWS_PER_PAGE;
- for (uint32_t i = 0; i < num_full_pages; i++) {
+ for (uint32_t i = 0; i < pager->num_pages; i++) {
if (pager->pages[i] == NULL) {
continue;
}
- pager_flush(pager, i, PAGE_SIZE);
+ pager_flush(pager, i);
free(pager->pages[i]);
pager->pages[i] = NULL;
}
- // There may be a partial page to write to the end of the file
- // This should not be needed after we switch to a B-tree
- uint32_t num_additional_rows = table->num_rows % ROWS_PER_PAGE;
- if (num_additional_rows > 0) {
- uint32_t page_num = num_full_pages;
- if (pager->pages[page_num] != NULL) {
- pager_flush(pager, page_num, num_additional_rows * ROW_SIZE);
- free(pager->pages[page_num]);
- pager->pages[page_num] = NULL;
- }
- }
-
int result = close(pager->file_descriptor);
if (result == -1) {
printf("Error closing db file.\n");
現在,在數據庫中存儲頁數顯然比存儲行數更加合理。頁數是整個數據庫使用的頁面數量,而不是某張特定表的,因此它應該與Pager對象關聯,而不是與表對象關聯。每棵B-tree由其根節點的頁碼唯一標識,所以表對象需要記錄這個頁碼。
const uint32_t PAGE_SIZE = 4096;  // bytes per page; also the amount written per page flush
const uint32_t TABLE_MAX_PAGES = 100;  // number of slots in the pager's pages[] array
-const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
-const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
/* Pager state: the open database file plus one in-memory slot per page. */
struct Pager_t {
int file_descriptor;
uint32_t file_length;
+ uint32_t num_pages;  // file_length / PAGE_SIZE at open; raised by get_page() for higher page numbers
void* pages[TABLE_MAX_PAGES];  // slots start as NULL and are filled in by get_page()
};
typedef struct Pager_t Pager;
/* A table: its pager plus the page number where its B-tree root lives. */
struct Table_t {
Pager* pager;
- uint32_t num_rows;
+ uint32_t root_page_num;  // page number passed to get_page() to fetch the root node
};
typedef struct Table_t Table;
@@ -127,6 +200,10 @@ void* get_page(Pager* pager, uint32_t page_num) {
}
pager->pages[page_num] = page;
+
+ if (page_num >= pager->num_pages) {
+ pager->num_pages = page_num + 1;
+ }
}
return pager->pages[page_num];
@@ -184,6 +269,12 @@ Pager* pager_open(const char* filename) {
Pager* pager = malloc(sizeof(Pager));
pager->file_descriptor = fd;
pager->file_length = file_length;
+ pager->num_pages = (file_length / PAGE_SIZE);
+
+ if (file_length % PAGE_SIZE != 0) {
+ printf("Db file is not a whole number of pages. Corrupt file.\n");
+ exit(EXIT_FAILURE);
+ }
for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
pager->pages[i] = NULL;
調整Cursor對象的實現
一個cursor代表了數據在表中的位置。當我們的表只是一個簡單的行數組時,只需要行號就可以訪問某一行。現在我們改用了B-tree,這個位置就需要用節點的頁碼加上該節點內的cell編號來唯一標識。
/* A position in the table, identified by a page and a cell within that page. */
struct Cursor_t {
Table* table;
- uint32_t row_num;
+ uint32_t page_num;  // page number of the node the cursor is on
+ uint32_t cell_num;  // index of the cell within that node
bool end_of_table; // Indicates a position one past the last element
};
typedef struct Cursor_t Cursor;
/*
 * Allocates a cursor positioned at cell 0 of the table's root page.
 * end_of_table is set immediately when the root node holds no cells.
 * The cursor comes from malloc; the allocation result is not checked here.
 */
Cursor* table_start(Table* table) {
Cursor* cursor = malloc(sizeof(Cursor));
cursor->table = table;
- cursor->row_num = 0;
- cursor->end_of_table = (table->num_rows == 0);
+ cursor->page_num = table->root_page_num;
+ cursor->cell_num = 0;
+
+ void* root_node = get_page(table->pager, table->root_page_num);
+ uint32_t num_cells = *leaf_node_num_cells(root_node);  // the root is read as a leaf node
+ cursor->end_of_table = (num_cells == 0);
return cursor;
}
/*
 * Returns a pointer to the value of the cell the cursor points at:
 * fetches the cursor's page from the pager and looks up cell_num in it.
 */
void* cursor_value(Cursor* cursor) {
- uint32_t row_num = cursor->row_num;
- uint32_t page_num = row_num / ROWS_PER_PAGE;
+ uint32_t page_num = cursor->page_num;
void* page = get_page(cursor->table->pager, page_num);
- uint32_t row_offset = row_num % ROWS_PER_PAGE;
- uint32_t byte_offset = row_offset * ROW_SIZE;
- return page + byte_offset;
+ return leaf_node_value(page, cursor->cell_num);
}
/*
 * Moves the cursor to the next cell of its current node and sets
 * end_of_table once cell_num reaches the node's cell count.
 */
void cursor_advance(Cursor* cursor) {
- cursor->row_num += 1;
- if (cursor->row_num >= cursor->table->num_rows) {
+ uint32_t page_num = cursor->page_num;
+ void* node = get_page(cursor->table->pager, page_num);
+
+ cursor->cell_num += 1;
+ if (cursor->cell_num >= (*leaf_node_num_cells(node))) {
cursor->end_of_table = true;
}
}