Bolt的數據是存儲在db文件中,bolt操作始於db文件。
1、db文件初始化和加載流程
open函數實現db文件的打開
// Open opens the database file at path, creating it with the given mode if
// it does not exist, and returns a ready-to-use DB.
//
// A nil options value selects DefaultOptions. An empty file is initialized
// through db.init; a non-empty file has its first 4KB read to recover the
// page size. The file is then memory-mapped and the freelist is loaded.
//
// Once the file has been opened, every failure path calls db.close before
// returning so that the file handle (and, presumably, the lock taken by
// flock) is not leaked.
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
	var db = &DB{opened: true}

	// Set default options if no options are provided.
	if options == nil {
		options = DefaultOptions
	}

	// Copy the caller-tunable options onto the DB.
	db.NoGrowSync = options.NoGrowSync
	db.MmapFlags = options.MmapFlags

	// Set default values for later DB operations.
	db.MaxBatchSize = DefaultMaxBatchSize
	db.MaxBatchDelay = DefaultMaxBatchDelay
	db.AllocSize = DefaultAllocSize

	flag := os.O_RDWR
	if options.ReadOnly {
		// Read-only mode: open the file without write access.
		flag = os.O_RDONLY
		db.readOnly = true
	}

	// Obtain the file handle.
	db.path = path
	var err error
	if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
		_ = db.close()
		return nil, err
	}

	// Lock the file so that other processes using Bolt in read-write mode
	// cannot use the database at the same time. The third argument asks for
	// an exclusive lock unless the database was opened read-only, in which
	// case a shared lock is requested.
	if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
		_ = db.close()
		return nil, err
	}

	// Default values for test hooks.
	db.ops.writeAt = db.file.WriteAt

	// Initialize the database if it doesn't exist.
	info, err := db.file.Stat()
	if err != nil {
		// Release the handle and lock acquired above.
		_ = db.close()
		return nil, err
	}
	if info.Size() == 0 {
		// Empty file: write the initial pages.
		if err := db.init(); err != nil {
			// Do not leak the descriptor when initialization fails.
			_ = db.close()
			return nil, err
		}
	} else {
		// Existing file: read the first meta page to determine the page size.
		var buf [0x1000]byte
		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
			m := db.pageInBuffer(buf[:], 0).meta()
			if err := m.validate(); err != nil {
				// If we can't read the page size, we can assume it's the same
				// as the OS -- since that's how the page size was chosen in the
				// first place.
				//
				// If the first page is invalid and this OS uses a different
				// page size than what the database was created with then we
				// are out of luck and cannot access the database.
				db.pageSize = os.Getpagesize()
			} else {
				db.pageSize = int(m.pageSize)
			}
		}
	}

	// Initialize the pool of page-sized byte buffers.
	db.pagePool = sync.Pool{
		New: func() interface{} {
			return make([]byte, db.pageSize)
		},
	}

	// Memory-map the data file.
	if err := db.mmap(options.InitialMmapSize); err != nil {
		_ = db.close()
		return nil, err
	}

	// Load the free pages into the freelist.
	db.freelist = newFreelist()
	db.freelist.read(db.page(db.meta().freelist))

	// The DB was marked as opened at construction; return it.
	return db, nil
}
如果之前不存在db文件,會調用init進行初始化;否則讀取文件頭部4k數據,從中取出需要的信息完成後續加載。在函數尾部進行內存的映射和空閒page的加載。
// init writes the initial layout of a new database file: two meta pages
// (page ids 0 and 1), an empty freelist page (id 2) and an empty leaf page
// (id 3). The four pages are built in one buffer, written at offset 0 and
// then synced.
func (db *DB) init() error {
	// A new database adopts the OS page size.
	db.pageSize = os.Getpagesize()

	// One buffer covering the four reserved pages.
	image := make([]byte, 4*db.pageSize)

	// Fill in the two meta pages; they differ only in page id and txid.
	for n := 0; n < 2; n++ {
		id := pgid(n)
		metaPage := db.pageInBuffer(image[:], id)
		metaPage.id = id
		metaPage.flags = metaPageFlag

		hdr := metaPage.meta()
		hdr.magic = magic
		hdr.version = version
		hdr.pageSize = uint32(db.pageSize)
		hdr.freelist = 2             // page id of the freelist page
		hdr.root = bucket{root: 3}   // page id of the root bucket's leaf page
		hdr.pgid = 4                 // first page id beyond the reserved pages
		hdr.txid = txid(n)
		// The checksum is taken last, after every other field is set.
		hdr.checksum = hdr.sum64()
	}

	// Page id 2: an empty freelist.
	freePage := db.pageInBuffer(image[:], pgid(2))
	freePage.id = pgid(2)
	freePage.flags = freelistPageFlag
	freePage.count = 0

	// Page id 3: an empty leaf page.
	leafPage := db.pageInBuffer(image[:], pgid(3))
	leafPage.id = pgid(3)
	leafPage.flags = leafPageFlag
	leafPage.count = 0

	// Write the buffer to the start of the file.
	if _, writeErr := db.ops.writeAt(image, 0); writeErr != nil {
		return writeErr
	}

	// Sync the file.
	if syncErr := fdatasync(db); syncErr != nil {
		return syncErr
	}
	return nil
}
可以看到open函數比較簡單,如果是第一次使用會open文件然後將固定格式寫入前四個page。
2、查詢/創建bucket
查詢bucket是創建的前提,創建前都會查詢bucket是否存在,如果存在會返回錯誤,如果不存在纔會創建bucket。查詢操作在事務內進行。
// Bucket looks up a bucket by name by delegating to the transaction's root
// bucket and returns whatever that lookup yields.
func (tx *Tx) Bucket(name []byte) *Bucket {
	found := tx.root.Bucket(name)
	return found
}
// Bucket returns the nested bucket stored under name, or nil when no entry
// with exactly that key exists or the entry is not flagged as a bucket.
// When the b.buckets cache map is present it is consulted first and the
// opened bucket is stored in it.
func (b *Bucket) Bucket(name []byte) *Bucket {
	hasCache := b.buckets != nil
	if hasCache {
		if cached := b.buckets[string(name)]; cached != nil {
			return cached
		}
	}

	// Position a cursor at name inside this bucket.
	cur := b.Cursor()
	foundKey, foundValue, elemFlags := cur.seek(name)

	// Give up unless the element is a bucket entry with exactly this key.
	isBucketEntry := elemFlags&bucketLeafFlag != 0
	if !isBucketEntry || !bytes.Equal(name, foundKey) {
		return nil
	}

	// Open the bucket from its stored value and remember it.
	opened := b.openBucket(foundValue)
	if hasCache {
		b.buckets[string(name)] = opened
	}
	return opened
}
Seek是查詢的核心函數,seek中會使用二分查找在b+tree中查找對應的key。因爲b+tree有可能被緩存到了內存,所以查找過程會在node和page之間切換。
// seek moves the cursor to the position found by searching for the given
// key from the bucket's root and returns the key, value and flags at that
// position. It returns (nil, nil, 0) when the final position lies past the
// last element of the page/node the search ended on.
func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
	_assert(c.bucket.tx.db != nil, "tx closed")

	// Reset the recorded path and descend from the root page/node.
	c.stack = c.stack[:0]
	c.search(seek, c.bucket.root)

	// The last stack entry is where the search stopped.
	top := len(c.stack) - 1
	if leaf := &c.stack[top]; leaf.index < leaf.count() {
		// In range: hand back the element under the cursor.
		return c.keyValue()
	}
	return nil, nil, 0
}
// search pushes the page/node identified by pgid onto the cursor stack and
// continues the lookup of key from there: on a leaf it runs nsearch,
// otherwise it descends through searchNode (when an in-memory node is
// available) or searchPage. It panics if a page is returned whose flags mark
// it as neither a branch nor a leaf.
func (c *Cursor) search(key []byte, pgid pgid) {
	// Resolve the id to a page and/or an in-memory node.
	pg, nd := c.bucket.pageNode(pgid)
	if pg != nil {
		if pg.flags&(branchPageFlag|leafPageFlag) == 0 {
			panic(fmt.Sprintf("invalid page type: %d: %x", pg.id, pg.flags))
		}
	}

	// Record this step of the search path.
	step := elemRef{page: pg, node: nd}
	c.stack = append(c.stack, step)

	switch {
	case step.isLeaf():
		// Leaf level: locate the key within the leaf.
		c.nsearch(key)
	case nd != nil:
		// Branch held in memory: descend via the node.
		c.searchNode(key, nd)
	default:
		// Branch only available as a page: descend via the page.
		c.searchPage(key, pg)
	}
}
Search是個遞歸調用的函數,優先從node中查詢,如果node中不存在再去page中查詢,直到查詢到葉子節點,在葉子節點中再查找最終的value。看完seek查詢,我們再回到查詢bucket的函數:當查詢到value後,調用openBucket()函數打開bucket。
// openBucket builds a child Bucket from the raw bytes stored as a bucket
// entry's value. The value begins with a bucket header; when that header's
// root is 0 the bucket is inline and the bytes after the header are used as
// its page.
func (b *Bucket) openBucket(value []byte) *Bucket {
	var child = newBucket(b.tx)

	// On architectures where unaligned load/stores are broken, an unaligned
	// value is first cloned into an aligned byte array.
	misaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
	if misaligned {
		value = cloneBytes(value)
	}

	// A writable transaction works on its own copy of the header. A
	// read-only transaction, or one that already cloned the value above,
	// points straight at the bytes of value.
	hdr := (*bucket)(unsafe.Pointer(&value[0]))
	if !b.tx.writable || misaligned {
		child.bucket = hdr
	} else {
		copied := *hdr
		child.bucket = &copied
	}

	// Inline bucket: keep a reference to the page embedded after the header.
	if child.root == 0 {
		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	}

	return &child
}
創建bucket前半段是查詢以及一些檢查,略過不看。直接看關鍵部分
// CreateBucket (abridged excerpt) serializes a new, empty inline bucket,
// stores it under key in this bucket with the bucketLeafFlag element flag,
// and returns the bucket obtained by looking key up again.
//
// NOTE(review): the lookup/validation that precedes this code is omitted in
// this excerpt; `c` is not declared here and is presumably the cursor
// positioned by that omitted part — confirm against the full source.
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
// Create empty, inline bucket.
var bucket = Bucket{
bucket: &bucket{},
rootNode: &node{isLeaf: true},
FillPercent: DefaultFillPercent,
}
var value = bucket.write() // serialize the new bucket into the bytes stored as its value
// Insert into node.
key = cloneBytes(key)
// Store the key/value pair; the element is flagged with bucketLeafFlag and
// given pgid 0.
c.node().put(key, key, value, 0, bucketLeafFlag)
// Since subbuckets are not allowed on inline buckets, we need to
// dereference the inline page, if it exists. This will cause the bucket
// to be treated as a regular, non-inline bucket for the rest of the tx.
b.page = nil
return b.Bucket(key), nil
}
先看下是如何生成bucket的value的
value數據結構:
// write serializes the bucket into a freshly allocated byte slice: the
// bucket header comes first, followed by the root node written out as a
// page.
func (b *Bucket) write() []byte {
	root := b.rootNode

	// Room for the header plus the serialized root node.
	out := make([]byte, bucketHeaderSize+root.size())

	// Copy the bucket header into the front of the slice.
	*(*bucket)(unsafe.Pointer(&out[0])) = *b.bucket

	// Treat the bytes after the header as a page and write the root node
	// into it.
	inlinePage := (*page)(unsafe.Pointer(&out[bucketHeaderSize]))
	root.write(inlinePage)

	return out
}
// write serializes the node's inodes into page p: it sets the leaf/branch
// flag and the element count, then lays out one element header per inode
// followed by the key (and value) bytes. It panics when the node holds
// 0xFFFF or more inodes, since the count is stored as a uint16.
func (n *node) write(p *page) {
// Initialize page.
if n.isLeaf {
p.flags |= leafPageFlag // mark the page as a leaf
} else {
p.flags |= branchPageFlag // mark the page as a branch
}
if len(n.inodes) >= 0xFFFF {
panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
}
p.count = uint16(len(n.inodes)) // record the number of elements
// Stop here if there are no items to write.
if p.count == 0 {
return
}
// Walk every inode and write it into the page. b starts just past the
// array of element headers (element size * element count from p.ptr) and
// is advanced as key/value bytes are copied in.
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
for i, item := range n.inodes {
_assert(len(item.key) > 0, "write: zero-length inode key")
// Write the page element. pos is the byte distance from the element
// header to the current write position in b.
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.flags = item.flags
elem.ksize = uint32(len(item.key))
elem.vsize = uint32(len(item.value))
} else {
elem := p.branchPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.ksize = uint32(len(item.key))
elem.pgid = item.pgid
_assert(elem.pgid != p.id, "write: circular dependency occurred")
}
klen, vlen := len(item.key), len(item.value)
// If the remaining view is too short for this key+value, re-derive a
// maxAllocSize-long view starting at the current write position.
if len(b) < klen+vlen {
b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
}
// Write data for the element to the end of the page.
copy(b[0:], item.key)
b = b[klen:]
copy(b[0:], item.value)
b = b[vlen:]
}
// DEBUG ONLY: n.dump()
}
Bucket已經變成了value,再回到前面的CreateBucket函數:Cursor.node()函數用來獲取kv應該插入到哪個node節點。前面我們說過,seek函數會記錄key的查找軌跡,根據查找軌跡就知道該kv應該插入哪個node中。
// node returns the in-memory leaf node the cursor currently points at. If
// the top of the stack already holds a leaf node it is returned directly;
// otherwise the nodes along the recorded path are loaded from the root
// downwards.
func (c *Cursor) node() *node {
	_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")

	// Fast path: the top of the stack is already a materialized leaf node.
	last := len(c.stack) - 1
	if top := &c.stack[last]; top.node != nil && top.isLeaf() {
		return top.node
	}

	// Start at the root, loading it from its page when it has no node yet.
	cur := c.stack[0].node
	if cur == nil {
		cur = c.bucket.node(c.stack[0].page.id, nil)
	}

	// Follow the recorded child index at every level above the leaf.
	for level := 0; level < last; level++ {
		_assert(!cur.isLeaf, "expected branch node")
		cur = cur.childAt(int(c.stack[level].index))
	}

	_assert(cur.isLeaf, "expected leaf node")
	return cur
}
加載node的實際執行函數是Bucket.node()
// node returns the in-memory node for page pgid, creating it on first use.
// A new node is linked to parent (or becomes the bucket's rootNode when
// parent is nil), populated from the bucket's own page if it has one or
// otherwise from the transaction's page for pgid, and stored in b.nodes.
func (b *Bucket) node(pgid pgid, parent *node) *node {
	_assert(b.nodes != nil, "nodes map expected")

	// Already materialized: reuse it.
	if cached := b.nodes[pgid]; cached != nil {
		return cached
	}

	// Create the node and hook it into the tree.
	fresh := &node{bucket: b, parent: parent}
	if parent != nil {
		parent.children = append(parent.children, fresh)
	} else {
		b.rootNode = fresh
	}

	// Prefer the bucket's own page; fall back to the transaction's page for
	// this id.
	src := b.page
	if src == nil {
		src = b.tx.page(pgid)
	}

	// Populate the node from the page and cache it.
	fresh.read(src)
	b.nodes[pgid] = fresh

	// Update statistics.
	b.tx.stats.NodeCount++

	return fresh
}
到此,我們已經爲kv的插入準備好了一切,可以進行kv的插入操作了。kv插入是調用node.put()函數進行的:
// put inserts or replaces an inode in the node. The slot is found by binary
// search on oldKey: an exact match is overwritten in place, otherwise a new
// slot is opened at the sorted position. The slot then receives newKey,
// value, pgid and flags. It panics when pgid is not below the transaction's
// high water mark or when either key is empty.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	switch {
	case pgid >= n.bucket.tx.meta.pgid:
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	case len(oldKey) <= 0:
		panic("put: zero-length old key")
	case len(newKey) <= 0:
		panic("put: zero-length new key")
	}

	// First position whose key is not less than oldKey.
	pos := sort.Search(len(n.inodes), func(i int) bool {
		return bytes.Compare(n.inodes[i].key, oldKey) >= 0
	})

	// Without an exact match, grow by one and shift the tail right to open a
	// slot at pos.
	matched := pos < len(n.inodes) && bytes.Equal(n.inodes[pos].key, oldKey)
	if !matched {
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[pos+1:], n.inodes[pos:])
	}

	slot := &n.inodes[pos]
	slot.flags = flags
	slot.key = newKey
	slot.value = value
	slot.pgid = pgid
	_assert(len(slot.key) > 0, "put: zero-length inode key")
}
這函數比較簡單,找到合適的位置,插入kv。創建bucket完成