Bolt源碼解析(二):文件初始化、創建bucket

Bolt的數據是存儲在db文件中,bolt操作始於db文件。

1、db文件初始化和加載流程

open函數實現db文件的打開

// Open opens the database file at path, creating it when it does not exist,
// and returns a *DB that is ready for use.
//
// mode is forwarded to os.OpenFile and to flock. A nil options value makes
// Open fall back to DefaultOptions.
//
// Once the file has been opened, every failure path calls db.close() before
// returning, so the file handle and the file lock acquired here are released
// instead of leaking. A non-nil error is always paired with a nil *DB.
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
	var db = &DB{opened: true}

	// Set default options if no options are provided.
	if options == nil {
		options = DefaultOptions
	}
	db.NoGrowSync = options.NoGrowSync
	db.MmapFlags = options.MmapFlags

	// Set default values for later DB operations.
	db.MaxBatchSize = DefaultMaxBatchSize
	db.MaxBatchDelay = DefaultMaxBatchDelay
	db.AllocSize = DefaultAllocSize

	// Pick the open flag; a read-only open is also remembered on the DB.
	flag := os.O_RDWR
	if options.ReadOnly {
		flag = os.O_RDONLY
		db.readOnly = true
	}

	// Open the data file (O_CREATE is always added to the flag).
	db.path = path
	var err error
	if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
		// The close error is dropped on purpose: the open error is the one
		// the caller needs to see.
		_ = db.close()
		return nil, err
	}

	// Lock the file so that other processes using Bolt in read-write mode
	// cannot use the database at the same time. The third argument requests
	// an exclusive lock unless the database was opened read-only, in which
	// case a shared lock is taken (several processes may hold it at once).
	if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
		_ = db.close()
		return nil, err
	}

	// Default values for test hooks.
	db.ops.writeAt = db.file.WriteAt

	// Initialize the database if it doesn't exist.
	if info, err := db.file.Stat(); err != nil {
		// Release the file handle and lock acquired above.
		_ = db.close()
		return nil, err
	} else if info.Size() == 0 {
		// Empty file: write the initial pages.
		if err := db.init(); err != nil {
			// Release the file handle and lock acquired above.
			_ = db.close()
			return nil, err
		}
	} else {
		// Existing file: read the first meta page to determine the page size.
		// NOTE(review): when ReadAt itself fails, db.pageSize is left
		// untouched here; confirm the later mmap step rejects such files.
		var buf [0x1000]byte
		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
			m := db.pageInBuffer(buf[:], 0).meta()
			if err := m.validate(); err != nil {
				// If we can't read the page size, we can assume it's the same
				// as the OS -- since that's how the page size was chosen in the
				// first place.
				//
				// If the first page is invalid and this OS uses a different
				// page size than what the database was created with then we
				// are out of luck and cannot access the database.
				db.pageSize = os.Getpagesize()
			} else {
				db.pageSize = int(m.pageSize)
			}
		}
	}

	// Initialize the pool of page-sized byte buffers.
	db.pagePool = sync.Pool{
		New: func() interface{} {
			return make([]byte, db.pageSize)
		},
	}

	// Memory-map the data file.
	if err := db.mmap(options.InitialMmapSize); err != nil {
		_ = db.close()
		return nil, err
	}

	// Load the free pages into the freelist.
	db.freelist = newFreelist()
	db.freelist.read(db.page(db.meta().freelist))

	// The database was already marked as opened when db was constructed.
	return db, nil
}

如果之前不存在db文件,會調用init進行初始化;否則讀取文件頭部4k數據,從中取出page大小等信息以完成後續加載。在函數尾部進行內存的映射和空閒page的加載。

// init lays out a brand-new database file. It builds four pages in one
// buffer -- two meta pages (ids 0 and 1), an empty freelist page (id 2) and
// an empty leaf page (id 3) -- writes the buffer at offset 0 and syncs the
// file. Any write or sync error is returned unchanged.
func (db *DB) init() error {
	// A new file adopts the page size of the operating system.
	db.pageSize = os.Getpagesize()

	// One buffer holds all four initial pages.
	buf := make([]byte, db.pageSize*4)

	// Pages 0 and 1: the two meta pages.
	for i := 0; i < 2; i++ {
		id := pgid(i)
		pg := db.pageInBuffer(buf[:], id)
		pg.id = id
		pg.flags = metaPageFlag

		m := pg.meta()
		m.magic = magic
		m.version = version
		m.pageSize = uint32(db.pageSize)
		m.freelist = 2           // id of the freelist page
		m.root = bucket{root: 3} // id of the root bucket's page
		m.pgid = 4               // page id high-water mark
		m.txid = txid(i)
		// The checksum is taken last, after every other field is set.
		m.checksum = m.sum64()
	}

	// Page 2: an empty freelist.
	freePg := db.pageInBuffer(buf[:], pgid(2))
	freePg.id = pgid(2)
	freePg.flags = freelistPageFlag
	freePg.count = 0

	// Page 3: an empty leaf page.
	leafPg := db.pageInBuffer(buf[:], pgid(3))
	leafPg.id = pgid(3)
	leafPg.flags = leafPageFlag
	leafPg.count = 0

	// Write the buffer to the start of the file, then sync it.
	if _, err := db.ops.writeAt(buf, 0); err != nil {
		return err
	}
	return fdatasync(db)
}

可以看到open函數比較簡單,如果是第一次使用會open文件然後將固定格式寫入前四個page。

2、查詢/創建bucket

查詢bucket是創建的前提,創建前都會查詢bucket是否存在,如果存在會返回錯誤,如果不存在纔會創建bucket。查詢操作在事務內進行。

// Bucket looks up a bucket by name by delegating to the Bucket method of the
// transaction's root bucket and returns whatever that lookup yields.
func (tx *Tx) Bucket(name []byte) *Bucket {
	found := tx.root.Bucket(name)
	return found
}
// Bucket returns the nested bucket stored under name. It returns nil when the
// key found by the cursor differs from name or is not flagged as a bucket.
// When b keeps a bucket cache, the opened bucket is stored in it.
func (b *Bucket) Bucket(name []byte) *Bucket {
	// Serve the lookup from the cache when possible. Indexing a nil map is
	// safe and yields nil, so no separate nil check is needed for the read.
	if cached := b.buckets[string(name)]; cached != nil {
		return cached
	}

	// Position a cursor at name within this bucket.
	cur := b.Cursor()
	k, v, flags := cur.seek(name)

	// Give up unless the cursor landed on exactly this key and the entry
	// carries the bucket flag.
	if flags&bucketLeafFlag == 0 || !bytes.Equal(name, k) {
		return nil
	}

	// Open the bucket from its stored value and remember it if caching is on.
	sub := b.openBucket(v)
	if b.buckets != nil {
		b.buckets[string(name)] = sub
	}
	return sub
}

seek是查詢的核心函數,seek中會使用二分查找在b+tree中查找對應的key。因爲b+tree的節點有可能已被緩存到內存,所以查找過程會在node(內存節點)和page之間切換。

// seek resets the cursor stack, searches from the bucket's root for the given
// key and returns the key, value and flags at the resulting position. When the
// cursor ends up past the last element of the final page/node it returns
// nil, nil, 0.
func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
	_assert(c.bucket.tx.db != nil, "tx closed")

	// Drop any previous path and walk down again from the root.
	c.stack = c.stack[:0]
	c.search(seek, c.bucket.root)

	// The last stack entry is where the search stopped.
	top := &c.stack[len(c.stack)-1]
	if top.index < top.count() {
		return c.keyValue()
	}

	// The position lies beyond the end of the page/node.
	return nil, nil, 0
}

// search fetches the page or node for pgid, pushes it onto the cursor stack
// and continues the lookup of key: within the element itself when it is a
// leaf, otherwise through the in-memory node if one was returned, or through
// the page. It panics when a returned page is neither a branch nor a leaf.
func (c *Cursor) search(key []byte, pgid pgid) {
	p, n := c.bucket.pageNode(pgid)
	if p != nil {
		if kind := p.flags & (branchPageFlag | leafPageFlag); kind == 0 {
			panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags))
		}
	}

	// Record this step of the search path.
	ref := elemRef{page: p, node: n}
	c.stack = append(c.stack, ref)

	switch {
	case ref.isLeaf():
		// Leaf reached: locate the key inside it.
		c.nsearch(key)
	case n != nil:
		// Continue through the in-memory node.
		c.searchNode(key, n)
	default:
		// Continue through the page.
		c.searchPage(key, p)
	}
}

search是個遞歸調用的函數,優先從node中查詢,如果node不存在再去page中查詢,直到查詢到葉子節點,在葉子節點中再查找最終的value。看完seek查詢我們再回到查詢bucket的函數,當查詢到value後調用openBucket()函數打開bucket。

// openBucket builds a Bucket from the raw value stored for it. value begins
// with the bucket header; when the header's root is 0 the bytes after
// bucketHeaderSize are referenced as the bucket's page.
func (b *Bucket) openBucket(value []byte) *Bucket {
	var sub = newBucket(b.tx)

	// If unaligned load/stores are broken on this arch and value is
	// unaligned, work on an aligned clone of the bytes instead.
	unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
	if unaligned {
		value = cloneBytes(value)
	}

	// View the start of value as a bucket header.
	hdr := (*bucket)(unsafe.Pointer(&value[0]))
	if unaligned || !b.tx.writable {
		// Read-only transactions, and values that were just cloned, point
		// directly at the header bytes.
		sub.bucket = hdr
	} else {
		// A writable transaction gets its own copy of the header.
		copied := *hdr
		sub.bucket = &copied
	}

	// A root of 0 means the bucket is inline: keep a reference to the page
	// that follows the header inside value.
	if sub.root == 0 {
		sub.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	}

	return &sub
}

創建bucket前半段是查詢以及一些檢查,略過不看。直接看關鍵部分

// CreateBucket creates a new, empty inline bucket under key in b and returns
// it by looking it up again through b.Bucket(key).
//
// NOTE(review): this listing is an excerpt. The lookup and validation that
// normally come first are omitted, so the cursor variable c used below is not
// declared in the visible lines.
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
    
    // Create empty, inline bucket.
    var bucket = Bucket{
        bucket:      &bucket{},
        rootNode:    &node{isLeaf: true},
        FillPercent: DefaultFillPercent,
    }
    var value = bucket.write()// serialize the new bucket into its stored value

    // Insert into node.
key = cloneBytes(key)
// Store the key/value pair, flagging the entry with bucketLeafFlag.
    c.node().put(key, key, value, 0, bucketLeafFlag)

    // Since subbuckets are not allowed on inline buckets, we need to
    // dereference the inline page, if it exists. This will cause the bucket
    // to be treated as a regular, non-inline bucket for the rest of the tx.
    b.page = nil

    return b.Bucket(key), nil
}

先看下是如何生成bucket的value的

value數據結構:

// write serializes the bucket into a freshly allocated byte slice: the bucket
// header first, followed by the root node written as a page.
func (b *Bucket) write() []byte {
	// Size the buffer for the header plus the serialized root node.
	root := b.rootNode
	buf := make([]byte, bucketHeaderSize+root.size())

	// Copy the bucket header into the front of the buffer.
	hdr := (*bucket)(unsafe.Pointer(&buf[0]))
	*hdr = *b.bucket

	// Treat the bytes after the header as a page and let the root node
	// write itself into it.
	inline := (*page)(unsafe.Pointer(&buf[bucketHeaderSize]))
	root.write(inline)

	return buf
}

// write serializes the node into page p: it sets the leaf/branch flag and the
// element count, then for every inode writes an element header followed by
// the key and value bytes. It panics when the node holds 0xFFFF or more
// inodes.
func (n *node) write(p *page) {
    // Initialize page.
    if n.isLeaf {
        p.flags |= leafPageFlag// mark the page as a leaf page
    } else {
        p.flags |= branchPageFlag// mark the page as a branch page
    }

    // The count is stored as a uint16, so refuse nodes with too many inodes.
    if len(n.inodes) >= 0xFFFF {
        panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
    }
    p.count = uint16(len(n.inodes)) // record the number of elements

    // Stop here if there are no items to write.
    if p.count == 0 {
        return
    }

    // Loop over every inode and write it to the page. b is a byte window
    // that starts right after the element headers (pageElementSize per
    // inode); key and value bytes are appended there.
    b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
    for i, item := range n.inodes {
        _assert(len(item.key) > 0, "write: zero-length inode key")

        // Write the page element. pos is the distance from the element
        // header to the current write position in b.
        if n.isLeaf {
            elem := p.leafPageElement(uint16(i))
            elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
            elem.flags = item.flags
            elem.ksize = uint32(len(item.key))
            elem.vsize = uint32(len(item.value))
        } else {
            elem := p.branchPageElement(uint16(i))
            elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
            elem.ksize = uint32(len(item.key))
            elem.pgid = item.pgid
            _assert(elem.pgid != p.id, "write: circular dependency occurred")
        }
        // If the remaining window is shorter than this key plus value,
        // re-anchor a full maxAllocSize window at the current write position.
        klen, vlen := len(item.key), len(item.value)
        if len(b) < klen+vlen {
            b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
        }

        // Write data for the element to the end of the page.
        copy(b[0:], item.key)
        b = b[klen:]
        copy(b[0:], item.value)
        b = b[vlen:]
    }

    // DEBUG ONLY: n.dump()
}

Bucket已經變成了value,再回到前面的CreateBucket函數,Cursor.node()函數獲取kv應該插入到哪個node節點。前面我們說過在seek函數中會記錄key的查找軌跡,根據查找軌跡就知道該kv應該插入哪個node中。

// node returns the leaf node the cursor currently points at. If the top of
// the stack already holds a leaf node it is returned directly; otherwise the
// nodes along the recorded path are loaded from the root downwards by
// following each stack entry's index.
func (c *Cursor) node() *node {
	_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")

	// Fast path: the top of the stack is already a leaf node.
	last := len(c.stack) - 1
	if top := &c.stack[last]; top.node != nil && top.isLeaf() {
		return top.node
	}

	// Start from the root; load it from its page when it is not yet a node.
	cur := c.stack[0].node
	if cur == nil {
		cur = c.bucket.node(c.stack[0].page.id, nil)
	}

	// Descend along the recorded path, loading each child on the way.
	path := c.stack[:last]
	for i := range path {
		_assert(!cur.isLeaf, "expected branch node")
		cur = cur.childAt(int(path[i].index))
	}
	_assert(cur.isLeaf, "expected leaf node")
	return cur
}

加載的node的實際執行函數是Bucket.node()

// node returns the in-memory node for pgid, creating and caching it when it
// is not in b.nodes yet. A new node is linked to parent, or becomes the
// bucket's root node when parent is nil, and is filled from the bucket's own
// page if it has one, otherwise from the transaction's page for pgid.
func (b *Bucket) node(pgid pgid, parent *node) *node {
	_assert(b.nodes != nil, "nodes map expected")

	// Already cached: return it as is.
	if cached := b.nodes[pgid]; cached != nil {
		return cached
	}

	// Create the node and hook it into the tree.
	fresh := &node{bucket: b, parent: parent}
	if parent != nil {
		parent.children = append(parent.children, fresh)
	} else {
		b.rootNode = fresh
	}

	// Prefer the bucket's own page (set for inline buckets); otherwise ask
	// the transaction for the page.
	src := b.page
	if src == nil {
		src = b.tx.page(pgid)
	}

	// Populate the node from the page, then cache it.
	fresh.read(src)
	b.nodes[pgid] = fresh

	// Update statistics.
	b.tx.stats.NodeCount++

	return fresh
}

到此,我們已經爲kv的插入準備好了一切,可以進行kv的插入操作了,kv插入是調用node.put()函數進行:

// put stores an entry in the node. The slot is located by oldKey: an inode
// whose key equals oldKey is overwritten, otherwise a new inode is inserted
// at the sorted position. The slot then receives newKey, value, pgid and
// flags. It panics when pgid is at or above the transaction's page high-water
// mark, or when oldKey or newKey is empty.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	switch {
	case pgid >= n.bucket.tx.meta.pgid:
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	case len(oldKey) == 0:
		panic("put: zero-length old key")
	case len(newKey) == 0:
		panic("put: zero-length new key")
	}

	// Binary-search for the first inode whose key is not less than oldKey.
	pos := sort.Search(len(n.inodes), func(i int) bool {
		return bytes.Compare(n.inodes[i].key, oldKey) >= 0
	})

	// Without an exact match, grow the slice by one and shift the tail right
	// to open a slot at pos.
	found := pos < len(n.inodes) && bytes.Equal(n.inodes[pos].key, oldKey)
	if !found {
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[pos+1:], n.inodes[pos:])
	}

	// Fill the slot.
	slot := &n.inodes[pos]
	slot.flags = flags
	slot.key = newKey
	slot.value = value
	slot.pgid = pgid
	_assert(len(slot.key) > 0, "put: zero-length inode key")
}

這函數比較簡單,找到合適的位置,插入kv。創建bucket完成

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章