寫在前面:個人認爲pager層是SQLite實現最爲核心的模塊,它具有四大功能:I/O,頁面緩存,併發控制和日誌恢復。而這些功能不僅是上層Btree的基礎,而且對系統的性能和健壯性有着至關重要的影響。其中併發控制和日誌恢復是事務處理實現的基礎。SQLite併發控制的機制非常簡單——封鎖機制;另外,它的查詢優化機制也非常簡單——基於索引。這一切使得整個SQLite的實現變得簡單,SQLite變得很小,運行速度也非常快,所以,特別適合嵌入式設備。好了,接下來討論事務的剩餘部分。
6、修改位於用戶進程空間的頁面(Changing Database Pages In User Space)
頁面的原始數據寫入日誌之後,就可以修改頁面了——位於用戶進程空間。每個數據庫連接都有自己私有的空間,所以頁面的變化只對該連接可見,而其它連接看到的仍然是磁盤緩存中的數據。從這裏可以明白一件事:一個進程在修改頁面數據的同時,其它進程可以繼續進行讀操作。圖中的紅色表示修改的頁面。
7、日誌文件刷入磁盤(Flushing The Rollback Journal File To Mass Storage)
接下來把日誌文件的內容刷入磁盤,這對於數據庫從意外中恢復來說是至關重要的一步。而且這通常也是一個耗時的操作,因爲磁盤I/O速度很慢。
這個步驟不只把日誌文件刷入磁盤那麼簡單,它的實現實際上分成兩步:首先把日誌文件的內容刷入磁盤(即頁面數據);然後把日誌文件中頁面的數目寫入日誌文件頭,再把header刷入磁盤(這一過程在代碼中清晰可見)。
代碼如下:
/*
** Sync the rollback journal so that the journalled page data is safely
** in the journal file before the main database file is modified.
**
** Nothing is written unless pPager->needSync is set.  For a journal that
** is not a temp file, the record count (pPager->nRec) is stored in the
** journal header and the journal file is synced.  Afterwards needSync is
** cleared on the pager and on every page reachable through pPager->pAll,
** and pFirstSynced is reset to pFirst.
**
** Returns SQLITE_OK, or the first non-zero code returned by one of the
** sqlite3OsFileSize / sqlite3OsSync / sqlite3OsSeek / write32bits calls.
*/
static int syncJournal(Pager *pPager){
  PgHdr *pPage;
  int rc = SQLITE_OK;

  if( !pPager->needSync ){
#ifndef NDEBUG
    /* If the Pager.needSync flag is clear then the PgHdr.needSync
    ** flag must also be clear for all pages.  Verify that this
    ** invariant is true.
    */
    for(pPage=pPager->pAll; pPage; pPage=pPage->pNextAll){
      assert( pPage->needSync==0 );
    }
    assert( pPager->pFirstSynced==pPager->pFirst );
#endif
    return rc;
  }

  /* Sync the journal before modifying the main database.  Temp files
  ** skip the I/O and only have their flags reset below.
  */
  if( !pPager->tempFile ){
    assert( pPager->journalOpen );
    /* There is deliberately no assert( !pPager->noSync ) here: noSync
    ** might be set if synchronous was turned off after the transaction
    ** was started.  Ticket #615
    */
#ifndef NDEBUG
    {
      /* Debug-only check: the journal offset tracked by the pager must
      ** agree with the actual size of the journal file.
      */
      i64 nJournalBytes;
      rc = sqlite3OsFileSize(pPager->jfd, &nJournalBytes);
      if( rc!=0 ) return rc;
      assert( pPager->journalOff==nJournalBytes );
    }
#endif

    /* In full-synchronous mode, sync the journal first.  This ensures
    ** that all page data has really hit the disk before nRec is updated
    ** to mark it as a candidate for rollback.
    */
    if( pPager->fullSync ){
      TRACE2("SYNC journal of %d\n", PAGERID(pPager));
      rc = sqlite3OsSync(pPager->jfd, 0);
      if( rc!=0 ) return rc;
    }

    /* Write nRec into the journal header (the field located right after
    ** the magic bytes), then move the file position back to journalOff.
    */
    rc = sqlite3OsSeek(pPager->jfd,
                       pPager->journalHdr + sizeof(aJournalMagic));
    if( !rc ) rc = write32bits(pPager->jfd, pPager->nRec);
    if( !rc ) rc = sqlite3OsSeek(pPager->jfd, pPager->journalOff);
    if( rc ) return rc;

    /* Sync the journal again so the updated header is flushed as well. */
    TRACE2("SYNC journal of %d\n", PAGERID(pPager));
    rc = sqlite3OsSync(pPager->jfd, pPager->full_fsync);
    if( rc!=0 ) return rc;
    pPager->journalStarted = 1;
  }

  /* Erase the needSync flag from the pager and from every page. */
  pPager->needSync = 0;
  for(pPage=pPager->pAll; pPage; pPage=pPage->pNextAll){
    pPage->needSync = 0;
  }
  pPager->pFirstSynced = pPager->pFirst;

  return rc;
}
8、獲取排斥鎖(Obtaining An Exclusive Lock)
在對數據庫文件進行修改之前(注:這裏不是內存中的頁面),我們必須得到數據庫文件的排斥鎖(Exclusive Lock)。得到排斥鎖的過程可分爲兩步:首先得到Pending lock;然後Pending lock升級到exclusive lock。
Pending lock允許其它已經存在的Shared lock繼續讀數據庫文件,但是不允許產生新的shared lock,這樣做目的是爲了防止寫操作發生餓死情況。一旦所有的shared lock完成操作,則pending lock升級到exclusive lock。
9、修改的頁面寫入文件(Writing Changes To The Database File)
一旦得到exclusive lock,其它的進程就不能進行讀操作,此時就可以把修改的頁面寫回數據庫文件,但是通常OS都把結果暫時保存到磁盤緩存中,直到某個時刻纔會真正把結果寫入磁盤。
以上兩步的實現代碼:
//把所有的髒頁面寫入數據庫
//到這裏開始獲取EXCLUSIVE鎖,並將頁面寫回操作系統文件
/*
** Write every page on the dirty list pList (linked through PgHdr.pDirty)
** to the database file of the pager that owns the list.
**
** An EXCLUSIVE lock is obtained on the database file first.  Each page
** whose page number does not exceed Pager.dbSize is then written at
** offset (pgno-1)*pageSize, and its dirty flag is cleared.
**
** Returns SQLITE_OK (also for an empty list), or the first error code
** reported while locking, seeking or writing.
*/
static int pager_write_pagelist(PgHdr *pList){
  Pager *pPager;
  PgHdr *pPg;
  int rc;

  if( pList==0 ) return SQLITE_OK;
  pPager = pList->pPager;

  /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
  ** database file.  If there is already an EXCLUSIVE lock, the locking
  ** request below is a no-op.
  **
  ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
  ** through an intermediate state PENDING.  A PENDING lock prevents new
  ** readers from attaching to the database but is insufficient for us to
  ** write.  The idea of a PENDING lock is to prevent new readers from
  ** coming in while we wait for existing readers to clear.
  **
  ** While the pager is in the RESERVED state, the original database file
  ** is unchanged and we can rollback without having to playback the
  ** journal into the original database file.  Once we transition to
  ** EXCLUSIVE, it means the database file has been changed and any
  ** rollback will require a journal playback.
  */
  rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  if( rc!=SQLITE_OK ) return rc;

  for(pPg=pList; pPg; pPg=pPg->pDirty){
    i64 iOffset;

    assert( pPg->dirty );
    iOffset = (pPg->pgno-1)*(i64)pPager->pageSize;
    rc = sqlite3OsSeek(pPager->fd, iOffset);
    if( rc ) return rc;

    /* If there are dirty pages in the page cache with page numbers greater
    ** than Pager.dbSize, this means sqlite3pager_truncate() was called to
    ** make the file smaller (presumably by auto-vacuum code).  Do not
    ** write any such pages to the file.
    */
    if( pPg->pgno<=pPager->dbSize ){
      char *pData = CODEC2(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 6);
      TRACE3("STORE %d page %d\n", PAGERID(pPager), pPg->pgno);
      rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize);
      TEST_INCR(pPager->nWrite);
      if( rc ) return rc;
    }
#ifndef NDEBUG
    else{
      TRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pPg->pgno);
    }
#endif

    /* The page has been handled: clear its dirty flag. */
    pPg->dirty = 0;
#ifdef SQLITE_CHECK_PAGES
    pPg->pageHash = pager_pagehash(pPg);
#endif
  }
  return SQLITE_OK;
}
10、修改結果刷入存儲設備(Flushing Changes To Mass Storage)
爲了保證修改結果真正寫入磁盤,這一步必不可少。對於數據庫的完整性,這一步也是關鍵的一步。由於要進行實際的I/O操作,所以和第7步一樣,將花費較多的時間。
最後來看看這幾步是如何實現的:
其實以上幾步是在函數sqlite3BtreeSync()(位於btree.c)中調用的(而關於該函數的調用後面再講)。
代碼如下:
/*
** Sync the database file that backs Btree handle p.
**
** This does nothing and returns SQLITE_OK unless p is inside a write
** transaction.  Otherwise, when auto-vacuum is enabled for the shared
** btree, autoVacuumCommit() is run first and may set a truncation size;
** the pager is then asked to sync via sqlite3pager_sync(), receiving the
** master-journal name zMaster and that truncation size (0 if none).
**
** Returns the error from autoVacuumCommit() if it fails, otherwise the
** result of sqlite3pager_sync().
**
** Per the original annotation: once this returns, the caller only has to
** commit the write transaction and delete the journal file.
*/
int sqlite3BtreeSync(Btree *p, const char *zMaster){
  BtShared *pShared;
  Pgno nTruncate = 0;

  if( p->inTrans!=TRANS_WRITE ){
    return SQLITE_OK;
  }
  pShared = p->pBt;

#ifndef SQLITE_OMIT_AUTOVACUUM
  if( pShared->autoVacuum ){
    int rcVacuum = autoVacuumCommit(pShared, &nTruncate);
    if( rcVacuum!=SQLITE_OK ){
      return rcVacuum;
    }
  }
#endif

  /* Hand over to the pager layer to perform the actual sync. */
  return sqlite3pager_sync(pShared->pPager, zMaster, nTruncate);
}
/*
** Write all dirty pages of pPager back to the database file and sync it.
**
** Parameters:
**   pPager  - the pager to sync.
**   zMaster - master journal name, passed on to writeMasterJournal().
**   nTrunc  - if non-zero, the database is truncated to this many pages
**             via sqlite3pager_truncate().
**
** The main path runs only when the pager is not already in the
** PAGER_SYNCED state, is not an in-memory database (MEMDB) and has
** dirtyCache set.  In order, it:
**   1. (only if setMaster is clear) increments the change counter,
**      journals the pages that a truncation would discard, writes the
**      master journal name and syncs the journal;
**   2. truncates the database file when nTrunc!=0;
**   3. writes all dirty pages with pager_write_pagelist();
**   4. syncs the database file unless noSync is set;
**   5. sets the pager state to PAGER_SYNCED.
** For an in-memory database only the truncation (nTrunc!=0) is done.
**
** Returns SQLITE_OK or the first error code hit; every failure before
** step 4 jumps to sync_exit and leaves the state unchanged.
**
** NOTE(review): the result of the final sqlite3OsSync() is returned but
** not checked before state is set to PAGER_SYNCED - confirm intended.
*/
int sqlite3pager_sync(Pager *pPager, const char *zMaster, Pgno nTrunc){
int rc = SQLITE_OK;
TRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n",
pPager->zFilename, zMaster, nTrunc);
/* If this is an in-memory db, or no pages have been written to, or this
** function has already been called, it is a no-op.
*/
/* Do the sync only when the pager is not in the PAGER_SYNCED state, is
** not an in-memory db, and dirtyCache is set.
*/
if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
PgHdr *pPg;
assert( pPager->journalOpen );
/* If a master journal file name has already been written to the
** journal file, then no sync is required. This happens when it is
** written, then the process fails to upgrade from a RESERVED to an
** EXCLUSIVE lock. The next time the process tries to commit the
** transaction the m-j name will have already been written.
*/
if( !pPager->setMaster ){
/* Increment the pager's change counter */
rc = pager_incr_changecounter(pPager);
if( rc!=SQLITE_OK ) goto sync_exit;
#ifndef SQLITE_OMIT_AUTOVACUUM
if( nTrunc!=0 ){
/* If this transaction has made the database smaller, then all pages
** being discarded by the truncation must be written to the journal
** file.
**
** Pages nTrunc+1..origDbSize are visited; a page is skipped when its
** bit is already set in the aInJournal bitmap or when it is the page
** number given by PAGER_MJ_PGNO().
*/
Pgno i;
void *pPage;
int iSkip = PAGER_MJ_PGNO(pPager);
for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
if( !(pPager->aInJournal[i/8] & (1<<(i&7))) && i!=iSkip ){
rc = sqlite3pager_get(pPager, i, &pPage);
if( rc!=SQLITE_OK ) goto sync_exit;
rc = sqlite3pager_write(pPage);
/* Release the page reference before checking rc so it is dropped on
** the error path too.
*/
sqlite3pager_unref(pPage);
if( rc!=SQLITE_OK ) goto sync_exit;
}
}
}
#endif
rc = writeMasterJournal(pPager, zMaster);
if( rc!=SQLITE_OK ) goto sync_exit;
/* Sync the journal file */
rc = syncJournal(pPager);
if( rc!=SQLITE_OK ) goto sync_exit;
}
#ifndef SQLITE_OMIT_AUTOVACUUM
if( nTrunc!=0 ){
rc = sqlite3pager_truncate(pPager, nTrunc);
if( rc!=SQLITE_OK ) goto sync_exit;
}
#endif
/* Write all dirty pages to the database file */
pPg = pager_get_all_dirty_pages(pPager);
/* Hand the dirty-page list to pager_write_pagelist() for writing */
rc = pager_write_pagelist(pPg);
if( rc!=SQLITE_OK ) goto sync_exit;
/* Sync the database file. */
/* Skipped when the pager's noSync flag is set */
if( !pPager->noSync ){
rc = sqlite3OsSync(pPager->fd, 0);
}
pPager->state = PAGER_SYNCED;
}else if( MEMDB && nTrunc!=0 ){
rc = sqlite3pager_truncate(pPager, nTrunc);
}
sync_exit:
return rc;
}
下圖可以進一步解釋該過程: