SQLite入門與分析(四)---Page Cache之事務處理(2)

寫在前面:個人認爲pager層是SQLite實現最爲核心的模塊,它具有四大功能:I/O,頁面緩存,併發控制和日誌恢復。而這些功能不僅是上層Btree的基礎,而且對系統的性能和健壯性有關至關重要的影響。其中併發控制和日誌恢復是事務處理實現的基礎。SQLite併發控制的機制非常簡單——封鎖機制;別外,它的查詢優化機制也非常簡單——基於索引。這一切使得整個SQLite的實現變得簡單,SQLite變得很小,運行速度也非常快,所以,特別適合嵌入式設備。好了,接下來討論事務的剩餘部分。
6、修改位於用戶進程空間的頁面(Changing Database Pages In User Space)
頁面的原始數據寫入日誌之後,就可以修改頁面了——位於用戶進程空間。每個數據庫連接都有自己私有的空間,所以頁面的變化只對該連接可見,而對其它連接的數據仍然是磁盤緩存中的數據。從這裏可以明白一件事:一個進程在修改頁面數據的同時,其它進程可以繼續進行讀操作。圖中的紅色表示修改的頁面。


7、日誌文件刷入磁盤(Flushing The Rollback Journal File To Mass Storage)
接下來把日誌文件的內容刷入磁盤,這對於數據庫從意外中恢復來說是至關重要的一步。而且這通常也是一個耗時的操作,因爲磁盤I/O速度很慢。
這個步驟不只把日誌文件刷入磁盤那麼簡單,它的實現實際上分成兩步:首先把日誌文件的內容刷入磁盤(即頁面數據);然後把日誌文件中頁面的數目寫入日誌文件頭,再把header刷入磁盤(這一過程在代碼中清晰可見)。

 

代碼如下:

 

Code
/*
**Sync日誌文件,保證所有的髒頁面寫入磁盤日誌文件
*/
static int syncJournal(Pager *pPager){
  PgHdr *pPg;
  int rc = SQLITE_OK;

  /* Sync the journal before modifying the main database
  ** (assuming there is a journal and it needs to be synced.)
  */
  if( pPager->needSync ){
    if( !pPager->tempFile ){
      assert( pPager->journalOpen );
      /* assert( !pPager->noSync ); // noSync might be set if synchronous
      ** was turned off after the transaction was started.  Ticket #615 */
#ifndef NDEBUG
      {
        /* Make sure the pPager->nRec counter we are keeping agrees
        ** with the nRec computed from the size of the journal file.
        */
        i64 jSz;
        rc = sqlite3OsFileSize(pPager->jfd, &jSz);
        if( rc!=0 ) return rc;
        assert( pPager->journalOff==jSz );
      }
#endif
      {
        /* Write the nRec value into the journal file header. If in
        ** full-synchronous mode, sync the journal first. This ensures that
        ** all data has really hit the disk before nRec is updated to mark
        ** it as a candidate for rollback. 
        */
        if( pPager->fullSync ){
          TRACE2("SYNC journal of %d\n", PAGERID(pPager));
    //首先保證髒頁面中所有的數據都已經寫入日誌文件
          rc = sqlite3OsSync(pPager->jfd, 0);
          if( rc!=0 ) return rc;
        }
        rc = sqlite3OsSeek(pPager->jfd,
                           pPager->journalHdr + sizeof(aJournalMagic));
        if( rc ) return rc;
    //頁面的數目寫入日誌文件
        rc = write32bits(pPager->jfd, pPager->nRec);
        if( rc ) return rc;

        rc = sqlite3OsSeek(pPager->jfd, pPager->journalOff);
        if( rc ) return rc;
      }
      TRACE2("SYNC journal of %d\n", PAGERID(pPager));
      rc = sqlite3OsSync(pPager->jfd, pPager->full_fsync);
      if( rc!=0 ) return rc;
      pPager->journalStarted = 1;
    }
    pPager->needSync = 0;

    /* Erase the needSync flag from every page.
    */
    //清除needSync標誌位
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      pPg->needSync = 0;
    }
    pPager->pFirstSynced = pPager->pFirst;
  }

#ifndef NDEBUG
  /* If the Pager.needSync flag is clear then the PgHdr.needSync
  ** flag must also be clear for all pages.  Verify that this
  ** invariant is true.
  */
  else{
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      assert( pPg->needSync==0 );
    }
    assert( pPager->pFirstSynced==pPager->pFirst );
  }
#endif

  return rc;
}


 

8、獲取排斥鎖(Obtaining An Exclusive Lock)
在對數據庫文件進行修改之前(注:這裏不是內存中的頁面),我們必須得到數據庫文件的排斥鎖(Exclusive Lock)。得到排斥鎖的過程可分爲兩步:首先得到Pending lock;然後Pending lock升級到exclusive lock。
Pending lock允許其它已經存在的Shared lock繼續讀數據庫文件,但是不允許產生新的shared lock,這樣做目的是爲了防止寫操作發生餓死情況。一旦所有的shared lock完成操作,則pending lock升級到exclusive lock。

 

9、修改的頁面寫入文件(Writing Changes To The Database File)
一旦得到exclusive lock,其它的進程就不能進行讀操作,此時就可以把修改的頁面寫回數據庫文件,但是通常OS都把結果暫時保存到磁盤緩存中,直到某個時刻纔會真正把結果寫入磁盤。

 

以上兩步的實現代碼:

 

Code
/把所有的髒頁面寫入數據庫
//到這裏開始獲取EXCLUSIVEQ鎖,並將頁面寫回操作系統文件
static int pager_write_pagelist(PgHdr *pList){
  Pager *pPager;
  int rc;

  if( pList==0 ) return SQLITE_OK;
  pPager = pList->pPager;

  /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
  ** database file. If there is already an EXCLUSIVE lock, the following
  ** calls to sqlite3OsLock() are no-ops.
  **
  ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
  ** through an intermediate state PENDING.   A PENDING lock prevents new
  ** readers from attaching to the database but is unsufficient for us to
  ** write.  The idea of a PENDING lock is to prevent new readers from
  ** coming in while we wait for existing readers to clear.
  **
  ** While the pager is in the RESERVED state, the original database file
  ** is unchanged and we can rollback without having to playback the
  ** journal into the original database file.  Once we transition to
  ** EXCLUSIVE, it means the database file has been changed and any rollback
  ** will require a journal playback.
  */
  //加EXCLUSIVE_LOCK鎖
  rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  while( pList ){
    assert( pList->dirty );
    rc = sqlite3OsSeek(pPager->fd, (pList->pgno-1)*(i64)pPager->pageSize);
    if( rc ) return rc;
    /* If there are dirty pages in the page cache with page numbers greater
    ** than Pager.dbSize, this means sqlite3pager_truncate() was called to
    ** make the file smaller (presumably by auto-vacuum code). Do not write
    ** any such pages to the file.
    */
    if( pList->pgno<=pPager->dbSize ){
      char *pData = CODEC2(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
      TRACE3("STORE %d page %d\n", PAGERID(pPager), pList->pgno);
      //寫入文件
      rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize);
      TEST_INCR(pPager->nWrite);
    }
#ifndef NDEBUG
    else{
      TRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
    }
#endif
    if( rc ) return rc;
    //設置dirty
    pList->dirty = 0;
#ifdef SQLITE_CHECK_PAGES
    pList->pageHash = pager_pagehash(pList);
#endif
//指向下一個髒頁面
    pList = pList->pDirty;
  }
  return SQLITE_OK;
}


 

10、修改結果刷入存儲設備(Flushing Changes To Mass Storage)
爲了保證修改結果真正寫入磁盤,這一步必不要少。對於數據庫存的完整性,這一步也是關鍵的一步。由於要進行實際的I/O操作,所以和第7步一樣,將花費較多的時間。

最後來看看這幾步是如何實現的:

其實以上以上幾步是在函數sqlite3BtreeSync()---btree.c中調用的(而關於該函數的調用後面再講)。

代碼如下:

 

Code
//同步btree對應的數據庫文件
//該函數返回之後,只需要提交寫事務,刪除日誌文件
int sqlite3BtreeSync(Btree *p, const char *zMaster){
  int rc = SQLITE_OK;
  if( p->inTrans==TRANS_WRITE ){
    BtShared *pBt = p->pBt;
    Pgno nTrunc = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      rc = autoVacuumCommit(pBt, &nTrunc); 
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
#endif

 //調用pager進行sync
    rc = sqlite3pager_sync(pBt->pPager, zMaster, nTrunc);
  }
  return rc;
}

//把pager所有髒頁面寫回文件
int sqlite3pager_sync(Pager *pPager, const char *zMaster, Pgno nTrunc){
  int rc = SQLITE_OK;

  TRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n", 
      pPager->zFilename, zMaster, nTrunc);

  /* If this is an in-memory db, or no pages have been written to, or this
  ** function has already been called, it is a no-op.
  */
  //pager不處於PAGER_SYNCED狀態,dirtyCache爲1,
  //則進行sync操作
  if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
    PgHdr *pPg;
    assert( pPager->journalOpen );

    /* If a master journal file name has already been written to the
    ** journal file, then no sync is required. This happens when it is
    ** written, then the process fails to upgrade from a RESERVED to an
    ** EXCLUSIVE lock. The next time the process tries to commit the
    ** transaction the m-j name will have already been written.
    */
    if( !pPager->setMaster ){
        //pager修改計數
      rc = pager_incr_changecounter(pPager);
      if( rc!=SQLITE_OK ) goto sync_exit;
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( nTrunc!=0 ){
        /* If this transaction has made the database smaller, then all pages
        ** being discarded by the truncation must be written to the journal
        ** file.
        */
        Pgno i;
        void *pPage;
        int iSkip = PAGER_MJ_PGNO(pPager);
        for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
          if( !(pPager->aInJournal[i/8] & (1<<(i&7))) && i!=iSkip ){
            rc = sqlite3pager_get(pPager, i, &pPage);
            if( rc!=SQLITE_OK ) goto sync_exit;
            rc = sqlite3pager_write(pPage);
            sqlite3pager_unref(pPage);
            if( rc!=SQLITE_OK ) goto sync_exit;
          }
        } 
      }
#endif
      rc = writeMasterJournal(pPager, zMaster);
      if( rc!=SQLITE_OK ) goto sync_exit;
      
        //sync日誌文件
      rc = syncJournal(pPager);
      if( rc!=SQLITE_OK ) goto sync_exit;
    }

#ifndef SQLITE_OMIT_AUTOVACUUM
    if( nTrunc!=0 ){
      rc = sqlite3pager_truncate(pPager, nTrunc);
      if( rc!=SQLITE_OK ) goto sync_exit;
    }
#endif

    /* Write all dirty pages to the database file */
    pPg = pager_get_all_dirty_pages(pPager);


   //把所有髒頁面寫回操作系統文件
    rc = pager_write_pagelist(pPg);
    if( rc!=SQLITE_OK ) goto sync_exit;

    /* Sync the database file. */
       //sync數據庫文件
    if( !pPager->noSync ){
      rc = sqlite3OsSync(pPager->fd, 0);
    }

    pPager->state = PAGER_SYNCED;
  }else if( MEMDB && nTrunc!=0 ){
    rc = sqlite3pager_truncate(pPager, nTrunc);
  }

sync_exit:
  return rc;
}


 

下圖可以進一步解釋該過程:

 


發佈了17 篇原創文章 · 獲贊 31 · 訪問量 14萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章