Mongodb源碼分析--刪除記錄

在之前的一篇文章中，介紹了assembleResponse函數(位於instance.cpp第224行)，它會根據op操作枚舉類型來調用相應的crud操作，枚舉類型定義如下：

/* Operation codes that a message can carry; the request dispatcher compares
   the message's op against these values to pick a handler. */
enum Operations {
opReply = 1, /* reply. responseTo is set. */
dbMsg = 1000, /* generic msg command followed by a string */
dbUpdate = 2001, /* update object */
dbInsert = 2002, /* insert */
//dbGetByOID = 2003,
dbQuery = 2004, /* query */
dbGetMore = 2005, /* get more */
dbDelete = 2006, /* delete */
dbKillCursors = 2007 /* kill cursors */
};

可以看到dbDelete = 2006 爲刪除操作枚舉值。當客戶端將要刪除的記錄(或條件的document)發到服務端之後，mongodb通過消息封裝方式將數據包中的字節流解析轉成 message類型，並進一步轉換成dbmessage，然後根據消息類型進行判斷，以決定接下來執行的操作。下面我們看一下 assembleResponse在確定是刪除操作時調用的方法，如下：

view plain copy to clipboard print ?

/* Excerpt only: the "....." lines stand for code that was elided.
   Shows how the op code selects the handler for each write operation. */
assembleResponse( Message &m, DbResponse &dbresponse, const SockAddr &client ) {
.....
try {
if ( op == dbInsert ) { // insert records
receivedInsert(m, currentOp);
}
else if ( op == dbUpdate ) { // update records
receivedUpdate(m, currentOp);
}
else if ( op == dbDelete ) { // delete records
receivedDelete(m, currentOp);
}
else if ( op == dbKillCursors ) { // kill cursor objects
currentOp.ensureStarted();
logThreshold = 10;
ss << "killcursors ";
receivedKillCursors(m);
}
else {
// any other op code is reported as unsupported
mongo::log() << " operation isn't supported: " << op << endl;
currentOp.done();
log = true;
}
}
.....
}
}

從上面代碼可以看出，系統在確定dbDelete操作時，調用了receivedDelete()方法（位於instance.cpp文件第323行），下面是該方法的定義：

view plain copy to clipboard print ?

/**
 * Handles a delete message: reads the namespace, the flags word and the
 * selector from the message, then deletes the matching documents while
 * holding a write lock on the namespace.
 *
 * @param m   incoming message carrying the delete request
 * @param op  current-operation tracker; the namespace and query are recorded on it
 */
void receivedDelete(Message & m, CurOp & op) {
    DbMessage d(m); // view the Message as a database message
    const char * ns = d.getns(); // target namespace
    assert( * ns);
    uassert( 10056 , " not master " , isMasterNs( ns ) ); // reject unless isMasterNs() accepts this namespace
    op.debug().str << ns << ' ' ;

    // Flags word of the delete message. RemoveOption_JustOne: delete only the
    // first matching document instead of every match. RemoveOption_Broadcast:
    // skip the sharded-message check below.
    // Message layout: http://www.cnblogs.com/daizhj/archive/2011/04/02/2003335.html
    int flags = d.pullInt();
    bool justOne = flags & RemoveOption_JustOne;
    bool broadcast = flags & RemoveOption_Broadcast;

    assert( d.moreJSObjs() );
    BSONObj pattern = d.nextJsObj(); // selector, i.e. the "where" condition
    {
        string s = pattern.toString();
        op.debug().str << " query: " << s;
        op.setQuery(pattern);
    }

    writelock lk(ns);

    // Logical && instead of the former bitwise &: with & the call to
    // handlePossibleShardedMessage() was evaluated even for broadcast deletes,
    // so its side effects ran although its result was then ignored.
    if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
        return ;

    // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
    Client::Context ctx(ns);

    long long n = deleteObjects(ns, pattern, justOne, true ); // logop = true
    lastError.getSafe() -> recordDelete( n ); // record the number of deleted documents
}

上面方法主要是對消息中的flag信息進行解析，以獲取消息中的刪除條件等信息，並最終調用 deleteObjects方法，該方法位於query.cpp文件中，如下：

view plain copy to clipboard print ?

// query.cpp, line 128
/**
 * Deletes the records of a namespace that match a pattern.
 *
 * @param ns           namespace to delete from (e.g. <database>.<collection>)
 * @param pattern      delete criteria, i.e. the "where" clause
 * @param justOneOrig  stop after the first matching record has been deleted
 * @param logop        log every delete through logOp()
 * @param god          skip the system-namespace / '$' checks; also turns off
 *                     yielding and the per-record commitIfNeeded() call
 * @param rs           optional; when non-null each object is handed to
 *                     goingToDelete() before it is removed
 * @return number of records deleted (0 when the namespace does not exist)
 */
long long deleteObjects( const char * ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
    if ( ! god ) {
        // The literals below must be exactly ".system." and '$': padded
        // variants (" .system. ", ' $ ') never match a real namespace and
        // would silently disable both guards.
        if ( strstr(ns, ".system.") ) {
            /* note a delete from system.indexes would corrupt the db. if done here, as there are pointers into those objects in NamespaceDetails.
            */
            uassert( 12050 , " cannot delete from system namespace " , legalClientSystemNS( ns , true ) );
        }
        if ( strchr( ns , '$' ) ) {
            log() << " cannot delete from collection with reserved $ in name: " << ns << endl;
            uassert( 10100 , " cannot delete from collection with reserved $ in name " , strchr(ns, '$' ) == 0 );
        }
    }

    NamespaceDetails * d = nsdetails( ns ); // namespace details; null when the namespace is unknown
    if ( ! d )
        return 0 ;
    uassert( 10101 , " can't remove from a capped collection " , ! d -> capped ); // capped collections may not be deleted from

    long long nDeleted = 0 ;
    int best = 0 ;
    // Build the delete operation and the MultiCursor that drives the scan.
    shared_ptr < MultiCursor::CursorOp > opPtr( new DeleteOp( justOneOrig, best ) );
    shared_ptr < MultiCursor > creal( new MultiCursor( ns, pattern, BSONObj(), opPtr, ! god ) );
    if ( ! creal -> ok() ) // nothing to iterate
        return nDeleted;

    shared_ptr < Cursor > cPtr = creal;
    auto_ptr < ClientCursor > cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) ); // wrap the cursor for the loop below
    cc -> setDoingDeletes( true );
    CursorId id = cc -> cursorid();

    bool justOne = justOneOrig;
    bool canYield = ! god && ! creal -> matcher() -> docMatcher().atomic();

    do {
        if ( canYield && ! cc -> yieldSometimes() ) {
            // yieldSometimes() failed: give up ownership without deleting the
            // ClientCursor (presumably it was destroyed during the yield -- TODO confirm)
            cc.release();
            // TODO should we assert or something?
            break ;
        }
        if ( ! cc -> ok() ) {
            break ; // if we yielded, could have hit the end
        }

        // this way we can avoid calling updateLocation() every time (expensive)
        // as well as some other nuances handled
        cc -> setDoingDeletes( true );

        DiskLoc rloc = cc -> currLoc(); // location of the record under the cursor
        BSONObj key = cc -> currKey(); // key of the record under the cursor

        // NOTE Calling advance() may change the matcher, so it's important
        // to try to match first.
        bool match = creal -> matcher() -> matches( key , rloc );

        if ( ! cc -> advance() ) // no further record: the current one is the last candidate
            justOne = true ;

        if ( ! match )
            continue ;

        assert( ! cc -> c() -> getsetdup(rloc) ); // the same location must not be seen twice

        if ( ! justOne ) {
            /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore
            blocks. here we might call millions of times which would be bad.
            */
            cc -> c() -> noteLocation(); // remember the cursor position before the delete
        }

        if ( logop ) {
            BSONElement e;
            if ( BSONObj( rloc.rec() ).getObjectID( e ) ) {
                BSONObjBuilder b;
                b.append( e );
                bool replJustOne = true ;
                // op type must be exactly "d" (delete); the padded " d " was a formatting artifact
                logOp( "d" , ns, b.done(), 0 , & replJustOne );
            }
            else {
                problem() << " deleted object without id, not logging " << endl;
            }
        }

        if ( rs ) // let the saver see the object before it disappears
            rs -> goingToDelete( rloc.obj() /* cc->c->current() */ );

        theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc); // remove the matched record
        nDeleted ++ ;
        if ( justOne ) {
            break ;
        }
        cc -> c() -> checkLocation(); // re-validate the cursor position after the delete

        if ( ! god )
            getDur().commitIfNeeded();

        if ( debug && god && nDeleted == 100 ) // one-time warning at the 100th delete in god mode
            log() << " warning high number of deletes with god=true which could use significant memory " << endl;
    }
    while ( cc -> ok() );

    // If no ClientCursor is registered under this id any more, release
    // ownership so the auto_ptr does not delete it on scope exit.
    if ( cc. get () && ClientCursor::find( id , false ) == 0 ) {
        cc.release();
    }
    return nDeleted;
}

上面的代碼主要執行構造查詢遊標，並將遊標指向地址的記錄取出來與查詢條件進行匹配，如果匹配命中，則進行刪除。這裏考慮到如果記錄在內存時，如果刪除記錄後，內存中的b樹結構會有影響，所以在刪除記錄前/後分別執行noteLocation/checkLocation方法以校正查詢cursor的當前位置。因爲這裏是一個while循環，它會找到所有滿足條件的記錄，依次刪除它們。因爲這裏使用了MultiCursor，該遊標在我看來就是一個複合遊標，它不僅包括了cursor 中所有功能，還支持or條件操作。而有關遊標的構造和繼承實現體系，mongodb做的有些複雜，很難幾句說清，我會在本系列後面另用篇幅進行說明，敬請期待 。
注意上面代碼段中的這行代碼：

view plain copy to clipboard print ?

theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc); // 刪除查詢匹配到的記錄
該行代碼執行了最終的刪除記錄操作，其定義如下：
// pdfile.cpp文件 912行
// 刪除查詢匹配查詢到的記錄
void DataFileMgr::deleteRecord( const char * ns, Record * todelete, const DiskLoc & dl, bool cappedOK, bool noWarn) {
dassert( todelete == dl.rec() ); // debug斷言，檢查要刪除的Record信息與傳入的dl是否一致（避免函數調用過程中被修改?）
NamespaceDetails * d = nsdetails(ns);
if ( d -> capped && ! cappedOK ) { // 如果是capped collection類型，則不刪除
out () << " failing remove on a capped ns " << ns << endl;
uassert( 10089 , " can't remove from a capped collection " , 0 );
return ;
}
// 如果還有別的遊標指向當前dl（併發情況下），則提升它們
ClientCursor::aboutToDelete(dl);
// 將要刪除的記錄信息從索引b村中移除
unindexRecord(d, todelete, dl, noWarn);
// 刪除指定記錄信息
_deleteRecord(d, ns, todelete, dl);
NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
}

上面刪除記錄方法deleteRecord中，執行的刪除順序與我之前寫的那篇插入記錄方式正好相反（那篇文章中是先在內存中分配記錄然後將地址放到b樹中），這裏是先將要刪除記錄的索引信息刪除，然後再刪除指定記錄（更新內存中的記錄信息而不是真的刪除，稍後會進行解釋）。

首先我們先看一下上面代碼段的unindexRecord方法：

view plain copy to clipboard print ?

// pdfile.cpp, line 845
/* Removes the keys of one record from every index of the namespace
   (multikey indexes included), plus from the index that is currently being
   built in the background, if there is one. */
static void unindexRecord(NamespaceDetails * d, Record * todelete, const DiskLoc & dl, bool noWarn = false ) {
    BSONObj doc(todelete);
    const int indexCount = d -> nIndexes;
    const bool logMissing = ! noWarn;

    int idxNo = 0 ;
    while ( idxNo < indexCount ) {
        _unindexRecord(d -> idx(idxNo), doc, dl, logMissing);
        ++ idxNo;
    }

    if ( ! d -> indexBuildInProgress )
        return ;

    // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
    _unindexRecord(d -> idx(indexCount), doc, dl, false );
}
// pdfile.cpp, line 815
/**
 * unindex all keys in index for this record.
 *
 * @param id          index to remove the keys from
 * @param obj         the record's object; its keys are extracted with getKeysFromObject()
 * @param dl          disk location of the record
 * @param logMissing  report keys that could not be unindexed
 */
static void _unindexRecord(IndexDetails & id, BSONObj & obj, const DiskLoc & dl, bool logMissing = true ) {
    BSONObjSetDefaultOrder keys;
    id.getKeysFromObject(obj, keys); // one entry per index key of this record

    for ( BSONObjSetDefaultOrder::iterator i = keys.begin(); i != keys.end(); ++ i ) {
        const BSONObj & j = * i;
        if ( otherTraceLevel >= 5 ) { // otherTraceLevel is declared elsewhere (query.cpp per the article)
            out () << " _unindexRecord() " << obj.toString();
            out () << "\n unindex: " << j.toString() << endl;
        }
        nUnindexes ++ ; // count unindex attempts

        bool ok = false ;
        try {
            ok = id.head.btree() -> unindex(id.head, id, j, dl); // drop this key from the btree
        }
        catch ( const AssertionException & e ) {
            // Newlines are written as '\n'; the mangled ' /n ' was a
            // multi-character literal and printed as an integer.
            problem() << " Assertion failure: _unindex failed " << id.indexNamespace() << endl;
            out () << " Assertion failure: _unindex failed: " << e.what() << '\n';
            out () << " obj: " << obj.toString() << '\n';
            out () << " key: " << j.toString() << '\n';
            out () << " dl: " << dl.toString() << endl;
            sayDbContext();
        }

        if ( ! ok && logMissing ) {
            out () << " unindex failed (key too big?) " << id.indexNamespace() << '\n';
        }
    }
}

上面代碼主要是把要刪除的記錄的B樹鍵值信息取出，然後通過循環（可能存在多鍵索引，具體參見我之前插入記錄那篇文章中B樹索引構造的相關內容）刪除相應B樹索引信息，下面代碼段就是在B樹中查找(locate)並最終刪除（delKeyAtPos）的邏輯：

view plain copy to clipboard print ?

// btree.cpp文件 1116行
/* 從索引中移除鍵值 */
bool BtreeBucket::unindex( const DiskLoc thisLoc, IndexDetails & id, const BSONObj & key, const DiskLoc recordLoc ) const {
if ( key.objsize() > KeyMax ) { // 判斷鍵值是否大於限制
OCCASIONALLY problem() << " unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ endl;
return false ;
}
int pos;
bool found;
DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1 ); // 從btree bucket中查找指定記錄並獲得位置信息（pos）
if ( found ) {
loc.btreemod() -> delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern())); // 刪除指定位置的記錄信息
return true ;
}
return false ;
}

在刪除b樹索引之後，接着就是“刪除內存（或磁盤，因爲mmap機制）中的記錄”了，也就是之前DataFileMgr::deleteRecord()方法的下面代碼：

view plain copy to clipboard print ?

_deleteRecord(d, ns, todelete, dl)

其定義如下：

view plain copy to clipboard print ?

//pdfile.cpp, line 859
/* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
caller must check if capped

Steps, in order: unlink the record from the prev/next record chain, fix the
extent's firstRecord/lastRecord pointers, update the namespace stats, then
either zero the record (system.indexes) or hand it to addDeletedRec().
Modified locations are obtained through getDur() before being written.
*/
void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
/* remove ourself from the record next/prev chain */
{
if ( todelete->prevOfs != DiskLoc::NullOfs )// a previous record exists: make its nextOfs skip over us
getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
if ( todelete->nextOfs != DiskLoc::NullOfs )// a next record exists: make its prevOfs skip over us
getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
}
// (article author's note) an extent is a region of a data file; all records in it belong to the same namespace
/* remove ourself from extent pointers */
{
Extent *e = getDur().writing( todelete->myExtent(dl) );
if ( e->firstRecord == dl ) {// the record is the first one of its extent
if ( todelete->nextOfs == DiskLoc::NullOfs )// ... and has no successor
e->firstRecord.Null();// the extent has no first record any more
else// otherwise the successor becomes the first record
e->firstRecord.set(dl.a(), todelete->nextOfs);
}
if ( e->lastRecord == dl ) {// the record is the last one of its extent
if ( todelete->prevOfs == DiskLoc::NullOfs )// ... and has no predecessor
e->lastRecord.Null();// the extent has no last record any more
else // otherwise the predecessor becomes the last record
e->lastRecord.set(dl.a(), todelete->prevOfs);
}
}
/* add to the free list */
{
{// update the namespace statistics
NamespaceDetails::Stats *s = getDur().writing(&d->stats);
s->datasize -= todelete->netLength();
s->nrecords--;
}
if ( strstr(ns, ".system.indexes") ) {// index namespace: zero the whole record (lengthWithHeaders bytes) and do not reuse it
/* temp: if in system.indexes, don't reuse, and zero out: we want to be
careful until validated more, as IndexDetails has pointers
to this disk location. so an incorrectly done remove would cause
a lot of problems.
*/
memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
}
else {
DEV {
// DEV only: zero the first 8 bytes of the record data
unsigned long long *p = (unsigned long long *) todelete->data;
*getDur().writing(p) = 0;
//DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
}
d->addDeletedRec((DeletedRecord*)todelete, dl);// put the record on the namespace's deleted-record list
}
}
}

這裏有一個數據結構要先解釋一下，因爲mongodb在刪除記錄時並不是真把記錄從內存中remove出來，而是將該刪除記錄數據置空（寫0或特殊數字加以標識）同時將該記錄所在地址放到一個list列表中，也就是上面代碼註釋中所說的“釋放列表”，這樣做的好處就是如果有用戶要執行插入記錄操作時，mongodb會首先從該“釋放列表”中獲取size合適的“已刪除記錄”地址返回，這種廢物利用 的方法會提升性能（避免了malloc內存操作），同時mongodb也使用了bucket size數組來定義多個大小size不同的列表，用於將要刪除的記錄根據其size大小放到合適的“釋放列表”中（deletedList），有關該 deletedList內容，詳見namespace.h文件中的註釋內容。
上面代碼中，如果記錄所在的ns 包含".system.indexes"（即索引名空間），則使用memset方法將該記錄數據清零，並且不會重用該記錄；否則才執行將記錄添加到“釋放列表”的操作（addDeletedRec），如下：

view plain copy to clipboard print ?

/* Puts a deleted record onto one of this namespace's deleted-record lists.
   d:    the record, viewed as a DeletedRecord
   dloc: disk location of that record
   Capped namespaces use the capped* list helpers; all other namespaces push
   the record onto the front of the deletedList bucket chosen by its size.
   Modified locations are obtained through getDur() before being written. */
void NamespaceDetails::addDeletedRec(DeletedRecord * d, DiskLoc dloc) {
BOOST_STATIC_ASSERT( sizeof (NamespaceDetails::Extra) <= sizeof (NamespaceDetails) );
{
Record * r = (Record * ) getDur().writingPtr(d, sizeof (Record));
d = & r -> asDeleted(); // view the record as a DeletedRecord
// (article author's note) guards against use of the deleted record
(unsigned & ) (r -> data) = 0xeeeeeeee ; // overwrite the start of the record data with a marker value
}
DEBUGGING log() << " TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d -> extentOfs << endl;
if ( capped ) { // capped collection
if ( ! cappedLastDelRecLastExtent().isValid() ) {
// Initial extent allocation. Insert at end.
d -> nextDeleted = DiskLoc();
if ( cappedListOfAllDeletedRecords().isNull() ) // the list of all deleted records is empty
getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc; // dloc becomes the head of that list
else {
DiskLoc i = cappedListOfAllDeletedRecords(); // list is not empty: start at its head
for (; ! i.drec() -> nextDeleted.isNull(); i = i.drec() -> nextDeleted ) // walk to the last entry
;
i.drec() -> nextDeleted.writing() = dloc; // append dloc at the tail
}
}
else {
d -> nextDeleted = cappedFirstDeletedInCurExtent(); // the current first entry becomes our successor
getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc; // dloc becomes the new first entry
// always compact() after this so order doesn't matter
}
}
else {
int b = bucket(d -> lengthWithHeaders); // index of the size bucket for this record's length (see bucketSizes)
DiskLoc & list = deletedList[b]; // head of the deleted list for that bucket
DiskLoc oldHead = list; // remember the current head
getDur().writingDiskLoc(list) = dloc; // dloc becomes the new head of the list
d -> nextDeleted = oldHead; // chain the previous head behind it, forming a linked list
}
}

這樣，就完成了將記錄放到“釋放列表”中的操作，上面的bucket中提供的大小款式 如下：

view plain copy to clipboard print ?

// namespace.cpp, line 37
/* Size classes of the deleted lists. Deleted lists are linked lists of
   deleted records; they are kept in 'buckets' of various sizes so that a
   deleted record of about the right size can be looked up.
   Every entry is twice the previous one, from 0x20 (32) up to 0x800000.
*/
int bucketSizes[] = {
    0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000,
    0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000,
    0x400000, 0x800000
};

最後，用一張時序圖回顧一下刪除記錄時mongodb服務端代碼的執行流程：

     好了，今天的內容到這裏就告一段落了，在接下來的文章中，將會介紹客戶端發起Update操作時，Mongodb的執行流程和相應實現部分。

   原文鏈接:http://www.cnblogs.com/daizhj/archive/2011/04/06/2006740.html
    作者: daizhj, 代震軍
    微博: http://t.sina.com.cn/daizhj
    Tags: mongodb,c++,source code

jackfirst86

發佈了27 篇原創文章 · 獲贊 1 · 訪問量 7萬+

私信關注

Mongodb源碼分析--刪除記錄

自動分詞算法的分類

ME, HMM, MEMM, CRF

一個基於搜索的中文分詞方法( A Search-based Chinese Word Segmentation Method)

最大熵模型文獻閱讀指南

Mongodb源碼分析--插入記錄及索引B樹構建

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結