Reading the Linux kernel source: flashcache, Facebook's disk-acceleration tool (part 1)

I have never written a source-code walkthrough before, and the urge to do one has kept growing; my writing is nothing special, but I have made up my mind to sit down and do a serious one this time.
For the source code download, see my previous post, "My take on flashcache": http://blog.csdn.net/liumangxiong/article/details/11643473
The code below corresponds to version 1.0 under the repository tags.

Reading a kernel module's source, you can open flashcache_init practically with your eyes closed; a mere hundred-odd lines of code, nothing to fear.
int __init
flashcache_init(void)
{
	int r;

	r = flashcache_jobs_init();
	if (r)
		return r;
	atomic_set(&nr_cache_jobs, 0);
	atomic_set(&nr_pending_jobs, 0);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
	INIT_WORK(&_kcached_wq, do_work, NULL);
#else
	INIT_WORK(&_kcached_wq, do_work);
#endif
	for (r = 0 ; r < 33 ; r++)
		size_hist[r] = 0;
	r = dm_register_target(&flashcache_target);
	if (r < 0) {
		DMERR("cache: register failed %d", r);
	}
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	flashcache_table_header = 
		register_sysctl_table(flashcache_root_table, 1);
#else
	flashcache_table_header = 
		register_sysctl_table(flashcache_root_table);
#endif
	{
		struct proc_dir_entry *entry;
		
		entry = create_proc_entry("flashcache_stats", 0, NULL);
		if (entry)
			entry->proc_fops =  &flashcache_stats_operations;
		entry = create_proc_entry("flashcache_errors", 0, NULL);
		if (entry)
			entry->proc_fops =  &flashcache_errors_operations;
		entry = create_proc_entry("flashcache_iosize_hist", 0, NULL);
		if (entry)
			entry->proc_fops =  &flashcache_iosize_hist_operations;
		entry = create_proc_entry("flashcache_pidlists", 0, NULL);
		if (entry)
			entry->proc_fops =  &flashcache_pidlists_operations;
		entry = create_proc_entry("flashcache_version", 0, NULL);
		if (entry)
			entry->proc_fops =  &flashcache_version_operations;
	}
#endif
	flashcache_control = (struct flashcache_control_s *)
		kmalloc(sizeof(struct flashcache_control_s *), GFP_KERNEL);
	flashcache_control->synch_flags = 0;
	register_reboot_notifier(&flashcache_notifier);
	return r;
}

At a first glance: flashcache_jobs_init() allocates the memory structures for jobs, INIT_WORK initializes the work item, the proc-related calls obviously create files under /proc, then a flashcache_control_s management structure is allocated, and finally a reboot notifier (a shutdown callback) is registered.
Reading the function that way is just skimming past it like glancing at flowers from a galloping horse; how is the person who wrote the code supposed to feel about that?
Ask yourself again: what does flashcache actually do? The mind is still a blank. So next, let's dig into each function and find out.
static int
flashcache_jobs_init(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_job_cache = kmem_cache_create("kcached-jobs",
	                               sizeof(struct kcached_job),
	                               __alignof__(struct kcached_job),
	                               0, NULL, NULL);
#else
	_job_cache = kmem_cache_create("kcached-jobs",
	                               sizeof(struct kcached_job),
	                               __alignof__(struct kcached_job),
	                               0, NULL);
#endif
	if (!_job_cache)
		return -ENOMEM;

	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
	                           mempool_free_slab, _job_cache);
	if (!_job_pool) {
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	_pending_job_cache = kmem_cache_create("pending-jobs",
				       sizeof(struct pending_job),
				       __alignof__(struct pending_job),
				       0, NULL, NULL);
#else
	_pending_job_cache = kmem_cache_create("pending-jobs",
				       sizeof(struct pending_job),
				       __alignof__(struct pending_job),
				       0, NULL);
#endif
	if (!_pending_job_cache) {
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	_pending_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
					   mempool_free_slab, _pending_job_cache);
	if (!_pending_job_pool) {
		kmem_cache_destroy(_pending_job_cache);
		mempool_destroy(_job_pool);
		kmem_cache_destroy(_job_cache);
		return -ENOMEM;
	}

	return 0;
}



First comes flashcache_jobs_init(). It creates two kinds of job caches and two mempools; like twins they look alike, but they are not the same.
_job_pool => flashcache_alloc_cache_job => new_kcached_job. new_kcached_job has quite a few callers: flashcache_dirty_writeback, flashcache_read_hit, flashcache_read_miss, flashcache_write_miss, flashcache_write_hit, flashcache_dirty_writeback_sync and flashcache_start_uncached_io. Look closely at these function names and you will see that they are exactly the basic operations and actions of a write cache: writeback, writethrough, hit and miss.
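Before following one of those paths, here is a minimal sketch of how a job presumably travels through the pool created above. The real flashcache_alloc_cache_job() and new_kcached_job() carry more bookkeeping (counters, the dmc pointer, the bio, the cache/disk regions), so treat everything except _job_pool as illustrative:

/* Sketch only: allocate/release a kcached_job via the mempool set up in
 * flashcache_jobs_init(). GFP_NOIO is the usual choice on an I/O path so
 * the allocation cannot recurse into writeback. */
static struct kcached_job *
sketch_alloc_cache_job(void)
{
	return mempool_alloc(_job_pool, GFP_NOIO);
}

static void
sketch_free_cache_job(struct kcached_job *job)
{
	mempool_free(job, _job_pool);
}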
Now let's take flashcache_dirty_writeback as an example and see what role the kcached_job actually plays.
static void
flashcache_dirty_writeback(struct cache_c *dmc, int index)
{
	struct kcached_job *job;
	unsigned long flags;
	struct cacheblock *cacheblk = &dmc->cache[index];
	int device_removal = 0;
	
	DPRINTK("flashcache_dirty_writeback: Index %d", index);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY((cacheblk->cache_state & BLOCK_IO_INPROG) == DISKWRITEINPROG);
	VERIFY(cacheblk->cache_state & DIRTY);
	dmc->cache_sets[index / dmc->assoc].clean_inprog++;
	dmc->clean_inprog++;
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	job = new_kcached_job(dmc, NULL, index);
	if (unlikely(sysctl_flashcache_error_inject & DIRTY_WRITEBACK_JOB_ALLOC_FAIL)) {
		if (job)
			flashcache_free_cache_job(job);
		job = NULL;
		sysctl_flashcache_error_inject &= ~DIRTY_WRITEBACK_JOB_ALLOC_FAIL;
	}
	/*
	 * If the device is being (fast) removed, do not kick off any more cleanings.
	 */
	if (unlikely(atomic_read(&dmc->fast_remove_in_prog))) {
		DMERR("flashcache: Dirty Writeback (for set cleaning) aborted for device removal, block %lu", 
		      cacheblk->dbn);
		if (job)
			flashcache_free_cache_job(job);
		job = NULL;
		device_removal = 1;
	}
	if (unlikely(job == NULL)) {
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		flashcache_free_pending_jobs(dmc, cacheblk, -EIO);
		cacheblk->cache_state &= ~(BLOCK_IO_INPROG);
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		if (device_removal == 0)
			DMERR("flashcache: Dirty Writeback (for set cleaning) failed ! Can't allocate memory, block %lu", 
			      cacheblk->dbn);
	} else {
		job->bio = NULL;
		job->action = WRITEDISK;
		atomic_inc(&dmc->nr_jobs);
		dmc->ssd_reads++;
		dmc->disk_writes++;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
		kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0, 
			    flashcache_kcopyd_callback, job);
#else
		dm_kcopyd_copy(dmc->kcp_client, &job->cache, 1, &job->disk, 0, 
			       (dm_kcopyd_notify_fn) flashcache_kcopyd_callback, 
			       (void *)job);
#endif
	}
}

First, new_kcached_job is used to allocate a kcached_job structure. Next the code checks dmc->fast_remove_in_prog, the flag that marks the flashcache device as being removed: if the device is about to be deleted, there is obviously no point in issuing any more commands. Then job is checked against NULL, and the else branch is where the real work gets done. The most important statement there is job->action = WRITEDISK; — one of the basic write-cache operations discussed above. The action field can be viewed as a state machine, with the following states:
/* kcached/pending job states */
#define READCACHE	1
#define WRITECACHE	2
#define READDISK	3
#define WRITEDISK	4
#define READFILL	5	/* Read Cache Miss Fill */
#define INVALIDATE	6
#define WRITEDISK_SYNC	7

The action set here is WRITEDISK, i.e. write to the disk. Write from where? From the write cache. And where does the write cache's data live? We use the SSD as the write cache, so the data goes from the SSD to the disk. Does that mean we have a lot of work to do, first reading the data off the SSD and then writing it to the disk? Yes, but we do not have to do much of it ourselves, because the Linux kernel has the famous kcopyd thread: we simply hand this tedious work over to kcopyd. The interface is:
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
             unsigned int num_dests, struct dm_io_region *dests,
             unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
The first parameter is the kcopyd_client, created in flashcache_ctr, the constructor that builds a flashcache device, so every flashcache device has its own kcopyd_client. Why create this structure at all? Simply think of it as the handle for using the kcopyd service. The second parameter is the data source, the third is the number of destinations, the fourth is the destination regions to write, and the fifth is an extra flags field, set to 0 here. The sixth parameter fn is the completion callback: if a callback is given, the call is asynchronous and does not block; if fn is NULL, the call waits synchronously. The last parameter context is handed to the callback, and what gets passed here is precisely the job we care about most.
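For reference, a hedged sketch of how the source and destination regions handed to dm_kcopyd_copy might be filled in. In the real code this happens inside new_kcached_job(); the INDEX_TO_CACHE_ADDR macro and the disk_dev/block_size field usage below are assumptions, not copied from the source:

/* Sketch only: job->cache is the source region on the SSD, job->disk the
 * destination region on the backing disk. Names flagged "assumed" are not
 * taken from the flashcache source. */
static void
sketch_fill_writeback_regions(struct cache_c *dmc, struct kcached_job *job, int index)
{
	job->cache.bdev = dmc->cache_dev->bdev;			/* the SSD */
	job->cache.sector = INDEX_TO_CACHE_ADDR(dmc, index);	/* assumed index-to-sector mapping */
	job->cache.count = dmc->block_size;			/* assumed: cache block size, in sectors */

	job->disk.bdev = dmc->disk_dev->bdev;			/* assumed: the backing disk */
	job->disk.sector = dmc->cache[index].dbn;		/* the block's original disk location */
	job->disk.count = dmc->block_size;
}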
We have dispatched the kcached_job; next, let's see when it comes back, what it does when it comes back, and how it finally gets destroyed.
The callback set in dm_kcopyd_copy is flashcache_kcopyd_callback.
static void
flashcache_kcopyd_callback(int read_err, unsigned int write_err, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;
	struct cache_c *dmc = job->dmc;
	int index = job->index;
	unsigned long flags;

	VERIFY(!in_interrupt());
	DPRINTK("kcopyd_callback: Index %d", index);
	VERIFY(job->bio == NULL);
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	VERIFY(dmc->cache[index].cache_state & (DISKWRITEINPROG | VALID | DIRTY));
	if (unlikely(sysctl_flashcache_error_inject & KCOPYD_CALLBACK_ERROR)) {
		read_err = -EIO;
		sysctl_flashcache_error_inject &= ~KCOPYD_CALLBACK_ERROR;
	}
	if (likely(read_err == 0 && write_err == 0)) {
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write(job);
	} else {
		/* Disk write failed. We can not purge this block from flash */
		DMERR("flashcache: Disk writeback failed ! read error %d write error %d block %lu", 
		      -read_err, -write_err, job->disk.sector);
		VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
		VERIFY(dmc->clean_inprog > 0);
		dmc->cache_sets[index / dmc->assoc].clean_inprog--;
		dmc->clean_inprog--;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		/* Set the error in the job and let do_pending() handle the error */
		if (read_err) {
			dmc->ssd_read_errors++;
			job->error = read_err;
		} else {
			dmc->disk_write_errors++;
			job->error = write_err;
		}
		flashcache_do_pending(job);
		flashcache_clean_set(dmc, index / dmc->assoc); /* Kick off more cleanings */
		dmc->cleanings++;
	}
}

Reaching this point means the write-cache data has finished being written to the disk. The callback first checks whether everything succeeded; if both the read and the write went through, it calls flashcache_md_write.
/* 
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the 
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that 
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */
void
flashcache_md_write(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	unsigned long flags;
	
	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || 
	       job->action == WRITEDISK_SYNC);
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/* If a write is in progress for this metadata sector, queue this update up */
	if (md_sector_head->nr_in_prog != 0) {
		struct kcached_job **nodepp;
		
		/* A MD update is already in progress, queue this one up for later */
		nodepp = &md_sector_head->pending_jobs;
		while (*nodepp != NULL)
			nodepp = &((*nodepp)->next);
		job->next = NULL;
		*nodepp = job;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	} else {
		md_sector_head->nr_in_prog = 1;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		flashcache_md_write_kickoff(job);
	}
}

If a function has a comment, do read it carefully. From what I have seen, the folks writing Linux kernel code treat words like gold: if one of them bothers to write a comment, reading the comment is definitely more important and more meaningful than reading the code, and if there is documentation, the documentation matters most of all. Seeing a comment here is a real delight; after reading it you could almost skip the code. But for a rookie like me, sometimes the master's intent does not fully come through, so I keep reading the code anyway.
/* 
 * Kick off a cache metadata update (called from workqueue).
 * Cache metadata update IOs to a given metadata sector are serialized using the 
 * nr_in_prog bit in the md sector bufhead.
 * If a metadata IO is already in progress, we queue up incoming metadata updates
 * on the pending_jobs list of the md sector bufhead. When kicking off an IO, we
 * cluster all these pending updates and do all of them as 1 flash write (that 
 * logic is in md_write_kickoff), where it switches out the entire pending_jobs
 * list and does all of those updates.
 */

To restate the comment with a few notes of my own: kick off a cache metadata update (the comment says "called from workqueue", but we arrived here from the kcopyd callback — a friendly reminder that in the kernel you must pay close attention to the context you are called from; it is required homework for reading kernel code and sometimes the key to cracking hard problems). Updates to cache metadata are serialized by the nr_in_prog field of struct cache_md_sector_head (that is, metadata updates are ordered: while an earlier update is still in flight, later updates have to queue). The queued kcached_jobs hang off the pending_jobs list of the cache_md_sector_head, and when the in-flight update returns, all the queued updates are kicked off together as a single flash write (they all target the flashcache management structures within the same sector).
It is fine if this paragraph is not entirely clear yet, because we have not covered flashcache's data layout. But one thing must be clear: in flashcache_dirty_writeback we flushed dirty data from the SSD write cache to the disk; what has to be done here is to flush that dirty block's metadata from memory to the SSD, which guarantees that after an unexpected power loss the metadata can be recovered from the SSD.
At this point the kcached_job still has not been destroyed, so let's keep following it: flashcache_md_write => flashcache_md_write_kickoff.
static void
flashcache_md_write_kickoff(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct flash_cacheblock *md_sector;
	int md_sector_ix;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	struct io_region where;
#else
	struct dm_io_region where;
#endif
	int i;
	struct cache_md_sector_head *md_sector_head;
	struct kcached_job *orig_job = job;
	unsigned long flags;

	if (flashcache_alloc_md_sector(job)) {
		DMERR("flashcache: %d: Cache metadata write failed, cannot alloc page ! block %lu", 
		      job->action, job->disk.sector);
		flashcache_md_write_callback(-EIO, job);
		return;
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/*
	 * Transfer whatever is on the pending queue to the md_io_inprog queue.
	 */
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	md_sector_head->md_io_inprog = md_sector_head->pending_jobs;
	md_sector_head->pending_jobs = NULL;
	md_sector = job->md_sector;
	md_sector_ix = INDEX_TO_MD_SECTOR(job->index) * MD_BLOCKS_PER_SECTOR;
	/* First copy out the entire sector */
	for (i = 0 ; 
	     i < MD_BLOCKS_PER_SECTOR && md_sector_ix < dmc->size ; 
	     i++, md_sector_ix++) {
		md_sector[i].dbn = dmc->cache[md_sector_ix].dbn;
#ifdef FLASHCACHE_DO_CHECKSUMS
		md_sector[i].checksum = dmc->cache[md_sector_ix].checksum;
#endif
		md_sector[i].cache_state = 
			dmc->cache[md_sector_ix].cache_state & (VALID | INVALID | DIRTY);
	}
	/* Then set/clear the DIRTY bit for the "current" index */
	if (job->action == WRITECACHE) {
		/* DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = 
			(VALID | DIRTY);
	} else { /* job->action == WRITEDISK* */
		/* un-DIRTY the cache block */
		md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
	}

	for (job = md_sector_head->md_io_inprog ; 
	     job != NULL ;
	     job = job->next) {
		if (job->action == WRITECACHE) {
			/* DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = 
				(VALID | DIRTY);
		} else { /* job->action == WRITEDISK* */
			/* un-DIRTY the cache block */
			md_sector[INDEX_TO_MD_SECTOR_OFFSET(job->index)].cache_state = VALID;
		}
	}
	spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	where.bdev = dmc->cache_dev->bdev;
	where.count = 1;
	where.sector = 1 + INDEX_TO_MD_SECTOR(orig_job->index);
	dmc->ssd_writes++;
	dm_io_async_bvec(1, &where, WRITE,
			 &orig_job->md_io_bvec,
			 flashcache_md_write_callback, orig_job);
	flashcache_unplug_device(dmc->cache_dev->bdev);
}

Here the cacheblock information is written into the page behind job->md_io_bvec, and dm_io_async_bvec is then called to write that data to the SSD. Let's look at the function's prototype:
static int dm_io_async_bvec(unsigned int num_regions, 
			    struct dm_io_region *where, int rw, 
			    struct bio_vec *bvec, io_notify_fn fn, 
			    void *context)

This function is similar to dm_kcopyd_copy above. The parameter we care about most is where, because it answers life's most important questions: who are you, and where are you going?
The bdev field of where is the target device, sector is the starting address, and count is the number of sectors to write. So this call takes the dmc->cache management structures that were packed into job->md_io_bvec and writes them to the corresponding location on the SSD.
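The sector arithmetic in flashcache_md_write_kickoff also hints at how a cache index maps onto its on-SSD metadata sector. A hedged reconstruction of the two macros from their usage above (not checked against the header), plus a note on the addressing used for where.sector:

/* Reconstructed from usage, treat as assumptions: each metadata sector on the
 * SSD packs MD_BLOCKS_PER_SECTOR flash_cacheblock entries. */
#define INDEX_TO_MD_SECTOR(index)		((index) / MD_BLOCKS_PER_SECTOR)
#define INDEX_TO_MD_SECTOR_OFFSET(index)	((index) % MD_BLOCKS_PER_SECTOR)

/* where.sector = 1 + INDEX_TO_MD_SECTOR(index): metadata sectors appear to
 * start at sector 1 of the cache device, presumably leaving sector 0 for a
 * superblock (an assumption; the on-SSD layout is covered in the next post). */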
Next, when the SSD write completes, flashcache_md_write_callback is called:
void
flashcache_md_write_callback(unsigned long error, void *context)
{
	struct kcached_job *job = (struct kcached_job *)context;

	job->error = error;
	push_md_complete(job);
	schedule_work(&_kcached_wq);
}

This function simply records the result in the job, puts it on the _md_complete_jobs list, and then notifies the workqueue. Why not do the processing right here instead of deferring it? It is like the pretty receptionist every company has: when the courier drops off big boxes of material, she is certainly not going to carry them herself; one charming word and a crowd of engineering guys fight over the job. This function is the write-completion callback and runs in softirq context; like the receptionist, a softirq cannot do heavy lifting, so it only signs for the delivery, and the workqueue does the rest.
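push_md_complete itself is not quoted here, but it is presumably the standard device-mapper job-list pattern; a minimal sketch (the lock name and the list member inside kcached_job are assumptions):

/* Sketch only: append the completed job to the global _md_complete_jobs
 * list under a spinlock, so the workqueue handler can drain it later. */
static void
sketch_push_md_complete(struct kcached_job *job)
{
	unsigned long flags;

	spin_lock_irqsave(&_job_lock, flags);		/* assumed lock name */
	list_add_tail(&job->list, &_md_complete_jobs);	/* assumed list member */
	spin_unlock_irqrestore(&_job_lock, flags);
}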
To continue the trace, we have to ask where this workqueue comes from and what it does — or rather, what it does to the job.
flashcache_init=>INIT_WORK(&_kcached_wq, do_work);=>process_jobs(&_md_complete_jobs, flashcache_md_write_done);
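do_work itself is not quoted in this post; based on the call chain above, a minimal sketch of what the work handler presumably looks like (only the _md_complete_jobs / flashcache_md_write_done pairing is taken from that chain; any other list/handler pairs are assumptions):

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
static void do_work(void *unused)
#else
static void do_work(struct work_struct *unused)
#endif
{
	/* Drain each job list in turn; flashcache_md_write_done handles the
	 * metadata-write completions queued by flashcache_md_write_callback. */
	process_jobs(&_md_complete_jobs, flashcache_md_write_done);
	/* ... similar process_jobs() calls for the other job lists ... */
}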
Let's look at process_jobs first:
static void
process_jobs(struct list_head *jobs,
	     void (*fn) (struct kcached_job *))
{
	struct kcached_job *job;

	while ((job = pop(jobs)))
		(void)fn(job);
}

It simply pops the jobs that our receptionist just signed for off the list and calls fn on each one; fn is the handler registered here, flashcache_md_write_done.
The function name ends in "done", which feels like half past five in the afternoon, when a whole day's hustle can finally wrap up. Sadly, yours truly is currently putting in 72 hours of overtime every month; with that in mind, perhaps you can find a little of your own happiness in my misfortune.
void
flashcache_md_write_done(struct kcached_job *job)
{
	struct cache_c *dmc = job->dmc;
	struct cache_md_sector_head *md_sector_head;
	int index;
	unsigned long flags;
	struct kcached_job *job_list;
	int error = job->error;
	struct kcached_job *next;
	struct cacheblock *cacheblk;
		
	VERIFY(!in_interrupt());
	VERIFY(job->action == WRITEDISK || job->action == WRITECACHE || 
	       job->action == WRITEDISK_SYNC);
	flashcache_free_md_sector(job);
	job->md_sector = NULL;
	md_sector_head = &dmc->md_sectors_buf[INDEX_TO_MD_SECTOR(job->index)];
	job_list = job;
	job->next = md_sector_head->md_io_inprog;
	md_sector_head->md_io_inprog = NULL;
	for (job = job_list ; job != NULL ; job = next) {
		next = job->next;
		job->error = error;
		index = job->index;
		cacheblk = &dmc->cache[index];
		spin_lock_irqsave(&dmc->cache_spin_lock, flags);
		if (job->action == WRITECACHE) {
			if (unlikely(sysctl_flashcache_error_inject & WRITECACHE_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITECACHE_MD_ERROR;
			}
			if (likely(job->error == 0)) {
				if ((cacheblk->cache_state & DIRTY) == 0) {
					dmc->cache_sets[index / dmc->assoc].nr_dirty++;
					dmc->nr_dirty++;
				}
				dmc->md_write_dirty++;
				cacheblk->cache_state |= DIRTY;
			} else
				dmc->ssd_write_errors++;
			flashcache_bio_endio(job->bio, job->error);
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: WRITE: Cache metadata write failed ! error %d block %lu", 
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
			}
		} else {
			int action = job->action;

			if (unlikely(sysctl_flashcache_error_inject & WRITEDISK_MD_ERROR)) {
				job->error = -EIO;
				sysctl_flashcache_error_inject &= ~WRITEDISK_MD_ERROR;
			}
			/*
			 * If we have an error on a WRITEDISK*, no choice but to preserve the 
			 * dirty block in cache. Fail any IOs for this block that occurred while
			 * the block was being cleaned.
			 */
			if (likely(job->error == 0)) {
				dmc->md_write_clean++;
				cacheblk->cache_state &= ~DIRTY;
				VERIFY(dmc->cache_sets[index / dmc->assoc].nr_dirty > 0);
				VERIFY(dmc->nr_dirty > 0);
				dmc->cache_sets[index / dmc->assoc].nr_dirty--;
				dmc->nr_dirty--;
			} else 
				dmc->ssd_write_errors++;
			VERIFY(dmc->cache_sets[index / dmc->assoc].clean_inprog > 0);
			VERIFY(dmc->clean_inprog > 0);
			dmc->cache_sets[index / dmc->assoc].clean_inprog--;
			dmc->clean_inprog--;
			if (job->error || cacheblk->head) {
				if (job->error) {
					DMERR("flashcache: CLEAN: Cache metadata write failed ! error %d block %lu", 
					      -job->error, cacheblk->dbn);
				}
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_do_pending(job);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			} else {
				cacheblk->cache_state &= ~BLOCK_IO_INPROG;
				spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
				flashcache_free_cache_job(job);
				if (atomic_dec_and_test(&dmc->nr_jobs))
					wake_up(&dmc->destroyq);
				/* Kick off more cleanings */
				if (action == WRITEDISK)
					flashcache_clean_set(dmc, index / dmc->assoc);
				else
					flashcache_sync_blocks(dmc);
			}
			dmc->cleanings++;
			if (action == WRITEDISK_SYNC)
				flashcache_update_sync_progress(dmc);
		}
	}
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	if (md_sector_head->pending_jobs != NULL) {
		/* peel off the first job from the pending queue and kick that off */
		job = md_sector_head->pending_jobs;
		md_sector_head->pending_jobs = job->next;
		job->next = NULL;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		VERIFY(job->action == WRITEDISK || job->action == WRITECACHE ||
		       job->action == WRITEDISK_SYNC);
		flashcache_md_write_kickoff(job);
	} else {
		md_sector_head->nr_in_prog = 0;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
	}
}

First comes flashcache_free_md_sector, which simply frees the page that was allocated a moment ago to hold the cacheblock records. Which "moment ago"? The page allocated by flashcache_alloc_md_sector in flashcache_md_write_kickoff. So while reading this function you have to look back at flashcache_md_write_kickoff; this is the context I mentioned earlier. Here kickoff is the cause and done is the effect: whatever the cause sows, the effect reaps. The kickoff allocated the page, so the completion must free it; the kickoff already dispatched every kcached_job hung off struct kcached_job *md_io_inprog in dmc->md_sectors_buf[], which is why the completion has a for loop here. The careful reader may ask: why can these kcached_jobs be dispatched together? To answer that we first have to understand what these kcached_jobs are for. See the comment on the structure:
/* 
 * We have one of these for *every* cache metadata sector, to keep track
 * of metadata ios in progress for blocks covered in this sector. Only
 * one metadata IO per sector can be in progress at any given point in 
 * time
 */
struct cache_md_sector_head {
	u_int32_t		nr_in_prog;
	struct kcached_job	*pending_jobs, *md_io_inprog;
};

As usual, read the comment first: every cache metadata sector has a corresponding cache_md_sector_head, used to synchronize the in-memory cacheblock metadata to that metadata sector, and only one metadata IO per sector can be in flight at any time, tracked by cache_md_sector_head->nr_in_prog. That answers the question above: these kcached_jobs are writes of different metadata entries within the same sector, so they can be merged. The sector in question is the one on the SSD that stores the flash_cacheblock structures.
Back to flashcache_md_write_done. In our trace job->action is WRITEDISK, so inside the for loop we go straight to the else branch, where another comment greets us: if a WRITEDISK* fails, there is no choice but to keep the cacheblock's DIRTY flag. Next, if there was an error or there are still pending jobs on the cacheblock, more IO is issued through flashcache_do_pending; otherwise the in-progress flags on the cacheblock are cleared, and here we finally see the kcached_job complete its mission: flashcache_free_cache_job returns the structure to the memory pool.
It might seem that we could now end the kcached_job's story the way fairy tales do, with "and they lived happily ever after". But returning to the pool also means the kcached_job will be reborn: the code goes on to check action == WRITEDISK and calls flashcache_clean_set to flush cache blocks above the dirty watermark back to disk. In other words, every time a disk write completes, the workqueue checks the dirty watermark and keeps flushing if it is exceeded, which brings us right back to flashcache_dirty_writeback, where this article began. Cause and effect, link upon link: the kcached_job is reborn not for itself but for the rebirth of the cacheblocks. As they say, no one lives only for themselves; each of us is just one element among countless cycles, entering the wheel of rebirth to help the other elements along.
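To make the watermark idea concrete, here is a hedged sketch of the check flashcache_clean_set is described as performing; the threshold field name is an assumption, not taken from the source:

/* Sketch only: if the set still holds more dirty blocks than the configured
 * watermark, pick further DIRTY blocks and kick off more
 * flashcache_dirty_writeback() calls for them. */
static void
sketch_clean_set(struct cache_c *dmc, int set)
{
	if (dmc->cache_sets[set].nr_dirty > dmc->dirty_thresh_set) {	/* assumed threshold field */
		/* ... scan the set for DIRTY blocks that are not busy and
		 * call flashcache_dirty_writeback() on each of them ... */
	}
}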
The next article will analyze flashcache from the angle of its data structures and storage layout.

