block IO層框架分析3

1. 在block IO層框架分析2中,我們看到request請求會被提交到IO調度系統中,elv_merge判斷bio是否可以被合併到request。

在準備好request後,linux塊層調用add_request添加request,該函數直接以ELEVATOR_INSERT_SORT爲參數直接調用__elv_add_request。

在blk_peek_request函數中,調用__elv_next_request函數從請求隊列獲得下一個要處理的請求。

所以接下來就分析elv_merge、__elv_add_request、__elv_next_request函數。

IO調度系統有3個算法:NOOP算法,DEADLINE算法,CFQ算法。今天我們以較爲簡單的DEADLINE算法來分析。

struct elevator_type
{
	struct list_head list;	//鏈入系統已註冊的電梯算法
	struct elevator_ops ops;	//這種調度算法的操作表
	struct elv_fs_entry *elevator_attrs;	
	char elevator_name[ELV_NAME_MAX];	//名字
	struct module *elevator_owner;
};

/*
 * each queue has an elevator_queue associated with it
 */
struct elevator_queue
{
	struct elevator_ops *ops;	//指向調度器操作表的指針
	void *elevator_data;	//該電梯隊列的私有數據
	struct kobject kobj;
	struct elevator_type *elevator_type;	//指向調度器類型
	struct mutex sysfs_lock;
	struct hlist_head *hash;	//安裝請求最後一個扇區的編號進行哈希
	unsigned int registered:1;
};
struct elevator_ops
{
	elevator_merge_fn *elevator_merge_fn;	//查找可以和bio合併的request
	elevator_merged_fn *elevator_merged_fn;		//有請求可以合併時調用
	elevator_merge_req_fn *elevator_merge_req_fn;	//在兩個請求合併時調用
	elevator_allow_merge_fn *elevator_allow_merge_fn;	//判定bio可以安全合併到現有請求時調用
	elevator_bio_merged_fn *elevator_bio_merged_fn;

	elevator_dispatch_fn *elevator_dispatch_fn;		//將準備好的請求轉移到派發隊列
	elevator_add_req_fn *elevator_add_req_fn;		//添加一個新請求
	elevator_activate_req_fn *elevator_activate_req_fn;		
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_completed_req_fn *elevator_completed_req_fn;

	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
	void (*trim)(struct io_context *);
};
DEADLINE 調度算法使用四個隊列,寫操作有一個排序隊列和FIFO隊列,讀操作也有一個排序隊列和FIFO隊列。先看下deadline_data結構

struct deadline_data {
	/*
	 * run time data
	 */

	/*
	 * requests (deadline_rq s) are present on both sort_list and fifo_list
	 */
	struct rb_root sort_list[2];	//紅黑樹的根,讀一個,寫一個
	struct list_head fifo_list[2];	//FIFO隊列

	/*
	 * next in sort order. read, write or both are NULL
	 */
	struct request *next_rq[2];	//按照扇區編號增加方向下一個請求的指針
	unsigned int batching;		/* number of sequential requests made */	//提交的請求的數目
	sector_t last_sector;		/* head position */	
	unsigned int starved;		/* times reads have starved writes */	//提交讀請求而造成寫飢餓的次數

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	int fifo_expire[2];
	int fifo_batch;
	int writes_starved;
	int front_merges;
};

2. DEADLINE IO調度分析

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	struct elevator_queue *eq;
	void *data;

	if (unlikely(q->elevator))
		return 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {	//如果指定了名字
		e = elevator_get(name);
		if (!e)
			return -EINVAL;
	}

	if (!e && *chosen_elevator) {	//是否有參數指定默認的電梯算法
		e = elevator_get(chosen_elevator);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED);	//是否有編譯選項指定默認的電梯算法
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop");	//都沒有,就使用noop
		}
	}

	eq = elevator_alloc(q, e);	//分配elevator_queue結構
	if (!eq)
		return -ENOMEM;

	data = elevator_init_queue(q, eq);	//分配私有數據
	if (!data) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}

	elevator_attach(q, eq, data);	//將IO調度隊列及私有數據關聯到請求隊列
	return 0;
}
int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;
	int ret;

	/*
	 * Levels of merges:
	 * 	nomerges:  No merges at all attempted
	 * 	noxmerges: Only simple one-hit cache try
	 * 	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q))	//不要嘗試進行合併
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge) {
		ret = elv_try_merge(q->last_merge, bio);
		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))		//只嘗試對緩存下來的request進行合併
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_sector);	//通過扇區的編號進行哈希查找
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->ops->elevator_merge_fn)	//特定於各個調度算法的,對於DEADLINE,deadline_merge
		return e->ops->elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}
static int
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	struct request *__rq;
	int ret;

	/*
	 * check for front merge
	 */
	if (dd->front_merges) {		//DEADLINE 只處理向前合併
		sector_t sector = bio->bi_sector + bio_sectors(bio);	//計算最後一個扇區

		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);	//在紅黑樹中查找
		if (__rq) {	//如果找到
			BUG_ON(sector != blk_rq_pos(__rq));

			if (elv_rq_merge_ok(__rq, bio)) {
				ret = ELEVATOR_FRONT_MERGE;
				goto out;
			}
		}
	}

	return ELEVATOR_NO_MERGE;
out:
	*req = __rq;
	return ret;
}
static void add_acct_request(struct request_queue *q, struct request *rq,
			     int where)
{
	drive_stat_acct(rq, 1);
	__elv_add_request(q, rq, where);	//where爲ELEVATOR_INSERT_SORT
}
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
	trace_block_rq_insert(q, rq);

	rq->q = q;

	if (rq->cmd_flags & REQ_SOFTBARRIER) {
		/* barriers are scheduling boundary, update end_sector */
		if (rq->cmd_type == REQ_TYPE_FS ||
		    (rq->cmd_flags & REQ_DISCARD)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
		    (where == ELEVATOR_INSERT_SORT ||
		     where == ELEVATOR_INSERT_SORT_MERGE))
		where = ELEVATOR_INSERT_BACK;

	switch (where) {
	case ELEVATOR_INSERT_REQUEUE:
	case ELEVATOR_INSERT_FRONT:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT_MERGE:
		/*
		 * If we succeed in merging this request with one in the
		 * queue already, we are done - rq has now been freed,
		 * so no need to do anything further.
		 */
		if (elv_attempt_insert_merge(q, rq))
			break;
	case ELEVATOR_INSERT_SORT:
		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
		       !(rq->cmd_flags & REQ_DISCARD));
		rq->cmd_flags |= REQ_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {		//如果該請求可以合併
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->ops->elevator_add_req_fn(q, rq);	//執行deadline_add_request
		break;

	case ELEVATOR_INSERT_FLUSH:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		blk_insert_flush(rq);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
}
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const int data_dir = rq_data_dir(rq);

	deadline_add_rq_rb(dd, rq);		//添加到紅黑樹中

	/*
	 * set expire time and add to fifo list
	 */
	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);	//設置請求的超時時間
	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);	//添加到FIFO
}

接下來再看下__elv_next_request函數:

static inline struct request *__elv_next_request(struct request_queue *q)
{
	struct request *rq;

	while (1) {
		if (!list_empty(&q->queue_head)) {	//派發隊列不爲空
			rq = list_entry_rq(q->queue_head.next);
			return rq;
		}

		/*
		 * Flush request is running and flush request isn't queueable
		 * in the drive, we can hold the queue till flush request is
		 * finished. Even we don't do this, driver can't dispatch next
		 * requests and will requeue them. And this can improve
		 * throughput too. For example, we have request flush1, write1,
		 * flush 2. flush1 is dispatched, then queue is hold, write1
		 * isn't inserted to queue. After flush1 is finished, flush2
		 * will be dispatched. Since disk cache is already clean,
		 * flush2 will be finished very soon, so looks like flush2 is
		 * folded to flush1.
		 * Since the queue is hold, a flag is set to indicate the queue
		 * should be restarted later. Please see flush_end_io() for
		 * details.
		 */
		if (q->flush_pending_idx != q->flush_running_idx &&
				!queue_flush_queueable(q)) {
			q->flush_queue_delayed = 1;
			return NULL;
		}
		if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
		    !q->elevator->ops->elevator_dispatch_fn(q, 0))	//從IO調度隊列轉移請求到派發隊列
			return NULL;
	}
}
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const int reads = !list_empty(&dd->fifo_list[READ]);
	const int writes = !list_empty(&dd->fifo_list[WRITE]);
	struct request *rq;
	int data_dir;

	/*
	 * batches are currently reads XOR writes
	 */
	 //確定下一個要處理的請求
	if (dd->next_rq[WRITE])
		rq = dd->next_rq[WRITE];
	else
		rq = dd->next_rq[READ];
	//這個請求可以在這個批次處理
	if (rq && dd->batching < dd->fifo_batch)
		/* we have a next request are still entitled to batch */
		goto dispatch_request;

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (reads) {	//讀優先
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));

		if (writes && (dd->starved++ >= dd->writes_starved))	//如果寫飢餓次數過多
			goto dispatch_writes;

		data_dir = READ;

		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */

	if (writes) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));

		dd->starved = 0;	//飢餓次數清0

		data_dir = WRITE;	//處理讀請求

		goto dispatch_find_request;
	}

	return 0;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	 //如果有請求過期或者掃描到了電梯尾部
	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);	//返回等待最久的請求
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = dd->next_rq[data_dir];
	}

	dd->batching = 0;

dispatch_request:
	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;		//遞增批次
	deadline_move_request(dd, rq);	//轉移,並且更新deadline_data結構的next_rq

	return 1;
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章