1. 在block IO層框架分析2中,我們看到request請求會被提交到IO調度系統中,elv_merge判斷bio是否可以被合併到request。
在準備好request後,linux塊層調用add_request添加request,該函數以ELEVATOR_INSERT_SORT爲參數直接調用__elv_add_request。
在blk_peek_request函數中,調用__elv_next_request函數從請求隊列獲得下一個要處理的請求。
所以接下來就分析elv_merge、__elv_add_request、__elv_next_request函數。
IO調度系統有3個算法:NOOP算法,DEADLINE算法,CFQ算法。今天我們以較爲簡單的DEADLINE算法來分析。
struct elevator_type
{
struct list_head list; //鏈入系統已註冊的電梯算法
struct elevator_ops ops; //這種調度算法的操作表
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX]; //名字
struct module *elevator_owner;
};
/*
* each queue has an elevator_queue associated with it
*/
struct elevator_queue
{
struct elevator_ops *ops; //指向調度器操作表的指針
void *elevator_data; //該電梯隊列的私有數據
struct kobject kobj;
struct elevator_type *elevator_type; //指向調度器類型
struct mutex sysfs_lock;
struct hlist_head *hash; //安裝請求最後一個扇區的編號進行哈希
unsigned int registered:1;
};
struct elevator_ops
{
elevator_merge_fn *elevator_merge_fn; //查找可以和bio合併的request
elevator_merged_fn *elevator_merged_fn; //有請求可以合併時調用
elevator_merge_req_fn *elevator_merge_req_fn; //在兩個請求合併時調用
elevator_allow_merge_fn *elevator_allow_merge_fn; //判定bio可以安全合併到現有請求時調用
elevator_bio_merged_fn *elevator_bio_merged_fn;
elevator_dispatch_fn *elevator_dispatch_fn; //將準備好的請求轉移到派發隊列
elevator_add_req_fn *elevator_add_req_fn; //添加一個新請求
elevator_activate_req_fn *elevator_activate_req_fn;
elevator_deactivate_req_fn *elevator_deactivate_req_fn;
elevator_completed_req_fn *elevator_completed_req_fn;
elevator_request_list_fn *elevator_former_req_fn;
elevator_request_list_fn *elevator_latter_req_fn;
elevator_set_req_fn *elevator_set_req_fn;
elevator_put_req_fn *elevator_put_req_fn;
elevator_may_queue_fn *elevator_may_queue_fn;
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
void (*trim)(struct io_context *);
};
DEADLINE 調度算法使用四個隊列,寫操作有一個排序隊列和FIFO隊列,讀操作也有一個排序隊列和FIFO隊列。先看下deadline_data結構
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2]; //紅黑樹的根,讀一個,寫一個
struct list_head fifo_list[2]; //FIFO隊列
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2]; //按照扇區編號增加方向下一個請求的指針
unsigned int batching; /* number of sequential requests made */ //提交的請求的數目
sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */ //提交讀請求而造成寫飢餓的次數
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2];
int fifo_batch;
int writes_starved;
int front_merges;
};
2. DEADLINE IO調度分析
int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
struct elevator_queue *eq;
void *data;
if (unlikely(q->elevator))
return 0;
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->end_sector = 0;
q->boundary_rq = NULL;
if (name) { //如果指定了名字
e = elevator_get(name);
if (!e)
return -EINVAL;
}
if (!e && *chosen_elevator) { //是否有參數指定默認的電梯算法
e = elevator_get(chosen_elevator);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
}
if (!e) {
e = elevator_get(CONFIG_DEFAULT_IOSCHED); //是否有編譯選項指定默認的電梯算法
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
"Using noop.\n");
e = elevator_get("noop"); //都沒有,就使用noop
}
}
eq = elevator_alloc(q, e); //分配elevator_queue結構
if (!eq)
return -ENOMEM;
data = elevator_init_queue(q, eq); //分配私有數據
if (!data) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
elevator_attach(q, eq, data); //將IO調度隊列及私有數據關聯到請求隊列
return 0;
}
int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
struct elevator_queue *e = q->elevator;
struct request *__rq;
int ret;
/*
* Levels of merges:
* nomerges: No merges at all attempted
* noxmerges: Only simple one-hit cache try
* merges: All merge tries attempted
*/
if (blk_queue_nomerges(q)) //不要嘗試進行合併
return ELEVATOR_NO_MERGE;
/*
* First try one-hit cache.
*/
if (q->last_merge) {
ret = elv_try_merge(q->last_merge, bio);
if (ret != ELEVATOR_NO_MERGE) {
*req = q->last_merge;
return ret;
}
}
if (blk_queue_noxmerges(q)) //只嘗試對緩存下來的request進行合併
return ELEVATOR_NO_MERGE;
/*
* See if our hash lookup can find a potential backmerge.
*/
__rq = elv_rqhash_find(q, bio->bi_sector); //通過扇區的編號進行哈希查找
if (__rq && elv_rq_merge_ok(__rq, bio)) {
*req = __rq;
return ELEVATOR_BACK_MERGE;
}
if (e->ops->elevator_merge_fn) //特定於各個調度算法的,對於DEADLINE,deadline_merge
return e->ops->elevator_merge_fn(q, req, bio);
return ELEVATOR_NO_MERGE;
}
static int
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
struct request *__rq;
int ret;
/*
* check for front merge
*/
if (dd->front_merges) { //DEADLINE 只處理向前合併
sector_t sector = bio->bi_sector + bio_sectors(bio); //計算最後一個扇區
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); //在紅黑樹中查找
if (__rq) { //如果找到
BUG_ON(sector != blk_rq_pos(__rq));
if (elv_rq_merge_ok(__rq, bio)) {
ret = ELEVATOR_FRONT_MERGE;
goto out;
}
}
}
return ELEVATOR_NO_MERGE;
out:
*req = __rq;
return ret;
}
static void add_acct_request(struct request_queue *q, struct request *rq,
int where)
{
drive_stat_acct(rq, 1);
__elv_add_request(q, rq, where); //where爲ELEVATOR_INSERT_SORT
}
void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
trace_block_rq_insert(q, rq);
rq->q = q;
if (rq->cmd_flags & REQ_SOFTBARRIER) {
/* barriers are scheduling boundary, update end_sector */
if (rq->cmd_type == REQ_TYPE_FS ||
(rq->cmd_flags & REQ_DISCARD)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
(where == ELEVATOR_INSERT_SORT ||
where == ELEVATOR_INSERT_SORT_MERGE))
where = ELEVATOR_INSERT_BACK;
switch (where) {
case ELEVATOR_INSERT_REQUEUE:
case ELEVATOR_INSERT_FRONT:
rq->cmd_flags |= REQ_SOFTBARRIER;
list_add(&rq->queuelist, &q->queue_head);
break;
case ELEVATOR_INSERT_BACK:
rq->cmd_flags |= REQ_SOFTBARRIER;
elv_drain_elevator(q);
list_add_tail(&rq->queuelist, &q->queue_head);
/*
* We kick the queue here for the following reasons.
* - The elevator might have returned NULL previously
* to delay requests and returned them now. As the
* queue wasn't empty before this request, ll_rw_blk
* won't run the queue on return, resulting in hang.
* - Usually, back inserted requests won't be merged
* with anything. There's no point in delaying queue
* processing.
*/
__blk_run_queue(q);
break;
case ELEVATOR_INSERT_SORT_MERGE:
/*
* If we succeed in merging this request with one in the
* queue already, we are done - rq has now been freed,
* so no need to do anything further.
*/
if (elv_attempt_insert_merge(q, rq))
break;
case ELEVATOR_INSERT_SORT:
BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
!(rq->cmd_flags & REQ_DISCARD));
rq->cmd_flags |= REQ_SORTED;
q->nr_sorted++;
if (rq_mergeable(rq)) { //如果該請求可以合併
elv_rqhash_add(q, rq);
if (!q->last_merge)
q->last_merge = rq;
}
/*
* Some ioscheds (cfq) run q->request_fn directly, so
* rq cannot be accessed after calling
* elevator_add_req_fn.
*/
q->elevator->ops->elevator_add_req_fn(q, rq); //執行deadline_add_request
break;
case ELEVATOR_INSERT_FLUSH:
rq->cmd_flags |= REQ_SOFTBARRIER;
blk_insert_flush(rq);
break;
default:
printk(KERN_ERR "%s: bad insertion point %d\n",
__func__, where);
BUG();
}
}
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
deadline_add_rq_rb(dd, rq); //添加到紅黑樹中
/*
* set expire time and add to fifo list
*/
rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); //設置請求的超時時間
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); //添加到FIFO
}
接下來再看下__elv_next_request函數:
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
while (1) {
if (!list_empty(&q->queue_head)) { //派發隊列不爲空
rq = list_entry_rq(q->queue_head.next);
return rq;
}
/*
* Flush request is running and flush request isn't queueable
* in the drive, we can hold the queue till flush request is
* finished. Even we don't do this, driver can't dispatch next
* requests and will requeue them. And this can improve
* throughput too. For example, we have request flush1, write1,
* flush 2. flush1 is dispatched, then queue is hold, write1
* isn't inserted to queue. After flush1 is finished, flush2
* will be dispatched. Since disk cache is already clean,
* flush2 will be finished very soon, so looks like flush2 is
* folded to flush1.
* Since the queue is hold, a flag is set to indicate the queue
* should be restarted later. Please see flush_end_io() for
* details.
*/
if (q->flush_pending_idx != q->flush_running_idx &&
!queue_flush_queueable(q)) {
q->flush_queue_delayed = 1;
return NULL;
}
if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
!q->elevator->ops->elevator_dispatch_fn(q, 0)) //從IO調度隊列轉移請求到派發隊列
return NULL;
}
}
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
//確定下一個要處理的請求
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
//這個請求可以在這個批次處理
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
if (reads) { //讀優先
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved)) //如果寫飢餓次數過多
goto dispatch_writes;
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0; //飢餓次數清0
data_dir = WRITE; //處理讀請求
goto dispatch_find_request;
}
return 0;
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
//如果有請求過期或者掃描到了電梯尾部
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = rq_entry_fifo(dd->fifo_list[data_dir].next); //返回等待最久的請求
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
}
dd->batching = 0;
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++; //遞增批次
deadline_move_request(dd, rq); //轉移,並且更新deadline_data結構的next_rq
return 1;
}