最近在研究數據與元數據分離的相關主題,隨意對IO請求過程進行了深入研究(針對scsi設備)。尤其是請求隊列在塊層,scsi層中有反反覆覆的操作,所以格外關注了請求隊列。
我們看到,將磁盤添加到系統時,先是分配gendisk,也就是調用alloc_disk;分配好之後,塊設備驅動就會調用add_disk將其添加到系統,其中很重要的一步是blk_register_queue(disk)。
/*
 * blk_register_queue - register a disk's request queue with sysfs.
 * @disk: gendisk whose ->queue is to be registered.
 *
 * Runs blk_trace_init_sysfs() on the disk's device, then adds the queue's
 * kobject under the device kobject with the name "queue" and emits a
 * KOBJ_ADD uevent.  When the queue has a request_fn, the elevator is
 * registered too; if that step fails the queue kobject is announced as
 * removed and deleted again.
 *
 * Returns 0 on success, -ENXIO when the disk has no queue, otherwise the
 * error code of the step that failed.
 */
int blk_register_queue(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct device *dev = disk_to_dev(disk);
	int err;

	if (WARN_ON(!q))
		return -ENXIO;

	err = blk_trace_init_sysfs(dev);
	if (err)
		return err;

	/* A reference on the device kobject is taken for use as the parent. */
	err = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
	if (err < 0)
		return err;

	kobject_uevent(&q->kobj, KOBJ_ADD);

	/* Queues without a request_fn have no elevator to register. */
	if (q->request_fn) {
		err = elv_register_queue(q);
		if (err) {
			kobject_uevent(&q->kobj, KOBJ_REMOVE);
			kobject_del(&q->kobj);
			return err;
		}
	}

	return 0;
}
其中會看到很重要的一個是
struct request_queue *q = disk->queue;
可是,我們在alloc_disk中並沒有分配過請求隊列,那麼它究竟是哪裏來的呢??
但是在後面看到請求到scsi子系統那一層時候,才發現原來這裏有伏筆。每每這個時候,總是感慨設計linux的人真是我膜拜的偶像啊。
只要有新的 SCSI 設備附加到系統, SCSI 中間層就會調用 sd_probe 函數.來看:
/*
 * sd_probe - set up a scsi_disk and a gendisk for a newly attached device.
 * @dev: generic device embedded in the scsi_device being probed.
 *
 * Only TYPE_DISK, TYPE_MOD and TYPE_RBC devices are accepted.  The function
 * allocates the scsi_disk and the gendisk, reserves an index from
 * sd_index_ida, formats the "sd*" disk name, links scsi_disk to both the
 * scsi_device and the gendisk, gives the request queue a default timeout
 * when none is set, registers the scsi_disk class device and finally
 * schedules sd_probe_async() to finish the work.
 *
 * Returns 0 once the asynchronous part has been scheduled, -ENODEV for an
 * unsupported device type, -ENOMEM on allocation failure, or the error
 * code of the step that failed.
 */
static int sd_probe(struct device *dev)
{
	struct scsi_device *sdp = to_scsi_device(dev);
	struct scsi_disk *sdkp;
	struct gendisk *gd;
	u32 index;
	int error;

	error = -ENODEV;
	if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
		goto out;

	SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
					"sd_attach\n"));

	error = -ENOMEM;
	sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
	if (!sdkp)
		goto out;

	gd = alloc_disk(SD_MINORS);
	if (!gd)
		goto out_free;

	/* Retry the index allocation for as long as the ida asks us to. */
	do {
		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
			goto out_put;

		spin_lock(&sd_index_lock);
		error = ida_get_new(&sd_index_ida, &index);
		spin_unlock(&sd_index_lock);
	} while (error == -EAGAIN);

	if (error)
		goto out_put;

	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
	if (error)
		goto out_free_index;

	/* Tie the three structures together: scsi_disk -> scsi_device, gendisk. */
	sdkp->device = sdp;
	sdkp->driver = &sd_template;
	sdkp->disk = gd;
	sdkp->index = index;
	sdkp->openers = 0;
	sdkp->previous_state = 1;

	/* Only install a default timeout if nobody has set one yet. */
	if (!sdp->request_queue->rq_timeout) {
		if (sdp->type != TYPE_MOD)
			blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
		else
			blk_queue_rq_timeout(sdp->request_queue,
					     SD_MOD_TIMEOUT);
	}

	device_initialize(&sdkp->dev);
	sdkp->dev.parent = &sdp->sdev_gendev;
	sdkp->dev.class = &sd_disk_class;
	/*
	 * dev_set_name() takes a printf-style format; never hand it a device
	 * name directly, or a '%' in the name would be interpreted.
	 */
	dev_set_name(&sdkp->dev, "%s", dev_name(&sdp->sdev_gendev));

	/*
	 * Propagate the device_add() error.  'error' is 0 at this point, so
	 * jumping to the cleanup label without recording the failure would
	 * make the probe report success after everything has been freed.
	 */
	error = device_add(&sdkp->dev);
	if (error)
		goto out_free_index;

	get_device(&sdp->sdev_gendev);

	get_device(&sdkp->dev);	/* prevent release before async_schedule */
	async_schedule(sd_probe_async, sdkp);

	return 0;

 out_free_index:
	spin_lock(&sd_index_lock);
	ida_remove(&sd_index_ida, index);
	spin_unlock(&sd_index_lock);
 out_put:
	put_disk(gd);
 out_free:
	kfree(sdkp);
 out:
	return error;
}
這裏邊主要做的工作有:
1)分配scsi_disk
2)分配gendisk,所以現在應該明白了,在系統啓動或者有新的設備加入系統時候,會調用alloc_disk來分配gendisk
3)一些初始化,主要關注的是各層結構之間的聯繫例如 scsi_disk->disk=gd,這樣就將底層的scsi_disk結構與上面塊設備層用的gendisk聯繫在一起了。
sdkp->device = sdp;
sdkp->disk = gd;
4)sd_probe_async,這主要是做一些異步操作。而這裏邊就將scsi_device的request_queue與gendisk的queue聯繫在一起了。
/*
 * sd_probe_async - asynchronous second half of sd_probe().
 * @data:   the scsi_disk that sd_probe() handed to async_schedule().
 * @cookie: async cookie; not referenced in this function.
 *
 * Fills in the gendisk (major/minor numbers, fops, private data and, most
 * importantly, the request queue pointer borrowed from the scsi_device),
 * seeds default values, revalidates the disk, installs the prep function,
 * registers the disk with add_disk() and drops the scsi_disk device
 * reference at the end.
 */
static void sd_probe_async(void *data, async_cookie_t cookie)
{
	struct scsi_disk *sdkp = data;
	struct scsi_device *sdp = sdkp->device;
	struct gendisk *gd = sdkp->disk;
	struct device *dev = &sdp->sdev_gendev;
	u32 index = sdkp->index;

	if (index < SD_MAX_DISKS) {
		gd->major = sd_major((index & 0xf0) >> 4);
		gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
		gd->minors = SD_MINORS;
	}

	gd->fops = &sd_fops;
	gd->private_data = &sdkp->driver;
	/* The gendisk does not own a queue: it points at the scsi_device's. */
	gd->queue = sdkp->device->request_queue;

	/* defaults, until the device tells us otherwise */
	sdkp->first_scan = 1;
	sdkp->media_present = 1;
	sdkp->capacity = 0;
	sdkp->write_prot = 0;
	sdkp->WCE = 0;
	sdkp->RCD = 0;
	sdkp->ATO = 0;
	sdp->sector_size = 512;

	sd_revalidate_disk(gd);

	blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);

	gd->driverfs_dev = dev;
	gd->flags = GENHD_FL_EXT_DEVT | GENHD_FL_DRIVERFS |
		    (sdp->removable ? GENHD_FL_REMOVABLE : 0);

	dev_set_drvdata(dev, sdkp);
	add_disk(gd);	/* registers the queue with the system as well */
	sd_dif_config_host(sdkp);

	sd_revalidate_disk(gd);

	sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
		  sdp->removable ? "removable " : "");
	put_device(&sdkp->dev);
}
其中,有一句 gd->queue = sdkp->device->request_queue,那麼也就是說gendisk中的request_queue不是在分配gendisk時候分配的(因爲我們在那裏的源碼中就是沒找到)。gendisk的queue指針直接指向該scsi_device的request_queue,那麼這個scsi_device的request_queue又是在哪裏分配的呢?
找到在scsi設備探測過程中的scsi_alloc_sdev
<span style="color:#333333;">static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
unsigned int lun, void *hostdata)
{
struct scsi_device *sdev;
int display_failure_msg = 1, ret;
struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
extern void scsi_evt_thread(struct work_struct *work);
sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
GFP_ATOMIC);
if (!sdev)
goto out;
sdev->vendor = scsi_null_device_strs;
sdev->model = scsi_null_device_strs;
sdev->rev = scsi_null_device_strs;
sdev->host = shost;
sdev->id = starget->id;
sdev->lun = lun;
sdev->channel = starget->channel;
sdev->sdev_state = SDEV_CREATED;
INIT_LIST_HEAD(&sdev->siblings);
INIT_LIST_HEAD(&sdev->same_target_siblings);
INIT_LIST_HEAD(&sdev->cmd_list);
INIT_LIST_HEAD(&sdev->starved_entry);
INIT_LIST_HEAD(&sdev->event_list);
spin_lock_init(&sdev->list_lock);
INIT_WORK(&sdev->event_work, scsi_evt_thread);
sdev->sdev_gendev.parent = get_device(&starget->dev);
sdev->sdev_target = starget;
/* usually NULL and set by ->slave_alloc instead */
sdev->hostdata = hostdata;
/* if the device needs this changing, it may do so in the
* slave_configure function */
sdev->max_device_blocked = SCSI_DEFAULT_DEVICE_BLOCKED;
/*
* Some low level driver could use device->type
*/
sdev->type = -1;
/*
* Assume that the device will have handshaking problems,
* and then fix this field later if it turns out it
* doesn't
*/
sdev->borken = 1;
</span><span style="color:#ff0000;">sdev->request_queue = scsi_alloc_queue(sdev);</span><span style="color:#333333;">
if (!sdev->request_queue) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
put_device(&starget->dev);
kfree(sdev);
goto out;
}
sdev->request_queue->queuedata = sdev;
scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
scsi_sysfs_device_initialize(sdev);
if (shost->hostt->slave_alloc) {
ret = shost->hostt->slave_alloc(sdev);
if (ret) {
/*
* if LLDD reports slave not present, don't clutter
* console with alloc failure messages
*/
if (ret == -ENXIO)
display_failure_msg = 0;
goto out_device_destroy;
}
}
return sdev;
out_device_destroy:
scsi_device_set_state(sdev, SDEV_DEL);
transport_destroy_device(&sdev->sdev_gendev);
put_device(&sdev->sdev_gendev);
out:
if (display_failure_msg)
printk(ALLOC_FAILURE_MSG, __func__);
return NULL;
}</span>
該函數用於分配scsi_device,其中有一句sdev->request_queue = scsi_alloc_queue(sdev); 跟進去會看到主要是q = __scsi_alloc_queue(sdev->host, scsi_request_fn);,而這個函數又是幹嘛呢?
/*
 * __scsi_alloc_queue - allocate a request queue configured for a SCSI host.
 * @shost:      host whose limits are applied to the new queue.
 * @request_fn: request function installed in the queue by blk_init_queue().
 *
 * Creates the queue with blk_init_queue() and then applies the host's
 * limits: segment counts, maximum sectors, bounce limit, segment boundary
 * and maximum segment size.  Clustering is switched off when the host does
 * not use it, and a default DMA alignment mask of 0x03 is set.
 *
 * Returns the new queue, or NULL if blk_init_queue() fails.
 */
struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
					 request_fn_proc *request_fn)
{
	struct request_queue *q;
	struct device *dev = shost->shost_gendev.parent;

	q = blk_init_queue(request_fn, NULL);
	if (!q)
		return NULL;

	/*
	 * this limit is imposed by hardware restrictions
	 */
	blk_queue_max_hw_segments(q, shost->sg_tablesize);
	blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);

	blk_queue_max_sectors(q, shost->max_sectors);
	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
	blk_queue_segment_boundary(q, shost->dma_boundary);
	dma_set_seg_boundary(dev, shost->dma_boundary);

	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));

	/* New queue, no concurrency on queue_flags */
	if (!shost->use_clustering)
		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);

	/*
	 * set a reasonable default alignment on word boundaries: the
	 * host and device may alter it using
	 * blk_queue_update_dma_alignment() later.
	 */
	blk_queue_dma_alignment(q, 0x03);

	return q;
}
調用了q = blk_init_queue(request_fn, NULL);(同時要注意這裏的request_fn)我們傳入的是scsi_request_fn 這在請求的處理過程中是很重要的,後面說。這裏就是分配請求隊列。所以整理的過程就是這樣
1)在系統啓動時候的設備探測中,我們會調用scsi_alloc_sdev分配scsi_device邏輯設備,在這過程中我們同時分配了其請求隊列(request_queue),並且在分配queue時候傳入了request_fn爲scsi_request_fn
2)在探測過程中,同時有調用到sd_probe,這個過程主要做的是:a)分配scsi_disk,b)分配gendisk,c)聯繫起各個結構,將scsi_disk的disk指針指向gendisk,將scsi_disk的device指向scsi_device,同時將gendisk的gd->queue指向該scsi_device的request_queue字段。
所以現在應該就很清楚了IO請求隊列的身世。
爲什麼我去追究了這個過程呢?因爲在看到submit_bio後調用的generic_make_request中,我們看到在將請求插入到隊列中前,他會先去獲取設備的請求隊列,通過q = bdev_get_queue(bio->bi_bdev);,而這個函數的實現就是return bdev->bd_disk->queue;,並且在後面的make_request調度後會調用q->request_fn。總是疑惑:在上面處理的東西用的是gendisk的request_queue,他到scsi層使用的request_queue跟上面是一樣的嗎?q->request_fn他怎麼知道就是執行到了scsi_request_fn呢?
所以現在就明白了,在系統啓動時候探測到scsi設備時候,會分配好scsi_disk,gendisk,scsi_device,然後聯繫起他們的關係,將scsi_disk的disk指針指向gendisk,將scsi_disk的device指向scsi_device,同時將gendisk的gd->queue指向該scsi_device的request_queue字段。所以在generic_make_request中通過gendisk獲取的queue也就是下面的scsi_device的queue,而最後調用的q->request_fn,因爲這裏的q就是下面scsi_device的q,他在alloc_queue的時候就將request_fn實例化爲scsi_request_fn傳進去了。所以在generic_make_request中當然q->request_fn就是執行scsi_request_fn這個策略例程了。