最近在研究数据与元数据分离的相关主题,于是对IO请求过程进行了深入研究(针对scsi设备)。尤其是请求队列在块层、scsi层中被反反复复地操作,所以格外关注了请求队列。
我们将磁盘添加到系统中时,先是分配gendisk,也就是alloc_disk;在分配好之后,块设备驱动就会调用add_disk将其添加到系统,其中很重要的一步是blk_register_queue(disk)。
/**
 * blk_register_queue - publish a disk's request queue below the disk device
 * @disk: gendisk whose ->queue should be registered
 *
 * Creates the blktrace sysfs entries for the disk device, adds the queue
 * kobject under the name "queue" with the disk device's kobject as parent,
 * emits a KOBJ_ADD uevent and, for queues that have a request_fn, registers
 * the elevator too.
 *
 * Returns 0 on success, -ENXIO when the disk has no queue, otherwise the
 * error code of the step that failed.
 */
int blk_register_queue(struct gendisk *disk)
{
	struct request_queue *queue = disk->queue;
	struct device *disk_dev = disk_to_dev(disk);
	int err;

	if (WARN_ON(!queue))
		return -ENXIO;

	err = blk_trace_init_sysfs(disk_dev);
	if (err)
		return err;

	/* The queue kobject holds its own reference on the parent kobject. */
	err = kobject_add(&queue->kobj, kobject_get(&disk_dev->kobj),
			  "%s", "queue");
	if (err < 0)
		return err;

	kobject_uevent(&queue->kobj, KOBJ_ADD);

	/* Only queues driven by a request_fn have an elevator to register. */
	if (queue->request_fn) {
		err = elv_register_queue(queue);
		if (err) {
			/* Take the freshly added queue kobject back out. */
			kobject_uevent(&queue->kobj, KOBJ_REMOVE);
			kobject_del(&queue->kobj);
			return err;
		}
	}

	return 0;
}
其中会看到很重要的一个是
<span style="color:#ff0000;"><span style="font-size:14px;">struct request_queue *q = disk->queue;</span></span>
可是,我们在alloc_disk中并没有分配过请求队列,那么它究竟是哪里来的呢??
但是在后面看到请求到scsi子系统那一层时候,才发现原来这里有伏笔。每每这个时候,总是感慨设计linux的人真是我膜拜的偶像啊。
只要有新的 SCSI 设备附加到系统, SCSI 中间层就会调用 sd_probe 函数.来看:
/**
 * sd_probe - attach the sd driver to a SCSI device
 * @dev: generic device embedded in the scsi_device being probed
 *
 * Only TYPE_DISK, TYPE_MOD and TYPE_RBC devices are accepted.  The function
 * allocates the scsi_disk and its gendisk, reserves a disk index, formats the
 * disk name from that index, links scsi_disk to both the scsi_device and the
 * gendisk, applies a default request timeout when the queue has none,
 * registers the scsi_disk class device and then schedules sd_probe_async()
 * to finish the setup.
 *
 * Returns 0 once the asynchronous part has been scheduled, or a negative
 * errno after releasing what had been acquired so far.
 */
static int sd_probe(struct device *dev)
{
	struct scsi_device *sdp = to_scsi_device(dev);
	struct scsi_disk *sdkp;
	struct gendisk *gd;
	u32 index;
	int error;

	error = -ENODEV;
	if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
		goto out;

	SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
					"sd_attach\n"));

	error = -ENOMEM;
	sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
	if (!sdkp)
		goto out;

	gd = alloc_disk(SD_MINORS);
	if (!gd)
		goto out_free;

	/* Reserve a disk index; retry for as long as the ida asks us to. */
	do {
		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
			goto out_put;

		spin_lock(&sd_index_lock);
		error = ida_get_new(&sd_index_ida, &index);
		spin_unlock(&sd_index_lock);
	} while (error == -EAGAIN);

	if (error)
		goto out_put;

	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
	if (error)
		goto out_free_index;

	/* Tie the scsi_disk to the scsi_device below it and the gendisk above. */
	sdkp->device = sdp;
	sdkp->driver = &sd_template;
	sdkp->disk = gd;
	sdkp->index = index;
	sdkp->openers = 0;
	sdkp->previous_state = 1;

	/* Keep a timeout that is already set; otherwise pick one by type. */
	if (!sdp->request_queue->rq_timeout) {
		if (sdp->type != TYPE_MOD)
			blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
		else
			blk_queue_rq_timeout(sdp->request_queue,
					     SD_MOD_TIMEOUT);
	}

	device_initialize(&sdkp->dev);
	sdkp->dev.parent = &sdp->sdev_gendev;
	sdkp->dev.class = &sd_disk_class;
	/*
	 * The device name is data, not a format: pass it through "%s" so a
	 * '%' in the name cannot be interpreted as a conversion.
	 */
	dev_set_name(&sdkp->dev, "%s", dev_name(&sdp->sdev_gendev));

	/*
	 * Capture the result: at this point 'error' is 0 from
	 * sd_format_disk_name(), so jumping to the cleanup label without
	 * storing the failure would report success to the caller.
	 */
	error = device_add(&sdkp->dev);
	if (error)
		goto out_free_index;

	get_device(&sdp->sdev_gendev);

	get_device(&sdkp->dev);	/* prevent release before async_schedule */
	async_schedule(sd_probe_async, sdkp);

	return 0;

out_free_index:
	spin_lock(&sd_index_lock);
	ida_remove(&sd_index_ida, index);
	spin_unlock(&sd_index_lock);
out_put:
	put_disk(gd);
out_free:
	kfree(sdkp);
out:
	return error;
}
这里边主要做的工作有:
1)分配scsi_disk
2)分配gendisk,所以现在应该明白了,在系统启动或者有新的设备加入系统时候,会调用alloc_disk来分配gendisk
3)一些初始化,主要关注的是各层结构之间的联系例如 scsi_disk->disk=gd,这样就将底层的scsi_disk结构与上面块设备层用的gendisk联系在一起了。
sdkp->device = sdp;
sdkp->disk = gd;
4)sd_probe_async,这主要是做一些异步操作。而这里边就将scsi_device的request_queue与gendisk的request_queue联系在一起了。
/*
 * sd_probe_async - asynchronous second half of sd_probe()
 * @data:   the scsi_disk that sd_probe() passed to async_schedule()
 * @cookie: async cookie; not used in this function
 *
 * Fills in the gendisk (major/minor numbers, fops, private data and request
 * queue), resets the per-disk defaults, revalidates the disk, installs the
 * prep function on the queue and adds the disk to the system.  On the way
 * out it drops the scsi_disk device reference that sd_probe() took before
 * scheduling this function.
 */
static void sd_probe_async(void *data, async_cookie_t cookie)
{
	struct scsi_disk *sk = data;
	struct scsi_device *sdev = sk->device;
	struct gendisk *disk = sk->disk;
	struct device *parent = &sdev->sdev_gendev;
	u32 idx = sk->index;

	if (idx < SD_MAX_DISKS) {
		disk->major = sd_major((idx & 0xf0) >> 4);
		disk->first_minor = ((idx & 0xf) << 4) | (idx & 0xfff00);
		disk->minors = SD_MINORS;
	}

	disk->fops = &sd_fops;
	disk->private_data = &sk->driver;
	/* The gendisk shares the request queue owned by the scsi_device. */
	disk->queue = sdev->request_queue;

	/* defaults, until the device tells us otherwise */
	sdev->sector_size = 512;
	sk->capacity = 0;
	sk->media_present = 1;
	sk->write_prot = 0;
	sk->WCE = 0;
	sk->RCD = 0;
	sk->ATO = 0;
	sk->first_scan = 1;

	sd_revalidate_disk(disk);

	blk_queue_prep_rq(sdev->request_queue, sd_prep_fn);

	disk->driverfs_dev = parent;
	disk->flags = GENHD_FL_EXT_DEVT | GENHD_FL_DRIVERFS |
		      (sdev->removable ? GENHD_FL_REMOVABLE : 0);

	dev_set_drvdata(parent, sk);
	add_disk(disk);	/* registers the queue with the system as well */
	sd_dif_config_host(sk);

	sd_revalidate_disk(disk);

	sd_printk(KERN_NOTICE, sk, "Attached SCSI %sdisk\n",
		  sdev->removable ? "removable " : "");
	put_device(&sk->dev);
}
其中,有一句 gd->queue = sdkp->device->request_queue,那么也就是说gendisk中的request_queue不是在分配gendisk时候给分配的(因为我们在那里的源码中就是没找到)。gendisk的queue指针直接就指向该scsi_device的request_queue,那么这个scsi_device的request_queue又在哪里呢?
找到在scsi设备探测过程中的scsi_alloc_sdev
/*
 * scsi_alloc_sdev - allocate and initialise a scsi_device for one LUN
 * @starget:  target the new device belongs to
 * @lun:      logical unit number stored in the device
 * @hostdata: stored in sdev->hostdata
 *
 * Allocates a zeroed scsi_device followed by shost->transportt->device_size
 * extra bytes, fills in its identity and state fields and list heads,
 * allocates its request queue with scsi_alloc_queue() and stores the device
 * in the queue's queuedata, then calls the host template's slave_alloc hook
 * when one is set.
 *
 * Returns the new device, or NULL on failure.  A failure message is printed
 * unless slave_alloc returned -ENXIO.
 */
<span style="color:#333333;">static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
unsigned int lun, void *hostdata)
{
struct scsi_device *sdev;
int display_failure_msg = 1, ret;
struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
extern void scsi_evt_thread(struct work_struct *work);
sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
GFP_ATOMIC);
if (!sdev)
goto out;
sdev->vendor = scsi_null_device_strs;
sdev->model = scsi_null_device_strs;
sdev->rev = scsi_null_device_strs;
sdev->host = shost;
sdev->id = starget->id;
sdev->lun = lun;
sdev->channel = starget->channel;
sdev->sdev_state = SDEV_CREATED;
INIT_LIST_HEAD(&sdev->siblings);
INIT_LIST_HEAD(&sdev->same_target_siblings);
INIT_LIST_HEAD(&sdev->cmd_list);
INIT_LIST_HEAD(&sdev->starved_entry);
INIT_LIST_HEAD(&sdev->event_list);
spin_lock_init(&sdev->list_lock);
INIT_WORK(&sdev->event_work, scsi_evt_thread);
/* takes a reference on the target device; dropped by hand below if the queue allocation fails */
sdev->sdev_gendev.parent = get_device(&starget->dev);
sdev->sdev_target = starget;
/* usually NULL and set by ->slave_alloc instead */
sdev->hostdata = hostdata;
/* if the device needs this changing, it may do so in the
* slave_configure function */
sdev->max_device_blocked = SCSI_DEFAULT_DEVICE_BLOCKED;
/*
* Some low level driver could use device->type
*/
sdev->type = -1;
/*
* Assume that the device will have handshaking problems,
* and then fix this field later if it turns out it
* doesn't
*/
sdev->borken = 1;
/* per-device request queue; sd_probe_async() later assigns this same pointer to gd->queue */
</span><span style="color:#ff0000;">sdev->request_queue = scsi_alloc_queue(sdev);</span><span style="color:#333333;">
if (!sdev->request_queue) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
put_device(&starget->dev);
kfree(sdev);
goto out;
}
/* store the device in the queue's queuedata */
sdev->request_queue->queuedata = sdev;
scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
scsi_sysfs_device_initialize(sdev);
/* slave_alloc is optional; it is only called when the host template sets it */
if (shost->hostt->slave_alloc) {
ret = shost->hostt->slave_alloc(sdev);
if (ret) {
/*
* if LLDD reports slave not present, don't clutter
* console with alloc failure messages
*/
if (ret == -ENXIO)
display_failure_msg = 0;
goto out_device_destroy;
}
}
return sdev;
/* slave_alloc failed: mark the device deleted and drop the sdev_gendev reference */
out_device_destroy:
scsi_device_set_state(sdev, SDEV_DEL);
transport_destroy_device(&sdev->sdev_gendev);
put_device(&sdev->sdev_gendev);
out:
if (display_failure_msg)
printk(ALLOC_FAILURE_MSG, __func__);
return NULL;
}</span>
该函数用于分配scsi_device,其中有一句sdev->request_queue = scsi_alloc_queue(sdev); 跟进去会看到主要是q = __scsi_alloc_queue(sdev->host, scsi_request_fn);,而这个函数又是干嘛呢?
/**
 * __scsi_alloc_queue - create a request queue configured for a SCSI host
 * @shost:      host whose limits are applied to the new queue
 * @request_fn: request function handed to blk_init_queue()
 *
 * Initialises a queue with @request_fn and then applies the host's segment,
 * sector, bounce, DMA boundary and clustering settings plus a default DMA
 * alignment.
 *
 * Returns the new queue, or NULL when blk_init_queue() fails.
 */
struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
					 request_fn_proc *request_fn)
{
	struct request_queue *q;
	struct device *dev = shost->shost_gendev.parent;

	q = blk_init_queue(request_fn, NULL);
	if (!q)
		return NULL;

	/*
	 * this limit is imposed by hardware restrictions
	 */
	blk_queue_max_hw_segments(q, shost->sg_tablesize);
	blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);

	blk_queue_max_sectors(q, shost->max_sectors);
	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
	blk_queue_segment_boundary(q, shost->dma_boundary);
	dma_set_seg_boundary(dev, shost->dma_boundary);

	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));

	/* New queue, no concurrency on queue_flags */
	if (!shost->use_clustering)
		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);

	/*
	 * set a reasonable default alignment on word boundaries: the
	 * host and device may alter it using
	 * blk_queue_update_dma_alignment() later.
	 */
	blk_queue_dma_alignment(q, 0x03);

	return q;
}
调用了q = blk_init_queue(request_fn, NULL);(同时要注意这里的request_fn)我们传入的是scsi_request_fn 这在请求的处理过程中是很重要的,后面说。这里就是分配请求队列。所以整理的过程就是这样
1)在系统启动时候的设备探测中,我们会调用scsi_alloc_sdev分配scsi_device逻辑设备,在这过程中我们同时分配了其请求队列(request_queue),并且在分配queue时候传入了request_fn为scsi_request_fn
2)在探测过程中,同时有调用到sd_probe,这个过程主要做的是:a)分配scsi_disk,b)分配gendisk,c)联系起各个结构,将scsi_disk的disk指针指向gendisk,将scsi_disk的device指向scsi_device,同时将gendisk的gd->queue指向该scsi_device的request_queue字段。
所以现在应该就很清楚了IO请求队列的身世。
为什么我去追究了这个过程呢?因为在看到submit_bio后调用的generic_make_request中,我们看到在将请求插入到队列中前,它会先去获取设备的请求队列,通过q = bdev_get_queue(bio->bi_bdev);,而这个函数内部就是return bdev->bd_disk->queue;,并且在后面的make_request调度后会调用 q->request_fn。总是疑惑在上面处理的东西用的是gendisk的request_queue,它到scsi层使用的request_queue跟上面是一样的吗?q->request_fn它怎么知道就是执行到了scsi_request_fn呢?
所以现在就明白了,在系统启动时候探测到scsi设备时候,会分配好scsi_disk、gendisk、scsi_device,然后联系起他们的关系,将scsi_disk的disk指针指向gendisk,将scsi_disk的device指向scsi_device,同时将gendisk的gd->queue指向该scsi_device的request_queue字段。所以在generic_make_request中通过gendisk获取的queue也就是下面的scsi_device的queue,而最后调用的q->request_fn,因为这里的q就是下面scsi_device的q,它在alloc_queue的时候就将request_fn实例化为scsi_request_fn传进去了。所以在generic_make_request中当然q->request_fn就是执行scsi_request_fn这个策略例程了。