Linux3.10.0块IO子系统流程（3）-- SCSI策略例程

很长时间以来，Linux块设备使用了一种称为“蓄流/泄流”（plugging/unplugging）的技术来提高吞吐率。简单而言，这种工作方式类似浴盆排水系统的塞子：IO被提交时，先被储存在一个队列中，稍后的某个时间才允许IO从队列中派发出去。这样做是为了尽可能对IO进行合并和排序。对SCSI设备而言，负责从请求队列派发请求的策略例程是scsi_request_fn（它在后文的scsi_alloc_queue中被注册到请求队列）：

  1 static void scsi_request_fn(struct request_queue *q)    /* request_fn for SCSI queues: loops over queued requests and dispatches each one as a SCSI command */
  2 {
  3     struct scsi_device *sdev = q->queuedata;
  4     struct Scsi_Host *shost;
  5     struct scsi_cmnd *cmd;
  6     struct request *req;
  7     if(!get_device(&sdev->sdev_gendev))
  8         /* We must be tearing the block queue down already */
  9         return;
 10     /*
 11      * To start with, we keep looping until the queue is empty, or until
 12      * the host is no longer able to accept any more requests.
 13      */
 14     shost = sdev->host;
 15     for (;;) {
 16         int rtn;
 17         /*
 18          * get next queueable request.  We do this early to make sure
 19          * that the request is fully prepared even if we cannot
 20          * accept it.
 21          */
 22         req = blk_peek_request(q);    // peek at the next queueable request; leave the loop if there is none or scsi_dev_queue_ready() says the device cannot take a request right now
 23         if (!req || !scsi_dev_queue_ready(q, sdev))
 24             break;
 25         /* If the device is offline, log an error, dispose of the request via scsi_kill_request() and continue, so every following request is handled the same way */
 26         if (unlikely(!scsi_device_online(sdev))) {
 27             sdev_printk(KERN_ERR, sdev,
 28                     "rejecting I/O to offline device\n");
 29             scsi_kill_request(req, q);
 30             continue;
 31         }
 32         /*
 33          * Remove the request from the request list.
 34          * blk_start_request() is called unless the queue uses generic tag queueing and blk_queue_start_tag() returned 0 for this request; per the original note it takes the request off the queue and arms its timeout timer
 35          */
 36         if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))    // true when the queue is untagged, or when blk_queue_start_tag() returned non-zero
 37             blk_start_request(req);
 38         sdev->device_busy++;
 39         spin_unlock(q->queue_lock);
 40         /* Fetch the SCSI command from the request's special field; it was prepared by the queue's prep_rq_fn callback, invoked from blk_peek_request() above */
 41         cmd = req->special;
 42         if (unlikely(cmd == NULL)) {
 43             printk(KERN_CRIT "impossible request in %s.\n"
 44                      "please mail a stack trace to "
 45                      "[email protected]\n",    /* NOTE(review): this address looks mangled by web e-mail obfuscation; restore it from the upstream kernel source */
 46                      __func__);
 47             blk_dump_rq_flags(req, "foo");
 48             BUG();
 49         }
 50         spin_lock(shost->host_lock);
 51         /*
 52          * We hit this when the driver is using a host wide
 53          * tag map. For device level tag maps the queue_depth check
 54          * in the device ready fn would prevent us from trying
 55          * to allocate a tag. Since the map is a shared host resource
 56          * we add the dev to the starved list so it eventually gets
 57          * a run when a tag is freed.
 58          */
 59         if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
 60             if (list_empty(&sdev->starved_entry))
 61                 list_add_tail(&sdev->starved_entry,
 62                           &shost->starved_list);
 63             goto not_ready;
 64         }
 65         if (!scsi_target_queue_ready(shost, sdev))
 66             goto not_ready;
 67         if (!scsi_host_queue_ready(q, shost, sdev))
 68             goto not_ready;
 69         scsi_target(sdev)->target_busy++;
 70         shost->host_busy++;
 71         /*
 72          * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
 73          *        take the lock again.
 74          */
 75         spin_unlock_irq(shost->host_lock);
 76         /*
 77          * Finally, initialize any error handling parameters, and set up the timers for timeouts.
 78          * (done through scsi_init_cmd_errh() on the command fetched above)
 79          */
 80         scsi_init_cmd_errh(cmd);
 81         /*
 82          * Dispatch the command to the low-level driver.
 83          * A non-zero return value takes the out_delay path below.
 84          */
 85         rtn = scsi_dispatch_cmd(cmd);
 86         spin_lock_irq(q->queue_lock);
 87         if (rtn)
 88             goto out_delay;
 89     }
 90     goto out;
 91 
 92 not_ready:
 93     spin_unlock_irq(shost->host_lock);
 94     /*
 95      * lock q, handle tag, requeue req, and decrement device_busy. We
 96      * must return with queue_lock held.
 97      *
 98      * Decrementing device_busy without checking it is OK, as all such
 99      * cases (host limits or settings) should run the queue at some
100      * later time.
101      */
102     spin_lock_irq(q->queue_lock);
103     blk_requeue_request(q, req);
104     sdev->device_busy--;
105 out_delay:
106     if (sdev->device_busy == 0)
107         blk_delay_queue(q, SCSI_QUEUE_DELAY);
108 out:
109     /* must be careful here...if we trigger the ->remove() function
110      * we cannot be holding the q lock */
111     spin_unlock_irq(q->queue_lock);
112     put_device(&sdev->sdev_gendev);
113     spin_lock_irq(q->queue_lock);
114 }

blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现是一个大循环，每次循环调用__elv_next_request从电梯队列中取出一个请求进行处理。

  1 /**
  2 * blk_peek_request - peek at the top of a request queue
  3 * @q: request queue to peek at
  4 *
  5 * Description:
  6 *     Return the request at the top of @q.  The returned request
  7 *     should be started using blk_start_request() before LLD starts
  8 *     processing it.
  9 *
 10 * Return:
 11 *     Pointer to the request at the top of @q if available.  Null
 12 *     otherwise (including when prep_rq_fn returns BLKPREP_DEFER).
 13 *
 14 * Context:
 15 *     queue_lock must be held.
 16 */
 17 struct request *blk_peek_request(struct request_queue *q)
 18 {
 19     struct request *rq;
 20     int ret;
 21 
 22     while ((rq = __elv_next_request(q)) != NULL) {
 23 
 24         rq = blk_pm_peek_request(q, rq);
 25         if (!rq)
 26             break;
 27         /* A request is either brand new or was requeued because it could not be handled earlier; in the latter case REQ_STARTED is already set.
 28           * In other words, a missing flag means this is the first time the request is seen; if it was inserted sorted (REQ_SORTED), elv_activate_rq() is called for it
 29           */
 30         if (!(rq->cmd_flags & REQ_STARTED)) {
 31             /*
 32              * This is the first time the device driver
 33              * sees this request (possibly after
 34              * requeueing).  Notify IO scheduler.
 35              */
 36             if (rq->cmd_flags & REQ_SORTED)
 37                 elv_activate_rq(q, rq);
 38 
 39             /*
 40              * just mark as started even if we don't start
 41              * it, a request that has been delayed should
 42              * not be passed by new incoming requests
 43              */
 44             rq->cmd_flags |= REQ_STARTED;
 45             trace_block_rq_issue(q, rq);
 46         }
 47         /* bookkeeping for the IO scheduler: record end_sector and clear boundary_rq */
 48         if (!q->boundary_rq || q->boundary_rq == rq) {
 49             q->end_sector = rq_end_sector(rq);
 50             q->boundary_rq = NULL;
 51         }
 52 
 53         /* If the request has REQ_DONTPREP set, no command preparation is needed: leave the loop and return this request to the caller */
 54         if (rq->cmd_flags & REQ_DONTPREP)
 55             break;
 56 
 57         /* 
 58           * A non-zero dma_drain_size on the queue indicates the "excess DMA" problem; for a request that carries data,
 59           * reserve one extra segment so that a drain buffer can later be appended to the scatter-gather list
 60           */
 61         if (q->dma_drain_size && blk_rq_bytes(rq)) {
 62             /*
 63              * make sure space for the drain appears we
 64              * know we can do this because max_hw_segments
 65              * has been adjusted to be one fewer than the
 66              * device can handle
 67              */
 68             rq->nr_phys_segments++;
 69         }
 70         /* 
 71           * If no prep_rq_fn callback is defined, return the request as it is.
 72           * Otherwise call the callback to prepare the request (for SCSI: build the command descriptor); three return values are handled:
 73           *     BLKPREP_OK: preparation succeeded, the request is returned
 74           *     BLKPREP_DEFER: processing cannot continue for now; the request stays at the front of the queue and NULL is returned
 75           *     BLKPREP_KILL: the request cannot be processed; it is ended with -EIO and the loop goes on to the next request instead of exiting
 76           */
 77         if (!q->prep_rq_fn)
 78             break;
 79 
 80         ret = q->prep_rq_fn(q, rq);
 81         if (ret == BLKPREP_OK) {
 82             break;
 83         } else if (ret == BLKPREP_DEFER) {
 84             /*
 85              * the request may have been (partially) prepped.
 86              * we need to keep this request in the front to
 87              * avoid resource deadlock.  REQ_STARTED will
 88              * prevent other fs requests from passing this one.
 89              */
 90             if (q->dma_drain_size && blk_rq_bytes(rq) &&
 91                 !(rq->cmd_flags & REQ_DONTPREP)) {
 92                 /*
 93                  * remove the space for the drain we added
 94                  * so that we don't add it again
 95                  */
 96                 --rq->nr_phys_segments;
 97             }
 98 
 99             rq = NULL;
100             break;
101         } else if (ret == BLKPREP_KILL) {
102             rq->cmd_flags |= REQ_QUIET;
103             /*
104              * Mark this request as started so we don't trigger
105              * any debug logic in the end I/O path.
106              */
107             blk_start_request(rq);
108             __blk_end_request_all(rq, -EIO);
109         } else {
110             printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
111             break;
112         }
113     }
114 
115     return rq;
116 }

请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法。它主要有两个任务：

构造命令描述块
如果需要的话为数据传输准备聚散列表

命令描述块和聚散列表都被封装到SCSI命令描述符中。我们知道，请求至少有两个来源：

来自上层bio
来自SCSI公共服务层

在刚发现SCSI设备、为其初始化请求队列时，这个回调函数被设置为scsi_prep_fn：

 1 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)    /* allocates the request queue for a SCSI device and installs the SCSI callbacks; returns NULL if allocation fails */
 2 {
 3     struct request_queue *q;
 4 
 5     q = __scsi_alloc_queue(sdev->host, scsi_request_fn);    /* scsi_request_fn is handed over as the queue's request function */
 6     if (!q)
 7         return NULL;
 8 
 9     blk_queue_prep_rq(q, scsi_prep_fn);    /* initial prep_rq_fn: scsi_prep_fn */
10     blk_queue_softirq_done(q, scsi_softirq_done);
11     blk_queue_rq_timed_out(q, scsi_times_out);
12     blk_queue_lld_busy(q, scsi_lld_busy);
13     return q;
14 }
15 
16 /**
17 * blk_queue_prep_rq - set a prepare_request function for queue
18 * @q:        queue
19 * @pfn:    prepare_request function
20 *
21 * It's possible for a queue to register a prepare_request callback which
22 * is invoked before the request is handed to the request_fn. The goal of
23 * the function is to prepare a request for I/O, it can be used to build a
24 * cdb from the request data for instance.
25 *
26 */
27 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
28 {
29     q->prep_rq_fn = pfn;    /* plain pointer store: overwrites any previously registered callback */
30 }

初始化回调

如果SCSI设备被高层驱动绑定，这个回调函数会被修改。例如，SCSI磁盘驱动在sd_probe_async中将它设置为sd_prep_fn：

 1 static void sd_probe_async(void *data, async_cookie_t cookie)    /* fills in the gendisk of a scsi_disk, installs the sd prep/unprep callbacks and registers the disk; @data is the struct scsi_disk */
 2 {
 3     struct scsi_disk *sdkp = data;
 4     struct scsi_device *sdp;
 5     struct gendisk *gd;
 6     u32 index;
 7     struct device *dev;
 8 
 9     sdp = sdkp->device;
10     gd = sdkp->disk;
11     index = sdkp->index;
12     dev = &sdp->sdev_gendev;
13 
14     gd->major = sd_major((index & 0xf0) >> 4);    /* major and first_minor are both derived from the disk index */
15     gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
16     gd->minors = SD_MINORS;
17 
18     gd->fops = &sd_fops;
19     gd->private_data = &sdkp->driver;
20     gd->queue = sdkp->device->request_queue;    /* the gendisk shares the SCSI device's request queue */
21 
22     /* defaults, until the device tells us otherwise */
23     sdp->sector_size = 512;
24     sdkp->capacity = 0;
25     sdkp->media_present = 1;
26     sdkp->write_prot = 0;
27     sdkp->cache_override = 0;
28     sdkp->WCE = 0;
29     sdkp->RCD = 0;
30     sdkp->ATO = 0;
31     sdkp->first_scan = 1;
32     sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;
33 
34     sd_revalidate_disk(gd);
35 
36     blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);    /* from here on the queue's prep_rq_fn is the sd-specific sd_prep_fn */
37     blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
38 
39     gd->driverfs_dev = &sdp->sdev_gendev;
40     gd->flags = GENHD_FL_EXT_DEVT;
41     if (sdp->removable) {
42         gd->flags |= GENHD_FL_REMOVABLE;
43         gd->events |= DISK_EVENT_MEDIA_CHANGE;
44     }
45 
46     add_disk(gd);
47     if (sdkp->capacity)
48         sd_dif_config_host(sdkp);
49 
50     sd_revalidate_disk(gd);    /* second revalidate, after add_disk() has run */
51 
52     sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
53           sdp->removable ? "removable " : "");
54     blk_pm_runtime_init(sdp->request_queue, dev);
55     scsi_autopm_put_device(sdp);
56     put_device(&sdkp->dev);    /* NOTE(review): presumably drops a reference taken by the caller before scheduling this function; confirm */
57 }

初始化回调

在前一种情况下（回调为scsi_prep_fn），SCSI设备只能处理来自SCSI公共服务层的请求；在后一种情况下（回调为sd_prep_fn），SCSI设备不仅能处理来自SCSI公共服务层的请求，还能处理来自上层的bio请求。具体分析见下一节。

Linux3.10.0块IO子系统流程（3）-- SCSI策略例程

猜你喜欢