[PATCH RFC 2/5] NVMe: Basic NVMe device hotplug support

Matias Bjorling m at bjorling.me
Mon Dec 30 08:48:08 EST 2013


On 12/30/2013 11:27 AM, Santosh Y wrote:
> This patch provides basic hotplug support for NVMe.
> For NVMe hotplug to work, the PCIe slot must be hotplug capable.
>
> When an NVMe device is surprise removed and inserted back, the
> device may need some time to respond to host IO commands, and
> will return NVME_SC_NS_NOT_READY. In this case the requests
> will be requeued until the device responds to the IO commands
> with a status response or until the commands time out.
>
> Signed-off-by: Ravi Kumar <ravi.android at gmail.com>
> Signed-off-by: Santosh Y <santoshsy at gmail.com>
>
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 86b9f37..f92ec96 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -319,6 +319,14 @@ config BLK_DEV_NVME
>   	  To compile this driver as a module, choose M here: the
>   	  module will be called nvme.
>
> +config BLK_DEV_NVME_HP
> +	bool "Enable hotplug support"
> +	depends on BLK_DEV_NVME && HOTPLUG_PCI_PCIE
> +	default n
> +	help
> +	  If you say Y here, the driver will support the hotplug feature.
> +	  Hotplug only works if the PCIe slot is hotplug capable.
> +
>   config BLK_DEV_SKD
>   	tristate "STEC S1120 Block Driver"
>   	depends on PCI
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index a523296..8a02135 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -172,6 +172,13 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
>   	return cmdid;
>   }
>
> +static inline bool nvme_check_surprise_removal(struct nvme_dev *dev)
> +{
> +	if (readl(&dev->bar->csts) == -1)
> +		return true;
> +	return false;
> +}
> +
>   static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
>   				nvme_completion_fn handler, unsigned timeout)
>   {
> @@ -370,6 +377,19 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
>   	part_stat_unlock();
>   }
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP

There is an excessive number of ifdef/endif blocks here. Instead of 
special-casing each chunk of hotplug code, what about refactoring it 
into its own nvme-hotplug.c file?

This would clean up the code path for hotplugging and wouldn't 
interfere with the natural flow of the nvme core code. Another option 
is to just accept the complexity and not ifdef around the hotplug areas.

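As a rough sketch of what I mean -- the usual kernel pattern of static 
inline no-op stubs in a header, assuming a hypothetical nvme-hotplug.h 
and made-up helper names (nvme_hp_removed() and nvme_hp_requeue_bio() 
are not existing symbols):

/* Hypothetical nvme-hotplug.h -- illustration only, not this patch's API */
#include <linux/types.h>

struct nvme_dev;
struct bio;

#ifdef CONFIG_BLK_DEV_NVME_HP
/* Real implementations would live in nvme-hotplug.c */
bool nvme_hp_removed(struct nvme_dev *dev);
void nvme_hp_requeue_bio(struct nvme_dev *dev, struct bio *bio);
#else
/* Stubs compile away entirely when hotplug support is configured out */
static inline bool nvme_hp_removed(struct nvme_dev *dev)
{
	return false;
}

static inline void nvme_hp_requeue_bio(struct nvme_dev *dev,
				       struct bio *bio)
{
}
#endif

A call site in nvme-core.c, e.g. in nvme_make_request(), then stays 
ifdef-free:

	if (nvme_hp_removed(ns->dev)) {
		bio_endio(bio, -ENODEV);
		return;
	}

and the compiler discards the branch when CONFIG_BLK_DEV_NVME_HP is 
not set.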

> +static void nvme_requeue_bio(struct nvme_dev *dev, struct bio *bio)
> +{
> +	struct nvme_queue *nvmeq = get_nvmeq(dev);
> +	if (bio_list_empty(&nvmeq->sq_cong))
> +		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
> +	bio_list_add(&nvmeq->sq_cong, bio);
> +	put_nvmeq(nvmeq);
> +	wake_up_process(nvme_thread);
> +}
> +#endif
> +
> +
>   static void bio_completion(struct nvme_dev *dev, void *ctx,
>   						struct nvme_completion *cqe)
>   {
> @@ -383,10 +403,19 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
>   		nvme_end_io_acct(bio, iod->start_time);
>   	}
>   	nvme_free_iod(dev, iod);
> -	if (status)
> -		bio_endio(bio, -EIO);
> -	else
> +	if (status) {
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +		if ((status & 0xff) == NVME_SC_INVALID_NS) {
> +			bio_endio(bio, -ENODEV);
> +		} else if ((status & 0xff) == NVME_SC_NS_NOT_READY) {
> +			bio->bi_rw |= REQ_FAILFAST_DRIVER;
> +			nvme_requeue_bio(dev, bio);
> +		} else
> +#endif
> +			bio_endio(bio, -EIO);
> +	} else {
>   		bio_endio(bio, 0);
> +	}
>   }
>
>   /* length is in bytes.  gfp flags indicates whether we may sleep. */
> @@ -722,6 +751,10 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
>   	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
>   		return nvme_submit_flush(nvmeq, ns, cmdid);
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	if (bio->bi_rw & REQ_FAILFAST_DRIVER)
> +		mdelay(100);
> +#endif
>   	control = 0;
>   	if (bio->bi_rw & REQ_FUA)
>   		control |= NVME_RW_FUA;
> @@ -814,10 +847,26 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
>
>   static void nvme_make_request(struct request_queue *q, struct bio *bio)
>   {
> -	struct nvme_ns *ns = q->queuedata;
> -	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
> +	struct nvme_ns *ns = NULL;
> +	struct nvme_queue *nvmeq = NULL;
>   	int result = -EBUSY;
>
> +	if (likely(q && q->queuedata))
> +		ns = q->queuedata;
> +	if (unlikely(!ns)) {
> +		bio_endio(bio, -ENODEV);
> +		return;
> +	}
> +
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	if (test_bit(NVME_HOT_REM, &ns->dev->hp_flag) ||
> +		!(bio->bi_bdev->bd_disk->flags & GENHD_FL_UP)) {
> +		bio_endio(bio, -ENODEV);
> +		return;
> +	}
> +#endif
> +	nvmeq = get_nvmeq(ns->dev);
> +
>   	if (!nvmeq) {
>   		put_nvmeq(NULL);
>   		bio_endio(bio, -EIO);
> @@ -1120,6 +1169,12 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
>   			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
>   		};
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +		if (test_bit(NVME_HOT_REM, &nvmeq->dev->hp_flag)) {
> +			cqe.status |= (NVME_SC_INVALID_NS << 1);
> +			info[cmdid].timeout = jiffies - NVME_IO_TIMEOUT;
> +		}
> +#endif
>   		if (timeout && !time_after(now, info[cmdid].timeout))
>   			continue;
>   		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
> @@ -1205,7 +1260,7 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
>
>   	/* Don't tell the adapter to delete the admin queue.
>   	 * Don't tell a removed adapter to delete IO queues. */
> -	if (qid && readl(&dev->bar->csts) != -1) {
> +	if (qid && !nvme_check_surprise_removal(dev)) {
>   		adapter_delete_sq(dev, qid);
>   		adapter_delete_cq(dev, qid);
>   	}
> @@ -1724,6 +1779,13 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
>   		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
>   		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +		if (test_bit(NVME_HOT_REM, &ns->dev->hp_flag) ||
> +			!(bio->bi_bdev->bd_disk->flags & GENHD_FL_UP)) {
> +			bio_endio(bio, -ENODEV);
> +			continue;
> +		}
> +#endif
>   		if (bio_list_empty(&nvmeq->sq_cong))
>   			remove_wait_queue(&nvmeq->sq_full,
>   							&nvmeq->sq_cong_wait);
> @@ -1746,7 +1808,8 @@ static int nvme_kthread(void *data)
>   		spin_lock(&dev_list_lock);
>   		list_for_each_entry_safe(dev, next, &dev_list, node) {
>   			int i;
> -			if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
> +			if (!nvme_check_surprise_removal(dev) &&
> +				readl(&dev->bar->csts) & NVME_CSTS_CFS &&
>   							dev->initialized) {
>   				if (work_busy(&dev->reset_work))
>   					continue;
> @@ -2082,7 +2145,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
>   	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
>   	if (!dev->bar)
>   		goto disable;
> -	if (readl(&dev->bar->csts) == -1) {
> +	if (nvme_check_surprise_removal(dev)) {
>   		result = -ENODEV;
>   		goto unmap;
>   	}
> @@ -2265,7 +2328,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
>   	list_del_init(&dev->node);
>   	spin_unlock(&dev_list_lock);
>
> -	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
> +	if (!dev->bar || (dev->bar && nvme_check_surprise_removal(dev))) {
>   		for (i = dev->queue_count - 1; i >= 0; i--) {
>   			struct nvme_queue *nvmeq = dev->queues[i];
>   			nvme_suspend_queue(nvmeq);
> @@ -2534,6 +2597,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>
>   	dev->initialized = 1;
>   	kref_init(&dev->kref);
> +
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	if (!pdev->is_added)
> +		dev_info(&pdev->dev,
> +			"Device 0x%x is on-line\n", pdev->device);
> +#endif
>   	return 0;
>
>    remove:
> @@ -2556,6 +2625,16 @@ static void nvme_remove(struct pci_dev *pdev)
>   {
>   	struct nvme_dev *dev = pci_get_drvdata(pdev);
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	if (!pdev || !dev)
> +		return;
> +	if (nvme_check_surprise_removal(dev)) {
> +		set_bit(NVME_HOT_REM, &dev->hp_flag);
> +		dev_info(&pdev->dev,
> +			"Surprise removal of device 0x%x\n", pdev->device);
> +	}
> +	pci_dev_get(pdev);
> +#endif
>   	pci_set_drvdata(pdev, NULL);
>   	flush_work(&dev->reset_work);
>   	misc_deregister(&dev->miscdev);
> @@ -2565,6 +2644,9 @@ static void nvme_remove(struct pci_dev *pdev)
>   	nvme_release_instance(dev);
>   	nvme_release_prp_pools(dev);
>   	kref_put(&dev->kref, nvme_free_dev);
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	pci_dev_put(pdev);
> +#endif
>   }
>
>   /* These functions are yet to be implemented */
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 69ae03f..4ef375e 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -68,6 +68,12 @@ enum {
>
>   #define NVME_IO_TIMEOUT	(5 * HZ)
>
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +enum {
> +	NVME_HOT_REM,
> +};
> +#endif
> +
>   /*
>    * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>    */
> @@ -97,6 +103,9 @@ struct nvme_dev {
>   	u16 oncs;
>   	u16 abort_limit;
>   	u8 initialized;
> +#ifdef CONFIG_BLK_DEV_NVME_HP
> +	unsigned long hp_flag;
> +#endif
>   };
>
>   /*
>