[PATCH v3 7/7] NVMe: End-to-end data protection
Keith Busch
keith.busch at intel.com
Fri Mar 22 11:36:44 EDT 2013
Registers a DIF capable nvme namespace with block integrity.
If the namespace meta-data is a separate buffer, the driver will use
the appropriate block integrity template to generate and verify the
protection information on writes and reads and use the bip_buf as the
meta-data pointer in the nvme command. Separate meta-data with protection
information is not usable if the meta-data is larger than a DIF field
and the protection information occurs as its last eight bytes. If the
namespace is not formatted with
protection information, a no-op block integrity template is used to
create the unused meta-data buffer.
If the meta-data is interleaved and formatted for data-protection, the
NVMe PRACT field is set to have the controller generate DIF on writes
and strip it on reads.
No block device is created for a namespace whose LBA format the driver
cannot handle.
Signed-off-by: Keith Busch <keith.busch at intel.com>
This v3 replaces patch [7/7] from this set:
http://merlin.infradead.org/pipermail/linux-nvme/2013-March/000180.html
We can't use protection information that occurs as the last eight bytes
of meta-data when it is larger than a DIF field size, and this just adds
that check.
---
drivers/block/nvme.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++--
include/linux/nvme.h | 28 ++++++++--
2 files changed, 159 insertions(+), 9 deletions(-)
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 86c7f28..457a5be 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -94,6 +94,9 @@ struct nvme_ns {
int ns_id;
int lba_shift;
+ int pi_type;
+ int extended;
+ u16 ms;
};
/*
@@ -307,6 +310,7 @@ struct nvme_iod {
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
dma_addr_t first_dma;
+ dma_addr_t meta_dma;
struct scatterlist sg[0];
};
@@ -367,10 +371,14 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
struct nvme_iod *iod = ctx;
struct bio *bio = iod->private;
u16 status = le16_to_cpup(&cqe->status) >> 1;
+ enum dma_data_direction dma_dir = bio_data_dir(bio) ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE;
if (iod->nents)
- dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
- bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, dma_dir);
+ if (bio_integrity(bio))
+ dma_unmap_single(&dev->pci_dev->dev, iod->meta_dma,
+ bio->bi_integrity->bip_size, dma_dir);
nvme_free_iod(dev, iod);
if (status)
@@ -464,6 +472,7 @@ static int nvme_setup_prps(struct nvme_dev *dev,
struct nvme_bio_pair {
struct bio b1, b2, *parent;
struct bio_vec *bv1, *bv2;
+ struct bio_integrity_payload bip1, bip2;
int err;
atomic_t cnt;
};
@@ -532,6 +541,23 @@ static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
} else
bp->bv1 = bp->bv2 = NULL;
+ if (bio_integrity(bio)) {
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int bip_split_len =
+ (len / bdev_logical_block_size(bio->bi_bdev)) *
+ bi->tuple_size;
+
+ bp->bip1.bip_buf = bio->bi_integrity->bip_buf;
+ bp->bip1.bip_size = bip_split_len;
+
+ bp->bip2.bip_buf = bio->bi_integrity->bip_buf + bip_split_len;
+ bp->bip2.bip_size = bio->bi_integrity->bip_size - bip_split_len;
+
+ bp->b1.bi_integrity = &bp->bip1;
+ bp->b2.bi_integrity = &bp->bip2;
+
+ }
+
bp->b1.bi_private = bp;
bp->b2.bi_private = bp;
@@ -692,6 +718,29 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
GFP_ATOMIC);
cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
+
+ if (ns->ms) {
+ if (ns->pi_type) {
+ control |= NVME_RW_PRINFO_PRCHK_GUARD;
+ if (ns->pi_type != NVME_NS_DPS_PI_TYPE3) {
+ control |= NVME_RW_PRINFO_PRCHK_REF;
+ cmnd->rw.reftag = cpu_to_le32(
+ (bio->bi_sector >> (ns->lba_shift - 9)) &
+ 0xffffffff);
+ }
+ }
+ if (bio_integrity(bio)) {
+ iod->meta_dma =
+ dma_map_single(nvmeq->q_dmadev,
+ bio->bi_integrity->bip_buf,
+ bio->bi_integrity->bip_size,
+ dma_dir);
+ cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ } else {
+ control |= NVME_RW_PRINFO_PRACT;
+ }
+ }
+
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
@@ -1435,16 +1484,90 @@ static void nvme_put_ns_idx(int index)
spin_unlock(&dev_list_lock);
}
+static void nvme_generate(struct blk_integrity_exchg *bix)
+{
+ return;
+}
+
+static int nvme_verify(struct blk_integrity_exchg *bix)
+{
+ return 0;
+}
+
+/*
+ * No-op integrity extension for namespace formats with meta-data but
+ * without protection settings.
+ */
+static struct blk_integrity nvme_no_dif = {
+ .name = "T10-DIF-TYPE0",
+ .generate_fn = &nvme_generate,
+ .verify_fn = &nvme_verify,
+ .get_tag_fn = NULL,
+ .set_tag_fn = NULL,
+ .tuple_size = 0,
+ .tag_size = 0,
+};
+
+static void nvme_ns_register_pi(struct nvme_ns *ns)
+{
+ struct blk_integrity integrity;
+
+ if (ns->pi_type == NVME_NS_DPS_PI_TYPE3) {
+ integrity = sd_dif_get_type3_crc();
+ integrity.tag_size = sizeof(u16);
+ } else if (ns->pi_type) {
+ integrity = sd_dif_get_type1_crc();
+ integrity.tag_size = sizeof(u16) + sizeof(u32);
+ } else {
+ integrity = nvme_no_dif;
+ }
+ integrity.tuple_size = ns->ms;
+ blk_integrity_register(ns->disk, &integrity);
+}
+
+/*
+ * Interleaved meta-data is not usable unless the controller can strip/insert
+ * it on reads/writes, which means the namespace has to be formatted with
+ * protection information and meta-data size equal to DIF size. Separate
+ * meta-data with protection information is usable if the meta-data size is
+ * equal to a DIF size or the DIF field occurs as the first eight meta-data
+ * bytes. All other formats are usable.
+ */
+static int nvme_check_pi_format(struct nvme_id_ns *id) {
+ int lbaf = id->flbas & NVME_NS_FLBAS_LBAF_MASK;
+ int ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ int pi = id->dps & NVME_NS_DPS_PI_MASK;
+ int first = id->dps & NVME_NS_DPS_PI_FIRST;
+ int extended = id->flbas & NVME_NS_FLBAS_LBA_EXTENDED;
+
+ if (ms == 8 && pi)
+ return pi;
+ if (pi && ms > 8) {
+ if (!extended && first)
+ return pi;
+ return -1;
+ }
+ if (!extended)
+ return 0;
+ if (ms)
+ return -1;
+ return 0;
+}
+
static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
struct nvme_ns *ns;
struct gendisk *disk;
- int lbaf;
+ int lbaf, pi_type;
if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
return NULL;
+ pi_type = nvme_check_pi_format(id);
+ if (pi_type < 0)
+ return NULL;
+
ns = kzalloc(sizeof(*ns), GFP_KERNEL);
if (!ns)
return NULL;
@@ -1458,6 +1581,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
blk_queue_make_request(ns->queue, nvme_make_request);
ns->dev = dev;
ns->queue->queuedata = ns;
+ ns->pi_type = pi_type;
+ if (pi_type)
+ ns->extended = id->flbas & NVME_NS_FLBAS_LBA_EXTENDED;
disk = alloc_disk(NVME_MINORS);
if (!disk)
@@ -1466,6 +1592,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
ns->disk = disk;
lbaf = id->flbas & 0xf;
ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
if (dev->max_hw_sectors)
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
@@ -1634,8 +1761,11 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev)
if (ns)
list_add_tail(&ns->list, &dev->namespaces);
}
- list_for_each_entry(ns, &dev->namespaces, list)
+ list_for_each_entry(ns, &dev->namespaces, list) {
add_disk(ns->disk);
+ if (!ns->extended && ns->pi_type)
+ nvme_ns_register_pi(ns);
+ }
goto out;
@@ -1660,6 +1790,8 @@ static int nvme_dev_remove(struct nvme_dev *dev)
list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
list_del(&ns->list);
+ if (!ns->extended && ns->pi_type)
+ blk_integrity_unregister(ns->disk);
del_gendisk(ns->disk);
nvme_ns_free(ns);
}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4fa3b0b..f499455 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -130,11 +130,25 @@ struct nvme_id_ns {
};
enum {
- NVME_NS_FEAT_THIN = 1 << 0,
- NVME_LBAF_RP_BEST = 0,
- NVME_LBAF_RP_BETTER = 1,
- NVME_LBAF_RP_GOOD = 2,
- NVME_LBAF_RP_DEGRADED = 3,
+ NVME_NS_FEAT_THIN = 1 << 0,
+ NVME_NS_MC_EXTENDED = 1 << 0,
+ NVME_NS_MC_SEPARATE = 1 << 1,
+ NVME_NS_FLBAS_LBA_EXTENDED = 1 << 4,
+ NVME_NS_FLBAS_LBAF_MASK = 0xf,
+ NVME_NS_DPC_PI_LAST = 1 << 4,
+ NVME_NS_DPC_PI_FIRST = 1 << 3,
+ NVME_NS_DPC_PI_TYPE3 = 1 << 2,
+ NVME_NS_DPC_PI_TYPE2 = 1 << 1,
+ NVME_NS_DPC_PI_TYPE1 = 1 << 0,
+ NVME_NS_DPS_PI_MASK = 0x7,
+ NVME_NS_DPS_PI_TYPE1 = 1,
+ NVME_NS_DPS_PI_TYPE2 = 2,
+ NVME_NS_DPS_PI_TYPE3 = 3,
+ NVME_NS_DPS_PI_FIRST = 8,
+ NVME_LBAF_RP_BEST = 0,
+ NVME_LBAF_RP_BETTER = 1,
+ NVME_LBAF_RP_GOOD = 2,
+ NVME_LBAF_RP_DEGRADED = 3,
};
struct nvme_smart_log {
@@ -244,6 +258,10 @@ enum {
NVME_RW_DSM_LATENCY_LOW = 3 << 4,
NVME_RW_DSM_SEQ_REQ = 1 << 6,
NVME_RW_DSM_COMPRESSED = 1 << 7,
+ NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
+ NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
+ NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
};
/* Admin commands */
--
1.7.0.4
More information about the Linux-nvme
mailing list