[RFC mdadm PATCH 07/11] imsm: cache metadata definitions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 Makefile      |    2 
 isrt-intel.h  |  256 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 super-intel.c |   16 +++-
 3 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 isrt-intel.h

diff --git a/Makefile b/Makefile
index b823d85f89e3..7d50df69a744 100644
--- a/Makefile
+++ b/Makefile
@@ -127,7 +127,7 @@ CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o
 
 SRCS =  $(patsubst %.o,%.c,$(OBJS))
 
-INCL = mdadm.h part.h bitmap.h
+INCL = mdadm.h part.h bitmap.h isrt-intel.h platform-intel.h
 
 MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \
 	policy.o lib.o \
diff --git a/isrt-intel.h b/isrt-intel.h
new file mode 100644
index 000000000000..50365de1a620
--- /dev/null
+++ b/isrt-intel.h
@@ -0,0 +1,256 @@
+/*
+ * mdadm - Intel(R) Smart Response Technology Support
+ *
+ * Copyright (C) 2011-2014 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#ifndef __ISRT_INTEL_H__
+#define __ISRT_INTEL_H__
+
+enum {
+	/* for a given cache device how many volumes can be associated */
+	MAX_NV_CACHE_VOLS = 1,
+	/* likely should be dynamically configurable when this driver is
+	 * made more generic
+	 */
+	ISRT_FRAME_SIZE = 8192, /* same value as NVC_FRAME_SIZE */
+	VOL_CONFIG_RESERVED = 32, /* __u32 entries of expansion_space per volume config */
+	MD_HEADER_RESERVED = 32, /* __u32 entries of expansion_space in the config header */
+	MAX_RAID_SERIAL_LEN = 16, /* size of acc_vol_name */
+	NVC_SIG_LEN = 32, /* length of NV_CACHE_CONFIG_SIG; leaves no room for a NUL */
+	ISRT_DEV_IDX = 0,
+	ISRT_TARGET_DEV_IDX = 1,
+
+	NV_CACHE_MODE_OFF          = 0,
+	NV_CACHE_MODE_OFF_TO_SAFE  = 1, /* powerfail recovery state */
+	NV_CACHE_MODE_OFF_TO_PERF  = 2, /* powerfail recovery state */
+	NV_CACHE_MODE_SAFE         = 3,
+	NV_CACHE_MODE_SAFE_TO_OFF  = 4,
+	NV_CACHE_MODE_PERF         = 5,
+	NV_CACHE_MODE_PERF_TO_OFF  = 6,
+	NV_CACHE_MODE_PERF_TO_SAFE = 7,
+	NV_CACHE_MODE_IS_FAILING   = 8,
+	NV_CACHE_MODE_HAS_FAILED   = 9,
+	NV_CACHE_MODE_DIS_PERF     = 10, /* caching on volume or nv cache disabled */
+	NV_CACHE_MODE_DIS_SAFE     = 11, /* volume or NV cache not associated */
+};
+
+struct segment_index_pair { /* its sizeof() determines SEGIDX_PAIRS_PER_NVC_FRAME */
+	__u32 segment;
+	__u32 index;
+};
+
+#define NV_CACHE_CONFIG_SIG "Intel IMSM NV Cache Cfg. Sig.   " /* 32 chars == NVC_SIG_LEN */
+#define MAX_NVC_SIZE_GB            128UL      /* Max NvCache we can support is 128GB */
+#define NVC_FRAME_SIZE             8192UL
+#define NVC_FRAME_SIZE_IN_KB       (NVC_FRAME_SIZE / 1024UL)                  /* 8 */
+#define NVC_FRAMES_PER_GB          (1024UL * (1024UL / NVC_FRAME_SIZE_IN_KB))   /* 128k */
+#define MAX_NVC_FRAMES             (MAX_NVC_SIZE_GB * NVC_FRAMES_PER_GB)    /* 16m */
+#define SEGIDX_PAIRS_PER_NVC_FRAME (NVC_FRAME_SIZE / sizeof(struct segment_index_pair)) /* 1k */
+#define SEGHEAP_SEGS_PER_NVC_FRAME (NVC_FRAME_SIZE / sizeof(__u32)) /* 2k */
+#define FRAMES_PER_SEGHEAP_FRAME   (SEGIDX_PAIRS_PER_NVC_FRAME \
+				    * SEGHEAP_SEGS_PER_NVC_FRAME) /* 2m */
+#define MAX_SEGHEAP_NVC_FRAMES     (MAX_NVC_FRAMES/FRAMES_PER_SEGHEAP_FRAME)  /* 8 */
+#define MAX_SEGHEAP_TOC_ENTRIES    (MAX_SEGHEAP_NVC_FRAMES + 1) /* 9; length of seg_heap_toc[] */
+
+
+/* XXX: enum size is implementation-defined; packed metadata needs a fixed-width field */
+enum nvc_shutdown_state {
+	ShutdownStateNormal, /* 0 */
+	ShutdownStateS4CrashDmpStart, /* 1 */
+	ShutdownStateS4CrashDmpEnd, /* 2 */
+	ShutdownStateS4CrashDmpFailed /* 3 */
+};
+
+struct isrt_mpb { /* follows the config header inside struct nv_cache_control_data */
+	/*
+	 * md_base_for_delta_log selects which packed metadata array
+	 * (packed_md0_nba or packed_md1_nba) is the base for the Metadata Delta
+	 * Log changes.  The current contents of the Metadata Delta Log applied
+	 * to this base become the working packed metadata upon recovery from a
+	 * power failure.  The alternate packed metadata array, indicated by
+	 * (md_base_for_delta_log ^ 1), is where the next complete write of
+	 * packed metadata from DRAM will be written.  On a clean shutdown,
+	 * packed metadata will also be written to the alternate array.
+	 */
+	__u32 packed_md0_nba; /* Start of primary packed metadata array */
+	__u32 packed_md1_nba; /* Start of secondary packed metadata array */
+	__u32 md_base_for_delta_log; /* 0 or 1; selects the packed metadata base */
+	__u32 packed_md_size; /* Size of packed metadata array in bytes */
+	__u32 aux_packed_md_nba; /* Start of array of extra metadata for driver use */
+	__u32 aux_packed_md_size; /* Size of array of extra metadata for driver use */
+	__u32 cache_frame0_nba; /* Start of actual cache frames */
+	__u32 seg_num_index_nba; /* Start of the Seg_num_index array */
+	__u32 seg_num_heap_nba; /* Start of the Seg_num_heap */
+	__u32 seg_num_heap_size; /* Size of the Seg_num Heap in bytes */
+	/*
+	 * seg_num_heap_size is always a multiple of NVM_PAGE_SIZE bytes.  The
+	 * Seg_nums in the tail of the last page are all set to 0xFFFFFFFF
+	 */
+	__u32 seg_heap_toc[MAX_SEGHEAP_TOC_ENTRIES];
+	__u32 md_delta_log_nba; /* Start of the Metadata Delta Log region */
+	/*  The Delta Log is a circular buffer */
+	__u32 md_delta_log_max_size; /* Size of the Metadata Delta Log region in bytes */
+	__u32 orom_frames_to_sync_nba; /* Start of the orom_frames_to_sync record */
+	__u32 num_cache_frames; /* Total number of cache frames */
+	__u32 cache_frame_size; /* Size of each cache frame in bytes */
+	__u32 lba_alignment; /* Offset to add to host I/O request LBA before
+			       * shifting to form the segment number
+			       */
+	__u32 valid_frame_gen_num; /* Valid cache frame generation number */
+	/*
+	 * If the cache frame metadata contains a smaller generation number,
+	 * that frame's contents are considered invalid.
+	 */
+	__u32 packed_md_frame_gen_num; /* Packed metadata frame generation number */
+	/*
+	 * This is the frame generation number associated with all frames in the
+	 * packed metadata array. If this is < valid_frame_gen_num, then all
+	 * frames in packed metadata are considered invalid.
+	 */
+	__u32 curr_clean_batch_num; /* Initialized to 0, incremented whenever
+				      * the cache goes clean. If this value is
+				      * greater than the Nv_cache_metadata
+				      * dirty_batch_num in the atomic metadata
+				      * of the cache frame, the frame is
+				      * considered clean.
+				      */
+	__u32 total_used_sectors; /* Total number of NVM sectors of size
+				    * NVM_SECTOR_SIZE used by cache frames and
+				    * metadata.
+				    */
+	/* OROM I/O Log fields */
+	__u32 orom_log_nba; /* OROM I/O Log area for next boot */
+	__u32 orom_log_size; /* OROM I/O Log size in 512-byte blocks */
+
+	/* Hibernate/Crashdump Extent_log */
+	__u32 s4_crash_dmp_extent_log_nba; /* I/O Extent Log area created by the */
+					   /* hibernate/crashdump driver for OROM */
+	/* Driver shutdown state (enum nvc_shutdown_state) utilized by the OROM */
+	__u32 driver_shutdown_state; /* fixed width: the size of an enum is not guaranteed */
+
+	__u32 validity_bits;
+	__u64 nvc_hdr_array_in_dram;
+
+	/* The following fields are used in managing the Metadata Delta Log. */
+
+	/*
+	 * Every delta record in the Metadata Delta Log has a copy of the value
+	 * of this field at the time the record was written. This gen num is
+	 * incremented by 1 every time the log fills up, and allows powerfail
+	 * recovery to easily find the end of the log (it's the first record
+	 * whose gen num field is < curr_delta_log_gen_num.)
+	 */
+	__u32 curr_delta_log_gen_num;
+	/*
+	 * This is the Nba to the start of the current generation of delta
+	 * records in the log.  Since the log is circular, the current log
+	 * extends from md_delta_log_first up to and including
+	 * ((md_delta_log_first + max_records - 2) % max_records). NOTE: when
+	 * reading the delta log, the actual end of the log is indicated by the
+	 * first record whose gen num field is < curr_delta_log_gen_num, so the
+	 * 'max_records-2' guarantees we'll have at least one delta record whose
+	 * gen num field will qualify to mark the end of the log.
+	 */
+	__u32 md_delta_log_first;
+	/*
+	 * How many free frames are used in the Metadata Delta Log. After every
+	 * write of a delta log record that contains at least one
+	 * Md_delta_log_entry, there must always be exactly [XXX: text truncated]
+	 */
+
+	__u32 md_delta_log_num_free_frames;
+	__u32 num_dirty_frames; /* Number of dirty frames in cache when this
+				  * isrt_mpb was written.
+				  */
+	__u32 num_dirty_frames_at_mode_trans; /* Number of dirty frames from
+						* the start of the most recent
+						* transition out of Performance
+						* mode (Perf_to_safe/Perf_to_off)
+						*/
+} __attribute__((packed));
+
+
+struct nv_cache_vol_config_md {
+	__u32 acc_vol_orig_family_num; /* Unique Volume Id of the accelerated
+					 * volume caching to the NVC Volume
+					 */
+	__u16 acc_vol_dev_id; /* (original family + dev_id). If there is no
+				* volume associated with Nv_cache, both of these
+				* fields are 0.
+				*/
+	__u16 nv_cache_mode; /* NV Cache mode of this volume (presumably NV_CACHE_MODE_*) */
+	/*
+	 * The serial_no of the accelerated volume associated with Nv_cache.  If
+	 * there is no volume associated with Nv_cache, acc_vol_name[0] = 0
+	 */
+	char acc_vol_name[MAX_RAID_SERIAL_LEN];
+	__u32 flags;
+	__u32 power_cycle_count; /* Power Cycle Count of the underlying disk or
+				   * volume from the last device enumeration.
+				   */
+	/* Used to determine separation case. NOTE(review): likely describes power_cycle_count */
+	__u32  expansion_space[VOL_CONFIG_RESERVED];
+} __attribute__((packed));
+
+struct nv_cache_config_md_header {
+	char signature[NVC_SIG_LEN]; /* "Intel IMSM NV Cache Cfg. Sig.   " */
+	__u16  version_number; /* NV_CACHE_CFG_MD_VERSION (not defined in this header) */
+	__u16  header_length; /* Length in bytes */
+	__u32  total_length; /* Length of the entire Config Metadata including
+			       * header and volume(s) in bytes
+			       */
+	/* Elements above here will never change even in new versions */
+	__u16  num_volumes; /* Number of volumes that have config metadata; in
+			      * 9.0 it's either 0 or 1
+			      */
+	__u32 expansion_space[MD_HEADER_RESERVED];
+	struct nv_cache_vol_config_md vol_config_md[MAX_NV_CACHE_VOLS]; /* Array of Volume */
+	/* Config Metadata entries. Contains "num_volumes" */
+	/* entries. In 9.0 'MAX_NV_CACHE_VOLS' = 1. */
+} __attribute__((packed));
+
+struct nv_cache_control_data { /* config metadata header immediately followed by the isrt_mpb */
+	struct nv_cache_config_md_header hdr;
+	struct isrt_mpb mpb;
+} __attribute__((packed));
+
+/* One or more sectors in NAND page are bad */
+#define NVC_PACKED_SECTORS_BAD (1 << 0)
+#define NVC_PACKED_DIRTY (1 << 1)
+#define NVC_PACKED_FRAME_TYPE_SHIFT (2)
+/* If set, frame is in clean area of LRU list */
+#define NVC_PACKED_IN_CLEAN_AREA (1 << 5)
+/*
+ * This frame was TRIMMed (OROM shouldn't expect the delta log rebuild to match
+ * the packed metadata stored on a clean shutdown).
+ */
+#define NVC_PACKED_TRIMMED (1 << 6)
+
+struct nv_cache_packed_md {
+	__u32 seg_num; /* Disk Segment currently assigned to frame */
+	__u16 per_sector_validity; /* Per sector validity (presumably a bitmask -- confirm) */
+	__u8 flags; /* presumably the NVC_PACKED_* bits -- confirm */
+	union {
+		__u8 pad;
+		/* repurpose padding for driver state */
+		__u8 locked;
+	};
+} __attribute__((packed));
+
+#define SEGMENTS_PER_PAGE_SHIFT 6
+#define SEGMENTS_PER_PAGE (1 << SEGMENTS_PER_PAGE_SHIFT) /* 64 */
+#define SEGMENTS_PER_PAGE_MASK (SEGMENTS_PER_PAGE-1) /* 0x3f */
+#define FRAME_SHIFT 4
+#define SECTORS_PER_FRAME (1 << FRAME_SHIFT) /* 16 */
+#define FRAME_MASK (SECTORS_PER_FRAME-1) /* 0xf */
+
+#endif /* __ISRT_INTEL_H__ */
diff --git a/super-intel.c b/super-intel.c
index 07e4c68982cd..acc46368322f 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -22,6 +22,7 @@
 #include "mdmon.h"
 #include "sha1.h"
 #include "platform-intel.h"
+#include "isrt-intel.h"
 #include <values.h>
 #include <scsi/sg.h>
 #include <ctype.h>
@@ -39,7 +40,6 @@
 #define MPB_VERSION_CNG "1.2.06"
 #define MPB_VERSION_ATTRIBS "1.3.00"
 #define MAX_SIGNATURE_LENGTH  32
-#define MAX_RAID_SERIAL_LEN   16
 
 /* supports RAID0 */
 #define MPB_ATTRIB_RAID0		__cpu_to_le32(0x00000001)
@@ -179,6 +179,8 @@ struct imsm_dev {
 #define DEV_CLONE_N_GO		__cpu_to_le32(0x400)
 #define DEV_CLONE_MAN_SYNC	__cpu_to_le32(0x800)
 #define DEV_CNG_MASTER_DISK_NUM	__cpu_to_le32(0x1000)
+/* Volume is being used as NvCache for an accelerated volume */
+#define DEV_NVC_VOLUME          __cpu_to_le32(0x4000)
 	__u32 status;	/* Persistent RaidDev status */
 	__u32 reserved_blocks; /* Reserved blocks at beginning of volume */
 	__u8  migr_priority;
@@ -189,8 +191,18 @@ struct imsm_dev {
 	__u8  cng_state;
 	__u8  cng_sub_state;
 	__u16 dev_id;
+	__u8 nv_cache_mode;
+#define DEV_NVC_CLEAN		(0)
+#define DEV_NVC_DIRTY		(1)
+#define DEV_NVC_HEALTH_GOOD     (0 << 1)
+#define DEV_NVC_HEALTH_FAILED	(1 << 1)
+#define DEV_NVC_HEALTH_READONLY	(2 << 1)
+#define DEV_NVC_HEALTH_BACKUP	(3 << 1)
+	__u8 nv_cache_flags;
+	__u32 nvc_orig_family_num; /* Unique Volume Id of the cache */
+	__u16 nvc_dev_id;	   /* volume associated with this volume */
 	__u16 fill;
-	__u32 filler[9];
+	__u32 filler[7];
 	struct imsm_vol vol;
 } __attribute__ ((packed));
 

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux