All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media
@ 2023-04-26  2:14 Davidlohr Bueso
  2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-26  2:14 UTC (permalink / raw
  To: Jonathan.Cameron
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	dave, linux-cxl

Hi,

The following is a very early rfc for supporting the rest of the Media an Poison
mailbox commands, extending what is currently there for background command support
and making use of the poison list infrastructure. The kernel is also pretty much
at the same point, for which the motivation of this series is to help aid any
incoming driver support. Similarly testing is 0 but sending this early hoping for
feedback to at least know if I'm going in the right direction while the kernel bits
are developed.

A few general notes:
 o I have no particular affinity to the values provided for media scan hinting times.
   Feel free to suggest new ones.
 
 o Scan Media will directly update the poison list, with its own special type,
   based on random values (number of new poison entries and actual dpa).
   
 o As with the current poison commands, this needs proper checking of input dpa
   ranges to: 1) ensure it fits within the actual address space of the device
   (ie: compare query_start/len to memdev's  mr->addr through cxl_dstate->mem_size
   ranges). And 2) ensure that the passed range does not span both ram and pmem
   devices, albeit unlikely for the qemu cases.

 o DRAM Event Logs are not generated for every new poisoned dpa. afaict we will
   need to have a scan media version of the qmp event error injection interface,
   as I don't see qmp being usable for scan media.
 
 o Getting the scan results inherits the current get poison list limitations,
   such as limited number of records, so only one call is necessary.

Series applies on top of Jonathan's 'cxl-2023-04-19' branch, which fyi has the
v1 version of the bg cmd support (required for this series), so will require
rebasing eventually:
	https://gitlab.com/jic23/qemu/-/tree/cxl-2023-04-19

Thanks!

Davidlohr Bueso (3):
  hw/cxl: Add get scan media capabilities mailbox command support
  hw/cxl: Add scan media mailbox command support
  hw/cxl: Add scan media mailbox command support

 hw/cxl/cxl-mailbox-utils.c  | 225 +++++++++++++++++++++++++++++++++++-
 hw/mem/cxl_type3.c          |   3 +
 include/hw/cxl/cxl_device.h |   8 ++
 3 files changed, 233 insertions(+), 3 deletions(-)

--
2.40.0


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support
  2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
@ 2023-04-26  2:14 ` Davidlohr Bueso
  2023-04-28 16:18   ` Fan Ni
  2023-05-15 11:01   ` Jonathan Cameron
  2023-04-26  2:14 ` [PATCH 2/3] hw/cxl: Add scan media " Davidlohr Bueso
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-26  2:14 UTC (permalink / raw
  To: Jonathan.Cameron
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	dave, linux-cxl

Use simple heuristics to determine the cost of scanning any given chunk,
assuming cost is even across the whole device.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 hw/cxl/cxl-mailbox-utils.c | 45 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 0849cfbc2a4c..f346aa8ce3b2 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -79,6 +79,7 @@ enum {
         #define GET_POISON_LIST        0x0
         #define INJECT_POISON          0x1
         #define CLEAR_POISON           0x2
+        #define GET_SCAN_MEDIA_CAPABILITIES 0x3
     PHYSICAL_SWITCH = 0x51
         #define IDENTIFY_SWITCH_DEVICE      0x0
 };
@@ -882,7 +883,7 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
             return CXL_MBOX_INTERNAL_ERROR;
         }
     }
-  
+
     QLIST_FOREACH(ent, poison_list, node) {
         /*
          * Test for contained in entry. Simpler than general case
@@ -934,6 +935,45 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
     return CXL_MBOX_SUCCESS;
 }
 
+static CXLRetCode
+cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
+                                      CXLDeviceState *cxl_dstate, uint16_t *len)
+{
+    struct get_scan_media_capabilities_pl {
+        uint64_t pa;
+        uint64_t length;
+    } QEMU_PACKED;
+    struct get_scan_media_capabilities_out_pl {
+        uint32_t estimated_runtime_ms;
+    } QEMU_PACKED;
+    struct get_scan_media_capabilities_pl *in = (void *)cmd->payload;
+    struct get_scan_media_capabilities_out_pl *out = (void *)cmd->payload;
+    uint64_t query_start;
+    uint64_t query_length;
+
+    query_start = ldq_le_p(&in->pa);
+    /* 64 byte alignment required */
+    if (query_start & 0x3f) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+    query_length = ldq_le_p(&in->length) * 64;
+
+    if (query_start + query_length > cxl_dstate->mem_size) {
+        return CXL_MBOX_INVALID_PA;
+    }
+
+    /*
+     * Just use 400 nanosecond access/read latency + 100 ns for
+     * the cost of updating the poison list. For small enough
+     * chunks return at least 1 ms.
+     */
+    stq_le_p(&out->estimated_runtime_ms,
+             MAX(1, query_length * (0.0005L/64)));
+
+    *len = sizeof(*out);
+    return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -972,6 +1012,9 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
         cmd_media_inject_poison, 8, 0 },
     [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
         cmd_media_clear_poison, 72, 0 },
+    [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
+        cmd_media_get_scan_media_capabilities, 16, 0 },
+
 };
 
 static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
  2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
@ 2023-04-26  2:14 ` Davidlohr Bueso
  2023-04-28 16:42   ` Fan Ni
  2023-05-15 11:49   ` Jonathan Cameron
  2023-04-26  2:14 ` [PATCH 3/3] " Davidlohr Bueso
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-26  2:14 UTC (permalink / raw
  To: Jonathan.Cameron
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	dave, linux-cxl

Trigger the background command and when done go about adding
entries to the poison list, or not. Randonly decide how many
random addresses to generate.

Currently never udpate DRAM Event Log.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 hw/cxl/cxl-mailbox-utils.c  | 127 +++++++++++++++++++++++++++++++++++-
 hw/mem/cxl_type3.c          |   3 +
 include/hw/cxl/cxl_device.h |   8 +++
 3 files changed, 135 insertions(+), 3 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index f346aa8ce3b2..7f29b840a2c9 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -80,6 +80,7 @@ enum {
         #define INJECT_POISON          0x1
         #define CLEAR_POISON           0x2
         #define GET_SCAN_MEDIA_CAPABILITIES 0x3
+        #define SCAN_MEDIA             0x4
     PHYSICAL_SWITCH = 0x51
         #define IDENTIFY_SWITCH_DEVICE      0x0
 };
@@ -852,6 +853,9 @@ static CXLRetCode cmd_media_inject_poison(struct cxl_cmd *cmd,
      */
     QLIST_INSERT_HEAD(poison_list, p, node);
     ct3d->poison_list_cnt++;
+    if (scan_media_running(cxl_dstate)) {
+        ct3d->poison_list_dirty = true;
+    }
 
     return CXL_MBOX_SUCCESS;
 }
@@ -932,6 +936,10 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
     /* Any fragments have been added, free original entry */
     g_free(ent);
 
+    if (scan_media_running(cxl_dstate)) {
+        ct3d->poison_list_dirty = true;
+    }
+
     return CXL_MBOX_SUCCESS;
 }
 
@@ -974,6 +982,103 @@ cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
     return CXL_MBOX_SUCCESS;
 }
 
+static void __do_scan_media(CXLDeviceState *cxl_dstate)
+{
+    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
+    CXLPoisonList *poison_list = &ct3d->poison_list;
+    CXLPoison *ent, *p;
+    uint64_t dpa = 0;
+    int n_dpas;
+    GRand *rand;
+
+    rand = g_rand_new();
+
+    /* how many dpas to "detect" and add to the poison list */
+    n_dpas = g_random_int_range(0, CXL_MBOX_INJECT_POISON_LIMIT/4);
+
+    do {
+        /*
+         * TODO: limit it within the device dpa space (same for
+         * inject poison). For now generate some random address.
+         */
+    retry_dpa:
+        dpa = (uint64_t)g_rand_int(rand) << 32 |(uint64_t)g_rand_int(rand);
+        /* 64 byte alignment required */
+        if (dpa & 0x3f) {
+            goto retry_dpa;
+        }
+
+        QLIST_FOREACH(ent, poison_list, node) {
+            if (dpa >= ent->start &&
+                dpa + 64 <= ent->start + ent->length) {
+                goto retry_dpa;
+            }
+        }
+
+        p = g_new0(CXLPoison, 1);
+
+        p->length = 64;
+        p->start = dpa;
+        p->type = CXL_POISON_TYPE_SCANMEDIA;
+
+        QLIST_INSERT_HEAD(poison_list, p, node);
+        ct3d->poison_list_cnt++;
+    } while (--n_dpas);
+
+    g_rand_free(rand);
+}
+
+static CXLRetCode
+cmd_media_scan_media(struct cxl_cmd *cmd,
+                     CXLDeviceState *cxl_dstate, uint16_t *len)
+{
+    struct scan_media_pl {
+        uint64_t pa;
+        uint64_t length;
+        uint8_t flags;
+    } QEMU_PACKED;
+    struct scan_media_pl *in = (void *)cmd->payload;
+    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
+    uint64_t query_start;
+    uint64_t query_length;
+
+    query_start = ldq_le_p(&in->pa);
+    /* 64 byte alignment required */
+    if (query_start & 0x3f) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+    query_length = ldq_le_p(&in->length) * 64;
+
+    /*
+     * TODO: Any previous Scan Media results are discarded by
+     * the device upon receiving a new Scan Media command.
+     */
+
+    /*
+     * The host should use this command only if the poison list
+     * has overflowed and is no longer a complete list of the
+     * memory errors that exist on the media.
+     */
+    if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
+        ct3d->poison_list_cnt = 0;
+    }
+
+    if (query_start + query_length > cxl_dstate->mem_size) {
+        return CXL_MBOX_INVALID_PA;
+    }
+
+    if (in->flags == 0) { /* TODO */
+        qemu_log_mask(LOG_UNIMP,
+                      "Scan Media Event Log is unsupported\n");
+    }
+
+    cxl_dstate->bg.runtime = MAX(1, query_length * (0.0005L/64));
+    ct3d->poison_list_dirty = false;
+
+    *len = 0;
+    return CXL_MBOX_BG_STARTED;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -1014,7 +1119,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
         cmd_media_clear_poison, 72, 0 },
     [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
         cmd_media_get_scan_media_capabilities, 16, 0 },
-
+    [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
+        cmd_media_scan_media, 17, 0 },
 };
 
 static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
@@ -1048,7 +1154,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
     opcode_handler h;
     uint8_t bg_started = 0;
     uint64_t command_reg = cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_CMD];
-
+    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
     uint8_t set = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND_SET);
     uint8_t cmd = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND);
     uint16_t len = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, LENGTH);
@@ -1064,6 +1170,20 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
                     ret = CXL_MBOX_BUSY;
                     goto done;
             }
+
+            /*
+             * If the device updates its Poison List while the
+             * Scan Media operation is executing, the device
+             * shall indicate that a media scan is in progress
+             * if Get Poison List is called during the scan.
+             */
+            if (ct3d->poison_list_dirty) {
+                if (h == cmd_media_get_poison_list) {
+                    ret = CXL_MBOX_BUSY;
+                    goto done;
+                }
+            }
+
             /* forbid any selected commands while overwriting */
             if (sanitize_running(cxl_dstate)) {
                 if (h == cmd_events_get_records ||
@@ -1157,7 +1277,8 @@ static void bg_timercb(void *opaque)
                 __do_sanitization(cxl_dstate);
                 cxl_dev_enable_media(cxl_dstate);
                 break;
-            case 0x4304: /* TODO: scan media */
+            case 0x4304: /* scan media */
+                __do_scan_media(cxl_dstate);
                 break;
             default:
                 __builtin_unreachable();
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 3e63dbd83551..0bc017061f30 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -1213,6 +1213,9 @@ void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length,
 
     QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
     ct3d->poison_list_cnt++;
+    if (scan_media_running(&ct3d->cxl_dstate)) {
+        ct3d->poison_list_dirty = true;
+    }
 }
 
 /* For uncorrectable errors include support for multiple header recording */
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index cd7f28dba884..8db63c1f7cd1 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -376,12 +376,18 @@ typedef struct CXLPoison {
 #define CXL_POISON_TYPE_EXTERNAL 0x1
 #define CXL_POISON_TYPE_INTERNAL 0x2
 #define CXL_POISON_TYPE_INJECTED 0x3
+#define CXL_POISON_TYPE_SCANMEDIA 0x4
     QLIST_ENTRY(CXLPoison) node;
 } CXLPoison;
 
 typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
 #define CXL_POISON_LIST_LIMIT 256
 
+static inline bool scan_media_running(CXLDeviceState *cxl_dstate)
+{
+    return cxl_dstate->bg.runtime && cxl_dstate->bg.opcode == 0x4304;
+}
+
 struct CXLType3Dev {
     /* Private */
     PCIDevice parent_obj;
@@ -413,6 +419,8 @@ struct CXLType3Dev {
     unsigned int poison_list_cnt;
     bool poison_list_overflowed;
     uint64_t poison_list_overflow_ts;
+    /* list modified while doing scan media */
+    bool poison_list_dirty;
 };
 
 #define TYPE_CXL_TYPE3 "cxl-type3"
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
  2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
  2023-04-26  2:14 ` [PATCH 2/3] hw/cxl: Add scan media " Davidlohr Bueso
@ 2023-04-26  2:14 ` Davidlohr Bueso
  2023-04-28 17:00   ` Fan Ni
  2023-05-15 12:04   ` Jonathan Cameron
  2023-04-26  2:16 ` [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
  2023-05-15 12:01 ` Jonathan Cameron
  4 siblings, 2 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-26  2:14 UTC (permalink / raw
  To: Jonathan.Cameron
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	dave, linux-cxl

Allow the caller to retrieve the results of a previous scan media
operation, with similar semantics to the get poison list command.
Number of returned records is to the max payload output size.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 hw/cxl/cxl-mailbox-utils.c | 55 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 7f29b840a2c9..f978180d8e21 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -81,6 +81,7 @@ enum {
         #define CLEAR_POISON           0x2
         #define GET_SCAN_MEDIA_CAPABILITIES 0x3
         #define SCAN_MEDIA             0x4
+        #define GET_SCAN_MEDIA_RESULTS 0x5
     PHYSICAL_SWITCH = 0x51
         #define IDENTIFY_SWITCH_DEVICE      0x0
 };
@@ -1079,6 +1080,58 @@ cmd_media_scan_media(struct cxl_cmd *cmd,
     return CXL_MBOX_BG_STARTED;
 }
 
+static CXLRetCode cmd_media_get_scan_media_results(struct cxl_cmd *cmd,
+                                                   CXLDeviceState *cxl_dstate,
+                                                   uint16_t *len)
+{
+    struct get_scan_media_results_out_pl {
+        uint64_t dpa_restart;
+        uint64_t length;
+        uint8_t flags;
+        uint8_t rsvd1;
+        uint16_t count;
+        uint8_t rsvd2[20];
+        struct {
+            uint64_t addr;
+            uint32_t length;
+            uint32_t resv;
+        } QEMU_PACKED records[];
+    } QEMU_PACKED;
+    struct get_scan_media_results_out_pl *out = (void *)cmd->payload;
+    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
+    uint16_t record_count = 0, i = 0;
+    CXLPoisonList *poison_list = &ct3d->poison_list;
+    CXLPoison *ent;
+    uint16_t out_pl_len;
+
+    QLIST_FOREACH(ent, poison_list, node) {
+        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
+            record_count++;
+        }
+    }
+
+    out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
+    assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
+
+    memset(out, 0, out_pl_len);
+    QLIST_FOREACH(ent, poison_list, node) {
+        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
+            uint64_t start, stop;
+
+            start = ent->start & 0xffffffffffffffc0;
+            stop = ent->start & 0xffffffffffffffc0;
+            stq_le_p(&out->records[i].addr, start | (ent->type & 0x7));
+            stl_le_p(&out->records[i].length, (stop - start) / 64);
+            i++;
+        }
+    }
+
+    stw_le_p(&out->count, record_count);
+
+    *len = out_pl_len;
+    return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -1121,6 +1174,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
         cmd_media_get_scan_media_capabilities, 16, 0 },
     [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
         cmd_media_scan_media, 17, 0 },
+    [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS",
+        cmd_media_get_scan_media_results, 0, 0 },
 };
 
 static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
-- 
2.40.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media
  2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
                   ` (2 preceding siblings ...)
  2023-04-26  2:14 ` [PATCH 3/3] " Davidlohr Bueso
@ 2023-04-26  2:16 ` Davidlohr Bueso
  2023-05-15 12:01 ` Jonathan Cameron
  4 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-26  2:16 UTC (permalink / raw
  To: Jonathan.Cameron
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	linux-cxl

>Davidlohr Bueso (3):
>  hw/cxl: Add get scan media capabilities mailbox command support
>  hw/cxl: Add scan media mailbox command support
>  hw/cxl: Add scan media mailbox command support

The last one should be "Add get scan media results mailbox command support"

Thanks,
Davidlohr

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support
  2023-04-28 16:18   ` Fan Ni
@ 2023-04-28 15:49     ` Davidlohr Bueso
  0 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-28 15:49 UTC (permalink / raw
  To: Fan Ni
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

On Fri, 28 Apr 2023, Fan Ni wrote:

>> +static CXLRetCode
>> +cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
>> +                                      CXLDeviceState *cxl_dstate, uint16_t *len)
>> +{
>> +    struct get_scan_media_capabilities_pl {
>> +        uint64_t pa;
>> +        uint64_t length;
>> +    } QEMU_PACKED;
>> +    struct get_scan_media_capabilities_out_pl {
>> +        uint32_t estimated_runtime_ms;
>> +    } QEMU_PACKED;
>> +    struct get_scan_media_capabilities_pl *in = (void *)cmd->payload;
>> +    struct get_scan_media_capabilities_out_pl *out = (void *)cmd->payload;
>> +    uint64_t query_start;
>> +    uint64_t query_length;
>> +
>> +    query_start = ldq_le_p(&in->pa);
>> +    /* 64 byte alignment required */
>> +    if (query_start & 0x3f) {
>> +        return CXL_MBOX_INVALID_INPUT;
>> +    }
>> +    query_length = ldq_le_p(&in->length) * 64;
>> +
>> +    if (query_start + query_length > cxl_dstate->mem_size) {
>> +        return CXL_MBOX_INVALID_PA;
>> +    }
>> +
>> +    /*
>> +     * Just use 400 nanosecond access/read latency + 100 ns for
>> +     * the cost of updating the poison list. For small enough
>> +     * chunks return at least 1 ms.
>> +     */
>> +    stq_le_p(&out->estimated_runtime_ms,
>> +             MAX(1, query_length * (0.0005L/64)));
>
>estimated_runtime_ms is uint32_t, use stl_le_p instead.

Yup, thanks.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support
  2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
@ 2023-04-28 16:18   ` Fan Ni
  2023-04-28 15:49     ` Davidlohr Bueso
  2023-05-15 11:01   ` Jonathan Cameron
  1 sibling, 1 reply; 15+ messages in thread
From: Fan Ni @ 2023-04-28 16:18 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

The 04/25/2023 19:14, Davidlohr Bueso wrote:
> Use simple heuristics to determine the cost of scanning any given chunk,
> assuming cost is even across the whole device.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  hw/cxl/cxl-mailbox-utils.c | 45 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index 0849cfbc2a4c..f346aa8ce3b2 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -79,6 +79,7 @@ enum {
>          #define GET_POISON_LIST        0x0
>          #define INJECT_POISON          0x1
>          #define CLEAR_POISON           0x2
> +        #define GET_SCAN_MEDIA_CAPABILITIES 0x3
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -882,7 +883,7 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>              return CXL_MBOX_INTERNAL_ERROR;
>          }
>      }
> -  
> +
>      QLIST_FOREACH(ent, poison_list, node) {
>          /*
>           * Test for contained in entry. Simpler than general case
> @@ -934,6 +935,45 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>      return CXL_MBOX_SUCCESS;
>  }
>  
> +static CXLRetCode
> +cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
> +                                      CXLDeviceState *cxl_dstate, uint16_t *len)
> +{
> +    struct get_scan_media_capabilities_pl {
> +        uint64_t pa;
> +        uint64_t length;
> +    } QEMU_PACKED;
> +    struct get_scan_media_capabilities_out_pl {
> +        uint32_t estimated_runtime_ms;
> +    } QEMU_PACKED;
> +    struct get_scan_media_capabilities_pl *in = (void *)cmd->payload;
> +    struct get_scan_media_capabilities_out_pl *out = (void *)cmd->payload;
> +    uint64_t query_start;
> +    uint64_t query_length;
> +
> +    query_start = ldq_le_p(&in->pa);
> +    /* 64 byte alignment required */
> +    if (query_start & 0x3f) {
> +        return CXL_MBOX_INVALID_INPUT;
> +    }
> +    query_length = ldq_le_p(&in->length) * 64;
> +
> +    if (query_start + query_length > cxl_dstate->mem_size) {
> +        return CXL_MBOX_INVALID_PA;
> +    }
> +
> +    /*
> +     * Just use 400 nanosecond access/read latency + 100 ns for
> +     * the cost of updating the poison list. For small enough
> +     * chunks return at least 1 ms.
> +     */
> +    stq_le_p(&out->estimated_runtime_ms,
> +             MAX(1, query_length * (0.0005L/64)));

estimated_runtime_ms is uint32_t, use stl_le_p instead.

> +
> +    *len = sizeof(*out);
> +    return CXL_MBOX_SUCCESS;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -972,6 +1012,9 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_inject_poison, 8, 0 },
>      [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
>          cmd_media_clear_poison, 72, 0 },
> +    [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
> +        cmd_media_get_scan_media_capabilities, 16, 0 },
> +
>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
> -- 
> 2.40.0
> 

-- 
Fan Ni <nifan@outlook.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 ` [PATCH 2/3] hw/cxl: Add scan media " Davidlohr Bueso
@ 2023-04-28 16:42   ` Fan Ni
  2023-04-28 19:45     ` Davidlohr Bueso
  2023-05-15 11:49   ` Jonathan Cameron
  1 sibling, 1 reply; 15+ messages in thread
From: Fan Ni @ 2023-04-28 16:42 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

The 04/25/2023 19:14, Davidlohr Bueso wrote:
> Trigger the background command and when done go about adding
> entries to the poison list, or not. Randonly decide how many
> random addresses to generate.
> 
> Currently never udpate DRAM Event Log.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  hw/cxl/cxl-mailbox-utils.c  | 127 +++++++++++++++++++++++++++++++++++-
>  hw/mem/cxl_type3.c          |   3 +
>  include/hw/cxl/cxl_device.h |   8 +++
>  3 files changed, 135 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index f346aa8ce3b2..7f29b840a2c9 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -80,6 +80,7 @@ enum {
>          #define INJECT_POISON          0x1
>          #define CLEAR_POISON           0x2
>          #define GET_SCAN_MEDIA_CAPABILITIES 0x3
> +        #define SCAN_MEDIA             0x4
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -852,6 +853,9 @@ static CXLRetCode cmd_media_inject_poison(struct cxl_cmd *cmd,
>       */
>      QLIST_INSERT_HEAD(poison_list, p, node);
>      ct3d->poison_list_cnt++;
> +    if (scan_media_running(cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
>  
>      return CXL_MBOX_SUCCESS;
>  }
> @@ -932,6 +936,10 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>      /* Any fragments have been added, free original entry */
>      g_free(ent);
>  
> +    if (scan_media_running(cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
> +
>      return CXL_MBOX_SUCCESS;
>  }
>  
> @@ -974,6 +982,103 @@ cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
>      return CXL_MBOX_SUCCESS;
>  }
>  
> +static void __do_scan_media(CXLDeviceState *cxl_dstate)
> +{
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    CXLPoisonList *poison_list = &ct3d->poison_list;
> +    CXLPoison *ent, *p;
> +    uint64_t dpa = 0;
> +    int n_dpas;
> +    GRand *rand;
> +
> +    rand = g_rand_new();
> +
> +    /* how many dpas to "detect" and add to the poison list */
> +    n_dpas = g_random_int_range(0, CXL_MBOX_INJECT_POISON_LIMIT/4);
> +
> +    do {
> +        /*
> +         * TODO: limit it within the device dpa space (same for
> +         * inject poison). For now generate some random address.
> +         */
> +    retry_dpa:
> +        dpa = (uint64_t)g_rand_int(rand) << 32 |(uint64_t)g_rand_int(rand);
> +        /* 64 byte alignment required */
> +        if (dpa & 0x3f) {
> +            goto retry_dpa;
> +        }
This seems very inefficient. the possibility for dpa to pass the check
is 1/64. Why not just do something like below:
dpa = (uint64_t)g_rand_int(rand) << 6;
> +
> +        QLIST_FOREACH(ent, poison_list, node) {
> +            if (dpa >= ent->start &&
> +                dpa + 64 <= ent->start + ent->length) {
> +                goto retry_dpa;
> +            }
> +        }
> +
> +        p = g_new0(CXLPoison, 1);
> +
> +        p->length = 64;
> +        p->start = dpa;
> +        p->type = CXL_POISON_TYPE_SCANMEDIA;
> +
> +        QLIST_INSERT_HEAD(poison_list, p, node);
> +        ct3d->poison_list_cnt++;
> +    } while (--n_dpas);
> +
> +    g_rand_free(rand);
> +}
> +
> +static CXLRetCode
> +cmd_media_scan_media(struct cxl_cmd *cmd,
> +                     CXLDeviceState *cxl_dstate, uint16_t *len)
> +{
> +    struct scan_media_pl {
> +        uint64_t pa;
> +        uint64_t length;
> +        uint8_t flags;
> +    } QEMU_PACKED;
> +    struct scan_media_pl *in = (void *)cmd->payload;
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    uint64_t query_start;
> +    uint64_t query_length;
> +
> +    query_start = ldq_le_p(&in->pa);
> +    /* 64 byte alignment required */
> +    if (query_start & 0x3f) {
> +        return CXL_MBOX_INVALID_INPUT;
> +    }
> +    query_length = ldq_le_p(&in->length) * 64;
> +
> +    /*
> +     * TODO: Any previous Scan Media results are discarded by
> +     * the device upon receiving a new Scan Media command.
> +     */
> +
> +    /*
> +     * The host should use this command only if the poison list
> +     * has overflowed and is no longer a complete list of the
> +     * memory errors that exist on the media.
> +     */
> +    if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
> +        ct3d->poison_list_cnt = 0;
> +    }
> +
> +    if (query_start + query_length > cxl_dstate->mem_size) {
> +        return CXL_MBOX_INVALID_PA;
> +    }
> +
> +    if (in->flags == 0) { /* TODO */
> +        qemu_log_mask(LOG_UNIMP,
> +                      "Scan Media Event Log is unsupported\n");
> +    }
> +
> +    cxl_dstate->bg.runtime = MAX(1, query_length * (0.0005L/64));
> +    ct3d->poison_list_dirty = false;
> +
> +    *len = 0;
> +    return CXL_MBOX_BG_STARTED;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -1014,7 +1119,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_clear_poison, 72, 0 },
>      [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
>          cmd_media_get_scan_media_capabilities, 16, 0 },
should command effect be BACKGROUND_OPERATION instead of 0?
> -
> +    [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
> +        cmd_media_scan_media, 17, 0 },
>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
> @@ -1048,7 +1154,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
>      opcode_handler h;
>      uint8_t bg_started = 0;
>      uint64_t command_reg = cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_CMD];
> -
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
>      uint8_t set = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND_SET);
>      uint8_t cmd = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND);
>      uint16_t len = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, LENGTH);
> @@ -1064,6 +1170,20 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
>                      ret = CXL_MBOX_BUSY;
>                      goto done;
>              }
> +
> +            /*
> +             * If the device updates its Poison List while the
> +             * Scan Media operation is executing, the device
> +             * shall indicate that a media scan is in progress
> +             * if Get Poison List is called during the scan.
> +             */
> +            if (ct3d->poison_list_dirty) {
> +                if (h == cmd_media_get_poison_list) {
> +                    ret = CXL_MBOX_BUSY;
> +                    goto done;
> +                }
> +            }
> +
>              /* forbid any selected commands while overwriting */
>              if (sanitize_running(cxl_dstate)) {
>                  if (h == cmd_events_get_records ||
> @@ -1157,7 +1277,8 @@ static void bg_timercb(void *opaque)
>                  __do_sanitization(cxl_dstate);
>                  cxl_dev_enable_media(cxl_dstate);
>                  break;
> -            case 0x4304: /* TODO: scan media */
> +            case 0x4304: /* scan media */
> +                __do_scan_media(cxl_dstate);
>                  break;
>              default:
>                  __builtin_unreachable();
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index 3e63dbd83551..0bc017061f30 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -1213,6 +1213,9 @@ void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length,
>  
>      QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
>      ct3d->poison_list_cnt++;
> +    if (scan_media_running(&ct3d->cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
>  }
>  
>  /* For uncorrectable errors include support for multiple header recording */
> diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> index cd7f28dba884..8db63c1f7cd1 100644
> --- a/include/hw/cxl/cxl_device.h
> +++ b/include/hw/cxl/cxl_device.h
> @@ -376,12 +376,18 @@ typedef struct CXLPoison {
>  #define CXL_POISON_TYPE_EXTERNAL 0x1
>  #define CXL_POISON_TYPE_INTERNAL 0x2
>  #define CXL_POISON_TYPE_INJECTED 0x3
> +#define CXL_POISON_TYPE_SCANMEDIA 0x4
>      QLIST_ENTRY(CXLPoison) node;
>  } CXLPoison;
>  
>  typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
>  #define CXL_POISON_LIST_LIMIT 256
>  
> +static inline bool scan_media_running(CXLDeviceState *cxl_dstate)
> +{
> +    return cxl_dstate->bg.runtime && cxl_dstate->bg.opcode == 0x4304;
> +}
> +
>  struct CXLType3Dev {
>      /* Private */
>      PCIDevice parent_obj;
> @@ -413,6 +419,8 @@ struct CXLType3Dev {
>      unsigned int poison_list_cnt;
>      bool poison_list_overflowed;
>      uint64_t poison_list_overflow_ts;
> +    /* list modified while doing scan media */
> +    bool poison_list_dirty;
>  };
>  
>  #define TYPE_CXL_TYPE3 "cxl-type3"
> -- 
> 2.40.0
> 

-- 
Fan Ni <nifan@outlook.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 ` [PATCH 3/3] " Davidlohr Bueso
@ 2023-04-28 17:00   ` Fan Ni
  2023-04-28 19:30     ` Davidlohr Bueso
  2023-05-15 12:04   ` Jonathan Cameron
  1 sibling, 1 reply; 15+ messages in thread
From: Fan Ni @ 2023-04-28 17:00 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

The 04/25/2023 19:14, Davidlohr Bueso wrote:
> Allow the caller to retrieve the results of a previous scan media
> operation, with similar semantics to the get poison list command.
> Number of returned records is to the max payload output size.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  hw/cxl/cxl-mailbox-utils.c | 55 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index 7f29b840a2c9..f978180d8e21 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -81,6 +81,7 @@ enum {
>          #define CLEAR_POISON           0x2
>          #define GET_SCAN_MEDIA_CAPABILITIES 0x3
>          #define SCAN_MEDIA             0x4
> +        #define GET_SCAN_MEDIA_RESULTS 0x5
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -1079,6 +1080,58 @@ cmd_media_scan_media(struct cxl_cmd *cmd,
>      return CXL_MBOX_BG_STARTED;
>  }
>  
> +static CXLRetCode cmd_media_get_scan_media_results(struct cxl_cmd *cmd,
> +                                                   CXLDeviceState *cxl_dstate,
> +                                                   uint16_t *len)
> +{
> +    struct get_scan_media_results_out_pl {
> +        uint64_t dpa_restart;
> +        uint64_t length;
> +        uint8_t flags;
> +        uint8_t rsvd1;
> +        uint16_t count;
> +        uint8_t rsvd2[20];
20-->0xc
> +        struct {
> +            uint64_t addr;
> +            uint32_t length;
> +            uint32_t resv;
> +        } QEMU_PACKED records[];
> +    } QEMU_PACKED;
> +    struct get_scan_media_results_out_pl *out = (void *)cmd->payload;
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    uint16_t record_count = 0, i = 0;
> +    CXLPoisonList *poison_list = &ct3d->poison_list;
> +    CXLPoison *ent;
> +    uint16_t out_pl_len;
> +
> +    QLIST_FOREACH(ent, poison_list, node) {
> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
> +            record_count++;
> +        }
> +    }
> +
> +    out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
> +    assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
Add TODO to cover the case when eror list is larger than output payload
size??
> +
> +    memset(out, 0, out_pl_len);
> +    QLIST_FOREACH(ent, poison_list, node) {
> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
> +            uint64_t start, stop;
> +
> +            start = ent->start & 0xffffffffffffffc0;
> +            stop = ent->start & 0xffffffffffffffc0;
so start == stop??? missing ent->length for stop???
> +            stq_le_p(&out->records[i].addr, start | (ent->type & 0x7));
> +            stl_le_p(&out->records[i].length, (stop - start) / 64);
length always be 0.
> +            i++;
> +        }
> +    }
> +
> +    stw_le_p(&out->count, record_count);
> +
> +    *len = out_pl_len;
> +    return CXL_MBOX_SUCCESS;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -1121,6 +1174,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_get_scan_media_capabilities, 16, 0 },
>      [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
>          cmd_media_scan_media, 17, 0 },
> +    [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS",
> +        cmd_media_get_scan_media_results, 0, 0 },
>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
> -- 
> 2.40.0
> 

-- 
Fan Ni <nifan@outlook.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] hw/cxl: Add scan media mailbox command support
  2023-04-28 17:00   ` Fan Ni
@ 2023-04-28 19:30     ` Davidlohr Bueso
  0 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-28 19:30 UTC (permalink / raw
  To: Fan Ni
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

On Fri, 28 Apr 2023, Fan Ni wrote:

>The 04/25/2023 19:14, Davidlohr Bueso wrote:
>> +static CXLRetCode cmd_media_get_scan_media_results(struct cxl_cmd *cmd,
>> +                                                   CXLDeviceState *cxl_dstate,
>> +                                                   uint16_t *len)
>> +{
>> +    struct get_scan_media_results_out_pl {
>> +        uint64_t dpa_restart;
>> +        uint64_t length;
>> +        uint8_t flags;
>> +        uint8_t rsvd1;
>> +        uint16_t count;
>> +        uint8_t rsvd2[20];
>20-->0xc
>> +        struct {
>> +            uint64_t addr;
>> +            uint32_t length;
>> +            uint32_t resv;
>> +        } QEMU_PACKED records[];
>> +    } QEMU_PACKED;
>> +    struct get_scan_media_results_out_pl *out = (void *)cmd->payload;
>> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
>> +    uint16_t record_count = 0, i = 0;
>> +    CXLPoisonList *poison_list = &ct3d->poison_list;
>> +    CXLPoison *ent;
>> +    uint16_t out_pl_len;
>> +
>> +    QLIST_FOREACH(ent, poison_list, node) {
>> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
>> +            record_count++;
>> +        }
>> +    }
>> +
>> +    out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
>> +    assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
>Add TODO to cover the case when eror list is larger than output payload
>size??

Sure, but this is why I referenced this limitation in the cover letter.

>> +
>> +    memset(out, 0, out_pl_len);
>> +    QLIST_FOREACH(ent, poison_list, node) {
>> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
>> +            uint64_t start, stop;
>> +
>> +            start = ent->start & 0xffffffffffffffc0;
>> +            stop = ent->start & 0xffffffffffffffc0;
>so start == stop??? missing ent->length for stop???

duh yes should be ent->start + ent->length. I did say all this was untested :)

Thanks,
Davidlor

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] hw/cxl: Add scan media mailbox command support
  2023-04-28 16:42   ` Fan Ni
@ 2023-04-28 19:45     ` Davidlohr Bueso
  0 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2023-04-28 19:45 UTC (permalink / raw
  To: Fan Ni
  Cc: Jonathan.Cameron, alison.schofield, ira.weiny, dan.j.williams,
	fan.ni, a.manzanares, linux-cxl

On Fri, 28 Apr 2023, Fan Ni wrote:

>The 04/25/2023 19:14, Davidlohr Bueso wrote:
>> Trigger the background command and when done go about adding
>> entries to the poison list, or not. Randonly decide how many
>> random addresses to generate.
>>
>> Currently never udpate DRAM Event Log.
>>
>> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
>> ---
>>  hw/cxl/cxl-mailbox-utils.c  | 127 +++++++++++++++++++++++++++++++++++-
>>  hw/mem/cxl_type3.c          |   3 +
>>  include/hw/cxl/cxl_device.h |   8 +++
>>  3 files changed, 135 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
>> index f346aa8ce3b2..7f29b840a2c9 100644
>> --- a/hw/cxl/cxl-mailbox-utils.c
>> +++ b/hw/cxl/cxl-mailbox-utils.c
>> @@ -80,6 +80,7 @@ enum {
>>          #define INJECT_POISON          0x1
>>          #define CLEAR_POISON           0x2
>>          #define GET_SCAN_MEDIA_CAPABILITIES 0x3
>> +        #define SCAN_MEDIA             0x4
>>      PHYSICAL_SWITCH = 0x51
>>          #define IDENTIFY_SWITCH_DEVICE      0x0
>>  };
>> @@ -852,6 +853,9 @@ static CXLRetCode cmd_media_inject_poison(struct cxl_cmd *cmd,
>>       */
>>      QLIST_INSERT_HEAD(poison_list, p, node);
>>      ct3d->poison_list_cnt++;
>> +    if (scan_media_running(cxl_dstate)) {
>> +        ct3d->poison_list_dirty = true;
>> +    }
>>
>>      return CXL_MBOX_SUCCESS;
>>  }
>> @@ -932,6 +936,10 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>>      /* Any fragments have been added, free original entry */
>>      g_free(ent);
>>
>> +    if (scan_media_running(cxl_dstate)) {
>> +        ct3d->poison_list_dirty = true;
>> +    }
>> +
>>      return CXL_MBOX_SUCCESS;
>>  }
>>
>> @@ -974,6 +982,103 @@ cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
>>      return CXL_MBOX_SUCCESS;
>>  }
>>
>> +static void __do_scan_media(CXLDeviceState *cxl_dstate)
>> +{
>> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
>> +    CXLPoisonList *poison_list = &ct3d->poison_list;
>> +    CXLPoison *ent, *p;
>> +    uint64_t dpa = 0;
>> +    int n_dpas;
>> +    GRand *rand;
>> +
>> +    rand = g_rand_new();
>> +
>> +    /* how many dpas to "detect" and add to the poison list */
>> +    n_dpas = g_random_int_range(0, CXL_MBOX_INJECT_POISON_LIMIT/4);
>> +
>> +    do {
>> +        /*
>> +         * TODO: limit it within the device dpa space (same for
>> +         * inject poison). For now generate some random address.
>> +         */
>> +    retry_dpa:
>> +        dpa = (uint64_t)g_rand_int(rand) << 32 |(uint64_t)g_rand_int(rand);
>> +        /* 64 byte alignment required */
>> +        if (dpa & 0x3f) {
>> +            goto retry_dpa;
>> +        }
>This seems very inefficient. the possibility for dpa to pass the check
>is 1/64. Why not just do something like below:
>dpa = (uint64_t)g_rand_int(rand) << 6;
>> +
>> +        QLIST_FOREACH(ent, poison_list, node) {
>> +            if (dpa >= ent->start &&
>> +                dpa + 64 <= ent->start + ent->length) {
>> +                goto retry_dpa;
>> +            }
>> +        }
>> +
>> +        p = g_new0(CXLPoison, 1);
>> +
>> +        p->length = 64;
>> +        p->start = dpa;
>> +        p->type = CXL_POISON_TYPE_SCANMEDIA;
>> +
>> +        QLIST_INSERT_HEAD(poison_list, p, node);
>> +        ct3d->poison_list_cnt++;
>> +    } while (--n_dpas);
>> +
>> +    g_rand_free(rand);
>> +}
>> +
>> +static CXLRetCode
>> +cmd_media_scan_media(struct cxl_cmd *cmd,
>> +                     CXLDeviceState *cxl_dstate, uint16_t *len)
>> +{
>> +    struct scan_media_pl {
>> +        uint64_t pa;
>> +        uint64_t length;
>> +        uint8_t flags;
>> +    } QEMU_PACKED;
>> +    struct scan_media_pl *in = (void *)cmd->payload;
>> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
>> +    uint64_t query_start;
>> +    uint64_t query_length;
>> +
>> +    query_start = ldq_le_p(&in->pa);
>> +    /* 64 byte alignment required */
>> +    if (query_start & 0x3f) {
>> +        return CXL_MBOX_INVALID_INPUT;
>> +    }
>> +    query_length = ldq_le_p(&in->length) * 64;
>> +
>> +    /*
>> +     * TODO: Any previous Scan Media results are discarded by
>> +     * the device upon receiving a new Scan Media command.
>> +     */
>> +
>> +    /*
>> +     * The host should use this command only if the poison list
>> +     * has overflowed and is no longer a complete list of the
>> +     * memory errors that exist on the media.
>> +     */
>> +    if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
>> +        ct3d->poison_list_cnt = 0;
>> +    }
>> +
>> +    if (query_start + query_length > cxl_dstate->mem_size) {
>> +        return CXL_MBOX_INVALID_PA;
>> +    }
>> +
>> +    if (in->flags == 0) { /* TODO */
>> +        qemu_log_mask(LOG_UNIMP,
>> +                      "Scan Media Event Log is unsupported\n");
>> +    }
>> +
>> +    cxl_dstate->bg.runtime = MAX(1, query_length * (0.0005L/64));
>> +    ct3d->poison_list_dirty = false;
>> +
>> +    *len = 0;
>> +    return CXL_MBOX_BG_STARTED;
>> +}
>> +
>>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
>> @@ -1014,7 +1119,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>>          cmd_media_clear_poison, 72, 0 },
>>      [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
>>          cmd_media_get_scan_media_capabilities, 16, 0 },
>should command effect be BACKGROUND_OPERATION instead of 0?

Yes...

>> -
>> +    [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
>> +        cmd_media_scan_media, 17, 0 },

but for this one.

>>  };

I'll incorporate your feedback in a next v1 - probably after the kernel
changes are posted, for actual testing.

Thanks for going through the rfc!

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support
  2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
  2023-04-28 16:18   ` Fan Ni
@ 2023-05-15 11:01   ` Jonathan Cameron
  1 sibling, 0 replies; 15+ messages in thread
From: Jonathan Cameron @ 2023-05-15 11:01 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	linux-cxl

On Tue, 25 Apr 2023 19:14:16 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Use simple heuristics to determine the cost of scanning any given chunk,
> assuming cost is even across the whole device.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Hi.

A few trivial additions / comments / moans.
Obviously it's an RFC but I can't resist pointing things out you'd
fix for the v1.

Thanks,

Jonathan

> ---
>  hw/cxl/cxl-mailbox-utils.c | 45 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index 0849cfbc2a4c..f346aa8ce3b2 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -79,6 +79,7 @@ enum {
>          #define GET_POISON_LIST        0x0
>          #define INJECT_POISON          0x1
>          #define CLEAR_POISON           0x2
> +        #define GET_SCAN_MEDIA_CAPABILITIES 0x3
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -882,7 +883,7 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>              return CXL_MBOX_INTERNAL_ERROR;
>          }
>      }
> -  
> +
Hmm. Where did that white space sneak in? 
I'm guessing in something else I'm carrying on the branch you based on (I'm not
seeing this locally now - so might already be fixed).

Still shouldn't be in here!


>      QLIST_FOREACH(ent, poison_list, node) {
>          /*
>           * Test for contained in entry. Simpler than general case
> @@ -934,6 +935,45 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>      return CXL_MBOX_SUCCESS;
>  }
>  
> +static CXLRetCode
> +cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
> +                                      CXLDeviceState *cxl_dstate, uint16_t *len)
> +{
> +    struct get_scan_media_capabilities_pl {
> +        uint64_t pa;
> +        uint64_t length;
> +    } QEMU_PACKED;

I'd add some blank lines to help readability a little.  One here.

> +    struct get_scan_media_capabilities_out_pl {
> +        uint32_t estimated_runtime_ms;
> +    } QEMU_PACKED;

One here.

> +    struct get_scan_media_capabilities_pl *in = (void *)cmd->payload;
> +    struct get_scan_media_capabilities_out_pl *out = (void *)cmd->payload;
> +    uint64_t query_start;
> +    uint64_t query_length;
> +
> +    query_start = ldq_le_p(&in->pa);
> +    /* 64 byte alignment required */
> +    if (query_start & 0x3f) {
> +        return CXL_MBOX_INVALID_INPUT;
> +    }
> +    query_length = ldq_le_p(&in->length) * 64;
> +
> +    if (query_start + query_length > cxl_dstate->mem_size) {
> +        return CXL_MBOX_INVALID_PA;
> +    }
> +
> +    /*
> +     * Just use 400 nanosecond access/read latency + 100 ns for
> +     * the cost of updating the poison list. For small enough
> +     * chunks return at least 1 ms.
> +     */
> +    stq_le_p(&out->estimated_runtime_ms,
> +             MAX(1, query_length * (0.0005L/64)));
> +
> +    *len = sizeof(*out);
> +    return CXL_MBOX_SUCCESS;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -972,6 +1012,9 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_inject_poison, 8, 0 },
>      [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
>          cmd_media_clear_poison, 72, 0 },
> +    [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
> +        cmd_media_get_scan_media_capabilities, 16, 0 },
> +
Grumpy maintainer moment.  No blank line should be added here ;)

>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 ` [PATCH 2/3] hw/cxl: Add scan media " Davidlohr Bueso
  2023-04-28 16:42   ` Fan Ni
@ 2023-05-15 11:49   ` Jonathan Cameron
  1 sibling, 0 replies; 15+ messages in thread
From: Jonathan Cameron @ 2023-05-15 11:49 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	linux-cxl

On Tue, 25 Apr 2023 19:14:17 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Trigger the background command and when done go about adding
> entries to the poison list, or not. Randonly decide how many
> random addresses to generate.

Hi Davidlohr,
I'm not keen on random when we have an injection interface already.
Just make the QMP qemu poison list injector allow injection beyond
the poison list overflow then random etc becomes a userspace test
script problem not one for QEMU to emulate.

> 
> Currently never udpate DRAM Event Log.
Yeah. That's true for poison in general.

All the patch sets got reordered so many times there wasn't a good
way to add that without dependencies. After everything is in place
I'd like to circle back and fix the log part up.

> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>


Various things inline.  Biggest is I don't think the poison list clearing
is correct from my reading of the spec.

> ---
>  hw/cxl/cxl-mailbox-utils.c  | 127 +++++++++++++++++++++++++++++++++++-
>  hw/mem/cxl_type3.c          |   3 +
>  include/hw/cxl/cxl_device.h |   8 +++
>  3 files changed, 135 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index f346aa8ce3b2..7f29b840a2c9 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -80,6 +80,7 @@ enum {
>          #define INJECT_POISON          0x1
>          #define CLEAR_POISON           0x2
>          #define GET_SCAN_MEDIA_CAPABILITIES 0x3
> +        #define SCAN_MEDIA             0x4
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -852,6 +853,9 @@ static CXLRetCode cmd_media_inject_poison(struct cxl_cmd *cmd,
>       */
>      QLIST_INSERT_HEAD(poison_list, p, node);
>      ct3d->poison_list_cnt++;
> +    if (scan_media_running(cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
>  
>      return CXL_MBOX_SUCCESS;
>  }
> @@ -932,6 +936,10 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd *cmd,
>      /* Any fragments have been added, free original entry */
>      g_free(ent);
>  
> +    if (scan_media_running(cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
> +
>      return CXL_MBOX_SUCCESS;
>  }
>  
> @@ -974,6 +982,103 @@ cmd_media_get_scan_media_capabilities(struct cxl_cmd *cmd,
>      return CXL_MBOX_SUCCESS;
>  }
>  
> +static void __do_scan_media(CXLDeviceState *cxl_dstate)
> +{
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    CXLPoisonList *poison_list = &ct3d->poison_list;
> +    CXLPoison *ent, *p;
> +    uint64_t dpa = 0;
> +    int n_dpas;
> +    GRand *rand;
> +
> +    rand = g_rand_new();
> +
> +    /* how many dpas to "detect" and add to the poison list */
> +    n_dpas = g_random_int_range(0, CXL_MBOX_INJECT_POISON_LIMIT/4);

Not sure we need to do this and it's a pain to use in testing given
random entries.  Maybe just let the QMP poison injection path
log a bunch more after the poison list is full?  That should give
us something nice and simple to test.  If you want random injection
then write a script to poke that interface appropriately.
 
> +
> +    do {
> +        /*
> +         * TODO: limit it within the device dpa space (same for
> +         * inject poison). For now generate some random address.
> +         */
> +    retry_dpa:
> +        dpa = (uint64_t)g_rand_int(rand) << 32 |(uint64_t)g_rand_int(rand);
> +        /* 64 byte alignment required */
> +        if (dpa & 0x3f) {
> +            goto retry_dpa;
> +        }
> +
> +        QLIST_FOREACH(ent, poison_list, node) {
> +            if (dpa >= ent->start &&
> +                dpa + 64 <= ent->start + ent->length) {
> +                goto retry_dpa;
> +            }
> +        }
> +
> +        p = g_new0(CXLPoison, 1);
> +
> +        p->length = 64;
> +        p->start = dpa;
> +        p->type = CXL_POISON_TYPE_SCANMEDIA;
> +
> +        QLIST_INSERT_HEAD(poison_list, p, node);
> +        ct3d->poison_list_cnt++;
> +    } while (--n_dpas);
> +
> +    g_rand_free(rand);
> +}
> +
> +static CXLRetCode
> +cmd_media_scan_media(struct cxl_cmd *cmd,
> +                     CXLDeviceState *cxl_dstate, uint16_t *len)
> +{
> +    struct scan_media_pl {
> +        uint64_t pa;
> +        uint64_t length;
> +        uint8_t flags;
> +    } QEMU_PACKED;
> +    struct scan_media_pl *in = (void *)cmd->payload;
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    uint64_t query_start;
> +    uint64_t query_length;
> +
> +    query_start = ldq_le_p(&in->pa);
> +    /* 64 byte alignment required */
> +    if (query_start & 0x3f) {
> +        return CXL_MBOX_INVALID_INPUT;
> +    }
> +    query_length = ldq_le_p(&in->length) * 64;
> +
> +    /*
> +     * TODO: Any previous Scan Media results are discarded by
> +     * the device upon receiving a new Scan Media command.
> +     */
> +
> +    /*
> +     * The host should use this command only if the poison list
> +     * has overflowed and is no longer a complete list of the
> +     * memory errors that exist on the media.
Reference.

> +     */
> +    if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) {
> +        ct3d->poison_list_cnt = 0;

Clearing the poison list definitely needs a reference.
I don't think this is valid, but might be reading things wrong.
I think the cycle if you get a poison list overflow is...

1) Call scan media / scan media results (including handling of
   premature stop and restart etc) that gets you the full list.
2) Clear that poison either using command or impdef host specific
   write of full cacheline.
3) Rerun Scan media request on full address range.  (see under
Get Poison List Output). 
(On this last one: I'm not sure what you meant by timeslicing commands
 in the BG commands kernel series, but if you meant breaking them into
 smaller memory address ranges then this will not work).

If the list being rebuilt is under the limit it will clear the overflow.
If not we go around a gain a few times and if no progress made give
up and mark device dying and power down machine ASAP.


> +    }
> +
> +    if (query_start + query_length > cxl_dstate->mem_size) {
> +        return CXL_MBOX_INVALID_PA;
> +    }
> +
> +    if (in->flags == 0) { /* TODO */
> +        qemu_log_mask(LOG_UNIMP,
> +                      "Scan Media Event Log is unsupported\n");
> +    }
> +
> +    cxl_dstate->bg.runtime = MAX(1, query_length * (0.0005L/64));
> +    ct3d->poison_list_dirty = false;
> +
> +    *len = 0;
> +    return CXL_MBOX_BG_STARTED;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -1014,7 +1119,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_clear_poison, 72, 0 },
>      [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES",
>          cmd_media_get_scan_media_capabilities, 16, 0 },
> -

And there it goes again ;)

> +    [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
> +        cmd_media_scan_media, 17, 0 },
>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
> @@ -1048,7 +1154,7 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
>      opcode_handler h;
>      uint8_t bg_started = 0;
>      uint64_t command_reg = cxl_dstate->mbox_reg_state64[R_CXL_DEV_MAILBOX_CMD];
> -
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);

Move this up as it's breaking up the reading of the register and the separation
of it's fields.

>      uint8_t set = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND_SET);
>      uint8_t cmd = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, COMMAND);
>      uint16_t len = FIELD_EX64(command_reg, CXL_DEV_MAILBOX_CMD, LENGTH);
> @@ -1064,6 +1170,20 @@ void cxl_process_mailbox(CXLDeviceState *cxl_dstate)
>                      ret = CXL_MBOX_BUSY;
>                      goto done;
>              }
> +
> +            /*
> +             * If the device updates its Poison List while the
> +             * Scan Media operation is executing, the device

Line wrap seems a little short.

> +             * shall indicate that a media scan is in progress
> +             * if Get Poison List is called during the scan.

Spec reference (as that text is cut and paste from spec ;)

> +             */
> +            if (ct3d->poison_list_dirty) {
> +                if (h == cmd_media_get_poison_list) {
> +                    ret = CXL_MBOX_BUSY;

Not a valid return code for Get Poison List.
Also the text under 'scan media in progress' flag in the poison list
output suggests to me it doesn't fail the command, it simply
sets the flag to say the data might be wrong / partial.
So this needs to set that flag and carry on otherwise.

Scan media includes some weasel words that means a device might
not update it's poison list anyway or set this flag, but I think
it's sensible to make the default in QEMU that it does both update
poison list and set the flag.  Don't think you setting flag and not
updating this list is valid....

> +                    goto done;
> +                }
> +            }
> +
>              /* forbid any selected commands while overwriting */
>              if (sanitize_running(cxl_dstate)) {
>                  if (h == cmd_events_get_records ||
> @@ -1157,7 +1277,8 @@ static void bg_timercb(void *opaque)
>                  __do_sanitization(cxl_dstate);
>                  cxl_dev_enable_media(cxl_dstate);
>                  break;
> -            case 0x4304: /* TODO: scan media */
> +            case 0x4304: /* scan media */

Better to build these, then we don't need the comment.

	MEDIA_AND_POISON << 8 | ....

> +                __do_scan_media(cxl_dstate);
>                  break;
>              default:
>                  __builtin_unreachable();
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index 3e63dbd83551..0bc017061f30 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -1213,6 +1213,9 @@ void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length,
>  
>      QLIST_INSERT_HEAD(&ct3d->poison_list, p, node);
>      ct3d->poison_list_cnt++;
> +    if (scan_media_running(&ct3d->cxl_dstate)) {
> +        ct3d->poison_list_dirty = true;
> +    }
>  }
>  
>  /* For uncorrectable errors include support for multiple header recording */
> diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> index cd7f28dba884..8db63c1f7cd1 100644
> --- a/include/hw/cxl/cxl_device.h
> +++ b/include/hw/cxl/cxl_device.h
> @@ -376,12 +376,18 @@ typedef struct CXLPoison {
>  #define CXL_POISON_TYPE_EXTERNAL 0x1
>  #define CXL_POISON_TYPE_INTERNAL 0x2
>  #define CXL_POISON_TYPE_INJECTED 0x3
> +#define CXL_POISON_TYPE_SCANMEDIA 0x4

Invented?  Nothing special about these I think.
They come from the same 3 sources as other poison does.

>      QLIST_ENTRY(CXLPoison) node;
>  } CXLPoison;
>  
>  typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
>  #define CXL_POISON_LIST_LIMIT 256
>  
> +static inline bool scan_media_running(CXLDeviceState *cxl_dstate)
> +{
> +    return cxl_dstate->bg.runtime && cxl_dstate->bg.opcode == 0x4304;
> +}
> +
>  struct CXLType3Dev {
>      /* Private */
>      PCIDevice parent_obj;
> @@ -413,6 +419,8 @@ struct CXLType3Dev {
>      unsigned int poison_list_cnt;
>      bool poison_list_overflowed;
>      uint64_t poison_list_overflow_ts;
> +    /* list modified while doing scan media */
> +    bool poison_list_dirty;
>  };
>  
>  #define TYPE_CXL_TYPE3 "cxl-type3"


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media
  2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
                   ` (3 preceding siblings ...)
  2023-04-26  2:16 ` [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
@ 2023-05-15 12:01 ` Jonathan Cameron
  4 siblings, 0 replies; 15+ messages in thread
From: Jonathan Cameron @ 2023-05-15 12:01 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	linux-cxl

On Tue, 25 Apr 2023 19:14:15 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Hi,
> 
> The following is a very early rfc for supporting the rest of the Media an Poison
> mailbox commands, extending what is currently there for background command support
> and making use of the poison list infrastructure. The kernel is also pretty much
> at the same point, for which the motivation of this series is to help aid any
> incoming driver support. Similarly testing is 0 but sending this early hoping for
> feedback to at least know if I'm going in the right direction while the kernel bits
> are developed.
> 
> A few general notes:
>  o I have no particular affinity to the values provided for media scan hinting times.
>    Feel free to suggest new ones.

Look fine to me.

>  
>  o Scan Media will directly update the poison list, with its own special type,
>    based on random values (number of new poison entries and actual dpa).

I comment on this in the code.  There is no such type and we shouldn't need to
invent one - all poison is just poison.  The question is whether a device can
track it or not.  QEMU can track anything, but pretends it has limited tracking
resources. 

>    
>  o As with the current poison commands, this needs proper checking of input dpa
>    ranges to: 1) ensure it fits within the actual address space of the device
>    (ie: compare query_start/len to memdev's  mr->addr through cxl_dstate->mem_size
>    ranges). And 2) ensure that the passed range does not span both ram and pmem
>    devices, albeit unlikely for the qemu cases.

Why unlikely specifically for QEMU cases? Or do you mean no OS should do this
so we should never hit the test?

> 
>  o DRAM Event Logs are not generated for every new poisoned dpa. 

Agreed that needs fixing for the Poison injection in general. It needs to happen
after all the moving parts are in place though as it's hard enough to get those
series merged without adding complex interactions from the start.

> afaict we will
>    need to have a scan media version of the qmp event error injection interface,
>    as I don't see qmp being usable for scan media.

I don't understanding why we don't just use the existing QMP poison injection interface.
Just keep injecting poison after we have overflowed the list. That hidden record
of poison is only available if we use the scan media command.
Get Poison List just reports the first part.  Then if you clear poison it will actually
clear (that shouldn't care if it's in the list or not).

The only subtle part is how we clear the overflow event (not just because the list is
no longer overflowed, but only on explicit call of scan media on the whole address
range that doesn't find an overflow.

>  
>  o Getting the scan results inherits the current get poison list limitations,
>    such as limited number of records, so only one call is necessary.

Yeah. We should relax that at somepoint, but not yet I think!

> 
> Series applies on top of Jonathan's 'cxl-2023-04-19' branch, which fyi has the
> v1 version of the bg cmd support (required for this series), so will require
> rebasing eventually:
> 	https://gitlab.com/jic23/qemu/-/tree/cxl-2023-04-19

I'll get to rebasing that and collecting new version in a few days time (probably).
I promised myself I'd catch up with review first... :)

Jonathan

> 
> Thanks!
> 
> Davidlohr Bueso (3):
>   hw/cxl: Add get scan media capabilities mailbox command support
>   hw/cxl: Add scan media mailbox command support
>   hw/cxl: Add scan media mailbox command support
> 
>  hw/cxl/cxl-mailbox-utils.c  | 225 +++++++++++++++++++++++++++++++++++-
>  hw/mem/cxl_type3.c          |   3 +
>  include/hw/cxl/cxl_device.h |   8 ++
>  3 files changed, 233 insertions(+), 3 deletions(-)
> 
> --
> 2.40.0
> 
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] hw/cxl: Add scan media mailbox command support
  2023-04-26  2:14 ` [PATCH 3/3] " Davidlohr Bueso
  2023-04-28 17:00   ` Fan Ni
@ 2023-05-15 12:04   ` Jonathan Cameron
  1 sibling, 0 replies; 15+ messages in thread
From: Jonathan Cameron @ 2023-05-15 12:04 UTC (permalink / raw
  To: Davidlohr Bueso
  Cc: alison.schofield, ira.weiny, dan.j.williams, fan.ni, a.manzanares,
	linux-cxl

On Tue, 25 Apr 2023 19:14:18 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Allow the caller to retrieve the results of a previous scan media
> operation, with similar semantics to the get poison list command.
> Number of returned records is to the max payload output size.
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>

The special poison type is an issue here as well. As in previous patch
I don't think we need it.  Poison doesn't go away on it's own, so
Scan Media (results) should see all poison that Get Poison List does +
others that didn't fit in the tracking list.

> ---
>  hw/cxl/cxl-mailbox-utils.c | 55 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
> 
> diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> index 7f29b840a2c9..f978180d8e21 100644
> --- a/hw/cxl/cxl-mailbox-utils.c
> +++ b/hw/cxl/cxl-mailbox-utils.c
> @@ -81,6 +81,7 @@ enum {
>          #define CLEAR_POISON           0x2
>          #define GET_SCAN_MEDIA_CAPABILITIES 0x3
>          #define SCAN_MEDIA             0x4
> +        #define GET_SCAN_MEDIA_RESULTS 0x5
>      PHYSICAL_SWITCH = 0x51
>          #define IDENTIFY_SWITCH_DEVICE      0x0
>  };
> @@ -1079,6 +1080,58 @@ cmd_media_scan_media(struct cxl_cmd *cmd,
>      return CXL_MBOX_BG_STARTED;
>  }
>  
> +static CXLRetCode cmd_media_get_scan_media_results(struct cxl_cmd *cmd,
> +                                                   CXLDeviceState *cxl_dstate,
> +                                                   uint16_t *len)
> +{
> +    struct get_scan_media_results_out_pl {
> +        uint64_t dpa_restart;
> +        uint64_t length;
> +        uint8_t flags;
> +        uint8_t rsvd1;
> +        uint16_t count;
> +        uint8_t rsvd2[20];
> +        struct {
> +            uint64_t addr;
> +            uint32_t length;
> +            uint32_t resv;
> +        } QEMU_PACKED records[];
> +    } QEMU_PACKED;
> +    struct get_scan_media_results_out_pl *out = (void *)cmd->payload;
> +    CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> +    uint16_t record_count = 0, i = 0;
> +    CXLPoisonList *poison_list = &ct3d->poison_list;
> +    CXLPoison *ent;
> +    uint16_t out_pl_len;
> +
> +    QLIST_FOREACH(ent, poison_list, node) {
> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
> +            record_count++;
> +        }
> +    }
> +
> +    out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
> +    assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
> +
> +    memset(out, 0, out_pl_len);
> +    QLIST_FOREACH(ent, poison_list, node) {
> +        if (ent->type == CXL_POISON_TYPE_SCANMEDIA) {
> +            uint64_t start, stop;
> +
> +            start = ent->start & 0xffffffffffffffc0;
> +            stop = ent->start & 0xffffffffffffffc0;
> +            stq_le_p(&out->records[i].addr, start | (ent->type & 0x7));

Given type is 0x4 that mask doesn't block us having a value defined in spec as
'all other encodings are reserved'

> +            stl_le_p(&out->records[i].length, (stop - start) / 64);
> +            i++;
> +        }
> +    }
> +
> +    stw_le_p(&out->count, record_count);
> +
> +    *len = out_pl_len;
> +    return CXL_MBOX_SUCCESS;
> +}
> +
>  #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
>  #define IMMEDIATE_DATA_CHANGE (1 << 2)
>  #define IMMEDIATE_POLICY_CHANGE (1 << 3)
> @@ -1121,6 +1174,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
>          cmd_media_get_scan_media_capabilities, 16, 0 },
>      [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA",
>          cmd_media_scan_media, 17, 0 },
> +    [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS",
> +        cmd_media_get_scan_media_results, 0, 0 },
>  };
>  
>  static struct cxl_cmd cxl_cmd_set_sw[256][256] = {


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-05-15 12:05 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-04-26  2:14 [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
2023-04-26  2:14 ` [PATCH 1/3] hw/cxl: Add get scan media capabilities mailbox command support Davidlohr Bueso
2023-04-28 16:18   ` Fan Ni
2023-04-28 15:49     ` Davidlohr Bueso
2023-05-15 11:01   ` Jonathan Cameron
2023-04-26  2:14 ` [PATCH 2/3] hw/cxl: Add scan media " Davidlohr Bueso
2023-04-28 16:42   ` Fan Ni
2023-04-28 19:45     ` Davidlohr Bueso
2023-05-15 11:49   ` Jonathan Cameron
2023-04-26  2:14 ` [PATCH 3/3] " Davidlohr Bueso
2023-04-28 17:00   ` Fan Ni
2023-04-28 19:30     ` Davidlohr Bueso
2023-05-15 12:04   ` Jonathan Cameron
2023-04-26  2:16 ` [PATCH -qemu rfc 0/3] hw/cxl: Add support for Scan Media Davidlohr Bueso
2023-05-15 12:01 ` Jonathan Cameron

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.