Linux-ACPI Archive mirror
 help / color / mirror / Atom feed
From: Shiju Jose <shiju.jose@huawei.com>
To: Daniel Ferguson <danielf@os.amperecomputing.com>,
	"Rafael J. Wysocki" <rafael@kernel.org>,
	Len Brown <lenb@kernel.org>, James Morse <james.morse@arm.com>,
	Tony Luck <tony.luck@intel.com>, Borislav Petkov <bp@alien8.de>
Cc: "linux-acpi@vger.kernel.org" <linux-acpi@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	luoshengwei <luoshengwei@huawei.com>,
	Jason Tian <jason@os.amperecomputing.com>
Subject: RE: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
Date: Fri, 26 Apr 2024 11:45:56 +0000	[thread overview]
Message-ID: <73d0834a539a4a69bca670141dd06bc8@huawei.com> (raw)
In-Reply-To: <20240321-b4-arm-ras-error-vendor-info-v5-rc3-v5-2-850f9bfb97a8@os.amperecomputing.com>

Tested-by: Shiju Jose <shiju.jose@huawei.com>

The CPU core isolation feature in rasdaemon depends on this kernel patch.

Thanks,
Shiju
>-----Original Message-----
>From: Daniel Ferguson <danielf@os.amperecomputing.com>
>Sent: 21 March 2024 22:56
>To: Rafael J. Wysocki <rafael@kernel.org>; Len Brown <lenb@kernel.org>;
>James Morse <james.morse@arm.com>; Tony Luck <tony.luck@intel.com>;
>Borislav Petkov <bp@alien8.de>
>Cc: linux-acpi@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
>edac@vger.kernel.org; Daniel Ferguson <danielf@os.amperecomputing.com>;
>luoshengwei <luoshengwei@huawei.com>; Jason Tian
><jason@os.amperecomputing.com>
>Subject: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
>
>From: Shengwei Luo <luoshengwei@huawei.com>
>
>The original arm_event trace code only traces out ARM processor error
>information data. That is not enough for the user to take appropriate action.
>
>According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor
>error section includes several ARM processor error information structures,
>several ARM processor context information structures and several vendor
>specific error information structures. In addition to these, the event
>carries the error severity and the CPU logical index. Report all of this
>information to userspace via the perf interface, so that the user can do CPU
>core isolation according to the error severity and other info.
>
>Signed-off-by: Shengwei Luo <luoshengwei@huawei.com>
>Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
>Signed-off-by: Daniel Ferguson <danielf@os.amperecomputing.com>
>---
> drivers/acpi/apei/ghes.c |  3 +--
> drivers/ras/ras.c        | 46
>++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/ras.h      | 15 ++++++++++++---
> include/ras/ras_event.h  | 48
>+++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 100 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>58014558b8e0..a93c80fe1bab 100644
>--- a/drivers/acpi/apei/ghes.c
>+++ b/drivers/acpi/apei/ghes.c
>@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct
>acpi_hest_generic_data *gdata,
> 	int sec_sev, i;
> 	char *p;
>
>-	log_arm_hw_error(err);
>-
> 	sec_sev = ghes_severity(gdata->error_severity);
>+	log_arm_hw_error(err, sec_sev);
> 	if (sev != GHES_SEV_RECOVERABLE || sec_sev !=
>GHES_SEV_RECOVERABLE)
> 		return false;
>
>diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index
>249dce21a738..3e2beed2db07 100644
>--- a/drivers/ras/ras.c
>+++ b/drivers/ras/ras.c
>@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type,
>const guid_t *fru_id,  }
>
> #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err)
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
> {
>-	trace_arm_event(err);
>+	u32 pei_len;
>+	u32 ctx_len = 0;
>+	s32 vsei_len;
>+	u8 *pei_err;
>+	u8 *ctx_err;
>+	u8 *ven_err_data;
>+	struct cper_arm_err_info *err_info;
>+	struct cper_arm_ctx_info *ctx_info;
>+	int n, sz;
>+	int cpu;
>+
>+	pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
>+	pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
>+
>+	err_info = (struct cper_arm_err_info *)(err + 1);
>+	ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
>+	ctx_err = (u8 *)ctx_info;
>+	for (n = 0; n < err->context_info_num; n++) {
>+		sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
>+		ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
>+		ctx_len += sz;
>+	}
>+
>+	vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
>+						pei_len + ctx_len);
>+	if (vsei_len < 0) {
>+		pr_warn(FW_BUG
>+			"section length: %d\n", err->section_length);
>+		pr_warn(FW_BUG
>+			"section length is too small\n");
>+		pr_warn(FW_BUG
>+			"firmware-generated error record is incorrect\n");
>+		vsei_len = 0;
>+	}
>+	ven_err_data = (u8 *)ctx_info;
>+
>+	cpu = GET_LOGICAL_INDEX(err->mpidr);
>+	/* when return value is invalid, set cpu index to -1 */
>+	if (cpu < 0)
>+		cpu = -1;
>+
>+	trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
>+			ven_err_data, (u32)vsei_len, sev, cpu);
> }
> #endif
>
>diff --git a/include/linux/ras.h b/include/linux/ras.h index
>811feb9d8160..2070e4ae0626 100644
>--- a/include/linux/ras.h
>+++ b/include/linux/ras.h
>@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
> 			    const guid_t *fru_id, const char *fru_text,
> 			    const u8 sev, const u8 *err, const u32 len);  #if
>defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err);
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
> #endif
> #else
> static inline void
>@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type,  { return;
>}  #if defined(CONFIG_ARM) || defined(CONFIG_ARM64)  static inline void -
>log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
>+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return;
>+}
> #endif
> #endif
>
>@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err
>*err) { }  static inline unsigned long
>amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL;
>}  #endif /* CONFIG_AMD_ATL */
>-
>+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #include
>+<asm/smp_plat.h>
>+/*
>+ * Include ARM specific SMP header which provides a function mapping
>+mpidr to
>+ * cpu logical index.
>+ */
>+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr &
>+MPIDR_HWID_BITMASK) #else #define GET_LOGICAL_INDEX(mpidr) -EINVAL
>+#endif /* CONFIG_ARM || CONFIG_ARM64 */
> #endif /* __RAS_H__ */
>diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index
>c011ea236e9b..a7d7b6e717b6 100644
>--- a/include/ras/ras_event.h
>+++ b/include/ras/ras_event.h
>@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
>  * This event is generated when hardware detects an ARM processor error
>  * has occurred. UEFI 2.6 spec section N.2.4.4.
>  */
>+#define APEIL "ARM Processor Err Info data len"
>+#define APEID "ARM Processor Err Info raw data"
>+#define APECIL "ARM Processor Err Context Info data len"
>+#define APECID "ARM Processor Err Context Info raw data"
>+#define VSEIL "Vendor Specific Err Info data len"
>+#define VSEID "Vendor Specific Err Info raw data"
> TRACE_EVENT(arm_event,
>
>-	TP_PROTO(const struct cper_sec_proc_arm *proc),
>+	TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
>+			const u32 pei_len,
>+			const u8 *ctx_err,
>+			const u32 ctx_len,
>+			const u8 *oem,
>+			const u32 oem_len,
>+			u8 sev,
>+			int cpu),
>
>-	TP_ARGS(proc),
>+	TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev,
>+cpu),
>
> 	TP_STRUCT__entry(
> 		__field(u64, mpidr)
>@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
> 		__field(u32, running_state)
> 		__field(u32, psci_state)
> 		__field(u8, affinity)
>+		__field(u32, pei_len)
>+		__dynamic_array(u8, buf, pei_len)
>+		__field(u32, ctx_len)
>+		__dynamic_array(u8, buf1, ctx_len)
>+		__field(u32, oem_len)
>+		__dynamic_array(u8, buf2, oem_len)
>+		__field(u8, sev)
>+		__field(int, cpu)
> 	),
>
> 	TP_fast_assign(
>@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
> 			__entry->running_state = ~0;
> 			__entry->psci_state = ~0;
> 		}
>+		__entry->pei_len = pei_len;
>+		memcpy(__get_dynamic_array(buf), pei_err, pei_len);
>+		__entry->ctx_len = ctx_len;
>+		memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
>+		__entry->oem_len = oem_len;
>+		memcpy(__get_dynamic_array(buf2), oem, oem_len);
>+		__entry->sev = sev;
>+		__entry->cpu = cpu;
> 	),
>
>-	TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>-		  "running state: %d; PSCI state: %d",
>+	TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR:
>%016llx; "
>+		  "running state: %d; PSCI state: %d; "
>+		  "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
>+		  __entry->cpu,
>+		  __entry->sev,
> 		  __entry->affinity, __entry->mpidr, __entry->midr,
>-		  __entry->running_state, __entry->psci_state)
>+		  __entry->running_state, __entry->psci_state,
>+		  APEIL, __entry->pei_len, APEID,
>+		  __print_hex(__get_dynamic_array(buf), __entry->pei_len),
>+		  APECIL, __entry->ctx_len, APECID,
>+		  __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
>+		  VSEIL, __entry->oem_len, VSEID,
>+		  __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
> );
>
> /*
>
>--
>2.43.0
>


  reply	other threads:[~2024-04-26 11:46 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-21 22:55 [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson
2024-03-21 22:55 ` [PATCH v5 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines Daniel Ferguson
2024-03-21 22:55 ` [PATCH v5 2/2] RAS: Report ARM processor information to userspace Daniel Ferguson
2024-04-26 11:45   ` Shiju Jose [this message]
2024-04-11 20:43 ` [PATCH v5 0/2] Adds additional information to ARM RAS errors Daniel Ferguson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=73d0834a539a4a69bca670141dd06bc8@huawei.com \
    --to=shiju.jose@huawei.com \
    --cc=bp@alien8.de \
    --cc=danielf@os.amperecomputing.com \
    --cc=james.morse@arm.com \
    --cc=jason@os.amperecomputing.com \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luoshengwei@huawei.com \
    --cc=rafael@kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).