LKML Archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/7] habanalabs/gaudi: use standard error codes
@ 2021-06-09 15:03 Oded Gabbay
  2021-06-09 15:03 ` [PATCH 2/7] habanalabs: small code refactoring Oded Gabbay
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel

When there is an ECC error in the HBM, return a standard error code,
-EIO in this case, and not a positive value.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 703f41488852..9b4bd38c2986 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7471,7 +7471,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
 			struct hl_eq_hbm_ecc_data *hbm_ecc_data)
 {
 	u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
-	int err = 0;
+	int rc = 0;
 
 	if (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
 					CPU_BOOT_DEV_STS0_HBM_ECC_EN) {
@@ -7516,7 +7516,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
 		val = RREG32_MASK(base + ch * 0x1000 + 0x06C, 0x0000FFFF);
 		val = (val & 0xFF) | ((val >> 8) & 0xFF);
 		if (val) {
-			err = 1;
+			rc = -EIO;
 			dev_err(hdev->dev,
 				"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
 				device, ch * 2, val & 0x1, (val >> 1) & 0x1,
@@ -7536,7 +7536,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
 		val = RREG32_MASK(base + ch * 0x1000 + 0x07C, 0x0000FFFF);
 		val = (val & 0xFF) | ((val >> 8) & 0xFF);
 		if (val) {
-			err = 1;
+			rc = -EIO;
 			dev_err(hdev->dev,
 				"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
 				device, ch * 2 + 1, val & 0x1, (val >> 1) & 0x1,
@@ -7565,7 +7565,7 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
 	val  = RREG32(base + 0x8F30);
 	val2 = RREG32(base + 0x8F34);
 	if (val | val2) {
-		err = 1;
+		rc = -EIO;
 		dev_err(hdev->dev,
 			"HBM %d MC SRAM SERR info: Reg 0x8F30=0x%x, Reg 0x8F34=0x%x\n",
 			device, val, val2);
@@ -7573,13 +7573,13 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
 	val  = RREG32(base + 0x8F40);
 	val2 = RREG32(base + 0x8F44);
 	if (val | val2) {
-		err = 1;
+		rc = -EIO;
 		dev_err(hdev->dev,
 			"HBM %d MC SRAM DERR info: Reg 0x8F40=0x%x, Reg 0x8F44=0x%x\n",
 			device, val, val2);
 	}
 
-	return err;
+	return rc;
 }
 
 static int gaudi_hbm_event_to_dev(u16 hbm_event_type)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/7] habanalabs: small code refactoring
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  2021-06-09 15:03 ` [PATCH 3/7] habanalabs: report EQ fault during heartbeat Oded Gabbay
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

Use size defines (e.g. SZ_1G) instead of hard-coded values, and rename
the get_fixed_properties functions to set_fixed_properties.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c   | 6 +++---
 drivers/misc/habanalabs/goya/goya.c     | 4 ++--
 drivers/misc/habanalabs/goya/goyaP.h    | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 0056282cec94..46fcab1bf873 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1395,7 +1395,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
 		hdev->asic_name,
-		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
+		hdev->asic_prop.dram_size / SZ_1G);
 
 	rc = hl_vm_init(hdev);
 	if (rc) {
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 9b4bd38c2986..f8bf30e48bba 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -410,7 +410,7 @@ static inline void set_default_power_values(struct hl_device *hdev)
 	}
 }
 
-static int gaudi_get_fixed_properties(struct hl_device *hdev)
+static int gaudi_set_fixed_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	u32 num_sync_stream_queues = 0;
@@ -655,9 +655,9 @@ static int gaudi_early_init(struct hl_device *hdev)
 	u32 fw_boot_status;
 	int rc;
 
-	rc = gaudi_get_fixed_properties(hdev);
+	rc = gaudi_set_fixed_properties(hdev);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to get fixed properties\n");
+		dev_err(hdev->dev, "Failed setting fixed properties\n");
 		return rc;
 	}
 
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index bcefc372a689..6d63930b7a10 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -355,7 +355,7 @@ static int goya_mmu_set_dram_default_page(struct hl_device *hdev);
 static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
 static void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
 
-int goya_get_fixed_properties(struct hl_device *hdev)
+int goya_set_fixed_properties(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int i;
@@ -587,7 +587,7 @@ static int goya_early_init(struct hl_device *hdev)
 	u32 fw_boot_status, val;
 	int rc;
 
-	rc = goya_get_fixed_properties(hdev);
+	rc = goya_set_fixed_properties(hdev);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to get fixed properties\n");
 		return rc;
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index ef8c6c8b5c8d..0b05da614729 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -168,7 +168,7 @@ struct goya_device {
 	u8		device_cpu_mmu_mappings_done;
 };
 
-int goya_get_fixed_properties(struct hl_device *hdev);
+int goya_set_fixed_properties(struct hl_device *hdev);
 int goya_mmu_init(struct hl_device *hdev);
 void goya_init_dma_qmans(struct hl_device *hdev);
 void goya_init_mme_qmans(struct hl_device *hdev);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/7] habanalabs: report EQ fault during heartbeat
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
  2021-06-09 15:03 ` [PATCH 2/7] habanalabs: small code refactoring Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  2021-06-09 15:03 ` [PATCH 4/7] habanalabs: enable stop on error for all QMANs and engines Oded Gabbay
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

In case of an EQ fault, we would like to know about it.
For this, a status bitmask was added in which the EQ_FAULT bit is
set by the FW in case of an EQ fault.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c  |  8 +++++++-
 .../misc/habanalabs/include/common/cpucp_if.h | 20 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 9412e6707906..d5a3c786d4c9 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -362,7 +362,7 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 
 int hl_fw_send_heartbeat(struct hl_device *hdev)
 {
-	struct cpucp_packet hb_pkt = {};
+	struct cpucp_packet hb_pkt = {0};
 	u64 result;
 	int rc;
 
@@ -374,7 +374,13 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
 						sizeof(hb_pkt), 0, &result);
 
 	if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
+		return -EIO;
+
+	if (le32_to_cpu(hb_pkt.status_mask) &
+					CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK) {
+		dev_warn(hdev->dev, "FW reported EQ fault during heartbeat\n");
 		rc = -EIO;
+	}
 
 	return rc;
 }
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index d4dc189a6c92..80b1d5a9d9f1 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -404,6 +404,20 @@ enum cpucp_packet_id {
 #define CPUCP_PKT_RES_PLL_OUT3_SHIFT	48
 #define CPUCP_PKT_RES_PLL_OUT3_MASK	0xFFFF000000000000ull
 
+#define CPUCP_PKT_VAL_PFC_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_PFC_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_PFC_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_PFC_IN2_MASK	0x000000000000001Eull
+
+#define CPUCP_PKT_VAL_LPBK_IN1_SHIFT	0
+#define CPUCP_PKT_VAL_LPBK_IN1_MASK	0x0000000000000001ull
+#define CPUCP_PKT_VAL_LPBK_IN2_SHIFT	1
+#define CPUCP_PKT_VAL_LPBK_IN2_MASK	0x000000000000001Eull
+
+/* heartbeat status bits */
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_SHIFT		0
+#define CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK		0x00000001
+
 struct cpucp_packet {
 	union {
 		__le64 value;	/* For SET packets */
@@ -445,6 +459,12 @@ struct cpucp_packet {
 
 		/* For get CpuCP info/EEPROM data/NIC info */
 		__le32 data_max_size;
+
+		/*
+		 * For any general status bitmask. Shall be used whenever the
+		 * result cannot be used to hold general purpose data.
+		 */
+		__le32 status_mask;
 	};
 
 	__le32 reserved;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/7] habanalabs: enable stop on error for all QMANs and engines
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
  2021-06-09 15:03 ` [PATCH 2/7] habanalabs: small code refactoring Oded Gabbay
  2021-06-09 15:03 ` [PATCH 3/7] habanalabs: report EQ fault during heartbeat Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  2021-06-09 15:03 ` [PATCH 5/7] habanalabs: enable dram scramble before linux f/w Oded Gabbay
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel; +Cc: Ofir Bitton

From: Ofir Bitton <obitton@habana.ai>

If there is an error in the QMAN/engine, there is no point in trying
to continue running the workload. It is better to stop to allow the
user to debug the program.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/habanalabs_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index b55dd1c55166..3a4233971f2b 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -326,6 +326,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	hdev->reset_on_lockup = reset_on_lockup;
 	hdev->memory_scrub = memory_scrub;
 	hdev->boot_error_status_mask = boot_error_status_mask;
+	hdev->stop_on_err = true;
 
 	hdev->pldm = 0;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/7] habanalabs: enable dram scramble before linux f/w
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
                   ` (2 preceding siblings ...)
  2021-06-09 15:03 ` [PATCH 4/7] habanalabs: enable stop on error for all QMANs and engines Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  2021-06-09 15:03 ` [PATCH 6/7] habanalabs: add hard reset timeout for PLDM Oded Gabbay
  2021-06-09 15:03 ` [PATCH 7/7] habanalabs: print firmware versions Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel; +Cc: Bharat Jauhari

From: Bharat Jauhari <bjauhari@habana.ai>

In the current code, for the dynamic f/w loading flow, DRAM scrambling
is enabled after the Linux fit image is loaded to the card. This can
cause the device CPU to go into reset state.

The correct sequence should be:
1. Load boot fit image
2. Enable scrambling
3. Load Linux fit image

This commit aligns the DRAM scrambling enabling with the static f/w load
flow.

Signed-off-by: Bharat Jauhari <bjauhari@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 10 ++++++++++
 drivers/misc/habanalabs/common/habanalabs.h  |  4 +++-
 drivers/misc/habanalabs/gaudi/gaudi.c        |  4 +---
 drivers/misc/habanalabs/goya/goya.c          |  8 +++++++-
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index d5a3c786d4c9..2bb2a4145640 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -2149,6 +2149,11 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	if (rc)
 		goto protocol_err;
 
+	/* Enable DRAM scrambling before Linux boot and after successful
+	 *  UBoot
+	 */
+	hdev->asic_funcs->init_cpu_scrambler_dram(hdev);
+
 	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
 		dev_info(hdev->dev, "Skip loading Linux F/W\n");
 		return 0;
@@ -2295,6 +2300,11 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 		goto out;
 	}
 
+	/* Enable DRAM scrambling before Linux boot and after successful
+	 *  UBoot
+	 */
+	hdev->asic_funcs->init_cpu_scrambler_dram(hdev);
+
 	if (!(hdev->fw_components & FW_TYPE_LINUX)) {
 		dev_info(hdev->dev, "Skip loading Linux F/W\n");
 		goto out;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index bcb5bfdd7f20..bc5a1b45270f 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1092,7 +1092,8 @@ struct fw_load_mgr {
  * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
  * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
  *                         generic f/w compatible PLL Indexes
- *@init_firmware_loader: initialize data for FW loader.
+ * @init_firmware_loader: initialize data for FW loader.
+ * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1217,6 +1218,7 @@ struct hl_asic_funcs {
 	void (*get_msi_info)(__le32 *table);
 	int (*map_pll_idx_to_fw_idx)(u32 pll_idx);
 	void (*init_firmware_loader)(struct hl_device *hdev);
+	void (*init_cpu_scrambler_dram)(struct hl_device *hdev);
 };
 
 
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index f8bf30e48bba..ca1a8ca24d4a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -3804,9 +3804,6 @@ static int gaudi_load_firmware_to_device(struct hl_device *hdev)
 {
 	void __iomem *dst;
 
-	/* HBM scrambler must be initialized before pushing F/W to HBM */
-	gaudi_init_scrambler_hbm(hdev);
-
 	dst = hdev->pcie_bar[HBM_BAR_ID] + LINUX_FW_OFFSET;
 
 	return hl_fw_load_fw_to_device(hdev, GAUDI_LINUX_FW_FILE, dst, 0, 0);
@@ -8949,6 +8946,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.enable_events_from_fw = gaudi_enable_events_from_fw,
 	.map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
 	.init_firmware_loader = gaudi_init_firmware_loader,
+	.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm
 };
 
 /**
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 6d63930b7a10..2a9b91d5c6ff 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5402,6 +5402,11 @@ static int goya_get_eeprom_data(struct hl_device *hdev, void *data,
 	return hl_fw_get_eeprom_data(hdev, data, max_size);
 }
 
+static void goya_cpu_init_scrambler_dram(struct hl_device *hdev)
+{
+
+}
+
 static int goya_ctx_init(struct hl_ctx *ctx)
 {
 	if (ctx->asid != HL_KERNEL_ASID_ID)
@@ -5601,7 +5606,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.hw_block_mmap = goya_block_mmap,
 	.enable_events_from_fw = goya_enable_events_from_fw,
 	.map_pll_idx_to_fw_idx = goya_map_pll_idx_to_fw_idx,
-	.init_firmware_loader = goya_init_firmware_loader
+	.init_firmware_loader = goya_init_firmware_loader,
+	.init_cpu_scrambler_dram = goya_cpu_init_scrambler_dram
 };
 
 /*
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/7] habanalabs: add hard reset timeout for PLDM
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
                   ` (3 preceding siblings ...)
  2021-06-09 15:03 ` [PATCH 5/7] habanalabs: enable dram scramble before linux f/w Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  2021-06-09 15:03 ` [PATCH 7/7] habanalabs: print firmware versions Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel; +Cc: Omer Shpigelman

From: Omer Shpigelman <oshpigelman@habana.ai>

Hard reset flow on PLDM might take more than 2 minutes.
Hence add a dedicated hard reset timeout of 6 minutes for PLDM.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c     | 9 +++++++--
 drivers/misc/habanalabs/common/habanalabs.h | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 46fcab1bf873..cbdf75b24cb4 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1501,6 +1501,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 void hl_device_fini(struct hl_device *hdev)
 {
 	ktime_t timeout;
+	u64 reset_sec;
 	int i, rc;
 
 	dev_info(hdev->dev, "Removing device\n");
@@ -1508,6 +1509,11 @@ void hl_device_fini(struct hl_device *hdev)
 	hdev->device_fini_pending = 1;
 	flush_delayed_work(&hdev->device_reset_work.reset_work);
 
+	if (hdev->pldm)
+		reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
+	else
+		reset_sec = HL_HARD_RESET_MAX_TIMEOUT;
+
 	/*
 	 * This function is competing with the reset function, so try to
 	 * take the reset atomic and if we are already in middle of reset,
@@ -1516,8 +1522,7 @@ void hl_device_fini(struct hl_device *hdev)
 	 * ports, the hard reset could take between 10-30 seconds
 	 */
 
-	timeout = ktime_add_us(ktime_get(),
-				HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
+	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
 	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
 	while (rc) {
 		usleep_range(50, 200);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index bc5a1b45270f..244fbf209d34 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -48,6 +48,7 @@
 #define HL_PENDING_RESET_LONG_SEC	60
 
 #define HL_HARD_RESET_MAX_TIMEOUT	120
+#define HL_PLDM_HARD_RESET_MAX_TIMEOUT	(HL_HARD_RESET_MAX_TIMEOUT * 3)
 
 #define HL_DEVICE_TIMEOUT_USEC		1000000 /* 1 s */
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 7/7] habanalabs: print firmware versions
  2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
                   ` (4 preceding siblings ...)
  2021-06-09 15:03 ` [PATCH 6/7] habanalabs: add hard reset timeout for PLDM Oded Gabbay
@ 2021-06-09 15:03 ` Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2021-06-09 15:03 UTC (permalink / raw
  To: linux-kernel

Firmware in habanalabs devices is composed of several components.
During device initialization, we read these versions from the device.
Print them during device initialization to allow better visibility in
automated systems.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/firmware_if.c | 105 +++++++++++++++++--
 1 file changed, 95 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 2bb2a4145640..14e70422af25 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -11,11 +11,41 @@
 #include <linux/firmware.h>
 #include <linux/crc32.h>
 #include <linux/slab.h>
+#include <linux/ctype.h>
 
 #define FW_FILE_MAX_SIZE		0x1400000 /* maximum size of 20MB */
 
 #define FW_CPU_STATUS_POLL_INTERVAL_USEC	10000
 
+static char *extract_fw_ver_from_str(const char *fw_str)
+{
+	char *str, *fw_ver, *whitespace;
+
+	fw_ver = kmalloc(16, GFP_KERNEL);
+	if (!fw_ver)
+		return NULL;
+
+	str = strnstr(fw_str, "fw-", VERSION_MAX_LEN);
+	if (!str)
+		goto free_fw_ver;
+
+	/* Skip the fw- part */
+	str += 3;
+
+	/* Copy until the next whitespace */
+	whitespace =  strnstr(str, " ", 15);
+	if (!whitespace)
+		goto free_fw_ver;
+
+	strscpy(fw_ver, str, whitespace - str + 1);
+
+	return fw_ver;
+
+free_fw_ver:
+	kfree(fw_ver);
+	return NULL;
+}
+
 static int hl_request_fw(struct hl_device *hdev,
 				const struct firmware **firmware_p,
 				const char *fw_name)
@@ -573,8 +603,9 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct cpucp_packet pkt = {};
-	void *cpucp_info_cpu_addr;
 	dma_addr_t cpucp_info_dma_addr;
+	void *cpucp_info_cpu_addr;
+	char *kernel_ver;
 	u64 result;
 	int rc;
 
@@ -621,6 +652,12 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
 		goto out;
 	}
 
+	kernel_ver = extract_fw_ver_from_str(prop->cpucp_info.kernel_version);
+	if (kernel_ver) {
+		dev_info(hdev->dev, "Linux version %s", kernel_ver);
+		kfree(kernel_ver);
+	}
+
 	/* assume EQ code doesn't need to check eqe index */
 	hdev->event_queue.check_eqe_index = false;
 
@@ -1066,24 +1103,26 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev,
 static int hl_fw_static_read_device_fw_version(struct hl_device *hdev,
 					enum hl_fw_component fwc)
 {
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct fw_load_mgr *fw_loader = &hdev->fw_loader;
 	struct static_fw_load_mgr *static_loader;
-	const char *name;
+	char *dest, *boot_ver, *preboot_ver;
 	u32 ver_off, limit;
-	char *dest;
+	const char *name;
+	char btl_ver[32];
 
 	static_loader = &hdev->fw_loader.static_loader;
 
 	switch (fwc) {
 	case FW_COMP_BOOT_FIT:
 		ver_off = RREG32(static_loader->boot_fit_version_offset_reg);
-		dest = hdev->asic_prop.uboot_ver;
+		dest = prop->uboot_ver;
 		name = "Boot-fit";
 		limit = static_loader->boot_fit_version_max_off;
 		break;
 	case FW_COMP_PREBOOT:
 		ver_off = RREG32(static_loader->preboot_version_offset_reg);
-		dest = hdev->asic_prop.preboot_ver;
+		dest = prop->preboot_ver;
 		name = "Preboot";
 		limit = static_loader->preboot_version_max_off;
 		break;
@@ -1105,6 +1144,30 @@ static int hl_fw_static_read_device_fw_version(struct hl_device *hdev,
 		return -EIO;
 	}
 
+	if (fwc == FW_COMP_BOOT_FIT) {
+		boot_ver = extract_fw_ver_from_str(prop->uboot_ver);
+		if (boot_ver) {
+			dev_info(hdev->dev, "boot-fit version %s\n", boot_ver);
+			kfree(boot_ver);
+		}
+	} else if (fwc == FW_COMP_PREBOOT) {
+		preboot_ver = strnstr(prop->preboot_ver, "Preboot",
+						VERSION_MAX_LEN);
+		if (preboot_ver && preboot_ver != prop->preboot_ver) {
+			strscpy(btl_ver, prop->preboot_ver,
+				min((int) (preboot_ver - prop->preboot_ver),
+									31));
+			dev_info(hdev->dev, "%s\n", btl_ver);
+		}
+
+		preboot_ver = extract_fw_ver_from_str(prop->preboot_ver);
+		if (preboot_ver) {
+			dev_info(hdev->dev, "preboot version %s\n",
+								preboot_ver);
+			kfree(preboot_ver);
+		}
+	}
+
 	return 0;
 }
 
@@ -1691,21 +1754,43 @@ static void hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 					enum hl_fw_component fwc,
 					const char *fw_version)
 {
-	char *dest;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	char *preboot_ver, *boot_ver;
+	char btl_ver[32];
 
 	switch (fwc) {
 	case FW_COMP_BOOT_FIT:
-		dest = hdev->asic_prop.uboot_ver;
+		strscpy(prop->uboot_ver, fw_version, VERSION_MAX_LEN);
+		boot_ver = extract_fw_ver_from_str(prop->uboot_ver);
+		if (boot_ver) {
+			dev_info(hdev->dev, "boot-fit version %s\n", boot_ver);
+			kfree(boot_ver);
+		}
+
 		break;
 	case FW_COMP_PREBOOT:
-		dest = hdev->asic_prop.preboot_ver;
+		strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN);
+		preboot_ver = strnstr(prop->preboot_ver, "Preboot",
+						VERSION_MAX_LEN);
+		if (preboot_ver && preboot_ver != prop->preboot_ver) {
+			strscpy(btl_ver, prop->preboot_ver,
+				min((int) (preboot_ver - prop->preboot_ver),
+									31));
+			dev_info(hdev->dev, "%s\n", btl_ver);
+		}
+
+		preboot_ver = extract_fw_ver_from_str(prop->preboot_ver);
+		if (preboot_ver) {
+			dev_info(hdev->dev, "preboot version %s\n",
+								preboot_ver);
+			kfree(preboot_ver);
+		}
+
 		break;
 	default:
 		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
 		return;
 	}
-
-	strscpy(dest, fw_version, VERSION_MAX_LEN);
 }
 
 /**
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-06-09 15:04 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-06-09 15:03 [PATCH 1/7] habanalabs/gaudi: use standard error codes Oded Gabbay
2021-06-09 15:03 ` [PATCH 2/7] habanalabs: small code refactoring Oded Gabbay
2021-06-09 15:03 ` [PATCH 3/7] habanalabs: report EQ fault during heartbeat Oded Gabbay
2021-06-09 15:03 ` [PATCH 4/7] habanalabs: enable stop on error for all QMANs and engines Oded Gabbay
2021-06-09 15:03 ` [PATCH 5/7] habanalabs: enable dram scramble before linux f/w Oded Gabbay
2021-06-09 15:03 ` [PATCH 6/7] habanalabs: add hard reset timeout for PLDM Oded Gabbay
2021-06-09 15:03 ` [PATCH 7/7] habanalabs: print firmware versions Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).