AMD-GFX Archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV
@ 2024-01-24 23:21 Zhigang Luo
  2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
  To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor.Skvortsov, Hawking.Zhang

From: YiPeng Chai <YiPeng.Chai@amd.com>

Support passing poison consumption ras blocks
to SRIOV.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  5 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       |  5 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         | 23 +++++++++++++++----
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  |  7 ++++--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  7 ++++--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  7 ++++--
 13 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 77e263660288..dfb93664e866 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
 	amdgpu_device_flush_hdp(adev, NULL);
 }
 
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+	enum amdgpu_ras_block block, bool reset)
 {
-	amdgpu_umc_poison_handler(adev, reset);
+	amdgpu_umc_poison_handler(adev, block, reset);
 }
 
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 584a0cea5572..50d3e0149032 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 				struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-				bool reset);
+			enum amdgpu_ras_block block, bool reset);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ebcd1cb60052..79bf6bd428a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
 		}
 	}
 
-	amdgpu_umc_poison_handler(adev, false);
+	amdgpu_umc_poison_handler(adev, obj->head.block, false);
 
 	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
 		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index a6cdb69897f2..20436f81856a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
 	return 0;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, bool reset)
 {
 	int ret = AMDGPU_RAS_SUCCESS;
 
@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 		}
 	} else {
 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-			adev->virt.ops->ras_poison_handler(adev);
+			adev->virt.ops->ras_poison_handler(adev, block);
 		else
 			dev_warn(adev->dev,
 				"No ras_poison_handler interface in SRIOV!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 83199296ed10..26d2ae498daf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -102,7 +102,8 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 		struct amdgpu_irq_src *source,
 		struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index f4963330c772..f300d4a4457d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
 		amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 	} else {
 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-			adev->virt.ops->ras_poison_handler(adev);
+			adev->virt.ops->ras_poison_handler(adev, ras_if->block);
 		else
 			dev_warn(adev->dev,
 				"No ras_poison_handler interface in SRIOV for VCN!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 1b49c007ff62..fa7be5f277b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
 	int (*wait_reset)(struct amdgpu_device *adev);
 	void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
 			  u32 data1, u32 data2, u32 data3);
-	void (*ras_poison_handler)(struct amdgpu_device *adev);
+	void (*ras_poison_handler)(struct amdgpu_device *adev,
+					enum amdgpu_ras_block block);
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 26d6286d86c9..9e7ce1e6bc06 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
 		amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 	} else {
 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-			adev->virt.ops->ras_poison_handler(adev);
+			adev->virt.ops->ras_poison_handler(adev, ras_if->block);
 		else
 			dev_warn(adev->dev,
 				"No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..a2bd2c3b1ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
 	return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
 }
 
-static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
+					enum amdgpu_ras_block block)
 {
 	xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..d0a018da3c7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
 	xgpu_nv_mailbox_set_valid(adev, false);
 }
 
-static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
-					enum idh_request req)
+static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
+			enum idh_request req, u32 data1, u32 data2, u32 data3)
 {
 	int r, retry = 1;
 	enum idh_event event = -1;
 
 send_request:
-	xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0);
+	xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);
 
 	switch (req) {
 	case IDH_REQ_GPU_INIT_ACCESS:
@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
+					enum idh_request req)
+{
+	return xgpu_nv_send_access_requests_with_param(adev,
+						req, 0, 0, 0);
+}
+
 static int xgpu_nv_request_reset(struct amdgpu_device *adev)
 {
 	int ret, i = 0;
@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
 	amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 }
 
-static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block)
 {
-	xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
+		xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+	} else {
+		xgpu_nv_send_access_requests_with_param(adev,
+					IDH_RAS_POISON,	block, 0, 0);
+	}
 }
 
 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a7697ec8188e..9a06c6fb6605 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_node *dev,
 				uint16_t pasid, uint16_t client_id)
 {
+	enum amdgpu_ras_block block = 0;
 	int old_poison, ret = -EINVAL;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SE3SH:
 	case SOC15_IH_CLIENTID_UTCL2:
 		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+		block = AMDGPU_RAS_BLOCK__GFX;
 		break;
 	case SOC15_IH_CLIENTID_SDMA0:
 	case SOC15_IH_CLIENTID_SDMA1:
 	case SOC15_IH_CLIENTID_SDMA2:
 	case SOC15_IH_CLIENTID_SDMA3:
 	case SOC15_IH_CLIENTID_SDMA4:
+		block = AMDGPU_RAS_BLOCK__SDMA;
 		break;
 	default:
 		break;
@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
 		dev_warn(dev->adev->dev,
 			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
 			client_id);
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
 	} else {
 		dev_warn(dev->adev->dev,
 			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
 			client_id);
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 2a65792fd116..c6d28e37ed46 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
 static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
 				uint16_t pasid, uint16_t source_id)
 {
+	enum amdgpu_ras_block block = 0;
 	int ret = -EINVAL;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
 	case SOC15_INTSRC_SQ_INTERRUPT_MSG:
 		if (dev->dqm->ops.reset_queues)
 			ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+		block = AMDGPU_RAS_BLOCK__GFX;
 		break;
 	case SOC21_INTSRC_SDMA_ECC:
+		block = AMDGPU_RAS_BLOCK__SDMA;
 	default:
 		break;
 	}
@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
 	/* resetting queue passes, do page retirement without gpu reset
 	   resetting queue fails, fallback to gpu reset solution */
 	if (!ret)
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
 	else
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
 }
 
 static bool event_interrupt_isr_v11(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 27cdaea40501..91dd5e045b51 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 				uint16_t pasid, uint16_t client_id)
 {
+	enum amdgpu_ras_block block = 0;
 	int old_poison, ret = -EINVAL;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 	case SOC15_IH_CLIENTID_SE3SH:
 	case SOC15_IH_CLIENTID_UTCL2:
 		ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+		block = AMDGPU_RAS_BLOCK__GFX;
 		break;
 	case SOC15_IH_CLIENTID_SDMA0:
 	case SOC15_IH_CLIENTID_SDMA1:
 	case SOC15_IH_CLIENTID_SDMA2:
 	case SOC15_IH_CLIENTID_SDMA3:
 	case SOC15_IH_CLIENTID_SDMA4:
+		block = AMDGPU_RAS_BLOCK__SDMA;
 		break;
 	default:
 		break;
@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 		dev_warn(dev->adev->dev,
 			"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
 			client_id);
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
 	} else {
 		dev_warn(dev->adev->dev,
 			"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
 			client_id);
-		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+		amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
 	}
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message
  2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
@ 2024-01-24 23:21 ` Zhigang Luo
  2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
  2024-01-25  3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking
  2 siblings, 0 replies; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
  To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor Skvortsov, Hawking.Zhang

From: Victor Skvortsov <victor.skvortsov@amd.com>

In a non-FLR page avoidance scenario, the host driver will
provide the bad pages in the pf2vf exchange region.

Add a new host response message to indicate when the
pf2vf exchange region has been updated.

Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Change-Id: I58d5d11d959d91ad5723d33fddb93570c259e245
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 5 +++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index d0a018da3c7a..c49bf87d4b0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -170,6 +170,9 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
 	case IDH_REQ_GPU_INIT_DATA:
 		event = IDH_REQ_GPU_INIT_DATA_READY;
 		break;
+	case IDH_RAS_POISON:
+		if (data1 != 0)
+			event = IDH_RAS_POISON_READY;
 	default:
 		break;
 	}
@@ -437,8 +440,10 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
 	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
 		xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
 	} else {
+		amdgpu_virt_fini_data_exchange(adev);
 		xgpu_nv_send_access_requests_with_param(adev,
 					IDH_RAS_POISON,	block, 0, 0);
+		amdgpu_virt_init_data_exchange(adev);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index d0221ce08769..1e8fd90cab43 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -51,6 +51,7 @@ enum idh_event {
 	IDH_FAIL,
 	IDH_QUERY_ALIVE,
 	IDH_REQ_GPU_INIT_DATA_READY,
+	IDH_RAS_POISON_READY,
 
 	IDH_TEXT_MESSAGE = 255,
 };
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement
  2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
  2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
@ 2024-01-24 23:21 ` Zhigang Luo
  2024-01-26 15:21   ` Christian König
  2024-01-25  3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking
  2 siblings, 1 reply; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
  To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor Skvortsov, Hawking.Zhang

From: Victor Skvortsov <victor.skvortsov@amd.com>

At runtime, use the vram manager for virtualization page retirement.

Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 ++++++++++++++++--------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f5c66e0038b5..6ff7d3fb2008 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
 	if (!*data)
 		goto data_failure;
 
-	bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);
+	bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
 	if (!bps)
 		goto bps_failure;
 
-	bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), GFP_KERNEL);
+	bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
 	if (!bps_bo)
 		goto bps_bo_failure;
 
@@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
 		bo = data->bps_bo[i];
-		amdgpu_bo_free_kernel(&bo, NULL, NULL);
-		data->bps_bo[i] = bo;
+		if (bo) {
+			amdgpu_bo_free_kernel(&bo, NULL, NULL);
+			data->bps_bo[i] = bo;
+		}
 		data->last_reserved = i;
 	}
 }
@@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
 {
 	struct amdgpu_virt *virt = &adev->virt;
 	struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+	struct ttm_resource_manager *man = &mgr->manager;
 	struct amdgpu_bo *bo = NULL;
 	uint64_t bp;
 	int i;
@@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
 		 * 2) a ras bad page has been reserved (duplicate error injection
 		 *    for one page);
 		 */
-		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-					       AMDGPU_GPU_PAGE_SIZE,
-					       &bo, NULL))
-			DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
-
-		data->bps_bo[i] = bo;
+		if  (ttm_resource_manager_used(man)) {
+			amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+				bp << AMDGPU_GPU_PAGE_SHIFT,
+				AMDGPU_GPU_PAGE_SIZE);
+			data->bps_bo[i] = NULL;
+		} else {
+			if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
+							AMDGPU_GPU_PAGE_SIZE,
+							&bo, NULL))
+				DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
+			data->bps_bo[i] = bo;
+		}
 		data->last_reserved = i + 1;
 		bo = NULL;
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* RE: [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV
  2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
  2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
  2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
@ 2024-01-25  3:25 ` Zhang, Hawking
  2 siblings, 0 replies; 5+ messages in thread
From: Zhang, Hawking @ 2024-01-25  3:25 UTC (permalink / raw
  To: Luo, Zhigang, amd-gfx@lists.freedesktop.org, Chai, Thomas
  Cc: Saye, Sashank, Skvortsov, Victor

[-- Attachment #1: Type: text/plain, Size: 17822 bytes --]

[AMD Official Use Only - General]


@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
        case SOC15_INTSRC_SQ_INTERRUPT_MSG:
                if (dev->dqm->ops.reset_queues)
                        ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC21_INTSRC_SDMA_ECC:
+               block = AMDGPU_RAS_BLOCK__SDMA;

Hi @Chai, Thomas<mailto:YiPeng.Chai@amd.com>/@Luo, Zhigang<mailto:Zhigang.Luo@amd.com>,

event_interrupt_poison_consumption_v11 was duplicated from the v9 generation. However, the hardware/firmware takes a completely different approach to handling poison consumption in gfx11.

At this stage, let's just initialize block to AMDGPU_RAS_BLOCK__GFX for all the IRQ sources (i.e., gfx 11 poison consumption notification is centralized in the RLC). I believe we still need a few more series to correct the v11 implementation.

With the above addressed, the series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking

-----Original Message-----
From: Luo, Zhigang <Zhigang.Luo@amd.com<mailto:Zhigang.Luo@amd.com>>
Sent: Thursday, January 25, 2024 07:22
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Skvortsov, Victor <Victor.Skvortsov@amd.com<mailto:Victor.Skvortsov@amd.com>>; Saye, Sashank <Sashank.Saye@amd.com<mailto:Sashank.Saye@amd.com>>; Chai, Thomas <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>
Subject: [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV

From: YiPeng Chai <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>

Support passing poison consumption ras blocks to SRIOV.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  5 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       |  5 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c       |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         | 23 +++++++++++++++----
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  |  7 ++++--  .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  7 ++++--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  7 ++++--
 13 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 77e263660288..dfb93664e866 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
        amdgpu_device_flush_hdp(adev, NULL);
 }

-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+       enum amdgpu_ras_block block, bool reset)
 {
-       amdgpu_umc_poison_handler(adev, reset);
+       amdgpu_umc_poison_handler(adev, block, reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 584a0cea5572..50d3e0149032 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);  int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
                                struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
-                               bool reset);
+                       enum amdgpu_ras_block block, bool reset);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p);  int amdgpu_amdkfd_criu_resume(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ebcd1cb60052..79bf6bd428a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
                }
        }

-       amdgpu_umc_poison_handler(adev, false);
+       amdgpu_umc_poison_handler(adev, obj->head.block, false);

        if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index a6cdb69897f2..20436f81856a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
        return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, bool reset)
 {
        int ret = AMDGPU_RAS_SUCCESS;

@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 83199296ed10..26d2ae498daf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -102,7 +102,8 @@ struct amdgpu_umc {

 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+                       enum amdgpu_ras_block block, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index f4963330c772..f300d4a4457d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
                amdgpu_ras_interrupt_dispatch(adev, &ih_data);
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, ras_if->block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV for VCN!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 1b49c007ff62..fa7be5f277b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
        int (*wait_reset)(struct amdgpu_device *adev);
        void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
                          u32 data1, u32 data2, u32 data3);
-       void (*ras_poison_handler)(struct amdgpu_device *adev);
+       void (*ras_poison_handler)(struct amdgpu_device *adev,
+                                       enum amdgpu_ras_block block);
 };

 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 26d6286d86c9..9e7ce1e6bc06 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
                amdgpu_ras_interrupt_dispatch(adev, &ih_data);
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-                       adev->virt.ops->ras_poison_handler(adev);
+                       adev->virt.ops->ras_poison_handler(adev, ras_if->block);
                else
                        dev_warn(adev->dev,
                                "No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..a2bd2c3b1ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
        return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);  }

-static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
+                                       enum amdgpu_ras_block block)
 {
        xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);  } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..d0a018da3c7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
        xgpu_nv_mailbox_set_valid(adev, false);  }

-static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
-                                       enum idh_request req)
+static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
+                       enum idh_request req, u32 data1, u32 data2, u32 data3)
 {
        int r, retry = 1;
        enum idh_event event = -1;

 send_request:
-       xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0);
+       xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);

        switch (req) {
        case IDH_REQ_GPU_INIT_ACCESS:
@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
        return 0;
 }

+static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
+                                       enum idh_request req)
+{
+       return xgpu_nv_send_access_requests_with_param(adev,
+                                               req, 0, 0, 0);
+}
+
 static int xgpu_nv_request_reset(struct amdgpu_device *adev)  {
        int ret, i = 0;
@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
        amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
 }

-static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
+               enum amdgpu_ras_block block)
 {
-       xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+       if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
+               xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+       } else {
+               xgpu_nv_send_access_requests_with_param(adev,
+                                       IDH_RAS_POISON, block, 0, 0);
+       }
 }

 const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a7697ec8188e..9a06c6fb6605 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption(struct kfd_node *dev,
                                 uint16_t pasid, uint16_t client_id)
 {
+       enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SE3SH:
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
        case SOC15_IH_CLIENTID_SDMA2:
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
+               block = AMDGPU_RAS_BLOCK__SDMA;
                break;
        default:
                break;
@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        } else {
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
        }
 }

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 2a65792fd116..c6d28e37ed46 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
 static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
                                 uint16_t pasid, uint16_t source_id)
 {
+       enum amdgpu_ras_block block = 0;
        int ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
        case SOC15_INTSRC_SQ_INTERRUPT_MSG:
                if (dev->dqm->ops.reset_queues)
                        ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC21_INTSRC_SDMA_ECC:
+               block = AMDGPU_RAS_BLOCK__SDMA;
        default:
                break;
        }
@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
        /* resetting queue passes, do page retirement without gpu reset
           resetting queue fails, fallback to gpu reset solution */
        if (!ret)
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        else
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
 }

 static bool event_interrupt_isr_v11(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 27cdaea40501..91dd5e045b51 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                 uint16_t pasid, uint16_t client_id)
 {
+       enum amdgpu_ras_block block = 0;
        int old_poison, ret = -EINVAL;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
        case SOC15_IH_CLIENTID_SE3SH:
        case SOC15_IH_CLIENTID_UTCL2:
                ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+               block = AMDGPU_RAS_BLOCK__GFX;
                break;
        case SOC15_IH_CLIENTID_SDMA0:
        case SOC15_IH_CLIENTID_SDMA1:
        case SOC15_IH_CLIENTID_SDMA2:
        case SOC15_IH_CLIENTID_SDMA3:
        case SOC15_IH_CLIENTID_SDMA4:
+               block = AMDGPU_RAS_BLOCK__SDMA;
                break;
        default:
                break;
@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
        } else {
                dev_warn(dev->adev->dev,
                        "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
                        client_id);
-               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+               amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
        }
 }

--
2.25.1


[-- Attachment #2: Type: text/html, Size: 60871 bytes --]

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement
  2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
@ 2024-01-26 15:21   ` Christian König
  0 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2024-01-26 15:21 UTC (permalink / raw
  To: Zhigang Luo, amd-gfx
  Cc: Victor Skvortsov, Sashank.Saye, YiPeng.Chai, Hawking.Zhang

Am 25.01.24 um 00:21 schrieb Zhigang Luo:
> From: Victor Skvortsov <victor.skvortsov@amd.com>
>
> In runtime, use vram manager for virtualization page retirement.
>
> Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
> Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 ++++++++++++++++--------
>   1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f5c66e0038b5..6ff7d3fb2008 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
>   	if (!*data)
>   		goto data_failure;
>   
> -	bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);
> +	bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
>   	if (!bps)
>   		goto bps_failure;
>   
> -	bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), GFP_KERNEL);
> +	bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);

That looks like a rather important bug fix which should be in a 
separate patch.

>   	if (!bps_bo)
>   		goto bps_bo_failure;
>   
> @@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
>   
>   	for (i = data->last_reserved - 1; i >= 0; i--) {
>   		bo = data->bps_bo[i];
> -		amdgpu_bo_free_kernel(&bo, NULL, NULL);
> -		data->bps_bo[i] = bo;
> +		if (bo) {
> +			amdgpu_bo_free_kernel(&bo, NULL, NULL);
> +			data->bps_bo[i] = bo;
> +		}
>   		data->last_reserved = i;
>   	}
>   }
> @@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_virt *virt = &adev->virt;
>   	struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
> +	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
> +	struct ttm_resource_manager *man = &mgr->manager;
>   	struct amdgpu_bo *bo = NULL;
>   	uint64_t bp;
>   	int i;
> @@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
>   		 * 2) a ras bad page has been reserved (duplicate error injection
>   		 *    for one page);
>   		 */
> -		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
> -					       AMDGPU_GPU_PAGE_SIZE,
> -					       &bo, NULL))
> -			DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
> -
> -		data->bps_bo[i] = bo;
> +		if  (ttm_resource_manager_used(man)) {
> +			amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> +				bp << AMDGPU_GPU_PAGE_SHIFT,
> +				AMDGPU_GPU_PAGE_SIZE);
> +			data->bps_bo[i] = NULL;
> +		} else {
> +			if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
> +							AMDGPU_GPU_PAGE_SIZE,
> +							&bo, NULL))
> +				DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
> +			data->bps_bo[i] = bo;
> +		}

That code makes no sense. If the VRAM mgr is not enabled then 
amdgpu_bo_create_kernel_at() won't work either.

I suggest completely removing the amdgpu_bo_create_kernel_at() code path.

Regards,
Christian.

>   		data->last_reserved = i + 1;
>   		bo = NULL;
>   	}


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-01-26 15:22 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
2024-01-26 15:21   ` Christian König
2024-01-25  3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).