* [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV
@ 2024-01-24 23:21 Zhigang Luo
2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor.Skvortsov, Hawking.Zhang
From: YiPeng Chai <YiPeng.Chai@amd.com>
Support passing poison consumption ras blocks
to SRIOV.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 3 ++-
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 23 +++++++++++++++----
.../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 7 ++++--
.../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 7 ++++--
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 ++++--
13 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 77e263660288..dfb93664e866 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
amdgpu_device_flush_hdp(adev, NULL);
}
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset)
{
- amdgpu_umc_poison_handler(adev, reset);
+ amdgpu_umc_poison_handler(adev, block, reset);
}
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 584a0cea5572..50d3e0149032 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
- bool reset);
+ enum amdgpu_ras_block block, bool reset);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ebcd1cb60052..79bf6bd428a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
- amdgpu_umc_poison_handler(adev, false);
+ amdgpu_umc_poison_handler(adev, obj->head.block, false);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index a6cdb69897f2..20436f81856a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
return 0;
}
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset)
{
int ret = AMDGPU_RAS_SUCCESS;
@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 83199296ed10..26d2ae498daf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -102,7 +102,8 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index f4963330c772..f300d4a4457d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for VCN!\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 1b49c007ff62..fa7be5f277b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
int (*wait_reset)(struct amdgpu_device *adev);
void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
u32 data1, u32 data2, u32 data3);
- void (*ras_poison_handler)(struct amdgpu_device *adev);
+ void (*ras_poison_handler)(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 26d6286d86c9..9e7ce1e6bc06 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..a2bd2c3b1ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
}
-static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
{
xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..d0a018da3c7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
xgpu_nv_mailbox_set_valid(adev, false);
}
-static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
- enum idh_request req)
+static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
+ enum idh_request req, u32 data1, u32 data2, u32 data3)
{
int r, retry = 1;
enum idh_event event = -1;
send_request:
- xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0);
+ xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);
switch (req) {
case IDH_REQ_GPU_INIT_ACCESS:
@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
return 0;
}
+static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
+ enum idh_request req)
+{
+ return xgpu_nv_send_access_requests_with_param(adev,
+ req, 0, 0, 0);
+}
+
static int xgpu_nv_request_reset(struct amdgpu_device *adev)
{
int ret, i = 0;
@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
}
-static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
{
- xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+ if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
+ xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+ } else {
+ xgpu_nv_send_access_requests_with_param(adev,
+ IDH_RAS_POISON, block, 0, 0);
+ }
}
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a7697ec8188e..9a06c6fb6605 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
static void event_interrupt_poison_consumption(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id)
{
+ enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
+ block = AMDGPU_RAS_BLOCK__SDMA;
break;
default:
break;
@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
} else {
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 2a65792fd116..c6d28e37ed46 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
uint16_t pasid, uint16_t source_id)
{
+ enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC21_INTSRC_SDMA_ECC:
+ block = AMDGPU_RAS_BLOCK__SDMA;
default:
break;
}
@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
if (!ret)
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
else
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
static bool event_interrupt_isr_v11(struct kfd_node *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 27cdaea40501..91dd5e045b51 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id)
{
+ enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
+ block = AMDGPU_RAS_BLOCK__SDMA;
break;
default:
break;
@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
} else {
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
}
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message
2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
@ 2024-01-24 23:21 ` Zhigang Luo
2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
2024-01-25 3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking
2 siblings, 0 replies; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor Skvortsov, Hawking.Zhang
From: Victor Skvortsov <victor.skvortsov@amd.com>
In a non-FLR page avoidance scenario, the host driver will
provide the bad pages in the pf2vf exchange region.
Adding a new host response message to indicate when the
pf2vf exchange region has been updated.
Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Change-Id: I58d5d11d959d91ad5723d33fddb93570c259e245
---
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 5 +++++
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 +
2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index d0a018da3c7a..c49bf87d4b0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -170,6 +170,9 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
case IDH_REQ_GPU_INIT_DATA:
event = IDH_REQ_GPU_INIT_DATA_READY;
break;
+ case IDH_RAS_POISON:
+ if (data1 != 0)
+ event = IDH_RAS_POISON_READY;
default:
break;
}
@@ -437,8 +440,10 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
} else {
+ amdgpu_virt_fini_data_exchange(adev);
xgpu_nv_send_access_requests_with_param(adev,
IDH_RAS_POISON, block, 0, 0);
+ amdgpu_virt_init_data_exchange(adev);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index d0221ce08769..1e8fd90cab43 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -51,6 +51,7 @@ enum idh_event {
IDH_FAIL,
IDH_QUERY_ALIVE,
IDH_REQ_GPU_INIT_DATA_READY,
+ IDH_RAS_POISON_READY,
IDH_TEXT_MESSAGE = 255,
};
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement
2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
@ 2024-01-24 23:21 ` Zhigang Luo
2024-01-26 15:21 ` Christian König
2024-01-25 3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking
2 siblings, 1 reply; 5+ messages in thread
From: Zhigang Luo @ 2024-01-24 23:21 UTC (permalink / raw
To: amd-gfx; +Cc: YiPeng.Chai, Sashank.Saye, Victor Skvortsov, Hawking.Zhang
From: Victor Skvortsov <victor.skvortsov@amd.com>
In runtime, use vram manager for virtualization page retirement.
Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
---
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 ++++++++++++++++--------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f5c66e0038b5..6ff7d3fb2008 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
if (!*data)
goto data_failure;
- bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);
+ bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
if (!bps)
goto bps_failure;
- bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), GFP_KERNEL);
+ bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
if (!bps_bo)
goto bps_bo_failure;
@@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
for (i = data->last_reserved - 1; i >= 0; i--) {
bo = data->bps_bo[i];
- amdgpu_bo_free_kernel(&bo, NULL, NULL);
- data->bps_bo[i] = bo;
+ if (bo) {
+ amdgpu_bo_free_kernel(&bo, NULL, NULL);
+ data->bps_bo[i] = bo;
+ }
data->last_reserved = i;
}
}
@@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
{
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+ struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+ struct ttm_resource_manager *man = &mgr->manager;
struct amdgpu_bo *bo = NULL;
uint64_t bp;
int i;
@@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
* 2) a ras bad page has been reserved (duplicate error injection
* for one page);
*/
- if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE,
- &bo, NULL))
- DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
-
- data->bps_bo[i] = bo;
+ if (ttm_resource_manager_used(man)) {
+ amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+ bp << AMDGPU_GPU_PAGE_SHIFT,
+ AMDGPU_GPU_PAGE_SIZE);
+ data->bps_bo[i] = NULL;
+ } else {
+ if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
+ AMDGPU_GPU_PAGE_SIZE,
+ &bo, NULL))
+ DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
+ data->bps_bo[i] = bo;
+ }
data->last_reserved = i + 1;
bo = NULL;
}
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* RE: [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV
2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
@ 2024-01-25 3:25 ` Zhang, Hawking
2 siblings, 0 replies; 5+ messages in thread
From: Zhang, Hawking @ 2024-01-25 3:25 UTC (permalink / raw
To: Luo, Zhigang, amd-gfx@lists.freedesktop.org, Chai, Thomas
Cc: Saye, Sashank, Skvortsov, Victor
[-- Attachment #1: Type: text/plain, Size: 17822 bytes --]
[AMD Official Use Only - General]
@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC21_INTSRC_SDMA_ECC:
+ block = AMDGPU_RAS_BLOCK__SDMA;
Hi @Chai, Thomas<mailto:YiPeng.Chai@amd.com>/@Luo, Zhigang<mailto:Zhigang.Luo@amd.com>,
event_interrupt_poison_consumption_v11 was duplicated from v9 generation. However, the hardware/firmware takes completely different approach to handle poison consumption in gfx11.
At this stage, let's just initialize block to AMDGPU_RAS_BLOCK__GFX for all the IQR sources (i.e., gfx 11 poison consumption notification was centralized to RLC). I believe we still need a few series to correct the v11 implementation.
With above addressed, the series is
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Regards,
Hawking
-----Original Message-----
From: Luo, Zhigang <Zhigang.Luo@amd.com<mailto:Zhigang.Luo@amd.com>>
Sent: Thursday, January 25, 2024 07:22
To: amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
Cc: Zhang, Hawking <Hawking.Zhang@amd.com<mailto:Hawking.Zhang@amd.com>>; Skvortsov, Victor <Victor.Skvortsov@amd.com<mailto:Victor.Skvortsov@amd.com>>; Saye, Sashank <Sashank.Saye@amd.com<mailto:Sashank.Saye@amd.com>>; Chai, Thomas <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>
Subject: [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV
From: YiPeng Chai <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>
Support passing poison consumption ras blocks to SRIOV.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com<mailto:YiPeng.Chai@amd.com>>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 3 ++-
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 23 +++++++++++++++----
.../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 7 ++++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 7 ++++--
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 7 ++++--
13 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 77e263660288..dfb93664e866 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -732,9 +732,10 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
amdgpu_device_flush_hdp(adev, NULL);
}
-void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset)
{
- amdgpu_umc_poison_handler(adev, reset);
+ amdgpu_umc_poison_handler(adev, block, reset);
}
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 584a0cea5572..50d3e0149032 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -334,7 +334,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
- bool reset);
+ enum amdgpu_ras_block block, bool reset);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); int amdgpu_amdkfd_criu_resume(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ebcd1cb60052..79bf6bd428a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2041,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
- amdgpu_umc_poison_handler(adev, false);
+ amdgpu_umc_poison_handler(adev, obj->head.block, false);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index a6cdb69897f2..20436f81856a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -246,7 +246,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
return 0;
}
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset)
{
int ret = AMDGPU_RAS_SUCCESS;
@@ -297,7 +298,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 83199296ed10..26d2ae498daf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -102,7 +102,8 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, bool reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index f4963330c772..f300d4a4457d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1189,7 +1189,7 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for VCN!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 1b49c007ff62..fa7be5f277b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -88,7 +88,8 @@ struct amdgpu_virt_ops {
int (*wait_reset)(struct amdgpu_device *adev);
void (*trans_msg)(struct amdgpu_device *adev, enum idh_request req,
u32 data1, u32 data2, u32 data3);
- void (*ras_poison_handler)(struct amdgpu_device *adev);
+ void (*ras_poison_handler)(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 26d6286d86c9..9e7ce1e6bc06 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -69,7 +69,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
amdgpu_ras_interrupt_dispatch(adev, &ih_data);
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
- adev->virt.ops->ras_poison_handler(adev);
+ adev->virt.ops->ras_poison_handler(adev, ras_if->block);
else
dev_warn(adev->dev,
"No ras_poison_handler interface in SRIOV for %s!\n", ras_if->name); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 63725b2ebc03..a2bd2c3b1ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -404,7 +404,8 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA); }
-static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
{
xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 6a68ee946f1c..d0a018da3c7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -152,14 +152,14 @@ static void xgpu_nv_mailbox_trans_msg (struct amdgpu_device *adev,
xgpu_nv_mailbox_set_valid(adev, false); }
-static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
- enum idh_request req)
+static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
+ enum idh_request req, u32 data1, u32 data2, u32 data3)
{
int r, retry = 1;
enum idh_event event = -1;
send_request:
- xgpu_nv_mailbox_trans_msg(adev, req, 0, 0, 0);
+ xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3);
switch (req) {
case IDH_REQ_GPU_INIT_ACCESS:
@@ -206,6 +206,13 @@ static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
return 0;
}
+static int xgpu_nv_send_access_requests(struct amdgpu_device *adev,
+ enum idh_request req)
+{
+ return xgpu_nv_send_access_requests_with_param(adev,
+ req, 0, 0, 0);
+}
+
static int xgpu_nv_request_reset(struct amdgpu_device *adev) {
int ret, i = 0;
@@ -424,9 +431,15 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0); }
-static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev)
+static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block)
{
- xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+ if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
+ xgpu_nv_send_access_requests(adev, IDH_RAS_POISON);
+ } else {
+ xgpu_nv_send_access_requests_with_param(adev,
+ IDH_RAS_POISON, block, 0, 0);
+ }
}
const struct amdgpu_virt_ops xgpu_nv_virt_ops = { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a7697ec8188e..9a06c6fb6605 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -132,6 +132,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) {
+ enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -151,12 +152,14 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
+ block = AMDGPU_RAS_BLOCK__SDMA;
break;
default:
break;
@@ -171,12 +174,12 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
+false);
} else {
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 2a65792fd116..c6d28e37ed46 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -191,6 +191,7 @@ static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
uint16_t pasid, uint16_t source_id) {
+ enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -210,8 +211,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC21_INTSRC_SDMA_ECC:
+ block = AMDGPU_RAS_BLOCK__SDMA;
default:
break;
}
@@ -221,9 +224,9 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
if (!ret)
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
+false);
else
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
static bool event_interrupt_isr_v11(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 27cdaea40501..91dd5e045b51 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -143,6 +143,7 @@ enum SQ_INTERRUPT_ERROR_TYPE { static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) {
+ enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -162,12 +163,14 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ block = AMDGPU_RAS_BLOCK__GFX;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
case SOC15_IH_CLIENTID_SDMA2:
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
+ block = AMDGPU_RAS_BLOCK__SDMA;
break;
default:
break;
@@ -182,12 +185,12 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
+false);
} else {
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
}
--
2.25.1
[-- Attachment #2: Type: text/html, Size: 60871 bytes --]
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement
2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
@ 2024-01-26 15:21 ` Christian König
0 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2024-01-26 15:21 UTC (permalink / raw
To: Zhigang Luo, amd-gfx
Cc: Victor Skvortsov, Sashank.Saye, YiPeng.Chai, Hawking.Zhang
Am 25.01.24 um 00:21 schrieb Zhigang Luo:
> From: Victor Skvortsov <victor.skvortsov@amd.com>
>
> In runtime, use vram manager for virtualization page retirement.
>
> Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
> Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 ++++++++++++++++--------
> 1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f5c66e0038b5..6ff7d3fb2008 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
> if (!*data)
> goto data_failure;
>
> - bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);
> + bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
> if (!bps)
> goto bps_failure;
>
> - bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), GFP_KERNEL);
> + bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
That looks like an rather important bug fix which should be in a
separate patch.
> if (!bps_bo)
> goto bps_bo_failure;
>
> @@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
>
> for (i = data->last_reserved - 1; i >= 0; i--) {
> bo = data->bps_bo[i];
> - amdgpu_bo_free_kernel(&bo, NULL, NULL);
> - data->bps_bo[i] = bo;
> + if (bo) {
> + amdgpu_bo_free_kernel(&bo, NULL, NULL);
> + data->bps_bo[i] = bo;
> + }
> data->last_reserved = i;
> }
> }
> @@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
> {
> struct amdgpu_virt *virt = &adev->virt;
> struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
> + struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
> + struct ttm_resource_manager *man = &mgr->manager;
> struct amdgpu_bo *bo = NULL;
> uint64_t bp;
> int i;
> @@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
> * 2) a ras bad page has been reserved (duplicate error injection
> * for one page);
> */
> - if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
> - AMDGPU_GPU_PAGE_SIZE,
> - &bo, NULL))
> - DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
> -
> - data->bps_bo[i] = bo;
> + if (ttm_resource_manager_used(man)) {
> + amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> + bp << AMDGPU_GPU_PAGE_SHIFT,
> + AMDGPU_GPU_PAGE_SIZE);
> + data->bps_bo[i] = NULL;
> + } else {
> + if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
> + AMDGPU_GPU_PAGE_SIZE,
> + &bo, NULL))
> + DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
> + data->bps_bo[i] = bo;
> + }
That code makes no sense. If the VRAM mgr is not enabled then
amdgpu_bo_create_kernel_at() won't work either.
I suggest to completely remove the amdgpu_bo_create_kernel_at() code path.
Regards,
Christian.
> data->last_reserved = i + 1;
> bo = NULL;
> }
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-01-26 15:22 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-24 23:21 [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhigang Luo
2024-01-24 23:21 ` [PATCH 2/3] drm/amdgpu: Add RAS_POISON_READY host response message Zhigang Luo
2024-01-24 23:21 ` [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement Zhigang Luo
2024-01-26 15:21 ` Christian König
2024-01-25 3:25 ` [PATCH 1/3] drm/amdgpu: Support passing poison consumption ras block to SRIOV Zhang, Hawking
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).