All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu/mes12: print MES opcodes rather than numbers
@ 2024-04-30 20:36 Alex Deucher
  2024-04-30 20:36 ` [PATCH 2/3] drm/amdgpu/mes12: increase mes submission timeout Alex Deucher
  2024-04-30 20:36 ` [PATCH 3/3] drm/amdgpu/mes12: Use a separate fence per transaction Alex Deucher
  0 siblings, 2 replies; 3+ messages in thread
From: Alex Deucher @ 2024-04-30 20:36 UTC (permalink / raw
  To: amd-gfx; +Cc: Alex Deucher

Makes it easier to review the logs when there are MES
errors.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 81 ++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index cbd5b312a075b..9385ee76f3e6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -86,18 +86,76 @@ static const struct amdgpu_ring_funcs mes_v12_0_ring_funcs = {
 	.insert_nop = amdgpu_ring_insert_nop,
 };
 
+static const char *mes_v12_0_opcodes[] = {
+	"SET_HW_RSRC",
+	"SET_SCHEDULING_CONFIG",
+	"ADD_QUEUE",
+	"REMOVE_QUEUE",
+	"PERFORM_YIELD",
+	"SET_GANG_PRIORITY_LEVEL",
+	"SUSPEND",
+	"RESUME",
+	"RESET",
+	"SET_LOG_BUFFER",
+	"CHANGE_GANG_PRORITY",
+	"QUERY_SCHEDULER_STATUS",
+	"SET_DEBUG_VMID",
+	"MISC",
+	"UPDATE_ROOT_PAGE_TABLE",
+	"AMD_LOG",
+	"SET_SE_MODE",
+	"SET_GANG_SUBMIT",
+	"SET_HW_RSRC_1",
+};
+
+static const char *mes_v12_0_misc_opcodes[] = {
+	"WRITE_REG",
+	"INV_GART",
+	"QUERY_STATUS",
+	"READ_REG",
+	"WAIT_REG_MEM",
+	"SET_SHADER_DEBUGGER",
+	"NOTIFY_WORK_ON_UNMAPPED_QUEUE",
+	"NOTIFY_TO_UNMAP_PROCESSES",
+};
+
+static const char *mes_v12_0_get_op_string(union MESAPI__MISC *x_pkt)
+{
+	const char *op_str = NULL;
+
+	if (x_pkt->header.opcode < ARRAY_SIZE(mes_v12_0_opcodes))
+		op_str = mes_v12_0_opcodes[x_pkt->header.opcode];
+
+	return op_str;
+}
+
+static const char *mes_v12_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
+{
+	const char *op_str = NULL;
+
+	if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
+	    (x_pkt->opcode < ARRAY_SIZE(mes_v12_0_misc_opcodes)))
+		op_str = mes_v12_0_misc_opcodes[x_pkt->opcode];
+
+	return op_str;
+}
+
 static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 						    void *pkt, int size,
 						    int api_status_off)
 {
 	int ndw = size / 4;
 	signed long r;
-	union MESAPI__ADD_QUEUE *x_pkt = pkt;
+	union MESAPI__MISC *x_pkt = pkt;
 	struct MES_API_STATUS *api_status;
 	struct amdgpu_device *adev = mes->adev;
 	struct amdgpu_ring *ring = &mes->ring;
 	unsigned long flags;
 	signed long timeout = adev->usec_timeout;
+	const char *op_str, *misc_op_str;
+
+	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
+		return -EINVAL;
 
 	if (amdgpu_emu_mode) {
 		timeout *= 100;
@@ -121,13 +179,28 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	amdgpu_ring_commit(ring);
 	spin_unlock_irqrestore(&mes->ring_lock, flags);
 
-	DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
+	op_str = mes_v12_0_get_op_string(x_pkt);
+	misc_op_str = mes_v12_0_get_misc_op_string(x_pkt);
+
+	if (misc_op_str)
+		dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
+	else if (op_str)
+		dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
+	else
+		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
 
 	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
 		      timeout);
 	if (r < 1) {
-		DRM_ERROR("MES failed to response msg=%d\n",
-			  x_pkt->header.opcode);
+		if (misc_op_str)
+			dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
+				op_str, misc_op_str);
+		else if (op_str)
+			dev_err(adev->dev, "MES failed to respond to msg=%s\n",
+				op_str);
+		else
+			dev_err(adev->dev, "MES failed to respond to msg=%d\n",
+				x_pkt->header.opcode);
 
 		while (halt_if_hws_hang)
 			schedule();
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/3] drm/amdgpu/mes12: increase mes submission timeout
  2024-04-30 20:36 [PATCH 1/3] drm/amdgpu/mes12: print MES opcodes rather than numbers Alex Deucher
@ 2024-04-30 20:36 ` Alex Deucher
  2024-04-30 20:36 ` [PATCH 3/3] drm/amdgpu/mes12: Use a separate fence per transaction Alex Deucher
  1 sibling, 0 replies; 3+ messages in thread
From: Alex Deucher @ 2024-04-30 20:36 UTC (permalink / raw
  To: amd-gfx; +Cc: Alex Deucher

MES internally has a timeout allowance of 2 seconds.
Increase driver timeout to 3 seconds to be safe.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 9385ee76f3e6b..57bc277677ed6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -151,8 +151,8 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	struct amdgpu_device *adev = mes->adev;
 	struct amdgpu_ring *ring = &mes->ring;
 	unsigned long flags;
-	signed long timeout = adev->usec_timeout;
 	const char *op_str, *misc_op_str;
+	signed long timeout = 3000000; /* 3000 ms */
 
 	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
 		return -EINVAL;
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 3/3] drm/amdgpu/mes12: Use a separate fence per transaction
  2024-04-30 20:36 [PATCH 1/3] drm/amdgpu/mes12: print MES opcodes rather than numbers Alex Deucher
  2024-04-30 20:36 ` [PATCH 2/3] drm/amdgpu/mes12: increase mes submission timeout Alex Deucher
@ 2024-04-30 20:36 ` Alex Deucher
  1 sibling, 0 replies; 3+ messages in thread
From: Alex Deucher @ 2024-04-30 20:36 UTC (permalink / raw
  To: amd-gfx; +Cc: Alex Deucher

We can't use a shared fence location because each transaction
should be considered independently.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index 57bc277677ed6..76db85157bf9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -153,6 +153,10 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	unsigned long flags;
 	const char *op_str, *misc_op_str;
 	signed long timeout = 3000000; /* 3000 ms */
+	u32 fence_offset;
+	u64 fence_gpu_addr;
+	u64 *fence_ptr;
+	int ret;
 
 	if (x_pkt->header.opcode >= MES_SCH_API_MAX)
 		return -EINVAL;
@@ -165,15 +169,24 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	}
 	BUG_ON(size % 4 != 0);
 
+	ret = amdgpu_device_wb_get(adev, &fence_offset);
+	if (ret)
+		return ret;
+	fence_gpu_addr =
+		adev->wb.gpu_addr + (fence_offset * 4);
+	fence_ptr = (u64 *)&adev->wb.wb[fence_offset];
+	*fence_ptr = 0;
+
 	spin_lock_irqsave(&mes->ring_lock, flags);
 	if (amdgpu_ring_alloc(ring, ndw)) {
 		spin_unlock_irqrestore(&mes->ring_lock, flags);
+		amdgpu_device_wb_free(adev, fence_offset);
 		return -ENOMEM;
 	}
 
 	api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
-	api_status->api_completion_fence_addr = mes->ring.fence_drv.gpu_addr;
-	api_status->api_completion_fence_value = ++mes->ring.fence_drv.sync_seq;
+	api_status->api_completion_fence_addr = fence_gpu_addr;
+	api_status->api_completion_fence_value = 1;
 
 	amdgpu_ring_write_multiple(ring, pkt, ndw);
 	amdgpu_ring_commit(ring);
@@ -189,8 +202,9 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	else
 		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
 
-	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
-		      timeout);
+	r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout);
+	amdgpu_device_wb_free(adev, fence_offset);
+
 	if (r < 1) {
 		if (misc_op_str)
 			dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-04-30 20:36 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-04-30 20:36 [PATCH 1/3] drm/amdgpu/mes12: print MES opcodes rather than numbers Alex Deucher
2024-04-30 20:36 ` [PATCH 2/3] drm/amdgpu/mes12: increase mes submission timeout Alex Deucher
2024-04-30 20:36 ` [PATCH 3/3] drm/amdgpu/mes12: Use a separate fence per transaction Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.