* [PATCH v3] drm/amdgpu: change vm->task_info handling
@ 2024-02-05 17:05 Shashank Sharma
2024-03-01 17:07 ` Felix Kuehling
0 siblings, 1 reply; 3+ messages in thread
From: Shashank Sharma @ 2024-02-05 17:05 UTC (permalink / raw)
To: amd-gfx; +Cc: Shashank Sharma, Christian Koenig, Alex Deucher, Felix Kuehling
This patch changes the handling and lifecycle of vm->task_info object.
The major changes are:
- vm->task_info is a dynamically allocated ptr now, and its usage is
reference counted.
- introducing new helper funcs for task_info lifecycle management
- amdgpu_vm_get_task_info_pasid / amdgpu_vm_get_task_info_vm: reference
count up task_info before returning this info
- amdgpu_vm_put_task_info: reference counts down task_info
- last put to task_info() frees task_info from the vm.
This patch also does logistical changes required for existing usage
of vm->task_info.
V2: Do not block all the prints when task_info not found (Felix)
V3: (Felix)
- Fix wrong indentation
- No debug message for -ENOMEM
- Add NULL check for task_info
- Do not duplicate the debug messages (ti vs no ti)
- Get first reference of task_info in vm_init(), put last
in vm_fini()
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 18 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 158 ++++++++++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 21 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 24 +--
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 23 +--
drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 20 ++-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 23 +--
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 23 +--
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 +--
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 20 +--
13 files changed, 251 insertions(+), 124 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 0e61ebdb3f3e..f9eb12697b95 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1775,9 +1775,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
list_for_each_entry(file, &dev->filelist, lhead) {
struct amdgpu_fpriv *fpriv = file->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
+ struct amdgpu_task_info *ti;
+
+ ti = amdgpu_vm_get_task_info_vm(vm);
+ if (ti) {
+ seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
+ amdgpu_vm_put_task_info(ti);
+ }
- seq_printf(m, "pid:%d\tProcess:%s ----------\n",
- vm->task_info.pid, vm->task_info.process_name);
r = amdgpu_bo_reserve(vm->root.bo, true);
if (r)
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 1f357198533f..e6e6d56398f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
- struct amdgpu_task_info ti;
+ struct amdgpu_task_info *ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;
@@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}
- memset(&ti, 0, sizeof(struct amdgpu_task_info));
+
adev->job_hang = true;
if (amdgpu_gpu_recovery &&
@@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
goto exit;
}
- amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
- job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
- ring->fence_drv.sync_seq);
- DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
- ti.process_name, ti.tgid, ti.task_name, ti.pid);
+ job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
+ ring->fence_drv.sync_seq);
+
+ ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
+ if (ti) {
+ DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
+ ti->process_name, ti->tgid, ti->task_name, ti->pid);
+ amdgpu_vm_put_task_info(ti);
+ }
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 4baa300121d8..a59364e9b6ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
coredump->reset_vram_lost = vram_lost;
- if (reset_context->job && reset_context->job->vm)
- coredump->reset_task_info = reset_context->job->vm->task_info;
+ if (reset_context->job && reset_context->job->vm) {
+ struct amdgpu_task_info *ti;
+ struct amdgpu_vm *vm = reset_context->job->vm;
+
+ ti = amdgpu_vm_get_task_info_vm(vm);
+ if (ti) {
+ coredump->reset_task_info = *ti;
+ amdgpu_vm_put_task_info(ti);
+ }
+ }
coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d1b8afd105c9..8414567af683 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2127,6 +2127,117 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
}
+/* Release callback handed to kref_put(): frees the task_info embedding @kref. */
+static void amdgpu_vm_destroy_task_info(struct kref *kref)
+{
+	kfree(container_of(kref, struct amdgpu_task_info, refcount));
+}
+
+/*
+ * Look up the entry stored under @pasid in adev->vm_manager.pasids.
+ *
+ * The xarray lock is taken (irqsave) only around the load itself and has
+ * already been released when the pointer is handed back to the caller.
+ * Returns whatever xa_load() yields for @pasid.
+ */
+static inline struct amdgpu_vm *
+amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
+{
+	unsigned long irq_flags;
+	struct amdgpu_vm *found;
+
+	xa_lock_irqsave(&adev->vm_manager.pasids, irq_flags);
+	found = xa_load(&adev->vm_manager.pasids, pasid);
+	xa_unlock_irqrestore(&adev->vm_manager.pasids, irq_flags);
+
+	return found;
+}
+
+/**
+ * amdgpu_vm_put_task_info - reference down the vm task_info ptr
+ *
+ * @task_info: task_info struct under discussion, may be NULL.
+ *
+ * Drops one reference; the last put frees the task_info through
+ * amdgpu_vm_destroy_task_info(). A NULL @task_info is ignored, because
+ * vm->task_info stays NULL when its allocation failed and amdgpu_vm_fini()
+ * puts it unconditionally.
+ */
+void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
+{
+	if (!task_info)
+		return;
+
+	kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
+}
+
+/**
+ * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
+ *
+ * @adev: drm device pointer
+ * @pasid: PASID identifier for VM
+ *
+ * Thin wrapper that resolves @pasid to its VM and defers to
+ * amdgpu_vm_get_task_info_vm() so both getters share one implementation.
+ *
+ * Returns the reference counted task_info structure, which must be
+ * referenced down with amdgpu_vm_put_task_info, or NULL when no VM is
+ * found for @pasid.
+ */
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
+{
+	/*
+	 * NOTE(review): the PASID lookup releases the xarray lock before the
+	 * reference is taken - confirm the VM cannot go away in between.
+	 */
+	return amdgpu_vm_get_task_info_vm(
+			amdgpu_vm_get_vm_from_pasid(adev, pasid));
+}
+
+/**
+ * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
+ *
+ * @vm: VM to get info from, may be NULL
+ *
+ * Returns the reference counted task_info structure, which must be
+ * referenced down with amdgpu_vm_put_task_info. Returns NULL, without
+ * touching any reference count, when @vm is NULL or when the VM has no
+ * task_info.
+ */
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
+{
+	struct amdgpu_task_info *ti;
+
+	if (!vm)
+		return NULL;
+
+	/* task_info stays NULL when its allocation failed; never kref_get that */
+	ti = vm->task_info;
+	if (ti)
+		kref_get(&ti->refcount);
+
+	return ti;
+}
+
+/*
+ * Allocate the zeroed task_info of @vm and initialise its reference count.
+ *
+ * kref_init() already sets the count to 1; that single reference belongs to
+ * the VM and is the one dropped by amdgpu_vm_fini(). No further reference is
+ * taken here, since nothing would ever release it and the structure would
+ * leak.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
+{
+	vm->task_info = kzalloc(sizeof(*vm->task_info), GFP_KERNEL);
+	if (!vm->task_info)
+		return -ENOMEM;
+
+	kref_init(&vm->task_info->refcount);
+	return 0;
+}
+
+/**
+ * amdgpu_vm_set_task_info - Sets VMs task info.
+ *
+ * @vm: vm for which to set the info
+ *
+ * Stores the pid and comm of the current task in vm->task_info. The tgid
+ * and process name are stored as well, but only when the thread group
+ * leader shares the mm of the current task. Nothing happens when the VM
+ * has no task_info or when the stored pid already equals the current pid.
+ */
+void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
+{
+	struct amdgpu_task_info *ti = vm->task_info;
+	struct task_struct *leader;
+
+	if (!ti || ti->pid == current->pid)
+		return;
+
+	ti->pid = current->pid;
+	get_task_comm(ti->task_name, current);
+
+	leader = current->group_leader;
+	if (leader->mm == current->mm) {
+		ti->tgid = leader->pid;
+		get_task_comm(ti->process_name, leader);
+	}
+}
+
/**
* amdgpu_vm_init - initialize a vm instance
*
@@ -2212,6 +2323,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (r)
goto error_free_root;
+ r = amdgpu_vm_create_task_info(vm);
+ if (r)
+ DRM_DEBUG("Failed to create task info for VM\n");
+
amdgpu_bo_unreserve(vm->root.bo);
amdgpu_bo_unref(&root_bo);
@@ -2351,6 +2466,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
root = amdgpu_bo_ref(vm->root.bo);
amdgpu_bo_reserve(root, true);
+ amdgpu_vm_put_task_info(vm->task_info);
amdgpu_vm_set_pasid(adev, vm, 0);
dma_fence_wait(vm->last_unlocked, false);
dma_fence_put(vm->last_unlocked);
@@ -2507,48 +2623,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
return 0;
}
-/**
- * amdgpu_vm_get_task_info - Extracts task info for a PASID.
- *
- * @adev: drm device pointer
- * @pasid: PASID identifier for VM
- * @task_info: task_info to fill.
- */
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
- struct amdgpu_task_info *task_info)
-{
- struct amdgpu_vm *vm;
- unsigned long flags;
-
- xa_lock_irqsave(&adev->vm_manager.pasids, flags);
-
- vm = xa_load(&adev->vm_manager.pasids, pasid);
- if (vm)
- *task_info = vm->task_info;
-
- xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
-}
-
-/**
- * amdgpu_vm_set_task_info - Sets VMs task info.
- *
- * @vm: vm for which to set the info
- */
-void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
-{
- if (vm->task_info.pid)
- return;
-
- vm->task_info.pid = current->pid;
- get_task_comm(vm->task_info.task_name, current);
-
- if (current->group_leader->mm != current->mm)
- return;
-
- vm->task_info.tgid = current->group_leader->pid;
- get_task_comm(vm->task_info.process_name, current->group_leader);
-}
-
/**
* amdgpu_vm_handle_fault - graceful handling of VM faults.
* @adev: amdgpu device pointer
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 2cd86d2bf73f..a74b94c3c9ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -190,10 +190,11 @@ struct amdgpu_vm_pte_funcs {
};
struct amdgpu_task_info {
- char process_name[TASK_COMM_LEN];
- char task_name[TASK_COMM_LEN];
- pid_t pid;
- pid_t tgid;
+ char process_name[TASK_COMM_LEN];
+ char task_name[TASK_COMM_LEN];
+ pid_t pid;
+ pid_t tgid;
+ struct kref refcount;
};
/**
@@ -356,7 +357,7 @@ struct amdgpu_vm {
uint64_t pd_phys_addr;
/* Some basic info about the task */
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
/* Store positions of group of BOs */
struct ttm_lru_bulk_move lru_bulk_move;
@@ -492,8 +493,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
struct amdgpu_job *job);
void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
-void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
- struct amdgpu_task_info *task_info);
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
+
+struct amdgpu_task_info *
+amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
+
+void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
+
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
bool write_fault);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a160265ddc07..d9e895cb0c10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -1027,7 +1027,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
min(nptes, 32u), dst, incr,
upd_flags,
- vm->task_info.tgid,
+ vm->task_info ? vm->task_info->tgid : 0,
vm->immediate.fence_context);
amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
cursor.level, pe_start, dst,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index a5a05c16c10d..7098f0be83c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
uint32_t status = 0;
u64 addr;
@@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
if (!printk_ratelimit())
return 0;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
-
dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
+ "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
entry->vmid_src ? "mmhub" : "gfxhub",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ " in process %s pid %d thread %s pid %d\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
+
dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n",
- addr, entry->client_id,
- soc15_ih_clientid_name[entry->client_id]);
+ addr, entry->client_id,
+ soc15_ih_clientid_name[entry->client_id]);
if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..bff88070bb00 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -126,19 +126,24 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
}
if (printk_ratelimit()) {
- struct amdgpu_task_info task_info;
-
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ struct amdgpu_task_info *task_info;
dev_err(adev->dev,
- "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
+ "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
entry->vmid_src ? "mmhub" : "gfxhub",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ " in process %s pid %d thread %s pid %d)\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
+
dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n",
- addr, entry->client_id);
+ addr, entry->client_id);
+
if (!amdgpu_sriov_vf(adev))
hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index ff4ae73d27ec..ba1f18978487 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1444,18 +1444,24 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
gmc_v8_0_set_fault_enable_default(adev, false);
if (printk_ratelimit()) {
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+ dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
+ entry->src_id, entry->src_data[0]);
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
- dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
- entry->src_id, entry->src_data[0], task_info.process_name,
- task_info.tgid, task_info.task_name, task_info.pid);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
- addr);
+ addr);
dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
status);
+
gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
entry->pasid);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2ac5820e9c92..cc0968f553a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
uint32_t status = 0, cid = 0, rw = 0;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
const char *hub_name;
@@ -626,15 +626,20 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
if (!printk_ratelimit())
return 0;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
-
dev_err(adev->dev,
- "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
- hub_name, retry_fault ? "retry" : "no-retry",
- entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name,
+ retry_fault ? "retry" : "no-retry",
+ entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_err(adev->dev,
+ " for process %s pid %d thread %s pid %d)\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
+
dev_err(adev->dev, " in page starting at address 0x%016llx from IH client 0x%x (%s)\n",
addr, entry->client_id,
soc15_ih_clientid_name[entry->client_id]);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 3d68dd5523c6..43775cb67ff5 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int instance;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
u64 addr;
instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
@@ -2116,15 +2116,20 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
-
dev_dbg_ratelimited(adev->dev,
- "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
- "pasid:%u, for process %s pid %d thread %s pid %d\n",
- instance, addr, entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_dbg_ratelimited(adev->dev,
+ " for process %s pid %d thread %s pid %d\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 0f24af6f2810..51a17d7076ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1642,7 +1642,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
int instance;
- struct amdgpu_task_info task_info;
+ struct amdgpu_task_info *task_info;
u64 addr;
instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
@@ -1654,15 +1654,19 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44;
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
-
dev_dbg_ratelimited(adev->dev,
- "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
- "pasid:%u, for process %s pid %d thread %s pid %d\n",
- instance, addr, entry->src_id, entry->ring_id, entry->vmid,
- entry->pasid, task_info.process_name, task_info.tgid,
- task_info.task_name, task_info.pid);
+ "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
+ instance, addr, entry->src_id, entry->ring_id, entry->vmid,
+ entry->pasid);
+
+ task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
+ if (task_info) {
+ dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
+ task_info->process_name, task_info->tgid,
+ task_info->task_name, task_info->pid);
+ amdgpu_vm_put_task_info(task_info);
+ }
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index d9953c2b2661..06ac835190f9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -238,16 +238,16 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
{
- struct amdgpu_task_info task_info;
-
- memset(&task_info, 0, sizeof(struct amdgpu_task_info));
- amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
- /* Report VM faults from user applications, not retry from kernel */
- if (!task_info.pid)
- return;
-
- kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
- task_info.pid, task_info.task_name);
+ struct amdgpu_task_info *task_info;
+
+ task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
+ if (task_info) {
+ /* Report VM faults from user applications, not retry from kernel */
+ if (task_info->pid)
+ kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
+ task_info->pid, task_info->task_name);
+ amdgpu_vm_put_task_info(task_info);
+ }
}
void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
--
2.43.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v3] drm/amdgpu: change vm->task_info handling
2024-02-05 17:05 [PATCH v3] drm/amdgpu: change vm->task_info handling Shashank Sharma
@ 2024-03-01 17:07 ` Felix Kuehling
2024-03-01 18:29 ` Sharma, Shashank
0 siblings, 1 reply; 3+ messages in thread
From: Felix Kuehling @ 2024-03-01 17:07 UTC (permalink / raw)
To: Shashank Sharma, amd-gfx; +Cc: Christian Koenig, Alex Deucher
[-- Attachment #1: Type: text/plain, Size: 25437 bytes --]
On 2024-02-05 12:05, Shashank Sharma wrote:
> This patch changes the handling and lifecycle of vm->task_info object.
> The major changes are:
> - vm->task_info is a dynamically allocated ptr now, and its usage is
> reference counted.
> - introducing two new helper funcs for task_info lifecycle management
> - amdgpu_vm_get_task_info: reference counts up task_info before
> returning this info
> - amdgpu_vm_put_task_info: reference counts down task_info
> - last put to task_info() frees task_info from the vm.
>
> This patch also does logistical changes required for existing usage
> of vm->task_info.
>
> V2: Do not block all the prints when task_info not found (Felix)
> V3: (Felix)
> - Fix wrong indentation
> - No debug message for -ENOMEM
> - Add NULL check for task_info
> - Do not duplicate the debug messages (ti vs no ti)
> - Get first reference of task_info in vm_init(), put last
> in vm_fini()
>
> Cc: Christian Koenig<christian.koenig@amd.com>
> Cc: Alex Deucher<alexander.deucher@amd.com>
> Cc: Felix Kuehling<Felix.Kuehling@amd.com>
> Signed-off-by: Shashank Sharma<shashank.sharma@amd.com>
One nit-pick and one bug inline. With those fixed, the patch
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 18 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 158 ++++++++++++++------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 21 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 24 +--
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 23 +--
> drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 20 ++-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 23 +--
> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 23 +--
> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 +--
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 20 +--
> 13 files changed, 251 insertions(+), 124 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 0e61ebdb3f3e..f9eb12697b95 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1775,9 +1775,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
> list_for_each_entry(file, &dev->filelist, lhead) {
> struct amdgpu_fpriv *fpriv = file->driver_priv;
> struct amdgpu_vm *vm = &fpriv->vm;
> + struct amdgpu_task_info *ti;
> +
> + ti = amdgpu_vm_get_task_info_vm(vm);
> + if (ti) {
> + seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
> + amdgpu_vm_put_task_info(ti);
> + }
>
> - seq_printf(m, "pid:%d\tProcess:%s ----------\n",
> - vm->task_info.pid, vm->task_info.process_name);
> r = amdgpu_bo_reserve(vm->root.bo, true);
> if (r)
> break;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 1f357198533f..e6e6d56398f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> {
> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> struct amdgpu_job *job = to_amdgpu_job(s_job);
> - struct amdgpu_task_info ti;
> + struct amdgpu_task_info *ti;
> struct amdgpu_device *adev = ring->adev;
> int idx;
> int r;
> @@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> return DRM_GPU_SCHED_STAT_ENODEV;
> }
>
> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
> +
> adev->job_hang = true;
>
> if (amdgpu_gpu_recovery &&
> @@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> goto exit;
> }
>
> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> - job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> - ring->fence_drv.sync_seq);
> - DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
> - ti.process_name, ti.tgid, ti.task_name, ti.pid);
> + job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> + ring->fence_drv.sync_seq);
> +
> + ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
> + if (ti) {
> + DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
> + ti->process_name, ti->tgid, ti->task_name, ti->pid);
> + amdgpu_vm_put_task_info(ti);
> + }
>
> dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4baa300121d8..a59364e9b6ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>
> coredump->reset_vram_lost = vram_lost;
>
> - if (reset_context->job && reset_context->job->vm)
> - coredump->reset_task_info = reset_context->job->vm->task_info;
> + if (reset_context->job && reset_context->job->vm) {
> + struct amdgpu_task_info *ti;
> + struct amdgpu_vm *vm = reset_context->job->vm;
> +
> + ti = amdgpu_vm_get_task_info_vm(vm);
> + if (ti) {
> + coredump->reset_task_info = *ti;
> + amdgpu_vm_put_task_info(ti);
> + }
> + }
>
> coredump->adev = adev;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d1b8afd105c9..8414567af683 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2127,6 +2127,117 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
> return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
> }
>
> +static void amdgpu_vm_destroy_task_info(struct kref *kref)
> +{
> + struct amdgpu_task_info *ti = container_of(kref, struct amdgpu_task_info, refcount);
> +
> + kfree(ti);
> +}
> +
> +static inline struct amdgpu_vm *
> +amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
> +{
> + struct amdgpu_vm *vm;
> + unsigned long flags;
> +
> + xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> + vm = xa_load(&adev->vm_manager.pasids, pasid);
> + xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> +
> + return vm;
> +}
> +
> +/**
> + * amdgpu_vm_put_task_info - reference down the vm task_info ptr
> + *
> + * @task_info: task_info struct under discussion.
> + *
> + * frees the vm task_info ptr at the last put
> + */
> +void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
> +{
> + kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
> +}
> +
> +/**
> + * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
> + *
> + * @adev: drm device pointer
> + * @pasid: PASID identifier for VM
> + *
> + * Returns the reference counted task_info structure, which must be
> + * referenced down with amdgpu_vm_put_task_info.
> + */
> +struct amdgpu_task_info *
> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
> +{
> + struct amdgpu_vm *vm;
> + struct amdgpu_task_info *ti = NULL;
> +
> + vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
> + if (vm) {
> + ti = vm->task_info;
> + kref_get(&vm->task_info->refcount);
> + }
This could be more concise and elegant if you implemented it in terms of
amdgpu_vm_get_task_info_vm. I.e.:
return amdgpu_vm_get_task_info_vm(
amdgpu_vm_get_vm_from_pasid(adev, pasid));
> +
> + return ti;
> +}
> +
> +/**
> + * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
> + *
> + * @vm: VM to get info from
> + *
> + * Returns the reference counted task_info structure, which must be
> + * referenced down with amdgpu_vm_put_task_info.
> + */
> +struct amdgpu_task_info *
> +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
> +{
> + struct amdgpu_task_info *ti = NULL;
> +
> + if (vm) {
> + ti = vm->task_info;
> + kref_get(&vm->task_info->refcount);
> + }
> +
> + return ti;
> +}
> +
> +static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
> +{
> + vm->task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
> + if (!vm->task_info)
> + return -ENOMEM;
> +
> + kref_init(&vm->task_info->refcount);
> + kref_get(&vm->task_info->refcount);
kref_init initializes the refcount to 1. I don't think you should
take another reference here because that reference never gets released.
Regards,
Felix
> + return 0;
> +}
> +
> +/**
> + * amdgpu_vm_set_task_info - Sets VMs task info.
> + *
> + * @vm: vm for which to set the info
> + */
> +void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
> +{
> + if (!vm->task_info)
> + return;
> +
> + if (vm->task_info->pid == current->pid)
> + return;
> +
> + vm->task_info->pid = current->pid;
> + get_task_comm(vm->task_info->task_name, current);
> +
> + if (current->group_leader->mm != current->mm)
> + return;
> +
> + vm->task_info->tgid = current->group_leader->pid;
> + get_task_comm(vm->task_info->process_name, current->group_leader);
> +}
> +
> /**
> * amdgpu_vm_init - initialize a vm instance
> *
> @@ -2212,6 +2323,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> if (r)
> goto error_free_root;
>
> + r = amdgpu_vm_create_task_info(vm);
> + if (r)
> + DRM_DEBUG("Failed to create task info for VM\n");
> +
> amdgpu_bo_unreserve(vm->root.bo);
> amdgpu_bo_unref(&root_bo);
>
> @@ -2351,6 +2466,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>
> root = amdgpu_bo_ref(vm->root.bo);
> amdgpu_bo_reserve(root, true);
> + amdgpu_vm_put_task_info(vm->task_info);
> amdgpu_vm_set_pasid(adev, vm, 0);
> dma_fence_wait(vm->last_unlocked, false);
> dma_fence_put(vm->last_unlocked);
> @@ -2507,48 +2623,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> return 0;
> }
>
> -/**
> - * amdgpu_vm_get_task_info - Extracts task info for a PASID.
> - *
> - * @adev: drm device pointer
> - * @pasid: PASID identifier for VM
> - * @task_info: task_info to fill.
> - */
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
> - struct amdgpu_task_info *task_info)
> -{
> - struct amdgpu_vm *vm;
> - unsigned long flags;
> -
> - xa_lock_irqsave(&adev->vm_manager.pasids, flags);
> -
> - vm = xa_load(&adev->vm_manager.pasids, pasid);
> - if (vm)
> - *task_info = vm->task_info;
> -
> - xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
> -}
> -
> -/**
> - * amdgpu_vm_set_task_info - Sets VMs task info.
> - *
> - * @vm: vm for which to set the info
> - */
> -void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
> -{
> - if (vm->task_info.pid)
> - return;
> -
> - vm->task_info.pid = current->pid;
> - get_task_comm(vm->task_info.task_name, current);
> -
> - if (current->group_leader->mm != current->mm)
> - return;
> -
> - vm->task_info.tgid = current->group_leader->pid;
> - get_task_comm(vm->task_info.process_name, current->group_leader);
> -}
> -
> /**
> * amdgpu_vm_handle_fault - graceful handling of VM faults.
> * @adev: amdgpu device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 2cd86d2bf73f..a74b94c3c9ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -190,10 +190,11 @@ struct amdgpu_vm_pte_funcs {
> };
>
> struct amdgpu_task_info {
> - char process_name[TASK_COMM_LEN];
> - char task_name[TASK_COMM_LEN];
> - pid_t pid;
> - pid_t tgid;
> + char process_name[TASK_COMM_LEN];
> + char task_name[TASK_COMM_LEN];
> + pid_t pid;
> + pid_t tgid;
> + struct kref refcount;
> };
>
> /**
> @@ -356,7 +357,7 @@ struct amdgpu_vm {
> uint64_t pd_phys_addr;
>
> /* Some basic info about the task */
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
>
> /* Store positions of group of BOs */
> struct ttm_lru_bulk_move lru_bulk_move;
> @@ -492,8 +493,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
> struct amdgpu_job *job);
> void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>
> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
> - struct amdgpu_task_info *task_info);
> +struct amdgpu_task_info *
> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
> +
> +struct amdgpu_task_info *
> +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
> +
> +void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
> +
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> u32 vmid, u32 node_id, uint64_t addr,
> bool write_fault);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index a160265ddc07..d9e895cb0c10 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -1027,7 +1027,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
> trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
> min(nptes, 32u), dst, incr,
> upd_flags,
> - vm->task_info.tgid,
> + vm->task_info ? vm->task_info->tgid : 0,
> vm->immediate.fence_context);
> amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
> cursor.level, pe_start, dst,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index a5a05c16c10d..7098f0be83c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
> bool retry_fault = !!(entry->src_data[1] & 0x80);
> bool write_fault = !!(entry->src_data[1] & 0x20);
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
> uint32_t status = 0;
> u64 addr;
>
> @@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> if (!printk_ratelimit())
> return 0;
>
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> -
> dev_err(adev->dev,
> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
> entry->vmid_src ? "mmhub" : "gfxhub",
> - entry->src_id, entry->ring_id, entry->vmid,
> - entry->pasid, task_info.process_name, task_info.tgid,
> - task_info.task_name, task_info.pid);
> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_err(adev->dev,
> + " in process %s pid %d thread %s pid %d\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
> +
> dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n",
> - addr, entry->client_id,
> - soc15_ih_clientid_name[entry->client_id]);
> + addr, entry->client_id,
> + soc15_ih_clientid_name[entry->client_id]);
>
> if (!amdgpu_sriov_vf(adev))
> hub->vmhub_funcs->print_l2_protection_fault_status(adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 23d7b548d13f..bff88070bb00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -126,19 +126,24 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
> }
>
> if (printk_ratelimit()) {
> - struct amdgpu_task_info task_info;
> -
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> + struct amdgpu_task_info *task_info;
>
> dev_err(adev->dev,
> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
> entry->vmid_src ? "mmhub" : "gfxhub",
> - entry->src_id, entry->ring_id, entry->vmid,
> - entry->pasid, task_info.process_name, task_info.tgid,
> - task_info.task_name, task_info.pid);
> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_err(adev->dev,
> + " in process %s pid %d thread %s pid %d)\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
> +
> dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n",
> - addr, entry->client_id);
> + addr, entry->client_id);
> +
> if (!amdgpu_sriov_vf(adev))
> hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index ff4ae73d27ec..ba1f18978487 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1444,18 +1444,24 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
> gmc_v8_0_set_fault_enable_default(adev, false);
>
> if (printk_ratelimit()) {
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
>
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> + dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
> + entry->src_id, entry->src_data[0]);
> +
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
>
> - dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
> - entry->src_id, entry->src_data[0], task_info.process_name,
> - task_info.tgid, task_info.task_name, task_info.pid);
> dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
> - addr);
> + addr);
> dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
> status);
> +
> gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
> entry->pasid);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 2ac5820e9c92..cc0968f553a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> bool retry_fault = !!(entry->src_data[1] & 0x80);
> bool write_fault = !!(entry->src_data[1] & 0x20);
> uint32_t status = 0, cid = 0, rw = 0;
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
> struct amdgpu_vmhub *hub;
> const char *mmhub_cid;
> const char *hub_name;
> @@ -626,15 +626,20 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> if (!printk_ratelimit())
> return 0;
>
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> -
> dev_err(adev->dev,
> - "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> - hub_name, retry_fault ? "retry" : "no-retry",
> - entry->src_id, entry->ring_id, entry->vmid,
> - entry->pasid, task_info.process_name, task_info.tgid,
> - task_info.task_name, task_info.pid);
> + "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name,
> + retry_fault ? "retry" : "no-retry",
> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
> +
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_err(adev->dev,
> + " for process %s pid %d thread %s pid %d)\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
> +
> dev_err(adev->dev, " in page starting at address 0x%016llx from IH client 0x%x (%s)\n",
> addr, entry->client_id,
> soc15_ih_clientid_name[entry->client_id]);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index 3d68dd5523c6..43775cb67ff5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
> struct amdgpu_iv_entry *entry)
> {
> int instance;
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
> u64 addr;
>
> instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
> @@ -2116,15 +2116,20 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
> addr = (u64)entry->src_data[0] << 12;
> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> -
> dev_dbg_ratelimited(adev->dev,
> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
> - entry->pasid, task_info.process_name, task_info.tgid,
> - task_info.task_name, task_info.pid);
> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
> + entry->pasid);
> +
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_dbg_ratelimited(adev->dev,
> + " for process %s pid %d thread %s pid %d\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 0f24af6f2810..51a17d7076ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1642,7 +1642,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
> struct amdgpu_iv_entry *entry)
> {
> int instance;
> - struct amdgpu_task_info task_info;
> + struct amdgpu_task_info *task_info;
> u64 addr;
>
> instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
> @@ -1654,15 +1654,19 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
> addr = (u64)entry->src_data[0] << 12;
> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> -
> dev_dbg_ratelimited(adev->dev,
> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
> - entry->pasid, task_info.process_name, task_info.tgid,
> - task_info.task_name, task_info.pid);
> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
> + entry->pasid);
> +
> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
> + if (task_info) {
> + dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
> + task_info->process_name, task_info->tgid,
> + task_info->task_name, task_info->pid);
> + amdgpu_vm_put_task_info(task_info);
> + }
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index d9953c2b2661..06ac835190f9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -238,16 +238,16 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>
> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
> {
> - struct amdgpu_task_info task_info;
> -
> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> - amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
> - /* Report VM faults from user applications, not retry from kernel */
> - if (!task_info.pid)
> - return;
> -
> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
> - task_info.pid, task_info.task_name);
> + struct amdgpu_task_info *task_info;
> +
> + task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
> + if (task_info) {
> + /* Report VM faults from user applications, not retry from kernel */
> + if (task_info->pid)
> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
> + task_info->pid, task_info->task_name);
> + amdgpu_vm_put_task_info(task_info);
> + }
> }
>
> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
[-- Attachment #2: Type: text/html, Size: 26651 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v3] drm/amdgpu: change vm->task_info handling
2024-03-01 17:07 ` Felix Kuehling
@ 2024-03-01 18:29 ` Sharma, Shashank
0 siblings, 0 replies; 3+ messages in thread
From: Sharma, Shashank @ 2024-03-01 18:29 UTC (permalink / raw
To: Felix Kuehling, amd-gfx; +Cc: Christian Koenig, Alex Deucher
[-- Attachment #1: Type: text/plain, Size: 26428 bytes --]
On 01/03/2024 18:07, Felix Kuehling wrote:
> On 2024-02-05 12:05, Shashank Sharma wrote:
>> This patch changes the handling and lifecycle of vm->task_info object.
>> The major changes are:
>> - vm->task_info is a dynamically allocated ptr now, and its usage is
>> reference counted.
>> - introducing two new helper funcs for task_info lifecycle management
>> - amdgpu_vm_get_task_info: reference counts up task_info before
>> returning this info
>> - amdgpu_vm_put_task_info: reference counts down task_info
>> - last put to task_info() frees task_info from the vm.
>>
>> This patch also does logistical changes required for existing usage
>> of vm->task_info.
>>
>> V2: Do not block all the prints when task_info not found (Felix)
>> V3: (Felix)
>> - Fix wrong indentation
>> - No debug message for -ENOMEM
>> - Add NULL check for task_info
>> - Do not duplicate the debug messages (ti vs no ti)
>> - Get first reference of task_info in vm_init(), put last
>> in vm_fini()
>>
>> Cc: Christian Koenig<christian.koenig@amd.com>
>> Cc: Alex Deucher<alexander.deucher@amd.com>
>> Cc: Felix Kuehling<Felix.Kuehling@amd.com>
>> Signed-off-by: Shashank Sharma<shashank.sharma@amd.com>
>
> One nit-pick and one bug inline. With those fixed, the patch
>
> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
>
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 9 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 18 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 12 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 158 ++++++++++++++------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 21 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 24 +--
>> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 23 +--
>> drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 20 ++-
>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 23 +--
>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 23 +--
>> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 +--
>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 20 +--
>> 13 files changed, 251 insertions(+), 124 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index 0e61ebdb3f3e..f9eb12697b95 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1775,9 +1775,14 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
>> list_for_each_entry(file, &dev->filelist, lhead) {
>> struct amdgpu_fpriv *fpriv = file->driver_priv;
>> struct amdgpu_vm *vm = &fpriv->vm;
>> + struct amdgpu_task_info *ti;
>> +
>> + ti = amdgpu_vm_get_task_info_vm(vm);
>> + if (ti) {
>> + seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid, ti->process_name);
>> + amdgpu_vm_put_task_info(ti);
>> + }
>>
>> - seq_printf(m, "pid:%d\tProcess:%s ----------\n",
>> - vm->task_info.pid, vm->task_info.process_name);
>> r = amdgpu_bo_reserve(vm->root.bo, true);
>> if (r)
>> break;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 1f357198533f..e6e6d56398f2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>> {
>> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>> struct amdgpu_job *job = to_amdgpu_job(s_job);
>> - struct amdgpu_task_info ti;
>> + struct amdgpu_task_info *ti;
>> struct amdgpu_device *adev = ring->adev;
>> int idx;
>> int r;
>> @@ -48,7 +48,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>> return DRM_GPU_SCHED_STAT_ENODEV;
>> }
>>
>> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
>> +
>> adev->job_hang = true;
>>
>> if (amdgpu_gpu_recovery &&
>> @@ -58,12 +58,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>> goto exit;
>> }
>>
>> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>> - job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>> - ring->fence_drv.sync_seq);
>> - DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
>> - ti.process_name, ti.tgid, ti.task_name, ti.pid);
>> + job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>> + ring->fence_drv.sync_seq);
>> +
>> + ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
>> + if (ti) {
>> + DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
>> + ti->process_name, ti->tgid, ti->task_name, ti->pid);
>> + amdgpu_vm_put_task_info(ti);
>> + }
>>
>> dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 4baa300121d8..a59364e9b6ed 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -230,8 +230,16 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>>
>> coredump->reset_vram_lost = vram_lost;
>>
>> - if (reset_context->job && reset_context->job->vm)
>> - coredump->reset_task_info = reset_context->job->vm->task_info;
>> + if (reset_context->job && reset_context->job->vm) {
>> + struct amdgpu_task_info *ti;
>> + struct amdgpu_vm *vm = reset_context->job->vm;
>> +
>> + ti = amdgpu_vm_get_task_info_vm(vm);
>> + if (ti) {
>> + coredump->reset_task_info = *ti;
>> + amdgpu_vm_put_task_info(ti);
>> + }
>> + }
>>
>> coredump->adev = adev;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index d1b8afd105c9..8414567af683 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2127,6 +2127,117 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout)
>> return dma_fence_wait_timeout(vm->last_unlocked, true, timeout);
>> }
>>
>> +static void amdgpu_vm_destroy_task_info(struct kref *kref)
>> +{
>> + struct amdgpu_task_info *ti = container_of(kref, struct amdgpu_task_info, refcount);
>> +
>> + kfree(ti);
>> +}
>> +
>> +static inline struct amdgpu_vm *
>> +amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
>> +{
>> + struct amdgpu_vm *vm;
>> + unsigned long flags;
>> +
>> + xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>> + vm = xa_load(&adev->vm_manager.pasids, pasid);
>> + xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>> +
>> + return vm;
>> +}
>> +
>> +/**
>> + * amdgpu_vm_put_task_info - reference down the vm task_info ptr
>> + *
>> + * @task_info: task_info struct under discussion.
>> + *
>> + * frees the vm task_info ptr at the last put
>> + */
>> +void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
>> +{
>> + kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
>> +}
>> +
>> +/**
>> + * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
>> + *
>> + * @adev: drm device pointer
>> + * @pasid: PASID identifier for VM
>> + *
>> + * Returns the reference counted task_info structure, which must be
>> + * referenced down with amdgpu_vm_put_task_info.
>> + */
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
>> +{
>> + struct amdgpu_vm *vm;
>> + struct amdgpu_task_info *ti = NULL;
>> +
>> + vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
>> + if (vm) {
>> + ti = vm->task_info;
>> + kref_get(&vm->task_info->refcount);
>> + }
>
> This could be more concise and elegant if you implemented it in terms
> of amdgpu_vm_get_task_info_vm. I.e.:
>
> return amdgpu_vm_get_task_info_vm(
> amdgpu_vm_get_vm_from_pasid(adev, pasid));
Certainly looks better this way.
>
>> +
>> + return ti;
>> +}
>> +
>> +/**
>> + * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
>> + *
>> + * @vm: VM to get info from
>> + *
>> + * Returns the reference counted task_info structure, which must be
>> + * referenced down with amdgpu_vm_put_task_info.
>> + */
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
>> +{
>> + struct amdgpu_task_info *ti = NULL;
>> +
>> + if (vm) {
>> + ti = vm->task_info;
>> + kref_get(&vm->task_info->refcount);
>> + }
>> +
>> + return ti;
>> +}
>> +
>> +static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
>> +{
>> + vm->task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL);
>> + if (!vm->task_info)
>> + return -ENOMEM;
>> +
>> + kref_init(&vm->task_info->refcount);
>> + kref_get(&vm->task_info->refcount);
>
> kref_init initializes the refcount to 1. I don't think you should
> take another reference here because that reference never gets released.
>
Thanks for this input, I wasn't aware of this and I realized the destroy
function was never getting called due to this. After fixing this it is
getting called properly.
I will do this change and push the patch.
- Shashank
> Regards,
> Felix
>
>
>> + return 0;
>> +}
>> +
>> +/**
>> + * amdgpu_vm_set_task_info - Sets VMs task info.
>> + *
>> + * @vm: vm for which to set the info
>> + */
>> +void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>> +{
>> + if (!vm->task_info)
>> + return;
>> +
>> + if (vm->task_info->pid == current->pid)
>> + return;
>> +
>> + vm->task_info->pid = current->pid;
>> + get_task_comm(vm->task_info->task_name, current);
>> +
>> + if (current->group_leader->mm != current->mm)
>> + return;
>> +
>> + vm->task_info->tgid = current->group_leader->pid;
>> + get_task_comm(vm->task_info->process_name, current->group_leader);
>> +}
>> +
>> /**
>> * amdgpu_vm_init - initialize a vm instance
>> *
>> @@ -2212,6 +2323,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> if (r)
>> goto error_free_root;
>>
>> + r = amdgpu_vm_create_task_info(vm);
>> + if (r)
>> + DRM_DEBUG("Failed to create task info for VM\n");
>> +
>> amdgpu_bo_unreserve(vm->root.bo);
>> amdgpu_bo_unref(&root_bo);
>>
>> @@ -2351,6 +2466,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>
>> root = amdgpu_bo_ref(vm->root.bo);
>> amdgpu_bo_reserve(root, true);
>> + amdgpu_vm_put_task_info(vm->task_info);
>> amdgpu_vm_set_pasid(adev, vm, 0);
>> dma_fence_wait(vm->last_unlocked, false);
>> dma_fence_put(vm->last_unlocked);
>> @@ -2507,48 +2623,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>> return 0;
>> }
>>
>> -/**
>> - * amdgpu_vm_get_task_info - Extracts task info for a PASID.
>> - *
>> - * @adev: drm device pointer
>> - * @pasid: PASID identifier for VM
>> - * @task_info: task_info to fill.
>> - */
>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>> - struct amdgpu_task_info *task_info)
>> -{
>> - struct amdgpu_vm *vm;
>> - unsigned long flags;
>> -
>> - xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>> -
>> - vm = xa_load(&adev->vm_manager.pasids, pasid);
>> - if (vm)
>> - *task_info = vm->task_info;
>> -
>> - xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>> -}
>> -
>> -/**
>> - * amdgpu_vm_set_task_info - Sets VMs task info.
>> - *
>> - * @vm: vm for which to set the info
>> - */
>> -void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>> -{
>> - if (vm->task_info.pid)
>> - return;
>> -
>> - vm->task_info.pid = current->pid;
>> - get_task_comm(vm->task_info.task_name, current);
>> -
>> - if (current->group_leader->mm != current->mm)
>> - return;
>> -
>> - vm->task_info.tgid = current->group_leader->pid;
>> - get_task_comm(vm->task_info.process_name, current->group_leader);
>> -}
>> -
>> /**
>> * amdgpu_vm_handle_fault - graceful handling of VM faults.
>> * @adev: amdgpu device pointer
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 2cd86d2bf73f..a74b94c3c9ba 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -190,10 +190,11 @@ struct amdgpu_vm_pte_funcs {
>> };
>>
>> struct amdgpu_task_info {
>> - char process_name[TASK_COMM_LEN];
>> - char task_name[TASK_COMM_LEN];
>> - pid_t pid;
>> - pid_t tgid;
>> + char process_name[TASK_COMM_LEN];
>> + char task_name[TASK_COMM_LEN];
>> + pid_t pid;
>> + pid_t tgid;
>> + struct kref refcount;
>> };
>>
>> /**
>> @@ -356,7 +357,7 @@ struct amdgpu_vm {
>> uint64_t pd_phys_addr;
>>
>> /* Some basic info about the task */
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>>
>> /* Store positions of group of BOs */
>> struct ttm_lru_bulk_move lru_bulk_move;
>> @@ -492,8 +493,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>> struct amdgpu_job *job);
>> void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>>
>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>> - struct amdgpu_task_info *task_info);
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
>> +
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
>> +
>> +void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
>> +
>> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>> u32 vmid, u32 node_id, uint64_t addr,
>> bool write_fault);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index a160265ddc07..d9e895cb0c10 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -1027,7 +1027,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
>> trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
>> min(nptes, 32u), dst, incr,
>> upd_flags,
>> - vm->task_info.tgid,
>> + vm->task_info ? vm->task_info->tgid : 0,
>> vm->immediate.fence_context);
>> amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
>> cursor.level, pe_start, dst,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index a5a05c16c10d..7098f0be83c7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -105,7 +105,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>> struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
>> bool retry_fault = !!(entry->src_data[1] & 0x80);
>> bool write_fault = !!(entry->src_data[1] & 0x20);
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> uint32_t status = 0;
>> u64 addr;
>>
>> @@ -157,18 +157,22 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>> if (!printk_ratelimit())
>> return 0;
>>
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> dev_err(adev->dev,
>> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>> entry->vmid_src ? "mmhub" : "gfxhub",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_err(adev->dev,
>> + " in process %s pid %d thread %s pid %d\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> +
>> dev_err(adev->dev, " in page starting at address 0x%016llx from client 0x%x (%s)\n",
>> - addr, entry->client_id,
>> - soc15_ih_clientid_name[entry->client_id]);
>> + addr, entry->client_id,
>> + soc15_ih_clientid_name[entry->client_id]);
>>
>> if (!amdgpu_sriov_vf(adev))
>> hub->vmhub_funcs->print_l2_protection_fault_status(adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> index 23d7b548d13f..bff88070bb00 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> @@ -126,19 +126,24 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
>> }
>>
>> if (printk_ratelimit()) {
>> - struct amdgpu_task_info task_info;
>> -
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> + struct amdgpu_task_info *task_info;
>>
>> dev_err(adev->dev,
>> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>> entry->vmid_src ? "mmhub" : "gfxhub",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_err(adev->dev,
>> + " in process %s pid %d thread %s pid %d)\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> +
>> dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n",
>> - addr, entry->client_id);
>> + addr, entry->client_id);
>> +
>> if (!amdgpu_sriov_vf(adev))
>> hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index ff4ae73d27ec..ba1f18978487 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -1444,18 +1444,24 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>> gmc_v8_0_set_fault_enable_default(adev, false);
>>
>> if (printk_ratelimit()) {
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>>
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> + dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>> + entry->src_id, entry->src_data[0]);
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_err(adev->dev, " for process %s pid %d thread %s pid %d\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>>
>> - dev_err(adev->dev, "GPU fault detected: %d 0x%08x for process %s pid %d thread %s pid %d\n",
>> - entry->src_id, entry->src_data[0], task_info.process_name,
>> - task_info.tgid, task_info.task_name, task_info.pid);
>> dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
>> - addr);
>> + addr);
>> dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
>> status);
>> +
>> gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
>> entry->pasid);
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 2ac5820e9c92..cc0968f553a1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -549,7 +549,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>> bool retry_fault = !!(entry->src_data[1] & 0x80);
>> bool write_fault = !!(entry->src_data[1] & 0x20);
>> uint32_t status = 0, cid = 0, rw = 0;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> struct amdgpu_vmhub *hub;
>> const char *mmhub_cid;
>> const char *hub_name;
>> @@ -626,15 +626,20 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>> if (!printk_ratelimit())
>> return 0;
>>
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> dev_err(adev->dev,
>> - "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> - hub_name, retry_fault ? "retry" : "no-retry",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name,
>> + retry_fault ? "retry" : "no-retry",
>> + entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_err(adev->dev,
>> + " for process %s pid %d thread %s pid %d)\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> +
>> dev_err(adev->dev, " in page starting at address 0x%016llx from IH client 0x%x (%s)\n",
>> addr, entry->client_id,
>> soc15_ih_clientid_name[entry->client_id]);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index 3d68dd5523c6..43775cb67ff5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -2104,7 +2104,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
>> struct amdgpu_iv_entry *entry)
>> {
>> int instance;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> u64 addr;
>>
>> instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>> @@ -2116,15 +2116,20 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev,
>> addr = (u64)entry->src_data[0] << 12;
>> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>>
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> dev_dbg_ratelimited(adev->dev,
>> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
>> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid);
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_dbg_ratelimited(adev->dev,
>> + " for process %s pid %d thread %s pid %d\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> +
>> return 0;
>> }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> index 0f24af6f2810..51a17d7076ec 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> @@ -1642,7 +1642,7 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
>> struct amdgpu_iv_entry *entry)
>> {
>> int instance;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> u64 addr;
>>
>> instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
>> @@ -1654,15 +1654,19 @@ static int sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
>> addr = (u64)entry->src_data[0] << 12;
>> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>>
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> dev_dbg_ratelimited(adev->dev,
>> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u pasid:%u\n",
>> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid);
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>> + dev_dbg_ratelimited(adev->dev, " for process %s pid %d thread %s pid %d\n",
>> + task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> +
>> return 0;
>> }
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index d9953c2b2661..06ac835190f9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -238,16 +238,16 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>>
>> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid)
>> {
>> - struct amdgpu_task_info task_info;
>> -
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
>> - /* Report VM faults from user applications, not retry from kernel */
>> - if (!task_info.pid)
>> - return;
>> -
>> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
>> - task_info.pid, task_info.task_name);
>> + struct amdgpu_task_info *task_info;
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
>> + if (task_info) {
>> + /* Report VM faults from user applications, not retry from kernel */
>> + if (task_info->pid)
>> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
>> + task_info->pid, task_info->task_name);
>> + amdgpu_vm_put_task_info(task_info);
>> + }
>> }
>>
>> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
[-- Attachment #2: Type: text/html, Size: 27622 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-03-01 18:30 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-05 17:05 [PATCH v3] drm/amdgpu: change vm->task_info handling Shashank Sharma
2024-03-01 17:07 ` Felix Kuehling
2024-03-01 18:29 ` Sharma, Shashank
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).