From ffc197763e636b928963c5dd9a3eaea8146345e3 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 3 May 2017 09:20:10 +0800 Subject: [PATCH] drm/i915/gvt: rewrite the trace gvt:gvt_command using trace style approach The gvt:gvt_command trace involve unnecessary overhead even this trace is not enabled. We need improve it. The kernel trace infrastructure provide a full api to define a trace event. We should leverage them if possible. And one important thing is that a trace point should store raw data but not format string. This patch include two part work: 1) Refactor the gvt_command trace definition, including: o only store raw trace data. o use __dynamic_array() to declare a variable size buffer. o use __print_array() to format raw cmd data. o rename vm_id as vgpu_id. 2) Improve the trace invoking, including: o remove the cycles calculation for handler. We can get this data by any perf tool. o do not make a backup for raw cmd data which just doesn't make sense. With this patch, this trace has no overhead if it is not enabled. And we are trace style now. The final output example: gvt workload 0-211 [000] ...1 120.555964: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e880, raw cmd {0x4000000} gvt workload 0-211 [000] ...1 120.556014: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e884, raw cmd {0x7a000004,0x1004000,0xe1511018,0x0,0x7d,0x0} gvt workload 0-211 [000] ...1 120.556062: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e89c, raw cmd {0x7a000004,0x140000,0x0,0x0,0x0,0x0} gvt workload 0-211 [000] ...1 120.556110: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e8b4, raw cmd {0x10400002,0xe1511018,0x0,0x7d} Signed-off-by: Changbin Du Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 50 ++--------------- drivers/gpu/drm/i915/gvt/trace.h | 78 ++++++++++----------------- 2 files changed, 32 insertions(+), 96 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 41b2c3aaa04a..5634eb1fa24b 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -2414,53 +2414,13 @@ static void add_cmd_entry(struct intel_gvt *gvt, struct cmd_entry *e) hash_add(gvt->cmd_table, &e->hlist, e->info->opcode); } -#define GVT_MAX_CMD_LENGTH 20 /* In Dword */ - -static void trace_cs_command(struct parser_exec_state *s, - cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler) -{ - /* This buffer is used by ftrace to store all commands copied from - * guest gma space. Sometimes commands can cross pages, this should - * not be handled in ftrace logic. So this is just used as a - * 'bounce buffer' - */ - u32 cmd_trace_buf[GVT_MAX_CMD_LENGTH]; - int i; - u32 cmd_len = cmd_length(s); - /* The chosen value of GVT_MAX_CMD_LENGTH are just based on - * following two considerations: - * 1) From observation, most common ring commands is not that long. - * But there are execeptions. So it indeed makes sence to observe - * longer commands. - * 2) From the performance and debugging point of view, dumping all - * contents of very commands is not necessary. - * We mgith shrink GVT_MAX_CMD_LENGTH or remove this trace event in - * future for performance considerations. - */ - if (unlikely(cmd_len > GVT_MAX_CMD_LENGTH)) { - gvt_dbg_cmd("cmd length exceed tracing limitation!\n"); - cmd_len = GVT_MAX_CMD_LENGTH; - } - - for (i = 0; i < cmd_len; i++) - cmd_trace_buf[i] = cmd_val(s, i); - - trace_gvt_command(s->vgpu->id, s->ring_id, s->ip_gma, cmd_trace_buf, - cmd_len, s->buf_type == RING_BUFFER_INSTRUCTION, - cost_pre_cmd_handler, cost_cmd_handler); -} - /* call the cmd handler, and advance ip */ static int cmd_parser_exec(struct parser_exec_state *s) { + struct intel_vgpu *vgpu = s->vgpu; struct cmd_info *info; u32 cmd; int ret = 0; - cycles_t t0, t1, t2; - struct parser_exec_state s_before_advance_custom; - struct intel_vgpu *vgpu = s->vgpu; - - t0 = get_cycles(); cmd = cmd_val(s, 0); @@ -2475,9 +2435,8 @@ static int cmd_parser_exec(struct parser_exec_state *s) s->info = info; - t1 = get_cycles(); - - s_before_advance_custom = *s; + trace_gvt_command(vgpu->id, s->ring_id, s->ip_gma, s->ip_va, + cmd_length(s), s->buf_type); if (info->handler) { ret = info->handler(s); @@ -2486,9 +2445,6 @@ static int cmd_parser_exec(struct parser_exec_state *s) return ret; } } - t2 = get_cycles(); - - trace_cs_command(&s_before_advance_custom, t1 - t0, t2 - t1); if (!(info->flag & F_IP_ADVANCE_CUSTOM)) { ret = cmd_advance_default(s); diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h index 53a2d10cf3f1..9171291e36c6 100644 --- a/drivers/gpu/drm/i915/gvt/trace.h +++ b/drivers/gpu/drm/i915/gvt/trace.h @@ -224,57 +224,37 @@ TRACE_EVENT(oos_sync, TP_printk("%s", __entry->buf) ); -#define MAX_CMD_STR_LEN 256 TRACE_EVENT(gvt_command, - TP_PROTO(u8 vm_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len, bool ring_buffer_cmd, cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler), - - TP_ARGS(vm_id, ring_id, ip_gma, cmd_va, cmd_len, ring_buffer_cmd, cost_pre_cmd_handler, cost_cmd_handler), - - TP_STRUCT__entry( - __field(u8, vm_id) - __field(u8, ring_id) - __field(int, i) - __array(char, tmp_buf, MAX_CMD_STR_LEN) - __array(char, cmd_str, MAX_CMD_STR_LEN) - ), - - TP_fast_assign( - __entry->vm_id = vm_id; - __entry->ring_id = ring_id; - __entry->cmd_str[0] = '\0'; - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "VM(%d) Ring(%d): %s ip(%08x) pre handler cost (%llu), handler cost (%llu) ", vm_id, ring_id, ring_buffer_cmd ? "RB":"BB", ip_gma, cost_pre_cmd_handler, cost_cmd_handler); - strcat(__entry->cmd_str, __entry->tmp_buf); - entry->i = 0; - while (cmd_len > 0) { - if (cmd_len >= 8) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x %08x %08x %08x %08x ", - cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3], - cmd_va[__entry->i+4], cmd_va[__entry->i+5], cmd_va[__entry->i+6], cmd_va[__entry->i+7]); - __entry->i += 8; - cmd_len -= 8; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len >= 4) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x ", - cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3]); - __entry->i += 4; - cmd_len -= 4; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len >= 2) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x ", cmd_va[__entry->i], cmd_va[__entry->i+1]); - __entry->i += 2; - cmd_len -= 2; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len == 1) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x ", cmd_va[__entry->i]); - __entry->i += 1; - cmd_len -= 1; - strcat(__entry->cmd_str, __entry->tmp_buf); - } - } - strcat(__entry->cmd_str, "\n"); - ), + TP_PROTO(u8 vgpu_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len, + u32 buf_type), + + TP_ARGS(vgpu_id, ring_id, ip_gma, cmd_va, cmd_len, buf_type), + + TP_STRUCT__entry( + __field(u8, vgpu_id) + __field(u8, ring_id) + __field(u32, ip_gma) + __field(u32, buf_type) + __field(u32, cmd_len) + __dynamic_array(u32, raw_cmd, cmd_len) + ), + + TP_fast_assign( + __entry->vgpu_id = vgpu_id; + __entry->ring_id = ring_id; + __entry->ip_gma = ip_gma; + __entry->buf_type = buf_type; + __entry->cmd_len = cmd_len; + memcpy(__get_dynamic_array(raw_cmd), cmd_va, cmd_len * sizeof(*cmd_va)); + ), + - TP_printk("%s", __entry->cmd_str) + TP_printk("vgpu%d ring %d: buf_type %u, ip_gma %08x, raw cmd %s", + __entry->vgpu_id, + __entry->ring_id, + __entry->buf_type, + __entry->ip_gma, + __print_array(__get_dynamic_array(raw_cmd), __entry->cmd_len, 4)) ); #endif /* _GVT_TRACE_H_ */ -- 2.39.5