diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d6e89e957bc5df..c409c8f1db38ee 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1784,6 +1784,7 @@ struct i915_oa_ops {
 	void (*update_oacontrol)(struct drm_i915_private *dev_priv);
 	void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
 					u32 ctx_id);
+	void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req);
 	void (*read)(struct i915_perf_stream *stream,
 		     struct i915_perf_read_state *read_state);
 	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
@@ -2071,10 +2072,14 @@ struct drm_i915_private {
 			u8 *addr;
 			u32 head;
 			u32 tail;
+			u32 last_ctx_id;
 			int format;
 			int format_size;
 		} oa_buffer;
 
+		u32 ctx_oactxctrl_off;
+		u32 ctx_flexeu0_off;
+
 		struct i915_oa_ops ops;
 		const struct i915_oa_format *oa_formats;
 		int n_builtin_sets;
@@ -3321,6 +3326,8 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file);
 void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 				struct intel_context *context);
+void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
+void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9688a825f4c56f..3a90e798e431b0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -828,6 +828,8 @@ static int do_switch(struct drm_i915_gem_request *req)
 		}
 	}
 
+	i915_oa_legacy_ctx_switch_notify(req);
+
 	return 0;
 
 unpin_out:
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5aa49127487349..c5447b4382907d 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -28,6 +28,9 @@
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
 #include "i915_oa_hsw.h"
+#include "i915_oa_bdw.h"
+#include "i915_oa_chv.h"
+#include "i915_oa_skl.h"
 
 /* Must be a power of two */
 #define OA_BUFFER_SIZE SZ_16M
@@ -69,6 +72,13 @@ static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A12]		    = { 0, 64 },
+	[I915_OA_FORMAT_A12_B8_C8]	    = { 2, 128 },
+	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
+};
+
 #define SAMPLE_OA_REPORT	(1<<0)
 
 struct perf_open_properties
@@ -85,6 +95,14 @@ struct perf_open_properties
 	u32 oa_period_exponent;
 };
 
+static bool gen8_oa_buffer_is_empty(struct drm_i915_private *dev_priv)
+{
+	u32 head = I915_READ(GEN8_OAHEADPTR);
+	u32 tail = I915_READ(GEN8_OATAILPTR);
+
+	return OA_TAKEN(tail, head) == 0;
+}
+
 static bool gen7_oa_buffer_is_empty(struct drm_i915_private *dev_priv)
 {
 	u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
@@ -141,6 +159,116 @@ static bool append_oa_sample(struct i915_perf_stream *stream,
 	return true;
 }
 
+static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
+				  struct i915_perf_read_state *read_state,
+				  u32 head,
+				  u32 tail)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u8 *report;
+	u32 taken;
+
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+
+	/* Note: the gpu doesn't wrap the tail according to the OA buffer size
+	 * so when we need to make sure our head/tail values are in-bounds we
+	 * use the above mask.
+	 */
+
+	while ((taken = OA_TAKEN(tail, head))) {
+		u32 ctx_id;
+
+		/* The tail increases in 64 byte increments, not in
+		 * format_size steps. */
+		if (taken < report_size)
+			break;
+
+		/* All the report sizes factor neatly into the buffer
+		 * size so we never expect to see a report split
+		 * between the beginning and end of the buffer... */
+		BUG_ON((OA_BUFFER_SIZE - (head & mask)) < report_size);
+
+		report = oa_buf_base + (head & mask);
+
+		ctx_id = *(u32 *)(report + 12);
+		if (i915.enable_execlists) {
+			/* XXX: Just keep the lower 20 bits for now since I'm
+			 * not entirely sure if the HW touches any of the higher
+			 * bits */
+			ctx_id &= 0xfffff;
+		}
+
+		if (dev_priv->perf.oa.exclusive_stream->enabled) {
+
+			/* NB: For Gen 8 we handle per-context report filtering
+			 * ourselves instead of programming the OA unit with a
+			 * specific context id.
+			 *
+			 * NB: To allow userspace to calculate all counter
+			 * deltas for a specific context we have to send the
+			 * first report belonging to any subsequently
+			 * switched-to context.
+			 */
+			if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+			    (dev_priv->perf.oa.specific_ctx_id == ctx_id ||
+			     (dev_priv->perf.oa.specific_ctx_id !=
+			      dev_priv->perf.oa.oa_buffer.last_ctx_id))) {
+
+				if (!append_oa_sample(stream, read_state, report))
+					break;
+			}
+		}
+
+		/* If append_oa_sample() returns false we shouldn't progress
+		 * head so we update it afterwards... */
+		dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
+		head += report_size;
+	}
+
+	return dev_priv->perf.oa.oa_buffer.gtt_offset + head;
+}
+
+static void gen8_oa_read(struct i915_perf_stream *stream,
+			 struct i915_perf_read_state *read_state)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u32 oastatus;
+	u32 head;
+	u32 tail;
+
+	WARN_ON(!dev_priv->perf.oa.oa_buffer.addr);
+
+	head = I915_READ(GEN8_OAHEADPTR);
+	tail = I915_READ(GEN8_OATAILPTR);
+	oastatus = I915_READ(GEN8_OASTATUS);
+
+	if (unlikely(oastatus & (GEN8_OASTATUS_OABUFFER_OVERFLOW |
+				 GEN8_OASTATUS_REPORT_LOST))) {
+
+		if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
+			if (append_oa_status(stream, read_state,
+					     DRM_I915_PERF_RECORD_OA_BUFFER_OVERFLOW))
+				oastatus &= ~GEN8_OASTATUS_OABUFFER_OVERFLOW;
+		}
+
+		if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
+			if (append_oa_status(stream, read_state,
+					     DRM_I915_PERF_RECORD_OA_REPORT_LOST))
+				oastatus &= ~GEN8_OASTATUS_REPORT_LOST;
+		}
+
+		I915_WRITE(GEN8_OASTATUS, oastatus);
+	}
+
+	head = gen8_append_oa_reports(stream, read_state, head, tail);
+
+	I915_WRITE(GEN8_OAHEADPTR, head);
+}
+
 static u32 gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 head,
 				  u32 tail)
@@ -335,6 +463,23 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 		   OABUFFER_SIZE_16M); /* tail */
 }
 
+static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN8_OAHEADPTR,
+		   dev_priv->perf.oa.oa_buffer.gtt_offset);
+	/* PRM says:
+	 *
+	 *  "This MMIO must be set before the OATAILPTR
+	 *  register and after the OAHEADPTR register. This is
+	 *  to enable proper functionality of the overflow
+	 *  bit."
+ */ + I915_WRITE(GEN8_OABUFFER, dev_priv->perf.oa.oa_buffer.gtt_offset | + OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT); + I915_WRITE(GEN8_OATAILPTR, + dev_priv->perf.oa.oa_buffer.gtt_offset); +} + static int alloc_oa_buffer(struct drm_i915_private *dev_priv) { struct drm_i915_gem_object *bo; @@ -437,6 +582,154 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv) ~GT_NOA_ENABLE)); } +/* Manages updating the per-context aspects of the OA stream + * configuration across all contexts. + * + * The awkward consideration here is that OACTXCONTROL controls the + * exponent for periodic sampling which is primarily used for system + * wide profiling where we'd like a consistent sampling period even in + * the face of context switches. + * + * Our approach of updating the register state context (as opposed to + * say using a workaround batch buffer) ensures that the hardware + * won't automatically reload an out-of-date timer exponent even + * transiently before a WA BB could be parsed. + */ +static int configure_all_contexts(struct drm_i915_private *dev_priv) +{ + struct drm_device *dev = dev_priv->dev; + struct intel_context *ctx; + struct intel_engine_cs *ring; + int ring_id; + int ret; + + ret = mutex_lock_interruptible(&dev->struct_mutex); + if (ret) + return ret; + + list_for_each_entry(ctx, &dev_priv->context_list, link) { + + for_each_ring(ring, dev_priv, ring_id) { + /* The actual update of the register state context + * will happen the next time this logical ring + * is submitted. (See i915_oa_update_reg_state() + * which hooks into execlists_update_context()) + */ + atomic_set(&ring->oa_state_dirty, true); + } + } + + mutex_unlock(&dev->struct_mutex); + + /* Now update the current context. + * + * Note: Using MMIO to update per-context registers requires + * some extra care... + */ + ret = intel_uncore_begin_ctx_mmio(dev_priv); + if (ret) { + DRM_ERROR("Failed to bring RCS out of idle to update current ctx OA state"); + return ret; + } + + I915_WRITE(GEN8_OACTXCONTROL, ((dev_priv->perf.oa.period_exponent << + GEN8_OA_TIMER_PERIOD_SHIFT) | + (dev_priv->perf.oa.periodic ? + GEN8_OA_TIMER_ENABLE : 0) | + GEN8_OA_COUNTER_RESUME)); + + config_oa_regs(dev_priv, dev_priv->perf.oa.flex_regs, + dev_priv->perf.oa.flex_regs_len); + + intel_uncore_end_ctx_mmio(dev_priv); + + return 0; +} + +static int bdw_enable_metric_set(struct drm_i915_private *dev_priv) +{ + int ret = i915_oa_select_metric_set_bdw(dev_priv); + + if (ret) + return ret; + + I915_WRITE(GDT_CHICKEN_BITS, 0xA0); + config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs, + dev_priv->perf.oa.mux_regs_len); + I915_WRITE(GDT_CHICKEN_BITS, 0x80); + config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs, + dev_priv->perf.oa.b_counter_regs_len); + + configure_all_contexts(dev_priv); + + return 0; +} + +static void bdw_disable_metric_set(struct drm_i915_private *dev_priv) +{ + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) & + ~GEN6_CSUNIT_CLOCK_GATE_DISABLE)); + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) | + GEN7_DOP_CLOCK_GATE_ENABLE)); +#warning "BDW: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? 
+}
+
+static int chv_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_chv(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void chv_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
+				  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
+	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
+				    GEN7_DOP_CLOCK_GATE_ENABLE));
#warning "CHV: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? (vpg does this)"
+}
+
+static int skl_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_skl(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void skl_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
+				  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
+	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
+				    GEN7_DOP_CLOCK_GATE_ENABLE));
#warning "SKL: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? (vpg does this)"
+}
+
 static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
@@ -491,6 +784,23 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 		   OA_MEM_SELECT_GGTT);
 }
 
+static void gen8_oa_enable(struct drm_i915_private *dev_priv)
+{
+	u32 report_format = dev_priv->perf.oa.oa_buffer.format;
+	u32 tail;
+
+	/* Note: we don't rely on the hardware to perform single context
+	 * filtering and instead filter on the cpu based on the context-id
+	 * field of reports */
+	I915_WRITE(GEN8_OACONTROL, (report_format <<
+				    GEN8_OA_REPORT_FORMAT_SHIFT) |
+				   GEN8_OA_COUNTER_ENABLE);
+
+	/* Reset the head ptr so we don't forward reports from before now. */
+	tail = I915_READ(GEN8_OATAILPTR);
+	I915_WRITE(GEN8_OAHEADPTR, tail);
+}
+
 static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
@@ -508,6 +818,11 @@ static void gen7_oa_disable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN7_OACONTROL, 0);
 }
 
+static void gen8_oa_disable(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN8_OACONTROL, 0);
+}
+
 static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
@@ -573,6 +888,10 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (dev_priv->perf.oa.periodic)
 		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
 
+	if (i915.enable_execlists && stream->ctx)
+		dev_priv->perf.oa.specific_ctx_id =
+			intel_execlists_ctx_id(stream->ctx);
+
 	ret = alloc_oa_buffer(dev_priv);
 	if (ret)
 		return ret;
@@ -648,6 +967,131 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 	spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
 }
 
+static void gen8_legacy_ctx_switch_unlocked(struct drm_i915_gem_request *req)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+	struct intel_engine_cs *ring = req->ring;
+	const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
+	int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
+	int ret;
+	int i;
+
+	if (!atomic_read(&ring->oa_state_dirty))
+		return;
+
+	ret = intel_ring_begin(req, n_flex_regs * 2 + 4);
+	if (ret)
+		return;
+
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(n_flex_regs + 1));
+
+	intel_ring_emit(ring, GEN8_OACTXCONTROL);
+	intel_ring_emit(ring,
+			(dev_priv->perf.oa.period_exponent <<
+			 GEN8_OA_TIMER_PERIOD_SHIFT) |
+			(dev_priv->perf.oa.periodic ?
+			 GEN8_OA_TIMER_ENABLE : 0) |
+			GEN8_OA_COUNTER_RESUME);
+
+	for (i = 0; i < n_flex_regs; i++) {
+		intel_ring_emit(ring, flex_regs[i].addr);
+		intel_ring_emit(ring, flex_regs[i].value);
+	}
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	atomic_set(&ring->oa_state_dirty, false);
+}
+
+void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	if (dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked == NULL)
+		return;
+
+	if (dev_priv->perf.oa.exclusive_stream &&
+	    dev_priv->perf.oa.exclusive_stream->enabled) {
+
+		/* XXX: We don't take a lock here and this may run
+		 * async with respect to stream methods. Notably we
+		 * don't want to block context switches by long i915
+		 * perf read() operations.
+		 *
+		 * It's expected to always be safe to read the
+		 * dev_priv->perf state needed here, and expected to
+		 * be benign to redundantly update the state if the OA
+		 * unit has been disabled since oa_state_dirty was
+		 * last set.
+		 */
+		dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked(req);
+	}
+}
+
+static void gen8_update_reg_state_unlocked(struct intel_engine_cs *ring,
+					   uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
+	int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
+	int ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_off;
+	int ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_off;
+	int i;
+
+	if (!atomic_read(&ring->oa_state_dirty))
+		return;
+
+	reg_state[ctx_oactxctrl] = GEN8_OACTXCONTROL;
+	reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
+				      GEN8_OA_TIMER_PERIOD_SHIFT) |
+				     (dev_priv->perf.oa.periodic ?
+				      GEN8_OA_TIMER_ENABLE : 0) |
+				     GEN8_OA_COUNTER_RESUME;
+
+	for (i = 0; i < n_flex_regs; i++) {
+		uint32_t offset = flex_regs[i].addr;
+
+		/* Map from mmio address to register state context
+		 * offset... */
+
+		offset -= EU_PERF_CNTL0;
+
+		offset >>= 5; /* Flex EU mmio registers are separated by 256
+			       * bytes, here they are separated by 8 bytes */
+
+		/* EU_PERF_CNTL0 offset in register state context... */
+		offset += ctx_flexeu0;
+
+		reg_state[offset] = flex_regs[i].addr;
+		reg_state[offset+1] = flex_regs[i].value;
+	}
+
+	atomic_set(&ring->oa_state_dirty, false);
+}
+
+void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	/* XXX: We don't take a lock here and this may run async with
+	 * respect to stream methods. Notably we don't want to block
+	 * context switches by long i915 perf read() operations.
+	 *
+	 * It's expected to always be safe to read the dev_priv->perf
+	 * state needed here, and expected to be benign to redundantly
+	 * update the state if the OA unit has been disabled since
+	 * oa_state_dirty was last set.
+	 */
+
+	gen8_update_reg_state_unlocked(ring, reg_state);
+}
+
 static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
 				     struct file *file,
 				     char __user *buf,
@@ -1122,7 +1566,9 @@ void i915_perf_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 
-	if (!IS_HASWELL(dev))
+	if (!(IS_HASWELL(dev) ||
+	      IS_BROADWELL(dev) || IS_CHERRYVIEW(dev) ||
+	      IS_SKYLAKE(dev)))
 		return;
 
 	dev_priv->perf.metrics_kobj =
@@ -1139,30 +1585,86 @@ void i915_perf_init(struct drm_device *dev)
 	mutex_init(&dev_priv->perf.lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
 
-	dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
-	dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
-	dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
-	dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
-	dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
-	dev_priv->perf.oa.ops.update_hw_ctx_id_locked = gen7_update_hw_ctx_id_locked;
-	dev_priv->perf.oa.ops.read = gen7_oa_read;
-	dev_priv->perf.oa.ops.oa_buffer_is_empty = gen7_oa_buffer_is_empty;
+	if (IS_HASWELL(dev)) {
+		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
+		dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
+		dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
+		dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
+		dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
+		dev_priv->perf.oa.ops.update_hw_ctx_id_locked = gen7_update_hw_ctx_id_locked;
+		dev_priv->perf.oa.ops.read = gen7_oa_read;
+		dev_priv->perf.oa.ops.oa_buffer_is_empty = gen7_oa_buffer_is_empty;
 
-	dev_priv->perf.oa.oa_formats = hsw_oa_formats;
+		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
 
-	dev_priv->perf.oa.n_builtin_sets =
-		i915_oa_n_builtin_metric_sets_hsw;
+		dev_priv->perf.oa.n_builtin_sets =
+			i915_oa_n_builtin_metric_sets_hsw;
 
-	if (i915_perf_init_sysfs_hsw(dev_priv)) {
-		kobject_put(dev_priv->perf.metrics_kobj);
-		dev_priv->perf.metrics_kobj = NULL;
-		return;
+		if (i915_perf_init_sysfs_hsw(dev_priv))
+			goto sysfs_error;
+	} else {
+		dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
+		dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
+		dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
+		dev_priv->perf.oa.ops.read = gen8_oa_read;
+		dev_priv->perf.oa.ops.oa_buffer_is_empty = gen8_oa_buffer_is_empty;
+
+		dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
+
+		if (!i915.enable_execlists) {
+			dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked =
+				gen8_legacy_ctx_switch_unlocked;
+		}
+
+		if (IS_BROADWELL(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				bdw_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				bdw_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_bdw;
+
+			if (i915_perf_init_sysfs_bdw(dev_priv))
+				goto sysfs_error;
+		} else if (IS_CHERRYVIEW(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				chv_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				chv_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_chv;
+
+			if (i915_perf_init_sysfs_chv(dev_priv))
+				goto sysfs_error;
+		} else if (IS_SKYLAKE(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				skl_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				skl_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x128;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x3de;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_skl;
+
+			if (i915_perf_init_sysfs_skl(dev_priv))
+				goto sysfs_error;
+		}
 	}
 
 	dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
 
 	dev_priv->perf.initialized = true;
+	return;
+
+sysfs_error:
+	kobject_put(dev_priv->perf.metrics_kobj);
+	dev_priv->perf.metrics_kobj = NULL;
+	return;
 }
@@ -1175,7 +1677,14 @@ void i915_perf_fini(struct drm_device *dev)
 
 	unregister_sysctl_table(dev_priv->perf.sysctl_header);
 
-	i915_perf_deinit_sysfs_hsw(dev_priv);
+	if (IS_HASWELL(dev))
+		i915_perf_deinit_sysfs_hsw(dev_priv);
+	else if (IS_BROADWELL(dev))
+		i915_perf_deinit_sysfs_bdw(dev_priv);
+	else if (IS_CHERRYVIEW(dev))
+		i915_perf_deinit_sysfs_chv(dev_priv);
+	else if (IS_SKYLAKE(dev))
+		i915_perf_deinit_sysfs_skl(dev_priv);
 
 	kobject_put(dev_priv->perf.metrics_kobj);
 	dev_priv->perf.metrics_kobj = NULL;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 548ee53f1cb939..4789555aa80501 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -385,6 +385,8 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
 		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 	}
 
+	i915_oa_update_reg_state(ring, reg_state);
+
 	kunmap_atomic(reg_state);
 
 	return 0;
@@ -2354,6 +2356,8 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 		reg_state[CTX_R_PWR_CLK_STATE+1] = make_rpcs(dev);
 	}
 
+	i915_oa_update_reg_state(ring, reg_state);
+
 	kunmap_atomic(reg_state);
 
 	ctx_obj->dirty = 1;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 49fa41dc0eb66a..302882c2b77426 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -346,6 +346,8 @@ struct intel_engine_cs {
 	 * to encode the command length in the header).
 	 */
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
+
+	atomic_t oa_state_dirty;
 };
 
 bool intel_ring_initialized(struct intel_engine_cs *ring);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 68f2bb64094154..4a6789548d7e42 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1135,13 +1135,18 @@ struct drm_i915_gem_context_param {
 };
 
 enum drm_i915_oa_format {
-	I915_OA_FORMAT_A13 = 1,
-	I915_OA_FORMAT_A29,
-	I915_OA_FORMAT_A13_B8_C8,
-	I915_OA_FORMAT_B4_C8,
-	I915_OA_FORMAT_A45_B8_C8,
-	I915_OA_FORMAT_B4_C8_A16,
-	I915_OA_FORMAT_C4_B8,
+	I915_OA_FORMAT_A13 = 1,		/* HSW only */
+	I915_OA_FORMAT_A29,		/* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,	/* HSW only */
+	I915_OA_FORMAT_B4_C8,		/* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,	/* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,	/* HSW only */
+	I915_OA_FORMAT_C4_B8,		/* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 
 	I915_OA_FORMAT_MAX	/* non-ABI */
 };