diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d6e89e957bc5df..c409c8f1db38ee 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1784,6 +1784,7 @@ struct i915_oa_ops {
 	void (*update_oacontrol)(struct drm_i915_private *dev_priv);
 	void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
 					u32 ctx_id);
+	void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req);
 	void (*read)(struct i915_perf_stream *stream,
 		     struct i915_perf_read_state *read_state);
 	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
@@ -2071,10 +2072,14 @@ struct drm_i915_private {
 			u8 *addr;
 			u32 head;
 			u32 tail;
+			u32 last_ctx_id;
 			int format;
 			int format_size;
 		} oa_buffer;
 
+		u32 ctx_oactxctrl_off;
+		u32 ctx_flexeu0_off;
+
 		struct i915_oa_ops ops;
 		const struct i915_oa_format *oa_formats;
 		int n_builtin_sets;
@@ -3321,6 +3326,8 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file);
 void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 				struct intel_context *context);
+void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
+void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 9688a825f4c56f..3a90e798e431b0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -828,6 +828,8 @@ static int do_switch(struct drm_i915_gem_request *req)
 		}
 	}
 
+	i915_oa_legacy_ctx_switch_notify(req);
+
 	return 0;
 
 unpin_out:
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5aa49127487349..c5447b4382907d 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -28,6 +28,9 @@
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
 #include "i915_oa_hsw.h"
+#include "i915_oa_bdw.h"
+#include "i915_oa_chv.h"
+#include "i915_oa_skl.h"
 
 /* Must be a power of two */
 #define OA_BUFFER_SIZE SZ_16M
@@ -69,6 +72,13 @@ static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A12]		    = { 0, 64 },
+	[I915_OA_FORMAT_A12_B8_C8]	    = { 2, 128 },
+	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
+};
+
 #define SAMPLE_OA_REPORT	(1<<0)
 
 struct perf_open_properties
@@ -85,6 +95,14 @@ struct perf_open_properties
 	u32 oa_period_exponent;
 };
 
+static bool gen8_oa_buffer_is_empty(struct drm_i915_private *dev_priv)
+{
+	u32 head = I915_READ(GEN8_OAHEADPTR);
+	u32 tail = I915_READ(GEN8_OATAILPTR);
+
+	return OA_TAKEN(tail, head) == 0;
+}
+
 static bool gen7_oa_buffer_is_empty(struct drm_i915_private *dev_priv)
 {
 	u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
@@ -141,6 +159,116 @@ static bool append_oa_sample(struct i915_perf_stream *stream,
 	return true;
 }
 
+static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
+				  struct i915_perf_read_state *read_state,
+				  u32 head,
+				  u32 tail)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u8 *report;
+	u32 taken;
+
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+
+	/* Note: the gpu doesn't wrap the tail according to the OA buffer size
+	 * so when we need to make sure our head/tail values are in-bounds we
+	 * use the above mask.
+	 */
+
+	while ((taken = OA_TAKEN(tail, head))) {
+		u32 ctx_id;
+
+		/* The tail increases in 64 byte increments, not in
+		 * format_size steps. */
+		if (taken < report_size)
+			break;
+
+		/* All the report sizes factor neatly into the buffer
+		 * size so we never expect to see a report split
+		 * between the beginning and end of the buffer... */
+		BUG_ON((OA_BUFFER_SIZE - (head & mask)) < report_size);
+
+		report = oa_buf_base + (head & mask);
+
+		ctx_id = *(u32 *)(report + 12);
+		if (i915.enable_execlists) {
+			/* XXX: Just keep the lower 20 bits for now since I'm
+			 * not entirely sure if the HW touches any of the higher
+			 * bits */
+			ctx_id &= 0xfffff;
+		}
+
+		if (dev_priv->perf.oa.exclusive_stream->enabled) {
+
+			/* NB: For Gen 8 we handle per-context report filtering
+			 * ourselves instead of programming the OA unit with a
+			 * specific context id.
+			 *
+			 * NB: To allow userspace to calculate all counter
+			 * deltas for a specific context we have to send the
+			 * first report belonging to any subsequently
+			 * switched-to context.
+			 */
+			if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+			    (dev_priv->perf.oa.specific_ctx_id == ctx_id ||
+			     (dev_priv->perf.oa.specific_ctx_id !=
+			      dev_priv->perf.oa.oa_buffer.last_ctx_id))) {
+
+				if (!append_oa_sample(stream, read_state, report))
+					break;
+			}
+		}
+
+		/* If append_oa_sample() returns false we shouldn't progress
+		 * head so we update it afterwards... */
+		dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
+		head += report_size;
+	}
+
+	return dev_priv->perf.oa.oa_buffer.gtt_offset + head;
+}
+
+static void gen8_oa_read(struct i915_perf_stream *stream,
+			 struct i915_perf_read_state *read_state)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u32 oastatus;
+	u32 head;
+	u32 tail;
+
+	WARN_ON(!dev_priv->perf.oa.oa_buffer.addr);
+
+	head = I915_READ(GEN8_OAHEADPTR);
+	tail = I915_READ(GEN8_OATAILPTR);
+	oastatus = I915_READ(GEN8_OASTATUS);
+
+	if (unlikely(oastatus & (GEN8_OASTATUS_OABUFFER_OVERFLOW |
+				 GEN8_OASTATUS_REPORT_LOST))) {
+
+		if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
+			if (append_oa_status(stream, read_state,
+					     DRM_I915_PERF_RECORD_OA_BUFFER_OVERFLOW))
+				oastatus &= ~GEN8_OASTATUS_OABUFFER_OVERFLOW;
+		}
+
+		if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
+			if (append_oa_status(stream, read_state,
+					     DRM_I915_PERF_RECORD_OA_REPORT_LOST))
+				oastatus &= ~GEN8_OASTATUS_REPORT_LOST;
+		}
+
+		I915_WRITE(GEN8_OASTATUS, oastatus);
+	}
+
+	head = gen8_append_oa_reports(stream, read_state, head, tail);
+
+	I915_WRITE(GEN8_OAHEADPTR, head);
+}
+
 static u32 gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 head,
 				  u32 tail)
@@ -335,6 +463,23 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 		   OABUFFER_SIZE_16M); /* tail */
 }
 
+static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN8_OAHEADPTR,
+		   dev_priv->perf.oa.oa_buffer.gtt_offset);
+	/* PRM says:
+	 *
+	 *  "This MMIO must be set before the OATAILPTR
+	 *  register and after the OAHEADPTR register. This is
+	 *  to enable proper functionality of the overflow
+	 *  bit."
+ */ + I915_WRITE(GEN8_OABUFFER, dev_priv->perf.oa.oa_buffer.gtt_offset | + OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT); + I915_WRITE(GEN8_OATAILPTR, + dev_priv->perf.oa.oa_buffer.gtt_offset); +} + static int alloc_oa_buffer(struct drm_i915_private *dev_priv) { struct drm_i915_gem_object *bo; @@ -437,6 +582,154 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv) ~GT_NOA_ENABLE)); } +/* Manages updating the per-context aspects of the OA stream + * configuration across all contexts. + * + * The awkward consideration here is that OACTXCONTROL controls the + * exponent for periodic sampling which is primarily used for system + * wide profiling where we'd like a consistent sampling period even in + * the face of context switches. + * + * Our approach of updating the register state context (as opposed to + * say using a workaround batch buffer) ensures that the hardware + * won't automatically reload an out-of-date timer exponent even + * transiently before a WA BB could be parsed. + */ +static int configure_all_contexts(struct drm_i915_private *dev_priv) +{ + struct drm_device *dev = dev_priv->dev; + struct intel_context *ctx; + struct intel_engine_cs *ring; + int ring_id; + int ret; + + ret = mutex_lock_interruptible(&dev->struct_mutex); + if (ret) + return ret; + + list_for_each_entry(ctx, &dev_priv->context_list, link) { + + for_each_ring(ring, dev_priv, ring_id) { + /* The actual update of the register state context + * will happen the next time this logical ring + * is submitted. (See i915_oa_update_reg_state() + * which hooks into execlists_update_context()) + */ + atomic_set(&ring->oa_state_dirty, true); + } + } + + mutex_unlock(&dev->struct_mutex); + + /* Now update the current context. + * + * Note: Using MMIO to update per-context registers requires + * some extra care... + */ + ret = intel_uncore_begin_ctx_mmio(dev_priv); + if (ret) { + DRM_ERROR("Failed to bring RCS out of idle to update current ctx OA state"); + return ret; + } + + I915_WRITE(GEN8_OACTXCONTROL, ((dev_priv->perf.oa.period_exponent << + GEN8_OA_TIMER_PERIOD_SHIFT) | + (dev_priv->perf.oa.periodic ? + GEN8_OA_TIMER_ENABLE : 0) | + GEN8_OA_COUNTER_RESUME)); + + config_oa_regs(dev_priv, dev_priv->perf.oa.flex_regs, + dev_priv->perf.oa.flex_regs_len); + + intel_uncore_end_ctx_mmio(dev_priv); + + return 0; +} + +static int bdw_enable_metric_set(struct drm_i915_private *dev_priv) +{ + int ret = i915_oa_select_metric_set_bdw(dev_priv); + + if (ret) + return ret; + + I915_WRITE(GDT_CHICKEN_BITS, 0xA0); + config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs, + dev_priv->perf.oa.mux_regs_len); + I915_WRITE(GDT_CHICKEN_BITS, 0x80); + config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs, + dev_priv->perf.oa.b_counter_regs_len); + + configure_all_contexts(dev_priv); + + return 0; +} + +static void bdw_disable_metric_set(struct drm_i915_private *dev_priv) +{ + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) & + ~GEN6_CSUNIT_CLOCK_GATE_DISABLE)); + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) | + GEN7_DOP_CLOCK_GATE_ENABLE)); +#warning "BDW: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? 
+}
+
+static int chv_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_chv(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void chv_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
+				  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
+	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
+				    GEN7_DOP_CLOCK_GATE_ENABLE));
#warning "CHV: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? (vpg does this)"
+}
+
+static int skl_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_skl(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void skl_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
+				  ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
+	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
+				    GEN7_DOP_CLOCK_GATE_ENABLE));
#warning "SKL: Do we need to write to CHICKEN2 to disable DOP clock gating when idle? (vpg does this)"
+}
+
 static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
@@ -491,6 +784,23 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 		   OA_MEM_SELECT_GGTT);
 }
 
+static void gen8_oa_enable(struct drm_i915_private *dev_priv)
+{
+	u32 report_format = dev_priv->perf.oa.oa_buffer.format;
+	u32 tail;
+
+	/* Note: we don't rely on the hardware to perform single context
+	 * filtering and instead filter on the cpu based on the context-id
+	 * field of reports */
+	I915_WRITE(GEN8_OACONTROL, (report_format <<
+				    GEN8_OA_REPORT_FORMAT_SHIFT) |
+				   GEN8_OA_COUNTER_ENABLE);
+
+	/* Reset the head ptr so we don't forward reports from before now. */
+	tail = I915_READ(GEN8_OATAILPTR);
+	I915_WRITE(GEN8_OAHEADPTR, tail);
+}
+
 static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
@@ -508,6 +818,11 @@ static void gen7_oa_disable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN7_OACONTROL, 0);
 }
 
+static void gen8_oa_disable(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN8_OACONTROL, 0);
+}
+
 static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
@@ -573,6 +888,10 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (dev_priv->perf.oa.periodic)
 		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
 
+	if (i915.enable_execlists && stream->ctx)
+		dev_priv->perf.oa.specific_ctx_id =
+			intel_execlists_ctx_id(stream->ctx);
+
 	ret = alloc_oa_buffer(dev_priv);
 	if (ret)
 		return ret;
@@ -648,6 +967,131 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 	spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
 }
 
+static void gen8_legacy_ctx_switch_unlocked(struct drm_i915_gem_request *req)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+	struct intel_engine_cs *ring = req->ring;
+	const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
+	int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
+	int ret;
+	int i;
+
+	if (!atomic_read(&ring->oa_state_dirty))
+		return;
+
+	ret = intel_ring_begin(req, n_flex_regs * 2 + 4);
+	if (ret)
+		return;
+
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(n_flex_regs + 1));
+
+	intel_ring_emit(ring, GEN8_OACTXCONTROL);
+	intel_ring_emit(ring,
+			(dev_priv->perf.oa.period_exponent <<
+			 GEN8_OA_TIMER_PERIOD_SHIFT) |
+			(dev_priv->perf.oa.periodic ?
+			 GEN8_OA_TIMER_ENABLE : 0) |
+			GEN8_OA_COUNTER_RESUME);
+
+	for (i = 0; i < n_flex_regs; i++) {
+		intel_ring_emit(ring, flex_regs[i].addr);
+		intel_ring_emit(ring, flex_regs[i].value);
+	}
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	atomic_set(&ring->oa_state_dirty, false);
+}
+
+void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	if (dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked == NULL)
+		return;
+
+	if (dev_priv->perf.oa.exclusive_stream &&
+	    dev_priv->perf.oa.exclusive_stream->enabled) {
+
+		/* XXX: We don't take a lock here and this may run
+		 * async with respect to stream methods. Notably we
+		 * don't want to block context switches by long i915
+		 * perf read() operations.
+		 *
+		 * It's expected to always be safe to read the
+		 * dev_priv->perf state needed here, and expected to
+		 * be benign to redundantly update the state if the OA
+		 * unit has been disabled since oa_state_dirty was
+		 * last set.
+		 */
+		dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked(req);
+	}
+}
+
+static void gen8_update_reg_state_unlocked(struct intel_engine_cs *ring,
+					   uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
+	int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
+	int ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_off;
+	int ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_off;
+	int i;
+
+	if (!atomic_read(&ring->oa_state_dirty))
+		return;
+
+	reg_state[ctx_oactxctrl] = GEN8_OACTXCONTROL;
+	reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
+				      GEN8_OA_TIMER_PERIOD_SHIFT) |
+				     (dev_priv->perf.oa.periodic ?
+				      GEN8_OA_TIMER_ENABLE : 0) |
+				     GEN8_OA_COUNTER_RESUME;
+
+	for (i = 0; i < n_flex_regs; i++) {
+		uint32_t offset = flex_regs[i].addr;
+
+		/* Map from mmio address to register state context
+		 * offset... */
+
+		offset -= EU_PERF_CNTL0;
+
+		offset >>= 5; /* Flex EU mmio registers are separated by 256
+			       * bytes, here they are separated by 8 bytes */
+
+		/* EU_PERF_CNTL0 offset in register state context... */
+		offset += ctx_flexeu0;
+
+		reg_state[offset] = flex_regs[i].addr;
+		reg_state[offset+1] = flex_regs[i].value;
+	}
+
+	atomic_set(&ring->oa_state_dirty, false);
+}
+
+void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	/* XXX: We don't take a lock here and this may run async with
+	 * respect to stream methods. Notably we don't want to block
+	 * context switches by long i915 perf read() operations.
+	 *
+	 * It's expected to always be safe to read the dev_priv->perf
+	 * state needed here, and expected to be benign to redundantly
+	 * update the state if the OA unit has been disabled since
+	 * oa_state_dirty was last set.
+	 */
+
+	gen8_update_reg_state_unlocked(ring, reg_state);
+}
+
 static ssize_t i915_perf_read_locked(struct i915_perf_stream *stream,
 				     struct file *file,
 				     char __user *buf,
@@ -1122,7 +1566,9 @@ void i915_perf_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 
-	if (!IS_HASWELL(dev))
+	if (!(IS_HASWELL(dev) ||
+	      IS_BROADWELL(dev) || IS_CHERRYVIEW(dev) ||
+	      IS_SKYLAKE(dev)))
 		return;
 
 	dev_priv->perf.metrics_kobj =
@@ -1139,30 +1585,86 @@ void i915_perf_init(struct drm_device *dev)
 	mutex_init(&dev_priv->perf.lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
 
-	dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
-	dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
-	dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
-	dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
-	dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
-	dev_priv->perf.oa.ops.update_hw_ctx_id_locked = gen7_update_hw_ctx_id_locked;
-	dev_priv->perf.oa.ops.read = gen7_oa_read;
-	dev_priv->perf.oa.ops.oa_buffer_is_empty = gen7_oa_buffer_is_empty;
+	if (IS_HASWELL(dev)) {
+		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
+		dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
+		dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
+		dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
+		dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
+		dev_priv->perf.oa.ops.update_hw_ctx_id_locked = gen7_update_hw_ctx_id_locked;
+		dev_priv->perf.oa.ops.read = gen7_oa_read;
+		dev_priv->perf.oa.ops.oa_buffer_is_empty = gen7_oa_buffer_is_empty;
 
-	dev_priv->perf.oa.oa_formats = hsw_oa_formats;
+		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
 
-	dev_priv->perf.oa.n_builtin_sets =
-		i915_oa_n_builtin_metric_sets_hsw;
+		dev_priv->perf.oa.n_builtin_sets =
+			i915_oa_n_builtin_metric_sets_hsw;
 
-	if (i915_perf_init_sysfs_hsw(dev_priv)) {
-		kobject_put(dev_priv->perf.metrics_kobj);
-		dev_priv->perf.metrics_kobj = NULL;
-		return;
+		if (i915_perf_init_sysfs_hsw(dev_priv))
+			goto sysfs_error;
+	} else {
+		dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
+		dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
+		dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
+		dev_priv->perf.oa.ops.read = gen8_oa_read;
+		dev_priv->perf.oa.ops.oa_buffer_is_empty = gen8_oa_buffer_is_empty;
+
+		dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
+
+		if (!i915.enable_execlists) {
+			dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked =
+				gen8_legacy_ctx_switch_unlocked;
+		}
+
+		if (IS_BROADWELL(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				bdw_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				bdw_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_bdw;
+
+			if (i915_perf_init_sysfs_bdw(dev_priv))
+				goto sysfs_error;
+		} else if (IS_CHERRYVIEW(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				chv_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				chv_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_chv;
+
+			if (i915_perf_init_sysfs_chv(dev_priv))
+				goto sysfs_error;
+		} else if (IS_SKYLAKE(dev)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				skl_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				skl_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x128;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x3de;
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_skl;
+
+			if (i915_perf_init_sysfs_skl(dev_priv))
+				goto sysfs_error;
+		}
 	}
 
 	dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
 
 	dev_priv->perf.initialized = true;
+	return;
+
+sysfs_error:
+	kobject_put(dev_priv->perf.metrics_kobj);
+	dev_priv->perf.metrics_kobj = NULL;
+	return;
 }
@@ -1175,7 +1677,14 @@ void i915_perf_fini(struct drm_device *dev)
 
 	unregister_sysctl_table(dev_priv->perf.sysctl_header);
 
-	i915_perf_deinit_sysfs_hsw(dev_priv);
+	if (IS_HASWELL(dev))
+		i915_perf_deinit_sysfs_hsw(dev_priv);
+	else if (IS_BROADWELL(dev))
+		i915_perf_deinit_sysfs_bdw(dev_priv);
+	else if (IS_CHERRYVIEW(dev))
+		i915_perf_deinit_sysfs_chv(dev_priv);
+	else if (IS_SKYLAKE(dev))
+		i915_perf_deinit_sysfs_skl(dev_priv);
 
 	kobject_put(dev_priv->perf.metrics_kobj);
 	dev_priv->perf.metrics_kobj = NULL;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 548ee53f1cb939..4789555aa80501 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -385,6 +385,8 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
 		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 	}
 
+	i915_oa_update_reg_state(ring, reg_state);
+
 	kunmap_atomic(reg_state);
 
 	return 0;
@@ -2354,6 +2356,8 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 		reg_state[CTX_R_PWR_CLK_STATE+1] = make_rpcs(dev);
 	}
 
+	i915_oa_update_reg_state(ring, reg_state);
+
 	kunmap_atomic(reg_state);
 
 	ctx_obj->dirty = 1;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 49fa41dc0eb66a..302882c2b77426 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -346,6 +346,8 @@ struct intel_engine_cs {
 	 * to encode the command length in the header).
 	 */
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
+
+	atomic_t oa_state_dirty;
 };
 
 bool intel_ring_initialized(struct intel_engine_cs *ring);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 68f2bb64094154..4a6789548d7e42 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1135,13 +1135,18 @@ struct drm_i915_gem_context_param {
 };
 
 enum drm_i915_oa_format {
-	I915_OA_FORMAT_A13 = 1,
-	I915_OA_FORMAT_A29,
-	I915_OA_FORMAT_A13_B8_C8,
-	I915_OA_FORMAT_B4_C8,
-	I915_OA_FORMAT_A45_B8_C8,
-	I915_OA_FORMAT_B4_C8_A16,
-	I915_OA_FORMAT_C4_B8,
+	I915_OA_FORMAT_A13 = 1,		/* HSW only */
+	I915_OA_FORMAT_A29,		/* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,	/* HSW only */
+	I915_OA_FORMAT_B4_C8,		/* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,	/* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,	/* HSW only */
+	I915_OA_FORMAT_C4_B8,		/* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 
 	I915_OA_FORMAT_MAX	/* non-ABI */
 };