diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 47517a02f0a439125b3b3a769e6848a4c4928ca2..dac4e003c1f317ec402110132bad0c3a734bf52a 100644 (file)
 #define CTX_R_PWR_CLK_STATE            0x42
 #define CTX_GPGPU_CSR_BASE_ADDRESS     0x44
 
-#define GEN8_CTX_VALID (1<<0)
-#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
-#define GEN8_CTX_FORCE_RESTORE (1<<2)
-#define GEN8_CTX_L3LLC_COHERENT (1<<5)
-#define GEN8_CTX_PRIVILEGE (1<<8)
-
-#define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \
+#define CTX_REG(reg_state, pos, reg, val) do { \
        (reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \
        (reg_state)[(pos)+1] = (val); \
 } while (0)
        reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \
 } while (0)
 
-enum {
-       FAULT_AND_HANG = 0,
-       FAULT_AND_HALT, /* Debug only */
-       FAULT_AND_STREAM,
-       FAULT_AND_CONTINUE /* Unsupported */
-};
-#define GEN8_CTX_ID_SHIFT 32
-#define GEN8_CTX_ID_WIDTH 21
 #define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT       0x17
 #define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT       0x26
 
@@ -267,30 +253,6 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enabl
        return 0;
 }
 
-static void
-logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
-{
-       struct drm_i915_private *dev_priv = engine->i915;
-
-       engine->disable_lite_restore_wa =
-               IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1) &&
-               (engine->id == VCS || engine->id == VCS2);
-
-       engine->ctx_desc_template = GEN8_CTX_VALID;
-       if (IS_GEN8(dev_priv))
-               engine->ctx_desc_template |= GEN8_CTX_L3LLC_COHERENT;
-       engine->ctx_desc_template |= GEN8_CTX_PRIVILEGE;
-
-       /* TODO: WaDisableLiteRestore when we start using semaphore
-        * signalling between Command Streamers */
-       /* ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; */
-
-       /* WaEnableForceRestoreInCtxtDescForVCS:skl */
-       /* WaEnableForceRestoreInCtxtDescForVCS:bxt */
-       if (engine->disable_lite_restore_wa)
-               engine->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE;
-}
-
 /**
  * intel_lr_context_descriptor_update() - calculate & cache the descriptor
  *                                       for a pinned context
@@ -304,7 +266,7 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
  *
  * This is what a descriptor looks like, from LSB to MSB::
  *
- *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx_desc_template)
+ *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
  *      bits 32-52:    ctx ID, a globally unique tag
  *      bits 53-54:    mbz, reserved for use by hardware
@@ -319,8 +281,7 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
 
        BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (1<<GEN8_CTX_ID_WIDTH));
 
-       desc = ctx->desc_template;                              /* bits  3-4  */
-       desc |= engine->ctx_desc_template;                      /* bits  0-11 */
+       desc = ctx->desc_template;                              /* bits  0-11 */
        desc |= i915_ggtt_offset(ce->state) + LRC_PPHWSP_PN * PAGE_SIZE;
                                                                /* bits 12-31 */
        desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;           /* bits 32-52 */
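
As an aside on the layout documented in the comment above, here is a minimal sketch of how the three fields combine into the 64-bit descriptor. The locals flags, lrca and hw_id are illustrative stand-ins for ctx->desc_template, the GGTT offset of the pinned context image and ctx->hw_id; this is not the driver function itself.

	u64 desc;

	desc  = flags;					/* bits  0-11: GEN8_CTX_* flags (ctx->desc_template) */
	desc |= lrca;					/* bits 12-31: page-aligned GGTT address of the context image */
	desc |= (u64)hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52: globally unique context ID */
	/* bits 53 and up: mbz / reserved for hardware, left clear here */
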
@@ -365,6 +326,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
                rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
        u32 *reg_state = ce->lrc_reg_state;
 
+       assert_ring_tail_valid(rq->ring, rq->tail);
        reg_state[CTX_RING_TAIL+1] = rq->tail;
 
        /* True 32b PPGTT with dynamic page allocation: update PDP
@@ -372,7 +334,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
         * PML4 is allocated during ppgtt init, so this is not needed
         * in 48-bit mode.
         */
-       if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
+       if (ppgtt && !i915_vm_is_48bit(&ppgtt->base))
                execlists_update_context_pdps(ppgtt, reg_state);
 
        return ce->lrc_desc;
@@ -386,17 +348,20 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
                dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
        u64 desc[2];
 
+       GEM_BUG_ON(port[0].count > 1);
        if (!port[0].count)
                execlists_context_status_change(port[0].request,
                                                INTEL_CONTEXT_SCHEDULE_IN);
        desc[0] = execlists_update_context(port[0].request);
-       engine->preempt_wa = port[0].count++; /* bdw only? fixed on skl? */
+       GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
+       port[0].count++;
 
        if (port[1].request) {
                GEM_BUG_ON(port[1].count);
                execlists_context_status_change(port[1].request,
                                                INTEL_CONTEXT_SCHEDULE_IN);
                desc[1] = execlists_update_context(port[1].request);
+               GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
                port[1].count = 1;
        } else {
                desc[1] = 0;
@@ -434,7 +399,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
        struct drm_i915_gem_request *last;
        struct execlist_port *port = engine->execlist_port;
-       unsigned long flags;
        struct rb_node *rb;
        bool submit = false;
 
@@ -471,7 +435,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
         * and context switches) submission.
         */
 
-       spin_lock_irqsave(&engine->timeline->lock, flags);
+       spin_lock_irq(&engine->timeline->lock);
        rb = engine->execlist_first;
        while (rb) {
                struct drm_i915_gem_request *cursor =
@@ -515,6 +479,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                cursor->priotree.priority = INT_MAX;
 
                __i915_gem_request_submit(cursor);
+               trace_i915_gem_request_in(cursor, port - engine->execlist_port);
                last = cursor;
                submit = true;
        }
@@ -522,7 +487,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                i915_gem_request_assign(&port->request, last);
                engine->execlist_first = rb;
        }
-       spin_unlock_irqrestore(&engine->timeline->lock, flags);
+       spin_unlock_irq(&engine->timeline->lock);
 
        if (submit)
                execlists_submit_ports(engine);
@@ -533,37 +498,11 @@ static bool execlists_elsp_idle(struct intel_engine_cs *engine)
        return !engine->execlist_port[0].request;
 }
 
-/**
- * intel_execlists_idle() - Determine if all engine submission ports are idle
- * @dev_priv: i915 device private
- *
- * Return true if there are no requests pending on any of the submission ports
- * of any engines.
- */
-bool intel_execlists_idle(struct drm_i915_private *dev_priv)
-{
-       struct intel_engine_cs *engine;
-       enum intel_engine_id id;
-
-       if (!i915.enable_execlists)
-               return true;
-
-       for_each_engine(engine, dev_priv, id)
-               if (!execlists_elsp_idle(engine))
-                       return false;
-
-       return true;
-}
-
-static bool execlists_elsp_ready(struct intel_engine_cs *engine)
+static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
 {
-       int port;
-
-       port = 1; /* wait for a free slot */
-       if (engine->disable_lite_restore_wa || engine->preempt_wa)
-               port = 0; /* wait for GPU to be idle before continuing */
+       const struct execlist_port *port = engine->execlist_port;
 
-       return !engine->execlist_port[port].request;
+       return port[0].count + port[1].count < 2;
 }
 
 /*
@@ -578,44 +517,80 @@ static void intel_lrc_irq_handler(unsigned long data)
 
        intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
 
-       if (!execlists_elsp_idle(engine)) {
+       /* Prefer doing test_and_clear_bit() as a two stage operation to avoid
+        * imposing the cost of a locked atomic transaction when submitting a
+        * new request (outside of the context-switch interrupt).
+        */
+       while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
                u32 __iomem *csb_mmio =
                        dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
                u32 __iomem *buf =
                        dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
-               unsigned int csb, head, tail;
-
-               csb = readl(csb_mmio);
-               head = GEN8_CSB_READ_PTR(csb);
-               tail = GEN8_CSB_WRITE_PTR(csb);
-               if (tail < head)
-                       tail += GEN8_CSB_ENTRIES;
-               while (head < tail) {
-                       unsigned int idx = ++head % GEN8_CSB_ENTRIES;
-                       unsigned int status = readl(buf + 2 * idx);
+               unsigned int head, tail;
+
+               /* The write will be ordered by the uncached read (itself
+                * a memory barrier), so we do not need another in the form
+                * of a locked instruction. The race between the interrupt
+                * handler and the split test/clear is harmless as we order
+                * our clear before the CSB read. If the interrupt arrived
+                * first between the test and the clear, we read the updated
+                * CSB and clear the bit. If the interrupt arrives as we read
+                * the CSB or later (i.e. after we had cleared the bit) the bit
+                * is set and we do a new loop.
+                */
+               __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+               head = readl(csb_mmio);
+               tail = GEN8_CSB_WRITE_PTR(head);
+               head = GEN8_CSB_READ_PTR(head);
+               while (head != tail) {
+                       unsigned int status;
+
+                       if (++head == GEN8_CSB_ENTRIES)
+                               head = 0;
+
+                       /* We are flying near dragons again.
+                        *
+                        * We hold a reference to the request in execlist_port[]
+                        * but no more than that. We are operating in softirq
+                        * context and so cannot hold any mutex or sleep. That
+                        * prevents us stopping the requests we are processing
+                        * in port[] from being retired simultaneously (the
+                        * breadcrumb will be complete before we see the
+                        * context-switch). As we only hold the reference to the
+                        * request, any pointer chasing underneath the request
+                        * is subject to a potential use-after-free. Thus we
+                        * store all of the bookkeeping within port[] as
+                        * required, and avoid using unguarded pointers beneath
+                        * request itself. The same applies to the atomic
+                        * status notifier.
+                        */
 
+                       status = readl(buf + 2 * head);
                        if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
                                continue;
 
+                       /* Check the context/desc id for this event matches */
+                       GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
+                                        port[0].context_id);
+
                        GEM_BUG_ON(port[0].count == 0);
                        if (--port[0].count == 0) {
                                GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
+                               GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
                                execlists_context_status_change(port[0].request,
                                                                INTEL_CONTEXT_SCHEDULE_OUT);
 
+                               trace_i915_gem_request_out(port[0].request);
                                i915_gem_request_put(port[0].request);
                                port[0] = port[1];
                                memset(&port[1], 0, sizeof(port[1]));
-
-                               engine->preempt_wa = false;
                        }
 
                        GEM_BUG_ON(port[0].count == 0 &&
                                   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
                }
 
-               writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-                                    GEN8_CSB_WRITE_PTR(csb) << 8),
+               writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
                       csb_mmio);
        }
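
A compact sketch of the two-stage test/clear idiom that the comment at the top of this loop describes, contrasted with the single locked read-modify-write it replaces; do_csb_work() is a placeholder name, not an i915 symbol. The benefit is that a tasklet kicked from request submission, with no interrupt pending, pays only for an unlocked test_bit(), while ordering the plain clear before the uncached CSB read means a racing interrupt at worst re-runs the loop.

	/* one locked atomic per pass: */
	while (test_and_clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
		do_csb_work(engine);		/* placeholder for the CSB walk above */

	/* split form used above: cheap unlocked test, then a non-atomic clear
	 * ordered before the CSB read; an interrupt arriving after the clear
	 * re-sets the bit and the loop simply runs once more, so no
	 * context-switch event is lost. */
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
		do_csb_work(engine);
	}
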
 
@@ -659,10 +634,11 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
        /* Will be called from irq-context when using foreign fences. */
        spin_lock_irqsave(&engine->timeline->lock, flags);
 
-       if (insert_request(&request->priotree, &engine->execlist_queue))
+       if (insert_request(&request->priotree, &engine->execlist_queue)) {
                engine->execlist_first = &request->priotree.node;
-       if (execlists_elsp_idle(engine))
-               tasklet_hi_schedule(&engine->irq_tasklet);
+               if (execlists_elsp_ready(engine))
+                       tasklet_hi_schedule(&engine->irq_tasklet);
+       }
 
        spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
@@ -772,6 +748,7 @@ static int execlists_context_pin(struct intel_engine_cs *engine,
 
        if (ce->pin_count++)
                return 0;
+       GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
 
        if (!ce->state) {
                ret = execlists_context_deferred_alloc(ctx, engine);
@@ -780,11 +757,9 @@ static int execlists_context_pin(struct intel_engine_cs *engine,
        }
        GEM_BUG_ON(!ce->state);
 
-       flags = PIN_GLOBAL;
+       flags = PIN_GLOBAL | PIN_HIGH;
        if (ctx->ggtt_offset_bias)
                flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;
-       if (i915_gem_context_is_kernel(ctx))
-               flags |= PIN_HIGH;
 
        ret = i915_vma_pin(ce->state, 0, GEN8_LR_CONTEXT_ALIGN, flags);
        if (ret)
@@ -843,6 +818,7 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
 {
        struct intel_engine_cs *engine = request->engine;
        struct intel_context *ce = &request->ctx->engine[engine->id];
+       u32 *cs;
        int ret;
 
        GEM_BUG_ON(!ce->pin_count);
@@ -867,9 +843,11 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
                        goto err;
        }
 
-       ret = intel_ring_begin(request, 0);
-       if (ret)
+       cs = intel_ring_begin(request, 0);
+       if (IS_ERR(cs)) {
+               ret = PTR_ERR(cs);
                goto err_unreserve;
+       }
 
        if (!ce->initialised) {
                ret = engine->init_context(request);
@@ -896,51 +874,6 @@ err:
        return ret;
 }
 
-static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
-{
-       int ret, i;
-       struct intel_ring *ring = req->ring;
-       struct i915_workarounds *w = &req->i915->workarounds;
-
-       if (w->count == 0)
-               return 0;
-
-       ret = req->engine->emit_flush(req, EMIT_BARRIER);
-       if (ret)
-               return ret;
-
-       ret = intel_ring_begin(req, w->count * 2 + 2);
-       if (ret)
-               return ret;
-
-       intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count));
-       for (i = 0; i < w->count; i++) {
-               intel_ring_emit_reg(ring, w->reg[i].addr);
-               intel_ring_emit(ring, w->reg[i].value);
-       }
-       intel_ring_emit(ring, MI_NOOP);
-
-       intel_ring_advance(ring);
-
-       ret = req->engine->emit_flush(req, EMIT_BARRIER);
-       if (ret)
-               return ret;
-
-       return 0;
-}
-
-#define wa_ctx_emit(batch, index, cmd)                                 \
-       do {                                                            \
-               int __index = (index)++;                                \
-               if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
-                       return -ENOSPC;                                 \
-               }                                                       \
-               batch[__index] = (cmd);                                 \
-       } while (0)
-
-#define wa_ctx_emit_reg(batch, index, reg) \
-       wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg))
-
 /*
  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
@@ -957,56 +890,29 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
  * This WA is also required for Gen9 so extracting as a function avoids
  * code duplication.
  */
-static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine,
-                                               uint32_t *batch,
-                                               uint32_t index)
+static u32 *
+gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
 {
-       uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);
-
-       wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 |
-                                  MI_SRM_LRM_GLOBAL_GTT));
-       wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
-       wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256);
-       wa_ctx_emit(batch, index, 0);
-
-       wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
-       wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
-       wa_ctx_emit(batch, index, l3sqc4_flush);
-
-       wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
-       wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL |
-                                  PIPE_CONTROL_DC_FLUSH_ENABLE));
-       wa_ctx_emit(batch, index, 0);
-       wa_ctx_emit(batch, index, 0);
-       wa_ctx_emit(batch, index, 0);
-       wa_ctx_emit(batch, index, 0);
-
-       wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 |
-                                  MI_SRM_LRM_GLOBAL_GTT));
-       wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
-       wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256);
-       wa_ctx_emit(batch, index, 0);
-
-       return index;
-}
+       *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+       *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+       *batch++ = i915_ggtt_offset(engine->scratch) + 256;
+       *batch++ = 0;
 
-static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
-                                   uint32_t offset,
-                                   uint32_t start_alignment)
-{
-       return wa_ctx->offset = ALIGN(offset, start_alignment);
-}
+       *batch++ = MI_LOAD_REGISTER_IMM(1);
+       *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+       *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
 
-static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
-                            uint32_t offset,
-                            uint32_t size_alignment)
-{
-       wa_ctx->size = offset - wa_ctx->offset;
+       batch = gen8_emit_pipe_control(batch,
+                                      PIPE_CONTROL_CS_STALL |
+                                      PIPE_CONTROL_DC_FLUSH_ENABLE,
+                                      0);
 
-       WARN(wa_ctx->size % size_alignment,
-            "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
-            wa_ctx->size, size_alignment);
-       return 0;
+       *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+       *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+       *batch++ = i915_ggtt_offset(engine->scratch) + 256;
+       *batch++ = 0;
+
+       return batch;
 }
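
Worth noting for reviewers: the rewritten helpers emit dwords by advancing a u32 pointer and returning the new tail, rather than indexing with the old wa_ctx_emit() macro, so a caller can measure what was written as a plain pointer difference (intel_init_workaround_bb() below fills in wa_bb[i]->offset and ->size exactly this way). A minimal sketch of that calling pattern, with start and emitted_bytes as hypothetical locals:

	u32 *start = batch;				/* hypothetical locals, not driver code */
	unsigned int emitted_bytes;

	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
	emitted_bytes = (batch - start) * sizeof(u32);	/* the size falls out of the pointer math */
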
 
 /*
@@ -1024,42 +930,28 @@ static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
  * makes a complete batch buffer.
  */
-static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine,
-                                   struct i915_wa_ctx_bb *wa_ctx,
-                                   uint32_t *batch,
-                                   uint32_t *offset)
+static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
-       uint32_t scratch_addr;
-       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
-
        /* WaDisableCtxRestoreArbitration:bdw,chv */
-       wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+       *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 
        /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
-       if (IS_BROADWELL(engine->i915)) {
-               int rc = gen8_emit_flush_coherentl3_wa(engine, batch, index);
-               if (rc < 0)
-                       return rc;
-               index = rc;
-       }
+       if (IS_BROADWELL(engine->i915))
+               batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
        /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
        /* Actual scratch location is at 128 bytes offset */
-       scratch_addr = i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
-
-       wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
-       wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
-                                  PIPE_CONTROL_GLOBAL_GTT_IVB |
-                                  PIPE_CONTROL_CS_STALL |
-                                  PIPE_CONTROL_QW_WRITE));
-       wa_ctx_emit(batch, index, scratch_addr);
-       wa_ctx_emit(batch, index, 0);
-       wa_ctx_emit(batch, index, 0);
-       wa_ctx_emit(batch, index, 0);
+       batch = gen8_emit_pipe_control(batch,
+                                      PIPE_CONTROL_FLUSH_L3 |
+                                      PIPE_CONTROL_GLOBAL_GTT_IVB |
+                                      PIPE_CONTROL_CS_STALL |
+                                      PIPE_CONTROL_QW_WRITE,
+                                      i915_ggtt_offset(engine->scratch) +
+                                      2 * CACHELINE_BYTES);
 
        /* Pad to end of cacheline */
-       while (index % CACHELINE_DWORDS)
-               wa_ctx_emit(batch, index, MI_NOOP);
+       while ((unsigned long)batch % CACHELINE_BYTES)
+               *batch++ = MI_NOOP;
 
        /*
         * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
@@ -1067,7 +959,7 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine,
         * in the register CTX_RCS_INDIRECT_CTX
         */
 
-       return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
+       return batch;
 }
 
 /*
@@ -1079,65 +971,40 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine,
  *  This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
  *  to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
  */
-static int gen8_init_perctx_bb(struct intel_engine_cs *engine,
-                              struct i915_wa_ctx_bb *wa_ctx,
-                              uint32_t *batch,
-                              uint32_t *offset)
+static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
-       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
-
        /* WaDisableCtxRestoreArbitration:bdw,chv */
-       wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
-
-       wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
+       *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+       *batch++ = MI_BATCH_BUFFER_END;
 
-       return wa_ctx_end(wa_ctx, *offset = index, 1);
+       return batch;
 }
 
-static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine,
-                                   struct i915_wa_ctx_bb *wa_ctx,
-                                   uint32_t *batch,
-                                   uint32_t *offset)
+static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
-       int ret;
-       struct drm_i915_private *dev_priv = engine->i915;
-       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+       /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
+       batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
-       /* WaDisableCtxRestoreArbitration:bxt */
-       if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
-               wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
-
-       /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */
-       ret = gen8_emit_flush_coherentl3_wa(engine, batch, index);
-       if (ret < 0)
-               return ret;
-       index = ret;
-
-       /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl */
-       wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
-       wa_ctx_emit_reg(batch, index, COMMON_SLICE_CHICKEN2);
-       wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(
-                           GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE));
-       wa_ctx_emit(batch, index, MI_NOOP);
+       /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+       *batch++ = MI_LOAD_REGISTER_IMM(1);
+       *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
+       *batch++ = _MASKED_BIT_DISABLE(
+                       GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
+       *batch++ = MI_NOOP;
 
        /* WaClearSlmSpaceAtContextSwitch:kbl */
        /* Actual scratch location is at 128 bytes offset */
-       if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_A0)) {
-               u32 scratch_addr =
-                       i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
-
-               wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
-               wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
-                                          PIPE_CONTROL_GLOBAL_GTT_IVB |
-                                          PIPE_CONTROL_CS_STALL |
-                                          PIPE_CONTROL_QW_WRITE));
-               wa_ctx_emit(batch, index, scratch_addr);
-               wa_ctx_emit(batch, index, 0);
-               wa_ctx_emit(batch, index, 0);
-               wa_ctx_emit(batch, index, 0);
+       if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
+               batch = gen8_emit_pipe_control(batch,
+                                              PIPE_CONTROL_FLUSH_L3 |
+                                              PIPE_CONTROL_GLOBAL_GTT_IVB |
+                                              PIPE_CONTROL_CS_STALL |
+                                              PIPE_CONTROL_QW_WRITE,
+                                              i915_ggtt_offset(engine->scratch)
+                                              + 2 * CACHELINE_BYTES);
        }
 
-       /* WaMediaPoolStateCmdInWABB:bxt */
+       /* WaMediaPoolStateCmdInWABB:bxt,glk */
        if (HAS_POOLED_EU(engine->i915)) {
                /*
                 * EU pool configuration is set up along with golden context
@@ -1152,73 +1019,37 @@ static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine,
                 * possible configurations, to avoid duplication they are
                 * not shown here again.
                 */
-               u32 eu_pool_config = 0x00777000;
-               wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_STATE);
-               wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_ENABLE);
-               wa_ctx_emit(batch, index, eu_pool_config);
-               wa_ctx_emit(batch, index, 0);
-               wa_ctx_emit(batch, index, 0);
-               wa_ctx_emit(batch, index, 0);
+               *batch++ = GEN9_MEDIA_POOL_STATE;
+               *batch++ = GEN9_MEDIA_POOL_ENABLE;
+               *batch++ = 0x00777000;
+               *batch++ = 0;
+               *batch++ = 0;
+               *batch++ = 0;
        }
 
        /* Pad to end of cacheline */
-       while (index % CACHELINE_DWORDS)
-               wa_ctx_emit(batch, index, MI_NOOP);
+       while ((unsigned long)batch % CACHELINE_BYTES)
+               *batch++ = MI_NOOP;
 
-       return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
+       return batch;
 }
 
-static int gen9_init_perctx_bb(struct intel_engine_cs *engine,
-                              struct i915_wa_ctx_bb *wa_ctx,
-                              uint32_t *batch,
-                              uint32_t *offset)
+static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
-       uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
-
-       /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:bxt */
-       if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1)) {
-               wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
-               wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0);
-               wa_ctx_emit(batch, index,
-                           _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING));
-               wa_ctx_emit(batch, index, MI_NOOP);
-       }
+       *batch++ = MI_BATCH_BUFFER_END;
 
-       /* WaClearTdlStateAckDirtyBits:bxt */
-       if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_B0)) {
-               wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(4));
-
-               wa_ctx_emit_reg(batch, index, GEN8_STATE_ACK);
-               wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));
-
-               wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE1);
-               wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));
-
-               wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE2);
-               wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS));
-
-               wa_ctx_emit_reg(batch, index, GEN7_ROW_CHICKEN2);
-               /* dummy write to CS, mask bits are 0 to ensure the register is not modified */
-               wa_ctx_emit(batch, index, 0x0);
-               wa_ctx_emit(batch, index, MI_NOOP);
-       }
-
-       /* WaDisableCtxRestoreArbitration:bxt */
-       if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1))
-               wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
-
-       wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
-
-       return wa_ctx_end(wa_ctx, *offset = index, 1);
+       return batch;
 }
 
-static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *engine, u32 size)
+#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+
+static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
 {
        struct drm_i915_gem_object *obj;
        struct i915_vma *vma;
        int err;
 
-       obj = i915_gem_object_create(engine->i915, PAGE_ALIGN(size));
+       obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
        if (IS_ERR(obj))
                return PTR_ERR(obj);
 
@@ -1240,82 +1071,79 @@ err:
        return err;
 }
 
-static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *engine)
+static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
 {
        i915_vma_unpin_and_release(&engine->wa_ctx.vma);
 }
 
+typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
+
 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
 {
        struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
-       uint32_t *batch;
-       uint32_t offset;
+       struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
+                                           &wa_ctx->per_ctx };
+       wa_bb_func_t wa_bb_fn[2];
        struct page *page;
+       void *batch, *batch_ptr;
+       unsigned int i;
        int ret;
 
-       WARN_ON(engine->id != RCS);
+       if (WARN_ON(engine->id != RCS || !engine->scratch))
+               return -EINVAL;
 
-       /* update this when WA for higher Gen are added */
-       if (INTEL_GEN(engine->i915) > 9) {
-               DRM_ERROR("WA batch buffer is not initialized for Gen%d\n",
-                         INTEL_GEN(engine->i915));
+       switch (INTEL_GEN(engine->i915)) {
+       case 9:
+               wa_bb_fn[0] = gen9_init_indirectctx_bb;
+               wa_bb_fn[1] = gen9_init_perctx_bb;
+               break;
+       case 8:
+               wa_bb_fn[0] = gen8_init_indirectctx_bb;
+               wa_bb_fn[1] = gen8_init_perctx_bb;
+               break;
+       default:
+               MISSING_CASE(INTEL_GEN(engine->i915));
                return 0;
        }
 
-       /* some WA perform writes to scratch page, ensure it is valid */
-       if (!engine->scratch) {
-               DRM_ERROR("scratch page not allocated for %s\n", engine->name);
-               return -EINVAL;
-       }
-
-       ret = lrc_setup_wa_ctx_obj(engine, PAGE_SIZE);
+       ret = lrc_setup_wa_ctx(engine);
        if (ret) {
                DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
                return ret;
        }
 
        page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
-       batch = kmap_atomic(page);
-       offset = 0;
-
-       if (IS_GEN8(engine->i915)) {
-               ret = gen8_init_indirectctx_bb(engine,
-                                              &wa_ctx->indirect_ctx,
-                                              batch,
-                                              &offset);
-               if (ret)
-                       goto out;
+       batch = batch_ptr = kmap_atomic(page);
 
-               ret = gen8_init_perctx_bb(engine,
-                                         &wa_ctx->per_ctx,
-                                         batch,
-                                         &offset);
-               if (ret)
-                       goto out;
-       } else if (IS_GEN9(engine->i915)) {
-               ret = gen9_init_indirectctx_bb(engine,
-                                              &wa_ctx->indirect_ctx,
-                                              batch,
-                                              &offset);
-               if (ret)
-                       goto out;
-
-               ret = gen9_init_perctx_bb(engine,
-                                         &wa_ctx->per_ctx,
-                                         batch,
-                                         &offset);
-               if (ret)
-                       goto out;
+       /*
+        * Emit the two workaround batch buffers, recording the offset from the
+        * start of the workaround batch buffer object for each and their
+        * respective sizes.
+        */
+       for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
+               wa_bb[i]->offset = batch_ptr - batch;
+               if (WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, CACHELINE_BYTES))) {
+                       ret = -EINVAL;
+                       break;
+               }
+               batch_ptr = wa_bb_fn[i](engine, batch_ptr);
+               wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
        }
 
-out:
+       BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+
        kunmap_atomic(batch);
        if (ret)
-               lrc_destroy_wa_ctx_obj(engine);
+               lrc_destroy_wa_ctx(engine);
 
        return ret;
 }
 
+static u32 port_seqno(struct execlist_port *port)
+{
+       return port->request ? port->request->global_seqno : 0;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
        struct drm_i915_private *dev_priv = engine->i915;
@@ -1330,7 +1158,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
        I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
        I915_WRITE(RING_MODE_GEN7(engine),
-                  _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
                   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
        I915_WRITE(RING_HWS_PGA(engine->mmio_base),
                   engine->status_page.ggtt_offset);
@@ -1339,7 +1166,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
        DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 
        /* After a GPU reset, we may have requests to replay */
-       if (!execlists_elsp_idle(engine)) {
+       clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+       if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
+               DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
+                                engine->name,
+                                port_seqno(&engine->execlist_port[0]),
+                                port_seqno(&engine->execlist_port[1]));
                engine->execlist_port[0].count = 0;
                engine->execlist_port[1].count = 0;
                execlists_submit_ports(engine);
@@ -1384,7 +1216,6 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
 static void reset_common_ring(struct intel_engine_cs *engine,
                              struct drm_i915_gem_request *request)
 {
-       struct drm_i915_private *dev_priv = engine->i915;
        struct execlist_port *port = engine->execlist_port;
        struct intel_context *ce;
 
@@ -1418,14 +1249,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
        ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
 
        request->ring->head = request->postfix;
-       request->ring->last_retired_head = -1;
        intel_ring_update_space(request->ring);
 
-       if (i915.enable_guc_submission)
-               return;
-
        /* Catch up with any missed context-switch interrupts */
-       I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
        if (request->ctx != port[0].request->ctx) {
                i915_gem_request_put(port[0].request);
                port[0] = port[1];
@@ -1438,42 +1264,42 @@ static void reset_common_ring(struct intel_engine_cs *engine,
        request->tail =
                intel_ring_wrap(request->ring,
                                request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
+       assert_ring_tail_valid(request->ring, request->tail);
 }
 
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
 {
        struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
-       struct intel_ring *ring = req->ring;
        struct intel_engine_cs *engine = req->engine;
-       const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
-       int i, ret;
+       const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
+       u32 *cs;
+       int i;
 
-       ret = intel_ring_begin(req, num_lri_cmds * 2 + 2);
-       if (ret)
-               return ret;
+       cs = intel_ring_begin(req, num_lri_cmds * 2 + 2);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
 
-       intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds));
-       for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
+       *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
+       for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
                const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
 
-               intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i));
-               intel_ring_emit(ring, upper_32_bits(pd_daddr));
-               intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i));
-               intel_ring_emit(ring, lower_32_bits(pd_daddr));
+               *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
+               *cs++ = upper_32_bits(pd_daddr);
+               *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
+               *cs++ = lower_32_bits(pd_daddr);
        }
 
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
+       *cs++ = MI_NOOP;
+       intel_ring_advance(req, cs);
 
        return 0;
 }
 
 static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
                              u64 offset, u32 len,
-                             unsigned int dispatch_flags)
+                             const unsigned int flags)
 {
-       struct intel_ring *ring = req->ring;
-       bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
+       u32 *cs;
        int ret;
 
        /* Don't rely on hw updating PDPs, especially in lite-restore.
@@ -1483,30 +1309,28 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
         * not idle). PML4 is allocated during ppgtt init so this is
         * not needed in 48-bit.*/
        if (req->ctx->ppgtt &&
-           (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings)) {
-               if (!USES_FULL_48BIT_PPGTT(req->i915) &&
-                   !intel_vgpu_active(req->i915)) {
-                       ret = intel_logical_ring_emit_pdps(req);
-                       if (ret)
-                               return ret;
-               }
+           (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings) &&
+           !i915_vm_is_48bit(&req->ctx->ppgtt->base) &&
+           !intel_vgpu_active(req->i915)) {
+               ret = intel_logical_ring_emit_pdps(req);
+               if (ret)
+                       return ret;
 
                req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
        }
 
-       ret = intel_ring_begin(req, 4);
-       if (ret)
-               return ret;
+       cs = intel_ring_begin(req, 4);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
 
        /* FIXME(BDW): Address space and security selectors. */
-       intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 |
-                       (ppgtt<<8) |
-                       (dispatch_flags & I915_DISPATCH_RS ?
-                        MI_BATCH_RESOURCE_STREAMER : 0));
-       intel_ring_emit(ring, lower_32_bits(offset));
-       intel_ring_emit(ring, upper_32_bits(offset));
-       intel_ring_emit(ring, MI_NOOP);
-       intel_ring_advance(ring);
+       *cs++ = MI_BATCH_BUFFER_START_GEN8 |
+               (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
+               (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
+       *cs++ = lower_32_bits(offset);
+       *cs++ = upper_32_bits(offset);
+       *cs++ = MI_NOOP;
+       intel_ring_advance(req, cs);
 
        return 0;
 }
@@ -1527,13 +1351,11 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
 
 static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 {
-       struct intel_ring *ring = request->ring;
-       u32 cmd;
-       int ret;
+       u32 cmd, *cs;
 
-       ret = intel_ring_begin(request, 4);
-       if (ret)
-               return ret;
+       cs = intel_ring_begin(request, 4);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
 
        cmd = MI_FLUSH_DW + 1;
 
@@ -1550,13 +1372,11 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
                        cmd |= MI_INVALIDATE_BSD;
        }
 
-       intel_ring_emit(ring, cmd);
-       intel_ring_emit(ring,
-                       I915_GEM_HWS_SCRATCH_ADDR |
-                       MI_FLUSH_DW_USE_GTT);
-       intel_ring_emit(ring, 0); /* upper addr */
-       intel_ring_emit(ring, 0); /* value */
-       intel_ring_advance(ring);
+       *cs++ = cmd;
+       *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
+       *cs++ = 0; /* upper addr */
+       *cs++ = 0; /* value */
+       intel_ring_advance(request, cs);
 
        return 0;
 }
@@ -1564,13 +1384,11 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode)
 static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
                                  u32 mode)
 {
-       struct intel_ring *ring = request->ring;
        struct intel_engine_cs *engine = request->engine;
        u32 scratch_addr =
                i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
        bool vf_flush_wa = false, dc_flush_wa = false;
-       u32 flags = 0;
-       int ret;
+       u32 *cs, flags = 0;
        int len;
 
        flags |= PIPE_CONTROL_CS_STALL;
@@ -1612,62 +1430,25 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
        if (dc_flush_wa)
                len += 12;
 
-       ret = intel_ring_begin(request, len);
-       if (ret)
-               return ret;
+       cs = intel_ring_begin(request, len);
+       if (IS_ERR(cs))
+               return PTR_ERR(cs);
 
-       if (vf_flush_wa) {
-               intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-       }
+       if (vf_flush_wa)
+               cs = gen8_emit_pipe_control(cs, 0, 0);
 
-       if (dc_flush_wa) {
-               intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-               intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-       }
+       if (dc_flush_wa)
+               cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
+                                           0);
 
-       intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-       intel_ring_emit(ring, flags);
-       intel_ring_emit(ring, scratch_addr);
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, 0);
-       intel_ring_emit(ring, 0);
-
-       if (dc_flush_wa) {
-               intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-               intel_ring_emit(ring, PIPE_CONTROL_CS_STALL);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-               intel_ring_emit(ring, 0);
-       }
+       cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
 
-       intel_ring_advance(ring);
+       if (dc_flush_wa)
+               cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
 
-       return 0;
-}
+       intel_ring_advance(request, cs);
 
-static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
-{
-       /*
-        * On BXT A steppings there is a HW coherency issue whereby the
-        * MI_STORE_DATA_IMM storing the completed request's seqno
-        * occasionally doesn't invalidate the CPU cache. Work around this by
-        * clflushing the corresponding cacheline whenever the caller wants
-        * the coherency to be guaranteed. Note that this cacheline is known
-        * to be clean at this point, since we only write it in
-        * bxt_a_set_seqno(), where we also do a clflush after the write. So
-        * this clflush in practice becomes an invalidate operation.
-        */
-       intel_flush_status_page(engine, I915_GEM_HWS_INDEX);
+       return 0;
 }
 
 /*
@@ -1675,34 +1456,34 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
  * used as a workaround for not being allowed to do lite
  * restore with HEAD==TAIL (WaIdleLiteRestore).
  */
-static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out)
+static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
 {
-       *out++ = MI_NOOP;
-       *out++ = MI_NOOP;
-       request->wa_tail = intel_ring_offset(request->ring, out);
+       *cs++ = MI_NOOP;
+       *cs++ = MI_NOOP;
+       request->wa_tail = intel_ring_offset(request, cs);
 }
 
-static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request,
-                                u32 *out)
+static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
 {
        /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-       *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
-       *out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
-       *out++ = 0;
-       *out++ = request->global_seqno;
-       *out++ = MI_USER_INTERRUPT;
-       *out++ = MI_NOOP;
-       request->tail = intel_ring_offset(request->ring, out);
+       *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+       *cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
+       *cs++ = 0;
+       *cs++ = request->global_seqno;
+       *cs++ = MI_USER_INTERRUPT;
+       *cs++ = MI_NOOP;
+       request->tail = intel_ring_offset(request, cs);
+       assert_ring_tail_valid(request->ring, request->tail);
 
-       gen8_emit_wa_tail(request, out);
+       gen8_emit_wa_tail(request, cs);
 }
 
 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
-                                       u32 *out)
+                                       u32 *cs)
 {
        /* We're using qword write, seqno should be aligned to 8 bytes. */
        BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
@@ -1711,20 +1492,20 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
         * need a prior CS_STALL, which is emitted by the flush
         * following the batch.
         */
-       *out++ = GFX_OP_PIPE_CONTROL(6);
-       *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
-                 PIPE_CONTROL_CS_STALL |
-                 PIPE_CONTROL_QW_WRITE);
-       *out++ = intel_hws_seqno_address(request->engine);
-       *out++ = 0;
-       *out++ = request->global_seqno;
+       *cs++ = GFX_OP_PIPE_CONTROL(6);
+       *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL |
+               PIPE_CONTROL_QW_WRITE;
+       *cs++ = intel_hws_seqno_address(request->engine);
+       *cs++ = 0;
+       *cs++ = request->global_seqno;
        /* We're thrashing one dword of HWS. */
-       *out++ = 0;
-       *out++ = MI_USER_INTERRUPT;
-       *out++ = MI_NOOP;
-       request->tail = intel_ring_offset(request->ring, out);
+       *cs++ = 0;
+       *cs++ = MI_USER_INTERRUPT;
+       *cs++ = MI_NOOP;
+       request->tail = intel_ring_offset(request, cs);
+       assert_ring_tail_valid(request->ring, request->tail);
 
-       gen8_emit_wa_tail(request, out);
+       gen8_emit_wa_tail(request, cs);
 }
 
 static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
@@ -1733,7 +1514,7 @@ static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
 {
        int ret;
 
-       ret = intel_logical_ring_workarounds_emit(req);
+       ret = intel_ring_workarounds_emit(req);
        if (ret)
                return ret;
 
@@ -1779,21 +1560,17 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 
        intel_engine_cleanup_common(engine);
 
-       lrc_destroy_wa_ctx_obj(engine);
+       lrc_destroy_wa_ctx(engine);
        engine->i915 = NULL;
        dev_priv->engine[engine->id] = NULL;
        kfree(engine);
 }
 
-void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
+static void execlists_set_default_submission(struct intel_engine_cs *engine)
 {
-       struct intel_engine_cs *engine;
-       enum intel_engine_id id;
-
-       for_each_engine(engine, dev_priv, id) {
-               engine->submit_request = execlists_submit_request;
-               engine->schedule = execlists_schedule;
-       }
+       engine->submit_request = execlists_submit_request;
+       engine->schedule = execlists_schedule;
+       engine->irq_tasklet.func = intel_lrc_irq_handler;
 }
 
 static void
@@ -1811,14 +1588,12 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
        engine->emit_flush = gen8_emit_flush;
        engine->emit_breadcrumb = gen8_emit_breadcrumb;
        engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
-       engine->submit_request = execlists_submit_request;
-       engine->schedule = execlists_schedule;
+
+       engine->set_default_submission = execlists_set_default_submission;
 
        engine->irq_enable = gen8_logical_ring_enable_irq;
        engine->irq_disable = gen8_logical_ring_disable_irq;
        engine->emit_bb_start = gen8_emit_bb_start;
-       if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1))
-               engine->irq_seqno_barrier = bxt_a_seqno_barrier;
 }
 
 static inline void
@@ -1875,7 +1650,6 @@ logical_ring_setup(struct intel_engine_cs *engine)
        tasklet_init(&engine->irq_tasklet,
                     intel_lrc_irq_handler, (unsigned long)engine);
 
-       logical_ring_init_platform_invariants(engine);
        logical_ring_default_vfuncs(engine);
        logical_ring_default_irqs(engine);
 }
@@ -2013,105 +1787,89 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
        return indirect_ctx_offset;
 }
 
-static void execlists_init_reg_state(u32 *reg_state,
+static void execlists_init_reg_state(u32 *regs,
                                     struct i915_gem_context *ctx,
                                     struct intel_engine_cs *engine,
                                     struct intel_ring *ring)
 {
        struct drm_i915_private *dev_priv = engine->i915;
        struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
+       u32 base = engine->mmio_base;
+       bool rcs = engine->id == RCS;
+
+       /* A context is actually a big batch buffer with several
+        * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
+        * values we are setting here are only for the first context restore:
+        * on a subsequent save, the GPU will recreate this batchbuffer with new
+        * values (including all the missing MI_LOAD_REGISTER_IMM commands that
+        * we are not initializing here).
+        */
+       regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
+                                MI_LRI_FORCE_POSTED;
+
+       CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
+               _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
+                                  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
+                                  (HAS_RESOURCE_STREAMER(dev_priv) ?
+                                  CTX_CTRL_RS_CTX_ENABLE : 0)));
+       CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
+       CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
+       CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
+       CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
+               RING_CTL_SIZE(ring->size) | RING_VALID);
+       CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
+       CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
+       CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
+       CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
+       CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
+       CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
+       if (rcs) {
+               CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
+               CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
+               CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
+                       RING_INDIRECT_CTX_OFFSET(base), 0);
 
-       /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
-        * commands followed by (reg, value) pairs. The values we are setting here are
-        * only for the first context restore: on a subsequent save, the GPU will
-        * recreate this batchbuffer with new values (including all the missing
-        * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */
-       reg_state[CTX_LRI_HEADER_0] =
-               MI_LOAD_REGISTER_IMM(engine->id == RCS ? 14 : 11) | MI_LRI_FORCE_POSTED;
-       ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL,
-                      RING_CONTEXT_CONTROL(engine),
-                      _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
-                                         CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                         (HAS_RESOURCE_STREAMER(dev_priv) ?
-                                          CTX_CTRL_RS_CTX_ENABLE : 0)));
-       ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START,
-                      RING_START(engine->mmio_base), 0);
-       ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL,
-                      RING_CTL(engine->mmio_base),
-                      RING_CTL_SIZE(ring->size) | RING_VALID);
-       ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U,
-                      RING_BBADDR_UDW(engine->mmio_base), 0);
-       ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L,
-                      RING_BBADDR(engine->mmio_base), 0);
-       ASSIGN_CTX_REG(reg_state, CTX_BB_STATE,
-                      RING_BBSTATE(engine->mmio_base),
-                      RING_BB_PPGTT);
-       ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U,
-                      RING_SBBADDR_UDW(engine->mmio_base), 0);
-       ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L,
-                      RING_SBBADDR(engine->mmio_base), 0);
-       ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE,
-                      RING_SBBSTATE(engine->mmio_base), 0);
-       if (engine->id == RCS) {
-               ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR,
-                              RING_BB_PER_CTX_PTR(engine->mmio_base), 0);
-               ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX,
-                              RING_INDIRECT_CTX(engine->mmio_base), 0);
-               ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET,
-                              RING_INDIRECT_CTX_OFFSET(engine->mmio_base), 0);
                if (engine->wa_ctx.vma) {
                        struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
                        u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 
-                       reg_state[CTX_RCS_INDIRECT_CTX+1] =
-                               (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
-                               (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
+                       regs[CTX_RCS_INDIRECT_CTX + 1] =
+                               (ggtt_offset + wa_ctx->indirect_ctx.offset) |
+                               (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
 
-                       reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+                       regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
                                intel_lr_indirect_ctx_offset(engine) << 6;
 
-                       reg_state[CTX_BB_PER_CTX_PTR+1] =
-                               (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
-                               0x01;
+                       regs[CTX_BB_PER_CTX_PTR + 1] =
+                               (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
                }
        }
-       reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
-       ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP,
-                      RING_CTX_TIMESTAMP(engine->mmio_base), 0);
+
+       regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
+
+       CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
        /* PDP values will be assigned later if needed */
-       ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0),
-                      0);
-       ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0),
-                      0);
-
-       if (ppgtt && USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
+       CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
+       CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
+       CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
+       CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
+       CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
+       CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
+       CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
+       CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
+
+       if (ppgtt && i915_vm_is_48bit(&ppgtt->base)) {
                /* 64b PPGTT (48bit canonical)
                 * PDP0_DESCRIPTOR contains the base address to PML4 and
                 * other PDP Descriptors are ignored.
                 */
-               ASSIGN_CTX_PML4(ppgtt, reg_state);
+               ASSIGN_CTX_PML4(ppgtt, regs);
        }
 
-       if (engine->id == RCS) {
-               reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
-               ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
-                              make_rpcs(dev_priv));
+       if (rcs) {
+               regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
+               CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
+                       make_rpcs(dev_priv));
        }
 }
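
For readers new to the register-state image: as the comment at the top of execlists_init_reg_state() explains, the image is a series of MI_LOAD_REGISTER_IMM headers followed by (register, value) pairs, and CTX_REG() stores one such pair into two consecutive dwords. A rough, illustrative expansion of a single call (not a literal preprocessor dump):

	/* CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0) is roughly: */
	regs[CTX_RING_TAIL + 0] = i915_mmio_reg_offset(RING_TAIL(base));	/* which register to load */
	regs[CTX_RING_TAIL + 1] = 0;						/* value restored into it */
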
 
@@ -2231,7 +1989,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 
        ce->ring = ring;
        ce->state = vma;
-       ce->initialised = engine->init_context == NULL;
+       ce->initialised |= engine->init_context == NULL;
 
        return 0;
 
@@ -2279,7 +2037,6 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
                        i915_gem_object_unpin_map(ce->state->obj);
 
                        ce->ring->head = ce->ring->tail = 0;
-                       ce->ring->last_retired_head = -1;
                        intel_ring_update_space(ce->ring);
                }
        }