drm/i915: Don't count semaphore waits towards a stuck ring

author Chris Wilson <chris@chris-wilson.co.uk>

Mon, 10 Jun 2013 10:20:21 +0000 (11:20 +0100)

committer Daniel Vetter <daniel.vetter@ffwll.ch>

Tue, 11 Jun 2013 09:49:28 +0000 (11:49 +0200)
author Chris Wilson <chris@chris-wilson.co.uk>
Mon, 10 Jun 2013 10:20:21 +0000 (11:20 +0100)
committer Daniel Vetter <daniel.vetter@ffwll.ch>
Tue, 11 Jun 2013 09:49:28 +0000 (11:49 +0200)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c

index 6a757691488e7d711b35f3076fe24a89f48318b5..563abe79bb206992978f6f7117a7db86798292e7 100644 (file)
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2321,21 +2321,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
                 i915_seqno_passed(seqno, ring_last_seqno(ring)));
  }
  
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
  {
         struct drm_i915_private *dev_priv = ring->dev->dev_private;
-       u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-       struct intel_ring_buffer *signaller;
-       u32 cmd, ipehr, acthd_min;
+       u32 cmd, ipehr, acthd, acthd_min;
  
         ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
         if ((ipehr & ~(0x3 << 16)) !=
             (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-               return false;
+               return NULL;
  
         /* ACTHD is likely pointing to the dword after the actual command,
          * so scan backwards until we find the MBOX.
          */
+       acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
         acthd_min = max((int)acthd - 3 * 4, 0);
         do {
                 cmd = ioread32(ring->virtual_start + acthd);
@@ -2344,22 +2344,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
  
                 acthd -= 4;
                 if (acthd < acthd_min)
-                       return false;
+                       return NULL;
         } while (1);
  
-       signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-       return i915_seqno_passed(signaller->get_seqno(signaller, false),
-                                ioread32(ring->virtual_start+acthd+4)+1);
+       *seqno = ioread32(ring->virtual_start+acthd+4)+1;
+       return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
+}
+
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+       struct drm_i915_private *dev_priv = ring->dev->dev_private;
+       struct intel_ring_buffer *signaller;
+       u32 seqno, ctl;
+
+       ring->hangcheck.deadlock = true;
+
+       signaller = semaphore_waits_for(ring, &seqno);
+       if (signaller == NULL || signaller->hangcheck.deadlock)
+               return -1;
+
+       /* cursory check for an unkickable deadlock */
+       ctl = I915_READ_CTL(signaller);
+       if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+               return -1;
+
+       return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
+}
+
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+       struct intel_ring_buffer *ring;
+       int i;
+
+       for_each_ring(ring, dev_priv, i)
+               ring->hangcheck.deadlock = false;
  }
  
-static bool ring_hung(struct intel_ring_buffer *ring)
+static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
  {
         struct drm_device *dev = ring->dev;
         struct drm_i915_private *dev_priv = dev->dev_private;
         u32 tmp;
  
+       if (ring->hangcheck.acthd != acthd)
+               return active;
+
         if (IS_GEN2(dev))
-               return true;
+               return hung;
  
         /* Is the chip hanging on a WAIT_FOR_EVENT?
          * If so we can simply poke the RB_WAIT bit
@@ -2371,19 +2402,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
                 DRM_ERROR("Kicking stuck wait on %s\n",
                           ring->name);
                 I915_WRITE_CTL(ring, tmp);
-               return false;
-       }
-
-       if (INTEL_INFO(dev)->gen >= 6 &&
-           tmp & RING_WAIT_SEMAPHORE &&
-           semaphore_passed(ring)) {
-               DRM_ERROR("Kicking stuck semaphore on %s\n",
-                         ring->name);
-               I915_WRITE_CTL(ring, tmp);
-               return false;
+               return kick;
+       }
+
+       if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+               switch (semaphore_passed(ring)) {
+               default:
+                       return hung;
+               case 1:
+                       DRM_ERROR("Kicking stuck semaphore on %s\n",
+                                 ring->name);
+                       I915_WRITE_CTL(ring, tmp);
+                       return kick;
+               case 0:
+                       return wait;
+               }
         }
  
-       return true;
+       return hung;
  }
  
  /**
@@ -2414,6 +2450,8 @@ void i915_hangcheck_elapsed(unsigned long data)
                 u32 seqno, acthd;
                 bool busy = true;
  
+               semaphore_clear_deadlocks(dev_priv);
+
                 seqno = ring->get_seqno(ring, false);
                 acthd = intel_ring_get_active_head(ring);
  
@@ -2430,17 +2468,36 @@ void i915_hangcheck_elapsed(unsigned long data)
                         } else {
                                 int score;
  
-                               stuck[i] = ring->hangcheck.acthd == acthd;
-                               if (stuck[i]) {
-                                       /* Every time we kick the ring, add a
-                                        * small increment to the hangcheck
-                                        * score so that we can catch a
-                                        * batch that is repeatedly kicked.
-                                        */
-                                       score = ring_hung(ring) ? HUNG : KICK;
-                               } else
+                               /* We always increment the hangcheck score
+                                * if the ring is busy and still processing
+                                * the same request, so that no single request
+                                * can run indefinitely (such as a chain of
+                                * batches). The only time we do not increment
+                                * the hangcheck score on this ring, if this
+                                * ring is in a legitimate wait for another
+                                * ring. In that case the waiting ring is a
+                                * victim and we want to be sure we catch the
+                                * right culprit. Then every time we do kick
+                                * the ring, add a small increment to the
+                                * score so that we can catch a batch that is
+                                * being repeatedly kicked and so responsible
+                                * for stalling the machine.
+                                */
+                               switch (ring_stuck(ring, acthd)) {
+                               case wait:
+                                       score = 0;
+                                       break;
+                               case active:
                                         score = BUSY;
-
+                                       break;
+                               case kick:
+                                       score = KICK;
+                                       break;
+                               case hung:
+                                       score = HUNG;
+                                       stuck[i] = true;
+                                       break;
+                               }
                                 ring->hangcheck.score += score;
                         }
                 } else {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h

index efc403d1e3e04c90b5939a71fb184839e2d7fcc2..a3e96103dbe54a71e2e8f54327358c6de03220cb 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -38,6 +38,7 @@ struct  intel_hw_status_page {
  #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
  
  struct intel_ring_hangcheck {
+       bool deadlock;
         u32 seqno;
         u32 acthd;
         int score;
author	Chris Wilson <chris@chris-wilson.co.uk>
	Mon, 10 Jun 2013 10:20:21 +0000 (11:20 +0100)
committer	Daniel Vetter <daniel.vetter@ffwll.ch>
	Tue, 11 Jun 2013 09:49:28 +0000 (11:49 +0200)
drivers/gpu/drm/i915/i915_irq.c		patch \| blob \| history
drivers/gpu/drm/i915/intel_ringbuffer.h		patch \| blob \| history