Merge remote-tracking branch 'xfs/for-next'
index 19c8b980d1feff430d82297a53be528cea801c42..db9d241af133d770cb0a95a22cacf257f79b1215 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -252,71 +252,113 @@ static void sem_rcu_free(struct rcu_head *head)
        ipc_rcu_free(head);
 }
 
+/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+       int i;
+       struct sem *sem;
+
+       if (sma->complex_count)  {
+               /* The thread that increased sma->complex_count waited on
+                * all sem->lock locks. Thus we don't need to wait again.
+                */
+               return;
+       }
+
+       for (i = 0; i < sma->sem_nsems; i++) {
+               sem = sma->sem_base + i;
+               spin_unlock_wait(&sem->lock);
+       }
+}
+
 /*
  * If the request contains only one semaphore operation, and there are
  * no complex transactions pending, lock only the semaphore involved.
  * Otherwise, lock the entire semaphore array, since we either have
  * multiple semaphores in our own semops, or we need to look at
  * semaphores from other pending complex operations.
- *
- * Carefully guard against sma->complex_count changing between zero
- * and non-zero while we are spinning for the lock. The value of
- * sma->complex_count cannot change while we are holding the lock,
- * so sem_unlock should be fine.
- *
- * The global lock path checks that all the local locks have been released,
- * checking each local lock once. This means that the local lock paths
- * cannot start their critical sections while the global lock is held.
  */
 static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
                              int nsops)
 {
-       int locknum;
- again:
-       if (nsops == 1 && !sma->complex_count) {
-               struct sem *sem = sma->sem_base + sops->sem_num;
+       struct sem *sem;
 
-               /* Lock just the semaphore we are interested in. */
-               spin_lock(&sem->lock);
+       if (nsops != 1) {
+               /* Complex operation - acquire a full lock */
+               ipc_lock_object(&sma->sem_perm);
 
-               /*
-                * If sma->complex_count was set while we were spinning,
-                * we may need to look at things we did not lock here.
+               /* And wait until all simple ops that are processed
+                * right now have dropped their locks.
                 */
-               if (unlikely(sma->complex_count)) {
-                       spin_unlock(&sem->lock);
-                       goto lock_array;
-               }
+               sem_wait_array(sma);
+               return -1;
+       }
+
+       /*
+        * Only one semaphore affected - try to optimize locking.
+        * The rules are:
+        * - optimized locking is possible if no complex operation
+        *   is either enqueued or processed right now.
+        * - The test for enqueued complex ops is simple:
+        *      sma->complex_count != 0
+        * - Testing for complex ops that are processed right now is
+        *   a bit more difficult. Complex ops acquire the full lock
+        *   and first wait until the running simple ops have completed.
+        *   (see above)
+        *   Thus: If we own a simple lock and the global lock is free
+        *      and complex_count is now 0, then it will stay 0 and
+        *      thus just locking sem->lock is sufficient.
+        */
+       sem = sma->sem_base + sops->sem_num;
 
+       if (sma->complex_count == 0) {
                /*
-                * Another process is holding the global lock on the
-                * sem_array; we cannot enter our critical section,
-                * but have to wait for the global lock to be released.
+                * It appears that no complex operation is around.
+                * Acquire the per-semaphore lock.
                 */
-               if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
-                       spin_unlock(&sem->lock);
-                       spin_unlock_wait(&sma->sem_perm.lock);
-                       goto again;
+               spin_lock(&sem->lock);
+
+               /* Then check that the global lock is free */
+               if (!spin_is_locked(&sma->sem_perm.lock)) {
+                       /* spin_is_locked() is not a memory barrier */
+                       smp_mb();
+
+                       /* Now repeat the test of complex_count:
+                        * It can't change anymore until we drop sem->lock.
+                        * Thus: if it is now 0, then it will stay 0.
+                        */
+                       if (sma->complex_count == 0) {
+                               /* fast path successful! */
+                               return sops->sem_num;
+                       }
                }
+               spin_unlock(&sem->lock);
+       }
+
+       /* slow path: acquire the full lock */
+       ipc_lock_object(&sma->sem_perm);
 
-               locknum = sops->sem_num;
+       if (sma->complex_count == 0) {
+               /* False alarm:
+                * There is no complex operation, thus we can switch
+                * back to the fast path.
+                */
+               spin_lock(&sem->lock);
+               ipc_unlock_object(&sma->sem_perm);
+               return sops->sem_num;
        } else {
-               int i;
-               /*
-                * Lock the semaphore array, and wait for all of the
-                * individual semaphore locks to go away.  The code
-                * above ensures no new single-lock holders will enter
-                * their critical section while the array lock is held.
+               /* Not a false alarm, thus complete the sequence for a
+                * full lock.
                 */
- lock_array:
-               ipc_lock_object(&sma->sem_perm);
-               for (i = 0; i < sma->sem_nsems; i++) {
-                       struct sem *sem = sma->sem_base + i;
-                       spin_unlock_wait(&sem->lock);
-               }
-               locknum = -1;
+               sem_wait_array(sma);
+               return -1;
        }
-       return locknum;
 }
 
 static inline void sem_unlock(struct sem_array *sma, int locknum)
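
For readers following the reworked locking policy, here is a minimal user-space sketch of the decision sem_lock() makes after this change. It is an illustration only, not kernel code: the toy_* names are invented, pthread mutexes stand in for the spinlocks, pthread_mutex_trylock() approximates spin_is_locked(), and the sem_wait_array() step of the slow path is omitted.

/*
 * Simplified model of the sem_lock() fast/slow path decision above.
 * Illustration only; none of these names exist in ipc/sem.c.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_sem {
	pthread_mutex_t lock;			/* models sem->lock */
};

struct toy_sem_array {
	pthread_mutex_t global_lock;		/* models sma->sem_perm.lock */
	atomic_int complex_count;		/* models sma->complex_count */
	struct toy_sem sems[4];
};

/* Returns the semaphore index on the fast path, -1 when the full lock was taken. */
static int toy_sem_lock(struct toy_sem_array *sma, int semnum, int nsops)
{
	struct toy_sem *sem = &sma->sems[semnum];

	if (nsops != 1) {
		/* Complex operation: always take the full lock. */
		pthread_mutex_lock(&sma->global_lock);
		return -1;
	}

	if (atomic_load(&sma->complex_count) == 0) {
		/* Optimistic fast path: per-semaphore lock first ... */
		pthread_mutex_lock(&sem->lock);
		/* ... then check that the global lock is free ... */
		if (pthread_mutex_trylock(&sma->global_lock) == 0) {
			/* ... and re-check complex_count under sem->lock. */
			int ok = (atomic_load(&sma->complex_count) == 0);

			pthread_mutex_unlock(&sma->global_lock);
			if (ok)
				return semnum;	/* fast path successful */
		}
		pthread_mutex_unlock(&sem->lock);
	}

	/* Slow path: take the full lock. */
	pthread_mutex_lock(&sma->global_lock);
	if (atomic_load(&sma->complex_count) == 0) {
		/* False alarm: downgrade back to the per-semaphore lock. */
		pthread_mutex_lock(&sem->lock);
		pthread_mutex_unlock(&sma->global_lock);
		return semnum;
	}
	return -1;
}

int main(void)
{
	static struct toy_sem_array sma = {
		.global_lock = PTHREAD_MUTEX_INITIALIZER,
		.sems = {
			{ PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
			{ PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
		},
	};

	/* Single-threaded demo: one simple op takes the fast path. */
	printf("locknum = %d\n", toy_sem_lock(&sma, 1, 1));
	return 0;
}
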
@@ -875,6 +917,24 @@ again:
        return semop_completed;
 }
 
+/**
+ * set_semotime(sma, sops) - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line thrashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+       if (sops == NULL) {
+               sma->sem_base[0].sem_otime = get_seconds();
+       } else {
+               sma->sem_base[sops[0].sem_num].sem_otime =
+                                                       get_seconds();
+       }
+}
+
 /**
  * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
  * @sma: semaphore array
@@ -925,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
                        }
                }
        }
-       if (otime) {
-               if (sops == NULL) {
-                       sma->sem_base[0].sem_otime = get_seconds();
-               } else {
-                       sma->sem_base[sops[0].sem_num].sem_otime =
-                                                               get_seconds();
-               }
-       }
+       if (otime)
+               set_semotime(sma, sops);
 }
 
-
 /* The following counts are associated to each semaphore:
  *   semncnt        number of tasks waiting on semval being nonzero
  *   semzcnt        number of tasks waiting on semval being zero
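
set_semotime() above stamps only one of the replicated per-semaphore sem_otime slots (slot 0 for semctl-style updates, otherwise the slot of the first semaphore in the sop array), so any reader has to combine all slots. The sketch below shows such a reader; it assumes, as get_semotime() elsewhere in sem.c appears to do, that the newest of the replicated values is the one reported. The toy_* names are made up for illustration.

/*
 * Sketch of a reader for the replicated sem_otime values.
 * Assumption: the most recent of the per-semaphore timestamps wins.
 */
#include <time.h>

struct toy_sem {
	time_t sem_otime;	/* one replicated copy per semaphore */
};

static time_t toy_get_semotime(const struct toy_sem *base, int nsems)
{
	time_t res = base[0].sem_otime;

	for (int i = 1; i < nsems; i++) {
		if (base[i].sem_otime > res)
			res = base[i].sem_otime;
	}
	return res;
}
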
@@ -1229,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
 
        sem_lock(sma, NULL, -1);
 
+       if (sma->sem_perm.deleted) {
+               sem_unlock(sma, -1);
+               rcu_read_unlock();
+               return -EIDRM;
+       }
+
        curr = &sma->sem_base[semnum];
 
        ipc_assert_locked_object(&sma->sem_perm);
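
The sem_perm.deleted test added here (and repeated after the other sem_lock(sma, NULL, -1) call sites below) guards against the array being removed by IPC_RMID between the RCU lookup and the lock acquisition. The recurring pattern could be captured by a small helper along the lines of the following sketch; sem_lock_check_deleted() is a hypothetical name used only for illustration and is not something this patch introduces.

/*
 * Hypothetical helper (not part of this patch) for the recurring
 * "lock, then re-check sem_perm.deleted" pattern. On success it
 * returns 0 and stores the lock token in *locknum; if the array was
 * removed by a concurrent IPC_RMID it drops all locks and returns -EIDRM.
 */
static int sem_lock_check_deleted(struct sem_array *sma,
				  struct sembuf *sops, int nsops,
				  int *locknum)
{
	*locknum = sem_lock(sma, sops, nsops);
	if (!sma->sem_perm.deleted)
		return 0;

	sem_unlock(sma, *locknum);
	rcu_read_unlock();
	return -EIDRM;
}
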
@@ -1283,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                int i;
 
                sem_lock(sma, NULL, -1);
+               if (sma->sem_perm.deleted) {
+                       err = -EIDRM;
+                       goto out_unlock;
+               }
                if(nsems > SEMMSL_FAST) {
                        if (!ipc_rcu_getref(sma)) {
-                               sem_unlock(sma, -1);
-                               rcu_read_unlock();
                                err = -EIDRM;
-                               goto out_free;
+                               goto out_unlock;
                        }
                        sem_unlock(sma, -1);
                        rcu_read_unlock();
@@ -1301,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                        rcu_read_lock();
                        sem_lock_and_putref(sma);
                        if (sma->sem_perm.deleted) {
-                               sem_unlock(sma, -1);
-                               rcu_read_unlock();
                                err = -EIDRM;
-                               goto out_free;
+                               goto out_unlock;
                        }
                }
                for (i = 0; i < sma->sem_nsems; i++)
@@ -1322,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                struct sem_undo *un;
 
                if (!ipc_rcu_getref(sma)) {
-                       rcu_read_unlock();
-                       return -EIDRM;
+                       err = -EIDRM;
+                       goto out_rcu_wakeup;
                }
                rcu_read_unlock();
 
@@ -1351,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                rcu_read_lock();
                sem_lock_and_putref(sma);
                if (sma->sem_perm.deleted) {
-                       sem_unlock(sma, -1);
-                       rcu_read_unlock();
                        err = -EIDRM;
-                       goto out_free;
+                       goto out_unlock;
                }
 
                for (i = 0; i < nsems; i++)
@@ -1378,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                goto out_rcu_wakeup;
 
        sem_lock(sma, NULL, -1);
+       if (sma->sem_perm.deleted) {
+               err = -EIDRM;
+               goto out_unlock;
+       }
        curr = &sma->sem_base[semnum];
 
        switch (cmd) {
@@ -1783,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
        if (error)
                goto out_rcu_wakeup;
 
+       error = -EIDRM;
+       locknum = sem_lock(sma, sops, nsops);
+       if (sma->sem_perm.deleted)
+               goto out_unlock_free;
        /*
         * semid identifiers are not unique - find_alloc_undo may have
         * allocated an undo structure, it was invalidated by an RMID
@@ -1790,19 +1855,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
         * This case can be detected checking un->semid. The existence of
         * "un" itself is guaranteed by rcu.
         */
-       error = -EIDRM;
-       locknum = sem_lock(sma, sops, nsops);
        if (un && un->semid == -1)
                goto out_unlock_free;
 
        error = perform_atomic_semop(sma, sops, nsops, un,
                                        task_tgid_vnr(current));
-       if (error <= 0) {
-               if (alter && error == 0)
+       if (error == 0) {
+               /* If the operation was successful, then do
+                * the required updates.
+                */
+               if (alter)
                        do_smart_update(sma, sops, nsops, 1, &tasks);
-
-               goto out_unlock_free;
+               else
+                       set_semotime(sma, sops);
        }
+       if (error <= 0)
+               goto out_unlock_free;
 
        /* We need to sleep on this operation, so we put the current
         * task into the pending queue and go to sleep.
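
The reshuffled branches above rely on a three-way result convention for perform_atomic_semop() that is easy to miss on first read. The small sketch below restates it as inferred from the control flow; the enum and function name are made up for illustration and do not exist in sem.c.

/*
 * Restatement of how the result of perform_atomic_semop() is consumed:
 *   0   - the operation was applied; update sem_otime and, when values
 *         were altered, wake other waiters via do_smart_update(),
 *   < 0 - the operation failed with an errno value,
 *   > 0 - the operation could not proceed now; the task must sleep.
 */
enum semop_outcome { SEMOP_APPLIED, SEMOP_FAILED, SEMOP_MUST_SLEEP };

static enum semop_outcome classify_semop_result(int error)
{
	if (error == 0)
		return SEMOP_APPLIED;
	if (error < 0)
		return SEMOP_FAILED;
	return SEMOP_MUST_SLEEP;
}
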
@@ -1999,6 +2067,12 @@ void exit_sem(struct task_struct *tsk)
                }
 
                sem_lock(sma, NULL, -1);
+               /* exit_sem raced with IPC_RMID, nothing to do */
+               if (sma->sem_perm.deleted) {
+                       sem_unlock(sma, -1);
+                       rcu_read_unlock();
+                       continue;
+               }
                un = __lookup_undo(ulp, semid);
                if (un == NULL) {
                        /* exit_sem raced with IPC_RMID+semget() that created
@@ -2061,6 +2135,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
        struct sem_array *sma = it;
        time_t sem_otime;
 
+       /*
+        * The proc interface isn't aware of sem_lock(), it calls
+        * ipc_lock_object() directly (in sysvipc_find_ipc).
+        * In order to stay compatible with sem_lock(), we must wait until
+        * all simple semop() calls have left their critical regions.
+        */
+       sem_wait_array(sma);
+
        sem_otime = get_semotime(sma);
 
        return seq_printf(s,