s390: Clarify pagefault interrupt

author Peter Zijlstra <peterz@infradead.org>

Tue, 22 Mar 2016 20:42:53 +0000 (21:42 +0100)

committer Martin Schwidefsky <schwidefsky@de.ibm.com>

Fri, 15 Apr 2016 16:16:37 +0000 (18:16 +0200)
author Peter Zijlstra <peterz@infradead.org>
Tue, 22 Mar 2016 20:42:53 +0000 (21:42 +0100)
committer Martin Schwidefsky <schwidefsky@de.ibm.com>
Fri, 15 Apr 2016 16:16:37 +0000 (18:16 +0200)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c

index cce577feab1e9998f5e1b29e6011e70e8db075fe..7a31440173016e8cf0281deda2f81774975d80d1 100644 (file)
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -631,6 +631,29 @@ void pfault_fini(void)
  static DEFINE_SPINLOCK(pfault_lock);
  static LIST_HEAD(pfault_list);
  
+#define PF_COMPLETE    0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as a guest and runs a
+ * user space process, and that process accesses a page that the host has
+ * paged out, we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt then we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happen within
+ * interrupt context(!). If we later on want to return to user space we
+ * recognize the need_resched flag and then call schedule().  It's not very
+ * obvious how this works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, actually appear before the interrupt that signals that a page
+ * is missing.
+ */
  static void pfault_interrupt(struct ext_code ext_code,
                              unsigned int param32, unsigned long param64)
  {
@@ -639,10 +662,9 @@ static void pfault_interrupt(struct ext_code ext_code,
         pid_t pid;
  
         /*
-        * Get the external interruption subcode & pfault
-        * initial/completion signal bit. VM stores this 
-        * in the 'cpu address' field associated with the
-         * external interrupt. 
+        * Get the external interruption subcode & pfault initial/completion
+        * signal bit. VM stores this in the 'cpu address' field associated
+        * with the external interrupt.
          */
         subcode = ext_code.subcode;
         if ((subcode & 0xff00) != __SUBCODE_MASK)
@@ -658,7 +680,7 @@ static void pfault_interrupt(struct ext_code ext_code,
         if (!tsk)
                 return;
         spin_lock(&pfault_lock);
-       if (subcode & 0x0080) {
+       if (subcode & PF_COMPLETE) {
                 /* signal bit is set -> a page has been swapped in by VM */
                 if (tsk->thread.pfault_wait == 1) {
                         /* Initial interrupt was faster than the completion
@@ -687,8 +709,7 @@ static void pfault_interrupt(struct ext_code ext_code,
                         goto out;
                 if (tsk->thread.pfault_wait == 1) {
                         /* Already on the list with a reference: put to sleep */
-                       __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-                       set_tsk_need_resched(tsk);
+                       goto block;
                 } else if (tsk->thread.pfault_wait == -1) {
                         /* Completion interrupt was faster than the initial
                          * interrupt (pfault_wait == -1). Set pfault_wait
@@ -703,7 +724,11 @@ static void pfault_interrupt(struct ext_code ext_code,
                         get_task_struct(tsk);
                         tsk->thread.pfault_wait = 1;
                         list_add(&tsk->thread.list, &pfault_list);
-                       __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+block:
+                       /* Since this must be a userspace fault, there
+                        * is no kernel task state to trample. Rely on the
+                        * return to userspace schedule() to block. */
+                       __set_current_state(TASK_UNINTERRUPTIBLE);
                         set_tsk_need_resched(tsk);
                 }
         }
author	Peter Zijlstra <peterz@infradead.org>
	Tue, 22 Mar 2016 20:42:53 +0000 (21:42 +0100)
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>
	Fri, 15 Apr 2016 16:16:37 +0000 (18:16 +0200)