git.karo-electronics.de Git - mv-sheeva.git/commitdiff
Merge git://git.kernel.org/pub/scm/linux/kernel/git/wim/linux-2.6-watchdog
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 30 Jan 2008 22:36:35 +0000 (09:36 +1100)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 30 Jan 2008 22:36:35 +0000 (09:36 +1100)
* git://git.kernel.org/pub/scm/linux/kernel/git/wim/linux-2.6-watchdog:
  [WATCHDOG] use SGI_HAS_INDYDOG for INDYDOG depends

884 files changed:
Documentation/DocBook/kernel-api.tmpl
Documentation/debugging-via-ohci1394.txt [new file with mode: 0644]
Documentation/kernel-parameters.txt
Documentation/lguest/lguest.c
Documentation/x86_64/boot-options.txt
Documentation/x86_64/uefi.txt
arch/arm/Kconfig
arch/ia64/Kconfig
arch/ia64/hp/sim/simscsi.c
arch/ia64/ia32/binfmt_elf32.c
arch/ia64/kernel/module.c
arch/m32r/Kconfig
arch/mips/Kconfig
arch/mips/kernel/i8253.c
arch/parisc/Kconfig
arch/powerpc/Kconfig
arch/powerpc/kernel/ptrace.c
arch/powerpc/kernel/vio.c
arch/sparc64/Kconfig
arch/um/kernel/ksyms.c
arch/um/sys-i386/signal.c
arch/um/sys-x86_64/signal.c
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/Makefile_32 [deleted file]
arch/x86/Makefile_64 [deleted file]
arch/x86/boot/Makefile
arch/x86/boot/apm.c
arch/x86/boot/boot.h
arch/x86/boot/cmdline.c
arch/x86/boot/compressed/Makefile
arch/x86/boot/compressed/Makefile_32 [deleted file]
arch/x86/boot/compressed/Makefile_64 [deleted file]
arch/x86/boot/compressed/misc.c [moved from arch/x86/boot/compressed/misc_32.c with 86% similarity]
arch/x86/boot/compressed/misc_64.c [deleted file]
arch/x86/boot/compressed/relocs.c
arch/x86/boot/compressed/vmlinux.scr [moved from arch/x86/boot/compressed/vmlinux_64.scr with 84% similarity]
arch/x86/boot/compressed/vmlinux_32.lds
arch/x86/boot/compressed/vmlinux_32.scr [deleted file]
arch/x86/boot/compressed/vmlinux_64.lds
arch/x86/boot/edd.c
arch/x86/boot/header.S
arch/x86/boot/main.c
arch/x86/boot/pm.c
arch/x86/boot/pmjump.S
arch/x86/boot/video-bios.c
arch/x86/boot/video-vesa.c
arch/x86/boot/video-vga.c
arch/x86/boot/video.c
arch/x86/boot/video.h
arch/x86/boot/voyager.c
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/ia32/Makefile
arch/x86/ia32/audit.c
arch/x86/ia32/fpu32.c [deleted file]
arch/x86/ia32/ia32_aout.c
arch/x86/ia32/ia32_binfmt.c [deleted file]
arch/x86/ia32/ia32_signal.c
arch/x86/ia32/ia32entry.S
arch/x86/ia32/ipc32.c
arch/x86/ia32/mmap32.c [deleted file]
arch/x86/ia32/ptrace32.c [deleted file]
arch/x86/ia32/sys_ia32.c
arch/x86/ia32/syscall32.c [deleted file]
arch/x86/ia32/syscall32_syscall.S [deleted file]
arch/x86/ia32/tls32.c [deleted file]
arch/x86/ia32/vsyscall-sigreturn.S [deleted file]
arch/x86/ia32/vsyscall-sysenter.S [deleted file]
arch/x86/ia32/vsyscall.lds [deleted file]
arch/x86/kernel/Makefile
arch/x86/kernel/Makefile_32 [deleted file]
arch/x86/kernel/Makefile_64 [deleted file]
arch/x86/kernel/acpi/Makefile
arch/x86/kernel/acpi/sleep.c [new file with mode: 0644]
arch/x86/kernel/acpi/sleep_32.c
arch/x86/kernel/acpi/sleep_64.c [deleted file]
arch/x86/kernel/acpi/wakeup_32.S
arch/x86/kernel/acpi/wakeup_64.S
arch/x86/kernel/alternative.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/apic_32.c
arch/x86/kernel/apic_64.c
arch/x86/kernel/apm_32.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/bootflag.c
arch/x86/kernel/bugs_64.c
arch/x86/kernel/cpu/addon_cpuid_features.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/bugs.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cpu.h
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/x86/kernel/cpu/cpufreq/longhaul.c
arch/x86/kernel/cpu/cpufreq/powernow-k8.c
arch/x86/kernel/cpu/cyrix.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/mcheck/k7.c
arch/x86/kernel/cpu/mcheck/mce.h
arch/x86/kernel/cpu/mcheck/mce_32.c
arch/x86/kernel/cpu/mcheck/mce_64.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/cpu/mcheck/p4.c
arch/x86/kernel/cpu/mcheck/p5.c
arch/x86/kernel/cpu/mcheck/p6.c
arch/x86/kernel/cpu/mcheck/winchip.c
arch/x86/kernel/cpu/mtrr/amd.c
arch/x86/kernel/cpu/mtrr/cyrix.c
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/if.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/cpu/mtrr/state.c
arch/x86/kernel/cpu/perfctr-watchdog.c
arch/x86/kernel/cpu/proc.c
arch/x86/kernel/cpuid.c
arch/x86/kernel/doublefault_32.c
arch/x86/kernel/ds.c [new file with mode: 0644]
arch/x86/kernel/e820_32.c
arch/x86/kernel/e820_64.c
arch/x86/kernel/early-quirks.c
arch/x86/kernel/efi.c [new file with mode: 0644]
arch/x86/kernel/efi_32.c
arch/x86/kernel/efi_64.c [new file with mode: 0644]
arch/x86/kernel/efi_stub_64.S [new file with mode: 0644]
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/genapic_64.c
arch/x86/kernel/geode_32.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/hpet.c
arch/x86/kernel/i386_ksyms_32.c
arch/x86/kernel/i387.c [new file with mode: 0644]
arch/x86/kernel/i387_32.c [deleted file]
arch/x86/kernel/i387_64.c [deleted file]
arch/x86/kernel/i8253.c
arch/x86/kernel/i8259_32.c
arch/x86/kernel/i8259_64.c
arch/x86/kernel/init_task.c
arch/x86/kernel/io_apic_32.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/io_delay.c [new file with mode: 0644]
arch/x86/kernel/ioport.c [moved from arch/x86/kernel/ioport_32.c with 70% similarity]
arch/x86/kernel/ioport_64.c [deleted file]
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/kdebugfs.c [new file with mode: 0644]
arch/x86/kernel/kprobes.c [new file with mode: 0644]
arch/x86/kernel/kprobes_32.c [deleted file]
arch/x86/kernel/kprobes_64.c [deleted file]
arch/x86/kernel/ldt.c [moved from arch/x86/kernel/ldt_32.c with 61% similarity]
arch/x86/kernel/ldt_64.c [deleted file]
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/mfgpt_32.c
arch/x86/kernel/microcode.c
arch/x86/kernel/mpparse_32.c
arch/x86/kernel/mpparse_64.c
arch/x86/kernel/nmi_32.c
arch/x86/kernel/nmi_64.c
arch/x86/kernel/numaq_32.c
arch/x86/kernel/paravirt.c [moved from arch/x86/kernel/paravirt_32.c with 83% similarity]
arch/x86/kernel/paravirt_patch_32.c [new file with mode: 0644]
arch/x86/kernel/paravirt_patch_64.c [new file with mode: 0644]
arch/x86/kernel/pci-calgary_64.c
arch/x86/kernel/pci-dma_64.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/pci-swiotlb_64.c
arch/x86/kernel/pmtimer_64.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c [new file with mode: 0644]
arch/x86/kernel/ptrace_32.c [deleted file]
arch/x86/kernel/ptrace_64.c [deleted file]
arch/x86/kernel/quirks.c
arch/x86/kernel/reboot.c [moved from arch/x86/kernel/reboot_32.c with 72% similarity]
arch/x86/kernel/reboot_64.c [deleted file]
arch/x86/kernel/reboot_fixups_32.c
arch/x86/kernel/rtc.c [new file with mode: 0644]
arch/x86/kernel/setup64.c
arch/x86/kernel/setup_32.c
arch/x86/kernel/setup_64.c
arch/x86/kernel/signal_32.c
arch/x86/kernel/signal_64.c
arch/x86/kernel/smp_32.c
arch/x86/kernel/smp_64.c
arch/x86/kernel/smpboot_32.c
arch/x86/kernel/smpboot_64.c
arch/x86/kernel/smpcommon_32.c
arch/x86/kernel/srat_32.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/step.c [new file with mode: 0644]
arch/x86/kernel/suspend_64.c
arch/x86/kernel/suspend_asm_64.S
arch/x86/kernel/sys_x86_64.c
arch/x86/kernel/test_nx.c [new file with mode: 0644]
arch/x86/kernel/test_rodata.c [new file with mode: 0644]
arch/x86/kernel/time_32.c
arch/x86/kernel/time_64.c
arch/x86/kernel/tls.c [new file with mode: 0644]
arch/x86/kernel/tls.h [new file with mode: 0644]
arch/x86/kernel/topology.c
arch/x86/kernel/traps_32.c
arch/x86/kernel/traps_64.c
arch/x86/kernel/tsc_32.c
arch/x86/kernel/tsc_64.c
arch/x86/kernel/tsc_sync.c
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vmi_32.c
arch/x86/kernel/vmiclock_32.c
arch/x86/kernel/vmlinux_32.lds.S
arch/x86/kernel/vmlinux_64.lds.S
arch/x86/kernel/vsmp_64.c
arch/x86/kernel/vsyscall_32.S [deleted file]
arch/x86/kernel/vsyscall_32.lds.S [deleted file]
arch/x86/kernel/vsyscall_64.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/kvm/Kconfig [moved from drivers/kvm/Kconfig with 94% similarity]
arch/x86/kvm/Makefile [moved from drivers/kvm/Makefile with 51% similarity]
arch/x86/kvm/i8259.c [moved from drivers/kvm/i8259.c with 98% similarity]
arch/x86/kvm/irq.c [moved from drivers/kvm/irq.c with 81% similarity]
arch/x86/kvm/irq.h [new file with mode: 0644]
arch/x86/kvm/kvm_svm.h [moved from drivers/kvm/kvm_svm.h with 96% similarity]
arch/x86/kvm/lapic.c [moved from drivers/kvm/lapic.c with 83% similarity]
arch/x86/kvm/lapic.h [new file with mode: 0644]
arch/x86/kvm/mmu.c [new file with mode: 0644]
arch/x86/kvm/mmu.h [new file with mode: 0644]
arch/x86/kvm/paging_tmpl.h [new file with mode: 0644]
arch/x86/kvm/segment_descriptor.h [moved from drivers/kvm/segment_descriptor.h with 53% similarity]
arch/x86/kvm/svm.c [moved from drivers/kvm/svm.c with 84% similarity]
arch/x86/kvm/svm.h [moved from drivers/kvm/svm.h with 98% similarity]
arch/x86/kvm/vmx.c [moved from drivers/kvm/vmx.c with 75% similarity]
arch/x86/kvm/vmx.h [moved from drivers/kvm/vmx.h with 96% similarity]
arch/x86/kvm/x86.c [moved from drivers/kvm/kvm_main.c with 52% similarity]
arch/x86/kvm/x86_emulate.c [new file with mode: 0644]
arch/x86/lguest/Kconfig
arch/x86/lguest/boot.c
arch/x86/lib/Makefile
arch/x86/lib/Makefile_32 [deleted file]
arch/x86/lib/Makefile_64 [deleted file]
arch/x86/lib/memcpy_32.c
arch/x86/lib/memmove_64.c
arch/x86/lib/semaphore_32.S
arch/x86/lib/thunk_64.S
arch/x86/mach-rdc321x/Makefile [new file with mode: 0644]
arch/x86/mach-rdc321x/gpio.c [new file with mode: 0644]
arch/x86/mach-rdc321x/platform.c [new file with mode: 0644]
arch/x86/mach-rdc321x/wdt.c [new file with mode: 0644]
arch/x86/mach-visws/mpparse.c
arch/x86/mach-voyager/setup.c
arch/x86/mach-voyager/voyager_basic.c
arch/x86/mach-voyager/voyager_cat.c
arch/x86/mach-voyager/voyager_smp.c
arch/x86/mach-voyager/voyager_thread.c
arch/x86/math-emu/errors.c
arch/x86/math-emu/exception.h
arch/x86/math-emu/fpu_arith.c
arch/x86/math-emu/fpu_asm.h
arch/x86/math-emu/fpu_aux.c
arch/x86/math-emu/fpu_emu.h
arch/x86/math-emu/fpu_entry.c
arch/x86/math-emu/fpu_etc.c
arch/x86/math-emu/fpu_proto.h
arch/x86/math-emu/fpu_tags.c
arch/x86/math-emu/fpu_trig.c
arch/x86/math-emu/get_address.c
arch/x86/math-emu/load_store.c
arch/x86/math-emu/poly.h
arch/x86/math-emu/poly_2xm1.c
arch/x86/math-emu/poly_atan.c
arch/x86/math-emu/poly_l2.c
arch/x86/math-emu/poly_sin.c
arch/x86/math-emu/poly_tan.c
arch/x86/math-emu/reg_add_sub.c
arch/x86/math-emu/reg_compare.c
arch/x86/math-emu/reg_constant.c
arch/x86/math-emu/reg_convert.c
arch/x86/math-emu/reg_divide.c
arch/x86/math-emu/reg_ld_str.c
arch/x86/math-emu/reg_mul.c
arch/x86/math-emu/status_w.h
arch/x86/mm/Makefile_32
arch/x86/mm/Makefile_64
arch/x86/mm/boot_ioremap_32.c [deleted file]
arch/x86/mm/discontig_32.c
arch/x86/mm/extable.c [new file with mode: 0644]
arch/x86/mm/extable_32.c [deleted file]
arch/x86/mm/extable_64.c [deleted file]
arch/x86/mm/fault.c [new file with mode: 0644]
arch/x86/mm/fault_32.c [deleted file]
arch/x86/mm/fault_64.c [deleted file]
arch/x86/mm/highmem_32.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/ioremap.c [new file with mode: 0644]
arch/x86/mm/ioremap_32.c [deleted file]
arch/x86/mm/ioremap_64.c [deleted file]
arch/x86/mm/k8topology_64.c
arch/x86/mm/mmap.c [moved from arch/x86/mm/mmap_32.c with 52% similarity]
arch/x86/mm/mmap_64.c [deleted file]
arch/x86/mm/numa_64.c
arch/x86/mm/pageattr-test.c [new file with mode: 0644]
arch/x86/mm/pageattr.c [new file with mode: 0644]
arch/x86/mm/pageattr_32.c [deleted file]
arch/x86/mm/pageattr_64.c [deleted file]
arch/x86/mm/pgtable_32.c
arch/x86/mm/srat_64.c
arch/x86/oprofile/backtrace.c
arch/x86/oprofile/nmi_int.c
arch/x86/pci/common.c
arch/x86/pci/fixup.c
arch/x86/pci/irq.c
arch/x86/power/cpu.c
arch/x86/vdso/.gitignore
arch/x86/vdso/Makefile
arch/x86/vdso/vclock_gettime.c
arch/x86/vdso/vdso-layout.lds.S [new file with mode: 0644]
arch/x86/vdso/vdso-start.S [deleted file]
arch/x86/vdso/vdso.lds.S
arch/x86/vdso/vdso32-setup.c [moved from arch/x86/kernel/sysenter_32.c with 66% similarity]
arch/x86/vdso/vdso32.S [new file with mode: 0644]
arch/x86/vdso/vdso32/.gitignore [new file with mode: 0644]
arch/x86/vdso/vdso32/int80.S [moved from arch/x86/kernel/vsyscall-int80_32.S with 71% similarity]
arch/x86/vdso/vdso32/note.S [moved from arch/x86/kernel/vsyscall-note_32.S with 92% similarity]
arch/x86/vdso/vdso32/sigreturn.S [moved from arch/x86/kernel/vsyscall-sigreturn_32.S with 65% similarity]
arch/x86/vdso/vdso32/syscall.S [moved from arch/x86/ia32/vsyscall-syscall.S with 79% similarity]
arch/x86/vdso/vdso32/sysenter.S [moved from arch/x86/kernel/vsyscall-sysenter_32.S with 76% similarity]
arch/x86/vdso/vdso32/vdso32.lds.S [new file with mode: 0644]
arch/x86/vdso/vgetcpu.c
arch/x86/vdso/vma.c
arch/x86/vdso/voffset.h [deleted file]
arch/x86/xen/Kconfig
arch/x86/xen/enlighten.c
arch/x86/xen/events.c
arch/x86/xen/mmu.c
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/time.c
arch/x86/xen/xen-head.S
block/Makefile
block/as-iosched.c
block/blk-barrier.c [new file with mode: 0644]
block/blk-core.c [new file with mode: 0644]
block/blk-exec.c [new file with mode: 0644]
block/blk-ioc.c [new file with mode: 0644]
block/blk-map.c [new file with mode: 0644]
block/blk-merge.c [new file with mode: 0644]
block/blk-settings.c [new file with mode: 0644]
block/blk-sysfs.c [new file with mode: 0644]
block/blk-tag.c [new file with mode: 0644]
block/blk.h [new file with mode: 0644]
block/bsg.c
block/genhd.c
block/ll_rw_blk.c [deleted file]
drivers/Kconfig
drivers/Makefile
drivers/acpi/processor_idle.c
drivers/base/bus.c
drivers/base/class.c
drivers/base/core.c
drivers/block/cciss.c
drivers/block/xsysace.c
drivers/char/agp/ali-agp.c
drivers/char/agp/backend.c
drivers/char/agp/generic.c
drivers/char/agp/i460-agp.c
drivers/char/agp/intel-agp.c
drivers/char/hpet.c
drivers/char/rtc.c
drivers/cpufreq/cpufreq.c
drivers/firmware/dmi_scan.c
drivers/ieee1394/Makefile
drivers/ieee1394/init_ohci1394_dma.c [new file with mode: 0644]
drivers/infiniband/ulp/srp/ib_srp.c
drivers/input/mouse/pc110pad.c
drivers/kvm/irq.h [deleted file]
drivers/kvm/mmu.c [deleted file]
drivers/kvm/paging_tmpl.h [deleted file]
drivers/kvm/x86_emulate.c [deleted file]
drivers/lguest/core.c
drivers/lguest/hypercalls.c
drivers/lguest/interrupts_and_traps.c
drivers/lguest/lg.h
drivers/lguest/lguest_user.c
drivers/lguest/page_tables.c
drivers/lguest/segments.c
drivers/lguest/x86/core.c
drivers/net/Kconfig
drivers/net/e1000/e1000_main.c
drivers/pnp/pnpbios/bioscalls.c
drivers/s390/scsi/zfcp_fsf.c
drivers/scsi/3w-9xxx.c
drivers/scsi/3w-xxxx.c
drivers/scsi/BusLogic.c
drivers/scsi/Kconfig
drivers/scsi/NCR53c406a.c
drivers/scsi/a100u2w.c
drivers/scsi/aacraid/commctrl.c
drivers/scsi/aacraid/linit.c
drivers/scsi/aha1740.c
drivers/scsi/aic7xxx/aic79xx.h
drivers/scsi/aic7xxx/aic79xx_core.c
drivers/scsi/aic7xxx/aic79xx_osm.c
drivers/scsi/aic7xxx/aic79xx_osm_pci.c
drivers/scsi/aic7xxx/aic79xx_pci.c
drivers/scsi/aic7xxx/aic7xxx.h
drivers/scsi/aic7xxx/aic7xxx_core.c
drivers/scsi/aic7xxx/aic7xxx_osm.c
drivers/scsi/aic7xxx/aic7xxx_osm_pci.c
drivers/scsi/aic7xxx/aic7xxx_pci.c
drivers/scsi/aic7xxx_old.c
drivers/scsi/arcmsr/arcmsr_hba.c
drivers/scsi/dc395x.c
drivers/scsi/dpt_i2o.c
drivers/scsi/eata.c
drivers/scsi/hosts.c
drivers/scsi/hptiop.c
drivers/scsi/ibmmca.c
drivers/scsi/ibmvscsi/ibmvscsi.c
drivers/scsi/initio.c
drivers/scsi/iscsi_tcp.c
drivers/scsi/libsrp.c
drivers/scsi/lpfc/lpfc_scsi.c
drivers/scsi/mac53c94.c
drivers/scsi/megaraid.c
drivers/scsi/megaraid/megaraid_mbox.c
drivers/scsi/megaraid/megaraid_sas.c
drivers/scsi/mesh.c
drivers/scsi/ncr53c8xx.c
drivers/scsi/nsp32.c
drivers/scsi/pcmcia/sym53c500_cs.c
drivers/scsi/qla1280.c
drivers/scsi/qla2xxx/qla_os.c
drivers/scsi/qla4xxx/ql4_os.c
drivers/scsi/qlogicfas.c
drivers/scsi/scsi.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_tgt_lib.c
drivers/scsi/sd.c
drivers/scsi/sgiwd93.c
drivers/scsi/sr.c
drivers/scsi/stex.c
drivers/scsi/sym53c416.c
drivers/scsi/sym53c8xx_2/sym_glue.c
drivers/scsi/u14-34f.c
drivers/scsi/ultrastor.c
drivers/scsi/wd7000.c
drivers/usb/storage/isd200.c
drivers/video/vermilion/vermilion.c
fs/Kconfig.binfmt
fs/Makefile
fs/aio.c
fs/binfmt_elf.c
fs/compat_binfmt_elf.c [new file with mode: 0644]
fs/dlm/dir.c
fs/dlm/dlm_internal.h
fs/dlm/lock.c
fs/dlm/lock.h
fs/dlm/lockspace.c
fs/dlm/lowcomms.c
fs/dlm/main.c
fs/dlm/member.c
fs/dlm/member.h
fs/dlm/memory.c
fs/dlm/memory.h
fs/dlm/midcomms.c
fs/dlm/rcom.c
fs/dlm/recover.c
fs/dlm/recoverd.c
fs/dlm/user.c
fs/dlm/util.c
fs/jbd/checkpoint.c
fs/jbd/commit.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/lockd/clntlock.c
fs/lockd/clntproc.c
fs/lockd/xdr.c
fs/nfs/callback.c
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/idmap.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/namespace.c
fs/nfs/nfs2xdr.c
fs/nfs/nfs3proc.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs4namespace.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/proc.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/unlink.c
fs/nfs/write.c
fs/splice.c
include/acpi/reboot.h [new file with mode: 0644]
include/asm-alpha/agp.h
include/asm-generic/bug.h
include/asm-generic/percpu.h
include/asm-generic/tlb.h
include/asm-generic/vmlinux.lds.h
include/asm-ia64/acpi.h
include/asm-ia64/agp.h
include/asm-ia64/percpu.h
include/asm-m32r/signal.h
include/asm-parisc/agp.h
include/asm-powerpc/agp.h
include/asm-powerpc/percpu.h
include/asm-powerpc/ptrace.h
include/asm-s390/percpu.h
include/asm-sparc64/agp.h
include/asm-sparc64/percpu.h
include/asm-um/asm.h [new file with mode: 0644]
include/asm-um/linkage.h
include/asm-um/nops.h [new file with mode: 0644]
include/asm-x86/Kbuild
include/asm-x86/acpi.h
include/asm-x86/acpi_32.h [deleted file]
include/asm-x86/acpi_64.h [deleted file]
include/asm-x86/agp.h
include/asm-x86/alternative.h
include/asm-x86/alternative_32.h [deleted file]
include/asm-x86/alternative_64.h [deleted file]
include/asm-x86/apic.h
include/asm-x86/apic_32.h [deleted file]
include/asm-x86/apic_64.h [deleted file]
include/asm-x86/apicdef.h
include/asm-x86/apicdef_32.h [deleted file]
include/asm-x86/apicdef_64.h [deleted file]
include/asm-x86/arch_hooks.h
include/asm-x86/asm.h [new file with mode: 0644]
include/asm-x86/bitops.h
include/asm-x86/bitops_32.h
include/asm-x86/bitops_64.h
include/asm-x86/bootparam.h
include/asm-x86/bug.h
include/asm-x86/bugs.h
include/asm-x86/cacheflush.h
include/asm-x86/calling.h
include/asm-x86/checksum_64.h
include/asm-x86/cmpxchg_32.h
include/asm-x86/compat.h
include/asm-x86/cpu.h
include/asm-x86/cpufeature.h
include/asm-x86/cpufeature_32.h [deleted file]
include/asm-x86/cpufeature_64.h [deleted file]
include/asm-x86/desc.h
include/asm-x86/desc_32.h [deleted file]
include/asm-x86/desc_64.h
include/asm-x86/desc_defs.h
include/asm-x86/dma.h
include/asm-x86/dma_32.h [deleted file]
include/asm-x86/dma_64.h [deleted file]
include/asm-x86/dmi.h
include/asm-x86/ds.h [new file with mode: 0644]
include/asm-x86/e820.h
include/asm-x86/e820_32.h
include/asm-x86/e820_64.h
include/asm-x86/efi.h [new file with mode: 0644]
include/asm-x86/elf.h
include/asm-x86/emergency-restart.h
include/asm-x86/fixmap_32.h
include/asm-x86/fixmap_64.h
include/asm-x86/fpu32.h [deleted file]
include/asm-x86/futex.h
include/asm-x86/futex_32.h [deleted file]
include/asm-x86/futex_64.h [deleted file]
include/asm-x86/gart.h
include/asm-x86/geode.h
include/asm-x86/gpio.h [new file with mode: 0644]
include/asm-x86/hpet.h
include/asm-x86/hw_irq_32.h
include/asm-x86/hw_irq_64.h
include/asm-x86/i387.h
include/asm-x86/i387_32.h [deleted file]
include/asm-x86/i387_64.h [deleted file]
include/asm-x86/i8253.h
include/asm-x86/i8259.h
include/asm-x86/ia32.h
include/asm-x86/ia32_unistd.h
include/asm-x86/ide.h
include/asm-x86/idle.h
include/asm-x86/io_32.h
include/asm-x86/io_64.h
include/asm-x86/io_apic.h
include/asm-x86/io_apic_32.h [deleted file]
include/asm-x86/io_apic_64.h [deleted file]
include/asm-x86/irqflags.h
include/asm-x86/irqflags_32.h [deleted file]
include/asm-x86/irqflags_64.h [deleted file]
include/asm-x86/k8.h
include/asm-x86/kdebug.h
include/asm-x86/kexec.h
include/asm-x86/kexec_32.h [deleted file]
include/asm-x86/kexec_64.h [deleted file]
include/asm-x86/kprobes.h
include/asm-x86/kprobes_32.h [deleted file]
include/asm-x86/kprobes_64.h [deleted file]
include/asm-x86/kvm.h [new file with mode: 0644]
include/asm-x86/kvm_host.h [moved from drivers/kvm/kvm.h with 64% similarity]
include/asm-x86/kvm_para.h [new file with mode: 0644]
include/asm-x86/kvm_x86_emulate.h [moved from drivers/kvm/x86_emulate.h with 83% similarity]
include/asm-x86/lguest.h
include/asm-x86/lguest_hcall.h
include/asm-x86/linkage.h
include/asm-x86/linkage_32.h [deleted file]
include/asm-x86/linkage_64.h [deleted file]
include/asm-x86/local.h
include/asm-x86/local_32.h [deleted file]
include/asm-x86/local_64.h [deleted file]
include/asm-x86/mach-bigsmp/mach_apic.h
include/asm-x86/mach-default/apm.h
include/asm-x86/mach-default/io_ports.h [deleted file]
include/asm-x86/mach-default/mach_apic.h
include/asm-x86/mach-default/mach_time.h [deleted file]
include/asm-x86/mach-default/mach_timer.h
include/asm-x86/mach-default/mach_traps.h
include/asm-x86/mach-es7000/mach_apic.h
include/asm-x86/mach-generic/gpio.h [new file with mode: 0644]
include/asm-x86/mach-numaq/mach_apic.h
include/asm-x86/mach-rdc321x/gpio.h [new file with mode: 0644]
include/asm-x86/mach-rdc321x/rdc321x_defs.h [new file with mode: 0644]
include/asm-x86/mach-summit/mach_apic.h
include/asm-x86/math_emu.h
include/asm-x86/mc146818rtc.h
include/asm-x86/mc146818rtc_32.h [deleted file]
include/asm-x86/mc146818rtc_64.h [deleted file]
include/asm-x86/mce.h
include/asm-x86/mmsegment.h [deleted file]
include/asm-x86/mmu.h
include/asm-x86/mmu_context_32.h
include/asm-x86/mmu_context_64.h
include/asm-x86/mmzone_32.h
include/asm-x86/mmzone_64.h
include/asm-x86/module.h
include/asm-x86/module_32.h [deleted file]
include/asm-x86/module_64.h [deleted file]
include/asm-x86/mpspec.h
include/asm-x86/mpspec_32.h [deleted file]
include/asm-x86/mpspec_64.h [deleted file]
include/asm-x86/mpspec_def.h
include/asm-x86/msr-index.h
include/asm-x86/msr.h
include/asm-x86/mtrr.h
include/asm-x86/mutex_32.h
include/asm-x86/nmi_32.h
include/asm-x86/nmi_64.h
include/asm-x86/nops.h [new file with mode: 0644]
include/asm-x86/numa_32.h
include/asm-x86/numa_64.h
include/asm-x86/page.h
include/asm-x86/page_32.h
include/asm-x86/page_64.h
include/asm-x86/paravirt.h
include/asm-x86/pci.h
include/asm-x86/pci_64.h
include/asm-x86/pda.h
include/asm-x86/percpu.h
include/asm-x86/percpu_32.h [deleted file]
include/asm-x86/percpu_64.h [deleted file]
include/asm-x86/pgalloc_32.h
include/asm-x86/pgtable-2level.h
include/asm-x86/pgtable-3level.h
include/asm-x86/pgtable.h
include/asm-x86/pgtable_32.h
include/asm-x86/pgtable_64.h
include/asm-x86/processor.h
include/asm-x86/processor_32.h [deleted file]
include/asm-x86/processor_64.h [deleted file]
include/asm-x86/proto.h
include/asm-x86/ptrace-abi.h
include/asm-x86/ptrace.h
include/asm-x86/resume-trace.h
include/asm-x86/resume-trace_32.h [deleted file]
include/asm-x86/resume-trace_64.h [deleted file]
include/asm-x86/rio.h
include/asm-x86/rwlock.h
include/asm-x86/rwsem.h
include/asm-x86/scatterlist.h
include/asm-x86/scatterlist_32.h [deleted file]
include/asm-x86/scatterlist_64.h [deleted file]
include/asm-x86/segment.h
include/asm-x86/segment_32.h [deleted file]
include/asm-x86/segment_64.h [deleted file]
include/asm-x86/semaphore_32.h
include/asm-x86/setup.h
include/asm-x86/sigcontext.h
include/asm-x86/sigcontext32.h
include/asm-x86/signal.h
include/asm-x86/smp_32.h
include/asm-x86/smp_64.h
include/asm-x86/sparsemem.h
include/asm-x86/sparsemem_32.h [deleted file]
include/asm-x86/sparsemem_64.h [deleted file]
include/asm-x86/spinlock.h
include/asm-x86/spinlock_32.h [deleted file]
include/asm-x86/spinlock_64.h [deleted file]
include/asm-x86/spinlock_types.h
include/asm-x86/stacktrace.h
include/asm-x86/suspend_32.h
include/asm-x86/suspend_64.h
include/asm-x86/system.h
include/asm-x86/system_32.h [deleted file]
include/asm-x86/system_64.h
include/asm-x86/thread_info_32.h
include/asm-x86/thread_info_64.h
include/asm-x86/time.h
include/asm-x86/timer.h
include/asm-x86/timex.h
include/asm-x86/tlbflush.h
include/asm-x86/tlbflush_32.h [deleted file]
include/asm-x86/tlbflush_64.h [deleted file]
include/asm-x86/topology.h
include/asm-x86/topology_32.h [deleted file]
include/asm-x86/topology_64.h [deleted file]
include/asm-x86/tsc.h
include/asm-x86/uaccess_64.h
include/asm-x86/unistd_32.h
include/asm-x86/user_32.h
include/asm-x86/user_64.h
include/asm-x86/vdso.h [new file with mode: 0644]
include/asm-x86/vsyscall.h
include/asm-x86/vsyscall32.h [deleted file]
include/asm-x86/xor_32.h
include/asm-x86/xor_64.h
include/linux/Kbuild
include/linux/acpi_pmtmr.h
include/linux/audit.h
include/linux/blkdev.h
include/linux/blktrace_api.h
include/linux/clocksource.h
include/linux/compat.h
include/linux/const.h
include/linux/cpumask.h
include/linux/device.h
include/linux/elf.h
include/linux/hpet.h
include/linux/init_ohci1394_dma.h [new file with mode: 0644]
include/linux/ioport.h
include/linux/kernel.h
include/linux/kprobes.h
include/linux/kvm.h
include/linux/kvm_host.h [new file with mode: 0644]
include/linux/kvm_para.h
include/linux/kvm_types.h [new file with mode: 0644]
include/linux/linkage.h
include/linux/lockd/bind.h
include/linux/mm.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/pci_ids.h
include/linux/percpu.h
include/linux/ptrace.h
include/linux/regset.h [new file with mode: 0644]
include/linux/sched.h
include/linux/selinux.h
include/linux/smp.h
include/linux/spinlock.h
include/linux/spinlock_types.h
include/linux/spinlock_up.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/msg_prot.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xprt.h
include/linux/suspend.h
include/linux/swap.h
include/linux/thread_info.h
include/linux/tick.h
include/linux/timer.h
include/net/netlabel.h
include/scsi/scsi.h
include/scsi/scsi_cmnd.h
include/scsi/scsi_eh.h
include/scsi/scsi_host.h
include/xen/page.h
init/main.c
kernel/Makefile
kernel/backtracetest.c [new file with mode: 0644]
kernel/fork.c
kernel/irq/manage.c
kernel/irq/proc.c
kernel/irq/spurious.c
kernel/kprobes.c
kernel/module.c
kernel/panic.c
kernel/printk.c
kernel/ptrace.c
kernel/sched.c
kernel/signal.c
kernel/softirq.c
kernel/spinlock.c
kernel/sysctl.c
kernel/test_kprobes.c [new file with mode: 0644]
kernel/time/clockevents.c
kernel/time/clocksource.c
kernel/time/tick-broadcast.c
kernel/time/tick-internal.h
kernel/time/tick-sched.c
kernel/time/timekeeping.c
kernel/time/timer_stats.c
kernel/timer.c
lib/Kconfig.debug
lib/rwsem.c
mm/memory.c
mm/mmap.c
net/ipv4/cipso_ipv4.c
net/ipv6/netfilter/Kconfig
net/netfilter/xt_SECMARK.c
net/netlabel/netlabel_cipso_v4.c
net/netlabel/netlabel_domainhash.c
net/netlabel/netlabel_kapi.c
net/netlabel/netlabel_mgmt.c
net/netlabel/netlabel_mgmt.h
net/netlabel/netlabel_unlabeled.c
net/netlabel/netlabel_unlabeled.h
net/sunrpc/auth.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/clnt.c
net/sunrpc/rpc_pipe.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/sched.c
net/sunrpc/socklib.c
net/sunrpc/stats.c
net/sunrpc/sunrpc_syms.c
net/sunrpc/svc.c
net/sunrpc/sysctl.c
net/sunrpc/xdr.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtsock.c
security/Kconfig
security/selinux/Kconfig
security/selinux/Makefile
security/selinux/avc.c
security/selinux/exports.c
security/selinux/hooks.c
security/selinux/include/av_perm_to_string.h
security/selinux/include/av_permissions.h
security/selinux/include/avc.h
security/selinux/include/class_to_string.h
security/selinux/include/flask.h
security/selinux/include/netif.h
security/selinux/include/netlabel.h
security/selinux/include/netnode.h [new file with mode: 0644]
security/selinux/include/objsec.h
security/selinux/include/security.h
security/selinux/include/xfrm.h
security/selinux/netif.c
security/selinux/netlabel.c
security/selinux/netnode.c [new file with mode: 0644]
security/selinux/selinuxfs.c
security/selinux/ss/mls.c
security/selinux/ss/policydb.c
security/selinux/ss/policydb.h
security/selinux/ss/services.c
security/selinux/xfrm.c
sound/pci/intel8x0.c
virt/kvm/ioapic.c [moved from drivers/kvm/ioapic.c with 83% similarity]
virt/kvm/ioapic.h [new file with mode: 0644]
virt/kvm/iodev.h [new file with mode: 0644]
virt/kvm/kvm_main.c [new file with mode: 0644]

index aa38cc5692a005fe4a48f59bdabff3689865c6db..77436d735013f37195750aec890f7db57a1b130d 100644 (file)
@@ -419,7 +419,13 @@ X!Edrivers/pnp/system.c
 
   <chapter id="blkdev">
      <title>Block Devices</title>
-!Eblock/ll_rw_blk.c
+!Eblock/blk-core.c
+!Eblock/blk-map.c
+!Iblock/blk-sysfs.c
+!Eblock/blk-settings.c
+!Eblock/blk-exec.c
+!Eblock/blk-barrier.c
+!Eblock/blk-tag.c
   </chapter>
 
   <chapter id="chrdev">
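For context on the markup above (this is how the kernel's DocBook tooling already works, not something introduced by this commit): a directive such as

        !Eblock/blk-core.c

pulls the kerneldoc comments of that file's exported functions into the chapter, while the !I form used for blk-sysfs.c pulls in documentation of internal, non-exported functions. The hunk simply re-points the block-devices chapter from the removed ll_rw_blk.c to the new split-up block-layer files.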
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt
new file mode 100644 (file)
index 0000000..de4804e
--- /dev/null
@@ -0,0 +1,179 @@
+
+  Using physical DMA provided by OHCI-1394 FireWire controllers for debugging
+  ---------------------------------------------------------------------------
+
+Introduction
+------------
+
+Basically all FireWire controllers in use today are compliant with the
+OHCI-1394 specification, which defines the controller as a PCI bus master
+that uses DMA to offload data transfers from the CPU and has a "Physical
+Response Unit" which executes specific requests by employing PCI bus-master
+DMA after applying filters defined by the OHCI-1394 driver.
+
+Once properly configured, remote machines can send these requests to
+ask the OHCI-1394 controller to perform read and write requests on
+physical system memory and, for read requests, send the result of
+the physical memory read back to the requester.
+
+With that, it is possible to debug issues by reading interesting memory
+locations such as buffers like the printk buffer or the process table.
+
+Retrieving a full system memory dump is also possible over the FireWire,
+using data transfer rates in the order of 10MB/s or more.
+
+Memory access is currently limited to the low 4G of physical address
+space which can be a problem on IA64 machines where memory is located
+mostly above that limit, but it is rarely a problem on more common
+hardware such as hardware based on x86, x86-64 and PowerPC.
+
+Together with an early initialization of the OHCI-1394 controller for
+debugging, this facility proved most useful for examining long debug logs
+in the printk buffer in order to debug early boot problems in areas like
+ACPI, where the system fails to boot and other means of debugging (serial
+port) are either not available (notebooks) or too slow for extensive debug
+information (like ACPI).
+
+Drivers
+-------
+
+The OHCI-1394 drivers in drivers/firewire and drivers/ieee1394 initialize
+the OHCI-1394 controllers to a working state and can be used to enable
+physical DMA. By default you only have to load the driver, and physical
+DMA access will be granted to all remote nodes, but it can be turned off
+when using the ohci1394 driver.
+
+Because these drivers depend on the PCI enumeration to be completed, an
+initialization routine which runs very early (long before console_init(),
+which makes the printk buffer appear on the console, can be called) was
+written.
+
+To activate it, enable CONFIG_PROVIDE_OHCI1394_DMA_INIT (Kernel hacking menu:
+Provide code for enabling DMA over FireWire early on boot) and pass the
+parameter "ohci1394_dma=early" to the recompiled kernel on boot.
+
+Tools
+-----
+
+firescope - Originally developed by Benjamin Herrenschmidt; Andi Kleen ported
+it from PowerPC to x86 and x86_64 and added functionality. Firescope can now
+be used to view the printk buffer of a remote machine, even with live update.
+
+Bernhard Kaindl enhanced firescope to support accessing 64-bit machines
+from 32-bit firescope and vice versa:
+- ftp://ftp.suse.de/private/bk/firewire/tools/firescope-0.2.2.tar.bz2
+
+and he implemented fast system dump (alpha version - read README.txt):
+- ftp://ftp.suse.de/private/bk/firewire/tools/firedump-0.1.tar.bz2
+
+There is also a gdb proxy for FireWire which makes it possible to use gdb to
+access data which can be referenced from symbols found by gdb in vmlinux:
+- ftp://ftp.suse.de/private/bk/firewire/tools/fireproxy-0.33.tar.bz2
+
+The latest version of this gdb proxy (fireproxy-0.34) can communicate (not
+yet stable) with kgdb over a memory-based communication module (kgdbom).
+
+Getting Started
+---------------
+
+The OHCI-1394 specification requires the OHCI-1394 controller to disable
+all physical DMA on each bus reset.
+
+This means that if you want to debug an issue in a system state where
+interrupts are disabled and where no polling of the OHCI-1394 controller
+for bus resets takes place, you have to establish any FireWire cable
+connections and fully initialize all FireWire hardware __before__ the
+system enters such state.
+
+Step-by-step instructions for using firescope with early OHCI initialization:
+
+1) Verify that your hardware is supported:
+
+   Load the ohci1394 or the fw-ohci module and check your kernel logs.
+   You should see a line similar to
+
+   ohci1394: fw-host0: OHCI-1394 1.1 (PCI): IRQ=[18]  MMIO=[fe9ff800-fe9fffff]
+   ... Max Packet=[2048]  IR/IT contexts=[4/8]
+
+   when loading the driver. If you have no supported controller, many PCI,
+   CardBus and even some Express cards which are fully compliant with the
+   OHCI-1394 specification are available. If a card requires no special
+   driver under Windows operating systems, it is most likely compliant.
+   Only specialized shops have cards which are not compliant; they are
+   based on TI PCILynx chips and require drivers for Windows operating
+   systems.
+
+2) Establish a working FireWire cable connection:
+
+   Any FireWire cable will do, as long as it provides an electrically and
+   mechanically stable connection and has matching connectors (there are
+   small 4-pin and large 6-pin FireWire ports).
+
+   If a driver is running on both machines you should see a line like
+
+   ieee1394: Node added: ID:BUS[0-01:1023]  GUID[0090270001b84bba]
+
+   on both machines in the kernel log when the cable is plugged in
+   and connects the two machines.
+
+3) Test physical DMA using firescope:
+
+   On the debug host,
+       - load the raw1394 module,
+       - make sure that /dev/raw1394 is accessible,
+   then start firescope:
+
+       $ firescope
+       Port 0 (ohci1394) opened, 2 nodes detected
+
+       FireScope
+       ---------
+       Target : <unspecified>
+       Gen    : 1
+       [Ctrl-T] choose target
+       [Ctrl-H] this menu
+       [Ctrl-Q] quit
+
+    ------> Press Ctrl-T now, the output should be similar to:
+
+       2 nodes available, local node is: 0
+        0: ffc0, uuid: 00000000 00000000 [LOCAL]
+        1: ffc1, uuid: 00279000 ba4bb801
+
+   Besides the [LOCAL] node, it must show another node without an error
+   message.
+
+4) Prepare for debugging with early OHCI-1394 initialization:
+
+   4.1) Kernel compilation and installation on debug target
+
+   Compile the kernel to be debugged with CONFIG_PROVIDE_OHCI1394_DMA_INIT
+   (Kernel hacking: Provide code for enabling DMA over FireWire early on boot)
+   enabled and install it on the machine to be debugged (debug target).
+
+   4.2) Transfer the System.map of the debugged kernel to the debug host
+
+   Copy the System.map of the kernel to be debugged to the debug host (the host
+   which is connected to the debugged machine over the FireWire cable).
+
+5) Retrieving the printk buffer contents:
+
+   With the FireWire cable connected and the OHCI-1394 driver on the debugging
+   host loaded, reboot the debugged machine, booting the kernel which has
+   CONFIG_PROVIDE_OHCI1394_DMA_INIT enabled, with the option ohci1394_dma=early.
+
+   Then, on the debugging host, run firescope, for example by using -A:
+
+       firescope -A System.map-of-debug-target-kernel
+
+   Note: -A automatically attaches to the first non-local node. It only works
+   reliably if only two machines are connected using FireWire.
+
+   After having attached to the debug target, press Ctrl-D to view the
+   complete printk buffer or Ctrl-U to enter auto update mode and get an
+   updated live view of recent kernel messages logged on the debug target.
+
+   Call "firescope -h" to get more information on firescope's options.
+
+Notes
+-----
+Documentation and specifications: ftp://ftp.suse.de/private/bk/firewire/docs
+
+FireWire is a trademark of Apple Inc. - for more information please refer to:
+http://en.wikipedia.org/wiki/FireWire
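As an illustration of how the ohci1394_dma=early option described above is actually passed at boot (the kernel image name and root device below are placeholders, not taken from the patch), a GRUB entry on the debug target might look like:

        title  Debug target (early OHCI-1394 DMA)
        kernel /boot/vmlinuz-2.6.24 root=/dev/sda1 ro ohci1394_dma=early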
index 880f882160e2d717d9d0a7e1727ff6f29e6dcec8..5d171b7b8393b95c857f9aad5945939a2c999e21 100644 (file)
@@ -416,8 +416,21 @@ and is between 256 and 4096 characters. It is defined in the file
                        [SPARC64] tick
                        [X86-64] hpet,tsc
 
-       code_bytes      [IA32] How many bytes of object code to print in an
-                       oops report.
+       clearcpuid=BITNUM [X86]
+                       Disable CPUID feature X for the kernel. See
+                       include/asm-x86/cpufeature.h for the valid bit numbers.
+                       Note the Linux specific bits are not necessarily
+                       stable over kernel options, but the vendor specific
+                       ones should be.
+                       Also note that user programs calling CPUID directly
+                       or using the feature without checking anything
+                       will still see it. This just prevents it from
+                       being used by the kernel or shown in /proc/cpuinfo.
+                       Also note the kernel might malfunction if you disable
+                       some critical bits.
+
+       code_bytes      [IA32/X86_64] How many bytes of object code to print
+                       in an oops report.
                        Range: 0 - 8192
                        Default: 64
 
@@ -570,6 +583,12 @@ and is between 256 and 4096 characters. It is defined in the file
                        See drivers/char/README.epca and
                        Documentation/digiepca.txt.
 
+       disable_mtrr_trim [X86, Intel and AMD only]
+                       By default the kernel will trim any uncacheable
+                       memory out of your available memory pool based on
+                       MTRR settings.  This parameter disables that behavior,
+                       possibly causing your machine to run very slowly.
+
        dmasound=       [HW,OSS] Sound subsystem buffers
 
        dscc4.setup=    [NET]
@@ -660,6 +679,10 @@ and is between 256 and 4096 characters. It is defined in the file
 
        gamma=          [HW,DRM]
 
+       gart_fix_e820=  [X86_64] disable the fix e820 for K8 GART
+                       Format: off | on
+                       default: on
+
        gdth=           [HW,SCSI]
                        See header of drivers/scsi/gdth.c.
 
@@ -794,6 +817,16 @@ and is between 256 and 4096 characters. It is defined in the file
                        for translation below 32 bit and if not available
                        then look in the higher range.
 
+       io_delay=       [X86-32,X86-64] I/O delay method
+               0x80
+                       Standard port 0x80 based delay
+               0xed
+                       Alternate port 0xed based delay (needed on some systems)
+               udelay
+                       Simple two microseconds delay
+               none
+                       No delay
+
        io7=            [HW] IO7 for Marvel based alpha systems
                        See comment before marvel_specify_io7 in
                        arch/alpha/kernel/core_marvel.c.
@@ -1059,6 +1092,11 @@ and is between 256 and 4096 characters. It is defined in the file
                        Multi-Function General Purpose Timers on AMD Geode
                        platforms.
 
+       mfgptfix        [X86-32] Fix MFGPT timers on AMD Geode platforms when
+                       the BIOS has incorrectly applied a workaround. TinyBIOS
+                       version 0.98 is known to be affected, 0.99 fixes the
+                       problem by letting the user disable the workaround.
+
        mga=            [HW,DRM]
 
        mousedev.tap_time=
@@ -1159,6 +1197,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
        nodisconnect    [HW,SCSI,M68K] Disables SCSI disconnects.
 
+       noefi           [X86-32,X86-64] Disable EFI runtime services support.
+
        noexec          [IA-64]
 
        noexec          [X86-32,X86-64]
@@ -1169,6 +1209,8 @@ and is between 256 and 4096 characters. It is defined in the file
                        register save and restore. The kernel will only save
                        legacy floating-point registers on task switch.
 
+       noclflush       [BUGS=X86] Don't use the CLFLUSH instruction
+
        nohlt           [BUGS=ARM]
 
        no-hlt          [BUGS=X86-32] Tells the kernel that the hlt
@@ -1978,6 +2020,11 @@ and is between 256 and 4096 characters. It is defined in the file
                        vdso=1: enable VDSO (default)
                        vdso=0: disable VDSO mapping
 
+       vdso32=         [X86-32,X86-64]
+                       vdso32=2: enable compat VDSO (default with COMPAT_VDSO)
+                       vdso32=1: enable 32-bit VDSO (default)
+                       vdso32=0: disable 32-bit VDSO mapping
+
        vector=         [IA-64,SMP]
                        vector=percpu: enable percpu vector domain
 
index 9b0e322118b5402eb83868c70266f8afdf420ed1..6c8a2386cd50d150fb58844324c40fbb8685bb8c 100644 (file)
@@ -79,6 +79,9 @@ static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
 static unsigned long guest_limit, guest_max;
 
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread cpu_id;
+
 /* This is our list of devices. */
 struct device_list
 {
@@ -153,6 +156,9 @@ struct virtqueue
        void (*handle_output)(int fd, struct virtqueue *me);
 };
 
+/* Remember the arguments to the program so we can "reboot" */
+static char **main_args;
+
 /* Since guest is UP and we don't run at the same time, we don't need barriers.
  * But I include them in the code in case others copy it. */
 #define wmb()
@@ -554,7 +560,7 @@ static void wake_parent(int pipefd, int lguest_fd)
                        else
                                FD_CLR(-fd - 1, &devices.infds);
                } else /* Send LHREQ_BREAK command. */
-                       write(lguest_fd, args, sizeof(args));
+                       pwrite(lguest_fd, args, sizeof(args), cpu_id);
        }
 }
 
@@ -1489,7 +1495,9 @@ static void setup_block_file(const char *filename)
 
        /* Create stack for thread and run it */
        stack = malloc(32768);
-       if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+       /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
+        * becoming a zombie. */
+       if (clone(io_thread, stack + 32768,  CLONE_VM | SIGCHLD, dev) == -1)
                err(1, "Creating clone");
 
        /* We don't need to keep the I/O thread's end of the pipes open. */
@@ -1499,7 +1507,21 @@ static void setup_block_file(const char *filename)
        verbose("device %u: virtblock %llu sectors\n",
                devices.device_num, cap);
 }
-/* That's the end of device setup. */
+/* That's the end of device setup. :*/
+
+/* Reboot */
+static void __attribute__((noreturn)) restart_guest(void)
+{
+       unsigned int i;
+
+       /* Closing pipes causes the waker thread and io_threads to die, and
+        * closing /dev/lguest cleans up the Guest.  Since we don't track all
+        * open fds, we simply close everything beyond stderr. */
+       for (i = 3; i < FD_SETSIZE; i++)
+               close(i);
+       execv(main_args[0], main_args);
+       err(1, "Could not exec %s", main_args[0]);
+}
 
 /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
  * its input and output, and finally, lays it to rest. */
@@ -1511,7 +1533,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
                int readval;
 
                /* We read from the /dev/lguest device to run the Guest. */
-               readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
+               readval = pread(lguest_fd, &notify_addr,
+                               sizeof(notify_addr), cpu_id);
 
                /* One unsigned long means the Guest did HCALL_NOTIFY */
                if (readval == sizeof(notify_addr)) {
@@ -1521,16 +1544,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
                /* ENOENT means the Guest died.  Reading tells us why. */
                } else if (errno == ENOENT) {
                        char reason[1024] = { 0 };
-                       read(lguest_fd, reason, sizeof(reason)-1);
+                       pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
                        errx(1, "%s", reason);
+               /* ERESTART means that we need to reboot the guest */
+               } else if (errno == ERESTART) {
+                       restart_guest();
                /* EAGAIN means the Waker wanted us to look at some input.
                 * Anything else means a bug or incompatible change. */
                } else if (errno != EAGAIN)
                        err(1, "Running guest failed");
 
+               /* Only service input on thread for CPU 0. */
+               if (cpu_id != 0)
+                       continue;
+
                /* Service input, then unset the BREAK to release the Waker. */
                handle_input(lguest_fd);
-               if (write(lguest_fd, args, sizeof(args)) < 0)
+               if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
                        err(1, "Resetting break");
        }
 }
@@ -1571,6 +1601,12 @@ int main(int argc, char *argv[])
        /* If they specify an initrd file to load. */
        const char *initrd_name = NULL;
 
+       /* Save the args: we "reboot" by execing ourselves again. */
+       main_args = argv;
+       /* We don't "wait" for the children, so prevent them from becoming
+        * zombies. */
+       signal(SIGCHLD, SIG_IGN);
+
        /* First we initialize the device list.  Since console and network
         * device receive input from a file descriptor, we keep an fdset
         * (infds) and the maximum fd number (max_infd) with the head of the
@@ -1582,6 +1618,7 @@ int main(int argc, char *argv[])
        devices.lastdev = &devices.dev;
        devices.next_irq = 1;
 
+       cpu_id = 0;
        /* We need to know how much memory so we can set up the device
         * descriptor and memory pages for the devices as we parse the command
         * line.  So we quickly look through the arguments to find the amount
index 945311840a10d29be3f8c680f1124a01e8cd4bed..34abae4e94427eabb76aa074f67efc2987f3b9f6 100644 (file)
@@ -110,12 +110,18 @@ Idle loop
 
 Rebooting
 
-   reboot=b[ios] | t[riple] | k[bd] [, [w]arm | [c]old]
+   reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old]
    bios          Use the CPU reboot vector for warm reset
    warm   Don't set the cold reboot flag
    cold   Set the cold reboot flag
    triple Force a triple fault (init)
    kbd    Use the keyboard controller. cold reset (default)
+   acpi   Use the ACPI RESET_REG in the FADT. If ACPI is not configured or the
+          ACPI reset does not work, the reboot path attempts the reset using
+          the keyboard controller.
+   efi    Use efi reset_system runtime service. If EFI is not configured or the
+          EFI reset does not work, the reboot path attempts the reset using
+          the keyboard controller.
 
    Using warm reset will be much faster especially on big memory
    systems because the BIOS will not go through the memory check.
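For example (illustrative only, not part of the patch text), booting with reboot=a,w selects the new ACPI reset method together with the warm-reboot flag; if the ACPI reset is not configured or fails, the keyboard-controller path described above is still attempted.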
index 91a98edfb58824a7461ccd03d6decccfa5b27e8c..7d77120a51841aea77164a6aab2f682f10ca1a2d 100644 (file)
@@ -19,6 +19,10 @@ Mechanics:
 - Build the kernel with the following configuration.
        CONFIG_FB_EFI=y
        CONFIG_FRAMEBUFFER_CONSOLE=y
+  If EFI runtime services are expected, the following configuration should
+  be selected.
+       CONFIG_EFI=y
+       CONFIG_EFI_VARS=y or m          # optional
 - Create a VFAT partition on the disk
 - Copy the following to the VFAT partition:
        elilo bootloader with x86_64 support, elilo configuration file,
@@ -27,3 +31,8 @@ Mechanics:
        can be found in the elilo sourceforge project.
 - Boot to EFI shell and invoke elilo choosing the kernel image built
   in first step.
+- If some or all EFI runtime services don't work, you can try the following
+  kernel command line parameters to turn off some or all EFI runtime
+  services.
+       noefi           turn off all EFI runtime services
+       reboot_type=k   turn off EFI reboot runtime service
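A minimal elilo.conf entry reflecting the advice above might look like the following sketch (the image name and root device are placeholders; only the noefi parameter comes from this document):

        image=vmlinuz-2.6.24
                label=linux
                read-only
                append="root=/dev/sda2 noefi"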
index de211ac3853e90aa16017641efd295e8496cf3e9..77201d3f7479ccc65bf2e454fa310be75580a49b 100644 (file)
@@ -91,6 +91,11 @@ config GENERIC_IRQ_PROBE
        bool
        default y
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_GENERIC_SPINLOCK
        bool
        default y
index bef47725d4ad461310e181d98fb3d04aa214b061..5a41e75ae1fefd1d9358917255a3d3b22041d073 100644 (file)
@@ -42,6 +42,11 @@ config MMU
 config SWIOTLB
        bool
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_XCHGADD_ALGORITHM
        bool
        default y
@@ -75,6 +80,9 @@ config GENERIC_TIME_VSYSCALL
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool y
+
 config DMI
        bool
        default y
index 6ef9b52199304af2f472f92becd0216cbbe2e492..7661bb065fa570a4a61036acf551d5feacda94ef 100644 (file)
@@ -360,7 +360,6 @@ static struct scsi_host_template driver_template = {
        .max_sectors            = 1024,
        .cmd_per_lun            = SIMSCSI_REQ_QUEUE_LEN,
        .use_clustering         = DISABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 static int __init
index 3e35987af4584c094d36fa374a3d23f49887d92d..4f0c30c38e994c1ea188f811232ecc27f7b81be1 100644 (file)
@@ -222,7 +222,8 @@ elf32_set_personality (void)
 }
 
 static unsigned long
-elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
+elf32_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt,
+               int prot, int type, unsigned long unused)
 {
        unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK;
 
index 196287928baebcee3a1ac8a33f389510a316d371..e699eb6c44be0f788397ba0300bac77800db1767 100644 (file)
@@ -947,7 +947,7 @@ percpu_modcopy (void *pcpudst, const void *src, unsigned long size)
 {
        unsigned int i;
        for_each_possible_cpu(i) {
-               memcpy(pcpudst + __per_cpu_offset[i], src, size);
+               memcpy(pcpudst + per_cpu_offset(i), src, size);
        }
 }
 #endif /* CONFIG_SMP */
index ab9a264cb1947cb6a938c523e677f4c64873b72e..f7237c5f531e511806133f0223ecd48edb0399ab 100644 (file)
@@ -235,6 +235,11 @@ config IRAM_SIZE
 # Define implied options from the CPU selection here
 #
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_GENERIC_SPINLOCK
        bool
        depends on M32R
index 6b0f85f02c7966895656fb5a13681041e52f2394..4fad0a34b9974d31f16b5e1e6a8bb6e3c067e710 100644 (file)
@@ -694,6 +694,11 @@ source "arch/mips/vr41xx/Kconfig"
 
 endmenu
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_GENERIC_SPINLOCK
        bool
        default y
index c2d497ceffdd3249c9275157160daddbf705c45a..fc4aa07b6d35f8aa9e7542345f3b2f81de55d817 100644 (file)
@@ -24,9 +24,7 @@ DEFINE_SPINLOCK(i8253_lock);
 static void init_pit_timer(enum clock_event_mode mode,
                           struct clock_event_device *evt)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
+       spin_lock(&i8253_lock);
 
        switch(mode) {
        case CLOCK_EVT_MODE_PERIODIC:
@@ -55,7 +53,7 @@ static void init_pit_timer(enum clock_event_mode mode,
                /* Nothing to do here */
                break;
        }
-       spin_unlock_irqrestore(&i8253_lock, flags);
+       spin_unlock(&i8253_lock);
 }
 
 /*
@@ -65,12 +63,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
+       spin_lock(&i8253_lock);
        outb_p(delta & 0xff , PIT_CH0); /* LSB */
        outb(delta >> 8 , PIT_CH0);     /* MSB */
-       spin_unlock_irqrestore(&i8253_lock, flags);
+       spin_unlock(&i8253_lock);
 
        return 0;
 }
index b8ef1787a191ce81337590391067b9dda811b89d..2b649c46631c14fb07d408dd3bad2e0869a8dae6 100644 (file)
@@ -19,6 +19,11 @@ config MMU
 config STACK_GROWSUP
        def_bool y
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_GENERIC_SPINLOCK
        def_bool y
 
index 232c298c933fa3601b9fa9099b3eeee26e40d514..fb85f6b72fcfa4931f1ab14d574e32ddc3dea2dd 100644 (file)
@@ -42,6 +42,9 @@ config GENERIC_HARDIRQS
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool PPC64
+
 config IRQ_PER_CPU
        bool
        default y
@@ -53,6 +56,11 @@ config RWSEM_XCHGADD_ALGORITHM
        bool
        default y
 
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config ARCH_HAS_ILOG2_U32
        bool
        default y
index 3e17d154d0d44be624cb42f638ac4799804fa880..8b056d2295cc07ed4f4403e4683fe4cc17734f8b 100644 (file)
@@ -256,7 +256,7 @@ static int set_evrregs(struct task_struct *task, unsigned long *data)
 #endif /* CONFIG_SPE */
 
 
-static void set_single_step(struct task_struct *task)
+void user_enable_single_step(struct task_struct *task)
 {
        struct pt_regs *regs = task->thread.regs;
 
@@ -271,7 +271,7 @@ static void set_single_step(struct task_struct *task)
        set_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
-static void clear_single_step(struct task_struct *task)
+void user_disable_single_step(struct task_struct *task)
 {
        struct pt_regs *regs = task->thread.regs;
 
@@ -313,7 +313,7 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 void ptrace_disable(struct task_struct *child)
 {
        /* make sure the single step bit is not set. */
-       clear_single_step(child);
+       user_disable_single_step(child);
 }
 
 /*
@@ -445,52 +445,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
        }
 
-       case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT: { /* restart after signal. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSCALL)
-                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               else
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               child->exit_code = data;
-               /* make sure the single step bit is not set. */
-               clear_single_step(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-       }
-
-/*
- * make the child exit.  Best I can do is send it a sigkill.
- * perhaps it should be put in the status that it wants to
- * exit.
- */
-       case PTRACE_KILL: {
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               child->exit_code = SIGKILL;
-               /* make sure the single step bit is not set. */
-               clear_single_step(child);
-               wake_up_process(child);
-               break;
-       }
-
-       case PTRACE_SINGLESTEP: {  /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               set_single_step(child);
-               child->exit_code = data;
-               /* give it a chance to run. */
-               wake_up_process(child);
-               ret = 0;
-               break;
-       }
-
        case PTRACE_GET_DEBUGREG: {
                ret = -EINVAL;
                /* We only support one DABR and no IABRS at the moment */
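
In the powerpc ptrace hunks above, the file-local set_single_step()/clear_single_step() helpers become the globally visible user_enable_single_step()/user_disable_single_step() entry points, and the open-coded PTRACE_SYSCALL/PTRACE_CONT/PTRACE_KILL/PTRACE_SINGLESTEP cases are dropped because generic ptrace code can now drive resumption and single-stepping through those hooks. The sketch below is a simplified view of such a generic caller, modelled on the deleted cases above rather than the exact kernel/ptrace.c implementation.

/* Simplified sketch of the generic caller; not the exact kernel code. */
static int ptrace_resume(struct task_struct *child, long request, long data)
{
        if (!valid_signal(data))
                return -EIO;

        if (request == PTRACE_SYSCALL)
                set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
        else
                clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);

        if (request == PTRACE_SINGLESTEP)
                user_enable_single_step(child);  /* arch hook, was set_single_step() */
        else
                user_disable_single_step(child); /* arch hook, was clear_single_step() */

        child->exit_code = data;
        wake_up_process(child);
        return 0;
}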
index 19a5656001c080f8da3f3f5a9050c03a80fb1746..f0bad7070fb54087d241f6a81bf9ea875f71156b 100644 (file)
@@ -37,8 +37,6 @@
 #include <asm/iseries/hv_call_xm.h>
 #include <asm/iseries/iommu.h>
 
-extern struct kset devices_subsys; /* needed for vio_find_name() */
-
 static struct bus_type vio_bus_type;
 
 static struct vio_dev vio_bus_device  = { /* fake "parent" device */
@@ -361,19 +359,16 @@ EXPORT_SYMBOL(vio_get_attribute);
 #ifdef CONFIG_PPC_PSERIES
 /* vio_find_name() - internal because only vio.c knows how we formatted the
  * kobject name
- * XXX once vio_bus_type.devices is actually used as a kset in
- * drivers/base/bus.c, this function should be removed in favor of
- * "device_find(kobj_name, &vio_bus_type)"
  */
-static struct vio_dev *vio_find_name(const char *kobj_name)
+static struct vio_dev *vio_find_name(const char *name)
 {
-       struct kobject *found;
+       struct device *found;
 
-       found = kset_find_obj(&devices_subsys, kobj_name);
+       found = bus_find_device_by_name(&vio_bus_type, NULL, name);
        if (!found)
                return NULL;
 
-       return to_vio_dev(container_of(found, struct device, kobj));
+       return to_vio_dev(found);
 }
 
 /**
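
The vio_find_name() rewrite above stops poking at the driver core's private devices_subsys kset and instead uses bus_find_device_by_name(), which searches the devices registered on a bus and returns the matching struct device with a reference held. A small sketch of the call pattern follows; vio_dev_by_name() is an invented wrapper name standing in for the real lookup helper.

#include <linux/device.h>

/* Illustrative wrapper mirroring the new vio_find_name() above. */
static struct vio_dev *vio_dev_by_name(const char *name)
{
        struct device *dev;

        /* Third argument is the device name to match; second is a search cursor. */
        dev = bus_find_device_by_name(&vio_bus_type, NULL, name);
        if (!dev)
                return NULL;

        /* The helper takes a reference on the returned device; drop it with
         * put_device() once the caller is finished with it. */
        return to_vio_dev(dev);
}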
index 10b212a1f9f5aa046dbe1e2b5702dbcba9a3b7c0..26f5791baa33c7beb4a7264fd655310337bfd977 100644 (file)
@@ -66,6 +66,9 @@ config AUDIT_ARCH
        bool
        default y
 
+config ARCH_SETS_UP_PER_CPU_AREA
+       def_bool y
+
 config ARCH_NO_VIRT_TO_BUS
        def_bool y
 
@@ -200,6 +203,11 @@ config US2E_FREQ
          If in doubt, say N.
 
 # Global things across all Sun machines.
+config GENERIC_LOCKBREAK
+       bool
+       default y
+       depends on SMP && PREEMPT
+
 config RWSEM_GENERIC_SPINLOCK
        bool
 
index 1b388b41d95d4baee359fa21b5079f582b785501..7c7142ba3bd7d4c4980b07df559d3524d0fd78be 100644 (file)
@@ -71,10 +71,10 @@ EXPORT_SYMBOL(dump_thread);
 
 /* required for SMP */
 
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
+extern void __write_lock_failed(rwlock_t *rw);
 EXPORT_SYMBOL(__write_lock_failed);
 
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+extern void __read_lock_failed(rwlock_t *rw);
 EXPORT_SYMBOL(__read_lock_failed);
 
 #endif
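
The hunk above strips the FASTCALL() wrapper from the UML lock-failure exports. On i386 the annotation historically expanded to __attribute__((regparm(3))); since the unified arch/x86 Makefile later in this diff passes -mregparm=3 unconditionally, the attribute adds nothing and the plain prototypes suffice. The lines below approximate the old macros (from the former asm-i386 linkage header; most other architectures defined them as empty) and are shown only for context.

/* Approximate old definitions; illustrative, not a verbatim quote. */
#define FASTCALL(x)     x __attribute__((regparm(3)))
#define fastcall        __attribute__((regparm(3)))

/* With -mregparm=3 applied globally, the unadorned declaration is enough: */
extern void __write_lock_failed(rwlock_t *rw);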
index 0147227ce18dd25fcffbeac410877752f7150489..19053d46cb60190421d818435af10bac6ce2bbfb 100644 (file)
@@ -3,10 +3,10 @@
  * Licensed under the GPL
  */
 
-#include "linux/ptrace.h"
-#include "asm/unistd.h"
-#include "asm/uaccess.h"
-#include "asm/ucontext.h"
+#include <linux/ptrace.h>
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+#include <asm/ucontext.h>
 #include "frame_kern.h"
 #include "skas.h"
 
@@ -18,17 +18,17 @@ void copy_sc(struct uml_pt_regs *regs, void *from)
        REGS_FS(regs->gp) = sc->fs;
        REGS_ES(regs->gp) = sc->es;
        REGS_DS(regs->gp) = sc->ds;
-       REGS_EDI(regs->gp) = sc->edi;
-       REGS_ESI(regs->gp) = sc->esi;
-       REGS_EBP(regs->gp) = sc->ebp;
-       REGS_SP(regs->gp) = sc->esp;
-       REGS_EBX(regs->gp) = sc->ebx;
-       REGS_EDX(regs->gp) = sc->edx;
-       REGS_ECX(regs->gp) = sc->ecx;
-       REGS_EAX(regs->gp) = sc->eax;
-       REGS_IP(regs->gp) = sc->eip;
+       REGS_EDI(regs->gp) = sc->di;
+       REGS_ESI(regs->gp) = sc->si;
+       REGS_EBP(regs->gp) = sc->bp;
+       REGS_SP(regs->gp) = sc->sp;
+       REGS_EBX(regs->gp) = sc->bx;
+       REGS_EDX(regs->gp) = sc->dx;
+       REGS_ECX(regs->gp) = sc->cx;
+       REGS_EAX(regs->gp) = sc->ax;
+       REGS_IP(regs->gp) = sc->ip;
        REGS_CS(regs->gp) = sc->cs;
-       REGS_EFLAGS(regs->gp) = sc->eflags;
+       REGS_EFLAGS(regs->gp) = sc->flags;
        REGS_SS(regs->gp) = sc->ss;
 }
 
@@ -229,18 +229,18 @@ static int copy_sc_to_user(struct sigcontext __user *to,
        sc.fs = REGS_FS(regs->regs.gp);
        sc.es = REGS_ES(regs->regs.gp);
        sc.ds = REGS_DS(regs->regs.gp);
-       sc.edi = REGS_EDI(regs->regs.gp);
-       sc.esi = REGS_ESI(regs->regs.gp);
-       sc.ebp = REGS_EBP(regs->regs.gp);
-       sc.esp = sp;
-       sc.ebx = REGS_EBX(regs->regs.gp);
-       sc.edx = REGS_EDX(regs->regs.gp);
-       sc.ecx = REGS_ECX(regs->regs.gp);
-       sc.eax = REGS_EAX(regs->regs.gp);
-       sc.eip = REGS_IP(regs->regs.gp);
+       sc.di = REGS_EDI(regs->regs.gp);
+       sc.si = REGS_ESI(regs->regs.gp);
+       sc.bp = REGS_EBP(regs->regs.gp);
+       sc.sp = sp;
+       sc.bx = REGS_EBX(regs->regs.gp);
+       sc.dx = REGS_EDX(regs->regs.gp);
+       sc.cx = REGS_ECX(regs->regs.gp);
+       sc.ax = REGS_EAX(regs->regs.gp);
+       sc.ip = REGS_IP(regs->regs.gp);
        sc.cs = REGS_CS(regs->regs.gp);
-       sc.eflags = REGS_EFLAGS(regs->regs.gp);
-       sc.esp_at_signal = regs->regs.gp[UESP];
+       sc.flags = REGS_EFLAGS(regs->regs.gp);
+       sc.sp_at_signal = regs->regs.gp[UESP];
        sc.ss = regs->regs.gp[SS];
        sc.cr2 = fi->cr2;
        sc.err = fi->error_code;

index 1778d33808f4428ff971403408d22724b0aa7286..7457436b433a915c6b41d783417d2a0b40cbd171 100644 (file)
@@ -4,11 +4,11 @@
  * Licensed under the GPL
  */
 
-#include "linux/personality.h"
-#include "linux/ptrace.h"
-#include "asm/unistd.h"
-#include "asm/uaccess.h"
-#include "asm/ucontext.h"
+#include <linux/personality.h>
+#include <linux/ptrace.h>
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+#include <asm/ucontext.h>
 #include "frame_kern.h"
 #include "skas.h"
 
@@ -27,16 +27,16 @@ void copy_sc(struct uml_pt_regs *regs, void *from)
        GETREG(regs, R13, sc, r13);
        GETREG(regs, R14, sc, r14);
        GETREG(regs, R15, sc, r15);
-       GETREG(regs, RDI, sc, rdi);
-       GETREG(regs, RSI, sc, rsi);
-       GETREG(regs, RBP, sc, rbp);
-       GETREG(regs, RBX, sc, rbx);
-       GETREG(regs, RDX, sc, rdx);
-       GETREG(regs, RAX, sc, rax);
-       GETREG(regs, RCX, sc, rcx);
-       GETREG(regs, RSP, sc, rsp);
-       GETREG(regs, RIP, sc, rip);
-       GETREG(regs, EFLAGS, sc, eflags);
+       GETREG(regs, RDI, sc, di);
+       GETREG(regs, RSI, sc, si);
+       GETREG(regs, RBP, sc, bp);
+       GETREG(regs, RBX, sc, bx);
+       GETREG(regs, RDX, sc, dx);
+       GETREG(regs, RAX, sc, ax);
+       GETREG(regs, RCX, sc, cx);
+       GETREG(regs, RSP, sc, sp);
+       GETREG(regs, RIP, sc, ip);
+       GETREG(regs, EFLAGS, sc, flags);
        GETREG(regs, CS, sc, cs);
 
 #undef GETREG
@@ -61,16 +61,16 @@ static int copy_sc_from_user(struct pt_regs *regs,
        err |= GETREG(regs, R13, from, r13);
        err |= GETREG(regs, R14, from, r14);
        err |= GETREG(regs, R15, from, r15);
-       err |= GETREG(regs, RDI, from, rdi);
-       err |= GETREG(regs, RSI, from, rsi);
-       err |= GETREG(regs, RBP, from, rbp);
-       err |= GETREG(regs, RBX, from, rbx);
-       err |= GETREG(regs, RDX, from, rdx);
-       err |= GETREG(regs, RAX, from, rax);
-       err |= GETREG(regs, RCX, from, rcx);
-       err |= GETREG(regs, RSP, from, rsp);
-       err |= GETREG(regs, RIP, from, rip);
-       err |= GETREG(regs, EFLAGS, from, eflags);
+       err |= GETREG(regs, RDI, from, di);
+       err |= GETREG(regs, RSI, from, si);
+       err |= GETREG(regs, RBP, from, bp);
+       err |= GETREG(regs, RBX, from, bx);
+       err |= GETREG(regs, RDX, from, dx);
+       err |= GETREG(regs, RAX, from, ax);
+       err |= GETREG(regs, RCX, from, cx);
+       err |= GETREG(regs, RSP, from, sp);
+       err |= GETREG(regs, RIP, from, ip);
+       err |= GETREG(regs, EFLAGS, from, flags);
        err |= GETREG(regs, CS, from, cs);
        if (err)
                return 1;
@@ -108,19 +108,19 @@ static int copy_sc_to_user(struct sigcontext __user *to,
        __put_user((regs)->regs.gp[(regno) / sizeof(unsigned long)],    \
                   &(sc)->regname)
 
-       err |= PUTREG(regs, RDI, to, rdi);
-       err |= PUTREG(regs, RSI, to, rsi);
-       err |= PUTREG(regs, RBP, to, rbp);
+       err |= PUTREG(regs, RDI, to, di);
+       err |= PUTREG(regs, RSI, to, si);
+       err |= PUTREG(regs, RBP, to, bp);
        /*
         * Must use original RSP, which is passed in, rather than what's in
         * the pt_regs, because that's already been updated to point at the
         * signal frame.
         */
-       err |= __put_user(sp, &to->rsp);
-       err |= PUTREG(regs, RBX, to, rbx);
-       err |= PUTREG(regs, RDX, to, rdx);
-       err |= PUTREG(regs, RCX, to, rcx);
-       err |= PUTREG(regs, RAX, to, rax);
+       err |= __put_user(sp, &to->sp);
+       err |= PUTREG(regs, RBX, to, bx);
+       err |= PUTREG(regs, RDX, to, dx);
+       err |= PUTREG(regs, RCX, to, cx);
+       err |= PUTREG(regs, RAX, to, ax);
        err |= PUTREG(regs, R8, to, r8);
        err |= PUTREG(regs, R9, to, r9);
        err |= PUTREG(regs, R10, to, r10);
@@ -135,8 +135,8 @@ static int copy_sc_to_user(struct sigcontext __user *to,
        err |= __put_user(fi->error_code, &to->err);
        err |= __put_user(fi->trap_no, &to->trapno);
 
-       err |= PUTREG(regs, RIP, to, rip);
-       err |= PUTREG(regs, EFLAGS, to, eflags);
+       err |= PUTREG(regs, RIP, to, ip);
+       err |= PUTREG(regs, EFLAGS, to, flags);
 #undef PUTREG
 
        err |= __put_user(mask, &to->oldmask);
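
Both UML signal paths above are adjusted for the unified x86 struct sigcontext, whose register fields drop their e/r prefixes (edi becomes di, rsp becomes sp, eflags becomes flags, and so on), so the same member names now work for 32-bit and 64-bit builds. The snippet below is an abbreviated, illustrative view of that naming scheme, not the complete unified sigcontext header.

/* Abbreviated and illustrative; the real header contains more fields. */
struct demo_sigcontext {
        unsigned long di, si, bp, sp, bx, dx, cx, ax;
        unsigned long ip, flags;
};

/* Helpers like this can now be written once for both word sizes. */
static unsigned long demo_saved_stack_pointer(const struct demo_sigcontext *sc)
{
        return sc->sp;  /* previously sc->esp on i386, sc->rsp on x86_64 */
}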
index 80b7ba4056dbbb566841c1e1cbef9475730fe199..65b449134cf7b15cbc2bdf66993b813fa6f409f0 100644 (file)
@@ -17,81 +17,69 @@ config X86_64
 
 ### Arch settings
 config X86
-       bool
-       default y
+       def_bool y
+
+config GENERIC_LOCKBREAK
+       def_bool n
 
 config GENERIC_TIME
-       bool
-       default y
+       def_bool y
 
 config GENERIC_CMOS_UPDATE
-       bool
-       default y
+       def_bool y
 
 config CLOCKSOURCE_WATCHDOG
-       bool
-       default y
+       def_bool y
 
 config GENERIC_CLOCKEVENTS
-       bool
-       default y
+       def_bool y
 
 config GENERIC_CLOCKEVENTS_BROADCAST
-       bool
-       default y
+       def_bool y
        depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
 
 config LOCKDEP_SUPPORT
-       bool
-       default y
+       def_bool y
 
 config STACKTRACE_SUPPORT
-       bool
-       default y
+       def_bool y
 
 config SEMAPHORE_SLEEPERS
-       bool
-       default y
+       def_bool y
 
 config MMU
-       bool
-       default y
+       def_bool y
 
 config ZONE_DMA
-       bool
-       default y
+       def_bool y
 
 config QUICKLIST
-       bool
-       default X86_32
+       def_bool X86_32
 
 config SBUS
        bool
 
 config GENERIC_ISA_DMA
-       bool
-       default y
+       def_bool y
 
 config GENERIC_IOMAP
-       bool
-       default y
+       def_bool y
 
 config GENERIC_BUG
-       bool
-       default y
+       def_bool y
        depends on BUG
 
 config GENERIC_HWEIGHT
-       bool
-       default y
+       def_bool y
+
+config GENERIC_GPIO
+       def_bool n
 
 config ARCH_MAY_HAVE_PC_FDC
-       bool
-       default y
+       def_bool y
 
 config DMI
-       bool
-       default y
+       def_bool y
 
 config RWSEM_GENERIC_SPINLOCK
        def_bool !X86_XADD
@@ -112,10 +100,14 @@ config GENERIC_TIME_VSYSCALL
        bool
        default X86_64
 
+config HAVE_SETUP_PER_CPU_AREA
+       def_bool X86_64
+
 config ARCH_SUPPORTS_OPROFILE
        bool
        default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
        bool
@@ -144,9 +136,17 @@ config GENERIC_PENDING_IRQ
 
 config X86_SMP
        bool
-       depends on X86_32 && SMP && !X86_VOYAGER
+       depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
        default y
 
+config X86_32_SMP
+       def_bool y
+       depends on X86_32 && SMP
+
+config X86_64_SMP
+       def_bool y
+       depends on X86_64 && SMP
+
 config X86_HT
        bool
        depends on SMP
@@ -292,6 +292,18 @@ config X86_ES7000
          Only choose this option if you have such a system, otherwise you
          should say N here.
 
+config X86_RDC321X
+       bool "RDC R-321x SoC"
+       depends on X86_32
+       select M486
+       select X86_REBOOTFIXUPS
+       select GENERIC_GPIO
+       select LEDS_GPIO
+       help
+         This option is needed for RDC R-321x system-on-chip, also known
+         as R-8610-(G).
+         If you don't have one of these chips, you should say N here.
+
 config X86_VSMP
        bool "Support for ScaleMP vSMP"
        depends on X86_64 && PCI
@@ -303,8 +315,8 @@ config X86_VSMP
 endchoice
 
 config SCHED_NO_NO_OMIT_FRAME_POINTER
-       bool "Single-depth WCHAN output"
-       default y
+       def_bool y
+       prompt "Single-depth WCHAN output"
        depends on X86_32
        help
          Calculate simpler /proc/<PID>/wchan values. If this option
@@ -314,18 +326,8 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
 
          If in doubt, say "Y".
 
-config PARAVIRT
-       bool
-       depends on X86_32 && !(X86_VISWS || X86_VOYAGER)
-       help
-         This changes the kernel so it can modify itself when it is run
-         under a hypervisor, potentially improving performance significantly
-         over full virtualization.  However, when run without a hypervisor
-         the kernel is theoretically slower and slightly larger.
-
 menuconfig PARAVIRT_GUEST
        bool "Paravirtualized guest support"
-       depends on X86_32
        help
          Say Y here to get to see options related to running Linux under
          various hypervisors.  This option alone does not add any kernel code.
@@ -339,6 +341,7 @@ source "arch/x86/xen/Kconfig"
 config VMI
        bool "VMI Guest support"
        select PARAVIRT
+       depends on X86_32
        depends on !(X86_VISWS || X86_VOYAGER)
        help
          VMI provides a paravirtualized interface to the VMware ESX server
@@ -348,40 +351,43 @@ config VMI
 
 source "arch/x86/lguest/Kconfig"
 
+config PARAVIRT
+       bool "Enable paravirtualization code"
+       depends on !(X86_VISWS || X86_VOYAGER)
+       help
+         This changes the kernel so it can modify itself when it is run
+         under a hypervisor, potentially improving performance significantly
+         over full virtualization.  However, when run without a hypervisor
+         the kernel is theoretically slower and slightly larger.
+
 endif
 
 config ACPI_SRAT
-       bool
-       default y
+       def_bool y
        depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
        select ACPI_NUMA
 
 config HAVE_ARCH_PARSE_SRAT
-       bool
-       default y
-       depends on ACPI_SRAT
+       def_bool y
+       depends on ACPI_SRAT
 
 config X86_SUMMIT_NUMA
-       bool
-       default y
+       def_bool y
        depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
 
 config X86_CYCLONE_TIMER
-       bool
-       default y
+       def_bool y
        depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
 
 config ES7000_CLUSTERED_APIC
-       bool
-       default y
+       def_bool y
        depends on SMP && X86_ES7000 && MPENTIUMIII
 
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
-       bool
+       def_bool X86_64
        prompt "HPET Timer Support" if X86_32
-       default X86_64
        help
          Use the IA-PC HPET (High Precision Event Timer) to manage
          time in preference to the PIT and RTC, if a HPET is
@@ -399,9 +405,8 @@ config HPET_TIMER
          Choose N to continue using the legacy 8254 timer.
 
 config HPET_EMULATE_RTC
-       bool
-       depends on HPET_TIMER && RTC=y
-       default y
+       def_bool y
+       depends on HPET_TIMER && (RTC=y || RTC=m)
 
 # Mark as embedded because too many people got it wrong.
 # The code disables itself when not needed.
@@ -441,8 +446,8 @@ config CALGARY_IOMMU
          If unsure, say Y.
 
 config CALGARY_IOMMU_ENABLED_BY_DEFAULT
-       bool "Should Calgary be enabled by default?"
-       default y
+       def_bool y
+       prompt "Should Calgary be enabled by default?"
        depends on CALGARY_IOMMU
        help
          Should Calgary be enabled by default? if you choose 'y', Calgary
@@ -486,9 +491,9 @@ config SCHED_SMT
          N here.
 
 config SCHED_MC
-       bool "Multi-core scheduler support"
+       def_bool y
+       prompt "Multi-core scheduler support"
        depends on (X86_64 && SMP) || (X86_32 && X86_HT)
-       default y
        help
          Multi-core scheduler support improves the CPU scheduler's decision
          making when dealing with multi-core CPU chips at a cost of slightly
@@ -522,19 +527,16 @@ config X86_UP_IOAPIC
          an IO-APIC, then the kernel will still run with no slowdown at all.
 
 config X86_LOCAL_APIC
-       bool
+       def_bool y
        depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
-       default y
 
 config X86_IO_APIC
-       bool
+       def_bool y
        depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
-       default y
 
 config X86_VISWS_APIC
-       bool
+       def_bool y
        depends on X86_32 && X86_VISWS
-       default y
 
 config X86_MCE
        bool "Machine Check Exception"
@@ -554,17 +556,17 @@ config X86_MCE
          the 386 and 486, so nearly everyone can say Y here.
 
 config X86_MCE_INTEL
-       bool "Intel MCE features"
+       def_bool y
+       prompt "Intel MCE features"
        depends on X86_64 && X86_MCE && X86_LOCAL_APIC
-       default y
        help
           Additional support for intel specific MCE features such as
           the thermal monitor.
 
 config X86_MCE_AMD
-       bool "AMD MCE features"
+       def_bool y
+       prompt "AMD MCE features"
        depends on X86_64 && X86_MCE && X86_LOCAL_APIC
-       default y
        help
           Additional support for AMD specific MCE features such as
           the DRAM Error Threshold.
@@ -637,9 +639,9 @@ config I8K
          Say N otherwise.
 
 config X86_REBOOTFIXUPS
-       bool "Enable X86 board specific fixups for reboot"
+       def_bool n
+       prompt "Enable X86 board specific fixups for reboot"
        depends on X86_32 && X86
-       default n
        ---help---
          This enables chipset and/or board specific fixups to be done
          in order to get reboot to work correctly. This is only needed on
@@ -648,7 +650,7 @@ config X86_REBOOTFIXUPS
          system.
 
          Currently, the only fixup is for the Geode machines using
-         CS5530A and CS5536 chipsets.
+         CS5530A and CS5536 chipsets and the RDC R-321x SoC.
 
          Say Y if you want to enable the fixup. Currently, it's safe to
          enable this option even if you don't need it.
@@ -672,9 +674,8 @@ config MICROCODE
          module will be called microcode.
 
 config MICROCODE_OLD_INTERFACE
-       bool
+       def_bool y
        depends on MICROCODE
-       default y
 
 config X86_MSR
        tristate "/dev/cpu/*/msr - Model-specific register support"
@@ -798,13 +799,12 @@ config PAGE_OFFSET
        depends on X86_32
 
 config HIGHMEM
-       bool
+       def_bool y
        depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
-       default y
 
 config X86_PAE
-       bool "PAE (Physical Address Extension) Support"
-       default n
+       def_bool n
+       prompt "PAE (Physical Address Extension) Support"
        depends on X86_32 && !HIGHMEM4G
        select RESOURCES_64BIT
        help
@@ -836,10 +836,10 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
        depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
 config K8_NUMA
-       bool "Old style AMD Opteron NUMA detection"
-       depends on X86_64 && NUMA && PCI
-       default y
-       help
+       def_bool y
+       prompt "Old style AMD Opteron NUMA detection"
+       depends on X86_64 && NUMA && PCI
+       help
         Enable K8 NUMA node topology detection.  You should say Y here if
         you have a multi processor AMD K8 system. This uses an old
         method to read the NUMA configuration directly from the builtin
@@ -847,10 +847,10 @@ config K8_NUMA
         instead, which also takes priority if both are compiled in.
 
 config X86_64_ACPI_NUMA
-       bool "ACPI NUMA detection"
+       def_bool y
+       prompt "ACPI NUMA detection"
        depends on X86_64 && NUMA && ACPI && PCI
        select ACPI_NUMA
-       default y
        help
          Enable ACPI SRAT based node topology detection.
 
@@ -864,52 +864,53 @@ config NUMA_EMU
 
 config NODES_SHIFT
        int
+       range 1 15  if X86_64
        default "6" if X86_64
        default "4" if X86_NUMAQ
        default "3"
        depends on NEED_MULTIPLE_NODES
 
 config HAVE_ARCH_BOOTMEM_NODE
-       bool
+       def_bool y
        depends on X86_32 && NUMA
-       default y
 
 config ARCH_HAVE_MEMORY_PRESENT
-       bool
+       def_bool y
        depends on X86_32 && DISCONTIGMEM
-       default y
 
 config NEED_NODE_MEMMAP_SIZE
-       bool
+       def_bool y
        depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-       default y
 
 config HAVE_ARCH_ALLOC_REMAP
-       bool
+       def_bool y
        depends on X86_32 && NUMA
-       default y
 
 config ARCH_FLATMEM_ENABLE
        def_bool y
-       depends on (X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC) || (X86_64 && !NUMA)
+       depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
        def_bool y
-       depends on NUMA
+       depends on NUMA && X86_32
 
 config ARCH_DISCONTIGMEM_DEFAULT
        def_bool y
-       depends on NUMA
+       depends on NUMA && X86_32
+
+config ARCH_SPARSEMEM_DEFAULT
+       def_bool y
+       depends on X86_64
 
 config ARCH_SPARSEMEM_ENABLE
        def_bool y
-       depends on NUMA || (EXPERIMENTAL && (X86_PC || X86_64))
+       depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
        select SPARSEMEM_STATIC if X86_32
        select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
-       depends on X86_32 && ARCH_SPARSEMEM_ENABLE
+       depends on ARCH_SPARSEMEM_ENABLE
 
 config ARCH_MEMORY_PROBE
        def_bool X86_64
@@ -987,42 +988,32 @@ config MTRR
          See <file:Documentation/mtrr.txt> for more information.
 
 config EFI
-       bool "Boot from EFI support"
-       depends on X86_32 && ACPI
-       default n
+       def_bool n
+       prompt "EFI runtime service support"
+       depends on ACPI
        ---help---
-       This enables the kernel to boot on EFI platforms using
-       system configuration information passed to it from the firmware.
-       This also enables the kernel to use any EFI runtime services that are
+       This enables the kernel to use EFI runtime services that are
        available (such as the EFI variable services).
 
-       This option is only useful on systems that have EFI firmware
-       and will result in a kernel image that is ~8k larger.  In addition,
-       you must use the latest ELILO loader available at
-       <http://elilo.sourceforge.net> in order to take advantage of
-       kernel initialization using EFI information (neither GRUB nor LILO know
-       anything about EFI).  However, even with this option, the resultant
-       kernel should continue to boot on existing non-EFI platforms.
+       This option is only useful on systems that have EFI firmware.
+       In addition, you should use the latest ELILO loader available
+       at <http://elilo.sourceforge.net> in order to take advantage
+       of EFI runtime services. However, even with this option, the
+       resultant kernel should continue to boot on existing non-EFI
+       platforms.
 
 config IRQBALANCE
-       bool "Enable kernel irq balancing"
+       def_bool y
+       prompt "Enable kernel irq balancing"
        depends on X86_32 && SMP && X86_IO_APIC
-       default y
        help
          The default yes will allow the kernel to do irq load balancing.
          Saying no will keep the kernel from doing irq load balancing.
 
-# turning this on wastes a bunch of space.
-# Summit needs it only when NUMA is on
-config BOOT_IOREMAP
-       bool
-       depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
-       default y
-
 config SECCOMP
-       bool "Enable seccomp to safely compute untrusted bytecode"
+       def_bool y
+       prompt "Enable seccomp to safely compute untrusted bytecode"
        depends on PROC_FS
-       default y
        help
          This kernel feature is useful for number crunching applications
          that may need to compute untrusted bytecode during their
@@ -1189,11 +1180,11 @@ config HOTPLUG_CPU
          suspend.
 
 config COMPAT_VDSO
-       bool "Compat VDSO support"
-       default y
-       depends on X86_32
+       def_bool y
+       prompt "Compat VDSO support"
+       depends on X86_32 || IA32_EMULATION
        help
-         Map the VDSO to the predictable old-style address too.
+         Map the 32-bit VDSO to the predictable old-style address too.
        ---help---
          Say N here if you are running a sufficiently recent glibc
          version (2.3.3 or later), to remove the high-mapped
@@ -1207,30 +1198,26 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
        def_bool y
        depends on X86_64 || (X86_32 && HIGHMEM)
 
-config MEMORY_HOTPLUG_RESERVE
-       def_bool X86_64
-       depends on (MEMORY_HOTPLUG && DISCONTIGMEM)
-
 config HAVE_ARCH_EARLY_PFN_TO_NID
        def_bool X86_64
        depends on NUMA
 
-config OUT_OF_LINE_PFN_TO_PAGE
-       def_bool X86_64
-       depends on DISCONTIGMEM
-
 menu "Power management options"
        depends on !X86_VOYAGER
 
 config ARCH_HIBERNATION_HEADER
-       bool
+       def_bool y
        depends on X86_64 && HIBERNATION
-       default y
 
 source "kernel/power/Kconfig"
 
 source "drivers/acpi/Kconfig"
 
+config X86_APM_BOOT
+       bool
+       default y
+       depends on APM || APM_MODULE
+
 menuconfig APM
        tristate "APM (Advanced Power Management) BIOS support"
        depends on X86_32 && PM_SLEEP && !X86_VISWS
@@ -1371,7 +1358,7 @@ menu "Bus options (PCI etc.)"
 config PCI
        bool "PCI support" if !X86_VISWS
        depends on !X86_VOYAGER
-       default y if X86_VISWS
+       default y
        select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
        help
          Find out whether you have a PCI motherboard. PCI is the name of a
@@ -1418,25 +1405,21 @@ config PCI_GOANY
 endchoice
 
 config PCI_BIOS
-       bool
+       def_bool y
        depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
-       default y
 
 # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
 config PCI_DIRECT
-       bool
+       def_bool y
        depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS)
-       default y
 
 config PCI_MMCONFIG
-       bool
+       def_bool y
        depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
-       default y
 
 config PCI_DOMAINS
-       bool
+       def_bool y
        depends on PCI
-       default y
 
 config PCI_MMCONFIG
        bool "Support mmconfig PCI config space access"
@@ -1453,9 +1436,9 @@ config DMAR
          remapping devices.
 
 config DMAR_GFX_WA
-       bool "Support for Graphics workaround"
+       def_bool y
+       prompt "Support for Graphics workaround"
        depends on DMAR
-       default y
        help
         Current Graphics drivers tend to use physical address
         for DMA and avoid using DMA APIs. Setting this config
@@ -1464,9 +1447,8 @@ config DMAR_GFX_WA
         to use physical addresses for DMA.
 
 config DMAR_FLOPPY_WA
-       bool
+       def_bool y
        depends on DMAR
-       default y
        help
        Floppy disk drivers are known to bypass DMA API calls
         thereby failing to work when IOMMU is enabled. This
@@ -1479,8 +1461,7 @@ source "drivers/pci/Kconfig"
 
 # x86_64 have no ISA slots, but do have ISA-style DMA.
 config ISA_DMA_API
-       bool
-       default y
+       def_bool y
 
 if X86_32
 
@@ -1546,9 +1527,9 @@ config SCx200HR_TIMER
          other workaround is idle=poll boot option.
 
 config GEODE_MFGPT_TIMER
-       bool "Geode Multi-Function General Purpose Timer (MFGPT) events"
+       def_bool y
+       prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
        depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
-       default y
        help
          This driver provides a clock event source based on the MFGPT
          timer(s) in the CS5535 and CS5536 companion chip for the geode.
@@ -1575,6 +1556,7 @@ source "fs/Kconfig.binfmt"
 config IA32_EMULATION
        bool "IA32 Emulation"
        depends on X86_64
+       select COMPAT_BINFMT_ELF
        help
          Include code to run 32-bit programs under a 64-bit kernel. You should
          likely turn this on, unless you're 100% sure that you don't have any
@@ -1587,18 +1569,16 @@ config IA32_AOUT
          Support old a.out binaries in the 32bit emulation.
 
 config COMPAT
-       bool
+       def_bool y
        depends on IA32_EMULATION
-       default y
 
 config COMPAT_FOR_U64_ALIGNMENT
        def_bool COMPAT
        depends on X86_64
 
 config SYSVIPC_COMPAT
-       bool
+       def_bool y
        depends on X86_64 && COMPAT && SYSVIPC
-       default y
 
 endmenu
 
@@ -1619,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
index c30162202dc4b10f767ee71728c13769369d1fae..e09a6b73a1aab5c3fc6a353e22af4936b047416f 100644 (file)
@@ -219,10 +219,10 @@ config MGEODEGX1
          Select this for a Geode GX1 (Cyrix MediaGX) chip.
 
 config MGEODE_LX
-       bool "Geode GX/LX"
+       bool "Geode GX/LX"
        depends on X86_32
-       help
-         Select this for AMD Geode GX and LX processors.
+       help
+         Select this for AMD Geode GX and LX processors.
 
 config MCYRIXIII
        bool "CyrixIII/VIA-C3"
@@ -258,7 +258,7 @@ config MPSC
          Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
          Xeon CPUs with Intel 64bit which is compatible with x86-64.
          Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
-          Netburst core and shouldn't use this option. You can distinguish them
+         Netburst core and shouldn't use this option. You can distinguish them
          using the cpu family field
          in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
@@ -317,81 +317,75 @@ config X86_L1_CACHE_SHIFT
        default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
 
 config X86_XADD
-       bool
+       def_bool y
        depends on X86_32 && !M386
-       default y
 
 config X86_PPRO_FENCE
-       bool
+       bool "PentiumPro memory ordering errata workaround"
        depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
-       default y
+       help
+         Old PentiumPro multiprocessor systems had errata that could cause memory
+         operations to violate the x86 ordering standard in rare cases. Enabling this
+         option will attempt to work around some (but not all) occurrences of
+         this problem, at the cost of much heavier spinlock and memory barrier
+         operations.
+
+         If unsure, say n here. Even distro kernels should think twice before enabling
+         this: there are few affected systems, and the bug is unlikely to trigger.
 
 config X86_F00F_BUG
-       bool
+       def_bool y
        depends on M586MMX || M586TSC || M586 || M486 || M386
-       default y
 
 config X86_WP_WORKS_OK
-       bool
+       def_bool y
        depends on X86_32 && !M386
-       default y
 
 config X86_INVLPG
-       bool
+       def_bool y
        depends on X86_32 && !M386
-       default y
 
 config X86_BSWAP
-       bool
+       def_bool y
        depends on X86_32 && !M386
-       default y
 
 config X86_POPAD_OK
-       bool
+       def_bool y
        depends on X86_32 && !M386
-       default y
 
 config X86_ALIGNMENT_16
-       bool
+       def_bool y
        depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
-       default y
 
 config X86_GOOD_APIC
-       bool
+       def_bool y
        depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
-       default y
 
 config X86_INTEL_USERCOPY
-       bool
+       def_bool y
        depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
-       default y
 
 config X86_USE_PPRO_CHECKSUM
-       bool
+       def_bool y
        depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
-       default y
 
 config X86_USE_3DNOW
-       bool
+       def_bool y
        depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-       default y
 
 config X86_OOSTORE
-       bool
+       def_bool y
        depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
-       default y
 
 config X86_TSC
-       bool
+       def_bool y
        depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
-       default y
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
-       bool
+       def_bool y
        depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
-       default y
 
 config X86_MINIMUM_CPU_FAMILY
        int
@@ -399,3 +393,6 @@ config X86_MINIMUM_CPU_FAMILY
        default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
        default "3"
 
+config X86_DEBUGCTLMSR
+       def_bool y
+       depends on !(M586MMX || M586TSC || M586 || M486 || M386)
index 761ca7b5f120e6cb0d52d2352dc4bb26d23b74e3..2e1e3af28c3a2d5c455b048a330b720a74d1f8bf 100644 (file)
@@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT
 source "lib/Kconfig.debug"
 
 config EARLY_PRINTK
-       bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32
+       bool "Early printk" if EMBEDDED
        default y
        help
          Write kernel log output directly into the VGA buffer or to a serial
@@ -40,22 +40,49 @@ comment "Page alloc debug is incompatible with Software Suspend on i386"
 
 config DEBUG_PAGEALLOC
        bool "Debug page memory allocations"
-       depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS
-       depends on X86_32
+       depends on DEBUG_KERNEL && X86_32
        help
          Unmap pages from the kernel linear mapping after free_pages().
          This results in a large slowdown, but helps to find certain types
          of memory corruptions.
 
+config DEBUG_PER_CPU_MAPS
+       bool "Debug access to per_cpu maps"
+       depends on DEBUG_KERNEL
+       depends on X86_64_SMP
+       default n
+       help
+         Say Y to verify that the per_cpu map being accessed has
+         been set up.  Adds a fair amount of code to kernel memory
+         and decreases performance.
+
+         Say N if unsure.
+
 config DEBUG_RODATA
        bool "Write protect kernel read-only data structures"
+       default y
        depends on DEBUG_KERNEL
        help
          Mark the kernel read-only data as write-protected in the pagetables,
          in order to catch accidental (and incorrect) writes to such const
-         data. This option may have a slight performance impact because a
-         portion of the kernel code won't be covered by a 2MB TLB anymore.
-         If in doubt, say "N".
+         data. This is recommended so that we can catch kernel bugs sooner.
+         If in doubt, say "Y".
+
+config DEBUG_RODATA_TEST
+       bool "Testcase for the DEBUG_RODATA feature"
+       depends on DEBUG_RODATA
+       help
+         This option enables a testcase for the DEBUG_RODATA
+         feature as well as for the change_page_attr() infrastructure.
+         If in doubt, say "N"
+
+config DEBUG_NX_TEST
+       tristate "Testcase for the NX non-executable stack feature"
+       depends on DEBUG_KERNEL && m
+       help
+         This option enables a testcase for the CPU NX capability
+         and the software setup of this feature.
+         If in doubt, say "N"
 
 config 4KSTACKS
        bool "Use 4Kb for kernel stacks instead of 8Kb"
@@ -75,8 +102,7 @@ config X86_FIND_SMP_CONFIG
 
 config X86_MPPARSE
        def_bool y
-       depends on X86_LOCAL_APIC && !X86_VISWS
-       depends on X86_32
+       depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
 
 config DOUBLEFAULT
        default y
@@ -112,4 +138,91 @@ config IOMMU_LEAK
          Add a simple leak tracer to the IOMMU code. This is useful when you
          are debugging a buggy device driver that leaks IOMMU mappings.
 
+#
+# IO delay types:
+#
+
+config IO_DELAY_TYPE_0X80
+       int
+       default "0"
+
+config IO_DELAY_TYPE_0XED
+       int
+       default "1"
+
+config IO_DELAY_TYPE_UDELAY
+       int
+       default "2"
+
+config IO_DELAY_TYPE_NONE
+       int
+       default "3"
+
+choice
+       prompt "IO delay type"
+       default IO_DELAY_0XED
+
+config IO_DELAY_0X80
+       bool "port 0x80 based port-IO delay [recommended]"
+       help
+         This is the traditional Linux IO delay used for in/out_p.
+         It is the most tested hence safest selection here.
+
+config IO_DELAY_0XED
+       bool "port 0xed based port-IO delay"
+       help
+         Use port 0xed as the IO delay. This frees up port 0x80 which is
+         often used as a hardware-debug port.
+
+config IO_DELAY_UDELAY
+       bool "udelay based port-IO delay"
+       help
+         Use udelay(2) as the IO delay method. This provides the delay
+         while not having any side-effect on the IO port space.
+
+config IO_DELAY_NONE
+       bool "no port-IO delay"
+       help
+         No port-IO delay. Will break on old boxes that require port-IO
+         delay for certain operations. Should work on most new machines.
+
+endchoice
+
+if IO_DELAY_0X80
+config DEFAULT_IO_DELAY_TYPE
+       int
+       default IO_DELAY_TYPE_0X80
+endif
+
+if IO_DELAY_0XED
+config DEFAULT_IO_DELAY_TYPE
+       int
+       default IO_DELAY_TYPE_0XED
+endif
+
+if IO_DELAY_UDELAY
+config DEFAULT_IO_DELAY_TYPE
+       int
+       default IO_DELAY_TYPE_UDELAY
+endif
+
+if IO_DELAY_NONE
+config DEFAULT_IO_DELAY_TYPE
+       int
+       default IO_DELAY_TYPE_NONE
+endif
+
+config DEBUG_BOOT_PARAMS
+       bool "Debug boot parameters"
+       depends on DEBUG_KERNEL
+       depends on DEBUG_FS
+       help
+         This option will cause struct boot_params to be exported via debugfs.
+
+config CPA_DEBUG
+       bool "CPA self test code"
+       depends on DEBUG_KERNEL
+       help
+         Do change_page_attr self tests at boot.
+
 endmenu
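
The new IO-delay section above encodes each strategy as a small integer (IO_DELAY_TYPE_0X80, IO_DELAY_TYPE_0XED, IO_DELAY_TYPE_UDELAY, IO_DELAY_TYPE_NONE) and records the build-time choice in DEFAULT_IO_DELAY_TYPE. Below is a hedged sketch of a dispatcher keyed on those constants; the io_delay_type variable and the native_io_delay() name are plausible guesses at the runtime side, not quotes from the actual implementation.

#include <linux/delay.h>

/* CONFIG_IO_DELAY_TYPE_* and CONFIG_DEFAULT_IO_DELAY_TYPE come from the
 * Kconfig symbols added above; everything else here is illustrative. */
static int io_delay_type = CONFIG_DEFAULT_IO_DELAY_TYPE;

void native_io_delay(void)
{
        switch (io_delay_type) {
        case CONFIG_IO_DELAY_TYPE_0X80:
                asm volatile ("outb %%al, $0x80" : : : "memory"); /* traditional dummy write */
                break;
        case CONFIG_IO_DELAY_TYPE_0XED:
                asm volatile ("outb %%al, $0xed" : : : "memory"); /* keeps 0x80 free for debug cards */
                break;
        case CONFIG_IO_DELAY_TYPE_UDELAY:
                udelay(2);      /* no side effects on the port space */
                break;
        case CONFIG_IO_DELAY_TYPE_NONE:
        default:
                break;          /* rely on the hardware not needing a delay */
        }
}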
index 7aa1dc6d67c88f73067ff4be2ab6bf7ee204c9c1..da8f4129780bd8d25801eeee7742986ec92dddbc 100644 (file)
@@ -7,13 +7,254 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
-# No need to remake these files
-$(srctree)/arch/x86/Makefile%: ;
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
+# BITS is used as extension for files which are available in a 32 bit
+# and a 64 bit version to simplify shared Makefiles.
+# e.g.: obj-y += foo_$(BITS).o
+export BITS
 
 ifeq ($(CONFIG_X86_32),y)
+        BITS := 32
         UTS_MACHINE := i386
-        include $(srctree)/arch/x86/Makefile_32
+        CHECKFLAGS += -D__i386__
+
+        biarch := $(call cc-option,-m32)
+        KBUILD_AFLAGS += $(biarch)
+        KBUILD_CFLAGS += $(biarch)
+
+        ifdef CONFIG_RELOCATABLE
+                LDFLAGS_vmlinux := --emit-relocs
+        endif
+
+        KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
+
+        # prevent gcc from keeping the stack 16 byte aligned
+        KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
+
+        # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
+        # a lot more stack due to the lack of sharing of stacklots:
+        KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
+                echo $(call cc-option,-fno-unit-at-a-time); fi ;)
+
+        # CPU-specific tuning. Anything which can be shared with UML should go here.
+        include $(srctree)/arch/x86/Makefile_32.cpu
+        KBUILD_CFLAGS += $(cflags-y)
+
+        # temporary until string.h is fixed
+        KBUILD_CFLAGS += -ffreestanding
 else
+        BITS := 64
         UTS_MACHINE := x86_64
-        include $(srctree)/arch/x86/Makefile_64
+        CHECKFLAGS += -D__x86_64__ -m64
+
+        KBUILD_AFLAGS += -m64
+        KBUILD_CFLAGS += -m64
+
+        # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
+        cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+        cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+
+        cflags-$(CONFIG_MCORE2) += \
+                $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
+        cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
+        KBUILD_CFLAGS += $(cflags-y)
+
+        KBUILD_CFLAGS += -mno-red-zone
+        KBUILD_CFLAGS += -mcmodel=kernel
+
+        # -funit-at-a-time shrinks the kernel .text considerably
+        # unfortunately it makes reading oopses harder.
+        KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
+
+        # this works around some issues with generating unwind tables in older gccs
+        # newer gccs do it by default
+        KBUILD_CFLAGS += -maccumulate-outgoing-args
+
+        stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
+        stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
+                "$(CC)" -fstack-protector )
+        stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
+                "$(CC)" -fstack-protector-all )
+
+        KBUILD_CFLAGS += $(stackp-y)
 endif
+
+# Stack pointer is addressed differently for 32 bit and 64 bit x86
+sp-$(CONFIG_X86_32) := esp
+sp-$(CONFIG_X86_64) := rsp
+
+# do binutils support CFI?
+cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
+# is .cfi_signal_frame supported too?
+cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
+
+LDFLAGS := -m elf_$(UTS_MACHINE)
+OBJCOPYFLAGS := -O binary -R .note -R .comment -S
+
+# Speed up the build
+KBUILD_CFLAGS += -pipe
+# Workaround for a gcc prerelease that unfortunately was shipped in a suse release
+KBUILD_CFLAGS += -Wno-sign-compare
+#
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+# prevent gcc from generating any FP code by mistake
+KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+
+###
+# Sub architecture support
+# fcore-y is linked before mcore-y files.
+
+# Default subarch .c files
+mcore-y  := arch/x86/mach-default/
+
+# Voyager subarch support
+mflags-$(CONFIG_X86_VOYAGER)   := -Iinclude/asm-x86/mach-voyager
+mcore-$(CONFIG_X86_VOYAGER)    := arch/x86/mach-voyager/
+
+# VISWS subarch support
+mflags-$(CONFIG_X86_VISWS)     := -Iinclude/asm-x86/mach-visws
+mcore-$(CONFIG_X86_VISWS)      := arch/x86/mach-visws/
+
+# NUMAQ subarch support
+mflags-$(CONFIG_X86_NUMAQ)     := -Iinclude/asm-x86/mach-numaq
+mcore-$(CONFIG_X86_NUMAQ)      := arch/x86/mach-default/
+
+# BIGSMP subarch support
+mflags-$(CONFIG_X86_BIGSMP)    := -Iinclude/asm-x86/mach-bigsmp
+mcore-$(CONFIG_X86_BIGSMP)     := arch/x86/mach-default/
+
+#Summit subarch support
+mflags-$(CONFIG_X86_SUMMIT)    := -Iinclude/asm-x86/mach-summit
+mcore-$(CONFIG_X86_SUMMIT)     := arch/x86/mach-default/
+
+# generic subarchitecture
+mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
+fcore-$(CONFIG_X86_GENERICARCH)        += arch/x86/mach-generic/
+mcore-$(CONFIG_X86_GENERICARCH)        := arch/x86/mach-default/
+
+
+# ES7000 subarch support
+mflags-$(CONFIG_X86_ES7000)    := -Iinclude/asm-x86/mach-es7000
+fcore-$(CONFIG_X86_ES7000)     := arch/x86/mach-es7000/
+mcore-$(CONFIG_X86_ES7000)     := arch/x86/mach-default/
+
+# RDC R-321x subarch support
+mflags-$(CONFIG_X86_RDC321X)   := -Iinclude/asm-x86/mach-rdc321x
+mcore-$(CONFIG_X86_RDC321X)    := arch/x86/mach-default
+core-$(CONFIG_X86_RDC321X)     += arch/x86/mach-rdc321x/
+
+# default subarch .h files
+mflags-y += -Iinclude/asm-x86/mach-default
+
+# 64 bit does not support subarch support - clear sub arch variables
+fcore-$(CONFIG_X86_64)  :=
+mcore-$(CONFIG_X86_64)  :=
+mflags-$(CONFIG_X86_64) :=
+
+KBUILD_CFLAGS += $(mflags-y)
+KBUILD_AFLAGS += $(mflags-y)
+
+###
+# Kernel objects
+
+head-y                := arch/x86/kernel/head_$(BITS).o
+head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o
+head-y                += arch/x86/kernel/init_task.o
+
+libs-y  += arch/x86/lib/
+
+# Sub architecture files that need linking first
+core-y += $(fcore-y)
+
+# Xen paravirtualization support
+core-$(CONFIG_XEN) += arch/x86/xen/
+
+# lguest paravirtualization support
+core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
+
+core-y += arch/x86/kernel/
+core-y += arch/x86/mm/
+
+# Remaining sub architecture files
+core-y += $(mcore-y)
+
+core-y += arch/x86/crypto/
+core-y += arch/x86/vdso/
+core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+
+# drivers-y are linked after core-y
+drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
+drivers-$(CONFIG_PCI)            += arch/x86/pci/
+
+# must be linked after kernel/
+drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
+
+ifeq ($(CONFIG_X86_32),y)
+drivers-$(CONFIG_PM) += arch/x86/power/
+drivers-$(CONFIG_FB) += arch/x86/video/
+endif
+
+####
+# boot loader support. Several targets are kept for legacy purposes
+
+boot := arch/x86/boot
+
+PHONY += zImage bzImage compressed zlilo bzlilo \
+         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+
+# Default kernel to build
+all: bzImage
+
+# KBUILD_IMAGE specify target image being built
+                    KBUILD_IMAGE := $(boot)/bzImage
+zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
+
+zImage bzImage: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+       $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
+       $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage
+
+compressed: zImage
+
+zlilo bzlilo: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
+
+zdisk bzdisk: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
+
+fdimage fdimage144 fdimage288 isoimage: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
+
+install: vdso_install
+       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+
+PHONY += vdso_install
+vdso_install:
+       $(Q)$(MAKE) $(build)=arch/x86/vdso $@
+
+archclean:
+       $(Q)rm -rf $(objtree)/arch/i386
+       $(Q)rm -rf $(objtree)/arch/x86_64
+       $(Q)$(MAKE) $(clean)=$(boot)
+
+define archhelp
+  echo  '* bzImage      - Compressed kernel image (arch/x86/boot/bzImage)'
+  echo  '  install      - Install kernel using'
+  echo  '                  (your) ~/bin/installkernel or'
+  echo  '                  (distribution) /sbin/installkernel or'
+  echo  '                  install to $$(INSTALL_PATH) and run lilo'
+  echo  '  fdimage      - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+  echo  '  fdimage144   - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+  echo  '  fdimage288   - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+  echo  '  isoimage     - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+  echo  '                  bzdisk/fdimage*/isoimage also accept:'
+  echo  '                  FDARGS="..."  arguments for the booted kernel'
+  echo  '                  FDINITRD=file initrd for the booted kernel'
+endef
+
+CLEAN_FILES += arch/x86/boot/fdimage \
+              arch/x86/boot/image.iso \
+              arch/x86/boot/mtools.conf
diff --git a/arch/x86/Makefile_32 b/arch/x86/Makefile_32
deleted file mode 100644 (file)
index 50394da..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-#
-# i386 Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" cleaning up for this architecture.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1994 by Linus Torvalds
-#
-# 19990713  Artur Skawina <skawina@geocities.com>
-#           Added '-march' and '-mpreferred-stack-boundary' support
-#
-# 20050320  Kianusch Sayah Karadji <kianusch@sk-tech.net>
-#           Added support for GEODE CPU
-
-# BITS is used as extension for files which are available in a 32 bit
-# and a 64 bit version to simplify shared Makefiles.
-# e.g.: obj-y += foo_$(BITS).o
-BITS := 32
-export BITS
-
-HAS_BIARCH      := $(call cc-option-yn, -m32)
-ifeq ($(HAS_BIARCH),y)
-AS              := $(AS) --32
-LD              := $(LD) -m elf_i386
-CC              := $(CC) -m32
-endif
-
-LDFLAGS                := -m elf_i386
-OBJCOPYFLAGS   := -O binary -R .note -R .comment -S
-ifdef CONFIG_RELOCATABLE
-LDFLAGS_vmlinux := --emit-relocs
-endif
-CHECKFLAGS     += -D__i386__
-
-KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return
-
-# prevent gcc from keeping the stack 16 byte aligned
-KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
-
-# CPU-specific tuning. Anything which can be shared with UML should go here.
-include $(srctree)/arch/x86/Makefile_32.cpu
-
-# temporary until string.h is fixed
-cflags-y += -ffreestanding
-
-# this works around some issues with generating unwind tables in older gccs
-# newer gccs do it by default
-cflags-y += -maccumulate-outgoing-args
-
-# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
-# a lot more stack due to the lack of sharing of stacklots:
-KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;)
-
-# do binutils support CFI?
-cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
-KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
-
-# is .cfi_signal_frame supported too?
-cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
-KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
-
-KBUILD_CFLAGS += $(cflags-y)
-
-# Default subarch .c files
-mcore-y  := arch/x86/mach-default
-
-# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER)   := -Iinclude/asm-x86/mach-voyager
-mcore-$(CONFIG_X86_VOYAGER)    := arch/x86/mach-voyager
-
-# VISWS subarch support
-mflags-$(CONFIG_X86_VISWS)     := -Iinclude/asm-x86/mach-visws
-mcore-$(CONFIG_X86_VISWS)      := arch/x86/mach-visws
-
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ)     := -Iinclude/asm-x86/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ)      := arch/x86/mach-default
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP)    := -Iinclude/asm-x86/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP)     := arch/x86/mach-default
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit
-mcore-$(CONFIG_X86_SUMMIT)  := arch/x86/mach-default
-
-# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-x86/mach-generic
-mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default
-core-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000)    := -Iinclude/asm-x86/mach-es7000
-mcore-$(CONFIG_X86_ES7000)     := arch/x86/mach-default
-core-$(CONFIG_X86_ES7000)      := arch/x86/mach-es7000/
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN)             += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST)    += arch/x86/lguest/
-
-# default subarch .h files
-mflags-y += -Iinclude/asm-x86/mach-default
-
-head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task.o
-
-libs-y                                         += arch/x86/lib/
-core-y                                 += arch/x86/kernel/ \
-                                          arch/x86/mm/ \
-                                          $(mcore-y)/ \
-                                          arch/x86/crypto/
-drivers-$(CONFIG_MATH_EMULATION)       += arch/x86/math-emu/
-drivers-$(CONFIG_PCI)                  += arch/x86/pci/
-# must be linked after kernel/
-drivers-$(CONFIG_OPROFILE)             += arch/x86/oprofile/
-drivers-$(CONFIG_PM)                   += arch/x86/power/
-drivers-$(CONFIG_FB)                    += arch/x86/video/
-
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
-
-boot := arch/x86/boot
-
-PHONY += zImage bzImage compressed zlilo bzlilo \
-         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
-
-all: bzImage
-
-# KBUILD_IMAGE specify target image being built
-                    KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage
-
-zImage bzImage: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
-       $(Q)mkdir -p $(objtree)/arch/i386/boot
-       $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/i386/boot/bzImage
-
-compressed: zImage
-
-zlilo bzlilo: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install:
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
-
-archclean:
-       $(Q)rm -rf $(objtree)/arch/i386/boot
-       $(Q)$(MAKE) $(clean)=arch/x86/boot
-
-define archhelp
-  echo  '* bzImage     - Compressed kernel image (arch/x86/boot/bzImage)'
-  echo  '  install     - Install kernel using'
-  echo  '                 (your) ~/bin/installkernel or'
-  echo  '                 (distribution) /sbin/installkernel or'
-  echo  '                 install to $$(INSTALL_PATH) and run lilo'
-  echo  '  bzdisk       - Create a boot floppy in /dev/fd0'
-  echo  '  fdimage      - Create a boot floppy image'
-  echo  '  isoimage     - Create a boot CD-ROM image'
-endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
-              arch/x86/boot/image.iso \
-              arch/x86/boot/mtools.conf
diff --git a/arch/x86/Makefile_64 b/arch/x86/Makefile_64
deleted file mode 100644 (file)
index a804860..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-#
-# x86_64 Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1994 by Linus Torvalds
-#
-# 19990713  Artur Skawina <skawina@geocities.com>
-#           Added '-march' and '-mpreferred-stack-boundary' support
-# 20000913  Pavel Machek <pavel@suse.cz>
-#          Converted for x86_64 architecture
-# 20010105  Andi Kleen, add IA32 compiler.
-#           ....and later removed it again....
-#
-# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
-
-# BITS is used as extension for files which are available in a 32 bit
-# and a 64 bit version to simplify shared Makefiles.
-# e.g.: obj-y += foo_$(BITS).o
-BITS := 64
-export BITS
-
-LDFLAGS                := -m elf_x86_64
-OBJCOPYFLAGS   := -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux :=
-CHECKFLAGS      += -D__x86_64__ -m64
-
-cflags-y       :=
-cflags-kernel-y        :=
-cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
-cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
-# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it
-# will eventually. Use -mtune=generic as fallback
-cflags-$(CONFIG_MCORE2) += \
-       $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
-cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
-
-cflags-y += -m64
-cflags-y += -mno-red-zone
-cflags-y += -mcmodel=kernel
-cflags-y += -pipe
-cflags-y += -Wno-sign-compare
-cflags-y += -fno-asynchronous-unwind-tables
-ifneq ($(CONFIG_DEBUG_INFO),y)
-# -fweb shrinks the kernel a bit, but the difference is very small
-# it also messes up debugging, so don't use it for now.
-#cflags-y += $(call cc-option,-fweb)
-endif
-# -funit-at-a-time shrinks the kernel .text considerably
-# unfortunately it makes reading oopses harder.
-cflags-y += $(call cc-option,-funit-at-a-time)
-# prevent gcc from generating any FP code by mistake
-cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
-# this works around some issues with generating unwind tables in older gccs
-# newer gccs do it by default
-cflags-y += -maccumulate-outgoing-args
-
-# does binutils support CFI?
-cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
-KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
-
-# is .cfi_signal_frame supported too?
-cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
-KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
-
-cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector )
-cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all )
-
-KBUILD_CFLAGS += $(cflags-y)
-CFLAGS_KERNEL += $(cflags-kernel-y)
-KBUILD_AFLAGS += -m64
-
-head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task.o
-
-libs-y                                         += arch/x86/lib/
-core-y                                 += arch/x86/kernel/ \
-                                          arch/x86/mm/ \
-                                          arch/x86/crypto/ \
-                                          arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION)          += arch/x86/ia32/
-drivers-$(CONFIG_PCI)                  += arch/x86/pci/
-drivers-$(CONFIG_OPROFILE)             += arch/x86/oprofile/
-
-boot := arch/x86/boot
-
-PHONY += bzImage bzlilo install archmrproper \
-        fdimage fdimage144 fdimage288 isoimage archclean
-
-#Default target when executing "make"
-all: bzImage
-
-BOOTIMAGE                     := arch/x86/boot/bzImage
-KBUILD_IMAGE                  := $(BOOTIMAGE)
-
-bzImage: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
-       $(Q)mkdir -p $(objtree)/arch/x86_64/boot
-       $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage
-
-bzlilo: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
-
-bzdisk: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
-
-install: vdso_install
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ 
-
-vdso_install:
-ifeq ($(CONFIG_IA32_EMULATION),y)
-       $(Q)$(MAKE) $(build)=arch/x86/ia32 $@
-endif
-       $(Q)$(MAKE) $(build)=arch/x86/vdso $@
-
-archclean:
-       $(Q)rm -rf $(objtree)/arch/x86_64/boot
-       $(Q)$(MAKE) $(clean)=$(boot)
-
-define archhelp
-  echo  '* bzImage     - Compressed kernel image (arch/x86/boot/bzImage)'
-  echo  '  install     - Install kernel using'
-  echo  '                 (your) ~/bin/installkernel or'
-  echo  '                 (distribution) /sbin/installkernel or'
-  echo  '                 install to $$(INSTALL_PATH) and run lilo'
-  echo  '  bzdisk       - Create a boot floppy in /dev/fd0'
-  echo  '  fdimage      - Create a boot floppy image'
-  echo  '  isoimage     - Create a boot CD-ROM image'
-endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
-              arch/x86/boot/image.iso \
-              arch/x86/boot/mtools.conf
-
-
index 7a3116ccf3878d6e3212f8d4e346a3ceef912e8b..349b81a39c40b586275a5c8695f058ac80bc162f 100644 (file)
@@ -28,9 +28,11 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
 targets                := vmlinux.bin setup.bin setup.elf zImage bzImage
 subdir-        := compressed
 
-setup-y                += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y                += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
 setup-y                += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y                += printf.o string.o tty.o video.o version.o voyager.o
+setup-y                += printf.o string.o tty.o video.o version.o
+setup-$(CONFIG_X86_APM_BOOT) += apm.o
+setup-$(CONFIG_X86_VOYAGER) += voyager.o
 
 # The link order of the video-*.o modules can matter.  In particular,
 # video-vga.o *must* be listed first, followed by video-vesa.o.
@@ -49,10 +51,7 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE)
 
 # How to compile the 16-bit code.  Note we always compile for -march=i386,
 # that way we can complain to the user if the CPU is insufficient.
-cflags-$(CONFIG_X86_32) :=
-cflags-$(CONFIG_X86_64) := -m32
 KBUILD_CFLAGS  := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
-                  $(cflags-y) \
                   -Wall -Wstrict-prototypes \
                   -march=i386 -mregparm=3 \
                   -include $(srctree)/$(src)/code16gcc.h \
@@ -62,6 +61,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
                        $(call cc-option, -fno-unit-at-a-time)) \
                   $(call cc-option, -fno-stack-protector) \
                   $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS +=   $(call cc-option,-m32)
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 
 $(obj)/zImage:  IMAGE_OFFSET := 0x1000
index eab50c55a3a565f626443fffe015fa5ad9c38b04..c117c7fb859c12bb13685f9fd213dd3e8facec2e 100644 (file)
@@ -19,8 +19,6 @@
 
 #include "boot.h"
 
-#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
-
 int query_apm_bios(void)
 {
        u16 ax, bx, cx, dx, di;
@@ -95,4 +93,3 @@ int query_apm_bios(void)
        return 0;
 }
 
-#endif
index d2b5adf46512f20feb7fc27a14d39bc37c10e3e4..7822a4983da2376a8b4297cf3bad80a3519c717e 100644 (file)
@@ -109,7 +109,7 @@ typedef unsigned int addr_t;
 static inline u8 rdfs8(addr_t addr)
 {
        u8 v;
-       asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
+       asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
        return v;
 }
 static inline u16 rdfs16(addr_t addr)
@@ -127,21 +127,21 @@ static inline u32 rdfs32(addr_t addr)
 
 static inline void wrfs8(u8 v, addr_t addr)
 {
-       asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v));
+       asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
 }
 static inline void wrfs16(u16 v, addr_t addr)
 {
-       asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v));
+       asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
 }
 static inline void wrfs32(u32 v, addr_t addr)
 {
-       asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v));
+       asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
 }
 
 static inline u8 rdgs8(addr_t addr)
 {
        u8 v;
-       asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr));
+       asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
        return v;
 }
 static inline u16 rdgs16(addr_t addr)
@@ -159,15 +159,15 @@ static inline u32 rdgs32(addr_t addr)
 
 static inline void wrgs8(u8 v, addr_t addr)
 {
-       asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v));
+       asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
 }
 static inline void wrgs16(u16 v, addr_t addr)
 {
-       asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v));
+       asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
 }
 static inline void wrgs32(u32 v, addr_t addr)
 {
-       asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v));
+       asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
 }
 
 /* Note: these only return true/false, not a signed return value! */
@@ -241,6 +241,7 @@ int query_apm_bios(void);
 
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
+int cmdline_find_option_bool(const char *option);
 
 /* cpu.c, cpucheck.c */
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
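The constraint changes above ("=r" becomes "=q" for the byte loads, "r" becomes "qi"/"ri" for the stores) matter because movb needs one of the byte-addressable registers (%al/%bl/%cl/%dl); with plain "r" the compiler may pick %esi or %edi, which have no 8-bit form, and the 16-bit setup code then fails to assemble, while "i" additionally lets the stores take an immediate. A minimal user-space sketch of the same constraint, purely illustrative and not part of the patch:

    /* Illustrative only: shows the "q" (byte-register) constraint in
     * ordinary user space; compile with gcc -m32 -O2. */
    #include <stdio.h>

    static unsigned char read_byte(const unsigned char *p)
    {
            unsigned char v;
            /* "=q" restricts v to %al/%bl/%cl/%dl, so "movb" always gets
             * a valid byte register; "=r" could hand us %esi or %edi. */
            asm volatile("movb %1,%0" : "=q" (v) : "m" (*p));
            return v;
    }

    int main(void)
    {
            unsigned char buf[1] = { 0x5a };
            printf("%#x\n", read_byte(buf));
            return 0;
    }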
index 34bb778c4357b9492d020ff234e39eb4a09fa6a1..680408a0f46317c898d5e73a620b499e9a9cbdb2 100644 (file)
@@ -95,3 +95,68 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize)
 
        return len;
 }
+
+/*
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * Returns the position of that option (starts counting with 1)
+ * or 0 if not found
+ */
+int cmdline_find_option_bool(const char *option)
+{
+       u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr;
+       addr_t cptr;
+       char c;
+       int pos = 0, wstart = 0;
+       const char *opptr = NULL;
+       enum {
+               st_wordstart,   /* Start of word/after whitespace */
+               st_wordcmp,     /* Comparing this word */
+               st_wordskip,    /* Miscompare, skip */
+       } state = st_wordstart;
+
+       if (!cmdline_ptr || cmdline_ptr >= 0x100000)
+               return -1;      /* No command line, or inaccessible */
+
+       cptr = cmdline_ptr & 0xf;
+       set_fs(cmdline_ptr >> 4);
+
+       while (cptr < 0x10000) {
+               c = rdfs8(cptr++);
+               pos++;
+
+               switch (state) {
+               case st_wordstart:
+                       if (!c)
+                               return 0;
+                       else if (myisspace(c))
+                               break;
+
+                       state = st_wordcmp;
+                       opptr = option;
+                       wstart = pos;
+                       /* fall through */
+
+               case st_wordcmp:
+                       if (!*opptr)
+                               if (!c || myisspace(c))
+                                       return wstart;
+                               else
+                                       state = st_wordskip;
+                       else if (!c)
+                               return 0;
+                       else if (c != *opptr++)
+                               state = st_wordskip;
+                       break;
+
+               case st_wordskip:
+                       if (!c)
+                               return 0;
+                       else if (myisspace(c))
+                               state = st_wordstart;
+                       break;
+               }
+       }
+
+       return 0;       /* Buffer overrun */
+}
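The scanner added above is a three-state word matcher over the fs-segmented real-mode command line. A hedged user-space model (a plain string instead of rdfs8()/set_fs(), isspace() instead of myisspace(), and without the "no command line" -1 case) matches the same way and shows why the return value is the 1-based position of the matched word:

    /* User-space model of cmdline_find_option_bool(); illustrative only. */
    #include <stdio.h>
    #include <ctype.h>

    static int find_option_bool(const char *cmdline, const char *option)
    {
            enum { st_wordstart, st_wordcmp, st_wordskip } state = st_wordstart;
            const char *opptr = NULL;
            int pos = 0, wstart = 0;

            for (;;) {
                    char c = cmdline[pos++];

                    switch (state) {
                    case st_wordstart:
                            if (!c)
                                    return 0;
                            if (isspace((unsigned char)c))
                                    break;
                            state = st_wordcmp;
                            opptr = option;
                            wstart = pos;
                            /* fall through */
                    case st_wordcmp:
                            if (!*opptr) {
                                    if (!c || isspace((unsigned char)c))
                                            return wstart;
                                    state = st_wordskip;
                            } else if (!c) {
                                    return 0;
                            } else if (c != *opptr++) {
                                    state = st_wordskip;
                            }
                            break;
                    case st_wordskip:
                            if (!c)
                                    return 0;
                            if (isspace((unsigned char)c))
                                    state = st_wordstart;
                            break;
                    }
            }
    }

    int main(void)
    {
            /* "quiet" starts at the 15th character, so 15 is printed. */
            printf("%d\n", find_option_bool("root=/dev/sda quiet", "quiet"));
            return 0;
    }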
index 52c1db85452019b2447ece2f20fba818e4128e40..fe24ceabd9095b9d1c0c4c0aac50672e2cc38023 100644 (file)
@@ -1,5 +1,63 @@
+#
+# linux/arch/x86/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+
+targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o
+
+KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
+KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
+cflags-$(CONFIG_X86_64) := -mcmodel=small
+KBUILD_CFLAGS += $(cflags-y)
+KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+
+KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+
+LDFLAGS := -m elf_$(UTS_MACHINE)
+LDFLAGS_vmlinux := -T
+
+$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+       $(call if_changed,ld)
+       @:
+
+$(obj)/vmlinux.bin: vmlinux FORCE
+       $(call if_changed,objcopy)
+
+
 ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/boot/compressed/Makefile_32
+targets += vmlinux.bin.all vmlinux.relocs
+hostprogs-y := relocs
+
+quiet_cmd_relocs = RELOCS  $@
+      cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
+$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
+       $(call if_changed,relocs)
+
+vmlinux.bin.all-y := $(obj)/vmlinux.bin
+vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
+quiet_cmd_relocbin = BUILD   $@
+      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
+$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
+       $(call if_changed,relocbin)
+
+ifdef CONFIG_RELOCATABLE
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
+       $(call if_changed,gzip)
 else
-include ${srctree}/arch/x86/boot/compressed/Makefile_64
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+       $(call if_changed,gzip)
 endif
+LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+
+else
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+       $(call if_changed,gzip)
+
+LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
+endif
+
+
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
+       $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32
deleted file mode 100644 (file)
index e43ff7c..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#
-# linux/arch/x86/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-targets                := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \
-                       vmlinux.bin.all vmlinux.relocs
-EXTRA_AFLAGS   := -traditional
-
-LDFLAGS_vmlinux := -T
-hostprogs-y    := relocs
-
-KBUILD_CFLAGS  := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
-          -fno-strict-aliasing -fPIC \
-          $(call cc-option,-ffreestanding) \
-          $(call cc-option,-fno-stack-protector)
-LDFLAGS := -m elf_i386
-
-$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE
-       $(call if_changed,ld)
-       @:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-       $(call if_changed,objcopy)
-
-quiet_cmd_relocs = RELOCS  $@
-      cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
-       $(call if_changed,relocs)
-
-vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD   $@
-      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
-       $(call if_changed,relocbin)
-
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,gzip)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,gzip)
-endif
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
-
-$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE
-       $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64
deleted file mode 100644 (file)
index 7801e8d..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#
-# linux/arch/x86/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-targets                := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o
-
-KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2  \
-         -fno-strict-aliasing -fPIC -mcmodel=small \
-          $(call cc-option, -ffreestanding) \
-          $(call cc-option, -fno-stack-protector)
-KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-LDFLAGS := -m elf_x86_64
-
-LDFLAGS_vmlinux := -T
-$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE
-       $(call if_changed,ld)
-       @:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-       $(call if_changed,objcopy)
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,gzip)
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-
-$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE
-       $(call if_changed,ld)
similarity index 86%
rename from arch/x86/boot/compressed/misc_32.c
rename to arch/x86/boot/compressed/misc.c
index b74d60d1b2fa1329f6f28c434fc47445cb39ccb4..8182e32c1b42c05372a8dabf7d8232faf5b3da76 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * misc.c
- * 
- * This is a collection of several routines from gzip-1.0.3 
+ *
+ * This is a collection of several routines from gzip-1.0.3
  * adapted for Linux.
  *
  * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
@@ -9,9 +9,18 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+/*
+ * We have to be careful here because no indirections are allowed, and
+ * paravirt_ops is a kind of one. As this code only runs on bare metal
+ * anyway, we just keep that from happening.
+ */
 #undef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_64
+#define _LINUX_STRING_H_ 1
+#define __LINUX_BITMAP_H 1
+#endif
+
 #include <linux/linkage.h>
-#include <linux/vmalloc.h>
 #include <linux/screen_info.h>
 #include <asm/io.h>
 #include <asm/page.h>
@@ -186,10 +195,20 @@ static void *memcpy(void *dest, const void *src, unsigned n);
 
 static void putstr(const char *);
 
-static unsigned long free_mem_ptr;
-static unsigned long free_mem_end_ptr;
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
+static memptr free_mem_ptr;
+static memptr free_mem_end_ptr;
 
+#ifdef CONFIG_X86_64
+#define HEAP_SIZE             0x7000
+#else
 #define HEAP_SIZE             0x4000
+#endif
 
 static char *vidmem = (char *)0xb8000;
 static int vidport;
@@ -230,7 +249,7 @@ static void gzip_mark(void **ptr)
 
 static void gzip_release(void **ptr)
 {
-       free_mem_ptr = (unsigned long) *ptr;
+       free_mem_ptr = (memptr) *ptr;
 }
  
 static void scroll(void)
@@ -247,8 +266,10 @@ static void putstr(const char *s)
        int x,y,pos;
        char c;
 
+#ifdef CONFIG_X86_32
        if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0)
                return;
+#endif
 
        x = RM_SCREEN_INFO.orig_x;
        y = RM_SCREEN_INFO.orig_y;
@@ -261,7 +282,7 @@ static void putstr(const char *s)
                                y--;
                        }
                } else {
-                       vidmem [ ( x + cols * y ) * 2 ] = c;
+                       vidmem [(x + cols * y) * 2] = c;
                        if ( ++x >= cols ) {
                                x = 0;
                                if ( ++y >= lines ) {
@@ -276,16 +297,16 @@ static void putstr(const char *s)
        RM_SCREEN_INFO.orig_y = y;
 
        pos = (x + cols * y) * 2;       /* Update cursor position */
-       outb_p(14, vidport);
-       outb_p(0xff & (pos >> 9), vidport+1);
-       outb_p(15, vidport);
-       outb_p(0xff & (pos >> 1), vidport+1);
+       outb(14, vidport);
+       outb(0xff & (pos >> 9), vidport+1);
+       outb(15, vidport);
+       outb(0xff & (pos >> 1), vidport+1);
 }
 
 static void* memset(void* s, int c, unsigned n)
 {
        int i;
-       char *ss = (char*)s;
+       char *ss = s;
 
        for (i=0;i<n;i++) ss[i] = c;
        return s;
@@ -294,7 +315,8 @@ static void* memset(void* s, int c, unsigned n)
 static void* memcpy(void* dest, const void* src, unsigned n)
 {
        int i;
-       char *d = (char *)dest, *s = (char *)src;
+       const char *s = src;
+       char *d = dest;
 
        for (i=0;i<n;i++) d[i] = s[i];
        return dest;
@@ -339,11 +361,13 @@ static void error(char *x)
        putstr(x);
        putstr("\n\n -- System halted");
 
-       while(1);       /* Halt */
+       while (1)
+               asm("hlt");
 }
 
-asmlinkage void decompress_kernel(void *rmode, unsigned long end,
-                       uch *input_data, unsigned long input_len, uch *output)
+asmlinkage void decompress_kernel(void *rmode, memptr heap,
+                                 uch *input_data, unsigned long input_len,
+                                 uch *output)
 {
        real_mode = rmode;
 
@@ -358,25 +382,32 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end,
        lines = RM_SCREEN_INFO.orig_video_lines;
        cols = RM_SCREEN_INFO.orig_video_cols;
 
-       window = output;        /* Output buffer (Normally at 1M) */
-       free_mem_ptr     = end; /* Heap  */
-       free_mem_end_ptr = end + HEAP_SIZE;
-       inbuf  = input_data;    /* Input buffer */
+       window = output;                /* Output buffer (Normally at 1M) */
+       free_mem_ptr     = heap;        /* Heap */
+       free_mem_end_ptr = heap + HEAP_SIZE;
+       inbuf  = input_data;            /* Input buffer */
        insize = input_len;
        inptr  = 0;
 
+#ifdef CONFIG_X86_64
+       if ((ulg)output & (__KERNEL_ALIGN - 1))
+               error("Destination address not 2M aligned");
+       if ((ulg)output >= 0xffffffffffUL)
+               error("Destination address too large");
+#else
        if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
                error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
-       if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff))
+       if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
                error("Destination address too large");
 #ifndef CONFIG_RELOCATABLE
        if ((u32)output != LOAD_PHYSICAL_ADDR)
                error("Wrong destination address");
+#endif
 #endif
 
        makecrc();
-       putstr("Uncompressing Linux... ");
+       putstr("\nDecompressing Linux... ");
        gunzip();
-       putstr("Ok, booting the kernel.\n");
+       putstr("done.\nBooting the kernel.\n");
        return;
 }
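The new 64-bit sanity checks above test the load address against __KERNEL_ALIGN; the "not 2M aligned" message implies a 2 MB alignment, so the test reduces to a simple mask check. A hedged user-space sketch, assuming __KERNEL_ALIGN is 0x200000:

    /* Illustrative mask test, assuming __KERNEL_ALIGN is 2 MB (0x200000)
     * as the error string above suggests. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long align = 0x200000UL;   /* 2 MB */
            unsigned long ok    = 0x1000000UL;  /* 16 MB load address */
            unsigned long bad   = 0x1100000UL;  /* 17 MB load address */

            printf("%#lx -> %s\n", ok,  (ok  & (align - 1)) ? "error" : "ok");
            printf("%#lx -> %s\n", bad, (bad & (align - 1)) ? "error" : "ok");
            return 0;
    }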
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
deleted file mode 100644 (file)
index 6ea015a..0000000
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * misc.c
- * 
- * This is a collection of several routines from gzip-1.0.3 
- * adapted for Linux.
- *
- * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
- * puts by Nick Holloway 1993, better puts by Martin Mares 1995
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
- */
-
-#define _LINUX_STRING_H_ 1
-#define __LINUX_BITMAP_H 1
-
-#include <linux/linkage.h>
-#include <linux/screen_info.h>
-#include <asm/io.h>
-#include <asm/page.h>
-
-/* WARNING!!
- * This code is compiled with -fPIC and it is relocated dynamically
- * at run time, but no relocation processing is performed.
- * This means that it is not safe to place pointers in static structures.
- */
-
-/*
- * Getting to provable safe in place decompression is hard.
- * Worst case behaviours need to be analyzed.
- * Background information:
- *
- * The file layout is:
- *    magic[2]
- *    method[1]
- *    flags[1]
- *    timestamp[4]
- *    extraflags[1]
- *    os[1]
- *    compressed data blocks[N]
- *    crc[4] orig_len[4]
- *
- * resulting in 18 bytes of non-compressed data overhead.
- *
- * Files divided into blocks
- * 1 bit (last block flag)
- * 2 bits (block type)
- *
- * 1 block occurs every 32K - 1 bytes or when 50% compression has been achieved.
- * The smallest block type encoding is always used.
- *
- * stored:
- *    32 bits length in bytes.
- *
- * fixed:
- *    magic fixed tree.
- *    symbols.
- *
- * dynamic:
- *    dynamic tree encoding.
- *    symbols.
- *
- *
- * The buffer for decompression in place is the length of the
- * uncompressed data, plus a small amount extra to keep the algorithm safe.
- * The compressed data is placed at the end of the buffer.  The output
- * pointer is placed at the start of the buffer and the input pointer
- * is placed where the compressed data starts.  Problems will occur
- * when the output pointer overruns the input pointer.
- *
- * The output pointer can only overrun the input pointer if the input
- * pointer is moving faster than the output pointer.  A condition only
- * triggered by data whose compressed form is larger than the uncompressed
- * form.
- *
- * The worst case at the block level is a growth of the compressed data
- * of 5 bytes per 32767 bytes.
- *
- * The worst case internal to a compressed block is very hard to figure.
- * The worst case can at least be bounded by having one bit that represents
- * 32764 bytes and then all of the rest of the bytes representing the very
- * very last byte.
- *
- * All of which is enough to compute an amount of extra data that is required
- * to be safe.  To avoid problems at the block level allocating 5 extra bytes
- * per 32767 bytes of data is sufficient.  To avoid problems internal to a block
- * adding an extra 32767 bytes (the worst case uncompressed block size) is
- * sufficient, to ensure that in the worst case the decompressed data for
- * block will stop the byte before the compressed data for a block begins.
- * To avoid problems with the compressed data's meta information an extra 18
- * bytes are needed.  Leading to the formula:
- *
- * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
- *
- * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
- * Adding 32768 instead of 32767 just makes for round numbers.
- * Adding the decompressor_size is necessary as it must live after all
- * of the data as well.  Last I measured the decompressor is about 14K.
- * 10K of actual data and 4K of bss.
- *
- */
-
-/*
- * gzip declarations
- */
-
-#define OF(args)  args
-#define STATIC static
-
-#undef memset
-#undef memcpy
-#define memzero(s, n)     memset ((s), 0, (n))
-
-typedef unsigned char  uch;
-typedef unsigned short ush;
-typedef unsigned long  ulg;
-
-#define WSIZE 0x80000000       /* Window size must be at least 32k,
-                                * and a power of two
-                                * We don't actually have a window just
-                                * a huge output buffer so I report
-                                * a 2G windows size, as that should
-                                * always be larger than our output buffer.
-                                */
-
-static uch *inbuf;     /* input buffer */
-static uch *window;    /* Sliding window buffer, (and final output buffer) */
-
-static unsigned insize;  /* valid bytes in inbuf */
-static unsigned inptr;   /* index of next byte to be processed in inbuf */
-static unsigned outcnt;  /* bytes in output buffer */
-
-/* gzip flag byte */
-#define ASCII_FLAG   0x01 /* bit 0 set: file probably ASCII text */
-#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
-#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
-#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
-#define COMMENT      0x10 /* bit 4 set: file comment present */
-#define ENCRYPTED    0x20 /* bit 5 set: file is encrypted */
-#define RESERVED     0xC0 /* bit 6,7:   reserved */
-
-#define get_byte()  (inptr < insize ? inbuf[inptr++] : fill_inbuf())
-               
-/* Diagnostic functions */
-#ifdef DEBUG
-#  define Assert(cond,msg) {if(!(cond)) error(msg);}
-#  define Trace(x) fprintf x
-#  define Tracev(x) {if (verbose) fprintf x ;}
-#  define Tracevv(x) {if (verbose>1) fprintf x ;}
-#  define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
-#  define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
-#else
-#  define Assert(cond,msg)
-#  define Trace(x)
-#  define Tracev(x)
-#  define Tracevv(x)
-#  define Tracec(c,x)
-#  define Tracecv(c,x)
-#endif
-
-static int  fill_inbuf(void);
-static void flush_window(void);
-static void error(char *m);
-static void gzip_mark(void **);
-static void gzip_release(void **);
-  
-/*
- * This is set up by the setup-routine at boot-time
- */
-static unsigned char *real_mode; /* Pointer to real-mode data */
-
-#define RM_EXT_MEM_K   (*(unsigned short *)(real_mode + 0x2))
-#ifndef STANDARD_MEMORY_BIOS_CALL
-#define RM_ALT_MEM_K   (*(unsigned long *)(real_mode + 0x1e0))
-#endif
-#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
-
-extern unsigned char input_data[];
-extern int input_len;
-
-static long bytes_out = 0;
-
-static void *malloc(int size);
-static void free(void *where);
-
-static void *memset(void *s, int c, unsigned n);
-static void *memcpy(void *dest, const void *src, unsigned n);
-
-static void putstr(const char *);
-
-static long free_mem_ptr;
-static long free_mem_end_ptr;
-
-#define HEAP_SIZE             0x7000
-
-static char *vidmem = (char *)0xb8000;
-static int vidport;
-static int lines, cols;
-
-#include "../../../../lib/inflate.c"
-
-static void *malloc(int size)
-{
-       void *p;
-
-       if (size <0) error("Malloc error");
-       if (free_mem_ptr <= 0) error("Memory error");
-
-       free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
-
-       p = (void *)free_mem_ptr;
-       free_mem_ptr += size;
-
-       if (free_mem_ptr >= free_mem_end_ptr)
-               error("Out of memory");
-
-       return p;
-}
-
-static void free(void *where)
-{      /* Don't care */
-}
-
-static void gzip_mark(void **ptr)
-{
-       *ptr = (void *) free_mem_ptr;
-}
-
-static void gzip_release(void **ptr)
-{
-       free_mem_ptr = (long) *ptr;
-}
-static void scroll(void)
-{
-       int i;
-
-       memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
-       for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
-               vidmem[i] = ' ';
-}
-
-static void putstr(const char *s)
-{
-       int x,y,pos;
-       char c;
-
-       x = RM_SCREEN_INFO.orig_x;
-       y = RM_SCREEN_INFO.orig_y;
-
-       while ( ( c = *s++ ) != '\0' ) {
-               if ( c == '\n' ) {
-                       x = 0;
-                       if ( ++y >= lines ) {
-                               scroll();
-                               y--;
-                       }
-               } else {
-                       vidmem [ ( x + cols * y ) * 2 ] = c; 
-                       if ( ++x >= cols ) {
-                               x = 0;
-                               if ( ++y >= lines ) {
-                                       scroll();
-                                       y--;
-                               }
-                       }
-               }
-       }
-
-       RM_SCREEN_INFO.orig_x = x;
-       RM_SCREEN_INFO.orig_y = y;
-
-       pos = (x + cols * y) * 2;       /* Update cursor position */
-       outb_p(14, vidport);
-       outb_p(0xff & (pos >> 9), vidport+1);
-       outb_p(15, vidport);
-       outb_p(0xff & (pos >> 1), vidport+1);
-}
-
-static void* memset(void* s, int c, unsigned n)
-{
-       int i;
-       char *ss = (char*)s;
-
-       for (i=0;i<n;i++) ss[i] = c;
-       return s;
-}
-
-static void* memcpy(void* dest, const void* src, unsigned n)
-{
-       int i;
-       char *d = (char *)dest, *s = (char *)src;
-
-       for (i=0;i<n;i++) d[i] = s[i];
-       return dest;
-}
-
-/* ===========================================================================
- * Fill the input buffer. This is called only when the buffer is empty
- * and at least one byte is really needed.
- */
-static int fill_inbuf(void)
-{
-       error("ran out of input data");
-       return 0;
-}
-
-/* ===========================================================================
- * Write the output window window[0..outcnt-1] and update crc and bytes_out.
- * (Used for the decompressed data only.)
- */
-static void flush_window(void)
-{
-       /* With my window equal to my output buffer
-        * I only need to compute the crc here.
-        */
-       ulg c = crc;         /* temporary variable */
-       unsigned n;
-       uch *in, ch;
-
-       in = window;
-       for (n = 0; n < outcnt; n++) {
-               ch = *in++;
-               c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
-       }
-       crc = c;
-       bytes_out += (ulg)outcnt;
-       outcnt = 0;
-}
-
-static void error(char *x)
-{
-       putstr("\n\n");
-       putstr(x);
-       putstr("\n\n -- System halted");
-
-       while(1);       /* Halt */
-}
-
-asmlinkage void decompress_kernel(void *rmode, unsigned long heap,
-       uch *input_data, unsigned long input_len, uch *output)
-{
-       real_mode = rmode;
-
-       if (RM_SCREEN_INFO.orig_video_mode == 7) {
-               vidmem = (char *) 0xb0000;
-               vidport = 0x3b4;
-       } else {
-               vidmem = (char *) 0xb8000;
-               vidport = 0x3d4;
-       }
-
-       lines = RM_SCREEN_INFO.orig_video_lines;
-       cols = RM_SCREEN_INFO.orig_video_cols;
-
-       window = output;                /* Output buffer (Normally at 1M) */
-       free_mem_ptr     = heap;        /* Heap  */
-       free_mem_end_ptr = heap + HEAP_SIZE;
-       inbuf  = input_data;            /* Input buffer */
-       insize = input_len;
-       inptr  = 0;
-
-       if ((ulg)output & (__KERNEL_ALIGN - 1))
-               error("Destination address not 2M aligned");
-       if ((ulg)output >= 0xffffffffffUL)
-               error("Destination address too large");
-
-       makecrc();
-       putstr(".\nDecompressing Linux...");
-       gunzip();
-       putstr("done.\nBooting the kernel.\n");
-       return;
-}
index 7a0d00b2cf28303f6b1471767bb15423eaada7cb..d01ea42187e6aa8ddcd1218f229e74e6dda13148 100644 (file)
@@ -27,11 +27,6 @@ static unsigned long *relocs;
  * absolute relocations present w.r.t these symbols.
  */
 static const char* safe_abs_relocs[] = {
-               "__kernel_vsyscall",
-               "__kernel_rt_sigreturn",
-               "__kernel_sigreturn",
-               "SYSENTER_RETURN",
-               "VDSO_NOTE_MASK",
                "xen_irq_disable_direct_reloc",
                "xen_save_fl_direct_reloc",
 };
@@ -45,6 +40,8 @@ static int is_safe_abs_reloc(const char* sym_name)
                        /* Match found */
                        return 1;
        }
+       if (strncmp(sym_name, "VDSO", 4) == 0)
+               return 1;
        if (strncmp(sym_name, "__crc_", 6) == 0)
                return 1;
        return 0;
similarity index 84%
rename from arch/x86/boot/compressed/vmlinux_64.scr
rename to arch/x86/boot/compressed/vmlinux.scr
index bd1429ce193e903b1c1127edcff1b32371dfabfe..f02382ae5c48b1cd319a4cb1e755fa060f03cfdc 100644 (file)
@@ -1,6 +1,6 @@
 SECTIONS
 {
-  .text.compressed : {
+  .rodata.compressed : {
        input_len = .;
        LONG(input_data_end - input_data) input_data = .;
        *(.data)
index cc4854f6c6c1d722fef7d83b4261f9b0e21bcc4a..bb3c48379c40f858f6ce09d5be215ad192524ed4 100644 (file)
@@ -3,17 +3,17 @@ OUTPUT_ARCH(i386)
 ENTRY(startup_32)
 SECTIONS
 {
-        /* Be careful parts of head.S assume startup_32 is at
-         * address 0.
+       /* Be careful parts of head_32.S assume startup_32 is at
+        * address 0.
         */
-       . =  0  ;
+       . = 0;
        .text.head : {
                _head = . ;
                *(.text.head)
                _ehead = . ;
        }
-       .data.compressed : {
-               *(.data.compressed)
+       .rodata.compressed : {
+               *(.rodata.compressed)
        }
        .text : {
                _text = .;      /* Text */
diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr
deleted file mode 100644 (file)
index 707a88f..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
-  .data.compressed : {
-       input_len = .;
-       LONG(input_data_end - input_data) input_data = .; 
-       *(.data) 
-       output_len = . - 4;
-       input_data_end = .; 
-       }
-}
index 94c13e557fb4f057a620ef319367d623a08e9f88..f6e5b445f45734126152819bf0dcb8e9de7acbfb 100644 (file)
@@ -3,15 +3,19 @@ OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
 SECTIONS
 {
-       /* Be careful parts of head.S assume startup_32 is at
-        * address 0.
+       /* Be careful parts of head_64.S assume startup_64 is at
+        * address 0.
         */
        . = 0;
-       .text : {
+       .text.head : {
                _head = . ;
                *(.text.head)
                _ehead = . ;
-               *(.text.compressed)
+       }
+       .rodata.compressed : {
+               *(.rodata.compressed)
+       }
+       .text : {
                _text = .;      /* Text */
                *(.text)
                *(.text.*)
index bd138e442ec25bfd08b5dec3a166d1f5d4e6a48e..8721dc46a0b618336b9908e0d855611f7beb6175 100644 (file)
@@ -129,6 +129,7 @@ void query_edd(void)
        char eddarg[8];
        int do_mbr = 1;
        int do_edd = 1;
+       int be_quiet;
        int devno;
        struct edd_info ei, *edp;
        u32 *mbrptr;
@@ -140,12 +141,21 @@ void query_edd(void)
                        do_edd = 0;
        }
 
+       be_quiet = cmdline_find_option_bool("quiet");
+
        edp    = boot_params.eddbuf;
        mbrptr = boot_params.edd_mbr_sig_buffer;
 
        if (!do_edd)
                return;
 
+       /* Bugs in on-board or add-on card BIOSes may hang the EDD probe,
+        * so give a hint if this happens.
+        */
+
+       if (!be_quiet)
+               printf("Probing EDD (edd=off to disable)... ");
+
        for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
                /*
                 * Scan the BIOS-supported hard disks and query EDD
@@ -162,6 +172,9 @@ void query_edd(void)
                if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
                        boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
        }
+
+       if (!be_quiet)
+               printf("ok\n");
 }
 
 #endif
index 4cc5b0411db58dd8201081702f4b93822a72101d..64ad9016585afd26fec078bc7b3a14ff69d699fd 100644 (file)
@@ -195,10 +195,13 @@ cmd_line_ptr:     .long   0               # (Header version 0x0202 or later)
                                        # can be located anywhere in
                                        # low memory 0x10000 or higher.
 
-ramdisk_max:   .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff
+ramdisk_max:   .long 0x7fffffff
                                        # (Header version 0x0203 or later)
                                        # The highest safe address for
                                        # the contents of an initrd
+                                       # The current kernel allows up to 4 GB,
+                                       # but leave it at 2 GB to avoid
+                                       # possible bootloader bugs.
 
 kernel_alignment:  .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
                                                #required for protected mode
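The ramdisk_max change above replaces a __PAGE_OFFSET-dependent expression with a flat 2 GB - 1 cap; with the common 32-bit __PAGE_OFFSET of 0xC0000000 the old expression worked out to just under 512 MB. A hedged arithmetic check (the 0xC0000000 value is an assumption, not stated in the patch):

    /* Arithmetic check of the old vs. new ramdisk_max, assuming a
     * 32-bit __PAGE_OFFSET of 0xC0000000. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int page_offset = 0xC0000000u;
            unsigned int old_max = (-page_offset - (512u << 20) - 1) & 0x7fffffff;
            unsigned int new_max = 0x7fffffff;

            printf("old ramdisk_max = %#x (%u MB)\n", old_max, old_max >> 20);
            printf("new ramdisk_max = %#x (%u MB)\n", new_max, new_max >> 20);
            return 0;
    }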
index 1f95750ede28461c41d6c7247723254edb06926e..7828da5cfd07475376c7d4d3fc95282334b60a3d 100644 (file)
@@ -100,20 +100,32 @@ static void set_bios_mode(void)
 #endif
 }
 
-void main(void)
+static void init_heap(void)
 {
-       /* First, copy the boot header into the "zeropage" */
-       copy_boot_params();
+       char *stack_end;
 
-       /* End of heap check */
        if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
-               heap_end = (char *)(boot_params.hdr.heap_end_ptr
-                                   +0x200-STACK_SIZE);
+               asm("leal %P1(%%esp),%0"
+                   : "=r" (stack_end) : "i" (-STACK_SIZE));
+
+               heap_end = (char *)
+                       ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+               if (heap_end > stack_end)
+                       heap_end = stack_end;
        } else {
                /* Boot protocol 2.00 only, no heap available */
                puts("WARNING: Ancient bootloader, some functionality "
                     "may be limited!\n");
        }
+}
+
+void main(void)
+{
+       /* First, copy the boot header into the "zeropage" */
+       copy_boot_params();
+
+       /* End of heap check */
+       init_heap();
 
        /* Make sure we have all the proper CPU support */
        if (validate_cpu()) {
@@ -131,9 +143,6 @@ void main(void)
        /* Set keyboard repeat rate (why?) */
        keyboard_set_repeat();
 
-       /* Set the video mode */
-       set_video();
-
        /* Query MCA information */
        query_mca();
 
@@ -154,6 +163,10 @@ void main(void)
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
        query_edd();
 #endif
+
+       /* Set the video mode */
+       set_video();
+
        /* Do the last things and invoke protected mode */
        go_to_protected_mode();
 }
index 09fb342cc62e1965a4edf12fa73e19589d6b7ab0..1a0f936c160b0f1d250db22d5687314799576d65 100644 (file)
@@ -104,7 +104,7 @@ static void reset_coprocessor(void)
        (((u64)(base & 0xff000000) << 32) |     \
         ((u64)flags << 40) |                   \
         ((u64)(limit & 0x00ff0000) << 32) |    \
-        ((u64)(base & 0x00ffff00) << 16) |     \
+        ((u64)(base & 0x00ffffff) << 16) |     \
         ((u64)(limit & 0x0000ffff)))
 
 struct gdt_ptr {
@@ -121,6 +121,10 @@ static void setup_gdt(void)
                [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
                /* DS: data, read/write, 4 GB, base 0 */
                [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+               /* TSS: 32-bit tss, 104 bytes, base 4096 */
+               /* We only have a TSS here to keep Intel VT happy;
+                  we don't actually use it for anything. */
+               [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
        };
        /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
           of the gdt_ptr contents.  Thus, make it static so it will
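Two things change in setup_gdt() above: a boot TSS entry is added to keep Intel VT happy, and the GDT_ENTRY base mask is widened from 0x00ffff00 to 0x00ffffff so that all 24 low base bits land in descriptor bits 16-39. The old mask silently dropped base bits 0-7, which happened not to matter while every segment used base 0. A hedged user-space check of the packing, using the BOOT_CS values from the hunk plus one hypothetical base that exposes the old bug:

    /* Descriptor packing check mirroring the GDT_ENTRY macro above;
     * illustrative only. */
    #include <stdio.h>
    #include <stdint.h>

    #define GDT_ENTRY(flags, base, limit)                        \
            ((((uint64_t)((base)  & 0xff000000)) << 32) |        \
             (((uint64_t)(flags)) << 40)                |        \
             (((uint64_t)((limit) & 0x00ff0000)) << 32) |        \
             (((uint64_t)((base)  & 0x00ffffff)) << 16) |        \
             ((uint64_t)((limit) & 0x0000ffff)))

    int main(void)
    {
            /* Flat 4 GB code segment: prints 0xcf9b000000ffff. */
            printf("%#llx\n",
                   (unsigned long long)GDT_ENTRY(0xc09b, 0, 0xfffff));
            /* Hypothetical base 0x00123456: with the old 0x00ffff00 mask
             * the low byte (0x56) of the base would have been dropped. */
            printf("%#llx\n",
                   (unsigned long long)GDT_ENTRY(0x0089, 0x00123456, 103));
            return 0;
    }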
index fa6bed1fac14862b86936f1bc45af5c9b64aa7cf..f5402d51f7c3d1ff534a009d54d438a3cfa54ec8 100644 (file)
@@ -15,6 +15,7 @@
  */
 
 #include <asm/boot.h>
+#include <asm/processor-flags.h>
 #include <asm/segment.h>
 
        .text
  */
 protected_mode_jump:
        movl    %edx, %esi              # Pointer to boot_params table
-       movl    %eax, 2f                # Patch ljmpl instruction
+
+       xorl    %ebx, %ebx
+       movw    %cs, %bx
+       shll    $4, %ebx
+       addl    %ebx, 2f
 
        movw    $__BOOT_DS, %cx
-       xorl    %ebx, %ebx              # Per the 32-bit boot protocol
-       xorl    %ebp, %ebp              # Per the 32-bit boot protocol
-       xorl    %edi, %edi              # Per the 32-bit boot protocol
+       movw    $__BOOT_TSS, %di
 
        movl    %cr0, %edx
-       orb     $1, %dl                 # Protected mode (PE) bit
+       orb     $X86_CR0_PE, %dl        # Protected mode
        movl    %edx, %cr0
        jmp     1f                      # Short jump to serialize on 386/486
 1:
 
-       movw    %cx, %ds
-       movw    %cx, %es
-       movw    %cx, %fs
-       movw    %cx, %gs
-       movw    %cx, %ss
-
-       # Jump to the 32-bit entrypoint
+       # Transition to 32-bit mode
        .byte   0x66, 0xea              # ljmpl opcode
-2:     .long   0                       # offset
+2:     .long   in_pm32                 # offset
        .word   __BOOT_CS               # segment
 
        .size   protected_mode_jump, .-protected_mode_jump
+
+       .code32
+       .type   in_pm32, @function
+in_pm32:
+       # Set up data segments for flat 32-bit mode
+       movl    %ecx, %ds
+       movl    %ecx, %es
+       movl    %ecx, %fs
+       movl    %ecx, %gs
+       movl    %ecx, %ss
+       # The 32-bit code sets up its own stack, but this way we do have
+       # a valid stack if some debugging hack wants to use it.
+       addl    %ebx, %esp
+
+       # Set up TR to make Intel VT happy
+       ltr     %di
+
+       # Clear registers to allow for future extensions to the
+       # 32-bit boot protocol
+       xorl    %ecx, %ecx
+       xorl    %edx, %edx
+       xorl    %ebx, %ebx
+       xorl    %ebp, %ebp
+       xorl    %edi, %edi
+
+       # Set up LDTR to make Intel VT happy
+       lldt    %cx
+
+       jmpl    *%eax                   # Jump to the 32-bit entrypoint
+
+       .size   in_pm32, .-in_pm32
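The patched ljmpl above no longer receives an absolute 32-bit address in %eax; instead the code relocates the in_pm32 offset by its own real-mode segment base, i.e. linear = (%cs << 4) + offset, and only jumps to the %eax entry point once segments, TR and LDTR are set up. The segment arithmetic, as a hedged toy example (the CS and offset values are made up):

    /* Real-mode address arithmetic used by the "addl %ebx, 2f" fixup;
     * the segment and offset values here are hypothetical. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int cs = 0x9020;      /* hypothetical real-mode %cs */
            unsigned int offset = 0x01a4;  /* hypothetical in_pm32 offset */

            printf("linear address = %#x\n", (cs << 4) + offset); /* 0x903a4 */
            return 0;
    }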
index ed0672a818709b86d8c6ebd6416269434934091e..ff664a117096d5453708f541f709858f5ffc05f0 100644 (file)
@@ -104,6 +104,7 @@ static int bios_probe(void)
 
                mi = GET_HEAP(struct mode_info, 1);
                mi->mode = VIDEO_FIRST_BIOS+mode;
+               mi->depth = 0;  /* text */
                mi->x = rdfs16(0x44a);
                mi->y = rdfs8(0x484)+1;
                nmodes++;
@@ -116,7 +117,7 @@ static int bios_probe(void)
 
 __videocard video_bios =
 {
-       .card_name      = "BIOS (scanned)",
+       .card_name      = "BIOS",
        .probe          = bios_probe,
        .set_mode       = bios_set_mode,
        .unsafe         = 1,
index 4716b9a963575aa8d0eedf703feaa7997ada608e..662dd2f130684c050d46f6950263ef9b4e7cfc08 100644 (file)
@@ -79,20 +79,28 @@ static int vesa_probe(void)
                        /* Text Mode, TTY BIOS supported,
                           supported by hardware */
                        mi = GET_HEAP(struct mode_info, 1);
-                       mi->mode = mode + VIDEO_FIRST_VESA;
-                       mi->x    = vminfo.h_res;
-                       mi->y    = vminfo.v_res;
+                       mi->mode  = mode + VIDEO_FIRST_VESA;
+                       mi->depth = 0; /* text */
+                       mi->x     = vminfo.h_res;
+                       mi->y     = vminfo.v_res;
                        nmodes++;
-               } else if ((vminfo.mode_attr & 0x99) == 0x99) {
+               } else if ((vminfo.mode_attr & 0x99) == 0x99 &&
+                          (vminfo.memory_layout == 4 ||
+                           vminfo.memory_layout == 6) &&
+                          vminfo.memory_planes == 1) {
 #ifdef CONFIG_FB
                        /* Graphics mode, color, linear frame buffer
-                          supported -- register the mode but hide from
-                          the menu.  Only do this if framebuffer is
-                          configured, however, otherwise the user will
-                          be left without a screen. */
+                          supported.  Only register the mode if
+                          if framebuffer is configured, however,
+                          otherwise the user will be left without a screen.
+                          We don't require CONFIG_FB_VESA, however, since
+                          some of the other framebuffer drivers can use
+                          this mode-setting, too. */
                        mi = GET_HEAP(struct mode_info, 1);
                        mi->mode = mode + VIDEO_FIRST_VESA;
-                       mi->x = mi->y = 0;
+                       mi->depth = vminfo.bpp;
+                       mi->x = vminfo.h_res;
+                       mi->y = vminfo.v_res;
                        nmodes++;
 #endif
                }
index aef02f9ec0c130e50b139a30863373c0aed22395..7259387b7d1980e20b08705d358cf0b92c2111fa 100644 (file)
 #include "video.h"
 
 static struct mode_info vga_modes[] = {
-       { VIDEO_80x25,  80, 25 },
-       { VIDEO_8POINT, 80, 50 },
-       { VIDEO_80x43,  80, 43 },
-       { VIDEO_80x28,  80, 28 },
-       { VIDEO_80x30,  80, 30 },
-       { VIDEO_80x34,  80, 34 },
-       { VIDEO_80x60,  80, 60 },
+       { VIDEO_80x25,  80, 25, 0 },
+       { VIDEO_8POINT, 80, 50, 0 },
+       { VIDEO_80x43,  80, 43, 0 },
+       { VIDEO_80x28,  80, 28, 0 },
+       { VIDEO_80x30,  80, 30, 0 },
+       { VIDEO_80x34,  80, 34, 0 },
+       { VIDEO_80x60,  80, 60, 0 },
 };
 
 static struct mode_info ega_modes[] = {
-       { VIDEO_80x25,  80, 25 },
-       { VIDEO_8POINT, 80, 43 },
+       { VIDEO_80x25,  80, 25, 0 },
+       { VIDEO_8POINT, 80, 43, 0 },
 };
 
 static struct mode_info cga_modes[] = {
-       { VIDEO_80x25,  80, 25 },
+       { VIDEO_80x25,  80, 25, 0 },
 };
 
 __videocard video_vga;
index ad9712f01739748c0d0d7d727ac34e20401de26f..696d08f3843ccbc6a14d9d01bca817554175d933 100644 (file)
@@ -293,13 +293,28 @@ static void display_menu(void)
        struct mode_info *mi;
        char ch;
        int i;
+       int nmodes;
+       int modes_per_line;
+       int col;
 
-       puts("Mode:    COLSxROWS:\n");
+       nmodes = 0;
+       for (card = video_cards; card < video_cards_end; card++)
+               nmodes += card->nmodes;
 
+       modes_per_line = 1;
+       if (nmodes >= 20)
+               modes_per_line = 3;
+
+       for (col = 0; col < modes_per_line; col++)
+               puts("Mode: Resolution:  Type: ");
+       putchar('\n');
+
+       col = 0;
        ch = '0';
        for (card = video_cards; card < video_cards_end; card++) {
                mi = card->modes;
                for (i = 0; i < card->nmodes; i++, mi++) {
+                       char resbuf[32];
                        int visible = mi->x && mi->y;
                        u16 mode_id = mi->mode ? mi->mode :
                                (mi->y << 8)+mi->x;
@@ -307,8 +322,18 @@ static void display_menu(void)
                        if (!visible)
                                continue; /* Hidden mode */
 
-                       printf("%c  %04X  %3dx%-3d  %s\n",
-                              ch, mode_id, mi->x, mi->y, card->card_name);
+                       if (mi->depth)
+                               sprintf(resbuf, "%dx%d", mi->y, mi->depth);
+                       else
+                               sprintf(resbuf, "%d", mi->y);
+
+                       printf("%c %03X %4dx%-7s %-6s",
+                              ch, mode_id, mi->x, resbuf, card->card_name);
+                       col++;
+                       if (col >= modes_per_line) {
+                               putchar('\n');
+                               col = 0;
+                       }
 
                        if (ch == '9')
                                ch = 'a';
@@ -318,6 +343,8 @@ static void display_menu(void)
                                ch++;
                }
        }
+       if (col)
+               putchar('\n');
 }
 
 #define H(x)   ((x)-'a'+10)
index b92447d51213be34075c0397fb62787d348d85e6..d69347f79e8e5b76e23233f7b64216023336e96f 100644 (file)
@@ -83,7 +83,8 @@ void store_screen(void);
 
 struct mode_info {
        u16 mode;               /* Mode number (vga= style) */
-       u8  x, y;               /* Width, height */
+       u16 x, y;               /* Width, height */
+       u16 depth;              /* Bits per pixel, 0 for text mode */
 };
 
 struct card_info {
index 61c8fe0453be5e08773e6334d7361edddcb5c78c..6499e3239b4132213907ab9ff54d00be9219dd1b 100644 (file)
@@ -16,8 +16,6 @@
 
 #include "boot.h"
 
-#ifdef CONFIG_X86_VOYAGER
-
 int query_voyager(void)
 {
        u8 err;
@@ -42,5 +40,3 @@ int query_voyager(void)
        copy_from_fs(data_ptr, di, 7);  /* Table is 7 bytes apparently */
        return 0;
 }
-
-#endif /* CONFIG_X86_VOYAGER */
index 54ee1764fdaebbb3d21aefc48083188d716ce256..77562e7cdab67a9044347214adcfd285a44233c2 100644 (file)
@@ -99,9 +99,9 @@ CONFIG_IOSCHED_NOOP=y
 CONFIG_IOSCHED_AS=y
 CONFIG_IOSCHED_DEADLINE=y
 CONFIG_IOSCHED_CFQ=y
-CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_AS is not set
 # CONFIG_DEFAULT_DEADLINE is not set
-# CONFIG_DEFAULT_CFQ is not set
+CONFIG_DEFAULT_CFQ=y
 # CONFIG_DEFAULT_NOOP is not set
 CONFIG_DEFAULT_IOSCHED="anticipatory"
 
index 38a83f9c966ff252be2218a53fae5d65c8b41d24..9e2b0ef851dee9ef62ed6494abcafb389921e10e 100644 (file)
@@ -145,15 +145,6 @@ CONFIG_K8_NUMA=y
 CONFIG_NODES_SHIFT=6
 CONFIG_X86_64_ACPI_NUMA=y
 CONFIG_NUMA_EMU=y
-CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
-CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_SELECT_MEMORY_MODEL=y
-# CONFIG_FLATMEM_MANUAL is not set
-CONFIG_DISCONTIGMEM_MANUAL=y
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_DISCONTIGMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
 CONFIG_NEED_MULTIPLE_NODES=y
 # CONFIG_SPARSEMEM_STATIC is not set
 CONFIG_SPLIT_PTLOCK_CPUS=4
index e2edda255a8494c79fbcd04a048f9315932a5f95..52d0ccfcf6eafbc84fe0d66270b165b34101639d 100644 (file)
@@ -2,9 +2,7 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \
-       ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \
-       mmap32.o
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
@@ -13,40 +11,3 @@ obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
 audit-class-$(CONFIG_AUDIT) := audit.o
 obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
-
-$(obj)/syscall32_syscall.o: \
-       $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
-
-# Teach kbuild about targets
-targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\
-                    $F.o $F.so $F.so.dbg)
-
-# The DSO images are built using a special linker script
-quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -m32 -nostdlib -shared \
-                         $(call ld-option, -Wl$(comma)--hash-style=sysv) \
-                          -Wl,-soname=linux-gate.so.1 -o $@ \
-                          -Wl,-T,$(filter-out FORCE,$^)
-
-$(obj)/%.so: OBJCOPYFLAGS := -S
-$(obj)/%.so: $(obj)/%.so.dbg FORCE
-       $(call if_changed,objcopy)
-
-$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \
-$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
-       $(call if_changed,syscall)
-
-AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
-AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
-
-vdsos := vdso32-sysenter.so vdso32-syscall.so
-
-quiet_cmd_vdso_install = INSTALL $@
-      cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \
-                           $(MODLIB)/vdso/$@
-
-$(vdsos):
-       @mkdir -p $(MODLIB)/vdso
-       $(call cmd,vdso_install)
-
-vdso_install: $(vdsos)
index 91b7b5922dfa8a150d3aaa189fec81e0080033b1..5d7b381da692e082ae4edb75a71e8a4c009bdb34 100644 (file)
@@ -27,7 +27,7 @@ unsigned ia32_signal_class[] = {
 
 int ia32_classify_syscall(unsigned syscall)
 {
-       switch(syscall) {
+       switch (syscall) {
        case __NR_open:
                return 2;
        case __NR_openat:
diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c
deleted file mode 100644 (file)
index 2c8209a..0000000
+++ /dev/null
@@ -1,183 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
- * This is used for ptrace, signals and coredumps in 32bit emulation.
- */ 
-
-#include <linux/sched.h>
-#include <asm/sigcontext32.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
-       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
-        tmp = ~twd;
-        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-        /* and move the valid bits to the lower byte. */
-        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-        return tmp;
-}
-
-static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
-       struct _fpxreg *st = NULL;
-       unsigned long tos = (fxsave->swd >> 11) & 7;
-       unsigned long twd = (unsigned long) fxsave->twd;
-       unsigned long tag;
-       unsigned long ret = 0xffff0000;
-       int i;
-
-#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16);
-
-       for (i = 0 ; i < 8 ; i++) {
-               if (twd & 0x1) {
-                       st = FPREG_ADDR( fxsave, (i - tos) & 7 );
-
-                       switch (st->exponent & 0x7fff) {
-                       case 0x7fff:
-                               tag = 2;                /* Special */
-                               break;
-                       case 0x0000:
-                               if ( !st->significand[0] &&
-                                    !st->significand[1] &&
-                                    !st->significand[2] &&
-                                    !st->significand[3] ) {
-                                       tag = 1;        /* Zero */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       default:
-                               if (st->significand[3] & 0x8000) {
-                                       tag = 0;        /* Valid */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       }
-               } else {
-                       tag = 3;                        /* Empty */
-               }
-               ret |= (tag << (2 * i));
-               twd = twd >> 1;
-       }
-       return ret;
-}
-
-
-static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
-                                        struct _fpstate_ia32 __user *buf)
-{
-       struct _fpxreg *to;
-       struct _fpreg __user *from;
-       int i;
-       u32 v;
-       int err = 0;
-
-#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
-       G(0, fxsave->cwd);
-       G(1, fxsave->swd);
-       G(2, fxsave->twd);
-       fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
-       G(3, fxsave->rip);
-       G(4, v);
-       fxsave->fop = v>>16;    /* cs ignored */
-       G(5, fxsave->rdp);
-       /* 6: ds ignored */
-#undef G
-       if (err) 
-               return -1; 
-
-       to = (struct _fpxreg *)&fxsave->st_space[0];
-       from = &buf->_st[0];
-       for (i = 0 ; i < 8 ; i++, to++, from++) {
-               if (__copy_from_user(to, from, sizeof(*from)))
-                       return -1;
-       }
-       return 0;
-}
-
-
-static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
-                                      struct i387_fxsave_struct *fxsave,
-                                      struct pt_regs *regs,
-                                      struct task_struct *tsk)
-{
-       struct _fpreg __user *to;
-       struct _fpxreg *from;
-       int i;
-       u16 cs,ds; 
-       int err = 0; 
-
-       if (tsk == current) {
-               /* should be actually ds/cs at fpu exception time,
-                  but that information is not available in 64bit mode. */
-               asm("movw %%ds,%0 " : "=r" (ds)); 
-               asm("movw %%cs,%0 " : "=r" (cs));               
-       } else { /* ptrace. task has stopped. */
-               ds = tsk->thread.ds;
-               cs = regs->cs;
-       } 
-
-#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
-       P(0, (u32)fxsave->cwd | 0xffff0000);
-       P(1, (u32)fxsave->swd | 0xffff0000);
-       P(2, twd_fxsr_to_i387(fxsave));
-       P(3, (u32)fxsave->rip);
-       P(4,  cs | ((u32)fxsave->fop) << 16); 
-       P(5, fxsave->rdp);
-       P(6, 0xffff0000 | ds);
-#undef P
-
-       if (err) 
-               return -1; 
-
-       to = &buf->_st[0];
-       from = (struct _fpxreg *) &fxsave->st_space[0];
-       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
-               if (__copy_to_user(to, from, sizeof(*to)))
-                       return -1;
-       }
-       return 0;
-}
-
-int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) 
-{ 
-       clear_fpu(tsk);
-       if (!fsave) { 
-               if (__copy_from_user(&tsk->thread.i387.fxsave, 
-                                    &buf->_fxsr_env[0],
-                                    sizeof(struct i387_fxsave_struct)))
-                       return -1;
-               tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-               set_stopped_child_used_math(tsk);
-       } 
-       return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
-}  
-
-int save_i387_ia32(struct task_struct *tsk, 
-                  struct _fpstate_ia32 __user *buf, 
-                  struct pt_regs *regs,
-                  int fsave)
-{
-       int err = 0;
-
-       init_fpu(tsk);
-       if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
-               return -1;
-       if (fsave)
-               return 0;
-       err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
-       if (fsave) 
-               return err ? -1 : 1;    
-       err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
-       err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
-                             sizeof(struct i387_fxsave_struct));
-       return err ? -1 : 1;
-}
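For reference, the tag-word compression deleted above can be exercised on its own. The following is a standalone user-space sketch, not part of this patch; it mirrors twd_i387_to_fxsr() from the removed file, and the test values in main() are made up for illustration.

/* Standalone illustration: an i387 tag word uses two bits per FPU register
 * (00 valid, 01 zero, 10 special, 11 empty); FXSAVE keeps only one bit per
 * register (1 = in use, 0 = empty). This mirrors the deleted kernel helper. */
#include <stdio.h>

static unsigned short twd_i387_to_fxsr(unsigned short twd)
{
	unsigned int tmp;	/* to avoid 16 bit prefixes in the code */

	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
	tmp = ~twd;
	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* 0V0V0V0V0V0V0V0V */
	/* and move the valid bits to the lower byte. */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* 00VV00VV00VV00VV */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* 0000VVVV0000VVVV */
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* 00000000VVVVVVVV */
	return tmp;
}

int main(void)
{
	/* all eight registers empty (tag 11) -> no bits set in FXSR form */
	printf("0x%04x -> 0x%02x\n", 0xffff, twd_i387_to_fxsr(0xffff));
	/* st(0) valid (tag 00), the rest empty -> only bit 0 set */
	printf("0x%04x -> 0x%02x\n", 0xfffc, twd_i387_to_fxsr(0xfffc));
	return 0;
}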
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f82e1a94fcb7eb3e5060ce21dd34e0cbafc4be1a..e4c12079171b682a5a45bc5d70daedec3536e2aa 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #undef WARN_OLD
 #undef CORE_DUMP /* probably broken */
 
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
-static int load_aout_library(struct file*);
+static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
+static int load_aout_library(struct file *);
 
 #ifdef CORE_DUMP
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
+                         unsigned long limit);
 
 /*
  * fill in the user structure for a core dump..
  */
-static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
+static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
 {
-       u32 fs,gs;
+       u32 fs, gs;
 
 /* changed the size calculations - should hopefully work better. lbt */
        dump->magic = CMAGIC;
        dump->start_code = 0;
-       dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
+       dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
        dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
-       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+       dump->u_dsize = ((unsigned long)
+                        (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
        dump->u_dsize -= dump->u_tsize;
        dump->u_ssize = 0;
-       dump->u_debugreg[0] = current->thread.debugreg0;  
-       dump->u_debugreg[1] = current->thread.debugreg1;  
-       dump->u_debugreg[2] = current->thread.debugreg2;  
-       dump->u_debugreg[3] = current->thread.debugreg3;  
-       dump->u_debugreg[4] = 0;  
-       dump->u_debugreg[5] = 0;  
-       dump->u_debugreg[6] = current->thread.debugreg6;  
-       dump->u_debugreg[7] = current->thread.debugreg7;  
-
-       if (dump->start_stack < 0xc0000000)
-               dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
-
-       dump->regs.ebx = regs->rbx;
-       dump->regs.ecx = regs->rcx;
-       dump->regs.edx = regs->rdx;
-       dump->regs.esi = regs->rsi;
-       dump->regs.edi = regs->rdi;
-       dump->regs.ebp = regs->rbp;
-       dump->regs.eax = regs->rax;
+       dump->u_debugreg[0] = current->thread.debugreg0;
+       dump->u_debugreg[1] = current->thread.debugreg1;
+       dump->u_debugreg[2] = current->thread.debugreg2;
+       dump->u_debugreg[3] = current->thread.debugreg3;
+       dump->u_debugreg[4] = 0;
+       dump->u_debugreg[5] = 0;
+       dump->u_debugreg[6] = current->thread.debugreg6;
+       dump->u_debugreg[7] = current->thread.debugreg7;
+
+       if (dump->start_stack < 0xc0000000) {
+               unsigned long tmp;
+
+               tmp = (unsigned long) (0xc0000000 - dump->start_stack);
+               dump->u_ssize = tmp >> PAGE_SHIFT;
+       }
+
+       dump->regs.bx = regs->bx;
+       dump->regs.cx = regs->cx;
+       dump->regs.dx = regs->dx;
+       dump->regs.si = regs->si;
+       dump->regs.di = regs->di;
+       dump->regs.bp = regs->bp;
+       dump->regs.ax = regs->ax;
        dump->regs.ds = current->thread.ds;
        dump->regs.es = current->thread.es;
        asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
-       asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 
-       dump->regs.orig_eax = regs->orig_rax;
-       dump->regs.eip = regs->rip;
+       asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
+       dump->regs.orig_ax = regs->orig_ax;
+       dump->regs.ip = regs->ip;
        dump->regs.cs = regs->cs;
-       dump->regs.eflags = regs->eflags;
-       dump->regs.esp = regs->rsp;
+       dump->regs.flags = regs->flags;
+       dump->regs.sp = regs->sp;
        dump->regs.ss = regs->ss;
 
 #if 1 /* FIXME */
        dump->u_fpvalid = 0;
 #else
-       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+       dump->u_fpvalid = dump_fpu(regs, &dump->i387);
 #endif
 }
 
@@ -128,15 +135,19 @@ static int dump_write(struct file *file, const void *addr, int nr)
        return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
 }
 
-#define DUMP_WRITE(addr, nr)   \
+#define DUMP_WRITE(addr, nr)                        \
        if (!dump_write(file, (void *)(addr), (nr))) \
                goto end_coredump;
 
-#define DUMP_SEEK(offset) \
-if (file->f_op->llseek) { \
-       if (file->f_op->llseek(file,(offset),0) != (offset)) \
-               goto end_coredump; \
-} else file->f_pos = (offset)
+#define DUMP_SEEK(offset)                                              \
+       if (file->f_op->llseek) {                                       \
+               if (file->f_op->llseek(file, (offset), 0) != (offset))  \
+                       goto end_coredump;                              \
+       } else                                                          \
+               file->f_pos = (offset)
+
+#define START_DATA()   (u.u_tsize << PAGE_SHIFT)
+#define START_STACK(u) (u.start_stack)
 
 /*
  * Routine writes a core dump image in the current directory.
@@ -148,62 +159,70 @@ if (file->f_op->llseek) { \
  * dumping of the process results in another error..
  */
 
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
+                         unsigned long limit)
 {
        mm_segment_t fs;
        int has_dumped = 0;
        unsigned long dump_start, dump_size;
        struct user32 dump;
-#       define START_DATA(u)   (u.u_tsize << PAGE_SHIFT)
-#       define START_STACK(u)   (u.start_stack)
 
        fs = get_fs();
        set_fs(KERNEL_DS);
        has_dumped = 1;
        current->flags |= PF_DUMPCORE;
-               strncpy(dump.u_comm, current->comm, sizeof(current->comm));
-       dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
+       strncpy(dump.u_comm, current->comm, sizeof(current->comm));
+       dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) -
+                          ((unsigned long)(&dump)));
        dump.signal = signr;
        dump_thread32(regs, &dump);
 
-/* If the size of the dump file exceeds the rlimit, then see what would happen
-   if we wrote the stack, but not the data area.  */
+       /*
+        * If the size of the dump file exceeds the rlimit, then see
+        * what would happen if we wrote the stack, but not the data
+        * area.
+        */
        if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit)
                dump.u_dsize = 0;
 
-/* Make sure we have enough room to write the stack and data areas. */
+       /* Make sure we have enough room to write the stack and data areas. */
        if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
                dump.u_ssize = 0;
 
-/* make sure we actually have a data and stack area to dump */
+       /* make sure we actually have a data and stack area to dump */
        set_fs(USER_DS);
-       if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+       if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
+                      dump.u_dsize << PAGE_SHIFT))
                dump.u_dsize = 0;
-       if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+       if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
+                      dump.u_ssize << PAGE_SHIFT))
                dump.u_ssize = 0;
 
        set_fs(KERNEL_DS);
-/* struct user */
-       DUMP_WRITE(&dump,sizeof(dump));
-/* Now dump all of the user data.  Include malloced stuff as well */
+       /* struct user */
+       DUMP_WRITE(&dump, sizeof(dump));
+       /* Now dump all of the user data.  Include malloced stuff as well */
        DUMP_SEEK(PAGE_SIZE);
-/* now we start writing out the user space info */
+       /* now we start writing out the user space info */
        set_fs(USER_DS);
-/* Dump the data area */
+       /* Dump the data area */
        if (dump.u_dsize != 0) {
                dump_start = START_DATA(dump);
                dump_size = dump.u_dsize << PAGE_SHIFT;
-               DUMP_WRITE(dump_start,dump_size);
+               DUMP_WRITE(dump_start, dump_size);
        }
-/* Now prepare to dump the stack area */
+       /* Now prepare to dump the stack area */
        if (dump.u_ssize != 0) {
                dump_start = START_STACK(dump);
                dump_size = dump.u_ssize << PAGE_SHIFT;
-               DUMP_WRITE(dump_start,dump_size);
+               DUMP_WRITE(dump_start, dump_size);
        }
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
+       /*
+        * Finally dump the task struct.  Not be used by gdb, but
+        * could be useful
+        */
        set_fs(KERNEL_DS);
-       DUMP_WRITE(current,sizeof(*current));
+       DUMP_WRITE(current, sizeof(*current));
 end_coredump:
        set_fs(fs);
        return has_dumped;
@@ -217,35 +236,34 @@ end_coredump:
  */
 static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
 {
-       u32 __user *argv;
-       u32 __user *envp;
-       u32 __user *sp;
-       int argc = bprm->argc;
-       int envc = bprm->envc;
+       u32 __user *argv, *envp, *sp;
+       int argc = bprm->argc, envc = bprm->envc;
 
        sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
        sp -= envc+1;
        envp = sp;
        sp -= argc+1;
        argv = sp;
-       put_user((unsigned long) envp,--sp);
-       put_user((unsigned long) argv,--sp);
-       put_user(argc,--sp);
+       put_user((unsigned long) envp, --sp);
+       put_user((unsigned long) argv, --sp);
+       put_user(argc, --sp);
        current->mm->arg_start = (unsigned long) p;
-       while (argc-->0) {
+       while (argc-- > 0) {
                char c;
-               put_user((u32)(unsigned long)p,argv++);
+
+               put_user((u32)(unsigned long)p, argv++);
                do {
-                       get_user(c,p++);
+                       get_user(c, p++);
                } while (c);
        }
        put_user(0, argv);
        current->mm->arg_end = current->mm->env_start = (unsigned long) p;
-       while (envc-->0) {
+       while (envc-- > 0) {
                char c;
-               put_user((u32)(unsigned long)p,envp++);
+
+               put_user((u32)(unsigned long)p, envp++);
                do {
-                       get_user(c,p++);
+                       get_user(c, p++);
                } while (c);
        }
        put_user(0, envp);
@@ -257,20 +275,18 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
  * These are the functions used to load a.out style executables and shared
  * libraries.  There is no binary dependent code anywhere else.
  */
-
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
+       unsigned long error, fd_offset, rlim;
        struct exec ex;
-       unsigned long error;
-       unsigned long fd_offset;
-       unsigned long rlim;
        int retval;
 
        ex = *((struct exec *) bprm->buf);              /* exec-header */
        if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
             N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
            N_TRSIZE(ex) || N_DRSIZE(ex) ||
-           i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+           i_size_read(bprm->file->f_path.dentry->d_inode) <
+           ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
                return -ENOEXEC;
        }
 
@@ -291,13 +307,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        if (retval)
                return retval;
 
-       regs->cs = __USER32_CS; 
+       regs->cs = __USER32_CS;
        regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
                regs->r13 = regs->r14 = regs->r15 = 0;
 
        /* OK, This is the point of no return */
        set_personality(PER_LINUX);
-       set_thread_flag(TIF_IA32); 
+       set_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ABI_PENDING);
 
        current->mm->end_code = ex.a_text +
@@ -311,7 +327,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
        current->mm->mmap = NULL;
        compute_creds(bprm);
-       current->flags &= ~PF_FORKNOEXEC;
+       current->flags &= ~PF_FORKNOEXEC;
 
        if (N_MAGIC(ex) == OMAGIC) {
                unsigned long text_addr, map_size;
@@ -338,30 +354,31 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                        send_sig(SIGKILL, current, 0);
                        return error;
                }
-                        
+
                flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
        } else {
 #ifdef WARN_OLD
                static unsigned long error_time, error_time2;
                if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
-                   (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
-               {
+                   (N_MAGIC(ex) != NMAGIC) &&
+                               time_after(jiffies, error_time2 + 5*HZ)) {
                        printk(KERN_NOTICE "executable not page aligned\n");
                        error_time2 = jiffies;
                }
 
                if ((fd_offset & ~PAGE_MASK) != 0 &&
-                   (jiffies-error_time) > 5*HZ)
-               {
-                       printk(KERN_WARNING 
-                              "fd_offset is not page aligned. Please convert program: %s\n",
+                           time_after(jiffies, error_time + 5*HZ)) {
+                       printk(KERN_WARNING
+                              "fd_offset is not page aligned. Please convert "
+                              "program: %s\n",
                               bprm->file->f_path.dentry->d_name.name);
                        error_time = jiffies;
                }
 #endif
 
-               if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
+               if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
                        loff_t pos = fd_offset;
+
                        down_write(&current->mm->mmap_sem);
                        do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
                        up_write(&current->mm->mmap_sem);
@@ -376,9 +393,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
                down_write(&current->mm->mmap_sem);
                error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
-                       PROT_READ | PROT_EXEC,
-                       MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
-                       fd_offset);
+                               PROT_READ | PROT_EXEC,
+                               MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
+                               MAP_EXECUTABLE | MAP_32BIT,
+                               fd_offset);
                up_write(&current->mm->mmap_sem);
 
                if (error != N_TXTADDR(ex)) {
@@ -387,9 +405,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                }
 
                down_write(&current->mm->mmap_sem);
-               error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+               error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
                                PROT_READ | PROT_WRITE | PROT_EXEC,
-                               MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
+                               MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
+                               MAP_EXECUTABLE | MAP_32BIT,
                                fd_offset + ex.a_text);
                up_write(&current->mm->mmap_sem);
                if (error != N_DATADDR(ex)) {
@@ -403,9 +422,9 @@ beyond_if:
        set_brk(current->mm->start_brk, current->mm->brk);
 
        retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
-       if (retval < 0) { 
-               /* Someone check-me: is this error path enough? */ 
-               send_sig(SIGKILL, current, 0); 
+       if (retval < 0) {
+               /* Someone check-me: is this error path enough? */
+               send_sig(SIGKILL, current, 0);
                return retval;
        }
 
@@ -414,10 +433,10 @@ beyond_if:
        /* start thread */
        asm volatile("movl %0,%%fs" :: "r" (0)); \
        asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
-       load_gs_index(0); 
-       (regs)->rip = ex.a_entry;
-       (regs)->rsp = current->mm->start_stack;
-       (regs)->eflags = 0x200;
+       load_gs_index(0);
+       (regs)->ip = ex.a_entry;
+       (regs)->sp = current->mm->start_stack;
+       (regs)->flags = 0x200;
        (regs)->cs = __USER32_CS;
        (regs)->ss = __USER32_DS;
        regs->r8 = regs->r9 = regs->r10 = regs->r11 =
@@ -425,7 +444,7 @@ beyond_if:
        set_fs(USER_DS);
        if (unlikely(current->ptrace & PT_PTRACED)) {
                if (current->ptrace & PT_TRACE_EXEC)
-                       ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
+                       ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
                else
                        send_sig(SIGTRAP, current, 0);
        }
@@ -434,9 +453,8 @@ beyond_if:
 
 static int load_aout_library(struct file *file)
 {
-       struct inode * inode;
-       unsigned long bss, start_addr, len;
-       unsigned long error;
+       struct inode *inode;
+       unsigned long bss, start_addr, len, error;
        int retval;
        struct exec ex;
 
@@ -450,7 +468,8 @@ static int load_aout_library(struct file *file)
        /* We come in here for the regular a.out style of shared libraries */
        if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
            N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
-           i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+           i_size_read(inode) <
+           ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
                goto out;
        }
 
@@ -467,10 +486,10 @@ static int load_aout_library(struct file *file)
 
 #ifdef WARN_OLD
                static unsigned long error_time;
-               if ((jiffies-error_time) > 5*HZ)
-               {
-                       printk(KERN_WARNING 
-                              "N_TXTOFF is not page aligned. Please convert library: %s\n",
+               if (time_after(jiffies, error_time + 5*HZ)) {
+                       printk(KERN_WARNING
+                              "N_TXTOFF is not page aligned. Please convert "
+                              "library: %s\n",
                               file->f_path.dentry->d_name.name);
                        error_time = jiffies;
                }
@@ -478,11 +497,12 @@ static int load_aout_library(struct file *file)
                down_write(&current->mm->mmap_sem);
                do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
                up_write(&current->mm->mmap_sem);
-               
+
                file->f_op->read(file, (char __user *)start_addr,
                        ex.a_text + ex.a_data, &pos);
                flush_icache_range((unsigned long) start_addr,
-                                  (unsigned long) start_addr + ex.a_text + ex.a_data);
+                                  (unsigned long) start_addr + ex.a_text +
+                                  ex.a_data);
 
                retval = 0;
                goto out;
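The WARN_OLD hunks above replace the open-coded `(jiffies - error_time) > 5*HZ` tests with time_after(), which stays correct when the jiffies counter wraps. A self-contained sketch of that idiom follows; the stand-in type and tick values are made up, and the macro body is essentially the definition from include/linux/jiffies.h without the kernel's typecheck() guards.

/* Sketch: comparing wrapping counters via a signed difference instead of
 * a raw ">" comparison, as the time_after() conversion above relies on. */
#include <stdio.h>

typedef unsigned long jiffies_t;	/* illustrative stand-in for jiffies */

#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	jiffies_t error_time = (jiffies_t)-10;	/* 10 ticks before wrap-around */
	jiffies_t now = 3;			/* 13 ticks later, counter has wrapped */
	jiffies_t deadline = error_time + 5;	/* "5 ticks after error_time" */

	/* The raw comparison is fooled by the wrap: now (3) is numerically
	 * smaller than deadline (ULONG_MAX - 4), so it answers "no". */
	printf("raw compare : %d\n", now > deadline);
	/* time_after() looks at the signed difference and answers "yes",
	 * which is what the printk rate limiting above actually wants. */
	printf("time_after(): %d\n", time_after(now, deadline));
	return 0;
}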
diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c
deleted file mode 100644 (file)
index 55822d2..0000000
+++ /dev/null
@@ -1,285 +0,0 @@
-/* 
- * Written 2000,2002 by Andi Kleen. 
- * 
- * Loosely based on the sparc64 and IA64 32bit emulation loaders.
- * This tricks binfmt_elf.c into loading 32bit binaries using lots 
- * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
- */ 
-
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/compat.h>
-#include <linux/string.h>
-#include <linux/binfmts.h>
-#include <linux/mm.h>
-#include <linux/security.h>
-#include <linux/elfcore-compat.h>
-
-#include <asm/segment.h> 
-#include <asm/ptrace.h>
-#include <asm/processor.h>
-#include <asm/user32.h>
-#include <asm/sigcontext32.h>
-#include <asm/fpu32.h>
-#include <asm/i387.h>
-#include <asm/uaccess.h>
-#include <asm/ia32.h>
-#include <asm/vsyscall32.h>
-
-#undef ELF_ARCH
-#undef ELF_CLASS
-#define ELF_CLASS      ELFCLASS32
-#define ELF_ARCH       EM_386
-
-#undef elfhdr
-#undef elf_phdr
-#undef elf_note
-#undef elf_addr_t
-#define elfhdr         elf32_hdr
-#define elf_phdr       elf32_phdr
-#define elf_note       elf32_note
-#define elf_addr_t     Elf32_Off
-
-#define ELF_NAME "elf/i386"
-
-#define AT_SYSINFO 32
-#define AT_SYSINFO_EHDR                33
-
-int sysctl_vsyscall32 = 1;
-
-#undef ARCH_DLINFO
-#define ARCH_DLINFO do {  \
-       if (sysctl_vsyscall32) { \
-               current->mm->context.vdso = (void *)VSYSCALL32_BASE;    \
-               NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
-               NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE);    \
-       }       \
-} while(0)
-
-struct file;
-
-#define IA32_EMULATOR 1
-
-#undef ELF_ET_DYN_BASE
-
-#define ELF_ET_DYN_BASE                (TASK_UNMAPPED_BASE + 0x1000000)
-
-#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
-
-#define _GET_SEG(x) \
-       ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; })
-
-/* Assumes current==process to be dumped */
-#undef ELF_CORE_COPY_REGS
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                       \
-       pr_reg[0] = regs->rbx;                          \
-       pr_reg[1] = regs->rcx;                          \
-       pr_reg[2] = regs->rdx;                          \
-       pr_reg[3] = regs->rsi;                          \
-       pr_reg[4] = regs->rdi;                          \
-       pr_reg[5] = regs->rbp;                          \
-       pr_reg[6] = regs->rax;                          \
-       pr_reg[7] = _GET_SEG(ds);                       \
-       pr_reg[8] = _GET_SEG(es);                       \
-       pr_reg[9] = _GET_SEG(fs);                       \
-       pr_reg[10] = _GET_SEG(gs);                      \
-       pr_reg[11] = regs->orig_rax;                    \
-       pr_reg[12] = regs->rip;                         \
-       pr_reg[13] = regs->cs;                          \
-       pr_reg[14] = regs->eflags;                      \
-       pr_reg[15] = regs->rsp;                         \
-       pr_reg[16] = regs->ss;
-
-
-#define elf_prstatus   compat_elf_prstatus
-#define elf_prpsinfo   compat_elf_prpsinfo
-#define elf_fpregset_t struct user_i387_ia32_struct
-#define        elf_fpxregset_t struct user32_fxsr_struct
-#define user           user32
-
-#undef elf_read_implies_exec
-#define elf_read_implies_exec(ex, executable_stack)     (executable_stack != EXSTACK_DISABLE_X)
-
-#define elf_core_copy_regs             elf32_core_copy_regs
-static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs,
-                                       struct pt_regs *regs)
-{
-       ELF_CORE_COPY_REGS((&elfregs->ebx), regs)
-}
-
-#define elf_core_copy_task_regs                elf32_core_copy_task_regs
-static inline int elf32_core_copy_task_regs(struct task_struct *t,
-                                           compat_elf_gregset_t* elfregs)
-{      
-       struct pt_regs *pp = task_pt_regs(t);
-       ELF_CORE_COPY_REGS((&elfregs->ebx), pp);
-       /* fix wrong segments */ 
-       elfregs->ds = t->thread.ds;
-       elfregs->fs = t->thread.fsindex;
-       elfregs->gs = t->thread.gsindex;
-       elfregs->es = t->thread.es;
-       return 1; 
-}
-
-#define elf_core_copy_task_fpregs      elf32_core_copy_task_fpregs
-static inline int 
-elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs,
-                           elf_fpregset_t *fpu)
-{
-       struct _fpstate_ia32 *fpstate = (void*)fpu; 
-       mm_segment_t oldfs = get_fs();
-
-       if (!tsk_used_math(tsk))
-               return 0;
-       if (!regs)
-               regs = task_pt_regs(tsk);
-       if (tsk == current)
-               unlazy_fpu(tsk);
-       set_fs(KERNEL_DS); 
-       save_i387_ia32(tsk, fpstate, regs, 1);
-       /* Correct for i386 bug. It puts the fop into the upper 16bits of 
-          the tag word (like FXSAVE), not into the fcs*/ 
-       fpstate->cssel |= fpstate->tag & 0xffff0000; 
-       set_fs(oldfs); 
-       return 1; 
-}
-
-#define ELF_CORE_COPY_XFPREGS 1
-#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
-#define elf_core_copy_task_xfpregs     elf32_core_copy_task_xfpregs
-static inline int 
-elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
-{
-       struct pt_regs *regs = task_pt_regs(t);
-       if (!tsk_used_math(t))
-               return 0;
-       if (t == current)
-               unlazy_fpu(t); 
-       memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
-       xfpu->fcs = regs->cs; 
-       xfpu->fos = t->thread.ds; /* right? */ 
-       return 1;
-}
-
-#undef elf_check_arch
-#define elf_check_arch(x) \
-       ((x)->e_machine == EM_386)
-
-extern int force_personality32;
-
-#undef ELF_EXEC_PAGESIZE
-#undef ELF_HWCAP
-#undef ELF_PLATFORM
-#undef SET_PERSONALITY
-#define ELF_EXEC_PAGESIZE PAGE_SIZE
-#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
-#define ELF_PLATFORM  ("i686")
-#define SET_PERSONALITY(ex, ibcs2)                     \
-do {                                                   \
-       unsigned long new_flags = 0;                            \
-       if ((ex).e_ident[EI_CLASS] == ELFCLASS32)               \
-               new_flags = _TIF_IA32;                          \
-       if ((current_thread_info()->flags & _TIF_IA32)          \
-           != new_flags)                                       \
-               set_thread_flag(TIF_ABI_PENDING);               \
-       else                                                    \
-               clear_thread_flag(TIF_ABI_PENDING);             \
-       /* XXX This overwrites the user set personality */      \
-       current->personality |= force_personality32;            \
-} while (0)
-
-/* Override some function names */
-#define elf_format                     elf32_format
-
-#define init_elf_binfmt                        init_elf32_binfmt
-#define exit_elf_binfmt                        exit_elf32_binfmt
-
-#define load_elf_binary load_elf32_binary
-
-#undef ELF_PLAT_INIT
-#define ELF_PLAT_INIT(r, load_addr)    elf32_init(r)
-
-#undef start_thread
-#define start_thread(regs,new_rip,new_rsp) do { \
-       asm volatile("movl %0,%%fs" :: "r" (0)); \
-       asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
-       load_gs_index(0); \
-       (regs)->rip = (new_rip); \
-       (regs)->rsp = (new_rsp); \
-       (regs)->eflags = 0x200; \
-       (regs)->cs = __USER32_CS; \
-       (regs)->ss = __USER32_DS; \
-       set_fs(USER_DS); \
-} while(0) 
-
-
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); 
-MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
-
-#undef MODULE_DESCRIPTION
-#undef MODULE_AUTHOR
-
-static void elf32_init(struct pt_regs *);
-
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
-#define arch_setup_additional_pages syscall32_setup_pages
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
-
-#include "../../../fs/binfmt_elf.c" 
-
-static void elf32_init(struct pt_regs *regs)
-{
-       struct task_struct *me = current; 
-       regs->rdi = 0;
-       regs->rsi = 0;
-       regs->rdx = 0;
-       regs->rcx = 0;
-       regs->rax = 0;
-       regs->rbx = 0; 
-       regs->rbp = 0; 
-       regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
-               regs->r13 = regs->r14 = regs->r15 = 0; 
-    me->thread.fs = 0; 
-       me->thread.gs = 0;
-       me->thread.fsindex = 0; 
-       me->thread.gsindex = 0;
-    me->thread.ds = __USER_DS; 
-       me->thread.es = __USER_DS;
-}
-
-#ifdef CONFIG_SYSCTL
-/* Register vsyscall32 into the ABI table */
-#include <linux/sysctl.h>
-
-static ctl_table abi_table2[] = {
-       {
-               .procname       = "vsyscall32",
-               .data           = &sysctl_vsyscall32,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec
-       },
-       {}
-};
-
-static ctl_table abi_root_table2[] = {
-       {
-               .ctl_name = CTL_ABI,
-               .procname = "abi",
-               .mode = 0555,
-               .child = abi_table2
-       },
-       {}
-};
-
-static __init int ia32_binfmt_init(void)
-{ 
-       register_sysctl_table(abi_root_table2);
-       return 0;
-}
-__initcall(ia32_binfmt_init);
-#endif
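The ia32_binfmt.c file deleted above reused the generic ELF loader by redefining macros such as ELF_CLASS, elf_phdr and start_thread and then textually including fs/binfmt_elf.c ("poor man's inheritance"). A compressed single-file sketch of that pattern is shown below; it is not kernel code, and a hypothetical DEFINE_LOADER macro stands in for the second source file purely so the example stays self-contained.

/* Sketch (not kernel code) of "write the loader once, instantiate it per
 * ABI", which ia32_binfmt.c implemented with #undef/#define followed by
 * #include "../../../fs/binfmt_elf.c". */
#include <stdio.h>

#define DEFINE_LOADER(name, elf_class_bits)				\
	static int name(void)						\
	{								\
		/* pretend to parse an ELF header of the given class */	\
		return elf_class_bits;					\
	}

DEFINE_LOADER(load_elf_binary, 64)	/* native instantiation */
DEFINE_LOADER(load_elf32_binary, 32)	/* compat instantiation, like ia32_binfmt.c */

int main(void)
{
	printf("native loader handles ELF%d, compat loader handles ELF%d\n",
	       load_elf_binary(), load_elf32_binary());
	return 0;
}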
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 6ea19c25f90d46ce209c646382443c5a8ae0e46e..1c0503bdfb1a1dbd33080620ec67715644fcf054 100644 (file)
@@ -29,9 +29,8 @@
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
 #include <asm/sigcontext32.h>
-#include <asm/fpu32.h>
 #include <asm/proto.h>
-#include <asm/vsyscall32.h>
+#include <asm/vdso.h>
 
 #define DEBUG_SIG 0
 
@@ -43,7 +42,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
 int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 {
        int err;
-       if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+
+       if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
                return -EFAULT;
 
        /* If you change siginfo_t structure, please make sure that
@@ -53,16 +53,19 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
           3 ints plus the relevant union member.  */
        err = __put_user(from->si_signo, &to->si_signo);
        err |= __put_user(from->si_errno, &to->si_errno);
-       err |= __put_user((short)from->si_code, &to->si_code);
+       err |= __put_user((short)from->si_code, &to->si_code);
 
        if (from->si_code < 0) {
                err |= __put_user(from->si_pid, &to->si_pid);
-               err |= __put_user(from->si_uid, &to->si_uid);
-               err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
+               err |= __put_user(from->si_uid, &to->si_uid);
+               err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
        } else {
-               /* First 32bits of unions are always present:
-                * si_pid === si_band === si_tid === si_addr(LS half) */
-               err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
+               /*
+                * First 32bits of unions are always present:
+                * si_pid === si_band === si_tid === si_addr(LS half)
+                */
+               err |= __put_user(from->_sifields._pad[0],
+                                 &to->_sifields._pad[0]);
                switch (from->si_code >> 16) {
                case __SI_FAULT >> 16:
                        break;
@@ -76,14 +79,15 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
                        err |= __put_user(from->si_uid, &to->si_uid);
                        break;
                case __SI_POLL >> 16:
-                       err |= __put_user(from->si_fd, &to->si_fd); 
+                       err |= __put_user(from->si_fd, &to->si_fd);
                        break;
                case __SI_TIMER >> 16:
-                       err |= __put_user(from->si_overrun, &to->si_overrun); 
+                       err |= __put_user(from->si_overrun, &to->si_overrun);
                        err |= __put_user(ptr_to_compat(from->si_ptr),
-                                       &to->si_ptr);
+                                         &to->si_ptr);
                        break;
-               case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+                        /* This is not generated by the kernel as of now.  */
+               case __SI_RT >> 16:
                case __SI_MESGQ >> 16:
                        err |= __put_user(from->si_uid, &to->si_uid);
                        err |= __put_user(from->si_int, &to->si_int);
@@ -97,7 +101,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 {
        int err;
        u32 ptr32;
-       if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
+
+       if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
                return -EFAULT;
 
        err = __get_user(to->si_signo, &from->si_signo);
@@ -112,8 +117,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
        return err;
 }
 
-asmlinkage long
-sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
+asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
        mask &= _BLOCKABLE;
        spin_lock_irq(&current->sighand->siglock);
@@ -128,36 +132,37 @@ sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
        return -ERESTARTNOHAND;
 }
 
-asmlinkage long
-sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
-                 stack_ia32_t __user *uoss_ptr, 
-                 struct pt_regs *regs)
+asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
+                                 stack_ia32_t __user *uoss_ptr,
+                                 struct pt_regs *regs)
 {
-       stack_t uss,uoss; 
+       stack_t uss, uoss;
        int ret;
-       mm_segment_t seg; 
-       if (uss_ptr) { 
+       mm_segment_t seg;
+
+       if (uss_ptr) {
                u32 ptr;
-               memset(&uss,0,sizeof(stack_t));
-               if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
+
+               memset(&uss, 0, sizeof(stack_t));
+               if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) ||
                            __get_user(ptr, &uss_ptr->ss_sp) ||
                            __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
                            __get_user(uss.ss_size, &uss_ptr->ss_size))
                        return -EFAULT;
                uss.ss_sp = compat_ptr(ptr);
        }
-       seg = get_fs(); 
-       set_fs(KERNEL_DS); 
-       ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
-       set_fs(seg); 
+       seg = get_fs();
+       set_fs(KERNEL_DS);
+       ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp);
+       set_fs(seg);
        if (ret >= 0 && uoss_ptr)  {
-               if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
+               if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) ||
                    __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
                    __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
                    __put_user(uoss.ss_size, &uoss_ptr->ss_size))
                        ret = -EFAULT;
-       }       
-       return ret;     
+       }
+       return ret;
 }
 
 /*
@@ -186,87 +191,85 @@ struct rt_sigframe
        char retcode[8];
 };
 
-static int
-ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
+#define COPY(x)                {               \
+       unsigned int reg;               \
+       err |= __get_user(reg, &sc->x); \
+       regs->x = reg;                  \
+}
+
+#define RELOAD_SEG(seg,mask)                                           \
+       { unsigned int cur;                                             \
+         unsigned short pre;                                           \
+         err |= __get_user(pre, &sc->seg);                             \
+         asm volatile("movl %%" #seg ",%0" : "=r" (cur));              \
+         pre |= mask;                                                  \
+         if (pre != cur) loadsegment(seg, pre); }
+
+static int ia32_restore_sigcontext(struct pt_regs *regs,
+                                  struct sigcontext_ia32 __user *sc,
+                                  unsigned int *peax)
 {
-       unsigned int err = 0;
-       
+       unsigned int tmpflags, gs, oldgs, err = 0;
+       struct _fpstate_ia32 __user *buf;
+       u32 tmp;
+
        /* Always make any pending restarted system calls return -EINTR */
        current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
 #if DEBUG_SIG
-       printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
-               sc, sc->err, sc->eip, sc->cs, sc->eflags);
+       printk(KERN_DEBUG "SIG restore_sigcontext: "
+              "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
+              sc, sc->err, sc->ip, sc->cs, sc->flags);
 #endif
-#define COPY(x)                { \
-       unsigned int reg;                       \
-       err |= __get_user(reg, &sc->e ##x);     \
-       regs->r ## x = reg;                     \
-}
 
-#define RELOAD_SEG(seg,mask)                                           \
-       { unsigned int cur;                                             \
-         unsigned short pre;                                           \
-         err |= __get_user(pre, &sc->seg);                             \
-         asm volatile("movl %%" #seg ",%0" : "=r" (cur));              \
-         pre |= mask;                                                  \
-         if (pre != cur) loadsegment(seg,pre); }
-
-       /* Reload fs and gs if they have changed in the signal handler.
-          This does not handle long fs/gs base changes in the handler, but 
-          does not clobber them at least in the normal case. */ 
-       
-       {
-               unsigned gs, oldgs; 
-               err |= __get_user(gs, &sc->gs);
-               gs |= 3; 
-               asm("movl %%gs,%0" : "=r" (oldgs));
-               if (gs != oldgs)
-               load_gs_index(gs); 
-       } 
-       RELOAD_SEG(fs,3);
-       RELOAD_SEG(ds,3);
-       RELOAD_SEG(es,3);
+       /*
+        * Reload fs and gs if they have changed in the signal
+        * handler.  This does not handle long fs/gs base changes in
+        * the handler, but does not clobber them at least in the
+        * normal case.
+        */
+       err |= __get_user(gs, &sc->gs);
+       gs |= 3;
+       asm("movl %%gs,%0" : "=r" (oldgs));
+       if (gs != oldgs)
+               load_gs_index(gs);
+
+       RELOAD_SEG(fs, 3);
+       RELOAD_SEG(ds, 3);
+       RELOAD_SEG(es, 3);
 
        COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
        COPY(dx); COPY(cx); COPY(ip);
-       /* Don't touch extended registers */ 
-       
-       err |= __get_user(regs->cs, &sc->cs); 
-       regs->cs |= 3;  
-       err |= __get_user(regs->ss, &sc->ss); 
-       regs->ss |= 3; 
-
-       {
-               unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->eflags);
-               regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
-               regs->orig_rax = -1;            /* disable syscall checks */
-       }
+       /* Don't touch extended registers */
+
+       err |= __get_user(regs->cs, &sc->cs);
+       regs->cs |= 3;
+       err |= __get_user(regs->ss, &sc->ss);
+       regs->ss |= 3;
+
+       err |= __get_user(tmpflags, &sc->flags);
+       regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+       /* disable syscall checks */
+       regs->orig_ax = -1;
+
+       err |= __get_user(tmp, &sc->fpstate);
+       buf = compat_ptr(tmp);
+       if (buf) {
+               if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+                       goto badframe;
+               err |= restore_i387_ia32(buf);
+       } else {
+               struct task_struct *me = current;
 
-       {
-               u32 tmp;
-               struct _fpstate_ia32 __user * buf;
-               err |= __get_user(tmp, &sc->fpstate);
-               buf = compat_ptr(tmp);
-               if (buf) {
-                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-                               goto badframe;
-                       err |= restore_i387_ia32(current, buf, 0);
-               } else {
-                       struct task_struct *me = current;
-                       if (used_math()) {
-                               clear_fpu(me);
-                               clear_used_math();
-                       }
+               if (used_math()) {
+                       clear_fpu(me);
+                       clear_used_math();
                }
        }
 
-       { 
-               u32 tmp;
-               err |= __get_user(tmp, &sc->eax);
-               *peax = tmp;
-       }
+       err |= __get_user(tmp, &sc->ax);
+       *peax = tmp;
+
        return err;
 
 badframe:
@@ -275,15 +278,16 @@ badframe:
 
 asmlinkage long sys32_sigreturn(struct pt_regs *regs)
 {
-       struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
+       struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8);
        sigset_t set;
-       unsigned int eax;
+       unsigned int ax;
 
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
        if (__get_user(set.sig[0], &frame->sc.oldmask)
            || (_COMPAT_NSIG_WORDS > 1
-               && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
+               && __copy_from_user((((char *) &set.sig) + 4),
+                                   &frame->extramask,
                                    sizeof(frame->extramask))))
                goto badframe;
 
@@ -292,24 +296,24 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
        current->blocked = set;
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
-       
-       if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
+
+       if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
                goto badframe;
-       return eax;
+       return ax;
 
 badframe:
        signal_fault(regs, frame, "32bit sigreturn");
        return 0;
-}      
+}
 
 asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
        sigset_t set;
-       unsigned int eax;
+       unsigned int ax;
        struct pt_regs tregs;
 
-       frame = (struct rt_sigframe __user *)(regs->rsp - 4);
+       frame = (struct rt_sigframe __user *)(regs->sp - 4);
 
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
@@ -321,28 +325,28 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
        current->blocked = set;
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
-       
-       if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+
+       if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
                goto badframe;
 
        tregs = *regs;
        if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
                goto badframe;
 
-       return eax;
+       return ax;
 
 badframe:
-       signal_fault(regs,frame,"32bit rt sigreturn");
+       signal_fault(regs, frame, "32bit rt sigreturn");
        return 0;
-}      
+}
 
 /*
  * Set up a signal frame.
  */
 
-static int
-ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
-                struct pt_regs *regs, unsigned int mask)
+static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
+                                struct _fpstate_ia32 __user *fpstate,
+                                struct pt_regs *regs, unsigned int mask)
 {
        int tmp, err = 0;
 
@@ -356,26 +360,26 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
        __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
        err |= __put_user(tmp, (unsigned int __user *)&sc->es);
 
-       err |= __put_user((u32)regs->rdi, &sc->edi);
-       err |= __put_user((u32)regs->rsi, &sc->esi);
-       err |= __put_user((u32)regs->rbp, &sc->ebp);
-       err |= __put_user((u32)regs->rsp, &sc->esp);
-       err |= __put_user((u32)regs->rbx, &sc->ebx);
-       err |= __put_user((u32)regs->rdx, &sc->edx);
-       err |= __put_user((u32)regs->rcx, &sc->ecx);
-       err |= __put_user((u32)regs->rax, &sc->eax);
+       err |= __put_user((u32)regs->di, &sc->di);
+       err |= __put_user((u32)regs->si, &sc->si);
+       err |= __put_user((u32)regs->bp, &sc->bp);
+       err |= __put_user((u32)regs->sp, &sc->sp);
+       err |= __put_user((u32)regs->bx, &sc->bx);
+       err |= __put_user((u32)regs->dx, &sc->dx);
+       err |= __put_user((u32)regs->cx, &sc->cx);
+       err |= __put_user((u32)regs->ax, &sc->ax);
        err |= __put_user((u32)regs->cs, &sc->cs);
        err |= __put_user((u32)regs->ss, &sc->ss);
        err |= __put_user(current->thread.trap_no, &sc->trapno);
        err |= __put_user(current->thread.error_code, &sc->err);
-       err |= __put_user((u32)regs->rip, &sc->eip);
-       err |= __put_user((u32)regs->eflags, &sc->eflags);
-       err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
+       err |= __put_user((u32)regs->ip, &sc->ip);
+       err |= __put_user((u32)regs->flags, &sc->flags);
+       err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
 
-       tmp = save_i387_ia32(current, fpstate, regs, 0);
+       tmp = save_i387_ia32(fpstate);
        if (tmp < 0)
                err = -EFAULT;
-       else { 
+       else {
                clear_used_math();
                stts();
                err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
@@ -392,40 +396,53 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __
 /*
  * Determine which stack to use..
  */
-static void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
+                                size_t frame_size)
 {
-       unsigned long rsp;
+       unsigned long sp;
 
        /* Default to using normal stack */
-       rsp = regs->rsp;
+       sp = regs->sp;
 
        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(rsp) == 0)
-                       rsp = current->sas_ss_sp + current->sas_ss_size;
+               if (sas_ss_flags(sp) == 0)
+                       sp = current->sas_ss_sp + current->sas_ss_size;
        }
 
        /* This is the legacy signal stack switching. */
        else if ((regs->ss & 0xffff) != __USER_DS &&
                !(ka->sa.sa_flags & SA_RESTORER) &&
-                ka->sa.sa_restorer) {
-               rsp = (unsigned long) ka->sa.sa_restorer;
-       }
+                ka->sa.sa_restorer)
+               sp = (unsigned long) ka->sa.sa_restorer;
 
-       rsp -= frame_size;
+       sp -= frame_size;
        /* Align the stack pointer according to the i386 ABI,
         * i.e. so that on function entry ((sp + 4) & 15) == 0. */
-       rsp = ((rsp + 4) & -16ul) - 4;
-       return (void __user *) rsp;
+       sp = ((sp + 4) & -16ul) - 4;
+       return (void __user *) sp;
 }
 
 int ia32_setup_frame(int sig, struct k_sigaction *ka,
-                    compat_sigset_t *set, struct pt_regs * regs)
+                    compat_sigset_t *set, struct pt_regs *regs)
 {
        struct sigframe __user *frame;
+       void __user *restorer;
        int err = 0;
 
+       /* copy_to_user optimizes that into a single 8 byte store */
+       static const struct {
+               u16 poplmovl;
+               u32 val;
+               u16 int80;
+               u16 pad;
+       } __attribute__((packed)) code = {
+               0xb858,          /* popl %eax ; movl $...,%eax */
+               __NR_ia32_sigreturn,
+               0x80cd,         /* int $0x80 */
+               0,
+       };
+
        frame = get_sigframe(ka, regs, sizeof(*frame));
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
@@ -443,64 +460,53 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
        if (_COMPAT_NSIG_WORDS > 1) {
                err |= __copy_to_user(frame->extramask, &set->sig[1],
                                      sizeof(frame->extramask));
+               if (err)
+                       goto give_sigsegv;
        }
-       if (err)
-               goto give_sigsegv;
 
-       /* Return stub is in 32bit vsyscall page */
-       { 
-               void __user *restorer;
+       if (ka->sa.sa_flags & SA_RESTORER) {
+               restorer = ka->sa.sa_restorer;
+       } else {
+               /* Return stub is in 32bit vsyscall page */
                if (current->binfmt->hasvdso)
-                       restorer = VSYSCALL32_SIGRETURN;
+                       restorer = VDSO32_SYMBOL(current->mm->context.vdso,
+                                                sigreturn);
                else
-                       restorer = (void *)&frame->retcode;
-               if (ka->sa.sa_flags & SA_RESTORER)
-                       restorer = ka->sa.sa_restorer;       
-               err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
-       }
-       /* These are actually not used anymore, but left because some 
-          gdb versions depend on them as a marker. */
-       { 
-               /* copy_to_user optimizes that into a single 8 byte store */
-               static const struct { 
-                       u16 poplmovl;
-                       u32 val;
-                       u16 int80;    
-                       u16 pad; 
-               } __attribute__((packed)) code = { 
-                       0xb858,          /* popl %eax ; movl $...,%eax */
-                       __NR_ia32_sigreturn,   
-                       0x80cd,         /* int $0x80 */
-                       0,
-               }; 
-               err |= __copy_to_user(frame->retcode, &code, 8); 
+                       restorer = &frame->retcode;
        }
+       err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+
+       /*
+        * These are actually not used anymore, but left because some
+        * gdb versions depend on them as a marker.
+        */
+       err |= __copy_to_user(frame->retcode, &code, 8);
        if (err)
                goto give_sigsegv;
 
        /* Set up registers for signal handler */
-       regs->rsp = (unsigned long) frame;
-       regs->rip = (unsigned long) ka->sa.sa_handler;
+       regs->sp = (unsigned long) frame;
+       regs->ip = (unsigned long) ka->sa.sa_handler;
 
        /* Make -mregparm=3 work */
-       regs->rax = sig;
-       regs->rdx = 0;
-       regs->rcx = 0;
+       regs->ax = sig;
+       regs->dx = 0;
+       regs->cx = 0;
 
-       asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 
-       asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 
+       asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+       asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
 
-       regs->cs = __USER32_CS; 
-       regs->ss = __USER32_DS; 
+       regs->cs = __USER32_CS;
+       regs->ss = __USER32_DS;
 
        set_fs(USER_DS);
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~X86_EFLAGS_TF;
        if (test_thread_flag(TIF_SINGLESTEP))
                ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
-       printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
-               current->comm, current->pid, frame, regs->rip, frame->pretcode);
+       printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
+              current->comm, current->pid, frame, regs->ip, frame->pretcode);
 #endif
 
        return 0;
@@ -511,25 +517,34 @@ give_sigsegv:
 }
 
 int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                       compat_sigset_t *set, struct pt_regs * regs)
+                       compat_sigset_t *set, struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
+       struct exec_domain *ed = current_thread_info()->exec_domain;
+       void __user *restorer;
        int err = 0;
 
+       /* __copy_to_user optimizes that into a single 8 byte store */
+       static const struct {
+               u8 movl;
+               u32 val;
+               u16 int80;
+               u16 pad;
+               u8  pad2;
+       } __attribute__((packed)) code = {
+               0xb8,
+               __NR_ia32_rt_sigreturn,
+               0x80cd,
+               0,
+       };
+
        frame = get_sigframe(ka, regs, sizeof(*frame));
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
                goto give_sigsegv;
 
-       {
-               struct exec_domain *ed = current_thread_info()->exec_domain;
-               err |= __put_user((ed
-                          && ed->signal_invmap
-                          && sig < 32
-                          ? ed->signal_invmap[sig]
-                          : sig),
-                         &frame->sig);
-       }
+       err |= __put_user((ed && ed->signal_invmap && sig < 32
+                          ? ed->signal_invmap[sig] : sig), &frame->sig);
        err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
        err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
        err |= copy_siginfo_to_user32(&frame->info, info);
@@ -540,73 +555,58 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        err |= __put_user(0, &frame->uc.uc_flags);
        err |= __put_user(0, &frame->uc.uc_link);
        err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
-       err |= __put_user(sas_ss_flags(regs->rsp),
+       err |= __put_user(sas_ss_flags(regs->sp),
                          &frame->uc.uc_stack.ss_flags);
        err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
        err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
-                               regs, set->sig[0]);
+                                    regs, set->sig[0]);
        err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
        if (err)
                goto give_sigsegv;
 
-       
-       { 
-               void __user *restorer = VSYSCALL32_RTSIGRETURN; 
-               if (ka->sa.sa_flags & SA_RESTORER)
-                       restorer = ka->sa.sa_restorer;       
-               err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
-       }
-
-       /* This is movl $,%eax ; int $0x80 */
-       /* Not actually used anymore, but left because some gdb versions
-          need it. */ 
-       { 
-               /* __copy_to_user optimizes that into a single 8 byte store */
-               static const struct { 
-                       u8 movl; 
-                       u32 val; 
-                       u16 int80; 
-                       u16 pad;
-                       u8  pad2;                               
-               } __attribute__((packed)) code = { 
-                       0xb8,
-                       __NR_ia32_rt_sigreturn,
-                       0x80cd,
-                       0,
-               }; 
-               err |= __copy_to_user(frame->retcode, &code, 8); 
-       } 
+       if (ka->sa.sa_flags & SA_RESTORER)
+               restorer = ka->sa.sa_restorer;
+       else
+               restorer = VDSO32_SYMBOL(current->mm->context.vdso,
+                                        rt_sigreturn);
+       err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+
+       /*
+        * Not actually used anymore, but left because some gdb
+        * versions need it.
+        */
+       err |= __copy_to_user(frame->retcode, &code, 8);
        if (err)
                goto give_sigsegv;
 
        /* Set up registers for signal handler */
-       regs->rsp = (unsigned long) frame;
-       regs->rip = (unsigned long) ka->sa.sa_handler;
+       regs->sp = (unsigned long) frame;
+       regs->ip = (unsigned long) ka->sa.sa_handler;
 
        /* Make -mregparm=3 work */
-       regs->rax = sig;
-       regs->rdx = (unsigned long) &frame->info;
-       regs->rcx = (unsigned long) &frame->uc;
+       regs->ax = sig;
+       regs->dx = (unsigned long) &frame->info;
+       regs->cx = (unsigned long) &frame->uc;
 
+
+       asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+       asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
 
-       asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 
-       asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 
-       
-       regs->cs = __USER32_CS; 
-       regs->ss = __USER32_DS; 
+       regs->cs = __USER32_CS;
+       regs->ss = __USER32_DS;
 
        set_fs(USER_DS);
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~X86_EFLAGS_TF;
        if (test_thread_flag(TIF_SINGLESTEP))
                ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
-       printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
-               current->comm, current->pid, frame, regs->rip, frame->pretcode);
+       printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
+              current->comm, current->pid, frame, regs->ip, frame->pretcode);
 #endif
 
        return 0;
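
Both frame setters now choose the return address the same way: an application-supplied SA_RESTORER handler wins, otherwise the sigreturn stub provided by the vDSO is used, and the inline retcode bytes survive only as a marker for old gdb versions. A hedged sketch of that selection follows; the struct, flag value, and vDSO variable are stand-ins invented for the example, not the kernel's real names.

#include <stdio.h>

/* Illustrative stand-ins only: the real k_sigaction, SA_RESTORER value and
 * vDSO symbol lookup live in the kernel; names here are made up. */
struct demo_sigaction {
	void *sa_restorer;
	unsigned long sa_flags;
};
#define DEMO_SA_RESTORER 0x04000000UL

static void *vdso_rt_sigreturn_stub;   /* would be resolved from the mapped vDSO */

static void *pick_restorer(const struct demo_sigaction *ka, void *frame_retcode)
{
	if (ka->sa_flags & DEMO_SA_RESTORER)
		return ka->sa_restorer;        /* application-supplied trampoline wins */
	if (vdso_rt_sigreturn_stub)
		return vdso_rt_sigreturn_stub; /* otherwise the stub in the vDSO */
	return frame_retcode;                  /* last resort: bytes on the signal stack */
}

int main(void)
{
	char retcode[8];
	struct demo_sigaction ka = { 0, 0 };

	/* No SA_RESTORER and no vDSO mapped: falls back to the on-stack bytes. */
	printf("%d\n", pick_restorer(&ka, retcode) == (void *)retcode);
	return 0;
}
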
index df588f0f76e1c9c3d8027a3baced0089de612832..0db0a6291bbd06d059c654ce58f8e5d387d685c0 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/ia32_unistd.h>   
 #include <asm/thread_info.h>   
 #include <asm/segment.h>
-#include <asm/vsyscall32.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
 
@@ -104,7 +103,7 @@ ENTRY(ia32_sysenter_target)
        pushfq
        CFI_ADJUST_CFA_OFFSET 8
        /*CFI_REL_OFFSET rflags,0*/
-       movl    $VSYSCALL32_SYSEXIT, %r10d
+       movl    8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d
        CFI_REGISTER rip,r10
        pushq   $__USER32_CS
        CFI_ADJUST_CFA_OFFSET 8
@@ -142,6 +141,8 @@ sysenter_do_call:
        andl    $~TS_COMPAT,threadinfo_status(%r10)
        /* clear IF, that popfq doesn't enable interrupts early */
        andl  $~0x200,EFLAGS-R11(%rsp) 
+       movl    RIP-R11(%rsp),%edx              /* User %eip */
+       CFI_REGISTER rip,rdx
        RESTORE_ARGS 1,24,1,1,1,1
        popfq
        CFI_ADJUST_CFA_OFFSET -8
@@ -149,8 +150,6 @@ sysenter_do_call:
        popq    %rcx                            /* User %esp */
        CFI_ADJUST_CFA_OFFSET -8
        CFI_REGISTER rsp,rcx
-       movl    $VSYSCALL32_SYSEXIT,%edx        /* User %eip */
-       CFI_REGISTER rip,rdx
        TRACE_IRQS_ON
        swapgs
        sti             /* sti only takes effect after the next instruction */
@@ -644,8 +643,8 @@ ia32_sys_call_table:
        .quad compat_sys_futex          /* 240 */
        .quad compat_sys_sched_setaffinity
        .quad compat_sys_sched_getaffinity
-       .quad sys32_set_thread_area
-       .quad sys32_get_thread_area
+       .quad sys_set_thread_area
+       .quad sys_get_thread_area
        .quad compat_sys_io_setup       /* 245 */
        .quad sys_io_destroy
        .quad compat_sys_io_getevents
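
The ia32_sys_call_table above is an array of handler addresses indexed by the 32-bit syscall number; entries 243/244 now point at the generic sys_set/get_thread_area. A toy C dispatcher in the same spirit is shown below; the handler names and numbers are invented for the example.

#include <stdio.h>

/* Toy dispatcher: an array of function pointers indexed by call number,
 * mirroring the .quad table in ia32entry.S.  Handlers here are fakes. */
typedef long (*syscall_fn)(long, long, long);

static long fake_getpid(long a, long b, long c)     { (void)a; (void)b; (void)c; return 42; }
static long fake_ni_syscall(long a, long b, long c) { (void)a; (void)b; (void)c; return -38; /* -ENOSYS */ }

static const syscall_fn table[] = {
	[0]  = fake_ni_syscall,
	[20] = fake_getpid,          /* 20 is getpid on i386; used only as an example */
};
#define NR_ENTRIES (sizeof(table) / sizeof(table[0]))

static long dispatch(unsigned int nr, long a, long b, long c)
{
	if (nr >= NR_ENTRIES || !table[nr])
		return -38;              /* -ENOSYS for out-of-range numbers or holes */
	return table[nr](a, b, c);
}

int main(void)
{
	printf("%ld %ld\n", dispatch(20, 0, 0, 0), dispatch(999, 0, 0, 0)); /* 42 -38 */
	return 0;
}
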
index 7b3342e5aab59e514770eb1ab361175e9860cd72..d21991ce606cc54dc4a3c71745049364bda26582 100644 (file)
@@ -9,9 +9,8 @@
 #include <linux/ipc.h>
 #include <linux/compat.h>
 
-asmlinkage long
-sys32_ipc(u32 call, int first, int second, int third,
-               compat_uptr_t ptr, u32 fifth)
+asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
+                         compat_uptr_t ptr, u32 fifth)
 {
        int version;
 
@@ -19,36 +18,35 @@ sys32_ipc(u32 call, int first, int second, int third,
        call &= 0xffff;
 
        switch (call) {
-             case SEMOP:
+       case SEMOP:
                /* struct sembuf is the same on 32 and 64bit :)) */
                return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
-             case SEMTIMEDOP:
+       case SEMTIMEDOP:
                return compat_sys_semtimedop(first, compat_ptr(ptr), second,
                                                compat_ptr(fifth));
-             case SEMGET:
+       case SEMGET:
                return sys_semget(first, second, third);
-             case SEMCTL:
+       case SEMCTL:
                return compat_sys_semctl(first, second, third, compat_ptr(ptr));
 
-             case MSGSND:
+       case MSGSND:
                return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
-             case MSGRCV:
+       case MSGRCV:
                return compat_sys_msgrcv(first, second, fifth, third,
                                         version, compat_ptr(ptr));
-             case MSGGET:
+       case MSGGET:
                return sys_msgget((key_t) first, second);
-             case MSGCTL:
+       case MSGCTL:
                return compat_sys_msgctl(first, second, compat_ptr(ptr));
 
-             case SHMAT:
+       case SHMAT:
                return compat_sys_shmat(first, second, third, version,
                                        compat_ptr(ptr));
-               break;
-             case SHMDT:
+       case SHMDT:
                return sys_shmdt(compat_ptr(ptr));
-             case SHMGET:
+       case SHMGET:
                return sys_shmget(first, (unsigned)second, third);
-             case SHMCTL:
+       case SHMCTL:
                return compat_sys_shmctl(first, second, compat_ptr(ptr));
        }
        return -ENOSYS;
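
sys32_ipc() above demultiplexes the old ipc(2) entry point: the low 16 bits of "call" select the operation and the high bits carry an interface version. A small standalone illustration of that split follows; the operation numbers are the classic SysV ipc values, used here only for the demo.

#include <stdio.h>

/* Classic sys_ipc() encoding: low 16 bits = operation, high 16 = version. */
enum { DEMO_SEMOP = 1, DEMO_MSGSND = 11, DEMO_SHMAT = 21 };

static const char *decode(unsigned int call, int *version)
{
	*version = call >> 16;      /* interface version */
	call &= 0xffff;             /* actual operation */

	switch (call) {
	case DEMO_SEMOP:  return "semop";
	case DEMO_MSGSND: return "msgsnd";
	case DEMO_SHMAT:  return "shmat";
	default:          return "unknown";
	}
}

int main(void)
{
	int ver;
	const char *op = decode((1u << 16) | DEMO_SHMAT, &ver); /* version 1, SHMAT */

	printf("%s (version %d)\n", op, ver);   /* prints: shmat (version 1) */
	return 0;
}
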
diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c
deleted file mode 100644 (file)
index e4b84b4..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- *  linux/arch/x86_64/ia32/mm/mmap.c
- *
- *  flexible mmap layout support
- *
- * Based on the i386 version which was
- *
- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
- */
-
-#include <linux/personality.h>
-#include <linux/mm.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole.
- */
-#define MIN_GAP (128*1024*1024)
-#define MAX_GAP (TASK_SIZE/6*5)
-
-static inline unsigned long mmap_base(struct mm_struct *mm)
-{
-       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-       unsigned long random_factor = 0;
-
-       if (current->flags & PF_RANDOMIZE)
-               random_factor = get_random_int() % (1024*1024);
-
-       if (gap < MIN_GAP)
-               gap = MIN_GAP;
-       else if (gap > MAX_GAP)
-               gap = MAX_GAP;
-
-       return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
-}
-
-/*
- * This function, called very early during the creation of a new
- * process VM image, sets up which VM layout function to use:
- */
-void ia32_pick_mmap_layout(struct mm_struct *mm)
-{
-       /*
-        * Fall back to the standard layout if the personality
-        * bit is set, or if the expected stack growth is unlimited:
-        */
-       if (sysctl_legacy_va_layout ||
-                       (current->personality & ADDR_COMPAT_LAYOUT) ||
-                       current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
-               mm->mmap_base = TASK_UNMAPPED_BASE;
-               mm->get_unmapped_area = arch_get_unmapped_area;
-               mm->unmap_area = arch_unmap_area;
-       } else {
-               mm->mmap_base = mmap_base(mm);
-               mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-               mm->unmap_area = arch_unmap_area_topdown;
-       }
-}
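
The deleted mmap32.c computed the top-down mmap base by clamping the stack rlimit between a roughly 128 MB minimum gap and 5/6 of the task size, then subtracting an optional random offset. A self-contained sketch of that arithmetic follows; the task size, page size, and function names are stand-ins chosen for the demo.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in constants: a 3 GB compat task size and 4 KB pages. */
#define DEMO_TASK_SIZE  0xc0000000UL
#define DEMO_PAGE_SIZE  0x1000UL
#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

#define MIN_GAP (128UL * 1024 * 1024)        /* leave at least ~128 MB below the stack */
#define MAX_GAP (DEMO_TASK_SIZE / 6 * 5)     /* but never more than 5/6 of the task size */

static unsigned long demo_mmap_base(unsigned long stack_rlimit, int randomize)
{
	unsigned long gap = stack_rlimit;
	unsigned long random_factor = 0;

	if (randomize)
		random_factor = (unsigned long)rand() % (1024 * 1024);

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return DEMO_PAGE_ALIGN(DEMO_TASK_SIZE - gap - random_factor);
}

int main(void)
{
	/* 8 MB default stack limit, no randomization: base sits 128 MB below the top. */
	printf("0x%lx\n", demo_mmap_base(8UL * 1024 * 1024, 0));   /* 0xb8000000 */
	return 0;
}
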
diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
deleted file mode 100644 (file)
index 4a233ad..0000000
+++ /dev/null
@@ -1,404 +0,0 @@
-/* 
- * 32bit ptrace for x86-64.
- *
- * Copyright 2001,2002 Andi Kleen, SuSE Labs.
- * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier 
- * copyright.
- * 
- * This allows to access 64bit processes too; but there is no way to see the extended 
- * register contents.
- */ 
-
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/sched.h>
-#include <linux/syscalls.h>
-#include <linux/unistd.h>
-#include <linux/mm.h>
-#include <linux/err.h>
-#include <linux/ptrace.h>
-#include <asm/ptrace.h>
-#include <asm/compat.h>
-#include <asm/uaccess.h>
-#include <asm/user32.h>
-#include <asm/user.h>
-#include <asm/errno.h>
-#include <asm/debugreg.h>
-#include <asm/i387.h>
-#include <asm/fpu32.h>
-#include <asm/ia32.h>
-
-/*
- * Determines which flags the user has access to [1 = access, 0 = no access].
- * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
- * Also masks reserved bits (31-22, 15, 5, 3, 1).
- */
-#define FLAG_MASK 0x54dd5UL
-
-#define R32(l,q) \
-       case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
-
-static int putreg32(struct task_struct *child, unsigned regno, u32 val)
-{
-       int i;
-       __u64 *stack = (__u64 *)task_pt_regs(child);
-
-       switch (regno) {
-       case offsetof(struct user32, regs.fs):
-               if (val && (val & 3) != 3) return -EIO; 
-               child->thread.fsindex = val & 0xffff;
-               break;
-       case offsetof(struct user32, regs.gs):
-               if (val && (val & 3) != 3) return -EIO; 
-               child->thread.gsindex = val & 0xffff;
-               break;
-       case offsetof(struct user32, regs.ds):
-               if (val && (val & 3) != 3) return -EIO; 
-               child->thread.ds = val & 0xffff;
-               break;
-       case offsetof(struct user32, regs.es):
-               child->thread.es = val & 0xffff;
-               break;
-       case offsetof(struct user32, regs.ss): 
-               if ((val & 3) != 3) return -EIO;
-               stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
-               break;
-       case offsetof(struct user32, regs.cs): 
-               if ((val & 3) != 3) return -EIO;
-               stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
-               break;
-
-       R32(ebx, rbx); 
-       R32(ecx, rcx);
-       R32(edx, rdx);
-       R32(edi, rdi);
-       R32(esi, rsi);
-       R32(ebp, rbp);
-       R32(eax, rax);
-       R32(orig_eax, orig_rax);
-       R32(eip, rip);
-       R32(esp, rsp);
-
-       case offsetof(struct user32, regs.eflags): {
-               __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
-               val &= FLAG_MASK;
-               *flags = val | (*flags & ~FLAG_MASK);
-               break;
-       }
-
-       case offsetof(struct user32, u_debugreg[4]): 
-       case offsetof(struct user32, u_debugreg[5]):
-               return -EIO;
-
-       case offsetof(struct user32, u_debugreg[0]):
-               child->thread.debugreg0 = val;
-               break;
-
-       case offsetof(struct user32, u_debugreg[1]):
-               child->thread.debugreg1 = val;
-               break;
-
-       case offsetof(struct user32, u_debugreg[2]):
-               child->thread.debugreg2 = val;
-               break;
-
-       case offsetof(struct user32, u_debugreg[3]):
-               child->thread.debugreg3 = val;
-               break;
-
-       case offsetof(struct user32, u_debugreg[6]):
-               child->thread.debugreg6 = val;
-               break; 
-
-       case offsetof(struct user32, u_debugreg[7]):
-               val &= ~DR_CONTROL_RESERVED;
-               /* See arch/i386/kernel/ptrace.c for an explanation of
-                * this awkward check.*/
-               for(i=0; i<4; i++)
-                       if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
-                              return -EIO;
-               child->thread.debugreg7 = val; 
-               if (val)
-                       set_tsk_thread_flag(child, TIF_DEBUG);
-               else
-                       clear_tsk_thread_flag(child, TIF_DEBUG);
-               break; 
-                   
-       default:
-               if (regno > sizeof(struct user32) || (regno & 3))
-                       return -EIO;
-              
-               /* Other dummy fields in the virtual user structure are ignored */ 
-               break;          
-       }
-       return 0;
-}
-
-#undef R32
-
-#define R32(l,q) \
-       case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
-
-static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
-{
-       __u64 *stack = (__u64 *)task_pt_regs(child);
-
-       switch (regno) {
-       case offsetof(struct user32, regs.fs):
-               *val = child->thread.fsindex;
-               break;
-       case offsetof(struct user32, regs.gs):
-               *val = child->thread.gsindex;
-               break;
-       case offsetof(struct user32, regs.ds):
-               *val = child->thread.ds;
-               break;
-       case offsetof(struct user32, regs.es):
-               *val = child->thread.es;
-               break;
-
-       R32(cs, cs);
-       R32(ss, ss);
-       R32(ebx, rbx); 
-       R32(ecx, rcx);
-       R32(edx, rdx);
-       R32(edi, rdi);
-       R32(esi, rsi);
-       R32(ebp, rbp);
-       R32(eax, rax);
-       R32(orig_eax, orig_rax);
-       R32(eip, rip);
-       R32(eflags, eflags);
-       R32(esp, rsp);
-
-       case offsetof(struct user32, u_debugreg[0]): 
-               *val = child->thread.debugreg0; 
-               break; 
-       case offsetof(struct user32, u_debugreg[1]): 
-               *val = child->thread.debugreg1; 
-               break; 
-       case offsetof(struct user32, u_debugreg[2]): 
-               *val = child->thread.debugreg2; 
-               break; 
-       case offsetof(struct user32, u_debugreg[3]): 
-               *val = child->thread.debugreg3; 
-               break; 
-       case offsetof(struct user32, u_debugreg[6]): 
-               *val = child->thread.debugreg6; 
-               break; 
-       case offsetof(struct user32, u_debugreg[7]): 
-               *val = child->thread.debugreg7; 
-               break; 
-                   
-       default:
-               if (regno > sizeof(struct user32) || (regno & 3))
-                       return -EIO;
-
-               /* Other dummy fields in the virtual user structure are ignored */ 
-               *val = 0;
-               break;          
-       }
-       return 0;
-}
-
-#undef R32
-
-static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
-{
-       int ret;
-       compat_siginfo_t __user *si32 = compat_ptr(data);
-       siginfo_t ssi; 
-       siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
-       if (request == PTRACE_SETSIGINFO) {
-               memset(&ssi, 0, sizeof(siginfo_t));
-               ret = copy_siginfo_from_user32(&ssi, si32);
-               if (ret)
-                       return ret;
-               if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
-                       return -EFAULT;
-       }
-       ret = sys_ptrace(request, pid, addr, (unsigned long)si);
-       if (ret)
-               return ret;
-       if (request == PTRACE_GETSIGINFO) {
-               if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
-                       return -EFAULT;
-               ret = copy_siginfo_to_user32(si32, &ssi);
-       }
-       return ret;
-}
-
-asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
-{
-       struct task_struct *child;
-       struct pt_regs *childregs; 
-       void __user *datap = compat_ptr(data);
-       int ret;
-       __u32 val;
-
-       switch (request) { 
-       case PTRACE_TRACEME:
-       case PTRACE_ATTACH:
-       case PTRACE_KILL:
-       case PTRACE_CONT:
-       case PTRACE_SINGLESTEP:
-       case PTRACE_DETACH:
-       case PTRACE_SYSCALL:
-       case PTRACE_OLDSETOPTIONS:
-       case PTRACE_SETOPTIONS:
-       case PTRACE_SET_THREAD_AREA:
-       case PTRACE_GET_THREAD_AREA:
-               return sys_ptrace(request, pid, addr, data); 
-
-       default:
-               return -EINVAL;
-
-       case PTRACE_PEEKTEXT:
-       case PTRACE_PEEKDATA:
-       case PTRACE_POKEDATA:
-       case PTRACE_POKETEXT:
-       case PTRACE_POKEUSR:       
-       case PTRACE_PEEKUSR:
-       case PTRACE_GETREGS:
-       case PTRACE_SETREGS:
-       case PTRACE_SETFPREGS:
-       case PTRACE_GETFPREGS:
-       case PTRACE_SETFPXREGS:
-       case PTRACE_GETFPXREGS:
-       case PTRACE_GETEVENTMSG:
-               break;
-
-       case PTRACE_SETSIGINFO:
-       case PTRACE_GETSIGINFO:
-               return ptrace32_siginfo(request, pid, addr, data);
-       }
-
-       child = ptrace_get_task_struct(pid);
-       if (IS_ERR(child))
-               return PTR_ERR(child);
-
-       ret = ptrace_check_attach(child, request == PTRACE_KILL);
-       if (ret < 0)
-               goto out;
-
-       childregs = task_pt_regs(child);
-
-       switch (request) {
-       case PTRACE_PEEKDATA:
-       case PTRACE_PEEKTEXT:
-               ret = 0;
-               if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
-                       ret = -EIO;
-               else
-                       ret = put_user(val, (unsigned int __user *)datap); 
-               break; 
-
-       case PTRACE_POKEDATA:
-       case PTRACE_POKETEXT:
-               ret = 0;
-               if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
-                       ret = -EIO; 
-               break;
-
-       case PTRACE_PEEKUSR:
-               ret = getreg32(child, addr, &val);
-               if (ret == 0)
-                       ret = put_user(val, (__u32 __user *)datap);
-               break;
-
-       case PTRACE_POKEUSR:
-               ret = putreg32(child, addr, data);
-               break;
-
-       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-               int i;
-               if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
-                       getreg32(child, i, &val);
-                       ret |= __put_user(val,(u32 __user *)datap);
-                       datap += sizeof(u32);
-               }
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-               int i;
-               if (!access_ok(VERIFY_READ, datap, 16*4)) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0; 
-               for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
-                       ret |= __get_user(tmp, (u32 __user *)datap);
-                       putreg32(child, i, tmp);
-                       datap += sizeof(u32);
-               }
-               break;
-       }
-
-       case PTRACE_GETFPREGS:
-               ret = -EIO; 
-               if (!access_ok(VERIFY_READ, compat_ptr(data), 
-                              sizeof(struct user_i387_struct)))
-                       break;
-               save_i387_ia32(child, datap, childregs, 1);
-               ret = 0; 
-                       break;
-
-       case PTRACE_SETFPREGS:
-               ret = -EIO;
-               if (!access_ok(VERIFY_WRITE, datap, 
-                              sizeof(struct user_i387_struct)))
-                       break;
-               ret = 0;
-               /* don't check EFAULT to be bug-to-bug compatible to i386 */
-               restore_i387_ia32(child, datap, 1);
-               break;
-
-       case PTRACE_GETFPXREGS: { 
-               struct user32_fxsr_struct __user *u = datap;
-               init_fpu(child); 
-               ret = -EIO;
-               if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
-                       break;
-                       ret = -EFAULT;
-               if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
-                       break;
-               ret = __put_user(childregs->cs, &u->fcs);
-               ret |= __put_user(child->thread.ds, &u->fos); 
-               break; 
-       } 
-       case PTRACE_SETFPXREGS: { 
-               struct user32_fxsr_struct __user *u = datap;
-               unlazy_fpu(child);
-               ret = -EIO;
-               if (!access_ok(VERIFY_READ, u, sizeof(*u)))
-                       break;
-               /* no checking to be bug-to-bug compatible with i386. */
-               /* but silence warning */
-               if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
-                       ;
-               set_stopped_child_used_math(child);
-               child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-               ret = 0; 
-               break;
-       }
-
-       case PTRACE_GETEVENTMSG:
-               ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
-               break;
-
-       default:
-               BUG();
-       }
-
- out:
-       put_task_struct(child);
-       return ret;
-}
-
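
The deleted ptrace32.c used an offsetof()-driven macro (R32) to translate a 32-bit user-register offset into the matching slot of the 64-bit pt_regs save area. Below is a standalone illustration of the same technique using invented, much smaller register structures.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Invented miniature layouts standing in for the user32 regs and pt_regs. */
struct regs32 { uint32_t ebx, ecx, eax; };
struct regs64 { uint64_t rbx, rcx, rax; };

/* Same trick as the kernel's R32() macro: one case per register, mapping an
 * offset in the 32-bit layout to an index into the 64-bit save area. */
#define R32(l, q) \
	case offsetof(struct regs32, l): \
		stack[offsetof(struct regs64, q) / 8] = val; break

static int putreg32(uint64_t *stack, unsigned regno, uint32_t val)
{
	switch (regno) {
	R32(ebx, rbx);
	R32(ecx, rcx);
	R32(eax, rax);
	default:
		return -1;           /* unknown or unsupported offset */
	}
	return 0;
}

int main(void)
{
	struct regs64 save = { 0, 0, 0 };

	putreg32((uint64_t *)&save, offsetof(struct regs32, eax), 0x1234);
	printf("rax = 0x%llx\n", (unsigned long long)save.rax);  /* 0x1234 */
	return 0;
}
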
index bee96d6144326fa553019cbbc86998d898f52491..abf71d26fc2ae29446605bcc59a01c6655db3d59 100644 (file)
@@ -1,29 +1,29 @@
 /*
  * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
- *             sys_sparc32 
+ *             sys_sparc32
  *
  * Copyright (C) 2000          VA Linux Co
  * Copyright (C) 2000          Don Dugger <n0ano@valinux.com>
- * Copyright (C) 1999          Arun Sharma <arun.sharma@intel.com>
- * Copyright (C) 1997,1998     Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 1997          David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1999          Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998     Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997          David S. Miller (davem@caip.rutgers.edu)
  * Copyright (C) 2000          Hewlett-Packard Co.
  * Copyright (C) 2000          David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2000,2001,2002        Andi Kleen, SuSE Labs (x86-64 port) 
+ * Copyright (C) 2000,2001,2002        Andi Kleen, SuSE Labs (x86-64 port)
  *
  * These routines maintain argument size conversion between 32bit and 64bit
- * environment. In 2.5 most of this should be moved to a generic directory. 
+ * environment. In 2.5 most of this should be moved to a generic directory.
  *
  * This file assumes that there is a hole at the end of user address space.
- * 
- * Some of the functions are LE specific currently. These are hopefully all marked.
- * This should be fixed.
+ *
+ * Some of the functions are LE specific currently. These are
+ * hopefully all marked.  This should be fixed.
  */
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/fs.h> 
-#include <linux/file.h> 
+#include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/signal.h>
 #include <linux/syscalls.h>
 #include <linux/resource.h>
@@ -90,43 +90,44 @@ int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
        if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
                return -EOVERFLOW;
        if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
-           __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
-           __put_user (ino, &ubuf->st_ino) ||
-           __put_user (kbuf->mode, &ubuf->st_mode) ||
-           __put_user (kbuf->nlink, &ubuf->st_nlink) ||
-           __put_user (uid, &ubuf->st_uid) ||
-           __put_user (gid, &ubuf->st_gid) ||
-           __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
-           __put_user (kbuf->size, &ubuf->st_size) ||
-           __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
-           __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
-           __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
-           __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
-           __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
-           __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
-           __put_user (kbuf->blksize, &ubuf->st_blksize) ||
-           __put_user (kbuf->blocks, &ubuf->st_blocks))
+           __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
+           __put_user(ino, &ubuf->st_ino) ||
+           __put_user(kbuf->mode, &ubuf->st_mode) ||
+           __put_user(kbuf->nlink, &ubuf->st_nlink) ||
+           __put_user(uid, &ubuf->st_uid) ||
+           __put_user(gid, &ubuf->st_gid) ||
+           __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
+           __put_user(kbuf->size, &ubuf->st_size) ||
+           __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
+           __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+           __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
+           __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+           __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
+           __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+           __put_user(kbuf->blksize, &ubuf->st_blksize) ||
+           __put_user(kbuf->blocks, &ubuf->st_blocks))
                return -EFAULT;
        return 0;
 }
 
-asmlinkage long
-sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
+asmlinkage long sys32_truncate64(char __user *filename,
+                                unsigned long offset_low,
+                                unsigned long offset_high)
 {
        return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
 }
 
-asmlinkage long
-sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
+asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low,
+                                 unsigned long offset_high)
 {
        return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
 }
 
-/* Another set for IA32/LFS -- x86_64 struct stat is different due to 
-   support for 64bit inode numbers. */
-
-static int
-cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+/*
+ * Another set for IA32/LFS -- x86_64 struct stat is different due to
+ * support for 64bit inode numbers.
+ */
+static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 {
        typeof(ubuf->st_uid) uid = 0;
        typeof(ubuf->st_gid) gid = 0;
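
sys32_truncate64/ftruncate64 above (and pread/pwrite, readahead, and fadvise64_64 later in this file) all rebuild a 64-bit offset from the two 32-bit halves that the 32-bit ABI passes in separate registers. The pattern, as a tiny standalone check:

#include <stdio.h>
#include <stdint.h>

/* 32-bit syscalls pass large offsets as (low, high) register pairs;
 * the 64-bit kernel glues them back together like this. */
static int64_t join_offset(uint32_t low, uint32_t high)
{
	return ((int64_t)high << 32) | low;
}

int main(void)
{
	/* high = 0x1, low = 0x200 -> 4 GiB + 512 bytes */
	printf("%lld\n", (long long)join_offset(0x200, 0x1));  /* 4294967808 */
	return 0;
}
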
@@ -134,38 +135,39 @@ cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
        SET_GID(gid, stat->gid);
        if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
            __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
-           __put_user (stat->ino, &ubuf->__st_ino) ||
-           __put_user (stat->ino, &ubuf->st_ino) ||
-           __put_user (stat->mode, &ubuf->st_mode) ||
-           __put_user (stat->nlink, &ubuf->st_nlink) ||
-           __put_user (uid, &ubuf->st_uid) ||
-           __put_user (gid, &ubuf->st_gid) ||
-           __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
-           __put_user (stat->size, &ubuf->st_size) ||
-           __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
-           __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
-           __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
-           __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
-           __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
-           __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
-           __put_user (stat->blksize, &ubuf->st_blksize) ||
-           __put_user (stat->blocks, &ubuf->st_blocks))
+           __put_user(stat->ino, &ubuf->__st_ino) ||
+           __put_user(stat->ino, &ubuf->st_ino) ||
+           __put_user(stat->mode, &ubuf->st_mode) ||
+           __put_user(stat->nlink, &ubuf->st_nlink) ||
+           __put_user(uid, &ubuf->st_uid) ||
+           __put_user(gid, &ubuf->st_gid) ||
+           __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
+           __put_user(stat->size, &ubuf->st_size) ||
+           __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
+           __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+           __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
+           __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+           __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
+           __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+           __put_user(stat->blksize, &ubuf->st_blksize) ||
+           __put_user(stat->blocks, &ubuf->st_blocks))
                return -EFAULT;
        return 0;
 }
 
-asmlinkage long
-sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
+asmlinkage long sys32_stat64(char __user *filename,
+                            struct stat64 __user *statbuf)
 {
        struct kstat stat;
        int ret = vfs_stat(filename, &stat);
+
        if (!ret)
                ret = cp_stat64(statbuf, &stat);
        return ret;
 }
 
-asmlinkage long
-sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
+asmlinkage long sys32_lstat64(char __user *filename,
+                             struct stat64 __user *statbuf)
 {
        struct kstat stat;
        int ret = vfs_lstat(filename, &stat);
@@ -174,8 +176,7 @@ sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
        return ret;
 }
 
-asmlinkage long
-sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
+asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
 {
        struct kstat stat;
        int ret = vfs_fstat(fd, &stat);
@@ -184,9 +185,8 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
        return ret;
 }
 
-asmlinkage long
-sys32_fstatat(unsigned int dfd, char __user *filename,
-             struct stat64 __user* statbuf, int flag)
+asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
+                             struct stat64 __user *statbuf, int flag)
 {
        struct kstat stat;
        int error = -EINVAL;
@@ -221,8 +221,7 @@ struct mmap_arg_struct {
        unsigned int offset;
 };
 
-asmlinkage long
-sys32_mmap(struct mmap_arg_struct __user *arg)
+asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
 {
        struct mmap_arg_struct a;
        struct file *file = NULL;
@@ -233,33 +232,33 @@ sys32_mmap(struct mmap_arg_struct __user *arg)
                return -EFAULT;
 
        if (a.offset & ~PAGE_MASK)
-               return -EINVAL; 
+               return -EINVAL;
 
        if (!(a.flags & MAP_ANONYMOUS)) {
                file = fget(a.fd);
                if (!file)
                        return -EBADF;
        }
-       
-       mm = current->mm; 
-       down_write(&mm->mmap_sem); 
-       retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
+
+       mm = current->mm;
+       down_write(&mm->mmap_sem);
+       retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
+                              a.offset>>PAGE_SHIFT);
        if (file)
                fput(file);
 
-       up_write(&mm->mmap_sem); 
+       up_write(&mm->mmap_sem);
 
        return retval;
 }
 
-asmlinkage long 
-sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
+asmlinkage long sys32_mprotect(unsigned long start, size_t len,
+                              unsigned long prot)
 {
-       return sys_mprotect(start,len,prot); 
+       return sys_mprotect(start, len, prot);
 }
 
-asmlinkage long
-sys32_pipe(int __user *fd)
+asmlinkage long sys32_pipe(int __user *fd)
 {
        int retval;
        int fds[2];
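
sys32_mmap above rejects byte offsets that are not page aligned and then hands do_mmap_pgoff() a page index (offset >> PAGE_SHIFT). The check and the conversion, in a small userspace sketch that assumes 4 KB pages:

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12                        /* assumed 4 KB pages */
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_MASK  (~(DEMO_PAGE_SIZE - 1))

/* Returns -1 (EINVAL in the kernel) for unaligned offsets, otherwise the
 * page index that do_mmap_pgoff() would receive. */
static long offset_to_pgoff(unsigned long offset)
{
	if (offset & ~DEMO_PAGE_MASK)
		return -1;                        /* not page aligned */
	return (long)(offset >> DEMO_PAGE_SHIFT);
}

int main(void)
{
	printf("%ld %ld\n", offset_to_pgoff(8192), offset_to_pgoff(8193)); /* 2 -1 */
	return 0;
}
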
@@ -269,13 +268,13 @@ sys32_pipe(int __user *fd)
                goto out;
        if (copy_to_user(fd, fds, sizeof(fds)))
                retval = -EFAULT;
-  out:
+out:
        return retval;
 }
 
-asmlinkage long
-sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
-                  struct sigaction32 __user *oact,  unsigned int sigsetsize)
+asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
+                                  struct sigaction32 __user *oact,
+                                  unsigned int sigsetsize)
 {
        struct k_sigaction new_ka, old_ka;
        int ret;
@@ -291,12 +290,17 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
                if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
                    __get_user(handler, &act->sa_handler) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
-                   __get_user(restorer, &act->sa_restorer)||
-                   __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
+                   __get_user(restorer, &act->sa_restorer) ||
+                   __copy_from_user(&set32, &act->sa_mask,
+                                    sizeof(compat_sigset_t)))
                        return -EFAULT;
                new_ka.sa.sa_handler = compat_ptr(handler);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
-               /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+
+               /*
+                * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
+                * _NSIG_WORDS << 1
+                */
                switch (_NSIG_WORDS) {
                case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
                                | (((long)set32.sig[7]) << 32);
@@ -312,7 +316,10 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
        if (!ret && oact) {
-               /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+               /*
+                * FIXME: here we rely on _COMPAT_NSIG_WORDS to be >=
+                * _NSIG_WORDS << 1
+                */
                switch (_NSIG_WORDS) {
                case 4:
                        set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
@@ -328,23 +335,26 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
                        set32.sig[0] = old_ka.sa.sa_mask.sig[0];
                }
                if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
-                   __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
-                   __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+                   __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+                              &oact->sa_handler) ||
+                   __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+                              &oact->sa_restorer) ||
                    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
-                   __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
+                   __copy_to_user(&oact->sa_mask, &set32,
+                                  sizeof(compat_sigset_t)))
                        return -EFAULT;
        }
 
        return ret;
 }
 
-asmlinkage long
-sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
+asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,
+                               struct old_sigaction32 __user *oact)
 {
-        struct k_sigaction new_ka, old_ka;
-        int ret;
+       struct k_sigaction new_ka, old_ka;
+       int ret;
 
-        if (act) {
+       if (act) {
                compat_old_sigset_t mask;
                compat_uptr_t handler, restorer;
 
@@ -359,33 +369,35 @@ sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigacti
                new_ka.sa.sa_restorer = compat_ptr(restorer);
 
                siginitset(&new_ka.sa.sa_mask, mask);
-        }
+       }
 
-        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+       ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
        if (!ret && oact) {
                if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
-                   __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
-                   __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+                   __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+                              &oact->sa_handler) ||
+                   __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+                              &oact->sa_restorer) ||
                    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
                    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                        return -EFAULT;
-        }
+       }
 
        return ret;
 }
 
-asmlinkage long
-sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
-                       compat_sigset_t __user *oset, unsigned int sigsetsize)
+asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
+                                    compat_sigset_t __user *oset,
+                                    unsigned int sigsetsize)
 {
        sigset_t s;
        compat_sigset_t s32;
        int ret;
        mm_segment_t old_fs = get_fs();
-       
+
        if (set) {
-               if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
+               if (copy_from_user(&s32, set, sizeof(compat_sigset_t)))
                        return -EFAULT;
                switch (_NSIG_WORDS) {
                case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
@@ -394,13 +406,14 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
                case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
                }
        }
-       set_fs (KERNEL_DS);
+       set_fs(KERNEL_DS);
        ret = sys_rt_sigprocmask(how,
                                 set ? (sigset_t __user *)&s : NULL,
                                 oset ? (sigset_t __user *)&s : NULL,
-                                sigsetsize); 
-       set_fs (old_fs);
-       if (ret) return ret;
+                                sigsetsize);
+       set_fs(old_fs);
+       if (ret)
+               return ret;
        if (oset) {
                switch (_NSIG_WORDS) {
                case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -408,52 +421,49 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
                case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
                case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
                }
-               if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
+               if (copy_to_user(oset, &s32, sizeof(compat_sigset_t)))
                        return -EFAULT;
        }
        return 0;
 }
 
-static inline long
-get_tv32(struct timeval *o, struct compat_timeval __user *i)
+static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
 {
-       int err = -EFAULT; 
-       if (access_ok(VERIFY_READ, i, sizeof(*i))) { 
+       int err = -EFAULT;
+
+       if (access_ok(VERIFY_READ, i, sizeof(*i))) {
                err = __get_user(o->tv_sec, &i->tv_sec);
                err |= __get_user(o->tv_usec, &i->tv_usec);
        }
-       return err; 
+       return err;
 }
 
-static inline long
-put_tv32(struct compat_timeval __user *o, struct timeval *i)
+static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
 {
        int err = -EFAULT;
-       if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { 
+
+       if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
                err = __put_user(i->tv_sec, &o->tv_sec);
                err |= __put_user(i->tv_usec, &o->tv_usec);
-       } 
-       return err; 
+       }
+       return err;
 }
 
-extern unsigned int alarm_setitimer(unsigned int seconds);
-
-asmlinkage long
-sys32_alarm(unsigned int seconds)
+asmlinkage long sys32_alarm(unsigned int seconds)
 {
        return alarm_setitimer(seconds);
 }
 
-/* Translations due to time_t size differences.  Which affects all
-   sorts of things, like timeval and itimerval.  */
-
-extern struct timezone sys_tz;
-
-asmlinkage long
-sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+/*
+ * Translations due to time_t size differences, which affect all
+ * sorts of things, like timeval and itimerval.
+ */
+asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
+                                  struct timezone __user *tz)
 {
        if (tv) {
                struct timeval ktv;
+
                do_gettimeofday(&ktv);
                if (put_tv32(tv, &ktv))
                        return -EFAULT;
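
The _NSIG_WORDS switches in the sigmask code above convert between the native sigset (64-bit words) and the compat layout (twice as many 32-bit words) by splitting and recombining each word. A standalone round trip of that packing for a single 64-bit word:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* One native 64-bit sigset word maps onto two compat 32-bit words. */
static void split_word(uint64_t w, uint32_t out[2])
{
	out[0] = (uint32_t)w;          /* low half  -> sig[2n]   */
	out[1] = (uint32_t)(w >> 32);  /* high half -> sig[2n+1] */
}

static uint64_t join_word(const uint32_t in[2])
{
	return (uint64_t)in[0] | ((uint64_t)in[1] << 32);
}

int main(void)
{
	uint64_t mask = 0xdeadbeef00c0ffeeULL;
	uint32_t halves[2];

	split_word(mask, halves);
	assert(join_word(halves) == mask);   /* the round trip is lossless */
	printf("%08x %08x\n", (unsigned)halves[1], (unsigned)halves[0]);
	return 0;
}
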
@@ -465,14 +475,14 @@ sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
        return 0;
 }
 
-asmlinkage long
-sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
+                                  struct timezone __user *tz)
 {
        struct timeval ktv;
        struct timespec kts;
        struct timezone ktz;
 
-       if (tv) {
+       if (tv) {
                if (get_tv32(&ktv, tv))
                        return -EFAULT;
                kts.tv_sec = ktv.tv_sec;
@@ -494,8 +504,7 @@ struct sel_arg_struct {
        unsigned int tvp;
 };
 
-asmlinkage long
-sys32_old_select(struct sel_arg_struct __user *arg)
+asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
 {
        struct sel_arg_struct a;
 
@@ -505,50 +514,45 @@ sys32_old_select(struct sel_arg_struct __user *arg)
                                 compat_ptr(a.exp), compat_ptr(a.tvp));
 }
 
-extern asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
-                struct compat_rusage *ru);
-
-asmlinkage long
-sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
+asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
+                             int options)
 {
        return compat_sys_wait4(pid, stat_addr, options, NULL);
 }
 
 /* 32-bit timeval and related flotsam.  */
 
-asmlinkage long
-sys32_sysfs(int option, u32 arg1, u32 arg2)
+asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
 {
        return sys_sysfs(option, arg1, arg2);
 }
 
-asmlinkage long
-sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
+asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
+                                   struct compat_timespec __user *interval)
 {
        struct timespec t;
        int ret;
-       mm_segment_t old_fs = get_fs ();
-       
-       set_fs (KERNEL_DS);
+       mm_segment_t old_fs = get_fs();
+
+       set_fs(KERNEL_DS);
        ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
-       set_fs (old_fs);
+       set_fs(old_fs);
        if (put_compat_timespec(&t, interval))
                return -EFAULT;
        return ret;
 }
 
-asmlinkage long
-sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
+asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set,
+                                   compat_size_t sigsetsize)
 {
        sigset_t s;
        compat_sigset_t s32;
        int ret;
        mm_segment_t old_fs = get_fs();
-               
-       set_fs (KERNEL_DS);
+
+       set_fs(KERNEL_DS);
        ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
-       set_fs (old_fs);
+       set_fs(old_fs);
        if (!ret) {
                switch (_NSIG_WORDS) {
                case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
@@ -556,30 +560,29 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
                case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
                case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
                }
-               if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
+               if (copy_to_user(set, &s32, sizeof(compat_sigset_t)))
                        return -EFAULT;
        }
        return ret;
 }
 
-asmlinkage long
-sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
+asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
+                                     compat_siginfo_t __user *uinfo)
 {
        siginfo_t info;
        int ret;
        mm_segment_t old_fs = get_fs();
-       
+
        if (copy_siginfo_from_user32(&info, uinfo))
                return -EFAULT;
-       set_fs (KERNEL_DS);
+       set_fs(KERNEL_DS);
        ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
-       set_fs (old_fs);
+       set_fs(old_fs);
        return ret;
 }
 
 /* These are here just in case some old ia32 binary calls it. */
-asmlinkage long
-sys32_pause(void)
+asmlinkage long sys32_pause(void)
 {
        current->state = TASK_INTERRUPTIBLE;
        schedule();
@@ -599,25 +602,25 @@ struct sysctl_ia32 {
 };
 
 
-asmlinkage long
-sys32_sysctl(struct sysctl_ia32 __user *args32)
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
 {
        struct sysctl_ia32 a32;
-       mm_segment_t old_fs = get_fs ();
+       mm_segment_t old_fs = get_fs();
        void __user *oldvalp, *newvalp;
        size_t oldlen;
        int __user *namep;
        long ret;
 
-       if (copy_from_user(&a32, args32, sizeof (a32)))
+       if (copy_from_user(&a32, args32, sizeof(a32)))
                return -EFAULT;
 
        /*
-        * We need to pre-validate these because we have to disable address checking
-        * before calling do_sysctl() because of OLDLEN but we can't run the risk of the
-        * user specifying bad addresses here.  Well, since we're dealing with 32 bit
-        * addresses, we KNOW that access_ok() will always succeed, so this is an
-        * expensive NOP, but so what...
+        * We need to pre-validate these because we have to disable
+        * address checking before calling do_sysctl() because of
+        * OLDLEN but we can't run the risk of the user specifying bad
+        * addresses here.  Well, since we're dealing with 32 bit
+        * addresses, we KNOW that access_ok() will always succeed, so
+        * this is an expensive NOP, but so what...
         */
        namep = compat_ptr(a32.name);
        oldvalp = compat_ptr(a32.oldval);
@@ -636,34 +639,34 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
        unlock_kernel();
        set_fs(old_fs);
 
-       if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+       if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
                return -EFAULT;
 
        return ret;
 }
 #endif
 
-/* warning: next two assume little endian */ 
-asmlinkage long
-sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+/* warning: next two assume little endian */
+asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
+                           u32 poslo, u32 poshi)
 {
        return sys_pread64(fd, ubuf, count,
                         ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
-asmlinkage long
-sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count,
+                            u32 poslo, u32 poshi)
 {
        return sys_pwrite64(fd, ubuf, count,
                          ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
 
-asmlinkage long
-sys32_personality(unsigned long personality)
+asmlinkage long sys32_personality(unsigned long personality)
 {
        int ret;
-       if (personality(current->personality) == PER_LINUX32 && 
+
+       if (personality(current->personality) == PER_LINUX32 &&
                personality == PER_LINUX)
                personality = PER_LINUX32;
        ret = sys_personality(personality);
@@ -672,34 +675,33 @@ sys32_personality(unsigned long personality)
        return ret;
 }
 
-asmlinkage long
-sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
+asmlinkage long sys32_sendfile(int out_fd, int in_fd,
+                              compat_off_t __user *offset, s32 count)
 {
        mm_segment_t old_fs = get_fs();
        int ret;
        off_t of;
-       
+
        if (offset && get_user(of, offset))
                return -EFAULT;
-               
+
        set_fs(KERNEL_DS);
        ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
                           count);
        set_fs(old_fs);
-       
+
        if (offset && put_user(of, offset))
                return -EFAULT;
-               
        return ret;
 }
 
 asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
-       unsigned long prot, unsigned long flags,
-       unsigned long fd, unsigned long pgoff)
+                           unsigned long prot, unsigned long flags,
+                           unsigned long fd, unsigned long pgoff)
 {
        struct mm_struct *mm = current->mm;
        unsigned long error;
-       struct file * file = NULL;
+       struct file *file = NULL;
 
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
        if (!(flags & MAP_ANONYMOUS)) {
@@ -717,36 +719,35 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
        return error;
 }
 
-asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
+asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
 {
+       char *arch = "x86_64";
        int err;
 
        if (!name)
                return -EFAULT;
        if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
                return -EFAULT;
-  
-       down_read(&uts_sem);
-
-       err = __copy_to_user(&name->sysname,&utsname()->sysname,
-                               __OLD_UTS_LEN);
-       err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
-       err |= __copy_to_user(&name->nodename,&utsname()->nodename,
-                               __OLD_UTS_LEN);
-       err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
-       err |= __copy_to_user(&name->release,&utsname()->release,
-                               __OLD_UTS_LEN);
-       err |= __put_user(0,name->release+__OLD_UTS_LEN);
-       err |= __copy_to_user(&name->version,&utsname()->version,
-                               __OLD_UTS_LEN);
-       err |= __put_user(0,name->version+__OLD_UTS_LEN);
-       {
-               char *arch = "x86_64";
-               if (personality(current->personality) == PER_LINUX32)
-                       arch = "i686";
-                
-               err |= __copy_to_user(&name->machine, arch, strlen(arch)+1);
-       }
+
+       down_read(&uts_sem);
+
+       err = __copy_to_user(&name->sysname, &utsname()->sysname,
+                            __OLD_UTS_LEN);
+       err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
+       err |= __copy_to_user(&name->nodename, &utsname()->nodename,
+                             __OLD_UTS_LEN);
+       err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
+       err |= __copy_to_user(&name->release, &utsname()->release,
+                             __OLD_UTS_LEN);
+       err |= __put_user(0, name->release+__OLD_UTS_LEN);
+       err |= __copy_to_user(&name->version, &utsname()->version,
+                             __OLD_UTS_LEN);
+       err |= __put_user(0, name->version+__OLD_UTS_LEN);
+
+       if (personality(current->personality) == PER_LINUX32)
+               arch = "i686";
+
+       err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
 
        up_read(&uts_sem);
 
@@ -755,17 +756,19 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
        return err;
 }
 
-long sys32_uname(struct old_utsname __user * name)
+long sys32_uname(struct old_utsname __user *name)
 {
        int err;
+
        if (!name)
                return -EFAULT;
        down_read(&uts_sem);
-       err = copy_to_user(name, utsname(), sizeof (*name));
+       err = copy_to_user(name, utsname(), sizeof(*name));
        up_read(&uts_sem);
-       if (personality(current->personality) == PER_LINUX32) 
+       if (personality(current->personality) == PER_LINUX32)
                err |= copy_to_user(&name->machine, "i686", 5);
-       return err?-EFAULT:0;
+
+       return err ? -EFAULT : 0;
 }
 
 long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
@@ -773,27 +776,28 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
        struct ustat u;
        mm_segment_t seg;
        int ret;
-       
-       seg = get_fs(); 
-       set_fs(KERNEL_DS); 
+
+       seg = get_fs();
+       set_fs(KERNEL_DS);
        ret = sys_ustat(dev, (struct ustat __user *)&u);
        set_fs(seg);
-       if (ret >= 0) { 
-               if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || 
-                   __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
-                   __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
-                   __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
-                   __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
-                       ret = -EFAULT;
-       }
+       if (ret < 0)
+               return ret;
+
+       if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) ||
+           __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
+           __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
+           __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
+           __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
+               ret = -EFAULT;
        return ret;
-} 
+}
 
 asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
                             compat_uptr_t __user *envp, struct pt_regs *regs)
 {
        long error;
-       char * filename;
+       char *filename;
 
        filename = getname(name);
        error = PTR_ERR(filename);
@@ -812,18 +816,19 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
 asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
                            struct pt_regs *regs)
 {
-       void __user *parent_tid = (void __user *)regs->rdx;
-       void __user *child_tid = (void __user *)regs->rdi;
+       void __user *parent_tid = (void __user *)regs->dx;
+       void __user *child_tid = (void __user *)regs->di;
+
        if (!newsp)
-               newsp = regs->rsp;
-        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+               newsp = regs->sp;
+       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
 }
 
 /*
- * Some system calls that need sign extended arguments. This could be done by a generic wrapper.
- *
-
-long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
+ * Some system calls that need sign extended arguments. This could be
+ * done by a generic wrapper.
+ */
+long sys32_lseek(unsigned int fd, int offset, unsigned int whence)
 {
        return sys_lseek(fd, offset, whence);
 }
@@ -832,49 +837,52 @@ long sys32_kill(int pid, int sig)
 {
        return sys_kill(pid, sig);
 }
-long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, 
+
+long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
                        __u32 len_low, __u32 len_high, int advice)
-{ 
+{
        return sys_fadvise64_64(fd,
                               (((u64)offset_high)<<32) | offset_low,
                               (((u64)len_high)<<32) | len_low,
-                              advice); 
-} 
+                               advice);
+}
 
 long sys32_vm86_warning(void)
-{ 
+{
        struct task_struct *me = current;
        static char lastcomm[sizeof(me->comm)];
+
        if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-               compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
-                      me->comm);
+               compat_printk(KERN_INFO
+                             "%s: vm86 mode not supported on 64 bit kernel\n",
+                             me->comm);
                strncpy(lastcomm, me->comm, sizeof(lastcomm));
-       } 
+       }
        return -ENOSYS;
-} 
+}
 
 long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
-                         char __user * buf, size_t len)
+                         char __user *buf, size_t len)
 {
        return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
 }
 
-asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count)
+asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
+                                  size_t count)
 {
        return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
 }
 
 asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
-                          unsigned n_low, unsigned n_hi,  int flags)
+                                     unsigned n_low, unsigned n_hi,  int flags)
 {
        return sys_sync_file_range(fd,
                                   ((u64)off_hi << 32) | off_low,
                                   ((u64)n_hi << 32) | n_low, flags);
 }
 
-asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len,
-                    int advice)
+asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
+                               size_t len, int advice)
 {
        return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
                                len, advice);
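
For illustration, a minimal standalone C sketch (not part of the patch; the typedefs, the helper name join_halves and the sample values are hypothetical) of the high/low reassembly pattern that sys32_fadvise64_64, sys32_lookup_dcookie, sys32_readahead, sys32_sync_file_range and sys32_fadvise64 above all share: two 32-bit halves are rejoined into one 64-bit value before the native syscall is called.

	/* Hypothetical standalone example; u32/u64 stand in for the kernel's types. */
	#include <stdio.h>

	typedef unsigned int u32;
	typedef unsigned long long u64;

	static u64 join_halves(u32 hi, u32 lo)
	{
		/* same expression the compat wrappers use: ((u64)hi << 32) | lo */
		return ((u64)hi << 32) | lo;
	}

	int main(void)
	{
		u32 off_hi = 0x1, off_lo = 0x80000000u;	/* a 6 GiB file offset */

		printf("offset = %#llx\n", join_halves(off_hi, off_lo));
		return 0;
	}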
diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c
deleted file mode 100644 (file)
index 15013ba..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
-
-/* vsyscall handling for 32bit processes. Map a stub page into it 
-   on demand because 32bit cannot reach the kernel's fixmaps */
-
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/security.h>
-#include <asm/proto.h>
-#include <asm/tlbflush.h>
-#include <asm/ia32_unistd.h>
-#include <asm/vsyscall32.h>
-
-extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
-extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
-extern int sysctl_vsyscall32;
-
-static struct page *syscall32_pages[1];
-static int use_sysenter = -1;
-
-struct linux_binprm;
-
-/* Setup a VMA at program startup for the vsyscall page */
-int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
-{
-       struct mm_struct *mm = current->mm;
-       int ret;
-
-       down_write(&mm->mmap_sem);
-       /*
-        * MAYWRITE to allow gdb to COW and set breakpoints
-        *
-        * Make sure the vDSO gets into every core dump.
-        * Dumping its contents makes post-mortem fully interpretable later
-        * without matching up the same kernel and hardware config to see
-        * what PC values meant.
-        */
-       /* Could randomize here */
-       ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE,
-                                     VM_READ|VM_EXEC|
-                                     VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-                                     VM_ALWAYSDUMP,
-                                     syscall32_pages);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-
-static int __init init_syscall32(void)
-{ 
-       char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
-       if (!syscall32_page) 
-               panic("Cannot allocate syscall32 page"); 
-       syscall32_pages[0] = virt_to_page(syscall32_page);
-       if (use_sysenter > 0) {
-               memcpy(syscall32_page, syscall32_sysenter,
-                      syscall32_sysenter_end - syscall32_sysenter);
-       } else {
-               memcpy(syscall32_page, syscall32_syscall,
-                      syscall32_syscall_end - syscall32_syscall);
-       }       
-       return 0;
-} 
-       
-__initcall(init_syscall32); 
-
-/* May not be __init: called during resume */
-void syscall32_cpu_init(void)
-{
-       if (use_sysenter < 0)
-               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
-
-       /* Load these always in case some future AMD CPU supports
-          SYSENTER from compat mode too. */
-       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
-       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
-       wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S
deleted file mode 100644 (file)
index 933f0f0..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-/* 32bit VDSOs mapped into user space. */
-
-       .section ".init.data","aw"
-
-       .globl syscall32_syscall
-       .globl syscall32_syscall_end
-
-syscall32_syscall:
-       .incbin "arch/x86/ia32/vsyscall-syscall.so"
-syscall32_syscall_end:
-
-       .globl syscall32_sysenter
-       .globl syscall32_sysenter_end
-
-syscall32_sysenter:
-       .incbin "arch/x86/ia32/vsyscall-sysenter.so"
-syscall32_sysenter_end:
diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c
deleted file mode 100644 (file)
index 1cc4340..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/user.h>
-
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/processor.h>
-#include <asm/proto.h>
-
-/*
- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
- */
-static int get_free_idx(void)
-{
-       struct thread_struct *t = &current->thread;
-       int idx;
-
-       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
-               if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
-                       return idx + GDT_ENTRY_TLS_MIN;
-       return -ESRCH;
-}
-
-/*
- * Set a given TLS descriptor:
- * When you want addresses > 32bit use arch_prctl() 
- */
-int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
-{
-       struct user_desc info;
-       struct n_desc_struct *desc;
-       int cpu, idx;
-
-       if (copy_from_user(&info, u_info, sizeof(info)))
-               return -EFAULT;
-
-       idx = info.entry_number;
-
-       /*
-        * index -1 means the kernel should try to find and
-        * allocate an empty descriptor:
-        */
-       if (idx == -1) {
-               idx = get_free_idx();
-               if (idx < 0)
-                       return idx;
-               if (put_user(idx, &u_info->entry_number))
-                       return -EFAULT;
-       }
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
-
-       /*
-        * We must not get preempted while modifying the TLS.
-        */
-       cpu = get_cpu();
-
-       if (LDT_empty(&info)) {
-               desc->a = 0;
-               desc->b = 0;
-       } else {
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-       if (t == &current->thread)
-               load_TLS(t, cpu);
-
-       put_cpu();
-       return 0;
-}
-
-asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
-{ 
-       return do_set_thread_area(&current->thread, u_info); 
-} 
-
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-       (((desc)->a >> 16) & 0x0000ffff) | \
-       (((desc)->b << 16) & 0x00ff0000) | \
-       ( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-       ((desc)->a & 0x0ffff) | \
-        ((desc)->b & 0xf0000) )
-       
-#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
-#define GET_LONGMODE(desc)     (((desc)->b >> 21) & 1)
-
-int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
-{
-       struct user_desc info;
-       struct n_desc_struct *desc;
-       int idx;
-
-       if (get_user(idx, &u_info->entry_number))
-               return -EFAULT;
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
-
-       memset(&info, 0, sizeof(struct user_desc));
-       info.entry_number = idx;
-       info.base_addr = GET_BASE(desc);
-       info.limit = GET_LIMIT(desc);
-       info.seg_32bit = GET_32BIT(desc);
-       info.contents = GET_CONTENTS(desc);
-       info.read_exec_only = !GET_WRITABLE(desc);
-       info.limit_in_pages = GET_LIMIT_PAGES(desc);
-       info.seg_not_present = !GET_PRESENT(desc);
-       info.useable = GET_USEABLE(desc);
-       info.lm = GET_LONGMODE(desc);
-
-       if (copy_to_user(u_info, &info, sizeof(info)))
-               return -EFAULT;
-       return 0;
-}
-
-asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
-{
-       return do_get_thread_area(&current->thread, u_info);
-} 
-
-
-int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
-{
-       struct n_desc_struct *desc;
-       struct user_desc info;
-       struct user_desc __user *cp;
-       int idx;
-       
-       cp = (void __user *)childregs->rsi;
-       if (copy_from_user(&info, cp, sizeof(info)))
-               return -EFAULT;
-       if (LDT_empty(&info))
-               return -EINVAL;
-       
-       idx = info.entry_number;
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-       
-       desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
-       desc->a = LDT_entry_a(&info);
-       desc->b = LDT_entry_b(&info);
-
-       return 0;
-}
diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S
deleted file mode 100644 (file)
index b383be0..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Common code for the sigreturn entry points on the vsyscall page.
- * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
- * to enter the kernel.
- * This file is #include'd by vsyscall-*.S to define them after the
- * vsyscall entry point.  The addresses we get for these entry points
- * by doing ".balign 32" must match in both versions of the page.
- */
-
-       .code32
-       .section .text.sigreturn,"ax"
-       .balign 32
-       .globl __kernel_sigreturn
-       .type __kernel_sigreturn,@function
-__kernel_sigreturn:
-.LSTART_sigreturn:
-       popl %eax
-       movl $__NR_ia32_sigreturn, %eax
-       SYSCALL_ENTER_KERNEL
-.LEND_sigreturn:
-       .size __kernel_sigreturn,.-.LSTART_sigreturn
-
-       .section .text.rtsigreturn,"ax"
-       .balign 32
-       .globl __kernel_rt_sigreturn
-       .type __kernel_rt_sigreturn,@function
-__kernel_rt_sigreturn:
-.LSTART_rt_sigreturn:
-       movl $__NR_ia32_rt_sigreturn, %eax
-       SYSCALL_ENTER_KERNEL
-.LEND_rt_sigreturn:
-       .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
-
-       .section .eh_frame,"a",@progbits
-.LSTARTFRAMES:
-        .long .LENDCIES-.LSTARTCIES
-.LSTARTCIES:
-       .long 0                 /* CIE ID */
-       .byte 1                 /* Version number */
-       .string "zRS"           /* NUL-terminated augmentation string */
-       .uleb128 1              /* Code alignment factor */
-       .sleb128 -4             /* Data alignment factor */
-       .byte 8                 /* Return address register column */
-       .uleb128 1              /* Augmentation value length */
-       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
-       .byte 0x0c              /* DW_CFA_def_cfa */
-       .uleb128 4
-       .uleb128 4
-       .byte 0x88              /* DW_CFA_offset, column 0x8 */
-       .uleb128 1
-       .align 4
-.LENDCIES:
-
-       .long .LENDFDE2-.LSTARTFDE2     /* Length FDE */
-.LSTARTFDE2:
-       .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
-       /* HACK: The dwarf2 unwind routines will subtract 1 from the
-          return address to get an address in the middle of the
-          presumed call instruction.  Since we didn't get here via
-          a call, we need to include the nop before the real start
-          to make up for it.  */
-       .long .LSTART_sigreturn-1-.     /* PC-relative start address */
-       .long .LEND_sigreturn-.LSTART_sigreturn+1
-       .uleb128 0                      /* Augmentation length */
-       /* What follows are the instructions for the table generation.
-          We record the locations of each register saved.  This is
-          complicated by the fact that the "CFA" is always assumed to
-          be the value of the stack pointer in the caller.  This means
-          that we must define the CFA of this body of code to be the
-          saved value of the stack pointer in the sigcontext.  Which
-          also means that there is no fixed relation to the other 
-          saved registers, which means that we must use DW_CFA_expression
-          to compute their addresses.  It also means that when we 
-          adjust the stack with the popl, we have to do it all over again.  */
-
-#define do_cfa_expr(offset)                                            \
-       .byte 0x0f;                     /* DW_CFA_def_cfa_expression */ \
-       .uleb128 1f-0f;                 /*   length */                  \
-0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
-       .sleb128 offset;                /*      offset */               \
-       .byte 0x06;                     /*     DW_OP_deref */           \
-1:
-
-#define do_expr(regno, offset)                                         \
-       .byte 0x10;                     /* DW_CFA_expression */         \
-       .uleb128 regno;                 /*   regno */                   \
-       .uleb128 1f-0f;                 /*   length */                  \
-0:     .byte 0x74;                     /*     DW_OP_breg4 */           \
-       .sleb128 offset;                /*       offset */              \
-1:
-
-       do_cfa_expr(IA32_SIGCONTEXT_esp+4)
-       do_expr(0, IA32_SIGCONTEXT_eax+4)
-       do_expr(1, IA32_SIGCONTEXT_ecx+4)
-       do_expr(2, IA32_SIGCONTEXT_edx+4)
-       do_expr(3, IA32_SIGCONTEXT_ebx+4)
-       do_expr(5, IA32_SIGCONTEXT_ebp+4)
-       do_expr(6, IA32_SIGCONTEXT_esi+4)
-       do_expr(7, IA32_SIGCONTEXT_edi+4)
-       do_expr(8, IA32_SIGCONTEXT_eip+4)
-
-       .byte 0x42      /* DW_CFA_advance_loc 2 -- nop; popl eax. */
-
-       do_cfa_expr(IA32_SIGCONTEXT_esp)
-       do_expr(0, IA32_SIGCONTEXT_eax)
-       do_expr(1, IA32_SIGCONTEXT_ecx)
-       do_expr(2, IA32_SIGCONTEXT_edx)
-       do_expr(3, IA32_SIGCONTEXT_ebx)
-       do_expr(5, IA32_SIGCONTEXT_ebp)
-       do_expr(6, IA32_SIGCONTEXT_esi)
-       do_expr(7, IA32_SIGCONTEXT_edi)
-       do_expr(8, IA32_SIGCONTEXT_eip)
-
-       .align 4
-.LENDFDE2:
-
-       .long .LENDFDE3-.LSTARTFDE3     /* Length FDE */
-.LSTARTFDE3:
-       .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
-       /* HACK: See above wrt unwind library assumptions.  */
-       .long .LSTART_rt_sigreturn-1-.  /* PC-relative start address */
-       .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
-       .uleb128 0                      /* Augmentation */
-       /* What follows are the instructions for the table generation.
-          We record the locations of each register saved.  This is
-          slightly less complicated than the above, since we don't
-          modify the stack pointer in the process.  */
-
-       do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
-       do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
-       do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
-       do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
-       do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
-       do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
-       do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
-       do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
-       do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
-
-       .align 4
-.LENDFDE3:
-
-#include "../../x86/kernel/vsyscall-note_32.S"
-
diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S
deleted file mode 100644 (file)
index ae056e5..0000000
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Code for the vsyscall page.  This version uses the sysenter instruction.
- */
-
-#include <asm/ia32_unistd.h>
-#include <asm/asm-offsets.h>
-
-       .code32
-       .text
-       .section .text.vsyscall,"ax"
-       .globl __kernel_vsyscall
-       .type __kernel_vsyscall,@function
-__kernel_vsyscall:
-.LSTART_vsyscall:
-       push    %ecx
-.Lpush_ecx:
-       push    %edx
-.Lpush_edx:
-       push    %ebp
-.Lenter_kernel:
-       movl    %esp,%ebp
-       sysenter
-       .space 7,0x90
-       jmp     .Lenter_kernel
-       /* 16: System call normal return point is here! */
-       pop     %ebp
-.Lpop_ebp:
-       pop     %edx
-.Lpop_edx:
-       pop     %ecx
-.Lpop_ecx:
-       ret
-.LEND_vsyscall:
-       .size __kernel_vsyscall,.-.LSTART_vsyscall
-
-       .section .eh_frame,"a",@progbits
-.LSTARTFRAME:
-       .long .LENDCIE-.LSTARTCIE
-.LSTARTCIE:
-       .long 0                 /* CIE ID */
-       .byte 1                 /* Version number */
-       .string "zR"            /* NUL-terminated augmentation string */
-       .uleb128 1              /* Code alignment factor */
-       .sleb128 -4             /* Data alignment factor */
-       .byte 8                 /* Return address register column */
-       .uleb128 1              /* Augmentation value length */
-       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
-       .byte 0x0c              /* DW_CFA_def_cfa */
-       .uleb128 4
-       .uleb128 4
-       .byte 0x88              /* DW_CFA_offset, column 0x8 */
-       .uleb128 1
-       .align 4
-.LENDCIE:
-
-       .long .LENDFDE1-.LSTARTFDE1     /* Length FDE */
-.LSTARTFDE1:
-       .long .LSTARTFDE1-.LSTARTFRAME  /* CIE pointer */
-       .long .LSTART_vsyscall-.        /* PC-relative start address */
-       .long .LEND_vsyscall-.LSTART_vsyscall
-       .uleb128 0                      /* Augmentation length */
-       /* What follows are the instructions for the table generation.
-          We have to record all changes of the stack pointer.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_ecx-.LSTART_vsyscall
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_edx-.Lpush_ecx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x0c              /* RA at offset 12 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lenter_kernel-.Lpush_edx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x10              /* RA at offset 16 now */
-       .byte 0x85, 0x04        /* DW_CFA_offset %ebp -16 */
-       /* Finally the epilogue.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ebp-.Lenter_kernel
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x12              /* RA at offset 12 now */
-       .byte 0xc5              /* DW_CFA_restore %ebp */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_edx-.Lpop_ebp
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ecx-.Lpop_edx
-       .byte 0x0e              /* DW_CFA_def_cfa_offset */
-       .byte 0x04              /* RA at offset 4 now */
-       .align 4
-.LENDFDE1:
-
-#define SYSCALL_ENTER_KERNEL   int $0x80
-#include "vsyscall-sigreturn.S"
diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds
deleted file mode 100644 (file)
index 1dc86ff..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
- * object prelinked to its virtual address. This script controls its layout.
- */
-
-/* This must match <asm/fixmap.h>.  */
-VSYSCALL_BASE = 0xffffe000;
-
-SECTIONS
-{
-  . = VSYSCALL_BASE + SIZEOF_HEADERS;
-
-  .hash           : { *(.hash) }               :text
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-
-  /* This linker script is used both with -r and with -shared.
-     For the layouts to match, we need to skip more than enough
-     space for the dynamic symbol table et al.  If this amount
-     is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VSYSCALL_BASE + 0x400;
-  
-  .text.vsyscall   : { *(.text.vsyscall) }     :text =0x90909090
-
-  /* This is an 32bit object and we cannot easily get the offsets
-     into the 64bit kernel. Just hardcode them here. This assumes
-     that all the stubs don't need more than 0x100 bytes. */
-  . = VSYSCALL_BASE + 0x500;
-
-  .text.sigreturn  : { *(.text.sigreturn) }    :text =0x90909090
-
-  . = VSYSCALL_BASE + 0x600;
-
-  .text.rtsigreturn : { *(.text.rtsigreturn) }   :text =0x90909090
-       
-  .note                  : { *(.note.*) }              :text :note
-  .eh_frame_hdr   : { *(.eh_frame_hdr) }       :text :eh_frame_hdr
-  .eh_frame       : { KEEP (*(.eh_frame)) }    :text
-  .dynamic        : { *(.dynamic) }            :text :dynamic
-  .useless        : {
-       *(.got.plt) *(.got)
-       *(.data .data.* .gnu.linkonce.d.*)
-       *(.dynbss)
-       *(.bss .bss.* .gnu.linkonce.b.*)
-  }                                            :text
-}
-
-/*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
- */
-PHDRS
-{
-  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
-  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
-  note PT_NOTE FLAGS(4); /* PF_R */
-  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
-}
-
-/*
- * This controls what symbols we export from the DSO.
- */
-VERSION
-{
-  LINUX_2.5 {
-    global:
-       __kernel_vsyscall;
-       __kernel_sigreturn;
-       __kernel_rt_sigreturn;
-
-    local: *;
-  };
-}
-
-/* The ELF entry point can be used to set the AT_SYSINFO value.  */
-ENTRY(__kernel_vsyscall);
index 38573340b14367b138df651b5fba84e691a51c32..6f813009d44b348de25cbb355a197f9c8f1241fe 100644 (file)
@@ -1,9 +1,91 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/kernel/Makefile_32
-else
-include ${srctree}/arch/x86/kernel/Makefile_64
+#
+# Makefile for the linux kernel.
+#
+
+extra-y                := head_$(BITS).o init_task.o vmlinux.lds
+extra-$(CONFIG_X86_64) += head64.o
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+CFLAGS_vsyscall_64.o := $(PROFILING) -g0
+
+obj-y                  := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y                  += traps_$(BITS).o irq_$(BITS).o
+obj-y                  += time_$(BITS).o ioport.o ldt.o
+obj-y                  += setup_$(BITS).o i8259_$(BITS).o
+obj-$(CONFIG_X86_32)   += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_64)   += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64)   += syscall_64.o vsyscall_64.o setup64.o
+obj-y                  += pci-dma_$(BITS).o  bootflag.o e820_$(BITS).o
+obj-y                  += quirks.o i8237.o topology.o kdebugfs.o
+obj-y                  += alternative.o i8253.o
+obj-$(CONFIG_X86_64)   += pci-nommu_64.o bugs_64.o
+obj-y                  += tsc_$(BITS).o io_delay.o rtc.o
+
+obj-y                          += i387.o
+obj-y                          += ptrace.o
+obj-y                          += ds.o
+obj-$(CONFIG_X86_32)           += tls.o
+obj-$(CONFIG_IA32_EMULATION)   += tls.o
+obj-y                          += step.o
+obj-$(CONFIG_STACKTRACE)       += stacktrace.o
+obj-y                          += cpu/
+obj-y                          += acpi/
+obj-$(CONFIG_X86_BIOS_REBOOT)  += reboot.o
+obj-$(CONFIG_X86_64)           += reboot.o
+obj-$(CONFIG_MCA)              += mca_32.o
+obj-$(CONFIG_X86_MSR)          += msr.o
+obj-$(CONFIG_X86_CPUID)                += cpuid.o
+obj-$(CONFIG_MICROCODE)                += microcode.o
+obj-$(CONFIG_PCI)              += early-quirks.o
+obj-$(CONFIG_APM)              += apm_32.o
+obj-$(CONFIG_X86_SMP)          += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
+obj-$(CONFIG_X86_32_SMP)       += smpcommon_32.o
+obj-$(CONFIG_X86_64_SMP)       += smp_64.o smpboot_64.o tsc_sync.o
+obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_$(BITS).o
+obj-$(CONFIG_X86_MPPARSE)      += mpparse_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC)   += apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_IO_APIC)      += io_apic_$(BITS).o
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
+obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit_32.o
+obj-$(CONFIG_X86_VSMP)         += vsmp_64.o
+obj-$(CONFIG_KPROBES)          += kprobes.o
+obj-$(CONFIG_MODULES)          += module_$(BITS).o
+obj-$(CONFIG_ACPI_SRAT)        += srat_32.o
+obj-$(CONFIG_EFI)              += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_DOUBLEFAULT)      += doublefault_32.o
+obj-$(CONFIG_VM86)             += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER)       += hpet.o
+
+obj-$(CONFIG_K8_NB)            += k8.o
+obj-$(CONFIG_MGEODE_LX)                += geode_32.o mfgpt_32.o
+obj-$(CONFIG_DEBUG_RODATA_TEST)        += test_rodata.o
+obj-$(CONFIG_DEBUG_NX_TEST)    += test_nx.o
+
+obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
+
+ifdef CONFIG_INPUT_PCSPKR
+obj-y                          += pcspeaker.o
 endif
 
-# Workaround to delete .lds files with make clean
-# The problem is that we do not enter Makefile_32 with make clean.
-clean-files := vsyscall*.lds vsyscall*.so
+obj-$(CONFIG_SCx200)           += scx200_32.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+        obj-y                          += genapic_64.o genapic_flat_64.o
+        obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
+        obj-$(CONFIG_AUDIT)            += audit_64.o
+        obj-$(CONFIG_PM)               += suspend_64.o
+        obj-$(CONFIG_HIBERNATION)      += suspend_asm_64.o
+
+        obj-$(CONFIG_GART_IOMMU)       += pci-gart_64.o aperture_64.o
+        obj-$(CONFIG_CALGARY_IOMMU)    += pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_SWIOTLB)          += pci-swiotlb_64.o
+endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
deleted file mode 100644 (file)
index a7bc93c..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-extra-y := head_32.o init_task.o vmlinux.lds
-CPPFLAGS_vmlinux.lds += -Ui386
-
-obj-y  := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
-               ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
-               pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
-               quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
-
-obj-$(CONFIG_STACKTRACE)       += stacktrace.o
-obj-y                          += cpu/
-obj-y                          += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT)  += reboot_32.o
-obj-$(CONFIG_MCA)              += mca_32.o
-obj-$(CONFIG_X86_MSR)          += msr.o
-obj-$(CONFIG_X86_CPUID)                += cpuid.o
-obj-$(CONFIG_MICROCODE)                += microcode.o
-obj-$(CONFIG_PCI)              += early-quirks.o
-obj-$(CONFIG_APM)              += apm_32.o
-obj-$(CONFIG_X86_SMP)          += smp_32.o smpboot_32.o tsc_sync.o
-obj-$(CONFIG_SMP)              += smpcommon_32.o
-obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline_32.o
-obj-$(CONFIG_X86_MPPARSE)      += mpparse_32.o
-obj-$(CONFIG_X86_LOCAL_APIC)   += apic_32.o nmi_32.o
-obj-$(CONFIG_X86_IO_APIC)      += io_apic_32.o
-obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_32.o relocate_kernel_32.o crash.o
-obj-$(CONFIG_CRASH_DUMP)       += crash_dump_32.o
-obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA)  += summit_32.o
-obj-$(CONFIG_KPROBES)          += kprobes_32.o
-obj-$(CONFIG_MODULES)          += module_32.o
-obj-y                          += sysenter_32.o vsyscall_32.o
-obj-$(CONFIG_ACPI_SRAT)        += srat_32.o
-obj-$(CONFIG_EFI)              += efi_32.o efi_stub_32.o
-obj-$(CONFIG_DOUBLEFAULT)      += doublefault_32.o
-obj-$(CONFIG_VM86)             += vm86_32.o
-obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_HPET_TIMER)       += hpet.o
-obj-$(CONFIG_K8_NB)            += k8.o
-obj-$(CONFIG_MGEODE_LX)                += geode_32.o mfgpt_32.o
-
-obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
-obj-$(CONFIG_PARAVIRT)         += paravirt_32.o
-obj-y                          += pcspeaker.o
-
-obj-$(CONFIG_SCx200)           += scx200_32.o
-
-# vsyscall_32.o contains the vsyscall DSO images as __initdata.
-# We must build both images before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
-targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so)
-targets += vsyscall-note_32.o vsyscall_32.lds
-
-# The DSO images are built using a special linker script.
-quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
-                         -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386
-
-vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
-                $(call ld-option, -Wl$(comma)--hash-style=sysv)
-SYSCFLAGS_vsyscall-sysenter_32.so      = $(vsyscall-flags)
-SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
-
-$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
-$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
-                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
-       $(call if_changed,syscall)
-
-# We also create a special relocatable object that should mirror the symbol
-# table and layout of the linked DSO.  With ld -R we can then refer to
-# these symbols in the kernel code rather than hand-coded addresses.
-extra-y += vsyscall-syms.o
-$(obj)/built-in.o: $(obj)/vsyscall-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
-
-SYSCFLAGS_vsyscall-syms.o = -r
-$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
-                       $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
-       $(call if_changed,syscall)
-
-
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
deleted file mode 100644 (file)
index 5a88890..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-extra-y        := head_64.o head64.o init_task.o vmlinux.lds
-CPPFLAGS_vmlinux.lds += -Ux86_64
-EXTRA_AFLAGS   := -traditional
-
-obj-y  := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
-               ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
-               x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
-               setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
-               pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
-               i8253.o
-
-obj-$(CONFIG_STACKTRACE)       += stacktrace.o
-obj-y                          += cpu/
-obj-y                          += acpi/
-obj-$(CONFIG_X86_MSR)          += msr.o
-obj-$(CONFIG_MICROCODE)                += microcode.o
-obj-$(CONFIG_X86_CPUID)                += cpuid.o
-obj-$(CONFIG_SMP)              += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
-obj-y                          += apic_64.o  nmi_64.o
-obj-y                          += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_64.o relocate_kernel_64.o crash.o
-obj-$(CONFIG_CRASH_DUMP)       += crash_dump_64.o
-obj-$(CONFIG_PM)               += suspend_64.o
-obj-$(CONFIG_HIBERNATION)      += suspend_asm_64.o
-obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_GART_IOMMU)       += pci-gart_64.o aperture_64.o
-obj-$(CONFIG_CALGARY_IOMMU)    += pci-calgary_64.o tce_64.o
-obj-$(CONFIG_SWIOTLB)          += pci-swiotlb_64.o
-obj-$(CONFIG_KPROBES)          += kprobes_64.o
-obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
-obj-$(CONFIG_X86_VSMP)         += vsmp_64.o
-obj-$(CONFIG_K8_NB)            += k8.o
-obj-$(CONFIG_AUDIT)            += audit_64.o
-
-obj-$(CONFIG_MODULES)          += module_64.o
-obj-$(CONFIG_PCI)              += early-quirks.o
-
-obj-y                          += topology.o
-obj-y                          += pcspeaker.o
-
-CFLAGS_vsyscall_64.o           := $(PROFILING) -g0
index 1351c3982ee4ee296df1787bdbecc4d6cc815022..19d3d6e9d09b80f359a79de7525344b6d87401b5 100644 (file)
@@ -1,5 +1,5 @@
 obj-$(CONFIG_ACPI)             += boot.o
-obj-$(CONFIG_ACPI_SLEEP)       += sleep_$(BITS).o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP)       += sleep.o wakeup_$(BITS).o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
 obj-y                          += cstate.o processor.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644 (file)
index 0000000..6bc815c
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ *  Copyright (C) 2001-2003 Patrick Mochel
+ *  Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+
+#include <asm/smp.h>
+
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_realmode_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem(void)
+{
+       if (!acpi_wakeup_address) {
+               printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
+               return -ENOMEM;
+       }
+       memcpy((void *)acpi_wakeup_address, &wakeup_start,
+              &wakeup_end - &wakeup_start);
+       acpi_copy_wakeup_routine(acpi_wakeup_address);
+
+       return 0;
+}
+
+/*
+ * acpi_restore_state - undo effects of acpi_save_state_mem
+ */
+void acpi_restore_state_mem(void)
+{
+}
+
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page from the first 1MB of memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16MB pages, but not
+ * <1MB pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+       if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
+               printk(KERN_ERR
+                      "ACPI: Wakeup code way too big, S3 disabled.\n");
+               return;
+       }
+
+       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
+       if (!acpi_wakeup_address)
+               printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+}
+
+
+static int __init acpi_sleep_setup(char *str)
+{
+       while ((str != NULL) && (*str != '\0')) {
+               if (strncmp(str, "s3_bios", 7) == 0)
+                       acpi_realmode_flags |= 1;
+               if (strncmp(str, "s3_mode", 7) == 0)
+                       acpi_realmode_flags |= 2;
+               if (strncmp(str, "s3_beep", 7) == 0)
+                       acpi_realmode_flags |= 4;
+               str = strchr(str, ',');
+               if (str != NULL)
+                       str += strspn(str, ", \t");
+       }
+       return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
index 10699489cfe7f776e9666ad588cf8265b21642b1..63fe5525e026137f0b21ec8736fbd8aa42fdd437 100644 (file)
 
 #include <asm/smp.h>
 
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-       if (!acpi_wakeup_address)
-               return 1;
-       memcpy((void *)acpi_wakeup_address, &wakeup_start,
-              &wakeup_end - &wakeup_start);
-       acpi_copy_wakeup_routine(acpi_wakeup_address);
-
-       return 0;
-}
-
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-       if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
-               printk(KERN_ERR
-                      "ACPI: Wakeup code way too big, S3 disabled.\n");
-               return;
-       }
-
-       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
-       if (!acpi_wakeup_address)
-               printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
-}
-
-static int __init acpi_sleep_setup(char *str)
-{
-       while ((str != NULL) && (*str != '\0')) {
-               if (strncmp(str, "s3_bios", 7) == 0)
-                       acpi_realmode_flags |= 1;
-               if (strncmp(str, "s3_mode", 7) == 0)
-                       acpi_realmode_flags |= 2;
-               if (strncmp(str, "s3_beep", 7) == 0)
-                       acpi_realmode_flags |= 4;
-               str = strchr(str, ',');
-               if (str != NULL)
-                       str += strspn(str, ", \t");
-       }
-       return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-
 /* Ouch, we want to delete this. We already have better version in userspace, in
    s2ram from suspend.sf.net project */
 static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
deleted file mode 100644 (file)
index da42de2..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- *  acpi.c - Architecture-Specific Low-Level ACPI Support
- *
- *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
- *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
- *  Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
- *  Copyright (C) 2003 Pavel Machek, SuSE Labs
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/bootmem.h>
-#include <linux/acpi.h>
-#include <linux/cpumask.h>
-
-#include <asm/mpspec.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/apicdef.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/tlbflush.h>
-
-/* --------------------------------------------------------------------------
-                              Low-Level Sleep Support
-   -------------------------------------------------------------------------- */
-
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
-
-/**
- * acpi_save_state_mem - save kernel state
- *
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
- */
-int acpi_save_state_mem(void)
-{
-       memcpy((void *)acpi_wakeup_address, &wakeup_start,
-              &wakeup_end - &wakeup_start);
-       acpi_copy_wakeup_routine(acpi_wakeup_address);
-
-       return 0;
-}
-
-/*
- * acpi_restore_state
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-/**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
- *
- * We allocate a page in low memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16M pages, but not
- * <1M pages.
- */
-void __init acpi_reserve_bootmem(void)
-{
-       acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
-       if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
-               printk(KERN_CRIT
-                      "ACPI: Wakeup code way too big, will crash on attempt"
-                      " to suspend\n");
-}
-
-static int __init acpi_sleep_setup(char *str)
-{
-       while ((str != NULL) && (*str != '\0')) {
-               if (strncmp(str, "s3_bios", 7) == 0)
-                       acpi_realmode_flags |= 1;
-               if (strncmp(str, "s3_mode", 7) == 0)
-                       acpi_realmode_flags |= 2;
-               if (strncmp(str, "s3_beep", 7) == 0)
-                       acpi_realmode_flags |= 4;
-               str = strchr(str, ',');
-               if (str != NULL)
-                       str += strspn(str, ", \t");
-       }
-       return 1;
-}
-
-__setup("acpi_sleep=", acpi_sleep_setup);
-
index 1e931aaf2ef6db657ec4bc471d17ab169a4aa975..f53e3277f8e500512d1c86a3aa9c2b718715a6f3 100644 (file)
@@ -1,4 +1,4 @@
-.text
+       .section .text.page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
index 5ed3bc5c61d78d76ce9ddbacca5254372db56b6a..2e1b9e0d07678848bee7b6224c3d7f676edcc8ca 100644 (file)
@@ -344,13 +344,13 @@ do_suspend_lowlevel:
        call    save_processor_state
 
        movq    $saved_context, %rax
-       movq    %rsp, pt_regs_rsp(%rax)
-       movq    %rbp, pt_regs_rbp(%rax)
-       movq    %rsi, pt_regs_rsi(%rax)
-       movq    %rdi, pt_regs_rdi(%rax)
-       movq    %rbx, pt_regs_rbx(%rax)
-       movq    %rcx, pt_regs_rcx(%rax)
-       movq    %rdx, pt_regs_rdx(%rax)
+       movq    %rsp, pt_regs_sp(%rax)
+       movq    %rbp, pt_regs_bp(%rax)
+       movq    %rsi, pt_regs_si(%rax)
+       movq    %rdi, pt_regs_di(%rax)
+       movq    %rbx, pt_regs_bx(%rax)
+       movq    %rcx, pt_regs_cx(%rax)
+       movq    %rdx, pt_regs_dx(%rax)
        movq    %r8, pt_regs_r8(%rax)
        movq    %r9, pt_regs_r9(%rax)
        movq    %r10, pt_regs_r10(%rax)
@@ -360,7 +360,7 @@ do_suspend_lowlevel:
        movq    %r14, pt_regs_r14(%rax)
        movq    %r15, pt_regs_r15(%rax)
        pushfq
-       popq    pt_regs_eflags(%rax)
+       popq    pt_regs_flags(%rax)
 
        movq    $.L97, saved_rip(%rip)
 
@@ -391,15 +391,15 @@ do_suspend_lowlevel:
        movq    %rbx, %cr2
        movq    saved_context_cr0(%rax), %rbx
        movq    %rbx, %cr0
-       pushq   pt_regs_eflags(%rax)
+       pushq   pt_regs_flags(%rax)
        popfq
-       movq    pt_regs_rsp(%rax), %rsp
-       movq    pt_regs_rbp(%rax), %rbp
-       movq    pt_regs_rsi(%rax), %rsi
-       movq    pt_regs_rdi(%rax), %rdi
-       movq    pt_regs_rbx(%rax), %rbx
-       movq    pt_regs_rcx(%rax), %rcx
-       movq    pt_regs_rdx(%rax), %rdx
+       movq    pt_regs_sp(%rax), %rsp
+       movq    pt_regs_bp(%rax), %rbp
+       movq    pt_regs_si(%rax), %rsi
+       movq    pt_regs_di(%rax), %rdi
+       movq    pt_regs_bx(%rax), %rbx
+       movq    pt_regs_cx(%rax), %rcx
+       movq    pt_regs_dx(%rax), %rdx
        movq    pt_regs_r8(%rax), %r8
        movq    pt_regs_r9(%rax), %r9
        movq    pt_regs_r10(%rax), %r10
index d6405e0842b55d35835a9ffa203c6b22ea41c189..45d79ea890aee92107f451b9554cb69080c00d0b 100644 (file)
@@ -273,6 +273,7 @@ struct smp_alt_module {
 };
 static LIST_HEAD(smp_alt_modules);
 static DEFINE_SPINLOCK(smp_alt);
+static int smp_mode = 1;       /* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
                                 void *locks, void *locks_end,
@@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp)
 
 #ifdef CONFIG_LOCKDEP
        /*
-        * A not yet fixed binutils section handling bug prevents
-        * alternatives-replacement from working reliably, so turn
-        * it off:
+        * Older binutils section handling bug prevented
+        * alternatives-replacement from working reliably.
+        *
+        * If this still occurs then you should see a hang
+        * or crash shortly after this line:
         */
-       printk("lockdep: not fixing up alternatives.\n");
-       return;
+       printk("lockdep: fixing up alternatives.\n");
 #endif
 
        if (noreplace_smp || smp_alt_once)
@@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp)
        BUG_ON(!smp && (num_online_cpus() > 1));
 
        spin_lock_irqsave(&smp_alt, flags);
-       if (smp) {
+
+       /*
+        * Avoid unnecessary switches because it forces JIT based VMs to
+        * throw away all cached translations, which can be quite costly.
+        */
+       if (smp == smp_mode) {
+               /* nothing */
+       } else if (smp) {
                printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
-               clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-               clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+               clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
        } else {
                printk(KERN_INFO "SMP alternatives: switching to UP code\n");
-               set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-               set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability);
+               set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+               set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_unlock(mod->locks, mod->locks_end,
                                                mod->text, mod->text_end);
        }
+       smp_mode = smp;
        spin_unlock_irqrestore(&smp_alt, flags);
 }
 
@@ -431,8 +441,9 @@ void __init alternative_instructions(void)
        if (smp_alt_once) {
                if (1 == num_possible_cpus()) {
                        printk(KERN_INFO "SMP alternatives: switching to UP code\n");
-                       set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
-                       set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability);
+                       set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+                       set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+
                        alternatives_smp_unlock(__smp_locks, __smp_locks_end,
                                                _text, _etext);
                }
@@ -440,7 +451,10 @@ void __init alternative_instructions(void)
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);
-               alternatives_smp_switch(0);
+
+               /* Only switch to UP mode if we don't immediately boot others */
+               if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+                       alternatives_smp_switch(0);
        }
 #endif
        apply_paravirt(__parainstructions, __parainstructions_end);
index 5b6992799c9d20882bab7fd1efa0f0772ad77243..608152a2a05ea0931512542999272a9716562e9a 100644 (file)
@@ -1,12 +1,12 @@
-/* 
+/*
  * Firmware replacement code.
- * 
+ *
  * Work around broken BIOSes that don't set an aperture or only set the
- * aperture in the AGP bridge. 
- * If all fails map the aperture over some low memory.  This is cheaper than 
- * doing bounce buffering. The memory is lost. This is done at early boot 
- * because only the bootmem allocator can allocate 32+MB. 
- * 
+ * aperture in the AGP bridge.
+ * If all fails map the aperture over some low memory.  This is cheaper than
+ * doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
 #include <linux/kernel.h>
@@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0;
 int gart_iommu_aperture_allowed __initdata = 0;
 
 int fallback_aper_order __initdata = 1; /* 64MB */
-int fallback_aper_force __initdata = 0; 
+int fallback_aper_force __initdata = 0;
 
 int fix_aperture __initdata = 1;
 
@@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
 /* This code runs before the PCI subsystem is initialized, so just
    access the northbridge directly. */
 
-static u32 __init allocate_aperture(void) 
+static u32 __init allocate_aperture(void)
 {
        u32 aper_size;
-       void *p; 
+       void *p;
 
-       if (fallback_aper_order > 7) 
-               fallback_aper_order = 7; 
-       aper_size = (32 * 1024 * 1024) << fallback_aper_order; 
+       if (fallback_aper_order > 7)
+               fallback_aper_order = 7;
+       aper_size = (32 * 1024 * 1024) << fallback_aper_order;
 
-       /* 
-        * Aperture has to be naturally aligned. This means an 2GB aperture won't
-        * have much chance of finding a place in the lower 4GB of memory.
-        * Unfortunately we cannot move it up because that would make the
-        * IOMMU useless.
+       /*
+        * Aperture has to be naturally aligned. This means a 2GB aperture
+        * won't have much chance of finding a place in the lower 4GB of
+        * memory. Unfortunately we cannot move it up because that would
+        * make the IOMMU useless.
         */
        p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
        if (!p || __pa(p)+aper_size > 0xffffffff) {
-               printk("Cannot allocate aperture memory hole (%p,%uK)\n",
-                      p, aper_size>>10);
+               printk(KERN_ERR
+                       "Cannot allocate aperture memory hole (%p,%uK)\n",
+                               p, aper_size>>10);
                if (p)
                        free_bootmem(__pa(p), aper_size);
                return 0;
        }
-       printk("Mapping aperture over %d KB of RAM @ %lx\n",
-              aper_size >> 10, __pa(p)); 
+       printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
+                       aper_size >> 10, __pa(p));
        insert_aperture_resource((u32)__pa(p), aper_size);
-       return (u32)__pa(p); 
+
+       return (u32)__pa(p);
 }
 
 static int __init aperture_valid(u64 aper_base, u32 aper_size)
-{ 
-       if (!aper_base) 
-               return 0;
-       if (aper_size < 64*1024*1024) { 
-               printk("Aperture too small (%d MB)\n", aper_size>>20);
+{
+       if (!aper_base)
                return 0;
-       }
+
        if (aper_base + aper_size > 0x100000000UL) {
-               printk("Aperture beyond 4GB. Ignoring.\n");
-               return 0; 
+               printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
+               return 0;
        }
        if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-               printk("Aperture pointing to e820 RAM. Ignoring.\n");
-               return 0; 
-       } 
+               printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
+               return 0;
+       }
+       if (aper_size < 64*1024*1024) {
+               printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
+               return 0;
+       }
+
        return 1;
-} 
+}
 
 /* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap) 
-{ 
-       u8 pos;
+static __u32 __init find_cap(int num, int slot, int func, int cap)
+{
        int bytes;
-       if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
+       u8 pos;
+
+       if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+                                               PCI_STATUS_CAP_LIST))
                return 0;
-       pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
-       for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
+
+       pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+       for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
                u8 id;
-               pos &= ~3; 
-               id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+
+               pos &= ~3;
+               id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
                if (id == 0xff)
                        break;
-               if (id == cap) 
-                       return pos; 
-               pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
-       } 
+               if (id == cap)
+                       return pos;
+               pos = read_pci_config_byte(num, slot, func,
+                                               pos+PCI_CAP_LIST_NEXT);
+       }
        return 0;
-} 
+}
 
 /* Read a standard AGPv3 bridge header */
 static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
-{ 
+{
        u32 apsize;
        u32 apsizereg;
        int nbits;
        u32 aper_low, aper_hi;
        u64 aper;
 
-       printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
-       apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+       printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+       apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
        if (apsizereg == 0xffffffff) {
-               printk("APSIZE in AGP bridge unreadable\n");
+               printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
                return 0;
        }
 
        apsize = apsizereg & 0xfff;
        /* Some BIOS use weird encodings not in the AGPv3 table. */
-       if (apsize & 0xff) 
-               apsize |= 0xf00; 
+       if (apsize & 0xff)
+               apsize |= 0xf00;
        nbits = hweight16(apsize);
        *order = 7 - nbits;
        if ((int)*order < 0) /* < 32MB */
                *order = 0;
-       
-       aper_low = read_pci_config(num,slot,func, 0x10);
-       aper_hi = read_pci_config(num,slot,func,0x14);
+
+       aper_low = read_pci_config(num, slot, func, 0x10);
+       aper_hi = read_pci_config(num, slot, func, 0x14);
        aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
 
-       printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
-              aper, 32 << *order, apsizereg);
+       printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
+                       aper, 32 << *order, apsizereg);
 
        if (!aperture_valid(aper, (32*1024*1024) << *order))
-           return 0;
-       return (u32)aper; 
-} 
-
-/* Look for an AGP bridge. Windows only expects the aperture in the
-   AGP bridge and some BIOS forget to initialize the Northbridge too.
-   Work around this here. 
-
-   Do an PCI bus scan by hand because we're running before the PCI
-   subsystem. 
+               return 0;
+       return (u32)aper;
+}
 
-   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
-   generically. It's probably overkill to always scan all slots because
-   the AGP bridges should be always an own bus on the HT hierarchy, 
-   but do it here for future safety. */
+/*
+ * Look for an AGP bridge. Windows only expects the aperture in the
+ * AGP bridge, and some BIOSes forget to initialize the Northbridge too.
+ * Work around this here.
+ *
+ * Do a PCI bus scan by hand because we're running before the PCI
+ * subsystem.
+ *
+ * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * generically. It's probably overkill to always scan all slots because
+ * the AGP bridges should always be on their own bus in the HT hierarchy,
+ * but do it here for future safety.
+ */
 static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 {
        int num, slot, func;
 
        /* Poor man's PCI discovery */
-       for (num = 0; num < 256; num++) { 
-               for (slot = 0; slot < 32; slot++) { 
-                       for (func = 0; func < 8; func++) { 
+       for (num = 0; num < 256; num++) {
+               for (slot = 0; slot < 32; slot++) {
+                       for (func = 0; func < 8; func++) {
                                u32 class, cap;
                                u8 type;
-                               class = read_pci_config(num,slot,func,
+                               class = read_pci_config(num, slot, func,
                                                        PCI_CLASS_REVISION);
                                if (class == 0xffffffff)
-                                       break; 
-                               
-                               switch (class >> 16) { 
+                                       break;
+
+                               switch (class >> 16) {
                                case PCI_CLASS_BRIDGE_HOST:
                                case PCI_CLASS_BRIDGE_OTHER: /* needed? */
                                        /* AGP bridge? */
-                                       cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+                                       cap = find_cap(num, slot, func,
+                                                       PCI_CAP_ID_AGP);
                                        if (!cap)
                                                break;
-                                       *valid_agp = 1; 
-                                       return read_agp(num,slot,func,cap,order);
-                               } 
-                               
+                                       *valid_agp = 1;
+                                       return read_agp(num, slot, func, cap,
+                                                       order);
+                               }
+
                                /* No multi-function device? */
-                               type = read_pci_config_byte(num,slot,func,
+                               type = read_pci_config_byte(num, slot, func,
                                                               PCI_HEADER_TYPE);
                                if (!(type & 0x80))
                                        break;
-                       } 
-               } 
+                       }
+               }
        }
-       printk("No AGP bridge found\n"); 
+       printk(KERN_INFO "No AGP bridge found\n");
+
        return 0;
 }
 
+static int gart_fix_e820 __initdata = 1;
+
+static int __init parse_gart_mem(char *p)
+{
+       if (!p)
+               return -EINVAL;
+
+       if (!strncmp(p, "off", 3))
+               gart_fix_e820 = 0;
+       else if (!strncmp(p, "on", 2))
+               gart_fix_e820 = 1;
+
+       return 0;
+}
+early_param("gart_fix_e820", parse_gart_mem);
+
+void __init early_gart_iommu_check(void)
+{
+       /*
+        * In case the GART is already enabled before we run, e.g. for
+        * kexec/kdump: the previous kernel enabled it, so the memset done
+        * by allocate_aperture()/__alloc_bootmem_nopanic() can cause a
+        * restart, or the second kernel may place the GART hole somewhere
+        * else and then use the old hole as RAM while it is still in use
+        * by the GART set up by the first kernel. The BIOS may also have
+        * forgotten to mark the region reserved.
+        * Try to update e820 to mark that region as reserved.
+        */
+       int fix, num;
+       u32 ctl;
+       u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
+       u64 aper_base = 0, last_aper_base = 0;
+       int aper_enabled = 0, last_aper_enabled = 0;
+
+       if (!early_pci_allowed())
+               return;
+
+       fix = 0;
+       for (num = 24; num < 32; num++) {
+               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+                       continue;
+
+               ctl = read_pci_config(0, num, 3, 0x90);
+               aper_enabled = ctl & 1;
+               aper_order = (ctl >> 1) & 7;
+               aper_size = (32 * 1024 * 1024) << aper_order;
+               aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+               aper_base <<= 25;
+
+               if ((last_aper_order && aper_order != last_aper_order) ||
+                   (last_aper_base && aper_base != last_aper_base) ||
+                   (last_aper_enabled && aper_enabled != last_aper_enabled)) {
+                       fix = 1;
+                       break;
+               }
+               last_aper_order = aper_order;
+               last_aper_base = aper_base;
+               last_aper_enabled = aper_enabled;
+       }
+
+       if (!fix && !aper_enabled)
+               return;
+
+       if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
+               fix = 1;
+
+       if (gart_fix_e820 && !fix && aper_enabled) {
+               if (e820_any_mapped(aper_base, aper_base + aper_size,
+                                   E820_RAM)) {
+                       /* reserve it, so we can reuse it in the second kernel */
+                       printk(KERN_INFO "update e820 for GART\n");
+                       add_memory_region(aper_base, aper_size, E820_RESERVED);
+                       update_e820();
+               }
+               return;
+       }
+
+       /* different nodes have different settings, so disable them all first */
+       for (num = 24; num < 32; num++) {
+               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+                       continue;
+
+               ctl = read_pci_config(0, num, 3, 0x90);
+               ctl &= ~1;
+               write_pci_config(0, num, 3, 0x90, ctl);
+       }
+
+}
+
 void __init gart_iommu_hole_init(void)
-{ 
-       int fix, num; 
+{
        u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
        u64 aper_base, last_aper_base = 0;
-       int valid_agp = 0;
+       int fix, num, valid_agp = 0;
+       int node;
 
        if (gart_iommu_aperture_disabled || !fix_aperture ||
            !early_pci_allowed())
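For orientation, all of the aperture probing in this file decodes the same two AMD K8 northbridge registers (PCI function 3, config offsets 0x90 and 0x94), exactly as early_gart_iommu_check() does above: bit 0 of the control word is the enable bit, bits 3:1 give the size order, and the base word holds bits 39:25 of the physical base. A standalone sketch of that decoding, with invented register values (the code below is illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ctl  = 0x0000000b;	/* example control word read from 0x90 */
	uint32_t breg = 0x00000020;	/* example base word read from 0x94 */

	int      enabled = ctl & 1;			 /* bit 0: GART enabled */
	int      order   = (ctl >> 1) & 7;		 /* bits 3:1: size order */
	uint32_t size    = (32U * 1024 * 1024) << order; /* 32MB << order */
	uint64_t base    = ((uint64_t)(breg & 0x7fff)) << 25; /* 32MB units */

	printf("aperture %s, base 0x%llx, size %u MB\n",
	       enabled ? "enabled" : "disabled",
	       (unsigned long long)base, size >> 20);
	return 0;
}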
@@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void)
        printk(KERN_INFO  "Checking aperture...\n");
 
        fix = 0;
-       for (num = 24; num < 32; num++) {               
+       node = 0;
+       for (num = 24; num < 32; num++) {
                if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
                        continue;
 
                iommu_detected = 1;
                gart_iommu_aperture = 1;
 
-               aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
-               aper_size = (32 * 1024 * 1024) << aper_order; 
+               aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
+               aper_size = (32 * 1024 * 1024) << aper_order;
                aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-               aper_base <<= 25; 
+               aper_base <<= 25;
+
+               printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+                               node, aper_base, aper_size >> 20);
+               node++;
 
-               printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
-                      aper_base, aper_size>>20);
-               
                if (!aperture_valid(aper_base, aper_size)) {
-                       fix = 1; 
-                       break; 
+                       fix = 1;
+                       break;
                }
 
                if ((last_aper_order && aper_order != last_aper_order) ||
@@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void)
                }
                last_aper_order = aper_order;
                last_aper_base = aper_base;
-       } 
+       }
 
        if (!fix && !fallback_aper_force) {
                if (last_aper_base) {
                        unsigned long n = (32 * 1024 * 1024) << last_aper_order;
+
                        insert_aperture_resource((u32)last_aper_base, n);
                }
-               return; 
+               return;
        }
 
        if (!fallback_aper_force)
-               aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 
-               
-       if (aper_alloc) { 
+               aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+
+       if (aper_alloc) {
                /* Got the aperture from the AGP bridge */
        } else if (swiotlb && !valid_agp) {
                /* Do nothing */
        } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
                   force_iommu ||
                   valid_agp ||
-                  fallback_aper_force) { 
-               printk("Your BIOS doesn't leave a aperture memory hole\n");
-               printk("Please enable the IOMMU option in the BIOS setup\n");
-               printk("This costs you %d MB of RAM\n",
-                      32 << fallback_aper_order);
+                  fallback_aper_force) {
+               printk(KERN_ERR
+                       "Your BIOS doesn't leave an aperture memory hole\n");
+               printk(KERN_ERR
+                       "Please enable the IOMMU option in the BIOS setup\n");
+               printk(KERN_ERR
+                       "This costs you %d MB of RAM\n",
+                               32 << fallback_aper_order);
 
                aper_order = fallback_aper_order;
                aper_alloc = allocate_aperture();
-               if (!aper_alloc) { 
-                       /* Could disable AGP and IOMMU here, but it's probably
-                          not worth it. But the later users cannot deal with
-                          bad apertures and turning on the aperture over memory
-                          causes very strange problems, so it's better to 
-                          panic early. */
+               if (!aper_alloc) {
+                       /*
+                        * Could disable AGP and IOMMU here, but it's
+                        * probably not worth it. But the later users
+                        * cannot deal with bad apertures and turning
+                        * on the aperture over memory causes very
+                        * strange problems, so it's better to panic
+                        * early.
+                        */
                        panic("Not enough memory for aperture");
                }
-       } else { 
-               return; 
-       } 
+       } else {
+               return;
+       }
 
        /* Fix up the north bridges */
-       for (num = 24; num < 32; num++) {               
+       for (num = 24; num < 32; num++) {
                if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-                       continue;       
-
-               /* Don't enable translation yet. That is done later. 
-                  Assume this BIOS didn't initialise the GART so 
-                  just overwrite all previous bits */ 
-               write_pci_config(0, num, 3, 0x90, aper_order<<1); 
-               write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
-       } 
-} 
+                       continue;
+
+               /*
+                * Don't enable translation yet. That is done later.
+                * Assume this BIOS didn't initialise the GART so
+                * just overwrite all previous bits
+                */
+               write_pci_config(0, num, 3, 0x90, aper_order<<1);
+               write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+       }
+}
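As parse_gart_mem() above shows, the new behaviour is controlled by an early boot parameter that takes "on" (the default, since gart_fix_e820 is initialised to 1) or "off". For example, booting with

    gart_fix_e820=off

skips the e820 reservation of an inherited, consistent GART aperture; as the code above reads, the kernel then falls through to disabling the GART on all northbridges instead.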
index a56c782653be7dc17474e133518d75f299659945..35a568ea8400269f4fde96377e76e1013fc5c3e3 100644 (file)
 #include <mach_apicdef.h>
 #include <mach_ipi.h>
 
-#include "io_ports.h"
-
 /*
  * Sanity check
  */
-#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F
+#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
 # error SPURIOUS_APIC_VECTOR definition error
 #endif
 
@@ -57,7 +55,7 @@
  *
  * -1=force-disable, +1=force-enable
  */
-static int enable_local_apic __initdata = 0;
+static int enable_local_apic __initdata;
 
 /* Local APIC timer verification ok */
 static int local_apic_timer_verify_ok;
@@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 /* Local APIC was disabled by the BIOS and enabled by the kernel */
 static int enabled_via_apicbase;
 
+static unsigned long apic_phys;
+
 /*
  * Get the LAPIC version
  */
@@ -110,7 +110,7 @@ static inline int lapic_get_version(void)
 }
 
 /*
- * Check, if the APIC is integrated or a seperate chip
+ * Check, if the APIC is integrated or a separate chip
  */
 static inline int lapic_is_integrated(void)
 {
@@ -135,9 +135,9 @@ void apic_wait_icr_idle(void)
                cpu_relax();
 }
 
-unsigned long safe_apic_wait_icr_idle(void)
+u32 safe_apic_wait_icr_idle(void)
 {
-       unsigned long send_status;
+       u32 send_status;
        int timeout;
 
        timeout = 0;
@@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void)
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
-void enable_NMI_through_LVT0 (void * dummy)
+void __cpuinit enable_NMI_through_LVT0(void)
 {
        unsigned int v = APIC_DM_NMI;
 
@@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void)
         */
        if (local_apic_timer_disabled) {
                /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1)
+               if (num_possible_cpus() > 1) {
+                       lapic_clockevent.mult = 1;
                        setup_APIC_timer();
+               }
                return;
        }
 
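The mult = 1 above is only a placeholder: even when the local APIC timer is disabled on the command line, the clockevent is still registered on SMP so the broadcast machinery has a per-CPU dummy device, and the generic clockevents layer converts nanoseconds to device ticks with a mult/shift fixed-point factor, so the multiplier must at least be non-zero. A standalone illustration of that scaling (the real conversion lives in the clockevents core; every number below is invented):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int shift = 32;
	uint64_t mult      = 4295;	/* ~ device_freq * 2^32 / 1e9 for a 1 kHz device */
	uint64_t delta_ns  = 1000000;	/* program an event 1 ms from now */

	uint64_t device_ticks = (delta_ns * mult) >> shift;

	printf("program the device for %llu tick(s)\n",
	       (unsigned long long)device_ticks);	/* prints 1 for 1 kHz */
	return 0;
}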
@@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void)
                               "with PM Timer: %ldms instead of 100ms\n",
                               (long)res);
                        /* Correct the lapic counter value */
-                       res = (((u64) delta ) * pm_100ms);
+                       res = (((u64) delta) * pm_100ms);
                        do_div(res, deltapm);
                        printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
                               "%lu (%ld)\n", (unsigned long) res, delta);
@@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void)
 
        local_apic_timer_verify_ok = 1;
 
+       /*
+        * Do a sanity check on the APIC calibration result
+        */
+       if (calibration_result < (1000000 / HZ)) {
+               local_irq_enable();
+               printk(KERN_WARNING
+                      "APIC frequency too slow, disabling apic timer\n");
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1)
+                       setup_APIC_timer();
+               return;
+       }
+
        /* We trust the pm timer based calibration */
        if (!pm_referenced) {
                apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
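Two quick checks of the calibration arithmetic above. The sanity check rejects any APIC timer slower than about 1 MHz: calibration_result counts APIC-timer ticks per jiffy, and a 1 MHz clock delivers exactly 1000000/HZ of them, so with HZ=250 the cut-off is 4000 ticks per 4 ms jiffy; below that the timer is abandoned and, on SMP, only the dummy/broadcast registration is kept. The PM-Timer correction in the earlier hunk rescales the measured APIC count to exactly 100 ms worth of ACPI PM-timer ticks (the PM timer runs at 3.579545 MHz), i.e. res = delta * pm_100ms / deltapm. A standalone check of that rescaling with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t pm_100ms = 3579545ULL / 10;	/* PM-timer ticks in 100 ms */
	uint64_t deltapm = 364000;	/* measured window: ~101.7 ms of PM ticks */
	uint64_t delta   = 2543000;	/* APIC ticks counted in that window */

	uint64_t res = delta * pm_100ms;
	res /= deltapm;			/* the kernel uses do_div() here */

	printf("corrected APIC ticks per 100 ms: %llu\n",
	       (unsigned long long)res);
	return 0;
}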
@@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void)
                return;
        }
 
+       /*
+        * the NMI deadlock-detector uses this.
+        */
        per_cpu(irq_stat, cpu).apic_timer_irqs++;
 
        evt->event_handler(evt);
@@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void)
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
-
-void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
+void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier)
  */
 void clear_local_APIC(void)
 {
-       int maxlvt = lapic_get_maxlvt();
-       unsigned long v;
+       int maxlvt;
+       u32 v;
+
+       /* APIC hasn't been mapped yet */
+       if (!apic_phys)
+               return;
 
+       maxlvt = lapic_get_maxlvt();
        /*
         * Masking an LVT entry can trigger a local APIC error
         * if the vector is zero. Mask LVTERR first to prevent this.
@@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void)
                value |= APIC_LVT_LEVEL_TRIGGER;
        apic_write_around(APIC_LVT1, value);
 
-       if (integrated && !esr_disable) {               /* !82489DX */
+       if (integrated && !esr_disable) {
+               /* !82489DX */
                maxlvt = lapic_get_maxlvt();
                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
                        apic_write(APIC_ESR, 0);
@@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void)
 /*
  * Detect and initialize APIC
  */
-static int __init detect_init_APIC (void)
+static int __init detect_init_APIC(void)
 {
        u32 h, l, features;
 
@@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void)
                printk(KERN_WARNING "Could not enable APIC!\n");
                return -1;
        }
-       set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+       set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
        mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
 
        /* The BIOS may have set up the APIC at some other address */
@@ -1104,8 +1127,6 @@ no_apic:
  */
 void __init init_apic_mappings(void)
 {
-       unsigned long apic_phys;
-
        /*
         * If no local APIC can be found then set up a fake all
         * zeroes page to simulate the local APIC and another
@@ -1164,10 +1185,10 @@ fake_ioapic_page:
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int __init APIC_init_uniprocessor (void)
+int __init APIC_init_uniprocessor(void)
 {
        if (enable_local_apic < 0)
-               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 
        if (!smp_found_config && !cpu_has_apic)
                return -1;
@@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void)
            APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
                       boot_cpu_physical_apicid);
-               clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
                return -1;
        }
 
@@ -1209,50 +1230,6 @@ int __init APIC_init_uniprocessor (void)
        return 0;
 }
 
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
-{
-       enable_local_apic = 1;
-       return 0;
-}
-early_param("lapic", parse_lapic);
-
-static int __init parse_nolapic(char *arg)
-{
-       enable_local_apic = -1;
-       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-       return 0;
-}
-early_param("nolapic", parse_nolapic);
-
-static int __init parse_disable_lapic_timer(char *arg)
-{
-       local_apic_timer_disabled = 1;
-       return 0;
-}
-early_param("nolapic_timer", parse_disable_lapic_timer);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
-       local_apic_timer_c2_ok = 1;
-       return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static int __init apic_set_verbosity(char *str)
-{
-       if (strcmp("debug", str) == 0)
-               apic_verbosity = APIC_DEBUG;
-       else if (strcmp("verbose", str) == 0)
-               apic_verbosity = APIC_VERBOSE;
-       return 1;
-}
-
-__setup("apic=", apic_set_verbosity);
-
-
 /*
  * Local APIC interrupts
  */
@@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs)
           6: Received illegal vector
           7: Illegal register address
        */
-       printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+       printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
                smp_processor_id(), v , v1);
        irq_exit();
 }
@@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                        value = apic_read(APIC_LVT0);
                        value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
                                APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                               APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+                               APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
                        value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
                        value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
                        apic_write_around(APIC_LVT0, value);
@@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs);
 static void apic_pm_activate(void) { }
 
 #endif /* CONFIG_PM */
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+       enable_local_apic = 1;
+       return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+       enable_local_apic = -1;
+       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       return 0;
+}
+early_param("nolapic", parse_nolapic);
+
+static int __init parse_disable_lapic_timer(char *arg)
+{
+       local_apic_timer_disabled = 1;
+       return 0;
+}
+early_param("nolapic_timer", parse_disable_lapic_timer);
+
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+       local_apic_timer_c2_ok = 1;
+       return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+static int __init apic_set_verbosity(char *str)
+{
+       if (strcmp("debug", str) == 0)
+               apic_verbosity = APIC_DEBUG;
+       else if (strcmp("verbose", str) == 0)
+               apic_verbosity = APIC_VERBOSE;
+       return 1;
+}
+__setup("apic=", apic_set_verbosity);
+
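Taken together, the parameters moved above (they now sit after the code that uses them) give the following early knobs: "lapic" force-enables the local APIC, "nolapic" force-disables it, "nolapic_timer" disables only its timer, "lapic_timer_c2_ok" declares the timer safe across C2, and "apic=verbose" or "apic=debug" raises the APIC log level. A typical debugging boot line might therefore carry, for example:

    apic=verbose nolapic_timer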
index fa6cdee6d303a9dbd29b9f198fd2742140ebc0d8..d8d03e09dea24a45e94903ae1c61a646d65777d1 100644 (file)
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
-#include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/clockchips.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/module.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
+#include <asm/hpet.h>
 #include <asm/pgalloc.h>
 #include <asm/mach_apic.h>
 #include <asm/nmi.h>
 #include <asm/idle.h>
 #include <asm/proto.h>
 #include <asm/timex.h>
-#include <asm/hpet.h>
 #include <asm/apic.h>
 
-int apic_verbosity;
 int disable_apic_timer __cpuinitdata;
 static int apic_calibrate_pmtmr __initdata;
+int disable_apic;
 
-/* Local APIC timer works in C2? */
+/* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
-static struct resource *ioapic_resources;
+/*
+ * Debug level, exported for io_apic.c
+ */
+int apic_verbosity;
+
 static struct resource lapic_resource = {
        .name = "Local APIC",
        .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta,
                            struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
                              struct clock_event_device *evt);
-
 static void lapic_timer_broadcast(cpumask_t mask);
-
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen);
+static void apic_pm_activate(void);
 
 static struct clock_event_device lapic_clockevent = {
        .name           = "lapic",
@@ -78,66 +81,45 @@ static struct clock_event_device lapic_clockevent = {
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
-static int lapic_next_event(unsigned long delta,
-                           struct clock_event_device *evt)
+static unsigned long apic_phys;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
 {
-       apic_write(APIC_TMICT, delta);
-       return 0;
+       return GET_APIC_VERSION(apic_read(APIC_LVR));
 }
 
-static void lapic_timer_setup(enum clock_event_mode mode,
-                             struct clock_event_device *evt)
+/*
+ * Check, if the APIC is integrated or a separate chip
+ */
+static inline int lapic_is_integrated(void)
 {
-       unsigned long flags;
-       unsigned int v;
-
-       /* Lapic used as dummy for broadcast ? */
-       if (evt->features & CLOCK_EVT_FEAT_DUMMY)
-               return;
-
-       local_irq_save(flags);
-
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-       case CLOCK_EVT_MODE_ONESHOT:
-               __setup_APIC_LVTT(calibration_result,
-                                 mode != CLOCK_EVT_MODE_PERIODIC, 1);
-               break;
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               v = apic_read(APIC_LVTT);
-               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-               apic_write(APIC_LVTT, v);
-               break;
-       case CLOCK_EVT_MODE_RESUME:
-               /* Nothing to do here */
-               break;
-       }
-
-       local_irq_restore(flags);
+       return 1;
 }
 
 /*
- * Local APIC timer broadcast function
+ * Check, whether this is a modern or a first generation APIC
  */
-static void lapic_timer_broadcast(cpumask_t mask)
+static int modern_apic(void)
 {
-#ifdef CONFIG_SMP
-       send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
+       /* AMD systems use old APIC versions, so check the CPU */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+           boot_cpu_data.x86 >= 0xf)
+               return 1;
+       return lapic_get_version() >= 0x14;
 }
 
-static void apic_pm_activate(void);
-
 void apic_wait_icr_idle(void)
 {
        while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
                cpu_relax();
 }
 
-unsigned int safe_apic_wait_icr_idle(void)
+u32 safe_apic_wait_icr_idle(void)
 {
-       unsigned int send_status;
+       u32 send_status;
        int timeout;
 
        timeout = 0;
@@ -151,7 +133,10 @@ unsigned int safe_apic_wait_icr_idle(void)
        return send_status;
 }
 
-void enable_NMI_through_LVT0 (void * dummy)
+/**
+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0
+ */
+void __cpuinit enable_NMI_through_LVT0(void)
 {
        unsigned int v;
 
@@ -160,7 +145,10 @@ void enable_NMI_through_LVT0 (void * dummy)
        apic_write(APIC_LVT0, v);
 }
 
-int get_maxlvt(void)
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
 {
        unsigned int v, maxlvt;
 
@@ -170,176 +158,488 @@ int get_maxlvt(void)
 }
 
 /*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
  */
-void ack_bad_irq(unsigned int irq)
-{
-       printk("unexpected IRQ trap at vector %02x\n", irq);
-       /*
-        * Currently unexpected vectors happen only on SMP and APIC.
-        * We _must_ ack these because every local APIC has only N
-        * irq slots per priority level, and a 'hanging, unacked' IRQ
-        * holds up an irq slot - in excessive cases (when multiple
-        * unexpected vectors occur) that might lock up the APIC
-        * completely.
-        * But don't ack when the APIC is disabled. -AK
-        */
-       if (!disable_apic)
-               ack_APIC_irq();
-}
 
-void clear_local_APIC(void)
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 {
-       int maxlvt;
-       unsigned int v;
+       unsigned int lvtt_value, tmp_value;
 
-       maxlvt = get_maxlvt();
+       lvtt_value = LOCAL_TIMER_VECTOR;
+       if (!oneshot)
+               lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+       if (!irqen)
+               lvtt_value |= APIC_LVT_MASKED;
 
-       /*
-        * Masking an LVT entry can trigger a local APIC error
-        * if the vector is zero. Mask LVTERR first to prevent this.
-        */
-       if (maxlvt >= 3) {
-               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
-       }
-       /*
-        * Careful: we have to set masks only first to deassert
-        * any level-triggered sources.
-        */
-       v = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT0);
-       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT1);
-       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
-       if (maxlvt >= 4) {
-               v = apic_read(APIC_LVTPC);
-               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
-       }
+       apic_write(APIC_LVTT, lvtt_value);
 
        /*
-        * Clean APIC state for other OSs:
+        * Divide PICLK by 16
         */
-       apic_write(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write(APIC_LVT1, APIC_LVT_MASKED);
-       if (maxlvt >= 3)
-               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
-       if (maxlvt >= 4)
-               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
+       tmp_value = apic_read(APIC_TDCR);
+       apic_write(APIC_TDCR, (tmp_value
+                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+                               | APIC_TDR_DIV_16);
+
+       if (!oneshot)
+               apic_write(APIC_TMICT, clocks);
 }
 
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-       /* Go back to Virtual Wire compatibility mode */
-       unsigned long value;
+/*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ */
 
-       /* For the spurious interrupt use vector F, and enable it */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       value |= APIC_SPIV_APIC_ENABLED;
-       value |= 0xf;
-       apic_write(APIC_SPIV, value);
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
 
-       if (!virt_wire_setup) {
-               /*
-                * For LVT0 make it edge triggered, active high,
-                * external and enabled
-                */
-               value = apic_read(APIC_LVT0);
-               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
-               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-               apic_write(APIC_LVT0, value);
-       } else {
-               /* Disable LVT0 */
-               apic_write(APIC_LVT0, APIC_LVT_MASKED);
-       }
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+       unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+       unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
 
-       /* For LVT1 make it edge triggered, active high, nmi and enabled */
-       value = apic_read(APIC_LVT1);
-       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-       apic_write(APIC_LVT1, value);
+       apic_write(reg, v);
 }
 
-void disable_local_APIC(void)
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
 {
-       unsigned int value;
+       setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+       return APIC_EILVT_LVTOFF_MCE;
+}
 
-       clear_local_APIC();
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+       setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+       return APIC_EILVT_LVTOFF_IBS;
+}
 
-       /*
-        * Disable APIC (implies clearing of registers
-        * for 82489DX!).
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write(APIC_SPIV, value);
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+                           struct clock_event_device *evt)
+{
+       apic_write(APIC_TMICT, delta);
+       return 0;
 }
 
-void lapic_shutdown(void)
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+                             struct clock_event_device *evt)
 {
        unsigned long flags;
+       unsigned int v;
 
-       if (!cpu_has_apic)
+       /* Lapic used as dummy for broadcast ? */
+       if (evt->features & CLOCK_EVT_FEAT_DUMMY)
                return;
 
        local_irq_save(flags);
 
-       disable_local_APIC();
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+       case CLOCK_EVT_MODE_ONESHOT:
+               __setup_APIC_LVTT(calibration_result,
+                                 mode != CLOCK_EVT_MODE_PERIODIC, 1);
+               break;
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               v = apic_read(APIC_LVTT);
+               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+               apic_write(APIC_LVTT, v);
+               break;
+       case CLOCK_EVT_MODE_RESUME:
+               /* Nothing to do here */
+               break;
+       }
 
        local_irq_restore(flags);
 }
 
 /*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
+ * Local APIC timer broadcast function
  */
-int __init verify_local_APIC(void)
+static void lapic_timer_broadcast(cpumask_t mask)
 {
-       unsigned int reg0, reg1;
+#ifdef CONFIG_SMP
+       send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
 
-       /*
-        * The version register is read-only in a real APIC.
-        */
-       reg0 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-       reg1 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void setup_APIC_timer(void)
+{
+       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
-       /*
-        * The two version reads above should print the same
-        * numbers.  If the second one is different, then we
-        * poke at a non-APIC.
-        */
-       if (reg1 != reg0)
-               return 0;
+       memcpy(levt, &lapic_clockevent, sizeof(*levt));
+       levt->cpumask = cpumask_of_cpu(smp_processor_id());
 
-       /*
-        * Check if the version looks reasonably.
-        */
-       reg1 = GET_APIC_VERSION(reg0);
-       if (reg1 == 0x00 || reg1 == 0xff)
-               return 0;
-       reg1 = get_maxlvt();
-       if (reg1 < 0x02 || reg1 == 0xff)
-               return 0;
+       clockevents_register_device(levt);
+}
 
-       /*
-        * The ID register is read/write in a real APIC.
-        */
+/*
+ * In this function we calibrate APIC bus clocks to the external
+ * timer. Unfortunately we cannot use jiffies and the timer irq
+ * to calibrate, since some later bootup code depends on getting
+ * the first irq? Ugh.
+ *
+ * We want to do the calibration only once since we
+ * want to have local timer irqs synchronized. CPUs connected
+ * by the same APIC bus have the very same bus frequency.
+ * And we want to have irqs off anyways, no accidental
+ * APIC irq that way.
+ */
+
+#define TICK_COUNT 100000000
+
+static void __init calibrate_APIC_clock(void)
+{
+       unsigned apic, apic_start;
+       unsigned long tsc, tsc_start;
+       int result;
+
+       local_irq_disable();
+
+       /*
+        * Put whatever arbitrary (but long enough) timeout
+        * value into the APIC clock, we just want to get the
+        * counter running for calibration.
+        *
+        * No interrupt enable !
+        */
+       __setup_APIC_LVTT(250000000, 0, 0);
+
+       apic_start = apic_read(APIC_TMCCT);
+#ifdef CONFIG_X86_PM_TIMER
+       if (apic_calibrate_pmtmr && pmtmr_ioport) {
+               pmtimer_wait(5000);  /* 5ms wait */
+               apic = apic_read(APIC_TMCCT);
+               result = (apic_start - apic) * 1000L / 5;
+       } else
+#endif
+       {
+               rdtscll(tsc_start);
+
+               do {
+                       apic = apic_read(APIC_TMCCT);
+                       rdtscll(tsc);
+               } while ((tsc - tsc_start) < TICK_COUNT &&
+                               (apic_start - apic) < TICK_COUNT);
+
+               result = (apic_start - apic) * 1000L * tsc_khz /
+                                       (tsc - tsc_start);
+       }
+
+       local_irq_enable();
+
+       printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
+
+       printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
+               result / 1000 / 1000, result / 1000 % 1000);
+
+       /* Calculate the scaled math multiplication factor */
+       lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
+       lapic_clockevent.max_delta_ns =
+               clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+       lapic_clockevent.min_delta_ns =
+               clockevent_delta2ns(0xF, &lapic_clockevent);
+
+       calibration_result = result / HZ;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+       /*
+        * The local apic timer can be disabled via the kernel commandline.
+        * Register the lapic timer as a dummy clock event source on SMP
+        * systems, so the broadcast mechanism is used. On UP systems simply
+        * ignore it.
+        */
+       if (disable_apic_timer) {
+               printk(KERN_INFO "Disabling APIC timer\n");
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1) {
+                       lapic_clockevent.mult = 1;
+                       setup_APIC_timer();
+               }
+               return;
+       }
+
+       printk(KERN_INFO "Using local APIC timer interrupts.\n");
+       calibrate_APIC_clock();
+
+       /*
+        * Do a sanity check on the APIC calibration result
+        */
+       if (calibration_result < (1000000 / HZ)) {
+               printk(KERN_WARNING
+                      "APIC frequency too slow, disabling apic timer\n");
+               /* No broadcast on UP ! */
+               if (num_possible_cpus() > 1)
+                       setup_APIC_timer();
+               return;
+       }
+
+       /*
+        * If nmi_watchdog is set to IO_APIC, we need the
+        * PIT/HPET going.  Otherwise register lapic as a dummy
+        * device.
+        */
+       if (nmi_watchdog != NMI_IO_APIC)
+               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+       else
+               printk(KERN_WARNING "APIC timer registered as dummy,"
+                      " due to nmi_watchdog=1!\n");
+
+       setup_APIC_timer();
+}
+
+/*
+ * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
+ * C1E flag only in the secondary CPU, so when we detect the wreckage
+ * we already have enabled the boot CPU local apic timer. Check, if
+ * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
+ * set the DUMMY flag again and force the broadcast mode in the
+ * clockevents layer.
+ */
+void __cpuinit check_boot_apic_timer_broadcast(void)
+{
+       if (!disable_apic_timer ||
+           (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
+               return;
+
+       printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
+       lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
+
+       local_irq_enable();
+       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
+       local_irq_disable();
+}
+
+void __cpuinit setup_secondary_APIC_clock(void)
+{
+       check_boot_apic_timer_broadcast();
+       setup_APIC_timer();
+}
+
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
+{
+       int cpu = smp_processor_id();
+       struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+
+       /*
+        * Normally we should not be here till LAPIC has been initialized but
+        * in some cases like kdump, it's possible that there is a pending LAPIC
+        * timer interrupt from previous kernel's context and is delivered in
+        * new kernel the moment interrupts are enabled.
+        *
+        * Interrupts are enabled early and LAPIC is setup much later, hence
+        * it's possible that when we get here evt->event_handler is NULL.
+        * Check for event_handler being NULL and discard the interrupt as
+        * spurious.
+        */
+       if (!evt->event_handler) {
+               printk(KERN_WARNING
+                      "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+               /* Switch it off */
+               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+               return;
+       }
+
+       /*
+        * the NMI deadlock-detector uses this.
+        */
+       add_pda(apic_timer_irqs, 1);
+
+       evt->event_handler(evt);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+void smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       /*
+        * NOTE! We'd better ACK the irq immediately,
+        * because timer handling can be slow.
+        */
+       ack_APIC_irq();
+       /*
+        * update_process_times() expects us to have done irq_enter().
+        * Besides, if we don't, timer interrupts ignore the global
+        * interrupt lock, which is the WrongThing (tm) to do.
+        */
+       exit_idle();
+       irq_enter();
+       local_apic_timer_interrupt();
+       irq_exit();
+       set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called when a CPU is disabled and before rebooting, so the state
+ * of the local APIC has no dangling leftovers. Also used to clean out any
+ * BIOS leftovers during boot.
+ */
+void clear_local_APIC(void)
+{
+       int maxlvt;
+       u32 v;
+
+       /* APIC hasn't been mapped yet */
+       if (!apic_phys)
+               return;
+
+       maxlvt = lapic_get_maxlvt();
+       /*
+        * Masking an LVT entry can trigger a local APIC error
+        * if the vector is zero. Mask LVTERR first to prevent this.
+        */
+       if (maxlvt >= 3) {
+               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+       }
+       /*
+        * Careful: we have to set masks only first to deassert
+        * any level-triggered sources.
+        */
+       v = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT1);
+       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
+       if (maxlvt >= 4) {
+               v = apic_read(APIC_LVTPC);
+               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
+       }
+
+       /*
+        * Clean APIC state for other OSs:
+        */
+       apic_write(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, APIC_LVT_MASKED);
+       if (maxlvt >= 3)
+               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+       unsigned int value;
+
+       clear_local_APIC();
+
+       /*
+        * Disable APIC (implies clearing of registers
+        * for 82489DX!).
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_SPIV_APIC_ENABLED;
+       apic_write(APIC_SPIV, value);
+}
+
+void lapic_shutdown(void)
+{
+       unsigned long flags;
+
+       if (!cpu_has_apic)
+               return;
+
+       local_irq_save(flags);
+
+       disable_local_APIC();
+
+       local_irq_restore(flags);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+       unsigned int reg0, reg1;
+
+       /*
+        * The version register is read-only in a real APIC.
+        */
+       reg0 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+       reg1 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+       /*
+        * The two version reads above should print the same
+        * numbers.  If the second one is different, then we
+        * poke at a non-APIC.
+        */
+       if (reg1 != reg0)
+               return 0;
+
+       /*
+        * Check if the version looks reasonable.
+        */
+       reg1 = GET_APIC_VERSION(reg0);
+       if (reg1 == 0x00 || reg1 == 0xff)
+               return 0;
+       reg1 = lapic_get_maxlvt();
+       if (reg1 < 0x02 || reg1 == 0xff)
+               return 0;
+
+       /*
+        * The ID register is read/write in a real APIC.
+        */
        reg0 = apic_read(APIC_ID);
        apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
        apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
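The TSC branch of calibrate_APIC_clock() above is a straight unit conversion: the APIC down-counter ran apic_start - apic ticks while the TSC advanced tsc - tsc_start ticks, and tsc_khz TSC ticks make one millisecond, so the APIC-timer frequency in Hz is apic_delta * 1000 * tsc_khz / tsc_delta; calibration_result then divides by HZ to get ticks per jiffy. A standalone check of the formula with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t tsc_khz    = 2000000;		/* 2.0 GHz TSC */
	uint64_t apic_delta = 100000000;	/* APIC_TMCCT counted down this far */
	uint64_t tsc_delta  = 800000000;	/* TSC ticks in the same window: 400 ms */

	uint64_t result = apic_delta * 1000 * tsc_khz / tsc_delta;

	printf("APIC timer frequency: %llu Hz\n",
	       (unsigned long long)result);	/* prints 250000000, i.e. 250 MHz */
	return 0;
}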
@@ -355,18 +655,20 @@ int __init verify_local_APIC(void)
         * compatibility mode, but most boxes are anymore.
         */
        reg0 = apic_read(APIC_LVT0);
-       apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
+       apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
        reg1 = apic_read(APIC_LVT1);
        apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
 
        return 1;
 }
 
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
 void __init sync_Arb_IDs(void)
 {
        /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
-       unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-       if (ver >= 0x14)        /* P4 or higher */
+       if (modern_apic())
                return;
 
        /*
@@ -418,9 +720,12 @@ void __init init_bsp_APIC(void)
        apic_write(APIC_LVT1, value);
 }
 
-void __cpuinit setup_local_APIC (void)
+/**
+ * setup_local_APIC - setup the local APIC
+ */
+void __cpuinit setup_local_APIC(void)
 {
-       unsigned int value, maxlvt;
+       unsigned int value;
        int i, j;
 
        value = apic_read(APIC_LVR);
@@ -516,183 +821,27 @@ void __cpuinit setup_local_APIC (void)
        else
                value = APIC_DM_NMI | APIC_LVT_MASKED;
        apic_write(APIC_LVT1, value);
-
-       {
-               unsigned oldvalue;
-               maxlvt = get_maxlvt();
-               oldvalue = apic_read(APIC_ESR);
-               value = ERROR_APIC_VECTOR;      // enables sending errors
-               apic_write(APIC_LVTERR, value);
-               /*
-                * spec says clear errors after enabling vector.
-                */
-               if (maxlvt > 3)
-                       apic_write(APIC_ESR, 0);
-               value = apic_read(APIC_ESR);
-               if (value != oldvalue)
-                       apic_printk(APIC_VERBOSE,
-                       "ESR value after enabling vector: %08x, after %08x\n",
-                       oldvalue, value);
-       }
-
-       nmi_watchdog_default();
-       setup_apic_nmi_watchdog(NULL);
-       apic_pm_activate();
-}
-
-#ifdef CONFIG_PM
-
-static struct {
-       /* 'active' is true if the local APIC was enabled by us and
-          not the BIOS; this signifies that we are also responsible
-          for disabling it before entering apm/acpi suspend */
-       int active;
-       /* r/w apic fields */
-       unsigned int apic_id;
-       unsigned int apic_taskpri;
-       unsigned int apic_ldr;
-       unsigned int apic_dfr;
-       unsigned int apic_spiv;
-       unsigned int apic_lvtt;
-       unsigned int apic_lvtpc;
-       unsigned int apic_lvt0;
-       unsigned int apic_lvt1;
-       unsigned int apic_lvterr;
-       unsigned int apic_tmict;
-       unsigned int apic_tdcr;
-       unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = get_maxlvt();
-
-       apic_pm_state.apic_id = apic_read(APIC_ID);
-       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
-       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
-       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
-       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-       if (maxlvt >= 4)
-               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
-       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
-       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
-       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
-       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
-       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
-       if (maxlvt >= 5)
-               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-       local_irq_save(flags);
-       disable_local_APIC();
-       local_irq_restore(flags);
-       return 0;
 }
 
-static int lapic_resume(struct sys_device *dev)
-{
-       unsigned int l, h;
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = get_maxlvt();
-
-       local_irq_save(flags);
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       l &= ~MSR_IA32_APICBASE_BASE;
-       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-       wrmsr(MSR_IA32_APICBASE, l, h);
-       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
-       apic_write(APIC_ID, apic_pm_state.apic_id);
-       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
-       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
-       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
-       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
-       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
-       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
-       if (maxlvt >= 5)
-               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
-       if (maxlvt >= 4)
-               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
-       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
-       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
-       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       local_irq_restore(flags);
-       return 0;
-}
-
-static struct sysdev_class lapic_sysclass = {
-       .name           = "lapic",
-       .resume         = lapic_resume,
-       .suspend        = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
-       .id             = 0,
-       .cls            = &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
+void __cpuinit lapic_setup_esr(void)
 {
-       apic_pm_state.active = 1;
-}
+       unsigned maxlvt = lapic_get_maxlvt();
 
-static int __init init_lapic_sysfs(void)
-{
-       int error;
-       if (!cpu_has_apic)
-               return 0;
-       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-       error = sysdev_class_register(&lapic_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic);
-       return error;
+       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
+       /*
+        * spec says clear errors after enabling vector.
+        */
+       if (maxlvt > 3)
+               apic_write(APIC_ESR, 0);
 }
-device_initcall(init_lapic_sysfs);
-
-#else  /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
 
-static int __init apic_set_verbosity(char *str)
+void __cpuinit end_local_APIC_setup(void)
 {
-       if (str == NULL)  {
-               skip_ioapic_setup = 0;
-               ioapic_force = 1;
-               return 0;
-       }
-       if (strcmp("debug", str) == 0)
-               apic_verbosity = APIC_DEBUG;
-       else if (strcmp("verbose", str) == 0)
-               apic_verbosity = APIC_VERBOSE;
-       else {
-               printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-                               " use apic=verbose or apic=debug\n", str);
-               return -EINVAL;
-       }
-
-       return 0;
+       lapic_setup_esr();
+       nmi_watchdog_default();
+       setup_apic_nmi_watchdog(NULL);
+       apic_pm_activate();
 }
-early_param("apic", apic_set_verbosity);
 
 /*
  * Detect and enable local APICs on non-SMP boards.
@@ -700,81 +849,23 @@ early_param("apic", apic_set_verbosity);
  * On AMD64 we trust the BIOS - if it says no APIC it is likely
  * not correctly set up (usually the APIC timer won't work etc.)
  */
-
-static int __init detect_init_APIC (void)
+static int __init detect_init_APIC(void)
 {
        if (!cpu_has_apic) {
                printk(KERN_INFO "No local APIC present\n");
-               return -1;
-       }
-
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-       boot_cpu_id = 0;
-       return 0;
-}
-
-#ifdef CONFIG_X86_IO_APIC
-static struct resource * __init ioapic_setup_resources(void)
-{
-#define IOAPIC_RESOURCE_NAME_SIZE 11
-       unsigned long n;
-       struct resource *res;
-       char *mem;
-       int i;
-
-       if (nr_ioapics <= 0)
-               return NULL;
-
-       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
-       n *= nr_ioapics;
-
-       mem = alloc_bootmem(n);
-       res = (void *)mem;
-
-       if (mem != NULL) {
-               memset(mem, 0, n);
-               mem += sizeof(struct resource) * nr_ioapics;
-
-               for (i = 0; i < nr_ioapics; i++) {
-                       res[i].name = mem;
-                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-                       sprintf(mem,  "IOAPIC %u", i);
-                       mem += IOAPIC_RESOURCE_NAME_SIZE;
-               }
-       }
-
-       ioapic_resources = res;
-
-       return res;
-}
-
-static int __init ioapic_insert_resources(void)
-{
-       int i;
-       struct resource *r = ioapic_resources;
-
-       if (!r) {
-               printk("IO APIC resources could be not be allocated.\n");
-               return -1;
-       }
-
-       for (i = 0; i < nr_ioapics; i++) {
-               insert_resource(&iomem_resource, r);
-               r++;
+               return -1;
        }
 
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+       boot_cpu_id = 0;
        return 0;
 }
 
-/* Insert the IO APIC resources after PCI initialization has occured to handle
- * IO APICS that are mapped in on a BAR in PCI space. */
-late_initcall(ioapic_insert_resources);
-#endif
-
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
 void __init init_apic_mappings(void)
 {
-       unsigned long apic_phys;
-
        /*
         * If no local APIC can be found then set up a fake all
         * zeroes page to simulate the local APIC and another
@@ -800,295 +891,281 @@ void __init init_apic_mappings(void)
         * default configuration (or the MP table is broken).
         */
        boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-
-       {
-               unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-               int i;
-               struct resource *ioapic_res;
-
-               ioapic_res = ioapic_setup_resources();
-               for (i = 0; i < nr_ioapics; i++) {
-                       if (smp_found_config) {
-                               ioapic_phys = mp_ioapics[i].mpc_apicaddr;
-                       } else {
-                               ioapic_phys = (unsigned long)
-                                       alloc_bootmem_pages(PAGE_SIZE);
-                               ioapic_phys = __pa(ioapic_phys);
-                       }
-                       set_fixmap_nocache(idx, ioapic_phys);
-                       apic_printk(APIC_VERBOSE,
-                                   "mapped IOAPIC to %016lx (%016lx)\n",
-                                   __fix_to_virt(idx), ioapic_phys);
-                       idx++;
-
-                       if (ioapic_res != NULL) {
-                               ioapic_res->start = ioapic_phys;
-                               ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
-                               ioapic_res++;
-                       }
-               }
-       }
 }
 
 /*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
  */
-
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+int __init APIC_init_uniprocessor(void)
 {
-       unsigned int lvtt_value, tmp_value;
+       if (disable_apic) {
+               printk(KERN_INFO "Apic disabled\n");
+               return -1;
+       }
+       if (!cpu_has_apic) {
+               disable_apic = 1;
+               printk(KERN_INFO "Apic disabled by BIOS\n");
+               return -1;
+       }
 
-       lvtt_value = LOCAL_TIMER_VECTOR;
-       if (!oneshot)
-               lvtt_value |= APIC_LVT_TIMER_PERIODIC;
-       if (!irqen)
-               lvtt_value |= APIC_LVT_MASKED;
+       verify_local_APIC();
 
-       apic_write(APIC_LVTT, lvtt_value);
+       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
+       apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
+
+       setup_local_APIC();
 
        /*
-        * Divide PICLK by 16
+        * Now enable IO-APICs, actually call clear_IO_APIC
+        * We need clear_IO_APIC before enabling vector on BP
         */
-       tmp_value = apic_read(APIC_TDCR);
-       apic_write(APIC_TDCR, (tmp_value
-                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-                               | APIC_TDR_DIV_16);
-
-       if (!oneshot)
-               apic_write(APIC_TMICT, clocks);
-}
-
-static void setup_APIC_timer(void)
-{
-       struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+       if (!skip_ioapic_setup && nr_ioapics)
+               enable_IO_APIC();
 
-       memcpy(levt, &lapic_clockevent, sizeof(*levt));
-       levt->cpumask = cpumask_of_cpu(smp_processor_id());
+       end_local_APIC_setup();
 
-       clockevents_register_device(levt);
+       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+       else
+               nr_ioapics = 0;
+       setup_boot_APIC_clock();
+       check_nmi_watchdog();
+       return 0;
 }
 
 /*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
+ * Local APIC interrupts
  */
 
-#define TICK_COUNT 100000000
-
-static void __init calibrate_APIC_clock(void)
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+asmlinkage void smp_spurious_interrupt(void)
 {
-       unsigned apic, apic_start;
-       unsigned long tsc, tsc_start;
-       int result;
-
-       local_irq_disable();
-
+       unsigned int v;
+       exit_idle();
+       irq_enter();
        /*
-        * Put whatever arbitrary (but long enough) timeout
-        * value into the APIC clock, we just want to get the
-        * counter running for calibration.
-        *
-        * No interrupt enable !
+        * Check if this really is a spurious interrupt and ACK it
+        * if it is a vectored one.  Just in case...
+        * Spurious interrupts should not be ACKed.
         */
-       __setup_APIC_LVTT(250000000, 0, 0);
-
-       apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
-       if (apic_calibrate_pmtmr && pmtmr_ioport) {
-               pmtimer_wait(5000);  /* 5ms wait */
-               apic = apic_read(APIC_TMCCT);
-               result = (apic_start - apic) * 1000L / 5;
-       } else
-#endif
-       {
-               rdtscll(tsc_start);
-
-               do {
-                       apic = apic_read(APIC_TMCCT);
-                       rdtscll(tsc);
-               } while ((tsc - tsc_start) < TICK_COUNT &&
-                               (apic_start - apic) < TICK_COUNT);
-
-               result = (apic_start - apic) * 1000L * tsc_khz /
-                                       (tsc - tsc_start);
-       }
-
-       local_irq_enable();
+       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+               ack_APIC_irq();
 
-       printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
+       add_pda(irq_spurious_count, 1);
+       irq_exit();
+}
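The ISR probe above relies on the 256-bit in-service register set being split across eight 32-bit words spaced 0x10 apart, which is exactly what the ((vector & ~0x1f) >> 1) offset and the (vector & 0x1f) bit position compute. A standalone sketch of the same arithmetic (the APIC_ISR offset and the sample vector are assumptions for the example):

#include <stdio.h>

#define APIC_ISR 0x100  /* base offset of the first In-Service Register */

int main(void)
{
        unsigned int vector = 0xff;  /* e.g. the spurious vector */

        /* (vector / 32) * 0x10 == ((vector & ~0x1f) >> 1) */
        unsigned int reg = APIC_ISR + ((vector & ~0x1f) >> 1);
        unsigned int bit = vector & 0x1f;

        printf("vector 0x%02x -> register offset 0x%03x, bit %u\n",
               vector, reg, bit);
        return 0;
}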
 
-       printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
-               result / 1000 / 1000, result / 1000 % 1000);
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+asmlinkage void smp_error_interrupt(void)
+{
+       unsigned int v, v1;
 
-       /* Calculate the scaled math multiplication factor */
-       lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
-       lapic_clockevent.max_delta_ns =
-               clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
-       lapic_clockevent.min_delta_ns =
-               clockevent_delta2ns(0xF, &lapic_clockevent);
+       exit_idle();
+       irq_enter();
+       /* First tickle the hardware, only then report what went on. -- REW */
+       v = apic_read(APIC_ESR);
+       apic_write(APIC_ESR, 0);
+       v1 = apic_read(APIC_ESR);
+       ack_APIC_irq();
+       atomic_inc(&irq_err_count);
 
-       calibration_result = result / HZ;
+       /* Here is what the APIC error bits mean:
+          0: Send CS error
+          1: Receive CS error
+          2: Send accept error
+          3: Receive accept error
+          4: Reserved
+          5: Send illegal vector
+          6: Received illegal vector
+          7: Illegal register address
+       */
+       printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+               smp_processor_id(), v , v1);
+       irq_exit();
 }
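As a companion to the bit list in the comment above, a small userspace sketch that decodes a raw ESR value into those names (the sample value is made up):

#include <stdio.h>

/* Bit meanings copied from the comment in smp_error_interrupt() above. */
static const char * const esr_bits[8] = {
        "Send CS error", "Receive CS error",
        "Send accept error", "Receive accept error",
        "Reserved",
        "Send illegal vector", "Received illegal vector",
        "Illegal register address",
};

int main(void)
{
        unsigned int esr = 0x40;  /* sample value: bit 6 set */
        int i;

        for (i = 0; i < 8; i++)
                if (esr & (1u << i))
                        printf("ESR bit %d: %s\n", i, esr_bits[i]);
        return 0;
}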
 
-void __init setup_boot_APIC_clock (void)
+void disconnect_bsp_APIC(int virt_wire_setup)
 {
-       /*
-        * The local apic timer can be disabled via the kernel commandline.
-        * Register the lapic timer as a dummy clock event source on SMP
-        * systems, so the broadcast mechanism is used. On UP systems simply
-        * ignore it.
-        */
-       if (disable_apic_timer) {
-               printk(KERN_INFO "Disabling APIC timer\n");
-               /* No broadcast on UP ! */
-               if (num_possible_cpus() > 1)
-                       setup_APIC_timer();
-               return;
-       }
+       /* Go back to Virtual Wire compatibility mode */
+       unsigned long value;
 
-       printk(KERN_INFO "Using local APIC timer interrupts.\n");
-       calibrate_APIC_clock();
+       /* For the spurious interrupt use vector F, and enable it */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       value |= APIC_SPIV_APIC_ENABLED;
+       value |= 0xf;
+       apic_write(APIC_SPIV, value);
 
-       /*
-        * If nmi_watchdog is set to IO_APIC, we need the
-        * PIT/HPET going.  Otherwise register lapic as a dummy
-        * device.
-        */
-       if (nmi_watchdog != NMI_IO_APIC)
-               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-       else
-               printk(KERN_WARNING "APIC timer registered as dummy,"
-                      " due to nmi_watchdog=1!\n");
+       if (!virt_wire_setup) {
+               /*
+                * For LVT0 make it edge triggered, active high,
+                * external and enabled
+                */
+               value = apic_read(APIC_LVT0);
+               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+               apic_write(APIC_LVT0, value);
+       } else {
+               /* Disable LVT0 */
+               apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       }
 
-       setup_APIC_timer();
+       /* For LVT1 make it edge triggered, active high, nmi and enabled */
+       value = apic_read(APIC_LVT1);
+       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+       apic_write(APIC_LVT1, value);
 }
 
 /*
- * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
- * C1E flag only in the secondary CPU, so when we detect the wreckage
- * we already have enabled the boot CPU local apic timer. Check, if
- * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
- * set the DUMMY flag again and force the broadcast mode in the
- * clockevents layer.
+ * Power management
  */
-void __cpuinit check_boot_apic_timer_broadcast(void)
+#ifdef CONFIG_PM
+
+static struct {
+       /* 'active' is true if the local APIC was enabled by us and
+          not the BIOS; this signifies that we are also responsible
+          for disabling it before entering apm/acpi suspend */
+       int active;
+       /* r/w apic fields */
+       unsigned int apic_id;
+       unsigned int apic_taskpri;
+       unsigned int apic_ldr;
+       unsigned int apic_dfr;
+       unsigned int apic_spiv;
+       unsigned int apic_lvtt;
+       unsigned int apic_lvtpc;
+       unsigned int apic_lvt0;
+       unsigned int apic_lvt1;
+       unsigned int apic_lvterr;
+       unsigned int apic_tmict;
+       unsigned int apic_tdcr;
+       unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
-       if (!disable_apic_timer ||
-           (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
-               return;
+       unsigned long flags;
+       int maxlvt;
 
-       printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
-       lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
+       if (!apic_pm_state.active)
+               return 0;
 
-       local_irq_enable();
-       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
-       local_irq_disable();
-}
+       maxlvt = lapic_get_maxlvt();
 
-void __cpuinit setup_secondary_APIC_clock(void)
-{
-       check_boot_apic_timer_broadcast();
-       setup_APIC_timer();
+       apic_pm_state.apic_id = apic_read(APIC_ID);
+       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+       if (maxlvt >= 4)
+               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 5)
+               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+       local_irq_save(flags);
+       disable_local_APIC();
+       local_irq_restore(flags);
+       return 0;
 }
 
-int setup_profiling_timer(unsigned int multiplier)
+static int lapic_resume(struct sys_device *dev)
 {
-       return -EINVAL;
-}
+       unsigned int l, h;
+       unsigned long flags;
+       int maxlvt;
 
-void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
-                            unsigned char msg_type, unsigned char mask)
-{
-       unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
-       unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-       apic_write(reg, v);
-}
+       if (!apic_pm_state.active)
+               return 0;
 
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
+       maxlvt = lapic_get_maxlvt();
 
-void smp_local_timer_interrupt(void)
-{
-       int cpu = smp_processor_id();
-       struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+       local_irq_save(flags);
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       l &= ~MSR_IA32_APICBASE_BASE;
+       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+       wrmsr(MSR_IA32_APICBASE, l, h);
+       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+       apic_write(APIC_ID, apic_pm_state.apic_id);
+       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 5)
+               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       local_irq_restore(flags);
+       return 0;
+}
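The rdmsr/wrmsr sequence in lapic_resume() clears the base-address field of IA32_APICBASE and rewrites it with the enable bit plus mp_lapic_addr. A userspace sketch of just that bit manipulation; the constants mirror the x86 header definitions of this era and the starting value is made up:

#include <stdio.h>

#define MSR_IA32_APICBASE_ENABLE  (1u << 11)
#define MSR_IA32_APICBASE_BASE    0xfffff000u
#define APIC_DEFAULT_PHYS_BASE    0xfee00000u

int main(void)
{
        unsigned int l = 0x00000900;  /* pretend low word read back after suspend */

        /* same transformation lapic_resume() applies to the MSR low word */
        l &= ~MSR_IA32_APICBASE_BASE;
        l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;

        printf("new IA32_APICBASE low word: 0x%08x\n", l);
        return 0;
}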
 
-       /*
-        * Normally we should not be here till LAPIC has been initialized but
-        * in some cases like kdump, its possible that there is a pending LAPIC
-        * timer interrupt from previous kernel's context and is delivered in
-        * new kernel the moment interrupts are enabled.
-        *
-        * Interrupts are enabled early and LAPIC is setup much later, hence
-        * its possible that when we get here evt->event_handler is NULL.
-        * Check for event_handler being NULL and discard the interrupt as
-        * spurious.
-        */
-       if (!evt->event_handler) {
-               printk(KERN_WARNING
-                      "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
-               /* Switch it off */
-               lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
-               return;
-       }
+static struct sysdev_class lapic_sysclass = {
+       .name           = "lapic",
+       .resume         = lapic_resume,
+       .suspend        = lapic_suspend,
+};
 
-       /*
-        * the NMI deadlock-detector uses this.
-        */
-       add_pda(apic_timer_irqs, 1);
+static struct sys_device device_lapic = {
+       .id     = 0,
+       .cls    = &lapic_sysclass,
+};
 
-       evt->event_handler(evt);
+static void __cpuinit apic_pm_activate(void)
+{
+       apic_pm_state.active = 1;
 }
 
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+static int __init init_lapic_sysfs(void)
 {
-       struct pt_regs *old_regs = set_irq_regs(regs);
+       int error;
 
-       /*
-        * NOTE! We'd better ACK the irq immediately,
-        * because timer handling can be slow.
-        */
-       ack_APIC_irq();
-       /*
-        * update_process_times() expects us to have done irq_enter().
-        * Besides, if we don't timer interrupts ignore the global
-        * interrupt lock, which is the WrongThing (tm) to do.
-        */
-       exit_idle();
-       irq_enter();
-       smp_local_timer_interrupt();
-       irq_exit();
-       set_irq_regs(old_regs);
+       if (!cpu_has_apic)
+               return 0;
+       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+
+       error = sysdev_class_register(&lapic_sysclass);
+       if (!error)
+               error = sysdev_register(&device_lapic);
+       return error;
 }
+device_initcall(init_lapic_sysfs);
+
+#else  /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
 
 /*
  * apic_is_clustered_box() -- Check if we can expect good TSC
@@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void)
 {
        int i, clusters, zeros;
        unsigned id;
+       u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
        DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
 
        bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
        for (i = 0; i < NR_CPUS; i++) {
-               id = bios_cpu_apicid[i];
+               /* are we being called early in kernel startup? */
+               if (bios_cpu_apicid) {
+                       id = bios_cpu_apicid[i];
+               }
+               else if (i < nr_cpu_ids) {
+                       if (cpu_present(i))
+                               id = per_cpu(x86_bios_cpu_apicid, i);
+                       else
+                               continue;
+               }
+               else
+                       break;
+
                if (id != BAD_APICID)
                        __set_bit(APIC_CLUSTERID(id), clustermap);
        }
 
        /* Problem:  Partially populated chassis may not have CPUs in some of
         * the APIC clusters they have been allocated.  Only present CPUs have
-        * bios_cpu_apicid entries, thus causing zeroes in the bitmap.  Since
-        * clusters are allocated sequentially, count zeros only if they are
-        * bounded by ones.
+        * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+        * Since clusters are allocated sequentially, count zeros only if
+        * they are bounded by ones.
         */
        clusters = 0;
        zeros = 0;
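The counting rule described in the comment above (runs of zeros count only when bounded by ones) can be shown with a tiny standalone model; the real loop walks clustermap over NUM_APIC_CLUSTERS entries, this just uses a small array:

#include <stdio.h>

int main(void)
{
        int map[] = { 1, 0, 0, 1, 1, 0, 0, 0 };  /* trailing zeros are not bounded */
        int n = sizeof(map) / sizeof(map[0]);
        int clusters = 0, zeros = 0, i;

        for (i = 0; i < n; i++) {
                if (map[i]) {
                        clusters += 1 + zeros;  /* zeros between ones count */
                        zeros = 0;
                } else {
                        zeros++;
                }
        }
        printf("clusters counted: %d\n", clusters);
        return 0;
}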
@@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void)
 }
 
 /*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
-       unsigned int v;
-       exit_idle();
-       irq_enter();
-       /*
-        * Check if this really is a spurious interrupt and ACK it
-        * if it is a vectored one.  Just in case...
-        * Spurious interrupts should not be ACKed.
-        */
-       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-               ack_APIC_irq();
-
-       add_pda(irq_spurious_count, 1);
-       irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-
-asmlinkage void smp_error_interrupt(void)
-{
-       unsigned int v, v1;
-
-       exit_idle();
-       irq_enter();
-       /* First tickle the hardware, only then report what went on. -- REW */
-       v = apic_read(APIC_ESR);
-       apic_write(APIC_ESR, 0);
-       v1 = apic_read(APIC_ESR);
-       ack_APIC_irq();
-       atomic_inc(&irq_err_count);
-
-       /* Here is what the APIC error bits mean:
-          0: Send CS error
-          1: Receive CS error
-          2: Send accept error
-          3: Receive accept error
-          4: Reserved
-          5: Send illegal vector
-          6: Received illegal vector
-          7: Illegal register address
-       */
-       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
-               smp_processor_id(), v , v1);
-       irq_exit();
-}
-
-int disable_apic;
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
+ * APIC command line parameters
  */
-int __init APIC_init_uniprocessor (void)
+static int __init apic_set_verbosity(char *str)
 {
-       if (disable_apic) {
-               printk(KERN_INFO "Apic disabled\n");
-               return -1;
+       if (str == NULL)  {
+               skip_ioapic_setup = 0;
+               ioapic_force = 1;
+               return 0;
        }
-       if (!cpu_has_apic) {
-               disable_apic = 1;
-               printk(KERN_INFO "Apic disabled by BIOS\n");
-               return -1;
+       if (strcmp("debug", str) == 0)
+               apic_verbosity = APIC_DEBUG;
+       else if (strcmp("verbose", str) == 0)
+               apic_verbosity = APIC_VERBOSE;
+       else {
+               printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+                               " use apic=verbose or apic=debug\n", str);
+               return -EINVAL;
        }
 
-       verify_local_APIC();
-
-       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
-       apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
-
-       setup_local_APIC();
-
-       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-               setup_IO_APIC();
-       else
-               nr_ioapics = 0;
-       setup_boot_APIC_clock();
-       check_nmi_watchdog();
        return 0;
 }
+early_param("apic", apic_set_verbosity);
 
 static __init int setup_disableapic(char *str)
 {
        disable_apic = 1;
-       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+       clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
        return 0;
 }
 early_param("disableapic", setup_disableapic);
index af045ca0f653e7587fdd39dac737877ba0ebe17f..d4438ef296d8a607f7a703a845040715a8335fd9 100644 (file)
 #include <linux/dmi.h>
 #include <linux/suspend.h>
 #include <linux/kthread.h>
+#include <linux/jiffies.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/paravirt.h>
 #include <asm/reboot.h>
 
-#include "io_ports.h"
-
 #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
 extern int (*console_blank_hook)(int);
 #endif
@@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int);
 /*
  * Ignore suspend events for this amount of time after a resume
  */
-#define DEFAULT_BOUNCE_INTERVAL                (3 * HZ)
+#define DEFAULT_BOUNCE_INTERVAL        (3 * HZ)
 
 /*
  * Maximum number of events stored
@@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int);
  */
 struct apm_user {
        int             magic;
-       struct apm_user *       next;
+       struct apm_user *next;
        unsigned int    suser: 1;
        unsigned int    writer: 1;
        unsigned int    reader: 1;
@@ -372,44 +371,44 @@ struct apm_user {
 static struct {
        unsigned long   offset;
        unsigned short  segment;
-}                              apm_bios_entry;
-static int                     clock_slowed;
-static int                     idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
-static int                     idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int                     set_pm_idle;
-static int                     suspends_pending;
-static int                     standbys_pending;
-static int                     ignore_sys_suspend;
-static int                     ignore_normal_resume;
-static int                     bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-
-static int                     debug __read_mostly;
-static int                     smp __read_mostly;
-static int                     apm_disabled = -1;
+} apm_bios_entry;
+static int clock_slowed;
+static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int set_pm_idle;
+static int suspends_pending;
+static int standbys_pending;
+static int ignore_sys_suspend;
+static int ignore_normal_resume;
+static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static int debug __read_mostly;
+static int smp __read_mostly;
+static int apm_disabled = -1;
 #ifdef CONFIG_SMP
-static int                     power_off;
+static int power_off;
 #else
-static int                     power_off = 1;
+static int power_off = 1;
 #endif
 #ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int                     realmode_power_off = 1;
+static int realmode_power_off = 1;
 #else
-static int                     realmode_power_off;
+static int realmode_power_off;
 #endif
 #ifdef CONFIG_APM_ALLOW_INTS
-static int                     allow_ints = 1;
+static int allow_ints = 1;
 #else
-static int                     allow_ints;
+static int allow_ints;
 #endif
-static int                     broken_psr;
+static int broken_psr;
 
 static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
 static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
-static struct apm_user *       user_list;
+static struct apm_user *user_list;
 static DEFINE_SPINLOCK(user_list_lock);
-static const struct desc_struct        bad_bios_desc = { 0, 0x00409200 };
+static const struct desc_struct        bad_bios_desc = { { { 0, 0x00409200 } } };
 
-static const char              driver_version[] = "1.16ac";    /* no spaces */
+static const char driver_version[] = "1.16ac"; /* no spaces */
 
 static struct task_struct *kapmd_task;
 
@@ -417,7 +416,7 @@ static struct task_struct *kapmd_task;
  *     APM event names taken from the APM 1.2 specification. These are
  *     the message codes that the BIOS uses to tell us about events
  */
-static const char *    const apm_event_name[] = {
+static const char * const apm_event_name[] = {
        "system standby",
        "system suspend",
        "normal resume",
@@ -435,14 +434,14 @@ static const char *       const apm_event_name[] = {
 
 typedef struct lookup_t {
        int     key;
-       char *  msg;
+       char    *msg;
 } lookup_t;
 
 /*
  *     The BIOS returns a set of standard error codes in AX when the
  *     carry flag is set.
  */
+
 static const lookup_t error_table[] = {
 /* N/A { APM_SUCCESS,          "Operation succeeded" }, */
        { APM_DISABLED,         "Power management disabled" },
@@ -472,24 +471,25 @@ static const lookup_t error_table[] = {
  *     Write a meaningful log entry to the kernel log in the event of
  *     an APM error.
  */
+
 static void apm_error(char *str, int err)
 {
-       int     i;
+       int i;
 
        for (i = 0; i < ERROR_COUNT; i++)
-               if (error_table[i].key == err) break;
+               if (error_table[i].key == err)
+                       break;
        if (i < ERROR_COUNT)
                printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
        else
                printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
-                       str, err);
+                      str, err);
 }
 
 /*
  * Lock APM functionality to physical CPU 0
  */
+
 #ifdef CONFIG_SMP
 
 static cpumask_t apm_save_cpus(void)
@@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask)
 /*
  *     No CPU lockdown needed on a uniprocessor
  */
+
 #define apm_save_cpus()                (current->cpus_allowed)
 #define apm_restore_cpus(x)    (void)(x)
 
@@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags)
  *     code is returned in AH (bits 8-15 of eax) and this function
  *     returns non-zero.
  */
+
 static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
        u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
 {
@@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
        struct desc_struct      *gdt;
 
        cpus = apm_save_cpus();
-       
+
        cpu = get_cpu();
        gdt = get_cpu_gdt_table(cpu);
        save_desc_40 = gdt[0x40 / 8];
@@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
        gdt[0x40 / 8] = save_desc_40;
        put_cpu();
        apm_restore_cpus(cpus);
-       
+
        return *eax & 0xff;
 }
 
@@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
        struct desc_struct      *gdt;
 
        cpus = apm_save_cpus();
-       
+
        cpu = get_cpu();
        gdt = get_cpu_gdt_table(cpu);
        save_desc_40 = gdt[0x40 / 8];
@@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
 
 static int apm_driver_version(u_short *val)
 {
-       u32     eax;
+       u32 eax;
 
        if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
                return (eax >> 8) & 0xff;
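Throughout this file the BIOS error code lives in AH, i.e. bits 8-15 of the returned EAX image, which is what the (eax >> 8) & 0xff expression extracts. A trivial sketch with a made-up register value:

#include <stdio.h>

int main(void)
{
        unsigned int eax = 0x0000530b;         /* sample register image */
        unsigned int err = (eax >> 8) & 0xff;  /* AH = 0x53 here */

        printf("APM BIOS error code: %#2.2x\n", err);
        return 0;
}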
@@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val)
  *     that APM 1.2 is in use. If no messages are pending the value 0x80
  *     is returned (No power management events pending).
  */
+
 static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
 {
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     dummy;
+       u32 eax;
+       u32 ebx;
+       u32 ecx;
+       u32 dummy;
 
        if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
-                       &dummy, &dummy))
+                         &dummy, &dummy))
                return (eax >> 8) & 0xff;
        *event = ebx;
        if (apm_info.connection_version < 0x0102)
@@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
  *     The state holds the state to transition to, which may in fact
  *     be an acceptance of a BIOS requested state change.
  */
+
 static int set_power_state(u_short what, u_short state)
 {
-       u32     eax;
+       u32 eax;
 
        if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
                return (eax >> 8) & 0xff;
@@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state)
  *
  *     Transition the entire system into a new APM power state.
  */
+
 static int set_system_power_state(u_short state)
 {
        return set_power_state(APM_DEVICE_ALL, state);
@@ -766,13 +766,13 @@ static int set_system_power_state(u_short state)
  *     to handle the idle request. On a success the function returns 1
  *     if the BIOS did clock slowing or 0 otherwise.
  */
+
 static int apm_do_idle(void)
 {
-       u32     eax;
-       u8      ret = 0;
-       int     idled = 0;
-       int     polling;
+       u32 eax;
+       u8 ret = 0;
+       int idled = 0;
+       int polling;
 
        polling = !!(current_thread_info()->status & TS_POLLING);
        if (polling) {
@@ -799,10 +799,9 @@ static int apm_do_idle(void)
                /* This always fails on some SMP boards running UP kernels.
                 * Only report the failure the first 5 times.
                 */
-               if (++t < 5)
-               {
+               if (++t < 5) {
                        printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
-                                       (eax >> 8) & 0xff);
+                              (eax >> 8) & 0xff);
                        t = jiffies;
                }
                return -1;
@@ -814,15 +813,15 @@ static int apm_do_idle(void)
 /**
  *     apm_do_busy     -       inform the BIOS the CPU is busy
  *
- *     Request that the BIOS brings the CPU back to full performance. 
+ *     Request that the BIOS brings the CPU back to full performance.
  */
+
 static void apm_do_busy(void)
 {
-       u32     dummy;
+       u32 dummy;
 
        if (clock_slowed || ALWAYS_CALL_BUSY) {
-               (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
+               (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
                clock_slowed = 0;
        }
 }
@@ -833,15 +832,15 @@ static void apm_do_busy(void)
  * power management - we probably want
  * to conserve power.
  */
-#define IDLE_CALC_LIMIT   (HZ * 100)
-#define IDLE_LEAKY_MAX    16
+#define IDLE_CALC_LIMIT        (HZ * 100)
+#define IDLE_LEAKY_MAX 16
 
 static void (*original_pm_idle)(void) __read_mostly;
 
 /**
  * apm_cpu_idle                -       cpu idling for APM capable Linux
  *
- * This is the idling function the kernel executes when APM is available. It 
+ * This is the idling function the kernel executes when APM is available. It
  * tries to do BIOS powermanagement based on the average system idle time.
  * Furthermore it calls the system default idle routine.
  */
@@ -882,7 +881,8 @@ recalc:
 
                        t = jiffies;
                        switch (apm_do_idle()) {
-                       case 0: apm_idle_done = 1;
+                       case 0:
+                               apm_idle_done = 1;
                                if (t != jiffies) {
                                        if (bucket) {
                                                bucket = IDLE_LEAKY_MAX;
@@ -893,7 +893,8 @@ recalc:
                                        continue;
                                }
                                break;
-                       case 1: apm_idle_done = 1;
+                       case 1:
+                               apm_idle_done = 1;
                                break;
                        default: /* BIOS refused */
                                break;
@@ -921,10 +922,10 @@ recalc:
  *     the SMP call on CPU0 as some systems will only honour this call
  *     on their first cpu.
  */
+
 static void apm_power_off(void)
 {
-       unsigned char   po_bios_call[] = {
+       unsigned char po_bios_call[] = {
                0xb8, 0x00, 0x10,       /* movw  $0x1000,ax  */
                0x8e, 0xd0,             /* movw  ax,ss       */
                0xbc, 0x00, 0xf0,       /* movw  $0xf000,sp  */
@@ -935,13 +936,12 @@ static void apm_power_off(void)
        };
 
        /* Some bioses don't like being called from CPU != 0 */
-       if (apm_info.realmode_power_off)
-       {
+       if (apm_info.realmode_power_off) {
                (void)apm_save_cpus();
                machine_real_restart(po_bios_call, sizeof(po_bios_call));
+       } else {
+               (void)set_system_power_state(APM_STATE_OFF);
        }
-       else
-               (void) set_system_power_state(APM_STATE_OFF);
 }
 
 #ifdef CONFIG_APM_DO_ENABLE
@@ -950,17 +950,17 @@ static void apm_power_off(void)
  *     apm_enable_power_management - enable BIOS APM power management
  *     @enable: enable yes/no
  *
- *     Enable or disable the APM BIOS power services. 
+ *     Enable or disable the APM BIOS power services.
  */
+
 static int apm_enable_power_management(int enable)
 {
-       u32     eax;
+       u32 eax;
 
        if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
                return APM_NOT_ENGAGED;
        if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
-                       enable, &eax))
+                                enable, &eax))
                return (eax >> 8) & 0xff;
        if (enable)
                apm_info.bios.flags &= ~APM_BIOS_DISABLED;
@@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable)
  *     if reported is a lifetime in seconds/minutes at current power
  *     consumption.
  */
+
 static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
 {
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     edx;
-       u32     dummy;
+       u32 eax;
+       u32 ebx;
+       u32 ecx;
+       u32 edx;
+       u32 dummy;
 
        if (apm_info.get_power_status_broken)
                return APM_32_UNSUPPORTED;
        if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
-                       &eax, &ebx, &ecx, &edx, &dummy))
+                         &eax, &ebx, &ecx, &edx, &dummy))
                return (eax >> 8) & 0xff;
        *status = ebx;
        *bat = ecx;
@@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
 static int apm_get_battery_status(u_short which, u_short *status,
                                  u_short *bat, u_short *life, u_short *nbat)
 {
-       u32     eax;
-       u32     ebx;
-       u32     ecx;
-       u32     edx;
-       u32     esi;
+       u32 eax;
+       u32 ebx;
+       u32 ecx;
+       u32 edx;
+       u32 esi;
 
        if (apm_info.connection_version < 0x0102) {
                /* pretend we only have one battery. */
@@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
        }
 
        if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
-                       &ebx, &ecx, &edx, &esi))
+                         &ebx, &ecx, &edx, &esi))
                return (eax >> 8) & 0xff;
        *status = ebx;
        *bat = ecx;
@@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status,
  *     Activate or deactivate power management on either a specific device
  *     or the entire system (%APM_DEVICE_ALL).
  */
+
 static int apm_engage_power_management(u_short device, int enable)
 {
-       u32     eax;
+       u32 eax;
 
        if ((enable == 0) && (device == APM_DEVICE_ALL)
            && (apm_info.bios.flags & APM_BIOS_DISABLED))
@@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable)
  *     all video devices. Typically the BIOS will do laptop backlight and
  *     monitor powerdown for us.
  */
+
 static int apm_console_blank(int blank)
 {
        int error = APM_NOT_ENGAGED; /* silence gcc */
@@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as)
 
 static void queue_event(apm_event_t event, struct apm_user *sender)
 {
-       struct apm_user *       as;
+       struct apm_user *as;
 
        spin_lock(&user_list_lock);
        if (user_list == NULL)
@@ -1174,11 +1174,11 @@ static void reinit_timer(void)
 
        spin_lock_irqsave(&i8253_lock, flags);
        /* set the clock to HZ */
-       outb_p(0x34, PIT_MODE);         /* binary, mode 2, LSB/MSB, ch 0 */
+       outb_pit(0x34, PIT_MODE);               /* binary, mode 2, LSB/MSB, ch 0 */
        udelay(10);
-       outb_p(LATCH & 0xff, PIT_CH0);  /* LSB */
+       outb_pit(LATCH & 0xff, PIT_CH0);        /* LSB */
        udelay(10);
-       outb(LATCH >> 8, PIT_CH0);      /* MSB */
+       outb_pit(LATCH >> 8, PIT_CH0);  /* MSB */
        udelay(10);
        spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
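For reference, the values written by the outb_pit() sequence above: 0x34 selects channel 0, LSB-then-MSB access, mode 2 (rate generator) and binary counting, while LATCH is the divisor of the 1193182 Hz PIT input clock that yields HZ ticks per second. A sketch of that arithmetic, with HZ assumed to be 250 for the example:

#include <stdio.h>

#define CLOCK_TICK_RATE 1193182
#define HZ              250
#define LATCH           ((CLOCK_TICK_RATE + HZ / 2) / HZ)

int main(void)
{
        printf("mode byte: 0x34 (ch 0, LSB then MSB, mode 2, binary)\n");
        printf("LATCH = %d, LSB = 0x%02x, MSB = 0x%02x\n",
               LATCH, LATCH & 0xff, (LATCH >> 8) & 0xff);
        return 0;
}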
@@ -1186,7 +1186,7 @@ static void reinit_timer(void)
 
 static int suspend(int vetoable)
 {
-       int             err;
+       int err;
        struct apm_user *as;
 
        if (pm_send_all(PM_SUSPEND, (void *)3)) {
@@ -1239,7 +1239,7 @@ static int suspend(int vetoable)
 
 static void standby(void)
 {
-       int     err;
+       int err;
 
        local_irq_disable();
        device_power_down(PMSG_SUSPEND);
@@ -1256,8 +1256,8 @@ static void standby(void)
 
 static apm_event_t get_event(void)
 {
-       int             error;
-       apm_event_t     event = APM_NO_EVENTS; /* silence gcc */
+       int error;
+       apm_event_t event = APM_NO_EVENTS; /* silence gcc */
        apm_eventinfo_t info;
 
        static int notified;
@@ -1275,9 +1275,9 @@ static apm_event_t get_event(void)
 
 static void check_events(void)
 {
-       apm_event_t             event;
-       static unsigned long    last_resume;
-       static int              ignore_bounce;
+       apm_event_t event;
+       static unsigned long last_resume;
+       static int ignore_bounce;
 
        while ((event = get_event()) != 0) {
                if (debug) {
@@ -1289,7 +1289,7 @@ static void check_events(void)
                                       "event 0x%02x\n", event);
                }
                if (ignore_bounce
-                   && ((jiffies - last_resume) > bounce_interval))
+                   && (time_after(jiffies, last_resume + bounce_interval)))
                        ignore_bounce = 0;
 
                switch (event) {
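The hunk above swaps an open-coded jiffies comparison for the kernel's time_after() helper, which compares a wrapping counter via a signed difference. A minimal userspace model of why that matters (values are made up; the macro mirrors the kernel definition without the type checking):

#include <stdio.h>

#define time_after(a, b)  ((long)(b) - (long)(a) < 0)

int main(void)
{
        unsigned long last_resume = (unsigned long)-100;  /* shortly before wrap */
        unsigned long now = 50;                           /* shortly after wrap */

        printf("plain '>' says later:   %d\n", now > last_resume);              /* 0: wrong */
        printf("time_after() says later: %d\n", time_after(now, last_resume));  /* 1: right */
        return 0;
}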
@@ -1357,7 +1357,7 @@ static void check_events(void)
                        /*
                         * We are not allowed to reject a critical suspend.
                         */
-                       (void) suspend(0);
+                       (void)suspend(0);
                        break;
                }
        }
@@ -1365,12 +1365,12 @@ static void check_events(void)
 
 static void apm_event_handler(void)
 {
-       static int      pending_count = 4;
-       int             err;
+       static int pending_count = 4;
+       int err;
 
        if ((standbys_pending > 0) || (suspends_pending > 0)) {
                if ((apm_info.connection_version > 0x100) &&
-                               (pending_count-- <= 0)) {
+                   (pending_count-- <= 0)) {
                        pending_count = 4;
                        if (debug)
                                printk(KERN_DEBUG "apm: setting state busy\n");
@@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func)
 
 static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
 {
-       struct apm_user *       as;
-       int                     i;
-       apm_event_t             event;
+       struct apm_user *as;
+       int i;
+       apm_event_t event;
 
        as = fp->private_data;
        if (check_apm_user(as, "read"))
@@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
        return 0;
 }
 
-static unsigned int do_poll(struct file *fp, poll_table * wait)
+static unsigned int do_poll(struct file *fp, poll_table *wait)
 {
-       struct apm_user * as;
+       struct apm_user *as;
 
        as = fp->private_data;
        if (check_apm_user(as, "poll"))
@@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait)
        return 0;
 }
 
-static int do_ioctl(struct inode * inode, struct file *filp,
+static int do_ioctl(struct inode *inode, struct file *filp,
                    u_int cmd, u_long arg)
 {
-       struct apm_user *       as;
+       struct apm_user *as;
 
        as = filp->private_data;
        if (check_apm_user(as, "ioctl"))
@@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp,
        return 0;
 }
 
-static int do_release(struct inode * inode, struct file * filp)
+static int do_release(struct inode *inode, struct file *filp)
 {
-       struct apm_user *       as;
+       struct apm_user *as;
 
        as = filp->private_data;
        if (check_apm_user(as, "release"))
@@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp)
                if (suspends_pending <= 0)
                        (void) suspend(1);
        }
-       spin_lock(&user_list_lock);
+       spin_lock(&user_list_lock);
        if (user_list == as)
                user_list = as->next;
        else {
-               struct apm_user *       as1;
+               struct apm_user *as1;
 
                for (as1 = user_list;
                     (as1 != NULL) && (as1->next != as);
@@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp)
        return 0;
 }
 
-static int do_open(struct inode * inode, struct file * filp)
+static int do_open(struct inode *inode, struct file *filp)
 {
-       struct apm_user *       as;
+       struct apm_user *as;
 
        as = kmalloc(sizeof(*as), GFP_KERNEL);
        if (as == NULL) {
@@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp)
        as->suspends_read = as->standbys_read = 0;
        /*
         * XXX - this is a tiny bit broken, when we consider BSD
-         * process accounting. If the device is opened by root, we
+        * process accounting. If the device is opened by root, we
         * instantly flag that we used superuser privs. Who knows,
         * we might close the device immediately without doing a
         * privileged operation -- cevans
@@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v)
           8) min = minutes; sec = seconds */
 
        seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
-                    driver_version,
-                    (apm_info.bios.version >> 8) & 0xff,
-                    apm_info.bios.version & 0xff,
-                    apm_info.bios.flags,
-                    ac_line_status,
-                    battery_status,
-                    battery_flag,
-                    percentage,
-                    time_units,
-                    units);
+                  driver_version,
+                  (apm_info.bios.version >> 8) & 0xff,
+                  apm_info.bios.version & 0xff,
+                  apm_info.bios.flags,
+                  ac_line_status,
+                  battery_status,
+                  battery_flag,
+                  percentage,
+                  time_units,
+                  units);
        return 0;
 }
 
@@ -1684,8 +1684,8 @@ static int apm(void *unused)
        unsigned short  cx;
        unsigned short  dx;
        int             error;
-       char *          power_stat;
-       char *          bat_stat;
+       char            *power_stat;
+       char            *bat_stat;
 
 #ifdef CONFIG_SMP
        /* 2002/08/01 - WT
@@ -1744,23 +1744,41 @@ static int apm(void *unused)
                }
        }
 
-       if (debug && (num_online_cpus() == 1 || smp )) {
+       if (debug && (num_online_cpus() == 1 || smp)) {
                error = apm_get_power_status(&bx, &cx, &dx);
                if (error)
                        printk(KERN_INFO "apm: power status not available\n");
                else {
                        switch ((bx >> 8) & 0xff) {
-                       case 0: power_stat = "off line"; break;
-                       case 1: power_stat = "on line"; break;
-                       case 2: power_stat = "on backup power"; break;
-                       default: power_stat = "unknown"; break;
+                       case 0:
+                               power_stat = "off line";
+                               break;
+                       case 1:
+                               power_stat = "on line";
+                               break;
+                       case 2:
+                               power_stat = "on backup power";
+                               break;
+                       default:
+                               power_stat = "unknown";
+                               break;
                        }
                        switch (bx & 0xff) {
-                       case 0: bat_stat = "high"; break;
-                       case 1: bat_stat = "low"; break;
-                       case 2: bat_stat = "critical"; break;
-                       case 3: bat_stat = "charging"; break;
-                       default: bat_stat = "unknown"; break;
+                       case 0:
+                               bat_stat = "high";
+                               break;
+                       case 1:
+                               bat_stat = "low";
+                               break;
+                       case 2:
+                               bat_stat = "critical";
+                               break;
+                       case 3:
+                               bat_stat = "charging";
+                               break;
+                       default:
+                               bat_stat = "unknown";
+                               break;
                        }
                        printk(KERN_INFO
                               "apm: AC %s, battery status %s, battery life ",
@@ -1777,8 +1795,8 @@ static int apm(void *unused)
                                        printk("unknown\n");
                                else
                                        printk("%d %s\n", dx & 0x7fff,
-                                               (dx & 0x8000) ?
-                                               "minutes" : "seconds");
+                                              (dx & 0x8000) ?
+                                              "minutes" : "seconds");
                        }
                }
        }
@@ -1803,7 +1821,7 @@ static int apm(void *unused)
 #ifndef MODULE
 static int __init apm_setup(char *str)
 {
-       int     invert;
+       int invert;
 
        while ((str != NULL) && (*str != '\0')) {
                if (strncmp(str, "off", 3) == 0)
@@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str)
                if ((strncmp(str, "power-off", 9) == 0) ||
                    (strncmp(str, "power_off", 9) == 0))
                        power_off = !invert;
-               if (strncmp(str, "smp", 3) == 0)
-               {
+               if (strncmp(str, "smp", 3) == 0) {
                        smp = !invert;
                        idle_threshold = 100;
                }
                if ((strncmp(str, "allow-ints", 10) == 0) ||
                    (strncmp(str, "allow_ints", 10) == 0))
-                       apm_info.allow_ints = !invert;
+                       apm_info.allow_ints = !invert;
                if ((strncmp(str, "broken-psr", 10) == 0) ||
                    (strncmp(str, "broken_psr", 10) == 0))
                        apm_info.get_power_status_broken = !invert;
@@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d)
  */
 static int __init broken_ps2_resume(const struct dmi_system_id *d)
 {
-       printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident);
+       printk(KERN_INFO "%s machine detected. Mousepad Resume Bug "
+              "workaround hopefully not needed.\n", d->ident);
        return 0;
 }
 
@@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d)
 {
        if (apm_info.realmode_power_off == 0) {
                apm_info.realmode_power_off = 1;
-               printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident);
+               printk(KERN_INFO "%s bios detected. "
+                      "Using realmode poweroff only.\n", d->ident);
        }
        return 0;
 }
@@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d)
 {
        if (apm_info.allow_ints == 0) {
                apm_info.allow_ints = 1;
-               printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident);
+               printk(KERN_INFO "%s machine detected. "
+                      "Enabling interrupts during APM calls.\n", d->ident);
        }
        return 0;
 }
@@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d)
 {
        if (apm_info.disabled == 0) {
                apm_info.disabled = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+               printk(KERN_INFO "%s machine detected. "
+                      "Disabling APM.\n", d->ident);
        }
        return 0;
 }
@@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
 {
        if (apm_info.disabled == 0) {
                apm_info.disabled = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+               printk(KERN_INFO "%s machine detected. "
+                      "Disabling APM.\n", d->ident);
                printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
                printk(KERN_INFO "download from support.intel.com \n");
        }
@@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
 {
        if (apm_info.forbid_idle == 0) {
                apm_info.forbid_idle = 1;
-               printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident);
+               printk(KERN_INFO "%s machine detected. "
+                      "Disabling APM idle calls.\n", d->ident);
        }
        return 0;
 }
@@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
 static int __init broken_apm_power(const struct dmi_system_id *d)
 {
        apm_info.get_power_status_broken = 1;
-       printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
+       printk(KERN_WARNING "BIOS strings suggest APM bugs, "
+              "disabling power status reporting.\n");
        return 0;
 }
 
@@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d)
 static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
 {
        apm_info.get_power_status_swabinminutes = 1;
-       printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
+       printk(KERN_WARNING "BIOS strings suggest APM reports battery life "
+              "in minutes and wrong byte order.\n");
        return 0;
 }
 
@@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
                apm_is_horked, "Dell Inspiron 2500",
                {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
        },
        {       /* Allow interrupts during suspend on Dell Inspiron laptops*/
                set_apm_ints, "Dell Inspiron", {
@@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
                apm_is_horked, "Dell Dimension 4100",
                {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+                       DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
        },
        {       /* Allow interrupts during suspend on Compaq Laptops*/
                set_apm_ints, "Compaq 12XL125",
                {       DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
                        DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"4.06"), },
+                       DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
        },
        {       /* Allow interrupts during APM or the clock goes slow */
                set_apm_ints, "ASUSTeK",
@@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
                apm_is_horked, "Sharp PC-PJ/AX",
                {       DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), },
+                       DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
        },
        {       /* APM crashes */
                apm_is_horked, "Dell Inspiron 2500",
                {       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
-                       DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
-                       DMI_MATCH(DMI_BIOS_VERSION,"A11"), },
+                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
        },
        {       /* APM idle hangs */
                apm_likes_to_melt, "Jabil AMD",
@@ -2203,11 +2228,11 @@ static int __init apm_init(void)
                return -ENODEV;
        }
        printk(KERN_INFO
-               "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
-               ((apm_info.bios.version >> 8) & 0xff),
-               (apm_info.bios.version & 0xff),
-               apm_info.bios.flags,
-               driver_version);
+              "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
+              ((apm_info.bios.version >> 8) & 0xff),
+              (apm_info.bios.version & 0xff),
+              apm_info.bios.flags,
+              driver_version);
        if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
                printk(KERN_INFO "apm: no 32 bit BIOS support\n");
                return -ENODEV;
@@ -2312,9 +2337,9 @@ static int __init apm_init(void)
        }
        wake_up_process(kapmd_task);
 
-       if (num_online_cpus() > 1 && !smp ) {
+       if (num_online_cpus() > 1 && !smp) {
                printk(KERN_NOTICE
-                  "apm: disabled - APM is not SMP safe (power off active).\n");
+                      "apm: disabled - APM is not SMP safe (power off active).\n");
                return 0;
        }
 
@@ -2339,7 +2364,7 @@ static int __init apm_init(void)
 
 static void __exit apm_exit(void)
 {
-       int     error;
+       int error;
 
        if (set_pm_idle) {
                pm_idle = original_pm_idle;
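
For reference, the DMI quirk tables above all follow one pattern: an __initdata array of struct dmi_system_id entries, each pairing a callback with a set of DMI_MATCH() criteria, walked once at boot. A minimal sketch of that pattern, with hypothetical vendor strings, assuming dmi_check_system() from <linux/dmi.h> as the scanner:

#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int __init example_quirk(const struct dmi_system_id *d)
{
        printk(KERN_INFO "%s machine detected, applying quirk.\n", d->ident);
        return 0;       /* 0 keeps dmi_check_system() scanning further entries */
}

static struct dmi_system_id __initdata example_dmi_table[] = {
        {       /* hypothetical entry, for illustration only */
                example_quirk, "Example Vendor Laptop",
                {       DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Laptop 1000"), },
        },
        { }     /* terminating empty entry */
};

static int __init example_dmi_init(void)
{
        dmi_check_system(example_dmi_table);    /* runs callbacks for matching boards */
        return 0;
}
module_init(example_dmi_init);
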
index 0e45981b2dd7f15406962ec8f85d334e21451ff1..afd84463b7121f135dd1acee1683d2743d9ff8aa 100644 (file)
@@ -38,15 +38,15 @@ void foo(void);
 
 void foo(void)
 {
-       OFFSET(SIGCONTEXT_eax, sigcontext, eax);
-       OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
-       OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
-       OFFSET(SIGCONTEXT_edx, sigcontext, edx);
-       OFFSET(SIGCONTEXT_esi, sigcontext, esi);
-       OFFSET(SIGCONTEXT_edi, sigcontext, edi);
-       OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
-       OFFSET(SIGCONTEXT_esp, sigcontext, esp);
-       OFFSET(SIGCONTEXT_eip, sigcontext, eip);
+       OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
+       OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
+       OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
+       OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
+       OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
+       OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
+       OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
+       OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
+       OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
        BLANK();
 
        OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
@@ -70,39 +70,38 @@ void foo(void)
        OFFSET(TI_cpu, thread_info, cpu);
        BLANK();
 
-       OFFSET(GDS_size, Xgt_desc_struct, size);
-       OFFSET(GDS_address, Xgt_desc_struct, address);
-       OFFSET(GDS_pad, Xgt_desc_struct, pad);
+       OFFSET(GDS_size, desc_ptr, size);
+       OFFSET(GDS_address, desc_ptr, address);
        BLANK();
 
-       OFFSET(PT_EBX, pt_regs, ebx);
-       OFFSET(PT_ECX, pt_regs, ecx);
-       OFFSET(PT_EDX, pt_regs, edx);
-       OFFSET(PT_ESI, pt_regs, esi);
-       OFFSET(PT_EDI, pt_regs, edi);
-       OFFSET(PT_EBP, pt_regs, ebp);
-       OFFSET(PT_EAX, pt_regs, eax);
-       OFFSET(PT_DS,  pt_regs, xds);
-       OFFSET(PT_ES,  pt_regs, xes);
-       OFFSET(PT_FS,  pt_regs, xfs);
-       OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
-       OFFSET(PT_EIP, pt_regs, eip);
-       OFFSET(PT_CS,  pt_regs, xcs);
-       OFFSET(PT_EFLAGS, pt_regs, eflags);
-       OFFSET(PT_OLDESP, pt_regs, esp);
-       OFFSET(PT_OLDSS,  pt_regs, xss);
+       OFFSET(PT_EBX, pt_regs, bx);
+       OFFSET(PT_ECX, pt_regs, cx);
+       OFFSET(PT_EDX, pt_regs, dx);
+       OFFSET(PT_ESI, pt_regs, si);
+       OFFSET(PT_EDI, pt_regs, di);
+       OFFSET(PT_EBP, pt_regs, bp);
+       OFFSET(PT_EAX, pt_regs, ax);
+       OFFSET(PT_DS,  pt_regs, ds);
+       OFFSET(PT_ES,  pt_regs, es);
+       OFFSET(PT_FS,  pt_regs, fs);
+       OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
+       OFFSET(PT_EIP, pt_regs, ip);
+       OFFSET(PT_CS,  pt_regs, cs);
+       OFFSET(PT_EFLAGS, pt_regs, flags);
+       OFFSET(PT_OLDESP, pt_regs, sp);
+       OFFSET(PT_OLDSS,  pt_regs, ss);
        BLANK();
 
        OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
-       OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+       OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
        BLANK();
 
        OFFSET(pbe_address, pbe, address);
        OFFSET(pbe_orig_address, pbe, orig_address);
        OFFSET(pbe_next, pbe, next);
 
-       /* Offset from the sysenter stack to tss.esp0 */
-       DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
+       /* Offset from the sysenter stack to tss.sp0 */
+       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
                 sizeof(struct tss_struct));
 
        DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
@@ -111,8 +110,6 @@ void foo(void)
        DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
        DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
 
-       DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
-
        OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 
 #ifdef CONFIG_PARAVIRT
@@ -123,7 +120,7 @@ void foo(void)
        OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
        OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
        OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-       OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+       OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
        OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
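
The OFFSET() and DEFINE() entries in this file exist only so that assembly code can use structure offsets the C compiler computed. The arithmetic behind them is ordinary offsetof(); a stand-alone user-space illustration with a hypothetical register frame (the real pt_regs layout is what the hunk above encodes):

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a saved register frame; the real pt_regs differs. */
struct frame {
        long bx, cx, dx, si, di, bp, ax;
        long ip, cs, flags, sp, ss;
};

int main(void)
{
        /* asm-offsets.c emits constants like these into a generated header
         * so that .S files can address individual saved registers. */
        printf("PT_EBX    = %zu\n", offsetof(struct frame, bx));
        printf("PT_EIP    = %zu\n", offsetof(struct frame, ip));
        printf("PT_OLDESP = %zu\n", offsetof(struct frame, sp));
        return 0;
}
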
 
index d1b6ed98774e3f635aaaef7ec16d84ffde1fefa7..494e1e096ee6df0c986aeb14dbc3f8b7a407bb50 100644 (file)
@@ -38,7 +38,6 @@ int main(void)
 #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
        ENTRY(state);
        ENTRY(flags); 
-       ENTRY(thread); 
        ENTRY(pid);
        BLANK();
 #undef ENTRY
@@ -47,6 +46,9 @@ int main(void)
        ENTRY(addr_limit);
        ENTRY(preempt_count);
        ENTRY(status);
+#ifdef CONFIG_IA32_EMULATION
+       ENTRY(sysenter_return);
+#endif
        BLANK();
 #undef ENTRY
 #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
@@ -59,17 +61,31 @@ int main(void)
        ENTRY(data_offset);
        BLANK();
 #undef ENTRY
+#ifdef CONFIG_PARAVIRT
+       BLANK();
+       OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+       OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+       OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+       OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+       OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+       OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+       OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
+       OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+       OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+#endif
+
+
 #ifdef CONFIG_IA32_EMULATION
 #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
-       ENTRY(eax);
-       ENTRY(ebx);
-       ENTRY(ecx);
-       ENTRY(edx);
-       ENTRY(esi);
-       ENTRY(edi);
-       ENTRY(ebp);
-       ENTRY(esp);
-       ENTRY(eip);
+       ENTRY(ax);
+       ENTRY(bx);
+       ENTRY(cx);
+       ENTRY(dx);
+       ENTRY(si);
+       ENTRY(di);
+       ENTRY(bp);
+       ENTRY(sp);
+       ENTRY(ip);
        BLANK();
 #undef ENTRY
        DEFINE(IA32_RT_SIGFRAME_sigcontext,
@@ -81,14 +97,14 @@ int main(void)
        DEFINE(pbe_next, offsetof(struct pbe, next));
        BLANK();
 #define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
-       ENTRY(rbx);
-       ENTRY(rbx);
-       ENTRY(rcx);
-       ENTRY(rdx);
-       ENTRY(rsp);
-       ENTRY(rbp);
-       ENTRY(rsi);
-       ENTRY(rdi);
+       ENTRY(bx);
+       ENTRY(bx);
+       ENTRY(cx);
+       ENTRY(dx);
+       ENTRY(sp);
+       ENTRY(bp);
+       ENTRY(si);
+       ENTRY(di);
        ENTRY(r8);
        ENTRY(r9);
        ENTRY(r10);
@@ -97,7 +113,7 @@ int main(void)
        ENTRY(r13);
        ENTRY(r14);
        ENTRY(r15);
-       ENTRY(eflags);
+       ENTRY(flags);
        BLANK();
 #undef ENTRY
 #define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
@@ -108,7 +124,7 @@ int main(void)
        ENTRY(cr8);
        BLANK();
 #undef ENTRY
-       DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
+       DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
        BLANK();
        DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
        BLANK();
index 0b9860530a6ba760385510569b2388f3409c9d12..30f25a75fe2876ef93605181f81c386c02953bfb 100644 (file)
@@ -1,8 +1,6 @@
 /*
  *     Implement 'Simple Boot Flag Specification 2.0'
  */
-
-
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 
 #include <linux/mc146818rtc.h>
 
-
 #define SBF_RESERVED (0x78)
 #define SBF_PNPOS    (1<<0)
 #define SBF_BOOTING  (1<<1)
 #define SBF_DIAG     (1<<2)
 #define SBF_PARITY   (1<<7)
 
-
 int sbf_port __initdata = -1;  /* set via acpi_boot_init() */
 
-
 static int __init parity(u8 v)
 {
        int x = 0;
        int i;
-       
-       for(i=0;i<8;i++)
-       {
-               x^=(v&1);
-               v>>=1;
+
+       for (i = 0; i < 8; i++) {
+               x ^= (v & 1);
+               v >>= 1;
        }
+
        return x;
 }
 
 static void __init sbf_write(u8 v)
 {
        unsigned long flags;
-       if(sbf_port != -1)
-       {
+
+       if (sbf_port != -1) {
                v &= ~SBF_PARITY;
-               if(!parity(v))
-                       v|=SBF_PARITY;
+               if (!parity(v))
+                       v |= SBF_PARITY;
 
-               printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);
+               printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
+                       sbf_port, v);
 
                spin_lock_irqsave(&rtc_lock, flags);
                CMOS_WRITE(v, sbf_port);
@@ -57,33 +53,41 @@ static void __init sbf_write(u8 v)
 
 static u8 __init sbf_read(void)
 {
-       u8 v;
        unsigned long flags;
-       if(sbf_port == -1)
+       u8 v;
+
+       if (sbf_port == -1)
                return 0;
+
        spin_lock_irqsave(&rtc_lock, flags);
        v = CMOS_READ(sbf_port);
        spin_unlock_irqrestore(&rtc_lock, flags);
+
        return v;
 }
 
 static int __init sbf_value_valid(u8 v)
 {
-       if(v&SBF_RESERVED)              /* Reserved bits */
+       if (v & SBF_RESERVED)           /* Reserved bits */
                return 0;
-       if(!parity(v))
+       if (!parity(v))
                return 0;
+
        return 1;
 }
 
 static int __init sbf_init(void)
 {
        u8 v;
-       if(sbf_port == -1)
+
+       if (sbf_port == -1)
                return 0;
+
        v = sbf_read();
-       if(!sbf_value_valid(v))
-               printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v);
+       if (!sbf_value_valid(v)) {
+               printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
+                       "CMOS RAM was invalid\n", v);
+       }
 
        v &= ~SBF_RESERVED;
        v &= ~SBF_BOOTING;
@@ -92,7 +96,7 @@ static int __init sbf_init(void)
        v |= SBF_PNPOS;
 #endif
        sbf_write(v);
+
        return 0;
 }
-
 module_init(sbf_init);
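
The Simple Boot Flag byte written above keeps odd parity: sbf_write() clears SBF_PARITY, then sets it only if the remaining bits come out even. A small user-space check of that invariant, reusing the parity() logic from the file (the 0x02 test value is SBF_BOOTING):

#include <stdio.h>

#define SBF_PARITY (1 << 7)

/* Same bit folding as the kernel's parity(): 1 if v has an odd number of set bits. */
static int parity(unsigned char v)
{
        int x = 0, i;

        for (i = 0; i < 8; i++) {
                x ^= (v & 1);
                v >>= 1;
        }
        return x;
}

/* Mirror of the sbf_write() fixup: force the byte to odd parity. */
static unsigned char sbf_fix_parity(unsigned char v)
{
        v &= ~SBF_PARITY;
        if (!parity(v))
                v |= SBF_PARITY;
        return v;
}

int main(void)
{
        unsigned char v = sbf_fix_parity(0x02); /* SBF_BOOTING */

        printf("byte 0x%02x, odd parity: %d\n", v, parity(v));
        return 0;
}
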
index 9a189cef64043a8968a465f1c34c0424354956f2..8f520f93ffd44ca91a887d0dc4ea029a4aae7f10 100644 (file)
@@ -13,7 +13,6 @@
 void __init check_bugs(void)
 {
        identify_cpu(&boot_cpu_data);
-       mtrr_bp_init();
 #if !defined(CONFIG_SMP)
        printk("CPU: ");
        print_cpu_info(&boot_cpu_data);
index 3e91d3ee26ec68da1601385f375ac70b09a8d27a..238468ae19931be3116e1e9de9ae0f0c1db1d2e2 100644 (file)
@@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
                        &regs[CR_ECX], &regs[CR_EDX]);
 
                if (regs[cb->reg] & (1 << cb->bit))
-                       set_bit(cb->feature, c->x86_capability);
+                       set_cpu_cap(c, cb->feature);
        }
 }
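
set_cpu_cap(c, feature) used above is, in essence, a set_bit() on the c->x86_capability word array, with the X86_FEATURE_* numbering encoding word*32 + bit. A user-space sketch of that word/bit split (the NCAPINTS value and the feature number are illustrative):

#include <stdio.h>

#define NCAPINTS 8                      /* illustrative; the kernel's value varies by version */

static unsigned int caps[NCAPINTS];     /* stand-in for c->x86_capability */

/* What set_cpu_cap()/set_bit() boil down to: word = bit / 32, mask = bit % 32. */
static void set_cap(int feature)
{
        caps[feature / 32] |= 1u << (feature % 32);
}

static int has_cap(int feature)
{
        return (caps[feature / 32] >> (feature % 32)) & 1;
}

int main(void)
{
        set_cap(4 * 32 + 3);    /* a hypothetical feature bit in word 4 */
        printf("word 4 = %#x, test = %d\n", caps[4], has_cap(4 * 32 + 3));
        return 0;
}
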
index 1ff88c7f45cff837b581c02cc6a8beda9bb70947..06fa159232fd74b9520cb41a706f1d239e653572 100644 (file)
@@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void)
 
 int force_mwait __cpuinitdata;
 
+void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+       if (cpuid_eax(0x80000000) >= 0x80000007) {
+               c->x86_power = cpuid_edx(0x80000007);
+               if (c->x86_power & (1<<8))
+                       set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
+       }
+}
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
        u32 l, h;
@@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
        }
 #endif
 
+       early_init_amd(c);
+
        /*
         *      FIXME: We should handle the K5 here. Set up the write
         *      range and also turn on MSR 83 bits 4 and 31 (write alloc,
@@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
        }
 
-       if (cpuid_eax(0x80000000) >= 0x80000007) {
-               c->x86_power = cpuid_edx(0x80000007);
-               if (c->x86_power & (1<<8))
-                       set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-       }
-
 #ifdef CONFIG_X86_HT
        /*
         * On a AMD multi core setup the lower bits of the APIC id
@@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                local_apic_timer_disabled = 1;
 #endif
 
-       if (c->x86 == 0x10 && !force_mwait)
-               clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
-
        /* K6s reports MCEs but don't actually have all the MSRs */
        if (c->x86 < 6)
                clear_bit(X86_FEATURE_MCE, c->x86_capability);
+
+       if (cpu_has_xmm)
+               set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
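
early_init_amd() above reads extended CPUID leaf 0x80000007 and treats EDX bit 8 as the constant-TSC indication. The same leaf can be probed from user space with the compiler's <cpuid.h> helper; a quick check (x86 only, GCC or Clang assumed):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x80000007: advanced power management feature flags. */
        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
                puts("leaf 0x80000007 not supported");
                return 1;
        }

        /* Bit 8 of EDX is the invariant/constant TSC flag the kernel checks here. */
        printf("x86_power = %#x, constant TSC: %s\n",
               edx, (edx & (1 << 8)) ? "yes" : "no");
        return 0;
}
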
index 205fd5ba57f7a588d594800916c3e13cec220db1..9b95edcfc6ae23696fc7566087adc601afa6f388 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/utsname.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
+#include <asm/processor-flags.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/paravirt.h>
@@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium);
 static int __init no_387(char *s)
 {
        boot_cpu_data.hard_math = 0;
-       write_cr0(0xE | read_cr0());
+       write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
        return 1;
 }
 
@@ -153,7 +154,7 @@ static void __init check_config(void)
  * If we configured ourselves for a TSC, we'd better have one!
  */
 #ifdef CONFIG_X86_TSC
-       if (!cpu_has_tsc && !tsc_disable)
+       if (!cpu_has_tsc)
                panic("Kernel compiled for Pentium+, requires TSC feature!");
 #endif
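
The no_387() change above replaces the magic constant 0xE with named CR0 bits. Those are standard architectural flag positions (MP is bit 1, EM bit 2, TS bit 3), so their OR is exactly the old literal; a compile-time check using locally defined constants rather than the kernel's header:

#define X86_CR0_MP (1UL << 1)   /* Monitor Coprocessor */
#define X86_CR0_EM (1UL << 2)   /* Emulation (no FPU) */
#define X86_CR0_TS (1UL << 3)   /* Task Switched */

/* Fails to compile unless the named form equals the old 0xE literal. */
typedef char cr0_bits_check[(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP) == 0xE ? 1 : -1];
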
 
index e2fcf2051bdb26c87161b4c6dc90b5d0a893ce7a..db28aa9e2f694b7f14669d3c66df8cc712cfab48 100644 (file)
 #include "cpu.h"
 
 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
-       [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
-       [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
+       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
        /*
         * Segments used for calling PnP BIOS have byte granularity.
         * They code segments and data segments have fixed 64k limits,
         * the transfer segment sizes are set at run time.
         */
-       [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
-       [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
-       [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+       /* 32-bit code */
+       [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+       /* 16-bit code */
+       [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
        /*
         * The APM segments have byte granularity and their bases
         * are set at run time.  All have 64k limits.
         */
-       [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+       /* 32-bit code */
+       [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
-       [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
+       [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+       /* data */
+       [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
-       [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
-       [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
+       [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+       [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
 static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_fxsr __cpuinitdata;
 static int disable_x86_serial_nr __cpuinitdata = 1;
-static int disable_x86_sep __cpuinitdata;
 
 struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
 
-extern int disable_pse;
-
 static void __cpuinit default_init(struct cpuinfo_x86 * c)
 {
        /* Not much we can do here... */
@@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
 
 static int __init x86_fxsr_setup(char * s)
 {
-       /* Tell all the other CPUs to not use it... */
-       disable_x86_fxsr = 1;
-
-       /*
-        * ... and clear the bits early in the boot_cpu_data
-        * so that the bootup process doesn't try to do this
-        * either.
-        */
-       clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
-       clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
+       setup_clear_cpu_cap(X86_FEATURE_FXSR);
+       setup_clear_cpu_cap(X86_FEATURE_XMM);
        return 1;
 }
 __setup("nofxsr", x86_fxsr_setup);
@@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup);
 
 static int __init x86_sep_setup(char * s)
 {
-       disable_x86_sep = 1;
+       setup_clear_cpu_cap(X86_FEATURE_SEP);
        return 1;
 }
 __setup("nosep", x86_sep_setup);
@@ -281,6 +278,33 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
                        c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
        }
 }
+static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+{
+       u32 tfms, xlvl;
+       int ebx;
+
+       memset(&c->x86_capability, 0, sizeof c->x86_capability);
+       if (have_cpuid_p()) {
+               /* Intel-defined flags: level 0x00000001 */
+               if (c->cpuid_level >= 0x00000001) {
+                       u32 capability, excap;
+                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+                       c->x86_capability[0] = capability;
+                       c->x86_capability[4] = excap;
+               }
+
+               /* AMD-defined flags: level 0x80000001 */
+               xlvl = cpuid_eax(0x80000000);
+               if ((xlvl & 0xffff0000) == 0x80000000) {
+                       if (xlvl >= 0x80000001) {
+                               c->x86_capability[1] = cpuid_edx(0x80000001);
+                               c->x86_capability[6] = cpuid_ecx(0x80000001);
+                       }
+               }
+
+       }
+
+}
 
 /* Do minimum CPU detection early.
    Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
@@ -300,6 +324,17 @@ static void __init early_cpu_detect(void)
        cpu_detect(c);
 
        get_cpu_vendor(c, 1);
+
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               early_init_amd(c);
+               break;
+       case X86_VENDOR_INTEL:
+               early_init_intel(c);
+               break;
+       }
+
+       early_get_cap(c);
 }
 
 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
@@ -357,8 +392,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                init_scattered_cpuid_features(c);
        }
 
-       early_intel_workaround(c);
-
 #ifdef CONFIG_X86_HT
        c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
@@ -392,7 +425,7 @@ __setup("serialnumber", x86_serial_nr_setup);
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
        int i;
 
@@ -418,20 +451,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
        generic_identify(c);
 
-       printk(KERN_DEBUG "CPU: After generic identify, caps:");
-       for (i = 0; i < NCAPINTS; i++)
-               printk(" %08lx", c->x86_capability[i]);
-       printk("\n");
-
-       if (this_cpu->c_identify) {
+       if (this_cpu->c_identify)
                this_cpu->c_identify(c);
 
-               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
-               for (i = 0; i < NCAPINTS; i++)
-                       printk(" %08lx", c->x86_capability[i]);
-               printk("\n");
-       }
-
        /*
         * Vendor-specific initialization.  In this section we
         * canonicalize the feature flags, meaning if there are
@@ -453,23 +475,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
         * we do "generic changes."
         */
 
-       /* TSC disabled? */
-       if ( tsc_disable )
-               clear_bit(X86_FEATURE_TSC, c->x86_capability);
-
-       /* FXSR disabled? */
-       if (disable_x86_fxsr) {
-               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
-               clear_bit(X86_FEATURE_XMM, c->x86_capability);
-       }
-
-       /* SEP disabled? */
-       if (disable_x86_sep)
-               clear_bit(X86_FEATURE_SEP, c->x86_capability);
-
-       if (disable_pse)
-               clear_bit(X86_FEATURE_PSE, c->x86_capability);
-
        /* If the model name is still unset, do table lookup. */
        if ( !c->x86_model_id[0] ) {
                char *p;
@@ -482,13 +487,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                                c->x86, c->x86_model);
        }
 
-       /* Now the feature flags better reflect actual CPU features! */
-
-       printk(KERN_DEBUG "CPU: After all inits, caps:");
-       for (i = 0; i < NCAPINTS; i++)
-               printk(" %08lx", c->x86_capability[i]);
-       printk("\n");
-
        /*
         * On SMP, boot_cpu_data holds the common feature set between
         * all CPUs; so make sure that we indicate which features are
@@ -501,8 +499,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
        }
 
+       /* Clear all flags overriden by options */
+       for (i = 0; i < NCAPINTS; i++)
+               c->x86_capability[i] ^= cleared_cpu_caps[i];
+
        /* Init Machine Check Exception if available. */
        mcheck_init(c);
+
+       select_idle_routine(c);
 }
 
 void __init identify_boot_cpu(void)
@@ -510,7 +514,6 @@ void __init identify_boot_cpu(void)
        identify_cpu(&boot_cpu_data);
        sysenter_setup();
        enable_sep_cpu();
-       mtrr_bp_init();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -567,6 +570,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 }
 #endif
 
+static __init int setup_noclflush(char *arg)
+{
+       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+       return 1;
+}
+__setup("noclflush", setup_noclflush);
+
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
        char *vendor = NULL;
@@ -590,6 +600,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
                printk("\n");
 }
 
+static __init int setup_disablecpuid(char *arg)
+{
+       int bit;
+       if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+               setup_clear_cpu_cap(bit);
+       else
+               return 0;
+       return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 /* This is hacky. :)
@@ -620,21 +641,13 @@ void __init early_cpu_init(void)
        nexgen_init_cpu();
        umc_init_cpu();
        early_cpu_detect();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /* pse is not compatible with on-the-fly unmapping,
-        * disable it even if the cpus claim to support it.
-        */
-       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-       disable_pse = 1;
-#endif
 }
 
 /* Make sure %fs is initialized properly in idle threads */
 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
        memset(regs, 0, sizeof(struct pt_regs));
-       regs->xfs = __KERNEL_PERCPU;
+       regs->fs = __KERNEL_PERCPU;
        return regs;
 }
 
@@ -642,7 +655,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
  * it's on the real one. */
 void switch_to_new_gdt(void)
 {
-       struct Xgt_desc_struct gdt_descr;
+       struct desc_ptr gdt_descr;
 
        gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
        gdt_descr.size = GDT_SIZE - 1;
@@ -672,12 +685,6 @@ void __cpuinit cpu_init(void)
 
        if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
                clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-       if (tsc_disable && cpu_has_tsc) {
-               printk(KERN_NOTICE "Disabling TSC...\n");
-               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-               set_in_cr4(X86_CR4_TSD);
-       }
 
        load_idt(&idt_descr);
        switch_to_new_gdt();
@@ -691,7 +698,7 @@ void __cpuinit cpu_init(void)
                BUG();
        enter_lazy_tlb(&init_mm, curr);
 
-       load_esp0(t, thread);
+       load_sp0(t, thread);
        set_tss_desc(cpu,t);
        load_TR_desc();
        load_LDT(&init_mm.context);
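
New command-line handlers such as "noclflush" and "clearcpuid=" above hook in through __setup(). A minimal sketch of that pattern for a hypothetical option (the option name and flag variable are made up for illustration):

#include <linux/init.h>
#include <linux/kernel.h>

static int example_feature_disabled __initdata; /* hypothetical flag */

/* Called during early parameter parsing with the text following "noexample";
 * returning 1 tells the parser the option was consumed. */
static int __init noexample_setup(char *str)
{
        example_feature_disabled = 1;
        return 1;
}
__setup("noexample", noexample_setup);
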
index 2f6432cef6ffb68c4bd9391f34313d20ca66cdb1..ad6527a5beb12ba086ab643553722aa31fdbc08a 100644 (file)
@@ -24,5 +24,6 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
 extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
-extern void early_intel_workaround(struct cpuinfo_x86 *c);
+extern void early_init_intel(struct cpuinfo_x86 *c);
+extern void early_init_amd(struct cpuinfo_x86 *c);
 
index fea0af0476b96371f4c71fabcb463ab65d74fc89..a962dcb9c408518add7154ac9757ad2ed2c08f7c 100644 (file)
@@ -67,7 +67,8 @@ struct acpi_cpufreq_data {
        unsigned int cpu_feature;
 };
 
-static struct acpi_cpufreq_data *drv_data[NR_CPUS];
+static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
+
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask)
        if (unlikely(cpus_empty(mask)))
                return 0;
 
-       switch (drv_data[first_cpu(mask)]->cpu_feature) {
+       switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
        case SYSTEM_INTEL_MSR_CAPABLE:
                cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
                cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
                break;
        case SYSTEM_IO_CAPABLE:
                cmd.type = SYSTEM_IO_CAPABLE;
-               perf = drv_data[first_cpu(mask)]->acpi_data;
+               perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
                cmd.addr.io.port = perf->control_register.address;
                cmd.addr.io.bit_width = perf->control_register.bit_width;
                break;
@@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
 
 #endif
 
-       retval = drv_data[cpu]->max_freq * perf_percent / 100;
+       retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
 
        put_cpu();
        set_cpus_allowed(current, saved_mask);
@@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
 
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
-       struct acpi_cpufreq_data *data = drv_data[cpu];
+       struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
        unsigned int freq;
 
        dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
 static int acpi_cpufreq_target(struct cpufreq_policy *policy,
                               unsigned int target_freq, unsigned int relation)
 {
-       struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+       struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
        struct acpi_processor_performance *perf;
        struct cpufreq_freqs freqs;
        cpumask_t online_policy_cpus;
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
 {
-       struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+       struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
 
        dprintk("acpi_cpufreq_verify\n");
 
@@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
                return -ENOMEM;
 
        data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
-       drv_data[cpu] = data;
+       per_cpu(drv_data, cpu) = data;
 
        if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
                acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +715,20 @@ err_unreg:
        acpi_processor_unregister_performance(perf, cpu);
 err_free:
        kfree(data);
-       drv_data[cpu] = NULL;
+       per_cpu(drv_data, cpu) = NULL;
 
        return result;
 }
 
 static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-       struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+       struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
 
        dprintk("acpi_cpufreq_cpu_exit\n");
 
        if (data) {
                cpufreq_frequency_table_put_attr(policy->cpu);
-               drv_data[policy->cpu] = NULL;
+               per_cpu(drv_data, policy->cpu) = NULL;
                acpi_processor_unregister_performance(data->acpi_data,
                                                      policy->cpu);
                kfree(data);
@@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 
 static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
 {
-       struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+       struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
 
        dprintk("acpi_cpufreq_resume\n");
 
index 749d00cb2ebdde52ae67719ecc4514191b6c20fe..06fcce516d51d44a97349f9140507f82b6d20ef2 100644 (file)
@@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
        if ( acpi_bus_get_device(obj_handle, &d) ) {
                return 0;
        }
-       *return_value = (void *)acpi_driver_data(d);
+       *return_value = acpi_driver_data(d);
        return 1;
 }
 
index 99e1ef9939bec24bc210791eecc6f23f328c24ae..a0522735dd9d9298122b5069baabe2bc12c0cda4 100644 (file)
@@ -52,7 +52,7 @@
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
 
-static struct powernow_k8_data *powernow_data[NR_CPUS];
+static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static int cpu_family = CPU_OPTERON;
 
@@ -1018,7 +1018,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
 {
        cpumask_t oldmask = CPU_MASK_ALL;
-       struct powernow_k8_data *data = powernow_data[pol->cpu];
+       struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
        u32 checkfid;
        u32 checkvid;
        unsigned int newstate;
@@ -1094,7 +1094,7 @@ err_out:
 /* Driver entry point to verify the policy and range of frequencies */
 static int powernowk8_verify(struct cpufreq_policy *pol)
 {
-       struct powernow_k8_data *data = powernow_data[pol->cpu];
+       struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
 
        if (!data)
                return -EINVAL;
@@ -1202,7 +1202,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
                dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
                        data->currfid, data->currvid);
 
-       powernow_data[pol->cpu] = data;
+       per_cpu(powernow_data, pol->cpu) = data;
 
        return 0;
 
@@ -1216,7 +1216,7 @@ err_out:
 
 static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
 {
-       struct powernow_k8_data *data = powernow_data[pol->cpu];
+       struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
 
        if (!data)
                return -EINVAL;
@@ -1237,7 +1237,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
        cpumask_t oldmask = current->cpus_allowed;
        unsigned int khz = 0;
 
-       data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))];
+       data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu)));
 
        if (!data)
                return -EINVAL;
index 88d66fb8411d183cc5c96091b4722c165a2058bb..404a6a2d4016c790c6457b966703f8ebdc669dc2 100644 (file)
@@ -5,6 +5,7 @@
 #include <asm/dma.h>
 #include <asm/io.h>
 #include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
 #include <asm/timer.h>
 #include <asm/pci-direct.h>
 #include <asm/tsc.h>
@@ -126,15 +127,12 @@ static void __cpuinit set_cx86_reorder(void)
 
 static void __cpuinit set_cx86_memwb(void)
 {
-       u32 cr0;
-
        printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
 
        /* CCR2 bit 2: unlock NW bit */
        setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
        /* set 'Not Write-through' */
-       cr0 = 0x20000000;
-       write_cr0(read_cr0() | cr0);
+       write_cr0(read_cr0() | X86_CR0_NW);
        /* CCR2 bit 2: lock NW bit and set WT1 */
        setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
 }
index cc8c501b9f396d2fb0d63804e7a867875141a2be..d1c372b018dbe299fde8ae3c5bcfc21961929e84 100644 (file)
@@ -11,6 +11,8 @@
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/ds.h>
 
 #include "cpu.h"
 
 struct movsl_mask movsl_mask __read_mostly;
 #endif
 
-void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c)
+void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
-       if (c->x86_vendor != X86_VENDOR_INTEL)
-               return;
        /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
        if (c->x86 == 15 && c->x86_cache_alignment == 64)
                c->x86_cache_alignment = 128;
+       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+               (c->x86 == 0x6 && c->x86_model >= 0x0e))
+               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 }
 
 /*
@@ -113,6 +116,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
        unsigned int l2 = 0;
        char *p = NULL;
 
+       early_init_intel(c);
+
 #ifdef CONFIG_X86_F00F_BUG
        /*
         * All current models of Pentium and Pentium with MMX technology CPUs
@@ -132,7 +137,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
        }
 #endif
 
-       select_idle_routine(c);
        l2 = init_intel_cacheinfo(c);
        if (c->cpuid_level > 9 ) {
                unsigned eax = cpuid_eax(10);
@@ -201,16 +205,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
        }
 #endif
 
+       if (cpu_has_xmm2)
+               set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability);
        if (c->x86 == 15) {
                set_bit(X86_FEATURE_P4, c->x86_capability);
-               set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
        }
        if (c->x86 == 6) 
                set_bit(X86_FEATURE_P3, c->x86_capability);
-       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-               (c->x86 == 0x6 && c->x86_model >= 0x0e))
-               set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-
        if (cpu_has_ds) {
                unsigned int l1;
                rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
@@ -219,6 +220,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                if (!(l1 & (1<<12)))
                        set_bit(X86_FEATURE_PEBS, c->x86_capability);
        }
+
+       if (cpu_has_bts)
+               ds_init_intel(c);
 }
 
 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -342,5 +346,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
 EXPORT_SYMBOL(cmpxchg_386_u32);
 #endif
 
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+       u64 prev;
+       unsigned long flags;
+
+       /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+       local_irq_save(flags);
+       prev = *(u64 *)ptr;
+       if (prev == old)
+               *(u64 *)ptr = new;
+       local_irq_restore(flags);
+       return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
 // arch_initcall(intel_cpu_init);
 
index eef63e3630c2a65330b50c9a335d629340584f5d..e633c9c2b764c3ec7829420b06b6ece9c459d455 100644 (file)
@@ -16,7 +16,7 @@
 #include "mce.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
-static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
+static void k7_machine_check(struct pt_regs * regs, long error_code)
 {
        int recover=1;
        u32 alow, ahigh, high, low;
@@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
        if (mcgstl & (1<<0))    /* Recoverable ? */
                recover=0;
 
-       printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+       printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
                smp_processor_id(), mcgsth, mcgstl);
 
-       for (i=1; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
+       for (i = 1; i < nr_mce_banks; i++) {
+               rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
                if (high&(1<<31)) {
+                       char misc[20];
+                       char addr[24];
+                       misc[0] = addr[0] = '\0';
                        if (high & (1<<29))
                                recover |= 1;
                        if (high & (1<<25))
                                recover |= 2;
-                       printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
                        high &= ~(1<<31);
                        if (high & (1<<27)) {
-                               rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                               printk ("[%08x%08x]", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+                               snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
                        }
                        if (high & (1<<26)) {
-                               rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                               printk (" at %08x%08x", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
+                               snprintf(addr, 24, " at %08x%08x", ahigh, alow);
                        }
-                       printk ("\n");
+                       printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
+                               smp_processor_id(), i, high, low, misc, addr);
                        /* Clear it */
-                       wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+                       wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
                        /* Serialize */
                        wmb();
                        add_taint(TAINT_MACHINE_CHECK);
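
The reworked k7 handler formats the optional MISC and ADDR parts into small local buffers and then prints one complete line per bank, rather than several partial printk()s that could interleave. A user-space rendering of that buffering idea (buffer sizes match the patch; everything else is illustrative):

#include <stdio.h>

static void report_bank(int bank, unsigned int high, unsigned int low,
                        int have_misc, unsigned long long misc_val,
                        int have_addr, unsigned long long addr_val)
{
        char misc[20];
        char addr[24];

        misc[0] = addr[0] = '\0';
        if (have_misc)
                snprintf(misc, sizeof(misc), "[%016llx]", misc_val);
        if (have_addr)
                snprintf(addr, sizeof(addr), " at %016llx", addr_val);

        /* One complete line instead of three partial prints. */
        printf("Bank %d: %08x%08x%s%s\n", bank, high, low, misc, addr);
}

int main(void)
{
        report_bank(1, 0xdead0000, 0xbeef, 1, 0x1234, 0, 0);
        return 0;
}
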
index 81fb6e2d35f37e2109d02a0e5b98fb1335261e4f..ae9f628838f126863443451d58352f3b158bf464 100644 (file)
@@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
 /* Call the installed machine check handler for this CPU setup. */
-extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
+extern void (*machine_check_vector)(struct pt_regs *, long error_code);
 
 extern int nr_mce_banks;
 
index 34c781eddee4cdc6fce3419366b63329965655a5..a5182dcd94ae625865f02ef84937ebcbdc0ca1a9 100644 (file)
@@ -22,13 +22,13 @@ int nr_mce_banks;
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
 /* Handle unconfigured int18 (should never happen) */
-static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code)
+static void unexpected_machine_check(struct pt_regs * regs, long error_code)
 {      
        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
 }
 
 /* Call the installed machine check handler for this CPU setup. */
-void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
index 242e8668dbeb2a6904b9b15feee00f827e099d5f..9a699ed035982ddeac48c4800478d5c8d4a0a5c4 100644 (file)
@@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  * separate MCEs from kernel messages to avoid bogus bug reports.
  */
 
-struct mce_log mcelog = {
+static struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
 };
@@ -80,7 +80,7 @@ void mce_log(struct mce *mce)
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
-                               set_bit(MCE_OVERFLOW, &mcelog.flags);
+                               set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
@@ -110,12 +110,12 @@ static void print_mce(struct mce *m)
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
-       if (m->rip) {
+       if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
-                      m->cs, m->rip);
+                      m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
-                       print_symbol("{%s}", m->rip);
+                       print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
@@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c)
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 {
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
-               m->rip = regs->rip;
+               m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
-               m->rip = 0;
+               m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
-               rdmsrl(rip_msr, m->rip);
+               rdmsrl(rip_msr, m->ip);
                m->cs = 0;
        }
 }
@@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 
        atomic_inc(&mce_entry);
 
-       if (regs)
-               notify_die(DIE_NMI, "machine check", regs, error_code, 18,
-                          SIGKILL);
-       if (!banks)
+       if ((regs
+            && notify_die(DIE_NMI, "machine check", regs, error_code,
+                          18, SIGKILL) == NOTIFY_STOP)
+           || !banks)
                goto out2;
 
        memset(&m, 0, sizeof(struct mce));
@@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
-                       user_space = panicm.rip && (panicm.cs & 3);
+                       user_space = panicm.ip && (panicm.cs & 3);
 
                /*
                 * If we know that the error was in user space, send a
@@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
 {
        unsigned long *cpu_tsc;
-       static DECLARE_MUTEX(mce_read_sem);
+       static DEFINE_MUTEX(mce_read_mutex);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;
@@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
        if (!cpu_tsc)
                return -ENOMEM;
 
-       down(&mce_read_sem);
+       mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);
 
        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-               up(&mce_read_sem);
+               mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);
                return -EINVAL;
        }
@@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
-       up(&mce_read_sem);
+       mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
 }
@@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
        return 0;
 }
 
-static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd,
-                    unsigned long arg)
+static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 {
        int __user *p = (int __user *)arg;
 
@@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = {
        .release = mce_release,
        .read = mce_read,
        .poll = mce_poll,
-       .ioctl = mce_ioctl,
+       .unlocked_ioctl = mce_ioctl,
 };
 
 static struct miscdevice mce_log_device = {
@@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu)
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
+                                     unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
 
@@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier = {
+static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
 };
 
index 753588755feeb6d76d079cefe64d39236438a86e..32671da8184e5e78c1e090d5d795606057f3cd37 100644 (file)
@@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
        unsigned int bank, block;
        unsigned int cpu = smp_processor_id();
+       u8 lvt_off;
        u32 low = 0, high = 0, address = 0;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
                        if (shared_bank[bank] && c->cpu_core_id)
                                break;
 #endif
+                       lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
+                                                      APIC_EILVT_MSG_FIX, 0);
+
                        high &= ~MASK_LVTOFF_HI;
-                       high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
+                       high |= lvt_off << 20;
                        wrmsr(address, low, high);
 
-                       setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
-                                               THRESHOLD_APIC_VECTOR,
-                                               K8_APIC_EXT_INT_MSG_FIX, 0);
-
                        threshold_defaults.address = address;
                        threshold_restart_bank(&threshold_defaults, 0, 0);
                }
@@ -450,7 +450,8 @@ recurse:
        if (err)
                goto out_free;
 
-       kobject_uevent(&b->kobj, KOBJ_ADD);
+       if (b)
+               kobject_uevent(&b->kobj, KOBJ_ADD);
 
        return err;
 
@@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
        int err = 0;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & 1 << bank))
+               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
                        continue;
                err = threshold_create_bank(cpu, bank);
                if (err)
@@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu)
        unsigned int bank;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & 1 << bank))
+               if (!(per_cpu(bank_map, cpu) & (1 << bank)))
                        continue;
                threshold_remove_bank(cpu, bank);
        }
 }
 
 /* get notified when a cpu comes on/off */
-static int threshold_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
                                            unsigned long action, void *hcpu)
 {
        /* cpu was unsigned int to begin with */
@@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block threshold_cpu_notifier = {
+static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
        .notifier_call = threshold_cpu_callback,
 };
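
The parentheses added to the bank_map tests above are purely for readability: the shift operator binds more tightly than bitwise AND in C, so both spellings select the same bit. A compile-time check with an arbitrary value:

/* 0xA & 1 << 3  parses as  0xA & (1 << 3), i.e. 8, not (0xA & 1) << 3. */
typedef char precedence_check[(0xA & 1 << 3) == (0xA & (1 << 3)) ? 1 : -1];
typedef char precedence_value[(0xA & 1 << 3) == 8 ? 1 : -1];
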
 
index be4dabfee1f5c9af40b1b61362f6e80ec1cc6ff1..cb03345554a5cfe0153ea8ad6ef0d739fc8b1583 100644 (file)
@@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
 /* Thermal interrupt handler for this CPU setup */
 static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
 
-fastcall void smp_thermal_interrupt(struct pt_regs *regs)
+void smp_thermal_interrupt(struct pt_regs *regs)
 {
        irq_enter();
        vendor_thermal_interrupt(regs);
@@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
        rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
 }
 
-static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs * regs, long error_code)
 {
        int recover=1;
        u32 alow, ahigh, high, low;
@@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
        if (mcgstl & (1<<0))    /* Recoverable ? */
                recover=0;
 
-       printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+       printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
                smp_processor_id(), mcgsth, mcgstl);
 
        if (mce_num_extended_msrs > 0) {
                struct intel_mce_extended_msrs dbg;
                intel_get_extended_msrs(&dbg);
-               printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
-                       smp_processor_id(), dbg.eip, dbg.eflags);
-               printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
-                       dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
-               printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
+               printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
+                       "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
+                       "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
+                       smp_processor_id(), dbg.eip, dbg.eflags,
+                       dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
                        dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
        }
 
-       for (i=0; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
+       for (i = 0; i < nr_mce_banks; i++) {
+               rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
                if (high & (1<<31)) {
+                       char misc[20];
+                       char addr[24];
+                       misc[0] = addr[0] = '\0';
                        if (high & (1<<29))
                                recover |= 1;
                        if (high & (1<<25))
                                recover |= 2;
-                       printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
                        high &= ~(1<<31);
                        if (high & (1<<27)) {
-                               rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                               printk ("[%08x%08x]", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+                               snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
                        }
                        if (high & (1<<26)) {
-                               rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                               printk (" at %08x%08x", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
+                               snprintf(addr, 24, " at %08x%08x", ahigh, alow);
                        }
-                       printk ("\n");
+                       printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
+                               smp_processor_id(), i, high, low, misc, addr);
                }
        }
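
The rewrite above assembles the optional MISC and ADDR fields with snprintf() into small local buffers and then emits each bank as one complete, log-level-prefixed printk() line, instead of a sequence of partial printk() calls whose fragments can interleave with output from other CPUs. A minimal user-space sketch of the same pattern (illustrative only; the register values are made up):

#include <stdio.h>

int main(void)
{
        char misc[20], addr[24];
        unsigned int high = 0xd4000000, low = 0x00001234;
        unsigned int ahigh = 0, alow = 0xdeadbeef;
        int have_misc = 1, have_addr = 1;       /* stand-ins for the status bits */

        misc[0] = addr[0] = '\0';
        if (have_misc)
                snprintf(misc, sizeof(misc), "[%08x%08x]", ahigh, alow);
        if (have_addr)
                snprintf(addr, sizeof(addr), " at %08x%08x", ahigh, alow);

        /* one complete line instead of several partial ones */
        printf("CPU 0: Bank 3: %08x%08x%s%s\n", high, low, misc, addr);
        return 0;
}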
 
index 94bc43d950cf1eb2c1e3db71940ab60e231cac2b..a18310aaae0cf4962cd7a2512e20ba8361667027 100644 (file)
@@ -16,7 +16,7 @@
 #include "mce.h"
 
 /* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code)
+static void pentium_machine_check(struct pt_regs * regs, long error_code)
 {
        u32 loaddr, hi, lotype;
        rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
index deeae42ce199b80584e752609217978a7eb19b27..74342604d30e1d3d6eb98567103d63b046f6a8f2 100644 (file)
@@ -16,7 +16,7 @@
 #include "mce.h"
 
 /* Machine Check Handler For PII/PIII */
-static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs * regs, long error_code)
 {
        int recover=1;
        u32 alow, ahigh, high, low;
@@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
        if (mcgstl & (1<<0))    /* Recoverable ? */
                recover=0;
 
-       printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
+       printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
                smp_processor_id(), mcgsth, mcgstl);
 
-       for (i=0; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
+       for (i = 0; i < nr_mce_banks; i++) {
+               rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
                if (high & (1<<31)) {
+                       char misc[20];
+                       char addr[24];
+                       misc[0] = addr[0] = '\0';
                        if (high & (1<<29))
                                recover |= 1;
                        if (high & (1<<25))
                                recover |= 2;
-                       printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
                        high &= ~(1<<31);
                        if (high & (1<<27)) {
-                               rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                               printk ("[%08x%08x]", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
+                               snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
                        }
                        if (high & (1<<26)) {
-                               rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                               printk (" at %08x%08x", ahigh, alow);
+                               rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
+                               snprintf(addr, 24, " at %08x%08x", ahigh, alow);
                        }
-                       printk ("\n");
+                       printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
+                               smp_processor_id(), i, high, low, misc, addr);
                }
        }
 
index 9e424b6c293d3fea2727a814d02cc79b729cacfc..3d428d5afc528104e8be5573cdf32c18365e7588 100644 (file)
@@ -15,7 +15,7 @@
 #include "mce.h"
 
 /* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code)
+static void winchip_machine_check(struct pt_regs * regs, long error_code)
 {
        printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
        add_taint(TAINT_MACHINE_CHECK);
index 0949cdbf848afcceff5e997f12ac882075c45876..ee2331b0e58fe1b19c6fd22ca6e01cc290b7eb23 100644 (file)
@@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base,
     <base> The base address of the region.
     <size> The size of the region. If this is 0 the region is disabled.
     <type> The type of the region.
-    <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
-    be done externally.
     [RETURNS] Nothing.
 */
 {
index 9964be3de2b7ceb2a80b26561738ab23f5b4ffed..8e139c70f888262a62535bc7c2007acd2ad5b7f6 100644 (file)
@@ -4,6 +4,7 @@
 #include <asm/msr.h>
 #include <asm/io.h>
 #include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
 #include "mtrr.h"
 
 int arr3_protected;
@@ -142,7 +143,7 @@ static void prepare_set(void)
 
        /*  Disable and flush caches. Note that wbinvd flushes the TLBs as
            a side-effect  */
-       cr0 = read_cr0() | 0x40000000;
+       cr0 = read_cr0() | X86_CR0_CD;
        wbinvd();
        write_cr0(cr0);
        wbinvd();
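
Replacing the magic constant with X86_CR0_CD makes the cache-disable sequence self-documenting; the flag comes from the newly included <asm/processor-flags.h> and has the same value, bit 30 of CR0 (the CD, cache-disable, bit):

#define X86_CR0_CD      0x40000000      /* CR0.CD: cache disable (bit 30) */
/* so  read_cr0() | X86_CR0_CD  is identical to  read_cr0() | 0x40000000 */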
index 992f08dfbb6c39d2df5f786d4310fd7d0704cb75..103d61a59b196ac5bebd57a9c3369409cba56bfa 100644 (file)
@@ -9,11 +9,12 @@
 #include <asm/msr.h>
 #include <asm/system.h>
 #include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
 #include <asm/tlbflush.h>
 #include "mtrr.h"
 
 struct mtrr_state {
-       struct mtrr_var_range *var_ranges;
+       struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
        mtrr_type fixed_ranges[NUM_FIXED_RANGES];
        unsigned char enabled;
        unsigned char have_fixed;
@@ -85,12 +86,6 @@ void __init get_mtrr_state(void)
        struct mtrr_var_range *vrs;
        unsigned lo, dummy;
 
-       if (!mtrr_state.var_ranges) {
-               mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), 
-                                               GFP_KERNEL);
-               if (!mtrr_state.var_ranges)
-                       return;
-       } 
        vrs = mtrr_state.var_ranges;
 
        rdmsr(MTRRcap_MSR, lo, dummy);
@@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void)
  * \param changed pointer which indicates whether the MTRR needed to be changed
  * \param msrwords pointer to the MSR values which the MSR should have
  */
-static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
+static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
        unsigned lo, hi;
 
@@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
                    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
                        k8_enable_fixed_iorrs();
                mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
-               *changed = TRUE;
+               *changed = true;
        }
 }
 
@@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 static int set_fixed_ranges(mtrr_type * frs)
 {
        unsigned long long *saved = (unsigned long long *) frs;
-       int changed = FALSE;
+       bool changed = false;
        int block=-1, range;
 
        while (fixed_range_blocks[++block].ranges)
@@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs)
 
 /*  Set the MSR pair relating to a var range. Returns TRUE if
     changes are made  */
-static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
+static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
 {
        unsigned int lo, hi;
-       int changed = FALSE;
+       bool changed = false;
 
        rdmsr(MTRRphysBase_MSR(index), lo, hi);
        if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
            || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
                (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
                mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
-               changed = TRUE;
+               changed = true;
        }
 
        rdmsr(MTRRphysMask_MSR(index), lo, hi);
@@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
            || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
                (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
                mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
-               changed = TRUE;
+               changed = true;
        }
        return changed;
 }
@@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
        spin_lock(&set_atomicity_lock);
 
        /*  Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
-       cr0 = read_cr0() | 0x40000000;  /* set CD flag */
+       cr0 = read_cr0() | X86_CR0_CD;
        write_cr0(cr0);
        wbinvd();
 
@@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
     <base> The base address of the region.
     <size> The size of the region. If this is 0 the region is disabled.
     <type> The type of the region.
-    <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
-    be done externally.
     [RETURNS] Nothing.
 */
 {
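
The generic MTRR code above switches its change-tracking from private TRUE/FALSE macros to the kernel's bool type, replaces the kmalloc'd var_ranges array with a fixed MAX_VAR_RANGES array (removing an allocation that could fail during early boot), and keeps its usual compare-then-write discipline: an MSR pair is rewritten only when the desired value differs from what is already there, and the bool result reports whether anything changed. A minimal standalone sketch of that discipline (illustrative only; the MSR accessors are stand-ins over a fake register):

#include <stdbool.h>
#include <stdio.h>

static unsigned int fake_msr[2] = { 0x11, 0x22 };       /* pretend MSR pair */

static void rd(unsigned int *lo, unsigned int *hi) { *lo = fake_msr[0]; *hi = fake_msr[1]; }
static void wr(unsigned int lo, unsigned int hi)   { fake_msr[0] = lo; fake_msr[1] = hi; }

static bool set_if_different(unsigned int want_lo, unsigned int want_hi)
{
        unsigned int lo, hi;
        bool changed = false;

        rd(&lo, &hi);
        if (lo != want_lo || hi != want_hi) {
                wr(want_lo, want_hi);   /* touch the register only when needed */
                changed = true;
        }
        return changed;
}

int main(void)
{
        printf("first  call changed: %d\n", set_if_different(0x33, 0x44));  /* 1 */
        printf("second call changed: %d\n", set_if_different(0x33, 0x44));  /* 0 */
        return 0;
}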
index c7d8f175674525543baa11f712b1241eeb364d67..91e150acb46c157b96b4910542def2f9855742af 100644 (file)
 #include <asm/mtrr.h>
 #include "mtrr.h"
 
-/* RED-PEN: this is accessed without any locking */
-extern unsigned int *usage_table;
-
-
 #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
 
 static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x)
 
 static int
 mtrr_file_add(unsigned long base, unsigned long size,
-             unsigned int type, char increment, struct file *file, int page)
+             unsigned int type, bool increment, struct file *file, int page)
 {
        int reg, max;
        unsigned int *fcount = FILE_FCOUNT(file); 
@@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
                base >>= PAGE_SHIFT;
                size >>= PAGE_SHIFT;
        }
-       reg = mtrr_add_page(base, size, type, 1);
+       reg = mtrr_add_page(base, size, type, true);
        if (reg >= 0)
                ++fcount[reg];
        return reg;
@@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
                size >>= PAGE_SHIFT;
                err =
                    mtrr_add_page((unsigned long) base, (unsigned long) size, i,
-                                 1);
+                                 true);
                if (err < 0)
                        return err;
                return len;
@@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                err =
-                   mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
+                   mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
                                  file, 0);
                break;
        case MTRRIOC_SET_ENTRY:
@@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 #endif
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
-               err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
+               err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
                break;
        case MTRRIOC_DEL_ENTRY:
 #ifdef CONFIG_COMPAT
@@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                err =
-                   mtrr_file_add(sentry.base, sentry.size, sentry.type, 1,
+                   mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
                                  file, 1);
                break;
        case MTRRIOC_SET_PAGE_ENTRY:
@@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
 #endif
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
-               err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
+               err =
+                   mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
                break;
        case MTRRIOC_DEL_PAGE_ENTRY:
 #ifdef CONFIG_COMPAT
@@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
        for (i = 0; i < max; i++) {
                mtrr_if->get(i, &base, &size, &type);
                if (size == 0)
-                       usage_table[i] = 0;
+                       mtrr_usage_table[i] = 0;
                else {
                        if (size < (0x100000 >> PAGE_SHIFT)) {
                                /* less than 1MB */
@@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
                        len += seq_printf(seq, 
                                   "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
                             i, base, base >> (20 - PAGE_SHIFT), size, factor,
-                            mtrr_attrib_to_str(type), usage_table[i]);
+                            mtrr_attrib_to_str(type), mtrr_usage_table[i]);
                }
        }
        return 0;
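
The ioctl and /proc paths above now pass true/false for the usage-count increment, matching the new bool parameter of mtrr_add()/mtrr_add_page(), and the shared usage_table becomes the fixed-size mtrr_usage_table. A hedged fragment of how a typical in-kernel caller would use the updated signature (not standalone; base and size are made-up values for a hypothetical write-combining aperture):

#include <linux/kernel.h>
#include <asm/mtrr.h>

static int example_reg;

static void example_enable_wc(void)
{
        example_reg = mtrr_add(0xf8000000UL, 0x400000UL, MTRR_TYPE_WRCOMB, true);
        if (example_reg < 0)
                printk(KERN_WARNING "example: mtrr_add failed: %d\n", example_reg);
}

static void example_disable_wc(void)
{
        if (example_reg >= 0)
                mtrr_del(example_reg, 0xf8000000UL, 0x400000UL);
}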
index beb45c9c08357911a78af58f24640556b0a19240..715919582657564a32f2941947687d87c448f722 100644 (file)
@@ -38,8 +38,8 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 
+#include <asm/e820.h>
 #include <asm/mtrr.h>
-
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
@@ -47,7 +47,7 @@
 
 u32 num_var_ranges = 0;
 
-unsigned int *usage_table;
+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 
 u64 size_or_mask, size_and_mask;
@@ -121,13 +121,8 @@ static void __init init_table(void)
        int i, max;
 
        max = num_var_ranges;
-       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
-           == NULL) {
-               printk(KERN_ERR "mtrr: could not allocate\n");
-               return;
-       }
        for (i = 0; i < max; i++)
-               usage_table[i] = 1;
+               mtrr_usage_table[i] = 1;
 }
 
 struct set_mtrr_data {
@@ -311,7 +306,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
  */
 
 int mtrr_add_page(unsigned long base, unsigned long size, 
-                 unsigned int type, char increment)
+                 unsigned int type, bool increment)
 {
        int i, replace, error;
        mtrr_type ltype;
@@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
                        goto out;
                }
                if (increment)
-                       ++usage_table[i];
+                       ++mtrr_usage_table[i];
                error = i;
                goto out;
        }
@@ -391,13 +386,15 @@ int mtrr_add_page(unsigned long base, unsigned long size,
        i = mtrr_if->get_free_region(base, size, replace);
        if (i >= 0) {
                set_mtrr(i, base, size, type);
-               if (likely(replace < 0))
-                       usage_table[i] = 1;
-               else {
-                       usage_table[i] = usage_table[replace] + !!increment;
+               if (likely(replace < 0)) {
+                       mtrr_usage_table[i] = 1;
+               } else {
+                       mtrr_usage_table[i] = mtrr_usage_table[replace];
+                       if (increment)
+                               mtrr_usage_table[i]++;
                        if (unlikely(replace != i)) {
                                set_mtrr(replace, 0, 0, 0);
-                               usage_table[replace] = 0;
+                               mtrr_usage_table[replace] = 0;
                        }
                }
        } else
@@ -460,7 +457,7 @@ static int mtrr_check(unsigned long base, unsigned long size)
 
 int
 mtrr_add(unsigned long base, unsigned long size, unsigned int type,
-        char increment)
+        bool increment)
 {
        if (mtrr_check(base, size))
                return -EINVAL;
@@ -527,11 +524,11 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
                printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
                goto out;
        }
-       if (usage_table[reg] < 1) {
+       if (mtrr_usage_table[reg] < 1) {
                printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
                goto out;
        }
-       if (--usage_table[reg] < 1)
+       if (--mtrr_usage_table[reg] < 1)
                set_mtrr(reg, 0, 0, 0);
        error = reg;
  out:
@@ -591,16 +588,11 @@ struct mtrr_value {
        unsigned long   lsize;
 };
 
-static struct mtrr_value * mtrr_state;
+static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
        int i;
-       int size = num_var_ranges * sizeof(struct mtrr_value);
-
-       mtrr_state = kzalloc(size,GFP_ATOMIC);
-       if (!mtrr_state)
-               return -ENOMEM;
 
        for (i = 0; i < num_var_ranges; i++) {
                mtrr_if->get(i,
@@ -622,7 +614,6 @@ static int mtrr_restore(struct sys_device * sysdev)
                                 mtrr_state[i].lsize,
                                 mtrr_state[i].ltype);
        }
-       kfree(mtrr_state);
        return 0;
 }
 
@@ -633,6 +624,112 @@ static struct sysdev_driver mtrr_sysdev_driver = {
        .resume         = mtrr_restore,
 };
 
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+       disable_mtrr_trim = 1;
+       return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check whether the MTRRs below 4GB (where the magic bit
+ * doesn't apply) are wrong, but so far we know of no such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+static __init int amd_special_default_mtrr(void)
+{
+       u32 l, h;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               return 0;
+       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+               return 0;
+       /* In case some hypervisor doesn't pass SYSCFG through */
+       if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+               return 0;
+       /*
+        * Memory between 4GB and top of mem is forced WB by this magic bit.
+        * Reserved before K8RevF, but should be zero there.
+        */
+       if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+                (Tom2Enabled | Tom2ForceMemTypeWB))
+               return 1;
+       return 0;
+}
+
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ *
+ * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
+ * memory configurations.  This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs with a write-back type cover
+ * all of the memory the kernel intends to use.  If not, it trims any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, and warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+       unsigned long i, base, size, highest_addr = 0, def, dummy;
+       mtrr_type type;
+       u64 trim_start, trim_size;
+
+       /*
+        * Make sure we only trim uncachable memory on machines that
+        * support the Intel MTRR architecture:
+        */
+       if (!is_cpu(INTEL) || disable_mtrr_trim)
+               return 0;
+       rdmsr(MTRRdefType_MSR, def, dummy);
+       def &= 0xff;
+       if (def != MTRR_TYPE_UNCACHABLE)
+               return 0;
+
+       if (amd_special_default_mtrr())
+               return 0;
+
+       /* Find highest cached pfn */
+       for (i = 0; i < num_var_ranges; i++) {
+               mtrr_if->get(i, &base, &size, &type);
+               if (type != MTRR_TYPE_WRBACK)
+                       continue;
+               base <<= PAGE_SHIFT;
+               size <<= PAGE_SHIFT;
+               if (highest_addr < base + size)
+                       highest_addr = base + size;
+       }
+
+       /* kvm/qemu doesn't have mtrr set right, don't trim them all */
+       if (!highest_addr) {
+               printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
+               WARN_ON(1);
+               return 0;
+       }
+
+       if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
+               printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+                       " all of memory, losing %LdMB of RAM.\n",
+                       (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20);
+
+               WARN_ON(1);
+
+               printk(KERN_INFO "update e820 for mtrr\n");
+               trim_start = highest_addr;
+               trim_size = end_pfn;
+               trim_size <<= PAGE_SHIFT;
+               trim_size -= trim_start;
+               add_memory_region(trim_start, trim_size, E820_RESERVED);
+               update_e820();
+               return 1;
+       }
+
+       return 0;
+}
 
 /**
  * mtrr_bp_init - initialize mtrrs on the boot CPU
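
The new mtrr_trim_uncached_memory() compares the highest address covered by a write-back variable MTRR with end_pfn and, when coverage falls short, reserves the uncovered tail through add_memory_region()/update_e820() so the kernel never allocates from effectively-uncached RAM. A small standalone sketch of the arithmetic, with made-up numbers (4 KiB pages, PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* hypothetical machine: 4 GiB of RAM, WB MTRR coverage stops at 3.5 GiB */
        unsigned long end_pfn = 0x100000;                  /* 4 GiB / 4 KiB */
        unsigned long long highest_addr = 0xe0000000ULL;   /* 3.5 GiB       */
        unsigned long long trim_start, trim_size;

        if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
                trim_start = highest_addr;
                trim_size  = ((unsigned long long)end_pfn << PAGE_SHIFT) - trim_start;
                printf("trimming %lluMB starting at %#llx\n",
                       trim_size >> 20, trim_start);       /* 512MB at 0xe0000000 */
        }
        return 0;
}

The disable_mtrr_trim boot parameter introduced above skips this check entirely, and amd_special_default_mtrr() exempts K8-family CPUs whose Tom2ForceMemTypeWB bit already forces write-back above 4GB.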
index 289dfe6030e3a4b89d8615bfd89ff2abd52fc7fa..fb74a2c2081487bfd7207f6d4b620064cd67c4c8 100644 (file)
@@ -2,10 +2,8 @@
  * local mtrr defines.
  */
 
-#ifndef TRUE
-#define TRUE  1
-#define FALSE 0
-#endif
+#include <linux/types.h>
+#include <linux/stddef.h>
 
 #define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
@@ -14,6 +12,7 @@
 #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
 
 #define NUM_FIXED_RANGES 88
+#define MAX_VAR_RANGES 256
 #define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
@@ -34,6 +33,8 @@
    an 8 bit field: */
 typedef u8 mtrr_type;
 
+extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+
 struct mtrr_ops {
        u32     vendor;
        u32     use_intel_if;
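
MAX_VAR_RANGES bounds the new static bookkeeping arrays (mtrr_usage_table here, plus the var_ranges and suspend-state arrays in the .c files). The variable-range count reported by the CPU lives in the low 8 bits of the MTRRcap MSR, so it can never exceed 255 and the 256-entry bound is safe. A hedged fragment showing how that count is typically derived (the actual kernel helper is not part of this diff):

        unsigned int lo, dummy;

        rdmsr(MTRRcap_MSR, lo, dummy);
        num_var_ranges = lo & 0xff;     /* VCNT, bits 7:0 of MTRRcap */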
index 49e20c2afcdf7b4f4e24dc523315f7cb0a1970b0..9f8ba923d1c973c138f0fad5f5b9a5d31063e0a6 100644 (file)
@@ -4,6 +4,7 @@
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 #include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
 #include "mtrr.h"
 
 
@@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
 
                /*  Disable and flush caches. Note that wbinvd flushes the TLBs as
                    a side-effect  */
-               cr0 = read_cr0() | 0x40000000;
+               cr0 = read_cr0() | X86_CR0_CD;
                wbinvd();
                write_cr0(cr0);
                wbinvd();
index c02541e6e653f72ae1856df090b0719041cd2664..9b838324b818f73e6aabea5dbca893c790827c7d 100644 (file)
@@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr)
        clear_bit(counter, evntsel_nmi_owner);
 }
 
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
 EXPORT_SYMBOL(reserve_perfctr_nmi);
 EXPORT_SYMBOL(release_perfctr_nmi);
index 3900e46d66db4c91ea8b898504af963dc3ff4df4..028213260148d62b75f6d1804aa0977cfec6395f 100644 (file)
@@ -188,7 +188,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 static void c_stop(struct seq_file *m, void *v)
 {
 }
-struct seq_operations cpuinfo_op = {
+const struct seq_operations cpuinfo_op = {
        .start  = c_start,
        .next   = c_next,
        .stop   = c_stop,
index d387c770c518e179413f4d2f891cbb86403ce45b..dec66e4528100c72ee384f7c5aa9400339eb26ab 100644 (file)
@@ -50,7 +50,7 @@ struct cpuid_command {
 
 static void cpuid_smp_cpuid(void *cmd_block)
 {
-       struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
+       struct cpuid_command *cmd = cmd_block;
 
        cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
                      &cmd->data[3]);
index 40978af630e7f2a61d0170add5fbe0506052b6da..a47798b59f07e82205cc6f24d2e4ccc5f0cb6db7 100644 (file)
@@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
 
 static void doublefault_fn(void)
 {
-       struct Xgt_desc_struct gdt_desc = {0, 0};
+       struct desc_ptr gdt_desc = {0, 0};
        unsigned long gdt, tss;
 
        store_gdt(&gdt_desc);
@@ -33,14 +33,15 @@ static void doublefault_fn(void)
                printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
 
                if (ptr_ok(tss)) {
-                       struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
+                       struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
 
-                       printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);
+                       printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
+                              t->ip, t->sp);
 
                        printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
-                               t->eax, t->ebx, t->ecx, t->edx);
+                               t->ax, t->bx, t->cx, t->dx);
                        printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
-                               t->esi, t->edi);
+                               t->si, t->di);
                }
        }
 
@@ -50,15 +51,15 @@ static void doublefault_fn(void)
 
 struct tss_struct doublefault_tss __cacheline_aligned = {
        .x86_tss = {
-               .esp0           = STACK_START,
+               .sp0            = STACK_START,
                .ss0            = __KERNEL_DS,
                .ldt            = 0,
                .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
 
-               .eip            = (unsigned long) doublefault_fn,
+               .ip             = (unsigned long) doublefault_fn,
                /* 0x2 bit is always set */
-               .eflags         = X86_EFLAGS_SF | 0x2,
-               .esp            = STACK_START,
+               .flags          = X86_EFLAGS_SF | 0x2,
+               .sp             = STACK_START,
                .es             = __USER_DS,
                .cs             = __KERNEL_CS,
                .ss             = __KERNEL_DS,
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
new file mode 100644 (file)
index 0000000..1c5ca4d
--- /dev/null
@@ -0,0 +1,464 @@
+/*
+ * Debug Store support
+ *
+ * This provides a low-level interface to the hardware's Debug Store
+ * feature that is used for last branch recording (LBR) and
+ * precise-event based sampling (PEBS).
+ *
+ * Different architectures use a different DS layout/pointer size.
+ * The below functions therefore work on a void*.
+ *
+ *
+ * Since there is no user for PEBS yet, only LBR (or branch
+ * trace store, BTS) is supported.
+ *
+ *
+ * Copyright (C) 2007 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ */
+
+#include <asm/ds.h>
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+
+/*
+ * Debug Store (DS) save area configuration (see Intel64 and IA32
+ * Architectures Software Developer's Manual, section 18.5)
+ *
+ * The DS configuration consists of the following fields; different
+ * architectures vary in the size of those fields.
+ * - double-word aligned base linear address of the BTS buffer
+ * - write pointer into the BTS buffer
+ * - end linear address of the BTS buffer (one byte beyond the end of
+ *   the buffer)
+ * - interrupt pointer into BTS buffer
+ *   (interrupt occurs when write pointer passes interrupt pointer)
+ * - double-word aligned base linear address of the PEBS buffer
+ * - write pointer into the PEBS buffer
+ * - end linear address of the PEBS buffer (one byte beyond the end of
+ *   the buffer)
+ * - interrupt pointer into PEBS buffer
+ *   (interrupt occurs when write pointer passes interrupt pointer)
+ * - value to which counter is reset following counter overflow
+ *
+ * On later architectures, the last branch recording hardware uses
+ * 64bit pointers even in 32bit mode.
+ *
+ *
+ * Branch Trace Store (BTS) records store information about control
+ * flow changes. They at least provide the following information:
+ * - source linear address
+ * - destination linear address
+ *
+ * Netburst supported a predicated bit that had been dropped in later
+ * architectures. We do not support it.
+ *
+ *
+ * In order to abstract from the actual DS and BTS layout, we describe
+ * the access to the relevant fields.
+ * Thanks to Andi Kleen for proposing this design.
+ *
+ * The implementation, however, is not as general as it might seem. In
+ * order to stay somewhat simple and efficient, we assume an
+ * underlying unsigned type (mostly a pointer type) and we expect the
+ * field to be at least as big as that type.
+ */
+
+/*
+ * A special from_ip address to indicate that the BTS record is an
+ * info record that needs to be interpreted or skipped.
+ */
+#define BTS_ESCAPE_ADDRESS (-1)
+
+/*
+ * A field access descriptor
+ */
+struct access_desc {
+       unsigned char offset;
+       unsigned char size;
+};
+
+/*
+ * The configuration for a particular DS/BTS hardware implementation.
+ */
+struct ds_configuration {
+       /* the DS configuration */
+       unsigned char  sizeof_ds;
+       struct access_desc bts_buffer_base;
+       struct access_desc bts_index;
+       struct access_desc bts_absolute_maximum;
+       struct access_desc bts_interrupt_threshold;
+       /* the BTS configuration */
+       unsigned char  sizeof_bts;
+       struct access_desc from_ip;
+       struct access_desc to_ip;
+       /* BTS variants used to store additional information like
+          timestamps */
+       struct access_desc info_type;
+       struct access_desc info_data;
+       unsigned long debugctl_mask;
+};
+
+/*
+ * The global configuration used by the below accessor functions
+ */
+static struct ds_configuration ds_cfg;
+
+/*
+ * Accessor functions for some DS and BTS fields using the above
+ * global ptrace_bts_cfg.
+ */
+static inline unsigned long get_bts_buffer_base(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
+}
+static inline void set_bts_buffer_base(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
+}
+static inline unsigned long get_bts_index(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.bts_index.offset);
+}
+static inline void set_bts_index(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
+}
+static inline unsigned long get_bts_absolute_maximum(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
+}
+static inline void set_bts_absolute_maximum(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+}
+static inline unsigned long get_bts_interrupt_threshold(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
+}
+static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+}
+static inline unsigned long get_from_ip(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.from_ip.offset);
+}
+static inline void set_from_ip(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
+}
+static inline unsigned long get_to_ip(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.to_ip.offset);
+}
+static inline void set_to_ip(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
+}
+static inline unsigned char get_info_type(char *base)
+{
+       return *(unsigned char *)(base + ds_cfg.info_type.offset);
+}
+static inline void set_info_type(char *base, unsigned char value)
+{
+       (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
+}
+static inline unsigned long get_info_data(char *base)
+{
+       return *(unsigned long *)(base + ds_cfg.info_data.offset);
+}
+static inline void set_info_data(char *base, unsigned long value)
+{
+       (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
+}
+
+
+int ds_allocate(void **dsp, size_t bts_size_in_bytes)
+{
+       size_t bts_size_in_records;
+       unsigned long bts;
+       void *ds;
+
+       if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+               return -EOPNOTSUPP;
+
+       if (bts_size_in_bytes < 0)
+               return -EINVAL;
+
+       bts_size_in_records =
+               bts_size_in_bytes / ds_cfg.sizeof_bts;
+       bts_size_in_bytes =
+               bts_size_in_records * ds_cfg.sizeof_bts;
+
+       if (bts_size_in_bytes <= 0)
+               return -EINVAL;
+
+       bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
+
+       if (!bts)
+               return -ENOMEM;
+
+       ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+
+       if (!ds) {
+               kfree((void *)bts);
+               return -ENOMEM;
+       }
+
+       set_bts_buffer_base(ds, bts);
+       set_bts_index(ds, bts);
+       set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
+       set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+
+       *dsp = ds;
+       return 0;
+}
+
+int ds_free(void **dsp)
+{
+       if (*dsp)
+               kfree((void *)get_bts_buffer_base(*dsp));
+       kfree(*dsp);
+       *dsp = 0;
+
+       return 0;
+}
+
+int ds_get_bts_size(void *ds)
+{
+       int size_in_bytes;
+
+       if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+               return -EOPNOTSUPP;
+
+       if (!ds)
+               return 0;
+
+       size_in_bytes =
+               get_bts_absolute_maximum(ds) -
+               get_bts_buffer_base(ds);
+       return size_in_bytes;
+}
+
+int ds_get_bts_end(void *ds)
+{
+       int size_in_bytes = ds_get_bts_size(ds);
+
+       if (size_in_bytes <= 0)
+               return size_in_bytes;
+
+       return size_in_bytes / ds_cfg.sizeof_bts;
+}
+
+int ds_get_bts_index(void *ds)
+{
+       int index_offset_in_bytes;
+
+       if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+               return -EOPNOTSUPP;
+
+       index_offset_in_bytes =
+               get_bts_index(ds) -
+               get_bts_buffer_base(ds);
+
+       return index_offset_in_bytes / ds_cfg.sizeof_bts;
+}
+
+int ds_set_overflow(void *ds, int method)
+{
+       switch (method) {
+       case DS_O_SIGNAL:
+               return -EOPNOTSUPP;
+       case DS_O_WRAP:
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
+int ds_get_overflow(void *ds)
+{
+       return DS_O_WRAP;
+}
+
+int ds_clear(void *ds)
+{
+       int bts_size = ds_get_bts_size(ds);
+       unsigned long bts_base;
+
+       if (bts_size <= 0)
+               return bts_size;
+
+       bts_base = get_bts_buffer_base(ds);
+       memset((void *)bts_base, 0, bts_size);
+
+       set_bts_index(ds, bts_base);
+       return 0;
+}
+
+int ds_read_bts(void *ds, int index, struct bts_struct *out)
+{
+       void *bts;
+
+       if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+               return -EOPNOTSUPP;
+
+       if (index < 0)
+               return -EINVAL;
+
+       if (index >= ds_get_bts_size(ds))
+               return -EINVAL;
+
+       bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
+
+       memset(out, 0, sizeof(*out));
+       if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
+               out->qualifier       = get_info_type(bts);
+               out->variant.jiffies = get_info_data(bts);
+       } else {
+               out->qualifier = BTS_BRANCH;
+               out->variant.lbr.from_ip = get_from_ip(bts);
+               out->variant.lbr.to_ip   = get_to_ip(bts);
+       }
+
+       return sizeof(*out);
+}
+
+int ds_write_bts(void *ds, const struct bts_struct *in)
+{
+       unsigned long bts;
+
+       if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
+               return -EOPNOTSUPP;
+
+       if (ds_get_bts_size(ds) <= 0)
+               return -ENXIO;
+
+       bts = get_bts_index(ds);
+
+       memset((void *)bts, 0, ds_cfg.sizeof_bts);
+       switch (in->qualifier) {
+       case BTS_INVALID:
+               break;
+
+       case BTS_BRANCH:
+               set_from_ip((void *)bts, in->variant.lbr.from_ip);
+               set_to_ip((void *)bts, in->variant.lbr.to_ip);
+               break;
+
+       case BTS_TASK_ARRIVES:
+       case BTS_TASK_DEPARTS:
+               set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
+               set_info_type((void *)bts, in->qualifier);
+               set_info_data((void *)bts, in->variant.jiffies);
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       bts = bts + ds_cfg.sizeof_bts;
+       if (bts >= get_bts_absolute_maximum(ds))
+               bts = get_bts_buffer_base(ds);
+       set_bts_index(ds, bts);
+
+       return ds_cfg.sizeof_bts;
+}
+
+unsigned long ds_debugctl_mask(void)
+{
+       return ds_cfg.debugctl_mask;
+}
+
+#ifdef __i386__
+static const struct ds_configuration ds_cfg_netburst = {
+       .sizeof_ds = 9 * 4,
+       .bts_buffer_base = { 0, 4 },
+       .bts_index = { 4, 4 },
+       .bts_absolute_maximum = { 8, 4 },
+       .bts_interrupt_threshold = { 12, 4 },
+       .sizeof_bts = 3 * 4,
+       .from_ip = { 0, 4 },
+       .to_ip = { 4, 4 },
+       .info_type = { 4, 1 },
+       .info_data = { 8, 4 },
+       .debugctl_mask = (1<<2)|(1<<3)
+};
+
+static const struct ds_configuration ds_cfg_pentium_m = {
+       .sizeof_ds = 9 * 4,
+       .bts_buffer_base = { 0, 4 },
+       .bts_index = { 4, 4 },
+       .bts_absolute_maximum = { 8, 4 },
+       .bts_interrupt_threshold = { 12, 4 },
+       .sizeof_bts = 3 * 4,
+       .from_ip = { 0, 4 },
+       .to_ip = { 4, 4 },
+       .info_type = { 4, 1 },
+       .info_data = { 8, 4 },
+       .debugctl_mask = (1<<6)|(1<<7)
+};
+#endif /* __i386__ */
+
+static const struct ds_configuration ds_cfg_core2 = {
+       .sizeof_ds = 9 * 8,
+       .bts_buffer_base = { 0, 8 },
+       .bts_index = { 8, 8 },
+       .bts_absolute_maximum = { 16, 8 },
+       .bts_interrupt_threshold = { 24, 8 },
+       .sizeof_bts = 3 * 8,
+       .from_ip = { 0, 8 },
+       .to_ip = { 8, 8 },
+       .info_type = { 8, 1 },
+       .info_data = { 16, 8 },
+       .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+};
+
+static inline void
+ds_configure(const struct ds_configuration *cfg)
+{
+       ds_cfg = *cfg;
+}
+
+void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
+{
+       switch (c->x86) {
+       case 0x6:
+               switch (c->x86_model) {
+#ifdef __i386__
+               case 0xD:
+               case 0xE: /* Pentium M */
+                       ds_configure(&ds_cfg_pentium_m);
+                       break;
+#endif /* __i386__ */
+               case 0xF: /* Core2 */
+                       ds_configure(&ds_cfg_core2);
+                       break;
+               default:
+                       /* sorry, don't know about them */
+                       break;
+               }
+               break;
+       case 0xF:
+               switch (c->x86_model) {
+#ifdef __i386__
+               case 0x0:
+               case 0x1:
+               case 0x2: /* Netburst */
+                       ds_configure(&ds_cfg_netburst);
+                       break;
+#endif /* __i386__ */
+               default:
+                       /* sorry, don't know about them */
+                       break;
+               }
+               break;
+       default:
+               /* sorry, don't know about them */
+               break;
+       }
+}
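
The new ds.c hides the architecture-dependent Debug Store layout behind ds_configuration: every accessor adds a per-field byte offset from that table and casts, so the same code serves the 4-byte Netburst/Pentium M layout and the 8-byte Core2 layout selected in ds_init_intel(). A minimal standalone sketch of the offset-descriptor idea (illustrative only; the field name and offset are made up):

#include <stdio.h>
#include <string.h>

struct access_desc { unsigned char offset; unsigned char size; };

/* pretend configuration: one pointer-sized field at byte offset 8 */
static struct access_desc index_desc = { 8, sizeof(unsigned long) };

static unsigned long get_index(char *base)
{
        return *(unsigned long *)(base + index_desc.offset);
}

static void set_index(char *base, unsigned long value)
{
        *(unsigned long *)(base + index_desc.offset) = value;
}

int main(void)
{
        char ds[64];

        memset(ds, 0, sizeof(ds));
        set_index(ds, 0x1000);
        printf("index = %#lx\n", get_index(ds));        /* prints 0x1000 */
        return 0;
}

A typical round trip through the interface defined above would be ds_allocate() to set up the buffers, ds_write_bts()/ds_read_bts() to append and fetch records (wrapping at the end of the buffer, since only DS_O_WRAP overflow handling is implemented), and ds_free() to tear everything down.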
index 18f500d185a2ae182b38852d51729a32995faee4..4e16ef4a2659f38f18d613289527f415d9ad92f1 100644 (file)
@@ -7,7 +7,6 @@
 #include <linux/kexec.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/efi.h>
 #include <linux/pfn.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-#ifdef CONFIG_EFI
-int efi_enabled = 0;
-EXPORT_SYMBOL(efi_enabled);
-#endif
-
 struct e820map e820;
 struct change_member {
        struct e820entry *pbios; /* pointer to original bios entry */
@@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000;
 EXPORT_SYMBOL(pci_mem_start);
 #endif
 extern int user_defined_memmap;
-struct resource data_resource = {
-       .name   = "Kernel data",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource code_resource = {
-       .name   = "Kernel code",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource bss_resource = {
-       .name   = "Kernel bss",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
 
 static struct resource system_rom_resource = {
        .name   = "System ROM",
@@ -111,60 +85,6 @@ static struct resource video_rom_resource = {
        .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
 };
 
-static struct resource video_ram_resource = {
-       .name   = "Video RAM area",
-       .start  = 0xa0000,
-       .end    = 0xbffff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-static struct resource standard_io_resources[] = { {
-       .name   = "dma1",
-       .start  = 0x0000,
-       .end    = 0x001f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "pic1",
-       .start  = 0x0020,
-       .end    = 0x0021,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "timer0",
-       .start  = 0x0040,
-       .end    = 0x0043,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "timer1",
-       .start  = 0x0050,
-       .end    = 0x0053,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "keyboard",
-       .start  = 0x0060,
-       .end    = 0x006f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "dma page reg",
-       .start  = 0x0080,
-       .end    = 0x008f,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "pic2",
-       .start  = 0x00a0,
-       .end    = 0x00a1,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "dma2",
-       .start  = 0x00c0,
-       .end    = 0x00df,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-}, {
-       .name   = "fpu",
-       .start  = 0x00f0,
-       .end    = 0x00ff,
-       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
-} };
-
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
@@ -260,10 +180,9 @@ static void __init probe_roms(void)
  * Request address space for all standard RAM and ROM resources
  * and also for regions reported as reserved by the e820.
  */
-static void __init
-legacy_init_iomem_resources(struct resource *code_resource,
-                           struct resource *data_resource,
-                           struct resource *bss_resource)
+void __init init_iomem_resources(struct resource *code_resource,
+               struct resource *data_resource,
+               struct resource *bss_resource)
 {
        int i;
 
@@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource,
        }
 }
 
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
-{
-       int i;
-
-       printk("Setting up standard PCI resources\n");
-       if (efi_enabled)
-               efi_initialize_iomem_resources(&code_resource,
-                               &data_resource, &bss_resource);
-       else
-               legacy_init_iomem_resources(&code_resource,
-                               &data_resource, &bss_resource);
-
-       /* EFI systems may still have VGA */
-       request_resource(&iomem_resource, &video_ram_resource);
-
-       /* request I/O space for devices used on all i[345]86 PCs */
-       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-               request_resource(&ioport_resource, &standard_io_resources[i]);
-       return 0;
-}
-
-subsys_initcall(request_standard_resources);
-
 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
 /**
  * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
@@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start,
 {
        int x;
 
-       if (!efi_enabled) {
-                       x = e820.nr_map;
-
-               if (x == E820MAX) {
-                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-                   return;
-               }
+       x = e820.nr_map;
 
-               e820.map[x].addr = start;
-               e820.map[x].size = size;
-               e820.map[x].type = type;
-               e820.nr_map++;
+       if (x == E820MAX) {
+               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+               return;
        }
+
+       e820.map[x].addr = start;
+       e820.map[x].size = size;
+       e820.map[x].type = type;
+       e820.nr_map++;
 } /* add_memory_region */
 
 /*
@@ -597,29 +485,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
        return 0;
 }
 
-/*
- * Callback for efi_memory_walk.
- */
-static int __init
-efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
-{
-       unsigned long *max_pfn = arg, pfn;
-
-       if (start < end) {
-               pfn = PFN_UP(end -1);
-               if (pfn > *max_pfn)
-                       *max_pfn = pfn;
-       }
-       return 0;
-}
-
-static int __init
-efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
-{
-       memory_present(0, PFN_UP(start), PFN_DOWN(end));
-       return 0;
-}
-
 /*
  * Find the highest page frame number we have available
  */
@@ -628,11 +493,6 @@ void __init find_max_pfn(void)
        int i;
 
        max_pfn = 0;
-       if (efi_enabled) {
-               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
-               efi_memmap_walk(efi_memory_present_wrapper, NULL);
-               return;
-       }
 
        for (i = 0; i < e820.nr_map; i++) {
                unsigned long start, end;
@@ -649,24 +509,6 @@ void __init find_max_pfn(void)
        }
 }
 
-/*
- * Free all available memory for boot time allocation.  Used
- * as a callback function by efi_memory_walk()
- */
-
-static int __init
-free_available_memory(unsigned long start, unsigned long end, void *arg)
-{
-       /* check max_low_pfn */
-       if (start >= (max_low_pfn << PAGE_SHIFT))
-               return 0;
-       if (end >= (max_low_pfn << PAGE_SHIFT))
-               end = max_low_pfn << PAGE_SHIFT;
-       if (start < end)
-               free_bootmem(start, end - start);
-
-       return 0;
-}
 /*
  * Register fully available low RAM pages with the bootmem allocator.
  */
@@ -674,10 +516,6 @@ void __init register_bootmem_low_pages(unsigned long max_low_pfn)
 {
        int i;
 
-       if (efi_enabled) {
-               efi_memmap_walk(free_available_memory, NULL);
-               return;
-       }
        for (i = 0; i < e820.nr_map; i++) {
                unsigned long curr_pfn, last_pfn, size;
                /*
@@ -785,56 +623,12 @@ void __init print_memory_map(char *who)
        }
 }
 
-static __init __always_inline void efi_limit_regions(unsigned long long size)
-{
-       unsigned long long current_addr = 0;
-       efi_memory_desc_t *md, *next_md;
-       void *p, *p1;
-       int i, j;
-
-       j = 0;
-       p1 = memmap.map;
-       for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
-               md = p;
-               next_md = p1;
-               current_addr = md->phys_addr +
-                       PFN_PHYS(md->num_pages);
-               if (is_available_memory(md)) {
-                       if (md->phys_addr >= size) continue;
-                       memcpy(next_md, md, memmap.desc_size);
-                       if (current_addr >= size) {
-                               next_md->num_pages -=
-                                       PFN_UP(current_addr-size);
-                       }
-                       p1 += memmap.desc_size;
-                       next_md = p1;
-                       j++;
-               } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
-                          EFI_MEMORY_RUNTIME) {
-                       /* In order to make runtime services
-                        * available we have to include runtime
-                        * memory regions in memory map */
-                       memcpy(next_md, md, memmap.desc_size);
-                       p1 += memmap.desc_size;
-                       next_md = p1;
-                       j++;
-               }
-       }
-       memmap.nr_map = j;
-       memmap.map_end = memmap.map +
-               (memmap.nr_map * memmap.desc_size);
-}
-
 void __init limit_regions(unsigned long long size)
 {
        unsigned long long current_addr;
        int i;
 
        print_memory_map("limit_regions start");
-       if (efi_enabled) {
-               efi_limit_regions(size);
-               return;
-       }
        for (i = 0; i < e820.nr_map; i++) {
                current_addr = e820.map[i].addr + e820.map[i].size;
                if (current_addr < size)
@@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg)
        return 0;
 }
 early_param("memmap", parse_memmap);
+void __init update_e820(void)
+{
+       u8 nr_map;
+
+       nr_map = e820.nr_map;
+       if (sanitize_e820_map(e820.map, &nr_map))
+               return;
+       e820.nr_map = nr_map;
+       printk(KERN_INFO "modified physical RAM map:\n");
+       print_memory_map("modified");
+}
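
Besides stripping the EFI-specific branches (the efi_enabled checks, the efi_memmap_walk callbacks and efi_limit_regions) out of e820_32.c, the patch adds update_e820(), which re-sanitizes e820.map after callers append entries; the MTRR trim code earlier in this commit is its first user. A fragment of the intended call sequence (not standalone; trim_start/trim_size come from the caller):

        add_memory_region(trim_start, trim_size, E820_RESERVED);
        update_e820();  /* sanitize_e820_map() plus a print of the modified map */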
index 04698e0b056c2ccc5ad9562d2290e6520d765c82..c617174e896382fe91b77a4c70b8ef2996511b92 100644 (file)
@@ -1,4 +1,4 @@
-/* 
+/*
  * Handle the memory map.
  * The functions here do the job until bootmem takes over.
  *
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
 
 struct e820map e820;
 
-/* 
+/*
  * PFN of last memory page.
  */
-unsigned long end_pfn; 
-EXPORT_SYMBOL(end_pfn);
+unsigned long end_pfn;
 
-/* 
+/*
  * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
  * The direct mapping extends to end_pfn_map, so that we can directly access
  * apertures, ACPI and other tables without having to play with fixmaps.
- */ 
-unsigned long end_pfn_map; 
+ */
+unsigned long end_pfn_map;
 
-/* 
+/*
  * Last pfn which the user wants to use.
  */
 static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
 
-extern struct resource code_resource, data_resource, bss_resource;
-
-/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
-static inline int bad_addr(unsigned long *addrp, unsigned long size)
-{ 
-       unsigned long addr = *addrp, last = addr + size; 
-
-       /* various gunk below that needed for SMP startup */
-       if (addr < 0x8000) { 
-               *addrp = PAGE_ALIGN(0x8000);
-               return 1; 
-       }
-
-       /* direct mapping tables of the kernel */
-       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
-               *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
-               return 1;
-       } 
-
-       /* initrd */ 
-#ifdef CONFIG_BLK_DEV_INITRD
-       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-               unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-               unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-               unsigned long ramdisk_end   = ramdisk_image+ramdisk_size;
-
-               if (last >= ramdisk_image && addr < ramdisk_end) {
-                       *addrp = PAGE_ALIGN(ramdisk_end);
-                       return 1;
-               }
-       } 
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+       unsigned long start, end;
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+       { 0, PAGE_SIZE },                       /* BIOS data page */
+#ifdef CONFIG_SMP
+       { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE },
 #endif
-       /* kernel code */
-       if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
-               *addrp = PAGE_ALIGN(__pa_symbol(&_end));
-               return 1;
+       {}
+};
+
+void __init reserve_early(unsigned long start, unsigned long end)
+{
+       int i;
+       struct early_res *r;
+       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+               r = &early_res[i];
+               if (end > r->start && start < r->end)
+                       panic("Overlapping early reservations %lx-%lx to %lx-%lx\n",
+                             start, end, r->start, r->end);
        }
+       if (i >= MAX_EARLY_RES)
+               panic("Too many early reservations");
+       r = &early_res[i];
+       r->start = start;
+       r->end = end;
+}
 
-       if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
-               *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
-               return 1;
+void __init early_res_to_bootmem(void)
+{
+       int i;
+       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+               struct early_res *r = &early_res[i];
+               reserve_bootmem_generic(r->start, r->end - r->start);
        }
+}
 
-#ifdef CONFIG_NUMA
-       /* NUMA memory to node map */
-       if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
-               *addrp = nodemap_addr + nodemap_size;
-               return 1;
+/* Check for already reserved areas */
+static inline int bad_addr(unsigned long *addrp, unsigned long size)
+{
+       int i;
+       unsigned long addr = *addrp, last;
+       int changed = 0;
+again:
+       last = addr + size;
+       for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+               struct early_res *r = &early_res[i];
+               if (last >= r->start && addr < r->end) {
+                       *addrp = addr = r->end;
+                       changed = 1;
+                       goto again;
+               }
        }
-#endif
-       /* XXX ramdisk image here? */ 
-       return 0;
-} 
+       return changed;
+}
 
 /*
  * This function checks if any part of the range <start,end> is mapped
@@ -107,16 +114,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
  */
 int
 e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{ 
+{
        int i;
-       for (i = 0; i < e820.nr_map; i++) { 
-               struct e820entry *ei = &e820.map[i]; 
-               if (type && ei->type != type) 
+
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+
+               if (type && ei->type != type)
                        continue;
                if (ei->addr >= end || ei->addr + ei->size <= start)
-                       continue; 
-               return 1; 
-       } 
+                       continue;
+               return 1;
+       }
        return 0;
 }
 EXPORT_SYMBOL_GPL(e820_any_mapped);
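
The rewritten bad_addr() above no longer hard-codes the individual regions early boot must avoid (the low SMP-startup area, the kernel's direct-mapping tables, the initrd, the kernel image, the EBDA, the NUMA node map); it walks the new early_res[] table, seeded with the BIOS data page and the SMP trampoline and presumably filled further by reserve_early() callers outside this hunk, and whenever the candidate range overlaps a reservation it bumps the start past it and retries until the range is clear. A minimal standalone sketch of that bump-and-retry loop (illustrative only; the reservations and sizes are made up):

#include <stdio.h>

struct res { unsigned long start, end; };

static struct res reserved[] = {
        { 0x0000, 0x1000 },     /* BIOS data page */
        { 0x8000, 0xa000 },     /* trampoline     */
        { 0, 0 },               /* terminator     */
};

/* move *addrp past any reservation overlapping [*addrp, *addrp + size) */
static int bad_addr(unsigned long *addrp, unsigned long size)
{
        unsigned long addr = *addrp, last;
        int i, changed = 0;
again:
        last = addr + size;
        for (i = 0; reserved[i].end; i++) {
                if (last >= reserved[i].start && addr < reserved[i].end) {
                        *addrp = addr = reserved[i].end;
                        changed = 1;
                        goto again;
                }
        }
        return changed;
}

int main(void)
{
        unsigned long addr = 0;

        bad_addr(&addr, 0x8000);
        printf("first clear spot for 32KiB: %#lx\n", addr);     /* 0xa000 */
        return 0;
}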
@@ -127,11 +136,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
  * Note: this function only works correct if the e820 table is sorted and
  * not-overlapping, which is the case
  */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+int __init e820_all_mapped(unsigned long start, unsigned long end,
+                          unsigned type)
 {
        int i;
+
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
+
                if (type && ei->type != type)
                        continue;
                /* is the region (part) in overlap with the current region ?*/
@@ -143,65 +155,73 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type
                 */
                if (ei->addr <= start)
                        start = ei->addr + ei->size;
-               /* if start is now at or beyond end, we're done, full coverage */
+               /*
+                * if start is now at or beyond end, we're done, full
+                * coverage
+                */
                if (start >= end)
-                       return 1; /* we're done */
+                       return 1;
        }
        return 0;
 }
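
A typical caller uses e820_all_mapped() to confirm that a firmware-reported range really is backed by RAM before trusting it. A minimal, purely illustrative sketch (the caller name and error convention are assumptions, not part of this patch):

        /* Illustrative only: accept a firmware table at phys_addr..+len
         * only if the whole range is covered by E820 RAM entries. */
        static int __init example_validate_table(unsigned long phys_addr,
                                                 unsigned long len)
        {
                if (!e820_all_mapped(phys_addr, phys_addr + len, E820_RAM))
                        return -EINVAL;
                return 0;
        }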
 
-/* 
- * Find a free area in a specific range. 
- */ 
-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
-{ 
-       int i; 
-       for (i = 0; i < e820.nr_map; i++) { 
-               struct e820entry *ei = &e820.map[i]; 
-               unsigned long addr = ei->addr, last; 
-               if (ei->type != E820_RAM) 
-                       continue; 
-               if (addr < start) 
+/*
+ * Find a free area in a specific range.
+ */
+unsigned long __init find_e820_area(unsigned long start, unsigned long end,
+                                   unsigned size)
+{
+       int i;
+
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+               unsigned long addr = ei->addr, last;
+
+               if (ei->type != E820_RAM)
+                       continue;
+               if (addr < start)
                        addr = start;
-               if (addr > ei->addr + ei->size) 
-                       continue; 
+               if (addr > ei->addr + ei->size)
+                       continue;
                while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
                        ;
                last = PAGE_ALIGN(addr) + size;
                if (last > ei->addr + ei->size)
                        continue;
-               if (last > end) 
+               if (last > end)
                        continue;
-               return addr; 
-       } 
-       return -1UL;            
-} 
+               return addr;
+       }
+       return -1UL;
+}
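
find_e820_area() and the early-reservation table above work as a pair: a caller asks for a block of usable RAM that bad_addr() has not already ruled out, then records it so later searches skip it. A minimal sketch of that pattern; the reservation helper whose body ends above is assumed here to be reserve_early(start, end) (its name and signature fall outside this hunk), and the caller and size are illustrative:

        /* Illustrative only: carve 64 KiB of usable RAM out of the first
         * 4 GiB for an early table and record it, so the next
         * find_e820_area() call cannot hand out the same range again. */
        static void __init example_alloc_early_table(void)
        {
                unsigned long addr;

                addr = find_e820_area(0, 0xffffffffUL, 64 * 1024);
                if (addr == -1UL)
                        panic("example: no room below 4G for early table");
                reserve_early(addr, addr + 64 * 1024);
        }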
 
 /*
  * Find the highest page frame number we have available
  */
 unsigned long __init e820_end_of_ram(void)
 {
-       unsigned long end_pfn = 0;
+       unsigned long end_pfn;
+
        end_pfn = find_max_pfn_with_active_regions();
-       
-       if (end_pfn > end_pfn_map) 
+
+       if (end_pfn > end_pfn_map)
                end_pfn_map = end_pfn;
        if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
                end_pfn_map = MAXMEM>>PAGE_SHIFT;
        if (end_pfn > end_user_pfn)
                end_pfn = end_user_pfn;
-       if (end_pfn > end_pfn_map) 
-               end_pfn = end_pfn_map; 
+       if (end_pfn > end_pfn_map)
+               end_pfn = end_pfn_map;
 
-       printk("end_pfn_map = %lu\n", end_pfn_map);
-       return end_pfn; 
+       printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
+       return end_pfn;
 }
 
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
-void __init e820_reserve_resources(void)
+void __init e820_reserve_resources(struct resource *code_resource,
+               struct resource *data_resource, struct resource *bss_resource)
 {
        int i;
        for (i = 0; i < e820.nr_map; i++) {
@@ -219,13 +239,13 @@ void __init e820_reserve_resources(void)
                request_resource(&iomem_resource, res);
                if (e820.map[i].type == E820_RAM) {
                        /*
-                        *  We don't know which RAM region contains kernel data,
-                        *  so we try it repeatedly and let the resource manager
-                        *  test it.
+                        * We don't know which RAM region contains kernel data,
+                        * so we try it repeatedly and let the resource manager
+                        * test it.
                         */
-                       request_resource(res, &code_resource);
-                       request_resource(res, &data_resource);
-                       request_resource(res, &bss_resource);
+                       request_resource(res, code_resource);
+                       request_resource(res, data_resource);
+                       request_resource(res, bss_resource);
 #ifdef CONFIG_KEXEC
                        if (crashk_res.start != crashk_res.end)
                                request_resource(res, &crashk_res);
@@ -322,9 +342,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
                        add_active_range(nid, ei_startpfn, ei_endpfn);
 }
 
-/* 
+/*
  * Add a memory region to the kernel e820 map.
- */ 
+ */
 void __init add_memory_region(unsigned long start, unsigned long size, int type)
 {
        int x = e820.nr_map;
@@ -349,9 +369,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long end_pfn = end >> PAGE_SHIFT;
-       unsigned long ei_startpfn;
-       unsigned long ei_endpfn;
-       unsigned long ram = 0;
+       unsigned long ei_startpfn, ei_endpfn, ram = 0;
        int i;
 
        for (i = 0; i < e820.nr_map; i++) {
@@ -363,28 +381,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
        return end - start - (ram << PAGE_SHIFT);
 }
 
-void __init e820_print_map(char *who)
+static void __init e820_print_map(char *who)
 {
        int i;
 
        for (i = 0; i < e820.nr_map; i++) {
                printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
-                       (unsigned long long) e820.map[i].addr,
-                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
+                      (unsigned long long) e820.map[i].addr,
+                      (unsigned long long)
+                      (e820.map[i].addr + e820.map[i].size));
                switch (e820.map[i].type) {
-               case E820_RAM:  printk("(usable)\n");
-                               break;
+               case E820_RAM:
+                       printk(KERN_CONT "(usable)\n");
+                       break;
                case E820_RESERVED:
-                               printk("(reserved)\n");
-                               break;
+                       printk(KERN_CONT "(reserved)\n");
+                       break;
                case E820_ACPI:
-                               printk("(ACPI data)\n");
-                               break;
+                       printk(KERN_CONT "(ACPI data)\n");
+                       break;
                case E820_NVS:
-                               printk("(ACPI NVS)\n");
-                               break;
-               default:        printk("type %u\n", e820.map[i].type);
-                               break;
+                       printk(KERN_CONT "(ACPI NVS)\n");
+                       break;
+               default:
+                       printk(KERN_CONT "type %u\n", e820.map[i].type);
+                       break;
                }
        }
 }
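
For reference, the format strings above produce boot-log lines of the following shape (the addresses are typical values, shown purely for illustration):

         BIOS-e820: 0000000000000000 - 000000000009fc00 (usable)
         BIOS-e820: 000000000009fc00 - 00000000000a0000 (reserved)
         BIOS-e820: 0000000000100000 - 000000003fff0000 (usable)
         BIOS-e820: 000000003fff0000 - 0000000040000000 (ACPI data)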
@@ -392,11 +413,11 @@ void __init e820_print_map(char *who)
 /*
  * Sanitize the BIOS e820 map.
  *
- * Some e820 responses include overlapping entries.  The following 
+ * Some e820 responses include overlapping entries. The following
  * replaces the original e820 map with a new one, removing overlaps.
  *
  */
-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 {
        struct change_member {
                struct e820entry *pbios; /* pointer to original bios entry */
@@ -416,7 +437,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
        int i;
 
        /*
-               Visually we're performing the following (1,2,3,4 = memory types)...
+               Visually we're performing the following
+               (1,2,3,4 = memory types)...
 
                Sample memory map (w/overlaps):
                   ____22__________________
@@ -458,22 +480,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
        old_nr = *pnr_map;
 
        /* bail out if we find any unreasonable addresses in bios map */
-       for (i=0; i<old_nr; i++)
+       for (i = 0; i < old_nr; i++)
                if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
                        return -1;
 
        /* create pointers for initial change-point information (for sorting) */
-       for (i=0; i < 2*old_nr; i++)
+       for (i = 0; i < 2 * old_nr; i++)
                change_point[i] = &change_point_list[i];
 
        /* record all known change-points (starting and ending addresses),
           omitting those that are for empty memory regions */
        chgidx = 0;
-       for (i=0; i < old_nr; i++)      {
+       for (i = 0; i < old_nr; i++)    {
                if (biosmap[i].size != 0) {
                        change_point[chgidx]->addr = biosmap[i].addr;
                        change_point[chgidx++]->pbios = &biosmap[i];
-                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+                       change_point[chgidx]->addr = biosmap[i].addr +
+                               biosmap[i].size;
                        change_point[chgidx++]->pbios = &biosmap[i];
                }
        }
@@ -483,75 +506,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
        still_changing = 1;
        while (still_changing)  {
                still_changing = 0;
-               for (i=1; i < chg_nr; i++)  {
-                       /* if <current_addr> > <last_addr>, swap */
-                       /* or, if current=<start_addr> & last=<end_addr>, swap */
-                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
-                               ((change_point[i]->addr == change_point[i-1]->addr) &&
-                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
-                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-                          )
-                       {
+               for (i = 1; i < chg_nr; i++)  {
+                       unsigned long long curaddr, lastaddr;
+                       unsigned long long curpbaddr, lastpbaddr;
+
+                       curaddr = change_point[i]->addr;
+                       lastaddr = change_point[i - 1]->addr;
+                       curpbaddr = change_point[i]->pbios->addr;
+                       lastpbaddr = change_point[i - 1]->pbios->addr;
+
+                       /*
+                        * swap entries, when:
+                        *
+                        * curaddr > lastaddr or
+                        * curaddr == lastaddr and curaddr == curpbaddr and
+                        * lastaddr != lastpbaddr
+                        */
+                       if (curaddr < lastaddr ||
+                           (curaddr == lastaddr && curaddr == curpbaddr &&
+                            lastaddr != lastpbaddr)) {
                                change_tmp = change_point[i];
                                change_point[i] = change_point[i-1];
                                change_point[i-1] = change_tmp;
-                               still_changing=1;
+                               still_changing = 1;
                        }
                }
        }
 
        /* create a new bios memory map, removing overlaps */
-       overlap_entries=0;       /* number of entries in the overlap table */
-       new_bios_entry=0;        /* index for creating new bios map entries */
+       overlap_entries = 0;     /* number of entries in the overlap table */
+       new_bios_entry = 0;      /* index for creating new bios map entries */
        last_type = 0;           /* start with undefined memory type */
        last_addr = 0;           /* start with 0 as last starting address */
+
        /* loop through change-points, determining affect on the new bios map */
-       for (chgidx=0; chgidx < chg_nr; chgidx++)
-       {
+       for (chgidx = 0; chgidx < chg_nr; chgidx++) {
                /* keep track of all overlapping bios entries */
-               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-               {
-                       /* add map entry to overlap list (> 1 entry implies an overlap) */
-                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-               }
-               else
-               {
-                       /* remove entry from list (order independent, so swap with last) */
-                       for (i=0; i<overlap_entries; i++)
-                       {
-                               if (overlap_list[i] == change_point[chgidx]->pbios)
-                                       overlap_list[i] = overlap_list[overlap_entries-1];
+               if (change_point[chgidx]->addr ==
+                   change_point[chgidx]->pbios->addr) {
+                       /*
+                        * add map entry to overlap list (> 1 entry
+                        * implies an overlap)
+                        */
+                       overlap_list[overlap_entries++] =
+                               change_point[chgidx]->pbios;
+               } else {
+                       /*
+                        * remove entry from list (order independent,
+                        * so swap with last)
+                        */
+                       for (i = 0; i < overlap_entries; i++) {
+                               if (overlap_list[i] ==
+                                   change_point[chgidx]->pbios)
+                                       overlap_list[i] =
+                                               overlap_list[overlap_entries-1];
                        }
                        overlap_entries--;
                }
-               /* if there are overlapping entries, decide which "type" to use */
-               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+               /*
+                * if there are overlapping entries, decide which
+                * "type" to use (larger value takes precedence --
+                * 1=usable, 2,3,4,4+=unusable)
+                */
                current_type = 0;
-               for (i=0; i<overlap_entries; i++)
+               for (i = 0; i < overlap_entries; i++)
                        if (overlap_list[i]->type > current_type)
                                current_type = overlap_list[i]->type;
-               /* continue building up new bios map based on this information */
+               /*
+                * continue building up new bios map based on this
+                * information
+                */
                if (current_type != last_type)  {
                        if (last_type != 0)      {
                                new_bios[new_bios_entry].size =
                                        change_point[chgidx]->addr - last_addr;
-                               /* move forward only if the new size was non-zero */
+                               /*
+                                * move forward only if the new size
+                                * was non-zero
+                                */
                                if (new_bios[new_bios_entry].size != 0)
+                                       /*
+                                        * no more space left for new
+                                        * bios entries ?
+                                        */
                                        if (++new_bios_entry >= E820MAX)
-                                               break;  /* no more space left for new bios entries */
+                                               break;
                        }
                        if (current_type != 0)  {
-                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+                               new_bios[new_bios_entry].addr =
+                                       change_point[chgidx]->addr;
                                new_bios[new_bios_entry].type = current_type;
-                               last_addr=change_point[chgidx]->addr;
+                               last_addr = change_point[chgidx]->addr;
                        }
                        last_type = current_type;
                }
        }
-       new_nr = new_bios_entry;   /* retain count for new bios entries */
+       /* retain count for new bios entries */
+       new_nr = new_bios_entry;
 
        /* copy new bios mapping into original location */
-       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+       memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
        *pnr_map = new_nr;
 
        return 0;
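
The effect of the precedence rule is easiest to see on a small overlapping map: wherever entries overlap, the higher (more restrictive) type wins and the usable region is trimmed around it. A made-up two-entry example:

        BIOS map (overlapping):
            0000000000000000 - 00000000000a0000   type 1 (usable)
            000000000009f000 - 00000000000c0000   type 2 (reserved)

        after sanitize_e820_map():
            0000000000000000 - 000000000009f000   type 1 (usable)
            000000000009f000 - 00000000000c0000   type 2 (reserved)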
@@ -566,7 +620,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
  * will have given us a memory map that we can use to properly
  * set up memory.  If we aren't, we'll fake a memory map.
  */
-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 {
        /* Only one memory region (or negative)? Ignore it */
        if (nr_map < 2)
@@ -583,18 +637,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
                        return -1;
 
                add_memory_region(start, size, type);
-       } while (biosmap++,--nr_map);
+       } while (biosmap++, --nr_map);
        return 0;
 }
 
-void early_panic(char *msg)
+static void early_panic(char *msg)
 {
        early_printk(msg);
        panic(msg);
 }
 
-void __init setup_memory_region(void)
+/* Not void, purely for compatibility with the 32-bit x86 version */
+char * __init machine_specific_memory_setup(void)
 {
+       char *who = "BIOS-e820";
        /*
         * Try to copy the BIOS-supplied E820-map.
         *
@@ -605,7 +661,10 @@ void __init setup_memory_region(void)
        if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
                early_panic("Cannot find a valid memory map");
        printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-       e820_print_map("BIOS-e820");
+       e820_print_map(who);
+
+       /* In case someone cares... */
+       return who;
 }
 
 static int __init parse_memopt(char *p)
@@ -613,9 +672,9 @@ static int __init parse_memopt(char *p)
        if (!p)
                return -EINVAL;
        end_user_pfn = memparse(p, &p);
-       end_user_pfn >>= PAGE_SHIFT;    
+       end_user_pfn >>= PAGE_SHIFT;
        return 0;
-} 
+}
 early_param("mem", parse_memopt);
 
 static int userdef __initdata;
@@ -627,9 +686,9 @@ static int __init parse_memmap_opt(char *p)
 
        if (!strcmp(p, "exactmap")) {
 #ifdef CONFIG_CRASH_DUMP
-               /* If we are doing a crash dump, we
-                * still need to know the real mem
-                * size before original memory map is
+               /*
+                * If we are doing a crash dump, we still need to know
+                * the real mem size before original memory map is
                 * reset.
                 */
                e820_register_active_regions(0, 0, -1UL);
@@ -646,6 +705,8 @@ static int __init parse_memmap_opt(char *p)
        mem_size = memparse(p, &p);
        if (p == oldp)
                return -EINVAL;
+
+       userdef = 1;
        if (*p == '@') {
                start_at = memparse(p+1, &p);
                add_memory_region(start_at, mem_size, E820_RAM);
@@ -665,11 +726,29 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
        if (userdef) {
+               char nr = e820.nr_map;
+
+               if (sanitize_e820_map(e820.map, &nr) < 0)
+                       early_panic("Invalid user supplied memory map");
+               e820.nr_map = nr;
+
                printk(KERN_INFO "user-defined physical RAM map:\n");
                e820_print_map("user");
        }
 }
 
+void __init update_e820(void)
+{
+       u8 nr_map;
+
+       nr_map = e820.nr_map;
+       if (sanitize_e820_map(e820.map, &nr_map))
+               return;
+       e820.nr_map = nr_map;
+       printk(KERN_INFO "modified physical RAM map:\n");
+       e820_print_map("modified");
+}
+
 unsigned long pci_mem_start = 0xaeedbabe;
 EXPORT_SYMBOL(pci_mem_start);
 
@@ -713,8 +792,10 @@ __init void e820_setup_gap(void)
 
        if (!found) {
                gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
-               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
-                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
+               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
+                      "address range\n"
+                      KERN_ERR "PCI: Unassigned devices with 32bit resource "
+                      "registers may break!\n");
        }
 
        /*
@@ -727,8 +808,9 @@ __init void e820_setup_gap(void)
        /* Fun with two's complement */
        pci_mem_start = (gapstart + round) & -round;
 
-       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-               pci_mem_start, gapstart, gapsize);
+       printk(KERN_INFO
+              "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+              pci_mem_start, gapstart, gapsize);
 }
 
 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
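
The "Fun with two's complement" step above, pci_mem_start = (gapstart + round) & -round, rounds the gap start up to the next multiple of round (a power of two): -round equals ~(round - 1), a mask of everything above the alignment boundary, and adding round first guarantees the result lands strictly above gapstart even when it is already aligned. A standalone illustration with made-up values:

        #include <stdio.h>

        int main(void)
        {
                unsigned long gapstart = 0xe0001000UL; /* made-up gap start   */
                unsigned long round    = 0x100000UL;   /* 1 MiB, power of two */
                unsigned long start    = (gapstart + round) & -round;

                printf("gapstart=%#lx round=%#lx -> pci_mem_start=%#lx\n",
                       gapstart, round, start);        /* 0xe0100000 */
                return 0;
        }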
index 88bb83ec895f0d2899f15f4f98f75c5aedac479b..9f51e1ea9e8225e919ccb69ad6b4bc315c6f38cc 100644 (file)
 #include <asm/gart.h>
 #endif
 
-static void __init via_bugs(void)
+static void __init fix_hypertransport_config(int num, int slot, int func)
+{
+       u32 htcfg;
+       /*
+        * we found a hypertransport bus
+        * make sure that we are broadcasting
+        * interrupts to all cpus on the ht bus
+        * if we're using extended apic ids
+        */
+       htcfg = read_pci_config(num, slot, func, 0x68);
+       if (htcfg & (1 << 18)) {
+               printk(KERN_INFO "Detected use of extended apic ids "
+                                "on hypertransport bus\n");
+               if ((htcfg & (1 << 17)) == 0) {
+                       printk(KERN_INFO "Enabling hypertransport extended "
+                                        "apic interrupt broadcast\n");
+                       printk(KERN_INFO "Note this is a bios bug, "
+                                        "please contact your hw vendor\n");
+                       htcfg |= (1 << 17);
+                       write_pci_config(num, slot, func, 0x68, htcfg);
+               }
+       }
+
+
+}
+
+static void __init via_bugs(int  num, int slot, int func)
 {
 #ifdef CONFIG_GART_IOMMU
        if ((end_pfn > MAX_DMA32_PFN ||  force_iommu) &&
@@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header)
 #endif /* CONFIG_X86_IO_APIC */
 #endif /* CONFIG_ACPI */
 
-static void __init nvidia_bugs(void)
+static void __init nvidia_bugs(int num, int slot, int func)
 {
 #ifdef CONFIG_ACPI
 #ifdef CONFIG_X86_IO_APIC
@@ -72,7 +98,7 @@ static void __init nvidia_bugs(void)
 
 }
 
-static void __init ati_bugs(void)
+static void __init ati_bugs(int num, int slot, int func)
 {
 #ifdef CONFIG_X86_IO_APIC
        if (timer_over_8254 == 1) {
@@ -83,18 +109,67 @@ static void __init ati_bugs(void)
 #endif
 }
 
+#define QFLAG_APPLY_ONCE       0x1
+#define QFLAG_APPLIED          0x2
+#define QFLAG_DONE             (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
 struct chipset {
-       u16 vendor;
-       void (*f)(void);
+       u32 vendor;
+       u32 device;
+       u32 class;
+       u32 class_mask;
+       u32 flags;
+       void (*f)(int num, int slot, int func);
 };
 
 static struct chipset early_qrk[] __initdata = {
-       { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
-       { PCI_VENDOR_ID_VIA, via_bugs },
-       { PCI_VENDOR_ID_ATI, ati_bugs },
+       { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+         PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
+       { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+         PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
+       { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
+         PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
+       { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+         PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
        {}
 };
 
+static void __init check_dev_quirk(int num, int slot, int func)
+{
+       u16 class;
+       u16 vendor;
+       u16 device;
+       u8 type;
+       int i;
+
+       class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
+
+       if (class == 0xffff)
+               return;
+
+       vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
+
+       device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+       for (i = 0; early_qrk[i].f != NULL; i++) {
+               if (((early_qrk[i].vendor == PCI_ANY_ID) ||
+                       (early_qrk[i].vendor == vendor)) &&
+                       ((early_qrk[i].device == PCI_ANY_ID) ||
+                       (early_qrk[i].device == device)) &&
+                       (!((early_qrk[i].class ^ class) &
+                           early_qrk[i].class_mask))) {
+                               if ((early_qrk[i].flags &
+                                    QFLAG_DONE) != QFLAG_DONE)
+                                       early_qrk[i].f(num, slot, func);
+                               early_qrk[i].flags |= QFLAG_APPLIED;
+                       }
+       }
+
+       type = read_pci_config_byte(num, slot, func,
+                                   PCI_HEADER_TYPE);
+       if (!(type & 0x80))
+               return;
+}
+
 void __init early_quirks(void)
 {
        int num, slot, func;
@@ -103,36 +178,8 @@ void __init early_quirks(void)
                return;
 
        /* Poor man's PCI discovery */
-       for (num = 0; num < 32; num++) {
-               for (slot = 0; slot < 32; slot++) {
-                       for (func = 0; func < 8; func++) {
-                               u32 class;
-                               u32 vendor;
-                               u8 type;
-                               int i;
-                               class = read_pci_config(num,slot,func,
-                                                       PCI_CLASS_REVISION);
-                               if (class == 0xffffffff)
-                                       break;
-
-                               if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
-                                       continue;
-
-                               vendor = read_pci_config(num, slot, func,
-                                                        PCI_VENDOR_ID);
-                               vendor &= 0xffff;
-
-                               for (i = 0; early_qrk[i].f; i++)
-                                       if (early_qrk[i].vendor == vendor) {
-                                               early_qrk[i].f();
-                                               return;
-                                       }
-
-                               type = read_pci_config_byte(num, slot, func,
-                                                           PCI_HEADER_TYPE);
-                               if (!(type & 0x80))
-                                       break;
-                       }
-               }
-       }
+       for (num = 0; num < 32; num++)
+               for (slot = 0; slot < 32; slot++)
+                       for (func = 0; func < 8; func++)
+                               check_dev_quirk(num, slot, func);
 }
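
The read_pci_config*() helpers used above are the kernel's early-boot accessors, available before the PCI subsystem is initialised; on x86 they are built on PCI configuration mechanism #1 (index port 0xCF8, data port 0xCFC). A hedged sketch of that mechanism, not taken from this patch (the helper name is illustrative):

        /* Illustrative only: select bus/slot/func/register through port
         * 0xCF8, then read the 32-bit value back from port 0xCFC. */
        static u32 example_read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
        {
                u32 addr = 0x80000000u | ((u32)bus << 16) | ((u32)slot << 11) |
                           ((u32)func << 8) | (offset & 0xfc);

                outl(addr, 0xcf8);
                return inl(0xcfc);
        }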
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
new file mode 100644 (file)
index 0000000..1411324
--- /dev/null
@@ -0,0 +1,512 @@
+/*
+ * Common EFI (Extensible Firmware Interface) support functions
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ *     Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2005-2008 Intel Co.
+ *     Fenghua Yu <fenghua.yu@intel.com>
+ *     Bibo Mao <bibo.mao@intel.com>
+ *     Chandramouli Narayanan <mouli@linux.intel.com>
+ *     Huang Ying <ying.huang@intel.com>
+ *
+ * Copied from efi_32.c to eliminate the duplicated code between EFI
+ * 32/64 support code. --ying 2007-10-26
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version.  --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls.  --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ *     Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/efi.h>
+#include <linux/bootmem.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+#include <linux/bcd.h>
+
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/time.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define EFI_DEBUG      1
+#define PFX            "EFI: "
+
+int efi_enabled;
+EXPORT_SYMBOL(efi_enabled);
+
+struct efi efi;
+EXPORT_SYMBOL(efi);
+
+struct efi_memory_map memmap;
+
+struct efi efi_phys __initdata;
+static efi_system_table_t efi_systab __initdata;
+
+static int __init setup_noefi(char *arg)
+{
+       efi_enabled = 0;
+       return 0;
+}
+early_param("noefi", setup_noefi);
+
+static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+       return efi_call_virt2(get_time, tm, tc);
+}
+
+static efi_status_t virt_efi_set_time(efi_time_t *tm)
+{
+       return efi_call_virt1(set_time, tm);
+}
+
+static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
+                                            efi_bool_t *pending,
+                                            efi_time_t *tm)
+{
+       return efi_call_virt3(get_wakeup_time,
+                             enabled, pending, tm);
+}
+
+static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
+{
+       return efi_call_virt2(set_wakeup_time,
+                             enabled, tm);
+}
+
+static efi_status_t virt_efi_get_variable(efi_char16_t *name,
+                                         efi_guid_t *vendor,
+                                         u32 *attr,
+                                         unsigned long *data_size,
+                                         void *data)
+{
+       return efi_call_virt5(get_variable,
+                             name, vendor, attr,
+                             data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
+                                              efi_char16_t *name,
+                                              efi_guid_t *vendor)
+{
+       return efi_call_virt3(get_next_variable,
+                             name_size, name, vendor);
+}
+
+static efi_status_t virt_efi_set_variable(efi_char16_t *name,
+                                         efi_guid_t *vendor,
+                                         unsigned long attr,
+                                         unsigned long data_size,
+                                         void *data)
+{
+       return efi_call_virt5(set_variable,
+                             name, vendor, attr,
+                             data_size, data);
+}
+
+static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
+{
+       return efi_call_virt1(get_next_high_mono_count, count);
+}
+
+static void virt_efi_reset_system(int reset_type,
+                                 efi_status_t status,
+                                 unsigned long data_size,
+                                 efi_char16_t *data)
+{
+       efi_call_virt4(reset_system, reset_type, status,
+                      data_size, data);
+}
+
+static efi_status_t virt_efi_set_virtual_address_map(
+       unsigned long memory_map_size,
+       unsigned long descriptor_size,
+       u32 descriptor_version,
+       efi_memory_desc_t *virtual_map)
+{
+       return efi_call_virt4(set_virtual_address_map,
+                             memory_map_size, descriptor_size,
+                             descriptor_version, virtual_map);
+}
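
Each virt_efi_*() function above is a thin wrapper: once SetVirtualAddressMap has run, it forwards its arguments to the matching entry in efi.systab->runtime, with the efi_call_virtN() macros hiding the per-architecture call glue (a near-direct call on 32-bit, an assembly thunk on 64-bit). A rough, illustrative expansion for the 32-bit case; the real macros also cast the void * table entry to the proper function type:

        /* Illustrative only: approximately what efi_call_virt2(f, a1, a2)
         * boils down to when the firmware can be called directly. */
        #define example_efi_call_virt2(f, a1, a2) \
                (efi.systab->runtime->f(a1, a2))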
+
+static efi_status_t __init phys_efi_set_virtual_address_map(
+       unsigned long memory_map_size,
+       unsigned long descriptor_size,
+       u32 descriptor_version,
+       efi_memory_desc_t *virtual_map)
+{
+       efi_status_t status;
+
+       efi_call_phys_prelog();
+       status = efi_call_phys4(efi_phys.set_virtual_address_map,
+                               memory_map_size, descriptor_size,
+                               descriptor_version, virtual_map);
+       efi_call_phys_epilog();
+       return status;
+}
+
+static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
+                                            efi_time_cap_t *tc)
+{
+       efi_status_t status;
+
+       efi_call_phys_prelog();
+       status = efi_call_phys2(efi_phys.get_time, tm, tc);
+       efi_call_phys_epilog();
+       return status;
+}
+
+int efi_set_rtc_mmss(unsigned long nowtime)
+{
+       int real_seconds, real_minutes;
+       efi_status_t    status;
+       efi_time_t      eft;
+       efi_time_cap_t  cap;
+
+       status = efi.get_time(&eft, &cap);
+       if (status != EFI_SUCCESS) {
+               printk(KERN_ERR "Oops: efitime: can't read time!\n");
+               return -1;
+       }
+
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+       if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
+               real_minutes += 30;
+       real_minutes %= 60;
+       eft.minute = real_minutes;
+       eft.second = real_seconds;
+
+       status = efi.set_time(&eft);
+       if (status != EFI_SUCCESS) {
+               printk(KERN_ERR "Oops: efitime: can't write time!\n");
+               return -1;
+       }
+       return 0;
+}
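
The ((abs(real_minutes - eft.minute) + 15) / 30) & 1 test above lets an RTC that is kept roughly half an hour away from system time (as in some timezones) keep that offset instead of being overwritten. A standalone illustration with made-up values:

        #include <stdio.h>
        #include <stdlib.h>

        int main(void)
        {
                int real_minutes = 5;   /* minutes derived from system time */
                int rtc_minute   = 33;  /* what the RTC currently reads     */

                /* same rounding as efi_set_rtc_mmss() */
                if (((abs(real_minutes - rtc_minute) + 15) / 30) & 1)
                        real_minutes += 30;
                real_minutes %= 60;

                printf("minute written back: %d\n", real_minutes); /* 35 */
                return 0;
        }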
+
+unsigned long efi_get_time(void)
+{
+       efi_status_t status;
+       efi_time_t eft;
+       efi_time_cap_t cap;
+
+       status = efi.get_time(&eft, &cap);
+       if (status != EFI_SUCCESS)
+               printk(KERN_ERR "Oops: efitime: can't read time!\n");
+
+       return mktime(eft.year, eft.month, eft.day, eft.hour,
+                     eft.minute, eft.second);
+}
+
+#if EFI_DEBUG
+static void __init print_efi_memmap(void)
+{
+       efi_memory_desc_t *md;
+       void *p;
+       int i;
+
+       for (p = memmap.map, i = 0;
+            p < memmap.map_end;
+            p += memmap.desc_size, i++) {
+               md = p;
+               printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
+                       "range=[0x%016llx-0x%016llx) (%lluMB)\n",
+                       i, md->type, md->attribute, md->phys_addr,
+                       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
+                       (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
+       }
+}
+#endif  /*  EFI_DEBUG  */
+
+void __init efi_init(void)
+{
+       efi_config_table_t *config_tables;
+       efi_runtime_services_t *runtime;
+       efi_char16_t *c16;
+       char vendor[100] = "unknown";
+       int i = 0;
+       void *tmp;
+
+#ifdef CONFIG_X86_32
+       efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
+       memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
+#else
+       efi_phys.systab = (efi_system_table_t *)
+               (boot_params.efi_info.efi_systab |
+                ((__u64)boot_params.efi_info.efi_systab_hi<<32));
+       memmap.phys_map = (void *)
+               (boot_params.efi_info.efi_memmap |
+                ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#endif
+       memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+               boot_params.efi_info.efi_memdesc_size;
+       memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+       memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+
+       efi.systab = early_ioremap((unsigned long)efi_phys.systab,
+                                  sizeof(efi_system_table_t));
+       if (efi.systab == NULL)
+               printk(KERN_ERR "Couldn't map the EFI system table!\n");
+       memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
+       early_iounmap(efi.systab, sizeof(efi_system_table_t));
+       efi.systab = &efi_systab;
+
+       /*
+        * Verify the EFI Table
+        */
+       if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+               printk(KERN_ERR "EFI system table signature incorrect!\n");
+       if ((efi.systab->hdr.revision >> 16) == 0)
+               printk(KERN_ERR "Warning: EFI system table version "
+                      "%d.%02d, expected 1.00 or greater!\n",
+                      efi.systab->hdr.revision >> 16,
+                      efi.systab->hdr.revision & 0xffff);
+
+       /*
+        * Show what we know for posterity
+        */
+       c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
+       if (c16) {
+               for (i = 0; i < sizeof(vendor) && *c16; ++i)
+                       vendor[i] = *c16++;
+               vendor[i] = '\0';
+       } else
+               printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
+       early_iounmap(tmp, 2);
+
+       printk(KERN_INFO "EFI v%u.%.02u by %s \n",
+              efi.systab->hdr.revision >> 16,
+              efi.systab->hdr.revision & 0xffff, vendor);
+
+       /*
+        * Let's see what config tables the firmware passed to us.
+        */
+       config_tables = early_ioremap(
+               efi.systab->tables,
+               efi.systab->nr_tables * sizeof(efi_config_table_t));
+       if (config_tables == NULL)
+               printk(KERN_ERR "Could not map EFI Configuration Table!\n");
+
+       printk(KERN_INFO);
+       for (i = 0; i < efi.systab->nr_tables; i++) {
+               if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
+                       efi.mps = config_tables[i].table;
+                       printk(" MPS=0x%lx ", config_tables[i].table);
+               } else if (!efi_guidcmp(config_tables[i].guid,
+                                       ACPI_20_TABLE_GUID)) {
+                       efi.acpi20 = config_tables[i].table;
+                       printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
+               } else if (!efi_guidcmp(config_tables[i].guid,
+                                       ACPI_TABLE_GUID)) {
+                       efi.acpi = config_tables[i].table;
+                       printk(" ACPI=0x%lx ", config_tables[i].table);
+               } else if (!efi_guidcmp(config_tables[i].guid,
+                                       SMBIOS_TABLE_GUID)) {
+                       efi.smbios = config_tables[i].table;
+                       printk(" SMBIOS=0x%lx ", config_tables[i].table);
+               } else if (!efi_guidcmp(config_tables[i].guid,
+                                       HCDP_TABLE_GUID)) {
+                       efi.hcdp = config_tables[i].table;
+                       printk(" HCDP=0x%lx ", config_tables[i].table);
+               } else if (!efi_guidcmp(config_tables[i].guid,
+                                       UGA_IO_PROTOCOL_GUID)) {
+                       efi.uga = config_tables[i].table;
+                       printk(" UGA=0x%lx ", config_tables[i].table);
+               }
+       }
+       printk("\n");
+       early_iounmap(config_tables,
+                         efi.systab->nr_tables * sizeof(efi_config_table_t));
+
+       /*
+        * Check out the runtime services table. We need to map
+        * the runtime services table so that we can grab the physical
+        * address of several of the EFI runtime functions, needed to
+        * set the firmware into virtual mode.
+        */
+       runtime = early_ioremap((unsigned long)efi.systab->runtime,
+                               sizeof(efi_runtime_services_t));
+       if (runtime != NULL) {
+               /*
+                * We will only need *early* access to the following
+                * two EFI runtime services before set_virtual_address_map
+                * is invoked.
+                */
+               efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
+               efi_phys.set_virtual_address_map =
+                       (efi_set_virtual_address_map_t *)
+                       runtime->set_virtual_address_map;
+               /*
+                * Make efi_get_time callable before entering
+                * virtual mode.
+                */
+               efi.get_time = phys_efi_get_time;
+       } else
+               printk(KERN_ERR "Could not map the EFI runtime service "
+                      "table!\n");
+       early_iounmap(runtime, sizeof(efi_runtime_services_t));
+
+       /* Map the EFI memory map */
+       memmap.map = early_ioremap((unsigned long)memmap.phys_map,
+                                  memmap.nr_map * memmap.desc_size);
+       if (memmap.map == NULL)
+               printk(KERN_ERR "Could not map the EFI memory map!\n");
+       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+       if (memmap.desc_size != sizeof(efi_memory_desc_t))
+               printk(KERN_WARNING "Kernel-defined memdesc "
+                      "doesn't match the one from EFI!\n");
+
+       /* Setup for EFI runtime service */
+       reboot_type = BOOT_EFI;
+
+#if EFI_DEBUG
+       print_efi_memmap();
+#endif
+}
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static void __init runtime_code_page_mkexec(void)
+{
+       efi_memory_desc_t *md;
+       unsigned long end;
+       void *p;
+
+       if (!(__supported_pte_mask & _PAGE_NX))
+               return;
+
+       /* Make EFI runtime service code area executable */
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
+               if (md->type == EFI_RUNTIME_SERVICES_CODE &&
+                   (end >> PAGE_SHIFT) <= max_pfn_mapped) {
+                       set_memory_x(md->virt_addr, md->num_pages);
+                       set_memory_uc(md->virt_addr, md->num_pages);
+               }
+       }
+       __flush_tlb_all();
+}
+#else
+static inline void __init runtime_code_page_mkexec(void) { }
+#endif
+
+/*
+ * This function will switch the EFI runtime services to virtual mode.
+ * Essentially, look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor and update
+ * that memory descriptor with the virtual address obtained from ioremap().
+ * This enables the runtime services to be called without having to
+ * thunk back into physical mode for every invocation.
+ */
+void __init efi_enter_virtual_mode(void)
+{
+       efi_memory_desc_t *md;
+       efi_status_t status;
+       unsigned long end;
+       void *p;
+
+       efi.systab = NULL;
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if (!(md->attribute & EFI_MEMORY_RUNTIME))
+                       continue;
+               end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
+               if ((md->attribute & EFI_MEMORY_WB) &&
+                   ((end >> PAGE_SHIFT) <= max_pfn_mapped))
+                       md->virt_addr = (unsigned long)__va(md->phys_addr);
+               else
+                       md->virt_addr = (unsigned long)
+                               efi_ioremap(md->phys_addr,
+                                           md->num_pages << EFI_PAGE_SHIFT);
+               if (!md->virt_addr)
+                       printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
+                              (unsigned long long)md->phys_addr);
+               if ((md->phys_addr <= (unsigned long)efi_phys.systab) &&
+                   ((unsigned long)efi_phys.systab < end))
+                       efi.systab = (efi_system_table_t *)(unsigned long)
+                               (md->virt_addr - md->phys_addr +
+                                (unsigned long)efi_phys.systab);
+       }
+
+       BUG_ON(!efi.systab);
+
+       status = phys_efi_set_virtual_address_map(
+               memmap.desc_size * memmap.nr_map,
+               memmap.desc_size,
+               memmap.desc_version,
+               memmap.phys_map);
+
+       if (status != EFI_SUCCESS) {
+               printk(KERN_ALERT "Unable to switch EFI into virtual mode "
+                      "(status=%lx)!\n", status);
+               panic("EFI call to SetVirtualAddressMap() failed!");
+       }
+
+       /*
+        * Now that EFI is in virtual mode, update the function
+        * pointers in the runtime service table to the new virtual addresses.
+        *
+        * Call EFI services through wrapper functions.
+        */
+       efi.get_time = virt_efi_get_time;
+       efi.set_time = virt_efi_set_time;
+       efi.get_wakeup_time = virt_efi_get_wakeup_time;
+       efi.set_wakeup_time = virt_efi_set_wakeup_time;
+       efi.get_variable = virt_efi_get_variable;
+       efi.get_next_variable = virt_efi_get_next_variable;
+       efi.set_variable = virt_efi_set_variable;
+       efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
+       efi.reset_system = virt_efi_reset_system;
+       efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
+       runtime_code_page_mkexec();
+       early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
+       memmap.map = NULL;
+}
+
+/*
+ * Convenience functions to obtain memory types and attributes
+ */
+u32 efi_mem_type(unsigned long phys_addr)
+{
+       efi_memory_desc_t *md;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if ((md->phys_addr <= phys_addr) &&
+                   (phys_addr < (md->phys_addr +
+                                 (md->num_pages << EFI_PAGE_SHIFT))))
+                       return md->type;
+       }
+       return 0;
+}
+
+u64 efi_mem_attributes(unsigned long phys_addr)
+{
+       efi_memory_desc_t *md;
+       void *p;
+
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if ((md->phys_addr <= phys_addr) &&
+                   (phys_addr < (md->phys_addr +
+                                 (md->num_pages << EFI_PAGE_SHIFT))))
+                       return md->attribute;
+       }
+       return 0;
+}
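
The walk in efi_mem_type()/efi_mem_attributes() is also the recipe behind the systab fixup in efi_enter_virtual_mode(): find the descriptor covering a physical address and add that descriptor's virt-phys delta. A hedged sketch of the translation as a helper (the name is illustrative, and it only makes sense while memmap.map is still mapped, i.e. before the end of efi_enter_virtual_mode()):

        /* Illustrative only: translate a physical address inside an EFI
         * runtime region to the virtual address chosen for it.  Returns 0
         * if no descriptor covers the address. */
        static unsigned long example_efi_phys_to_virt(unsigned long phys_addr)
        {
                efi_memory_desc_t *md;
                void *p;

                for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
                        md = p;
                        if (phys_addr >= md->phys_addr &&
                            phys_addr < md->phys_addr +
                                        (md->num_pages << EFI_PAGE_SHIFT))
                                return md->virt_addr +
                                       (phys_addr - md->phys_addr);
                }
                return 0;
        }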
index e2be78f49399b7161a3c51783fda97d4c87467bb..cb91f985b4a1f8520e021d995135977729e57481 100644 (file)
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mm.h>
 #include <linux/types.h>
-#include <linux/time.h>
-#include <linux/spinlock.h>
-#include <linux/bootmem.h>
 #include <linux/ioport.h>
-#include <linux/module.h>
 #include <linux/efi.h>
-#include <linux/kexec.h>
 
-#include <asm/setup.h>
 #include <asm/io.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
 #include <asm/tlbflush.h>
 
-#define EFI_DEBUG      0
-#define PFX            "EFI: "
-
-extern efi_status_t asmlinkage efi_call_phys(void *, ...);
-
-struct efi efi;
-EXPORT_SYMBOL(efi);
-static struct efi efi_phys;
-struct efi_memory_map memmap;
-
-/*
- * We require an early boot_ioremap mapping mechanism initially
- */
-extern void * boot_ioremap(unsigned long, unsigned long);
-
 /*
  * To make EFI call EFI runtime service in physical addressing mode we need
  * prelog/epilog before/after the invocation to disable interrupt, to
@@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long);
  */
 
 static unsigned long efi_rt_eflags;
-static DEFINE_SPINLOCK(efi_rt_lock);
 static pgd_t efi_bak_pg_dir_pointer[2];
 
-static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
+void efi_call_phys_prelog(void)
 {
        unsigned long cr4;
        unsigned long temp;
-       struct Xgt_desc_struct gdt_descr;
+       struct desc_ptr gdt_descr;
 
-       spin_lock(&efi_rt_lock);
        local_irq_save(efi_rt_eflags);
 
        /*
@@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
        /*
         * After the lock is released, the original page table is restored.
         */
-       local_flush_tlb();
+       __flush_tlb_all();
 
        gdt_descr.address = __pa(get_cpu_gdt_table(0));
        gdt_descr.size = GDT_SIZE - 1;
        load_gdt(&gdt_descr);
 }
 
-static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
+void efi_call_phys_epilog(void)
 {
        unsigned long cr4;
-       struct Xgt_desc_struct gdt_descr;
+       struct desc_ptr gdt_descr;
 
        gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
        gdt_descr.size = GDT_SIZE - 1;
@@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
        /*
         * After the lock is released, the original page table is restored.
         */
-       local_flush_tlb();
+       __flush_tlb_all();
 
        local_irq_restore(efi_rt_eflags);
-       spin_unlock(&efi_rt_lock);
-}
-
-static efi_status_t
-phys_efi_set_virtual_address_map(unsigned long memory_map_size,
-                                unsigned long descriptor_size,
-                                u32 descriptor_version,
-                                efi_memory_desc_t *virtual_map)
-{
-       efi_status_t status;
-
-       efi_call_phys_prelog();
-       status = efi_call_phys(efi_phys.set_virtual_address_map,
-                                    memory_map_size, descriptor_size,
-                                    descriptor_version, virtual_map);
-       efi_call_phys_epilog();
-       return status;
-}
-
-static efi_status_t
-phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
-       efi_status_t status;
-
-       efi_call_phys_prelog();
-       status = efi_call_phys(efi_phys.get_time, tm, tc);
-       efi_call_phys_epilog();
-       return status;
-}
-
-inline int efi_set_rtc_mmss(unsigned long nowtime)
-{
-       int real_seconds, real_minutes;
-       efi_status_t    status;
-       efi_time_t      eft;
-       efi_time_cap_t  cap;
-
-       spin_lock(&efi_rt_lock);
-       status = efi.get_time(&eft, &cap);
-       spin_unlock(&efi_rt_lock);
-       if (status != EFI_SUCCESS)
-               panic("Ooops, efitime: can't read time!\n");
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-
-       if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
-               real_minutes += 30;
-       real_minutes %= 60;
-
-       eft.minute = real_minutes;
-       eft.second = real_seconds;
-
-       if (status != EFI_SUCCESS) {
-               printk("Ooops: efitime: can't read time!\n");
-               return -1;
-       }
-       return 0;
-}
-/*
- * This is used during kernel init before runtime
- * services have been remapped and also during suspend, therefore,
- * we'll need to call both in physical and virtual modes.
- */
-inline unsigned long efi_get_time(void)
-{
-       efi_status_t status;
-       efi_time_t eft;
-       efi_time_cap_t cap;
-
-       if (efi.get_time) {
-               /* if we are in virtual mode use remapped function */
-               status = efi.get_time(&eft, &cap);
-       } else {
-               /* we are in physical mode */
-               status = phys_efi_get_time(&eft, &cap);
-       }
-
-       if (status != EFI_SUCCESS)
-               printk("Oops: efitime: can't read time status: 0x%lx\n",status);
-
-       return mktime(eft.year, eft.month, eft.day, eft.hour,
-                       eft.minute, eft.second);
-}
-
-int is_available_memory(efi_memory_desc_t * md)
-{
-       if (!(md->attribute & EFI_MEMORY_WB))
-               return 0;
-
-       switch (md->type) {
-               case EFI_LOADER_CODE:
-               case EFI_LOADER_DATA:
-               case EFI_BOOT_SERVICES_CODE:
-               case EFI_BOOT_SERVICES_DATA:
-               case EFI_CONVENTIONAL_MEMORY:
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * We need to map the EFI memory map again after paging_init().
- */
-void __init efi_map_memmap(void)
-{
-       memmap.map = NULL;
-
-       memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
-                       (memmap.nr_map * memmap.desc_size));
-       if (memmap.map == NULL)
-               printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
-
-       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-}
-
-#if EFI_DEBUG
-static void __init print_efi_memmap(void)
-{
-       efi_memory_desc_t *md;
-       void *p;
-       int i;
-
-       for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
-               md = p;
-               printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
-                       "range=[0x%016llx-0x%016llx) (%lluMB)\n",
-                       i, md->type, md->attribute, md->phys_addr,
-                       md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
-                       (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
-       }
-}
-#endif  /*  EFI_DEBUG  */
-
-/*
- * Walks the EFI memory map and calls CALLBACK once for each EFI
- * memory descriptor that has memory that is available for kernel use.
- */
-void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
-{
-       int prev_valid = 0;
-       struct range {
-               unsigned long start;
-               unsigned long end;
-       } uninitialized_var(prev), curr;
-       efi_memory_desc_t *md;
-       unsigned long start, end;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if ((md->num_pages == 0) || (!is_available_memory(md)))
-                       continue;
-
-               curr.start = md->phys_addr;
-               curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
-
-               if (!prev_valid) {
-                       prev = curr;
-                       prev_valid = 1;
-               } else {
-                       if (curr.start < prev.start)
-                               printk(KERN_INFO PFX "Unordered memory map\n");
-                       if (prev.end == curr.start)
-                               prev.end = curr.end;
-                       else {
-                               start =
-                                   (unsigned long) (PAGE_ALIGN(prev.start));
-                               end = (unsigned long) (prev.end & PAGE_MASK);
-                               if ((end > start)
-                                   && (*callback) (start, end, arg) < 0)
-                                       return;
-                               prev = curr;
-                       }
-               }
-       }
-       if (prev_valid) {
-               start = (unsigned long) PAGE_ALIGN(prev.start);
-               end = (unsigned long) (prev.end & PAGE_MASK);
-               if (end > start)
-                       (*callback) (start, end, arg);
-       }
-}
-
-void __init efi_init(void)
-{
-       efi_config_table_t *config_tables;
-       efi_runtime_services_t *runtime;
-       efi_char16_t *c16;
-       char vendor[100] = "unknown";
-       unsigned long num_config_tables;
-       int i = 0;
-
-       memset(&efi, 0, sizeof(efi) );
-       memset(&efi_phys, 0, sizeof(efi_phys));
-
-       efi_phys.systab =
-               (efi_system_table_t *)boot_params.efi_info.efi_systab;
-       memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
-       memmap.nr_map = boot_params.efi_info.efi_memmap_size/
-               boot_params.efi_info.efi_memdesc_size;
-       memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
-       memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
-
-       efi.systab = (efi_system_table_t *)
-               boot_ioremap((unsigned long) efi_phys.systab,
-                       sizeof(efi_system_table_t));
-       /*
-        * Verify the EFI Table
-        */
-       if (efi.systab == NULL)
-               printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
-       if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
-               printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
-       if ((efi.systab->hdr.revision >> 16) == 0)
-               printk(KERN_ERR PFX "Warning: EFI system table version "
-                      "%d.%02d, expected 1.00 or greater\n",
-                      efi.systab->hdr.revision >> 16,
-                      efi.systab->hdr.revision & 0xffff);
-
-       /*
-        * Grab some details from the system table
-        */
-       num_config_tables = efi.systab->nr_tables;
-       config_tables = (efi_config_table_t *)efi.systab->tables;
-       runtime = efi.systab->runtime;
-
-       /*
-        * Show what we know for posterity
-        */
-       c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
-       if (c16) {
-               for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
-                       vendor[i] = *c16++;
-               vendor[i] = '\0';
-       } else
-               printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
-
-       printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
-              efi.systab->hdr.revision >> 16,
-              efi.systab->hdr.revision & 0xffff, vendor);
-
-       /*
-        * Let's see what config tables the firmware passed to us.
-        */
-       config_tables = (efi_config_table_t *)
-                               boot_ioremap((unsigned long) config_tables,
-                               num_config_tables * sizeof(efi_config_table_t));
-
-       if (config_tables == NULL)
-               printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
-
-       efi.mps        = EFI_INVALID_TABLE_ADDR;
-       efi.acpi       = EFI_INVALID_TABLE_ADDR;
-       efi.acpi20     = EFI_INVALID_TABLE_ADDR;
-       efi.smbios     = EFI_INVALID_TABLE_ADDR;
-       efi.sal_systab = EFI_INVALID_TABLE_ADDR;
-       efi.boot_info  = EFI_INVALID_TABLE_ADDR;
-       efi.hcdp       = EFI_INVALID_TABLE_ADDR;
-       efi.uga        = EFI_INVALID_TABLE_ADDR;
-
-       for (i = 0; i < num_config_tables; i++) {
-               if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-                       efi.mps = config_tables[i].table;
-                       printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-                       efi.acpi20 = config_tables[i].table;
-                       printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-                       efi.acpi = config_tables[i].table;
-                       printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-                       efi.smbios = config_tables[i].table;
-                       printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-                       efi.hcdp = config_tables[i].table;
-                       printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
-               } else
-                   if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
-                       efi.uga = config_tables[i].table;
-                       printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
-               }
-       }
-       printk("\n");
-
-       /*
-        * Check out the runtime services table. We need to map
-        * the runtime services table so that we can grab the physical
-        * address of several of the EFI runtime functions, needed to
-        * set the firmware into virtual mode.
-        */
-
-       runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
-                                               runtime,
-                                               sizeof(efi_runtime_services_t));
-       if (runtime != NULL) {
-               /*
-                * We will only need *early* access to the following
-                * two EFI runtime services before set_virtual_address_map
-                * is invoked.
-                */
-               efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
-               efi_phys.set_virtual_address_map =
-                       (efi_set_virtual_address_map_t *)
-                               runtime->set_virtual_address_map;
-       } else
-               printk(KERN_ERR PFX "Could not map the runtime service table!\n");
-
-       /* Map the EFI memory map for use until paging_init() */
-       memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap,
-                                 boot_params.efi_info.efi_memmap_size);
-       if (memmap.map == NULL)
-               printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
-
-       memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-
-#if EFI_DEBUG
-       print_efi_memmap();
-#endif
-}
-
-static inline void __init check_range_for_systab(efi_memory_desc_t *md)
-{
-       if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
-               ((unsigned long)efi_phys.systab < md->phys_addr +
-               ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
-               unsigned long addr;
-
-               addr = md->virt_addr - md->phys_addr +
-                       (unsigned long)efi_phys.systab;
-               efi.systab = (efi_system_table_t *)addr;
-       }
-}
-
-/*
- * Wrap all the virtual calls in a way that forces the parameters on the stack.
- */
-
-#define efi_call_virt(f, args...) \
-     ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
-
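Editor's note: the macro simply casts the raw runtime-services pointer to the matching function typedef, with regparm(0) forcing every parameter onto the stack as the comment says. Roughly, the first wrapper below expands to:

        /* expansion sketch of efi_call_virt(get_time, tm, tc) */
        ((efi_get_time_t __attribute__((regparm(0))) *)
                efi.systab->runtime->get_time)(tm, tc);
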
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
-       return efi_call_virt(get_time, tm, tc);
-}
-
-static efi_status_t virt_efi_set_time (efi_time_t *tm)
-{
-       return efi_call_virt(set_time, tm);
-}
-
-static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
-                                             efi_bool_t *pending,
-                                             efi_time_t *tm)
-{
-       return efi_call_virt(get_wakeup_time, enabled, pending, tm);
-}
-
-static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
-                                             efi_time_t *tm)
-{
-       return efi_call_virt(set_wakeup_time, enabled, tm);
-}
-
-static efi_status_t virt_efi_get_variable (efi_char16_t *name,
-                                          efi_guid_t *vendor, u32 *attr,
-                                          unsigned long *data_size, void *data)
-{
-       return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
-                                               efi_char16_t *name,
-                                               efi_guid_t *vendor)
-{
-       return efi_call_virt(get_next_variable, name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable (efi_char16_t *name,
-                                          efi_guid_t *vendor,
-                                          unsigned long attr,
-                                          unsigned long data_size, void *data)
-{
-       return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
-{
-       return efi_call_virt(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system (int reset_type, efi_status_t status,
-                                  unsigned long data_size,
-                                  efi_char16_t *data)
-{
-       efi_call_virt(reset_system, reset_type, status, data_size, data);
-}
-
-/*
- * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
- */
-
-void __init efi_enter_virtual_mode(void)
-{
-       efi_memory_desc_t *md;
-       efi_status_t status;
-       void *p;
-
-       efi.systab = NULL;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if (!(md->attribute & EFI_MEMORY_RUNTIME))
-                       continue;
-
-               md->virt_addr = (unsigned long)ioremap(md->phys_addr,
-                       md->num_pages << EFI_PAGE_SHIFT);
-               if (!(unsigned long)md->virt_addr) {
-                       printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
-                               (unsigned long)md->phys_addr);
-               }
-               /* update the virtual address of the EFI system table */
-               check_range_for_systab(md);
-       }
-
-       BUG_ON(!efi.systab);
-
-       status = phys_efi_set_virtual_address_map(
-                       memmap.desc_size * memmap.nr_map,
-                       memmap.desc_size,
-                       memmap.desc_version,
-                       memmap.phys_map);
-
-       if (status != EFI_SUCCESS) {
-               printk (KERN_ALERT "You are screwed! "
-                       "Unable to switch EFI into virtual mode "
-                       "(status=%lx)\n", status);
-               panic("EFI call to SetVirtualAddressMap() failed!");
-       }
-
-       /*
-        * Now that EFI is in virtual mode, update the function
-        * pointers in the runtime service table to the new virtual addresses.
-        */
-
-       efi.get_time = virt_efi_get_time;
-       efi.set_time = virt_efi_set_time;
-       efi.get_wakeup_time = virt_efi_get_wakeup_time;
-       efi.set_wakeup_time = virt_efi_set_wakeup_time;
-       efi.get_variable = virt_efi_get_variable;
-       efi.get_next_variable = virt_efi_get_next_variable;
-       efi.set_variable = virt_efi_set_variable;
-       efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
-       efi.reset_system = virt_efi_reset_system;
-}
-
-void __init
-efi_initialize_iomem_resources(struct resource *code_resource,
-                              struct resource *data_resource,
-                              struct resource *bss_resource)
-{
-       struct resource *res;
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-
-               if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
-                   0x100000000ULL)
-                       continue;
-               res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-               switch (md->type) {
-               case EFI_RESERVED_TYPE:
-                       res->name = "Reserved Memory";
-                       break;
-               case EFI_LOADER_CODE:
-                       res->name = "Loader Code";
-                       break;
-               case EFI_LOADER_DATA:
-                       res->name = "Loader Data";
-                       break;
-               case EFI_BOOT_SERVICES_DATA:
-                       res->name = "BootServices Data";
-                       break;
-               case EFI_BOOT_SERVICES_CODE:
-                       res->name = "BootServices Code";
-                       break;
-               case EFI_RUNTIME_SERVICES_CODE:
-                       res->name = "Runtime Service Code";
-                       break;
-               case EFI_RUNTIME_SERVICES_DATA:
-                       res->name = "Runtime Service Data";
-                       break;
-               case EFI_CONVENTIONAL_MEMORY:
-                       res->name = "Conventional Memory";
-                       break;
-               case EFI_UNUSABLE_MEMORY:
-                       res->name = "Unusable Memory";
-                       break;
-               case EFI_ACPI_RECLAIM_MEMORY:
-                       res->name = "ACPI Reclaim";
-                       break;
-               case EFI_ACPI_MEMORY_NVS:
-                       res->name = "ACPI NVS";
-                       break;
-               case EFI_MEMORY_MAPPED_IO:
-                       res->name = "Memory Mapped IO";
-                       break;
-               case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
-                       res->name = "Memory Mapped IO Port Space";
-                       break;
-               default:
-                       res->name = "Reserved";
-                       break;
-               }
-               res->start = md->phys_addr;
-               res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-               if (request_resource(&iomem_resource, res) < 0)
-                       printk(KERN_ERR PFX "Failed to allocate res %s : "
-                               "0x%llx-0x%llx\n", res->name,
-                               (unsigned long long)res->start,
-                               (unsigned long long)res->end);
-               /*
-                * We don't know which region contains kernel data so we try
-                * it repeatedly and let the resource manager test it.
-                */
-               if (md->type == EFI_CONVENTIONAL_MEMORY) {
-                       request_resource(res, code_resource);
-                       request_resource(res, data_resource);
-                       request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
-#endif
-               }
-       }
-}
-
-/*
- * Convenience functions to obtain memory types and attributes
- */
-
-u32 efi_mem_type(unsigned long phys_addr)
-{
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-               if ((md->phys_addr <= phys_addr) && (phys_addr <
-                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
-                       return md->type;
-       }
-       return 0;
-}
-
-u64 efi_mem_attributes(unsigned long phys_addr)
-{
-       efi_memory_desc_t *md;
-       void *p;
-
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               md = p;
-               if ((md->phys_addr <= phys_addr) && (phys_addr <
-                       (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
-                       return md->attribute;
-       }
-       return 0;
 }
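Editor's note: a hypothetical caller of these two helpers might use them to pick a mapping type for a physical range; the EFI_MEMORY_WB test below is an illustration only, not taken from this patch:

        /* allow a cacheable mapping only if the firmware permits write-back */
        if (efi_mem_attributes(phys_addr) & EFI_MEMORY_WB)
                /* ... create a cacheable mapping ... */;
        else
                /* ... fall back to an uncached mapping ... */;
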
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
new file mode 100644 (file)
index 0000000..4b73992
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * x86_64 specific EFI support functions
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 2005-2008 Intel Co.
+ *     Fenghua Yu <fenghua.yu@intel.com>
+ *     Bibo Mao <bibo.mao@intel.com>
+ *     Chandramouli Narayanan <mouli@linux.intel.com>
+ *     Huang Ying <ying.huang@intel.com>
+ *
+ * Code to convert EFI to E820 map has been implemented in elilo bootloader
+ * based on an EFI patch by Edgar Hucek. Based on the E820 map, the page table
+ * is set up appropriately for EFI runtime code.
+ * - mouli 06/14/2007.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+
+#include <asm/setup.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/efi.h>
+
+static pgd_t save_pgd __initdata;
+static unsigned long efi_flags __initdata;
+
+static void __init early_mapping_set_exec(unsigned long start,
+                                         unsigned long end,
+                                         int executable)
+{
+       pte_t *kpte;
+       int level;
+
+       while (start < end) {
+               kpte = lookup_address((unsigned long)__va(start), &level);
+               BUG_ON(!kpte);
+               if (executable)
+                       set_pte(kpte, pte_mkexec(*kpte));
+               else
+                       set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
+                                           __supported_pte_mask));
+               if (level == 4)
+                       start = (start + PMD_SIZE) & PMD_MASK;
+               else
+                       start = (start + PAGE_SIZE) & PAGE_MASK;
+       }
+}
+
+static void __init early_runtime_code_mapping_set_exec(int executable)
+{
+       efi_memory_desc_t *md;
+       void *p;
+
+       if (!(__supported_pte_mask & _PAGE_NX))
+               return;
+
+       /* Make EFI runtime service code area executable */
+       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+               md = p;
+               if (md->type == EFI_RUNTIME_SERVICES_CODE) {
+                       unsigned long end;
+                       end = md->phys_addr + (md->num_pages << PAGE_SHIFT);
+                       early_mapping_set_exec(md->phys_addr, end, executable);
+               }
+       }
+}
+
+void __init efi_call_phys_prelog(void)
+{
+       unsigned long vaddress;
+
+       local_irq_save(efi_flags);
+       early_runtime_code_mapping_set_exec(1);
+       vaddress = (unsigned long)__va(0x0UL);
+       save_pgd = *pgd_offset_k(0x0UL);
+       set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
+       __flush_tlb_all();
+}
+
+void __init efi_call_phys_epilog(void)
+{
+       /*
+        * After the lock is released, the original page table is restored.
+        */
+       set_pgd(pgd_offset_k(0x0UL), save_pgd);
+       early_runtime_code_mapping_set_exec(0);
+       __flush_tlb_all();
+       local_irq_restore(efi_flags);
+}
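Editor's note: the prelog/epilog pair is meant to bracket each physical-mode EFI call made before SetVirtualAddressMap(): the prelog saves PGD entry 0 and aliases the kernel's low-memory mapping into it, so the firmware's 1:1 physical addresses resolve, and the epilog restores the original entry. A sketch of the intended call sequence; the get_time call shown is an assumption, and it relies on the efi_call2 stub added later in this patch:

        efi_call_phys_prelog();
        status = efi_call2((void *)efi_phys.get_time, (u64)tm, (u64)tc);
        efi_call_phys_epilog();
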
+
+void __init efi_reserve_bootmem(void)
+{
+       reserve_bootmem_generic((unsigned long)memmap.phys_map,
+                               memmap.nr_map * memmap.desc_size);
+}
+
+void __iomem * __init efi_ioremap(unsigned long offset,
+                                 unsigned long size)
+{
+       static unsigned pages_mapped;
+       unsigned long last_addr;
+       unsigned i, pages;
+
+       last_addr = offset + size - 1;
+       offset &= PAGE_MASK;
+       pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT;
+       if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+               return NULL;
+
+       for (i = 0; i < pages; i++) {
+               __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
+                            offset, PAGE_KERNEL_EXEC_NOCACHE);
+               offset += PAGE_SIZE;
+               pages_mapped++;
+       }
+
+       return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
+                                            (pages_mapped - pages));
+}
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
new file mode 100644 (file)
index 0000000..99b47d4
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Function calling ABI conversion from Linux to EFI for x86_64
+ *
+ * Copyright (C) 2007 Intel Corp
+ *     Bibo Mao <bibo.mao@intel.com>
+ *     Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/linkage.h>
+
+#define SAVE_XMM                       \
+       mov %rsp, %rax;                 \
+       subq $0x70, %rsp;               \
+       and $~0xf, %rsp;                \
+       mov %rax, (%rsp);               \
+       mov %cr0, %rax;                 \
+       clts;                           \
+       mov %rax, 0x8(%rsp);            \
+       movaps %xmm0, 0x60(%rsp);       \
+       movaps %xmm1, 0x50(%rsp);       \
+       movaps %xmm2, 0x40(%rsp);       \
+       movaps %xmm3, 0x30(%rsp);       \
+       movaps %xmm4, 0x20(%rsp);       \
+       movaps %xmm5, 0x10(%rsp)
+
+#define RESTORE_XMM                    \
+       movaps 0x60(%rsp), %xmm0;       \
+       movaps 0x50(%rsp), %xmm1;       \
+       movaps 0x40(%rsp), %xmm2;       \
+       movaps 0x30(%rsp), %xmm3;       \
+       movaps 0x20(%rsp), %xmm4;       \
+       movaps 0x10(%rsp), %xmm5;       \
+       mov 0x8(%rsp), %rsi;            \
+       mov %rsi, %cr0;                 \
+       mov (%rsp), %rsp
+
+ENTRY(efi_call0)
+       SAVE_XMM
+       subq $32, %rsp
+       call *%rdi
+       addq $32, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call1)
+       SAVE_XMM
+       subq $32, %rsp
+       mov  %rsi, %rcx
+       call *%rdi
+       addq $32, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call2)
+       SAVE_XMM
+       subq $32, %rsp
+       mov  %rsi, %rcx
+       call *%rdi
+       addq $32, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call3)
+       SAVE_XMM
+       subq $32, %rsp
+       mov  %rcx, %r8
+       mov  %rsi, %rcx
+       call *%rdi
+       addq $32, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call4)
+       SAVE_XMM
+       subq $32, %rsp
+       mov %r8, %r9
+       mov %rcx, %r8
+       mov %rsi, %rcx
+       call *%rdi
+       addq $32, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call5)
+       SAVE_XMM
+       subq $48, %rsp
+       mov %r9, 32(%rsp)
+       mov %r8, %r9
+       mov %rcx, %r8
+       mov %rsi, %rcx
+       call *%rdi
+       addq $48, %rsp
+       RESTORE_XMM
+       ret
+
+ENTRY(efi_call6)
+       SAVE_XMM
+       mov (%rsp), %rax
+       mov 8(%rax), %rax
+       subq $48, %rsp
+       mov %r9, 32(%rsp)
+       mov %rax, 40(%rsp)
+       mov %r8, %r9
+       mov %rcx, %r8
+       mov %rsi, %rcx
+       call *%rdi
+       addq $48, %rsp
+       RESTORE_XMM
+       ret
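Editor's note: from the C side these stubs would be declared roughly as below; the prototypes are an assumption for illustration and are not part of this diff. In every case the first argument is the EFI function pointer, and the stub moves the remaining arguments from the SysV AMD64 registers (%rsi, %rdx, %rcx, %r8, %r9 and the stack) into the Microsoft x64 registers (%rcx, %rdx, %r8, %r9 and the stack) that the firmware expects, after saving %xmm0-%xmm5 and clearing CR0.TS.

        extern u64 efi_call0(void *fp);
        extern u64 efi_call1(void *fp, u64 a1);
        extern u64 efi_call2(void *fp, u64 a1, u64 a2);
        extern u64 efi_call3(void *fp, u64 a1, u64 a2, u64 a3);
        extern u64 efi_call4(void *fp, u64 a1, u64 a2, u64 a3, u64 a4);
        extern u64 efi_call5(void *fp, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5);
        extern u64 efi_call6(void *fp, u64 a1, u64 a2, u64 a3, u64 a4,
                             u64 a5, u64 a6);
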
index dc7f938e501501a74a3a97d943b5014795bcba49..be5c31d048847e0ba2ae2f76eb5a136e9ee96cb7 100644 (file)
@@ -58,7 +58,7 @@
  * for paravirtualization.  The following will never clobber any registers:
  *   INTERRUPT_RETURN (aka. "iret")
  *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
  *
  * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
  * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -283,12 +283,12 @@ END(resume_kernel)
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
        # sysenter call handler stub
-ENTRY(sysenter_entry)
+ENTRY(ia32_sysenter_target)
        CFI_STARTPROC simple
        CFI_SIGNAL_FRAME
        CFI_DEF_CFA esp, 0
        CFI_REGISTER esp, ebp
-       movl TSS_sysenter_esp0(%esp),%esp
+       movl TSS_sysenter_sp0(%esp),%esp
 sysenter_past_esp:
        /*
         * No need to follow this irqs on/off section: the syscall
@@ -351,7 +351,7 @@ sysenter_past_esp:
        xorl %ebp,%ebp
        TRACE_IRQS_ON
 1:     mov  PT_FS(%esp), %fs
-       ENABLE_INTERRUPTS_SYSEXIT
+       ENABLE_INTERRUPTS_SYSCALL_RET
        CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:     movl $0,PT_FS(%esp)
@@ -360,7 +360,7 @@ sysenter_past_esp:
        .align 4
        .long 1b,2b
 .popsection
-ENDPROC(sysenter_entry)
+ENDPROC(ia32_sysenter_target)
 
        # system call handler stub
 ENTRY(system_call)
@@ -583,7 +583,7 @@ END(syscall_badsys)
  * Build the entry stubs and pointer table with
  * some assembler magic.
  */
-.data
+.section .rodata,"a"
 ENTRY(interrupt)
 .text
 
@@ -743,7 +743,7 @@ END(device_not_available)
  * that sets up the real kernel stack. Check here, since we can't
  * allow the wrong stack to be used.
  *
- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
  * already pushed 3 words if it hits on the sysenter instruction:
  * eflags, cs and eip.
  *
@@ -755,7 +755,7 @@ END(device_not_available)
        cmpw $__KERNEL_CS,4(%esp);              \
        jne ok;                                 \
 label:                                         \
-       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
+       movl TSS_sysenter_sp0+offset(%esp),%esp;        \
        CFI_DEF_CFA esp, 0;                     \
        CFI_UNDEFINED eip;                      \
        pushfl;                                 \
@@ -768,7 +768,7 @@ label:                                              \
 
 KPROBE_ENTRY(debug)
        RING0_INT_FRAME
-       cmpl $sysenter_entry,(%esp)
+       cmpl $ia32_sysenter_target,(%esp)
        jne debug_stack_correct
        FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
 debug_stack_correct:
@@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi)
        popl %eax
        CFI_ADJUST_CFA_OFFSET -4
        je nmi_espfix_stack
-       cmpl $sysenter_entry,(%esp)
+       cmpl $ia32_sysenter_target,(%esp)
        je nmi_stack_fixup
        pushl %eax
        CFI_ADJUST_CFA_OFFSET 4
@@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi)
        popl %eax
        CFI_ADJUST_CFA_OFFSET -4
        jae nmi_stack_correct
-       cmpl $sysenter_entry,12(%esp)
+       cmpl $ia32_sysenter_target,12(%esp)
        je nmi_debug_stack_check
 nmi_stack_correct:
        /* We have a RING0_INT_FRAME here */
@@ -882,10 +882,10 @@ ENTRY(native_iret)
 .previous
 END(native_iret)
 
-ENTRY(native_irq_enable_sysexit)
+ENTRY(native_irq_enable_syscall_ret)
        sti
        sysexit
-END(native_irq_enable_sysexit)
+END(native_irq_enable_syscall_ret)
 #endif
 
 KPROBE_ENTRY(int3)
index e70f3881d7e486f1ec55c14bcea41065506aaff2..bea8474744ffb633ad2d367bc478ec8cae3fb962 100644 (file)
@@ -50,6 +50,7 @@
 #include <asm/hw_irq.h>
 #include <asm/page.h>
 #include <asm/irqflags.h>
+#include <asm/paravirt.h>
 
        .code64
 
 #define retint_kernel retint_restore_args
 #endif 
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_irq_enable_syscall_ret)
+       movq    %gs:pda_oldrsp,%rsp
+       swapgs
+       sysretq
+#endif /* CONFIG_PARAVIRT */
+
 
 .macro TRACE_IRQS_IRETQ offset=ARGOFFSET
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -216,14 +224,21 @@ ENTRY(system_call)
        CFI_DEF_CFA     rsp,PDA_STACKOFFSET
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
-       swapgs
+       SWAPGS_UNSAFE_STACK
+       /*
+        * A hypervisor implementation might want to use a label
+        * after the swapgs, so that it can do the swapgs
+        * for the guest and jump here on syscall.
+        */
+ENTRY(system_call_after_swapgs)
+
        movq    %rsp,%gs:pda_oldrsp 
        movq    %gs:pda_kernelstack,%rsp
        /*
         * No need to follow this irqs off/on section - it's straight
         * and short:
         */
-       sti                                     
+       ENABLE_INTERRUPTS(CLBR_NONE)
        SAVE_ARGS 8,1
        movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
        movq  %rcx,RIP-ARGOFFSET(%rsp)
@@ -246,7 +261,7 @@ ret_from_sys_call:
 sysret_check:          
        LOCKDEP_SYS_EXIT
        GET_THREAD_INFO(%rcx)
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        movl threadinfo_flags(%rcx),%edx
        andl %edi,%edx
@@ -260,9 +275,7 @@ sysret_check:
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 0,-ARG_SKIP,1
        /*CFI_REGISTER  rflags,r11*/
-       movq    %gs:pda_oldrsp,%rsp
-       swapgs
-       sysretq
+       ENABLE_INTERRUPTS_SYSCALL_RET
 
        CFI_RESTORE_STATE
        /* Handle reschedules */
@@ -271,7 +284,7 @@ sysret_careful:
        bt $TIF_NEED_RESCHED,%edx
        jnc sysret_signal
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq %rdi
        CFI_ADJUST_CFA_OFFSET 8
        call schedule
@@ -282,7 +295,7 @@ sysret_careful:
        /* Handle a signal */ 
 sysret_signal:
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        testl $_TIF_DO_NOTIFY_MASK,%edx
        jz    1f
 
@@ -295,7 +308,7 @@ sysret_signal:
 1:     movl $_TIF_NEED_RESCHED,%edi
        /* Use IRET because user could have changed frame. This
           works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp int_with_check
        
@@ -327,7 +340,7 @@ tracesys:
  */
        .globl int_ret_from_sys_call
 int_ret_from_sys_call:
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        testl $3,CS-ARGOFFSET(%rsp)
        je retint_restore_args
@@ -349,20 +362,20 @@ int_careful:
        bt $TIF_NEED_RESCHED,%edx
        jnc  int_very_careful
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq %rdi
        CFI_ADJUST_CFA_OFFSET 8
        call schedule
        popq %rdi
        CFI_ADJUST_CFA_OFFSET -8
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp int_with_check
 
        /* handle signals and tracing -- both require a full stack frame */
 int_very_careful:
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        SAVE_REST
        /* Check for syscall exit trace */      
        testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -385,7 +398,7 @@ int_signal:
 1:     movl $_TIF_NEED_RESCHED,%edi    
 int_restore_rest:
        RESTORE_REST
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp int_with_check
        CFI_ENDPROC
@@ -506,7 +519,7 @@ END(stub_rt_sigreturn)
        CFI_DEF_CFA_REGISTER    rbp
        testl $3,CS(%rdi)
        je 1f
-       swapgs  
+       SWAPGS
        /* irqcount is used to check if a CPU is already on an interrupt
           stack or not. While this is essentially redundant with preempt_count
           it is a little cheaper to use a separate counter in the PDA
@@ -527,7 +540,7 @@ ENTRY(common_interrupt)
        interrupt do_IRQ
        /* 0(%rsp): oldrsp-ARGOFFSET */
 ret_from_intr:
-       cli     
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        decl %gs:pda_irqcount
        leaveq
@@ -556,13 +569,13 @@ retint_swapgs:            /* return to user-space */
        /*
         * The iretq could re-enable interrupts:
         */
-       cli
+       DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_IRETQ
-       swapgs 
+       SWAPGS
        jmp restore_args
 
 retint_restore_args:   /* return to kernel space */
-       cli
+       DISABLE_INTERRUPTS(CLBR_ANY)
        /*
         * The iretq could re-enable interrupts:
         */
@@ -570,10 +583,14 @@ retint_restore_args:      /* return to kernel space */
 restore_args:
        RESTORE_ARGS 0,8,0                                              
 iret_label:    
+#ifdef CONFIG_PARAVIRT
+       INTERRUPT_RETURN
+#endif
+ENTRY(native_iret)
        iretq
 
        .section __ex_table,"a"
-       .quad iret_label,bad_iret       
+       .quad native_iret, bad_iret
        .previous
        .section .fixup,"ax"
        /* force a signal here? this matches i386 behaviour */
@@ -581,24 +598,24 @@ iret_label:
 bad_iret:
        movq $11,%rdi   /* SIGSEGV */
        TRACE_IRQS_ON
-       sti
-       jmp do_exit                     
-       .previous       
-       
+       ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+       jmp do_exit
+       .previous
+
        /* edi: workmask, edx: work */
 retint_careful:
        CFI_RESTORE_STATE
        bt    $TIF_NEED_RESCHED,%edx
        jnc   retint_signal
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        pushq %rdi
        CFI_ADJUST_CFA_OFFSET   8
        call  schedule
        popq %rdi               
        CFI_ADJUST_CFA_OFFSET   -8
        GET_THREAD_INFO(%rcx)
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        jmp retint_check
        
@@ -606,14 +623,14 @@ retint_signal:
        testl $_TIF_DO_NOTIFY_MASK,%edx
        jz    retint_swapgs
        TRACE_IRQS_ON
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        SAVE_REST
        movq $-1,ORIG_RAX(%rsp)                         
        xorl %esi,%esi          # oldset
        movq %rsp,%rdi          # &pt_regs
        call do_notify_resume
        RESTORE_REST
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        movl $_TIF_NEED_RESCHED,%edi
        GET_THREAD_INFO(%rcx)
@@ -731,7 +748,7 @@ END(spurious_interrupt)
        rdmsr
        testl %edx,%edx
        js    1f
-       swapgs
+       SWAPGS
        xorl  %ebx,%ebx
 1:
        .if \ist
@@ -747,7 +764,7 @@ END(spurious_interrupt)
        .if \ist
        addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
        .endif
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        .if \irqtrace
        TRACE_IRQS_OFF
        .endif
@@ -776,10 +793,10 @@ paranoid_swapgs\trace:
        .if \trace
        TRACE_IRQS_IRETQ 0
        .endif
-       swapgs
+       SWAPGS_UNSAFE_STACK
 paranoid_restore\trace:
        RESTORE_ALL 8
-       iretq
+       INTERRUPT_RETURN
 paranoid_userspace\trace:
        GET_THREAD_INFO(%rcx)
        movl threadinfo_flags(%rcx),%ebx
@@ -794,11 +811,11 @@ paranoid_userspace\trace:
        .if \trace
        TRACE_IRQS_ON
        .endif
-       sti
+       ENABLE_INTERRUPTS(CLBR_NONE)
        xorl %esi,%esi                  /* arg2: oldset */
        movq %rsp,%rdi                  /* arg1: &pt_regs */
        call do_notify_resume
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        .if \trace
        TRACE_IRQS_OFF
        .endif
@@ -807,9 +824,9 @@ paranoid_schedule\trace:
        .if \trace
        TRACE_IRQS_ON
        .endif
-       sti
+       ENABLE_INTERRUPTS(CLBR_ANY)
        call schedule
-       cli
+       DISABLE_INTERRUPTS(CLBR_ANY)
        .if \trace
        TRACE_IRQS_OFF
        .endif
@@ -862,7 +879,7 @@ KPROBE_ENTRY(error_entry)
        testl $3,CS(%rsp)
        je  error_kernelspace
 error_swapgs:  
-       swapgs
+       SWAPGS
 error_sti:     
        movq %rdi,RDI(%rsp)     
        CFI_REL_OFFSET  rdi,RDI
@@ -874,7 +891,7 @@ error_sti:
 error_exit:
        movl %ebx,%eax
        RESTORE_REST
-       cli
+       DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
        GET_THREAD_INFO(%rcx)   
        testl %eax,%eax
@@ -911,12 +928,12 @@ ENTRY(load_gs_index)
        CFI_STARTPROC
        pushf
        CFI_ADJUST_CFA_OFFSET 8
-       cli
-        swapgs
+       DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+        SWAPGS
 gs_change:     
         movl %edi,%gs   
 2:     mfence          /* workaround */
-       swapgs
+       SWAPGS
         popf
        CFI_ADJUST_CFA_OFFSET -8
         ret
@@ -930,7 +947,7 @@ ENDPROC(load_gs_index)
         .section .fixup,"ax"
        /* running with kernelgs */
 bad_gs: 
-       swapgs                  /* switch back to user gs */
+       SWAPGS                  /* switch back to user gs */
        xorl %eax,%eax
         movl %eax,%gs
         jmp  2b
index ce703e21c91212485ccf04fdae476e806e755ac4..4ae7b64402602d98eae00f1f85b2fda1ba847cd1 100644 (file)
 #include <acpi/acpi_bus.h>
 #endif
 
-/*
- * which logical CPU number maps to which CPU (physical APIC ID)
- *
- * The following static array is used during kernel startup
- * and the x86_cpu_to_apicid_ptr contains the address of the
- * array during this time.  Is it zeroed when the per_cpu
- * data area is removed.
- */
-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
+/* which logical CPU number maps to which CPU (physical APIC ID) */
+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
                                        = { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_ptr;
-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
+void *x86_cpu_to_apicid_early_ptr;
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 
 struct genapic __read_mostly *genapic = &apic_flat;
index f12d8c5d98093b2abbe5231cb7304455622d027d..9c7f7d3959689af664bd7060b8f5c918a495dd14 100644 (file)
@@ -1,6 +1,7 @@
 /*
  * AMD Geode southbridge support code
  * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public License
@@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base);
 
 /* === GPIO API === */
 
-void geode_gpio_set(unsigned int gpio, unsigned int reg)
+void geode_gpio_set(u32 gpio, unsigned int reg)
 {
        u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
 
        if (!base)
                return;
 
-       if (gpio < 16)
-               outl(1 << gpio, base + reg);
-       else
-               outl(1 << (gpio - 16), base + 0x80 + reg);
+       /* low bank register */
+       if (gpio & 0xFFFF)
+               outl(gpio & 0xFFFF, base + reg);
+       /* high bank register */
+       gpio >>= 16;
+       if (gpio)
+               outl(gpio, base + 0x80 + reg);
 }
 EXPORT_SYMBOL_GPL(geode_gpio_set);
 
-void geode_gpio_clear(unsigned int gpio, unsigned int reg)
+void geode_gpio_clear(u32 gpio, unsigned int reg)
 {
        u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
 
        if (!base)
                return;
 
-       if (gpio < 16)
-               outl(1 << (gpio + 16), base + reg);
-       else
-               outl(1 << gpio, base + 0x80 + reg);
+       /* low bank register */
+       if (gpio & 0xFFFF)
+               outl((gpio & 0xFFFF) << 16, base + reg);
+       /* high bank register */
+       gpio &= (0xFFFF << 16);
+       if (gpio)
+               outl(gpio, base + 0x80 + reg);
 }
 EXPORT_SYMBOL_GPL(geode_gpio_clear);
 
-int geode_gpio_isset(unsigned int gpio, unsigned int reg)
+int geode_gpio_isset(u32 gpio, unsigned int reg)
 {
        u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
+       u32 val;
 
        if (!base)
                return 0;
 
-       if (gpio < 16)
-               return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
-       else
-               return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;
+       /* low bank register */
+       if (gpio & 0xFFFF) {
+               val = inl(base + reg) & (gpio & 0xFFFF);
+               if ((gpio & 0xFFFF) == val)
+                       return 1;
+       }
+       /* high bank register */
+       gpio >>= 16;
+       if (gpio) {
+               val = inl(base + 0x80 + reg) & gpio;
+               if (gpio == val)
+                       return 1;
+       }
+       return 0;
 }
 EXPORT_SYMBOL_GPL(geode_gpio_isset);
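Editor's note: the reworked helpers take a bitmask instead of a single pin number, so one call can drive several pins, and pins above 15 are routed to the high-bank registers automatically. A usage sketch; the pin numbers and the GPIO_OUTPUT_VAL / GPIO_READ_BACK register offsets are assumptions for illustration:

        /* drive pins 5 and 21 high in a single call */
        geode_gpio_set((1 << 5) | (1 << 21), GPIO_OUTPUT_VAL);

        if (geode_gpio_isset(1 << 5, GPIO_READ_BACK))
                /* pin 5 reads back high */;
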
 
index 6b3469311e42736a630d9d95d907c8230277ee51..a317336cdeaaff2c52aff6f008e6b62447e6db19 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/percpu.h>
+#include <linux/start_kernel.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820.h>
 
 static void __init zap_identity_mappings(void)
 {
        pgd_t *pgd = pgd_offset_k(0UL);
        pgd_clear(pgd);
-       __flush_tlb();
+       __flush_tlb_all();
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
@@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data)
        }
 }
 
+#define EBDA_ADDR_POINTER 0x40E
+
+static __init void reserve_ebda(void)
+{
+       unsigned ebda_addr, ebda_size;
+
+       /*
+        * there is a real-mode segmented pointer pointing to the
+        * 4K EBDA area at 0x40E
+        */
+       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+       ebda_addr <<= 4;
+
+       if (!ebda_addr)
+               return;
+
+       ebda_size = *(unsigned short *)__va(ebda_addr);
+
+       /* Round EBDA up to pages */
+       if (ebda_size == 0)
+               ebda_size = 1;
+       ebda_size <<= 10;
+       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+       if (ebda_size > 64*1024)
+               ebda_size = 64*1024;
+
+       reserve_early(ebda_addr, ebda_addr + ebda_size);
+}
+
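Editor's note: a worked example of the EBDA reservation arithmetic above, using hypothetical values:

        /* word at 0x40E            = 0x9FC0  ->  ebda_addr = 0x9FC00
         * word at 0x9FC00          = 1 (KiB) ->  ebda_size = 0x400
         * 0x400 + (0x9FC00 & ~PAGE_MASK) = 0x400 + 0xC00 = 0x1000
         * round_up(0x1000, PAGE_SIZE)    = 0x1000
         * => reserve_early(0x9FC00, 0xA0C00)
         */
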
 void __init x86_64_start_kernel(char * real_mode_data)
 {
        int i;
@@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
        /* Make NULL pointers segfault */
        zap_identity_mappings();
 
-       for (i = 0; i < IDT_ENTRIES; i++)
+       for (i = 0; i < IDT_ENTRIES; i++) {
+#ifdef CONFIG_EARLY_PRINTK
+               set_intr_gate(i, &early_idt_handlers[i]);
+#else
                set_intr_gate(i, early_idt_handler);
+#endif
+       }
        load_idt((const struct desc_ptr *)&idt_descr);
 
        early_printk("Kernel alive\n");
@@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
        pda_init(0);
        copy_bootdata(__va(real_mode_data));
-#ifdef CONFIG_SMP
-       cpu_set(0, cpu_online_map);
-#endif
+
+       reserve_early(__pa_symbol(&_text), __pa_symbol(&_end));
+
+       /* Reserve INITRD */
+       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+               unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+               unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
+               unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+               reserve_early(ramdisk_image, ramdisk_end);
+       }
+
+       reserve_ebda();
+
+       /*
+        * At this point everything still needed from the boot loader
+        * or BIOS or kernel text should be early reserved or marked not
+        * RAM in e820. All other memory is free game.
+        */
+
        start_kernel();
 }
index fbad51fce672b25f4f30cf24ca81da8bc628a574..5d8c5730686b1e12eab12798d0ed07ae0f4aafa0 100644 (file)
@@ -9,6 +9,7 @@
 
 .text
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page.h>
@@ -151,7 +152,9 @@ WEAK(xen_entry)
        /* Unknown implementation; there's really
           nothing we can do at this point. */
        ud2a
-.data
+
+       __INITDATA
+
 subarch_entries:
        .long default_entry             /* normal x86/PC */
        .long lguest_entry              /* lguest hypervisor */
@@ -199,7 +202,6 @@ default_entry:
        addl $0x67, %eax                        /* 0x67 == _PAGE_TABLE */
        movl %eax, 4092(%edx)
 
-       xorl %ebx,%ebx                          /* This is the boot CPU (BSP) */
        jmp 3f
 /*
  * Non-boot CPU entry point; entered from trampoline.S
@@ -222,6 +224,8 @@ ENTRY(startup_32_smp)
        movl %eax,%es
        movl %eax,%fs
        movl %eax,%gs
+#endif /* CONFIG_SMP */
+3:
 
 /*
  *     New page tables may be in 4Mbyte page mode and may
@@ -268,12 +272,6 @@ ENTRY(startup_32_smp)
        wrmsr
 
 6:
-       /* This is a secondary processor (AP) */
-       xorl %ebx,%ebx
-       incl %ebx
-
-#endif /* CONFIG_SMP */
-3:
 
 /*
  * Enable paging
@@ -297,7 +295,7 @@ ENTRY(startup_32_smp)
        popfl
 
 #ifdef CONFIG_SMP
-       andl %ebx,%ebx
+       cmpb $0, ready
        jz  1f                          /* Initial CPU cleans BSS */
        jmp checkCPUtype
 1:
@@ -502,6 +500,7 @@ early_fault:
        call printk
 #endif
 #endif
+       call dump_stack
 hlt_loop:
        hlt
        jmp hlt_loop
index b6167fe3330e22aec72abc9f534de07d19fcb614..1d5a7a361200b69e28e61294ce7e45b75e6ba659 100644 (file)
 #include <asm/msr.h>
 #include <asm/cache.h>
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/asm-offsets.h>
+#include <asm/paravirt.h>
+#else
+#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#endif
+
 /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
  * because we need identity-mapped pages.
  *
@@ -260,14 +267,43 @@ init_rsp:
 bad_address:
        jmp bad_address
 
+#ifdef CONFIG_EARLY_PRINTK
+.macro early_idt_tramp first, last
+       .ifgt \last-\first
+       early_idt_tramp \first, \last-1
+       .endif
+       movl $\last,%esi
+       jmp early_idt_handler
+.endm
+
+       .globl early_idt_handlers
+early_idt_handlers:
+       early_idt_tramp 0, 63
+       early_idt_tramp 64, 127
+       early_idt_tramp 128, 191
+       early_idt_tramp 192, 255
+#endif
+
 ENTRY(early_idt_handler)
+#ifdef CONFIG_EARLY_PRINTK
        cmpl $2,early_recursion_flag(%rip)
        jz  1f
        incl early_recursion_flag(%rip)
+       GET_CR2_INTO_RCX
+       movq %rcx,%r9
+       xorl %r8d,%r8d          # zero for error code
+       movl %esi,%ecx          # get vector number
+       # Test %ecx against mask of vectors that push error code.
+       cmpl $31,%ecx
+       ja 0f
+       movl $1,%eax
+       salq %cl,%rax
+       testl $0x27d00,%eax
+       je 0f
+       popq %r8                # get error code
+0:     movq 0(%rsp),%rcx       # get ip
+       movq 8(%rsp),%rdx       # get cs
        xorl %eax,%eax
-       movq 8(%rsp),%rsi       # get rip
-       movq (%rsp),%rdx
-       movq %cr2,%rcx
        leaq early_idt_msg(%rip),%rdi
        call early_printk
        cmpl $2,early_recursion_flag(%rip)
@@ -278,15 +314,19 @@ ENTRY(early_idt_handler)
        movq 8(%rsp),%rsi       # get rip again
        call __print_symbol
 #endif
+#endif /* EARLY_PRINTK */
 1:     hlt
        jmp 1b
+
+#ifdef CONFIG_EARLY_PRINTK
 early_recursion_flag:
        .long 0
 
 early_idt_msg:
-       .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
+       .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
 early_idt_ripmsg:
        .asciz "RIP %s\n"
+#endif /* CONFIG_EARLY_PRINTK */
 
 .balign PAGE_SIZE
 
index 2f99ee206b9527c3cad6250caa95e2a3ceac9268..429d084e014d4b11a829bbf01f53fdc860e276e2 100644 (file)
@@ -6,7 +6,6 @@
 #include <linux/init.h>
 #include <linux/sysdev.h>
 #include <linux/pm.h>
-#include <linux/delay.h>
 
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
@@ -16,7 +15,8 @@
 #define HPET_MASK      CLOCKSOURCE_MASK(32)
 #define HPET_SHIFT     22
 
-/* FSEC = 10^-15 NSEC = 10^-9 */
+/* FSEC = 10^-15
+   NSEC = 10^-9 */
 #define FSEC_PER_NSEC  1000000
 
 /*
@@ -107,6 +107,7 @@ int is_hpet_enabled(void)
 {
        return is_hpet_capable() && hpet_legacy_int_enabled;
 }
+EXPORT_SYMBOL_GPL(is_hpet_enabled);
 
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
@@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id)
 #ifdef CONFIG_HPET_EMULATE_RTC
        hpet_reserve_timer(&hd, 1);
 #endif
-
        hd.hd_irq[0] = HPET_LEGACY_8254;
        hd.hd_irq[1] = HPET_LEGACY_RTC;
 
-       for (i = 2; i < nrtimers; timer++, i++)
-               hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
-                       Tn_INT_ROUTE_CNF_SHIFT;
-
+       for (i = 2; i < nrtimers; timer++, i++)
+              hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+                      Tn_INT_ROUTE_CNF_SHIFT;
        hpet_alloc(&hd);
-
 }
 #else
 static void hpet_reserve_platform_timers(unsigned long id) { }
@@ -478,6 +476,7 @@ void hpet_disable(void)
  */
 #include <linux/mc146818rtc.h>
 #include <linux/rtc.h>
+#include <asm/rtc.h>
 
 #define DEFAULT_RTC_INT_FREQ   64
 #define DEFAULT_RTC_SHIFT      6
@@ -492,6 +491,38 @@ static unsigned long hpet_default_delta;
 static unsigned long hpet_pie_delta;
 static unsigned long hpet_pie_limit;
 
+static rtc_irq_handler irq_handler;
+
+/*
+ * Registers an IRQ handler.
+ */
+int hpet_register_irq_handler(rtc_irq_handler handler)
+{
+       if (!is_hpet_enabled())
+               return -ENODEV;
+       if (irq_handler)
+               return -EBUSY;
+
+       irq_handler = handler;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
+
+/*
+ * Deregisters the IRQ handler registered with hpet_register_irq_handler()
+ * and does cleanup.
+ */
+void hpet_unregister_irq_handler(rtc_irq_handler handler)
+{
+       if (!is_hpet_enabled())
+               return;
+
+       irq_handler = NULL;
+       hpet_rtc_flags = 0;
+}
+EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
+
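Editor's note: these two entry points let exactly one RTC driver hook the emulated HPET interrupt. A usage sketch from the driver side; the handler name rtc_interrupt is taken from the code being replaced further down, and the error handling shown is an assumption:

        int ret = hpet_register_irq_handler(rtc_interrupt);
        if (ret)
                /* -ENODEV or -EBUSY: fall back to the real RTC interrupt line */
                return ret;

        /* ... on teardown ... */
        hpet_unregister_irq_handler(rtc_interrupt);
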
 /*
  * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
  * is not supported by all HPET implementations for timer 1.
@@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void)
 
        return 1;
 }
+EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
 
 /*
  * The functions below are called from rtc driver.
@@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
        hpet_rtc_flags &= ~bit_mask;
        return 1;
 }
+EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
 
 int hpet_set_rtc_irq_bit(unsigned long bit_mask)
 {
@@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
 
        return 1;
 }
+EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
 
 int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
                        unsigned char sec)
@@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
 
        return 1;
 }
+EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
 
 int hpet_set_periodic_freq(unsigned long freq)
 {
@@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq)
        }
        return 1;
 }
+EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
 
 int hpet_rtc_dropped_irq(void)
 {
        return is_hpet_enabled();
 }
+EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
 
 static void hpet_rtc_timer_reinit(void)
 {
@@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
        unsigned long rtc_int_flag = 0;
 
        hpet_rtc_timer_reinit();
+       memset(&curr_time, 0, sizeof(struct rtc_time));
 
        if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
-               rtc_get_rtc_time(&curr_time);
+               get_rtc_time(&curr_time);
 
        if (hpet_rtc_flags & RTC_UIE &&
            curr_time.tm_sec != hpet_prev_update_sec) {
@@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 
        if (rtc_int_flag) {
                rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
-               rtc_interrupt(rtc_int_flag, dev_id);
+               if (irq_handler)
+                       irq_handler(rtc_int_flag, dev_id);
        }
        return IRQ_HANDLED;
 }
+EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
 #endif
index 02112fcc0de7ea3ff7a752e84d2b071d7a6ff74c..061627806a2d98320032f27e151ab08b3d8559fe 100644 (file)
@@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(strstr);
 
-#ifdef CONFIG_SMP
-extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
-extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
-EXPORT_SYMBOL(__write_lock_failed);
-EXPORT_SYMBOL(__read_lock_failed);
-#endif
-
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
new file mode 100644 (file)
index 0000000..26719bd
--- /dev/null
@@ -0,0 +1,479 @@
+/*
+ *  Copyright (C) 1994 Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/regset.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/math_emu.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_X86_64
+
+#include <asm/sigcontext32.h>
+#include <asm/user32.h>
+
+#else
+
+#define        save_i387_ia32          save_i387
+#define        restore_i387_ia32       restore_i387
+
+#define _fpstate_ia32          _fpstate
+#define user_i387_ia32_struct  user_i387_struct
+#define user32_fxsr_struct     user_fxsr_struct
+
+#endif
+
+#ifdef CONFIG_MATH_EMULATION
+#define HAVE_HWFP (boot_cpu_data.hard_math)
+#else
+#define HAVE_HWFP 1
+#endif
+
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+
+void mxcsr_feature_mask_init(void)
+{
+       unsigned long mask = 0;
+       clts();
+       if (cpu_has_fxsr) {
+               memset(&current->thread.i387.fxsave, 0,
+                      sizeof(struct i387_fxsave_struct));
+               asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
+               mask = current->thread.i387.fxsave.mxcsr_mask;
+               if (mask == 0)
+                       mask = 0x0000ffbf;
+       }
+       mxcsr_feature_mask &= mask;
+       stts();
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Called at bootup to set up the initial FPU state that is later cloned
+ * into all processes.
+ */
+void __cpuinit fpu_init(void)
+{
+       unsigned long oldcr0 = read_cr0();
+       extern void __bad_fxsave_alignment(void);
+
+       if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
+               __bad_fxsave_alignment();
+       set_in_cr4(X86_CR4_OSFXSR);
+       set_in_cr4(X86_CR4_OSXMMEXCPT);
+
+       write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
+
+       mxcsr_feature_mask_init();
+       /* clean state in init */
+       current_thread_info()->status = 0;
+       clear_used_math();
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it, set the mxcsr to its default
+ * value at reset if we support XMM instructions, and then
+ * remember that the current task has used the FPU.
+ */
+void init_fpu(struct task_struct *tsk)
+{
+       if (tsk_used_math(tsk)) {
+               if (tsk == current)
+                       unlazy_fpu(tsk);
+               return;
+       }
+
+       if (cpu_has_fxsr) {
+               memset(&tsk->thread.i387.fxsave, 0,
+                      sizeof(struct i387_fxsave_struct));
+               tsk->thread.i387.fxsave.cwd = 0x37f;
+               if (cpu_has_xmm)
+                       tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
+       } else {
+               memset(&tsk->thread.i387.fsave, 0,
+                      sizeof(struct i387_fsave_struct));
+               tsk->thread.i387.fsave.cwd = 0xffff037fu;
+               tsk->thread.i387.fsave.swd = 0xffff0000u;
+               tsk->thread.i387.fsave.twd = 0xffffffffu;
+               tsk->thread.i387.fsave.fos = 0xffff0000u;
+       }
+       /*
+        * Only the device not available exception or ptrace can call init_fpu.
+        */
+       set_stopped_child_used_math(tsk);
+}
+
+int fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+       return tsk_used_math(target) ? regset->n : 0;
+}
+
+int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+       return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+               unsigned int pos, unsigned int count,
+               void *kbuf, void __user *ubuf)
+{
+       if (!cpu_has_fxsr)
+               return -ENODEV;
+
+       unlazy_fpu(target);
+
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                  &target->thread.i387.fxsave, 0, -1);
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+               unsigned int pos, unsigned int count,
+               const void *kbuf, const void __user *ubuf)
+{
+       int ret;
+
+       if (!cpu_has_fxsr)
+               return -ENODEV;
+
+       unlazy_fpu(target);
+       set_stopped_child_used_math(target);
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &target->thread.i387.fxsave, 0, -1);
+
+       /*
+        * mxcsr reserved bits must be masked to zero for security reasons.
+        */
+       target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+
+       return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+       tmp = ~twd;
+       tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+       /* and move the valid bits to the lower byte. */
+       tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+       tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+       tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+       return tmp;
+}
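Editor's note: a worked example of the tag-word compression above, using a hypothetical value:

        /* twd = 0xfffc  (st0 is "valid" 00, st1..st7 are "empty" 11)
         * ~twd (low 16 bits)         = 0x0003
         * (tmp | tmp >> 1) & 0x5555  = 0x0001   -> one bit per non-empty pair
         * the remaining shifts pack the eight pair bits into the low byte,
         * so the FXSR tag byte is 0x01: only st(0) is marked non-empty. */
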
+
+#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16);
+#define FP_EXP_TAG_VALID       0
+#define FP_EXP_TAG_ZERO                1
+#define FP_EXP_TAG_SPECIAL     2
+#define FP_EXP_TAG_EMPTY       3
+
+static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
+{
+       struct _fpxreg *st;
+       u32 tos = (fxsave->swd >> 11) & 7;
+       u32 twd = (unsigned long) fxsave->twd;
+       u32 tag;
+       u32 ret = 0xffff0000u;
+       int i;
+
+       for (i = 0; i < 8; i++, twd >>= 1) {
+               if (twd & 0x1) {
+                       st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+                       switch (st->exponent & 0x7fff) {
+                       case 0x7fff:
+                               tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       case 0x0000:
+                               if (!st->significand[0] &&
+                                   !st->significand[1] &&
+                                   !st->significand[2] &&
+                                   !st->significand[3])
+                                       tag = FP_EXP_TAG_ZERO;
+                               else
+                                       tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       default:
+                               if (st->significand[3] & 0x8000)
+                                       tag = FP_EXP_TAG_VALID;
+                               else
+                                       tag = FP_EXP_TAG_SPECIAL;
+                               break;
+                       }
+               } else {
+                       tag = FP_EXP_TAG_EMPTY;
+               }
+               ret |= tag << (2 * i);
+       }
+       return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+static void convert_from_fxsr(struct user_i387_ia32_struct *env,
+                             struct task_struct *tsk)
+{
+       struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
+       struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+       struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+       int i;
+
+       env->cwd = fxsave->cwd | 0xffff0000u;
+       env->swd = fxsave->swd | 0xffff0000u;
+       env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+       env->fip = fxsave->rip;
+       env->foo = fxsave->rdp;
+       if (tsk == current) {
+               /*
+                * should actually be ds/cs at fpu exception time, but
+                * that information is not available in 64-bit mode.
+                */
+               asm("mov %%ds,%0" : "=r" (env->fos));
+               asm("mov %%cs,%0" : "=r" (env->fcs));
+       } else {
+               struct pt_regs *regs = task_pt_regs(tsk);
+               env->fos = 0xffff0000 | tsk->thread.ds;
+               env->fcs = regs->cs;
+       }
+#else
+       env->fip = fxsave->fip;
+       env->fcs = fxsave->fcs;
+       env->foo = fxsave->foo;
+       env->fos = fxsave->fos;
+#endif
+
+       for (i = 0; i < 8; ++i)
+               memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+static void convert_to_fxsr(struct task_struct *tsk,
+                           const struct user_i387_ia32_struct *env)
+
+{
+       struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
+       struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+       struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+       int i;
+
+       fxsave->cwd = env->cwd;
+       fxsave->swd = env->swd;
+       fxsave->twd = twd_i387_to_fxsr(env->twd);
+       fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+       fxsave->rip = env->fip;
+       fxsave->rdp = env->foo;
+       /* cs and ds ignored */
+#else
+       fxsave->fip = env->fip;
+       fxsave->fcs = (env->fcs & 0xffff);
+       fxsave->foo = env->foo;
+       fxsave->fos = env->fos;
+#endif
+
+       for (i = 0; i < 8; ++i)
+               memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+              unsigned int pos, unsigned int count,
+              void *kbuf, void __user *ubuf)
+{
+       struct user_i387_ia32_struct env;
+
+       if (!HAVE_HWFP)
+               return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+       unlazy_fpu(target);
+
+       if (!cpu_has_fxsr)
+               return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                          &target->thread.i387.fsave, 0, -1);
+
+       if (kbuf && pos == 0 && count == sizeof(env)) {
+               convert_from_fxsr(kbuf, target);
+               return 0;
+       }
+
+       convert_from_fxsr(&env, target);
+       return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+              unsigned int pos, unsigned int count,
+              const void *kbuf, const void __user *ubuf)
+{
+       struct user_i387_ia32_struct env;
+       int ret;
+
+       if (!HAVE_HWFP)
+               return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+       unlazy_fpu(target);
+       set_stopped_child_used_math(target);
+
+       if (!cpu_has_fxsr)
+               return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                         &target->thread.i387.fsave, 0, -1);
+
+       if (pos > 0 || count < sizeof(env))
+               convert_from_fxsr(&env, target);
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+       if (!ret)
+               convert_to_fxsr(target, &env);
+
+       return ret;
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
+{
+       struct task_struct *tsk = current;
+
+       unlazy_fpu(tsk);
+       tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
+       if (__copy_to_user(buf, &tsk->thread.i387.fsave,
+                          sizeof(struct i387_fsave_struct)))
+               return -1;
+       return 1;
+}
+
+static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
+{
+       struct task_struct *tsk = current;
+       struct user_i387_ia32_struct env;
+       int err = 0;
+
+       unlazy_fpu(tsk);
+
+       convert_from_fxsr(&env, tsk);
+       if (__copy_to_user(buf, &env, sizeof(env)))
+               return -1;
+
+       err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
+       err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
+       if (err)
+               return -1;
+
+       if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+                          sizeof(struct i387_fxsave_struct)))
+               return -1;
+       return 1;
+}
+
+int save_i387_ia32(struct _fpstate_ia32 __user *buf)
+{
+       if (!used_math())
+               return 0;
+
+       /* This will cause a "finit" to be triggered by the next
+        * attempted FPU operation by the 'current' process.
+        */
+       clear_used_math();
+
+       if (HAVE_HWFP) {
+               if (cpu_has_fxsr) {
+                       return save_i387_fxsave(buf);
+               } else {
+                       return save_i387_fsave(buf);
+               }
+       } else {
+               return fpregs_soft_get(current, NULL,
+                                      0, sizeof(struct user_i387_ia32_struct),
+                                      NULL, buf) ? -1 : 1;
+       }
+}
+
+static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
+{
+       struct task_struct *tsk = current;
+       clear_fpu(tsk);
+       return __copy_from_user(&tsk->thread.i387.fsave, buf,
+                               sizeof(struct i387_fsave_struct));
+}
+
+static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
+{
+       int err;
+       struct task_struct *tsk = current;
+       struct user_i387_ia32_struct env;
+       clear_fpu(tsk);
+       err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
+                              sizeof(struct i387_fxsave_struct));
+       /* mxcsr reserved bits must be masked to zero for security reasons */
+       tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+       if (err || __copy_from_user(&env, buf, sizeof(env)))
+               return 1;
+       convert_to_fxsr(tsk, &env);
+       return 0;
+}
+
+int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
+{
+       int err;
+
+       if (HAVE_HWFP) {
+               if (cpu_has_fxsr) {
+                       err = restore_i387_fxsave(buf);
+               } else {
+                       err = restore_i387_fsave(buf);
+               }
+       } else {
+               err = fpregs_soft_set(current, NULL,
+                                     0, sizeof(struct user_i387_ia32_struct),
+                                     NULL, buf) != 0;
+       }
+       set_used_math();
+       return err;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
+{
+       int fpvalid;
+       struct task_struct *tsk = current;
+
+       fpvalid = !!used_math();
+       if (fpvalid)
+               fpvalid = !fpregs_get(tsk, NULL,
+                                     0, sizeof(struct user_i387_ia32_struct),
+                                     fpu, NULL);
+
+       return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
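
A standalone illustration (not part of the patch itself): twd_i387_to_fxsr() and twd_fxsr_to_i387() above convert between the legacy 2-bits-per-register FPU tag word and the 1-bit-per-register FXSR form. The sketch below repeats the same bit-twiddling outside the kernel so the packing can be checked by hand; the demo function name and the sample tag word are invented for illustration only.

    #include <stdio.h>

    /* Collapse pairs of tag bits: 11 (empty) -> 0, anything else -> 1. */
    static unsigned short twd_i387_to_fxsr_demo(unsigned short twd)
    {
            unsigned int tmp = ~twd;
            tmp = (tmp | (tmp >> 1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
            tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
            tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
            tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
            return tmp;
    }

    int main(void)
    {
            /* ST0 tagged "special" (10), ST1-ST7 "empty" (11) */
            printf("%#x\n", twd_i387_to_fxsr_demo(0xfffe)); /* prints 0x1 */
            return 0;
    }

With ST0 marked non-empty and the remaining registers empty, only bit 0 of the FXSR tag byte ends up set, which is exactly what the kernel routine produces.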
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
deleted file mode 100644 (file)
index 7d2e12f..0000000
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *  Copyright (C) 1994 Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/math_emu.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-
-#ifdef CONFIG_MATH_EMULATION
-#define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-#define HAVE_HWFP 1
-#endif
-
-static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
-
-void mxcsr_feature_mask_init(void)
-{
-       unsigned long mask = 0;
-       clts();
-       if (cpu_has_fxsr) {
-               memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-               asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); 
-               mask = current->thread.i387.fxsave.mxcsr_mask;
-               if (mask == 0) mask = 0x0000ffbf;
-       } 
-       mxcsr_feature_mask &= mask;
-       stts();
-}
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remember the current task has used the FPU.
- */
-void init_fpu(struct task_struct *tsk)
-{
-       if (cpu_has_fxsr) {
-               memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-               tsk->thread.i387.fxsave.cwd = 0x37f;
-               if (cpu_has_xmm)
-                       tsk->thread.i387.fxsave.mxcsr = 0x1f80;
-       } else {
-               memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
-               tsk->thread.i387.fsave.cwd = 0xffff037fu;
-               tsk->thread.i387.fsave.swd = 0xffff0000u;
-               tsk->thread.i387.fsave.twd = 0xffffffffu;
-               tsk->thread.i387.fsave.fos = 0xffff0000u;
-       }
-       /* only the device not available exception or ptrace can call init_fpu */
-       set_stopped_child_used_math(tsk);
-}
-
-/*
- * FPU lazy state save handling.
- */
-
-void kernel_fpu_begin(void)
-{
-       struct thread_info *thread = current_thread_info();
-
-       preempt_disable();
-       if (thread->status & TS_USEDFPU) {
-               __save_init_fpu(thread->task);
-               return;
-       }
-       clts();
-}
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
-{
-       unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-       /* Transform each pair of bits into 01 (valid) or 00 (empty) */
-        tmp = ~twd;
-        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-        /* and move the valid bits to the lower byte. */
-        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-        return tmp;
-}
-
-static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
-{
-       struct _fpxreg *st = NULL;
-       unsigned long tos = (fxsave->swd >> 11) & 7;
-       unsigned long twd = (unsigned long) fxsave->twd;
-       unsigned long tag;
-       unsigned long ret = 0xffff0000u;
-       int i;
-
-#define FPREG_ADDR(f, n)       ((void *)&(f)->st_space + (n) * 16);
-
-       for ( i = 0 ; i < 8 ; i++ ) {
-               if ( twd & 0x1 ) {
-                       st = FPREG_ADDR( fxsave, (i - tos) & 7 );
-
-                       switch ( st->exponent & 0x7fff ) {
-                       case 0x7fff:
-                               tag = 2;                /* Special */
-                               break;
-                       case 0x0000:
-                               if ( !st->significand[0] &&
-                                    !st->significand[1] &&
-                                    !st->significand[2] &&
-                                    !st->significand[3] ) {
-                                       tag = 1;        /* Zero */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       default:
-                               if ( st->significand[3] & 0x8000 ) {
-                                       tag = 0;        /* Valid */
-                               } else {
-                                       tag = 2;        /* Special */
-                               }
-                               break;
-                       }
-               } else {
-                       tag = 3;                        /* Empty */
-               }
-               ret |= (tag << (2 * i));
-               twd = twd >> 1;
-       }
-       return ret;
-}
-
-/*
- * FPU state interaction.
- */
-
-unsigned short get_fpu_cwd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.cwd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.cwd;
-       }
-}
-
-unsigned short get_fpu_swd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.swd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.swd;
-       }
-}
-
-#if 0
-unsigned short get_fpu_twd( struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               return tsk->thread.i387.fxsave.twd;
-       } else {
-               return (unsigned short)tsk->thread.i387.fsave.twd;
-       }
-}
-#endif  /*  0  */
-
-unsigned short get_fpu_mxcsr( struct task_struct *tsk )
-{
-       if ( cpu_has_xmm ) {
-               return tsk->thread.i387.fxsave.mxcsr;
-       } else {
-               return 0x1f80;
-       }
-}
-
-#if 0
-
-void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.cwd = cwd;
-       } else {
-               tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
-       }
-}
-
-void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.swd = swd;
-       } else {
-               tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
-       }
-}
-
-void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
-{
-       if ( cpu_has_fxsr ) {
-               tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
-       } else {
-               tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
-       }
-}
-
-#endif  /*  0  */
-
-/*
- * FXSR floating point environment conversions.
- */
-
-static int convert_fxsr_to_user( struct _fpstate __user *buf,
-                                       struct i387_fxsave_struct *fxsave )
-{
-       unsigned long env[7];
-       struct _fpreg __user *to;
-       struct _fpxreg *from;
-       int i;
-
-       env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
-       env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
-       env[2] = twd_fxsr_to_i387(fxsave);
-       env[3] = fxsave->fip;
-       env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
-       env[5] = fxsave->foo;
-       env[6] = fxsave->fos;
-
-       if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
-               return 1;
-
-       to = &buf->_st[0];
-       from = (struct _fpxreg *) &fxsave->st_space[0];
-       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
-               unsigned long __user *t = (unsigned long __user *)to;
-               unsigned long *f = (unsigned long *)from;
-
-               if (__put_user(*f, t) ||
-                               __put_user(*(f + 1), t + 1) ||
-                               __put_user(from->exponent, &to->exponent))
-                       return 1;
-       }
-       return 0;
-}
-
-static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
-                                         struct _fpstate __user *buf )
-{
-       unsigned long env[7];
-       struct _fpxreg *to;
-       struct _fpreg __user *from;
-       int i;
-
-       if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
-               return 1;
-
-       fxsave->cwd = (unsigned short)(env[0] & 0xffff);
-       fxsave->swd = (unsigned short)(env[1] & 0xffff);
-       fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
-       fxsave->fip = env[3];
-       fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
-       fxsave->fcs = (env[4] & 0xffff);
-       fxsave->foo = env[5];
-       fxsave->fos = env[6];
-
-       to = (struct _fpxreg *) &fxsave->st_space[0];
-       from = &buf->_st[0];
-       for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
-               unsigned long *t = (unsigned long *)to;
-               unsigned long __user *f = (unsigned long __user *)from;
-
-               if (__get_user(*t, f) ||
-                               __get_user(*(t + 1), f + 1) ||
-                               __get_user(to->exponent, &from->exponent))
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-
-       unlazy_fpu( tsk );
-       tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
-       if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
-                            sizeof(struct i387_fsave_struct) ) )
-               return -1;
-       return 1;
-}
-
-static int save_i387_fxsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-       int err = 0;
-
-       unlazy_fpu( tsk );
-
-       if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
-               return -1;
-
-       err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
-       err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
-       if ( err )
-               return -1;
-
-       if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
-                            sizeof(struct i387_fxsave_struct) ) )
-               return -1;
-       return 1;
-}
-
-int save_i387( struct _fpstate __user *buf )
-{
-       if ( !used_math() )
-               return 0;
-
-       /* This will cause a "finit" to be triggered by the next
-        * attempted FPU operation by the 'current' process.
-        */
-       clear_used_math();
-
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return save_i387_fxsave( buf );
-               } else {
-                       return save_i387_fsave( buf );
-               }
-       } else {
-               return save_i387_soft( &current->thread.i387.soft, buf );
-       }
-}
-
-static inline int restore_i387_fsave( struct _fpstate __user *buf )
-{
-       struct task_struct *tsk = current;
-       clear_fpu( tsk );
-       return __copy_from_user( &tsk->thread.i387.fsave, buf,
-                                sizeof(struct i387_fsave_struct) );
-}
-
-static int restore_i387_fxsave( struct _fpstate __user *buf )
-{
-       int err;
-       struct task_struct *tsk = current;
-       clear_fpu( tsk );
-       err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
-                               sizeof(struct i387_fxsave_struct) );
-       /* mxcsr reserved bits must be masked to zero for security reasons */
-       tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-       return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
-}
-
-int restore_i387( struct _fpstate __user *buf )
-{
-       int err;
-
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       err = restore_i387_fxsave( buf );
-               } else {
-                       err = restore_i387_fsave( buf );
-               }
-       } else {
-               err = restore_i387_soft( &current->thread.i387.soft, buf );
-       }
-       set_used_math();
-       return err;
-}
-
-/*
- * ptrace request handlers.
- */
-
-static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
-                                   struct task_struct *tsk )
-{
-       return __copy_to_user( buf, &tsk->thread.i387.fsave,
-                              sizeof(struct user_i387_struct) );
-}
-
-static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
-                                    struct task_struct *tsk )
-{
-       return convert_fxsr_to_user( (struct _fpstate __user *)buf,
-                                    &tsk->thread.i387.fxsave );
-}
-
-int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
-{
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return get_fpregs_fxsave( buf, tsk );
-               } else {
-                       return get_fpregs_fsave( buf, tsk );
-               }
-       } else {
-               return save_i387_soft( &tsk->thread.i387.soft,
-                                      (struct _fpstate __user *)buf );
-       }
-}
-
-static inline int set_fpregs_fsave( struct task_struct *tsk,
-                                   struct user_i387_struct __user *buf )
-{
-       return __copy_from_user( &tsk->thread.i387.fsave, buf,
-                                sizeof(struct user_i387_struct) );
-}
-
-static inline int set_fpregs_fxsave( struct task_struct *tsk,
-                                    struct user_i387_struct __user *buf )
-{
-       return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
-                                      (struct _fpstate __user *)buf );
-}
-
-int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
-{
-       if ( HAVE_HWFP ) {
-               if ( cpu_has_fxsr ) {
-                       return set_fpregs_fxsave( tsk, buf );
-               } else {
-                       return set_fpregs_fsave( tsk, buf );
-               }
-       } else {
-               return restore_i387_soft( &tsk->thread.i387.soft,
-                                         (struct _fpstate __user *)buf );
-       }
-}
-
-int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
-{
-       if ( cpu_has_fxsr ) {
-               if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
-                                   sizeof(struct user_fxsr_struct) ))
-                       return -EFAULT;
-               return 0;
-       } else {
-               return -EIO;
-       }
-}
-
-int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
-{
-       int ret = 0;
-
-       if ( cpu_has_fxsr ) {
-               if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
-                                 sizeof(struct user_fxsr_struct) ))
-                       ret = -EFAULT;
-               /* mxcsr reserved bits must be masked to zero for security reasons */
-               tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-       } else {
-               ret = -EIO;
-       }
-       return ret;
-}
-
-/*
- * FPU state for core dumps.
- */
-
-static inline void copy_fpu_fsave( struct task_struct *tsk,
-                                  struct user_i387_struct *fpu )
-{
-       memcpy( fpu, &tsk->thread.i387.fsave,
-               sizeof(struct user_i387_struct) );
-}
-
-static inline void copy_fpu_fxsave( struct task_struct *tsk,
-                                  struct user_i387_struct *fpu )
-{
-       unsigned short *to;
-       unsigned short *from;
-       int i;
-
-       memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
-
-       to = (unsigned short *)&fpu->st_space[0];
-       from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
-       for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
-               memcpy( to, from, 5 * sizeof(unsigned short) );
-       }
-}
-
-int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
-{
-       int fpvalid;
-       struct task_struct *tsk = current;
-
-       fpvalid = !!used_math();
-       if ( fpvalid ) {
-               unlazy_fpu( tsk );
-               if ( cpu_has_fxsr ) {
-                       copy_fpu_fxsave( tsk, fpu );
-               } else {
-                       copy_fpu_fsave( tsk, fpu );
-               }
-       }
-
-       return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
-{
-       int fpvalid = !!tsk_used_math(tsk);
-
-       if (fpvalid) {
-               if (tsk == current)
-                       unlazy_fpu(tsk);
-               if (cpu_has_fxsr)
-                       copy_fpu_fxsave(tsk, fpu);
-               else
-                       copy_fpu_fsave(tsk, fpu);
-       }
-       return fpvalid;
-}
-
-int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
-{
-       int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
-
-       if (fpvalid) {
-               if (tsk == current)
-                      unlazy_fpu(tsk);
-               memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
-       }
-       return fpvalid;
-}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
deleted file mode 100644 (file)
index bfaff28..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- *  Copyright (C) 1994 Linus Torvalds
- *  Copyright (C) 2002 Andi Kleen, SuSE Labs
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
- *  x86-64 rework 2002 Andi Kleen. 
- *  Does direct fxsave in and out of user space now for signal handlers.
- *  All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
- *  the 64bit user space sees a FXSAVE frame directly. 
- */
-
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
-
-void mxcsr_feature_mask_init(void)
-{
-       unsigned int mask;
-       clts();
-       memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-       asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
-       mask = current->thread.i387.fxsave.mxcsr_mask;
-       if (mask == 0) mask = 0x0000ffbf;
-       mxcsr_feature_mask &= mask;
-       stts();
-}
-
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-void __cpuinit fpu_init(void)
-{
-       unsigned long oldcr0 = read_cr0();
-       extern void __bad_fxsave_alignment(void);
-               
-       if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
-               __bad_fxsave_alignment();
-       set_in_cr4(X86_CR4_OSFXSR);
-       set_in_cr4(X86_CR4_OSXMMEXCPT);
-
-       write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
-
-       mxcsr_feature_mask_init();
-       /* clean state in init */
-       current_thread_info()->status = 0;
-       clear_used_math();
-}
-
-void init_fpu(struct task_struct *child)
-{
-       if (tsk_used_math(child)) {
-               if (child == current)
-                       unlazy_fpu(child);
-               return;
-       }       
-       memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-       child->thread.i387.fxsave.cwd = 0x37f;
-       child->thread.i387.fxsave.mxcsr = 0x1f80;
-       /* only the device not available exception or ptrace can call init_fpu */
-       set_stopped_child_used_math(child);
-}
-
-/*
- * Signal frame handlers.
- */
-
-int save_i387(struct _fpstate __user *buf)
-{
-       struct task_struct *tsk = current;
-       int err = 0;
-
-       BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
-                       sizeof(tsk->thread.i387.fxsave));
-
-       if ((unsigned long)buf % 16) 
-               printk("save_i387: bad fpstate %p\n",buf); 
-
-       if (!used_math())
-               return 0;
-       clear_used_math(); /* trigger finit */
-       if (task_thread_info(tsk)->status & TS_USEDFPU) {
-               err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
-               if (err) return err;
-               task_thread_info(tsk)->status &= ~TS_USEDFPU;
-               stts();
-       } else {
-               if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
-                                  sizeof(struct i387_fxsave_struct)))
-                       return -1;
-       }
-       return 1;
-}
-
-/*
- * ptrace request handlers.
- */
-
-int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
-{
-       init_fpu(tsk);
-       return __copy_to_user(buf, &tsk->thread.i387.fxsave,
-                              sizeof(struct user_i387_struct)) ? -EFAULT : 0;
-}
-
-int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
-{
-       if (__copy_from_user(&tsk->thread.i387.fxsave, buf, 
-                            sizeof(struct user_i387_struct)))
-               return -EFAULT;
-               return 0;
-}
-
-/*
- * FPU state for core dumps.
- */
-
-int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
-{
-       struct task_struct *tsk = current;
-
-       if (!used_math())
-               return 0;
-
-       unlazy_fpu(tsk);
-       memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); 
-       return 1; 
-}
-
-int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
-{
-       int fpvalid = !!tsk_used_math(tsk);
-
-       if (fpvalid) {
-               if (tsk == current)
-                       unlazy_fpu(tsk);
-               memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));         
-}
-       return fpvalid;
-}
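
A user-space sketch (not part of the patch itself): the two files deleted above carried, among other things, the ptrace FP-register helpers, and the unified i387.c earlier in this diff now backs those requests through the user_regset interface (fpregs_get(), xfpregs_get() and friends). The program below shows the kind of consumer involved, assuming an x86 Linux host where <sys/user.h> provides struct user_fpregs_struct; it is illustrative only and makes no claims about this particular commit.

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/user.h>
    #include <sys/wait.h>

    int main(void)
    {
            struct user_fpregs_struct fpregs;
            pid_t child = fork();

            if (child == 0) {
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    raise(SIGSTOP);          /* stop so the parent can inspect us */
                    _exit(0);
            }

            waitpid(child, NULL, 0);         /* child is now traced and stopped */
            if (ptrace(PTRACE_GETFPREGS, child, NULL, &fpregs) == 0)
                    printf("cwd=%#x swd=%#x\n",
                           (unsigned int)fpregs.cwd, (unsigned int)fpregs.swd);

            kill(child, SIGKILL);
            waitpid(child, NULL, 0);
            return 0;
    }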
index a42c807453253f83fad63373567ed050a6ae1ac1..ef62b07b2b488531255adf0a36824d4172028249 100644 (file)
 #include <asm/delay.h>
 #include <asm/i8253.h>
 #include <asm/io.h>
+#include <asm/hpet.h>
 
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
+#ifdef CONFIG_X86_32
+static void pit_disable_clocksource(void);
+#else
+static inline void pit_disable_clocksource(void) { }
+#endif
+
 /*
  * HPET replaces the PIT when enabled, so we need to know which of
  * the two timers is used.
@@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
                           struct clock_event_device *evt)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
+       spin_lock(&i8253_lock);
 
        switch(mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                /* binary, mode 2, LSB/MSB, ch 0 */
-               outb_p(0x34, PIT_MODE);
-               outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
-               outb(LATCH >> 8 , PIT_CH0);     /* MSB */
+               outb_pit(0x34, PIT_MODE);
+               outb_pit(LATCH & 0xff , PIT_CH0);       /* LSB */
+               outb_pit(LATCH >> 8 , PIT_CH0);         /* MSB */
                break;
 
        case CLOCK_EVT_MODE_SHUTDOWN:
        case CLOCK_EVT_MODE_UNUSED:
                if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
                    evt->mode == CLOCK_EVT_MODE_ONESHOT) {
-                       outb_p(0x30, PIT_MODE);
-                       outb_p(0, PIT_CH0);
-                       outb_p(0, PIT_CH0);
+                       outb_pit(0x30, PIT_MODE);
+                       outb_pit(0, PIT_CH0);
+                       outb_pit(0, PIT_CH0);
                }
+               pit_disable_clocksource();
                break;
 
        case CLOCK_EVT_MODE_ONESHOT:
                /* One shot setup */
-               outb_p(0x38, PIT_MODE);
+               pit_disable_clocksource();
+               outb_pit(0x38, PIT_MODE);
                break;
 
        case CLOCK_EVT_MODE_RESUME:
                /* Nothing to do here */
                break;
        }
-       spin_unlock_irqrestore(&i8253_lock, flags);
+       spin_unlock(&i8253_lock);
 }
 
 /*
@@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-       outb_p(delta & 0xff , PIT_CH0); /* LSB */
-       outb(delta >> 8 , PIT_CH0);     /* MSB */
-       spin_unlock_irqrestore(&i8253_lock, flags);
+       spin_lock(&i8253_lock);
+       outb_pit(delta & 0xff , PIT_CH0);       /* LSB */
+       outb_pit(delta >> 8 , PIT_CH0);         /* MSB */
+       spin_unlock(&i8253_lock);
 
        return 0;
 }
@@ -148,15 +153,15 @@ static cycle_t pit_read(void)
         * count), it cannot be newer.
         */
        jifs = jiffies;
-       outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-       count = inb_p(PIT_CH0); /* read the latched count */
-       count |= inb_p(PIT_CH0) << 8;
+       outb_pit(0x00, PIT_MODE);       /* latch the count ASAP */
+       count = inb_pit(PIT_CH0);       /* read the latched count */
+       count |= inb_pit(PIT_CH0) << 8;
 
        /* VIA686a test code... reset the latch if count > max + 1 */
        if (count > LATCH) {
-               outb_p(0x34, PIT_MODE);
-               outb_p(LATCH & 0xff, PIT_CH0);
-               outb(LATCH >> 8, PIT_CH0);
+               outb_pit(0x34, PIT_MODE);
+               outb_pit(LATCH & 0xff, PIT_CH0);
+               outb_pit(LATCH >> 8, PIT_CH0);
                count = LATCH - 1;
        }
 
@@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = {
        .shift  = 20,
 };
 
+static void pit_disable_clocksource(void)
+{
+       /*
+        * Use mult to check whether it is registered or not
+        */
+       if (clocksource_pit.mult) {
+               clocksource_unregister(&clocksource_pit);
+               clocksource_pit.mult = 0;
+       }
+}
+
 static int __init init_pit_clocksource(void)
 {
-       if (num_possible_cpus() > 1) /* PIT does not scale! */
+        /*
+         * Several reasons not to register PIT as a clocksource:
+         *
+         * - On SMP PIT does not scale due to i8253_lock
+         * - when HPET is enabled
+         * - when local APIC timer is active (PIT is switched off)
+         */
+       if (num_possible_cpus() > 1 || is_hpet_enabled() ||
+           pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
                return 0;
 
        clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
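
A standalone calculation (not part of the patch itself): the periodic branch of init_pit_timer() above programs channel 0 with the LATCH reload value, LSB first and then MSB, now through the outb_pit() wrapper. The sketch assumes the usual PC PIT input clock of 1193182 Hz and HZ=1000 — both assumptions typical of i386 configs of this era rather than values stated on this page.

    #include <stdio.h>

    #define PIT_TICK_RATE   1193182UL               /* assumed PIT input clock */
    #define HZ              1000UL                  /* assumed tick rate */
    #define LATCH           ((PIT_TICK_RATE + HZ / 2) / HZ)

    int main(void)
    {
            /* 1193 == 0x04a9: 0xa9 is written first (LSB), then 0x04 (MSB) */
            printf("LATCH=%lu LSB=%#lx MSB=%#lx\n",
                   LATCH, LATCH & 0xff, LATCH >> 8);
            return 0;
    }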
index 5f3496d0198403023f652a2e4d0a8a21816887ea..2d25b77102fe19e964291713c94e1dd62a0b1466 100644 (file)
@@ -21,8 +21,6 @@
 #include <asm/arch_hooks.h>
 #include <asm/i8259.h>
 
-#include <io_ports.h>
-
 /*
  * This is the 'legacy' 8259A Programmable Interrupt Controller,
  * present in the majority of PC/AT boxes.
@@ -291,20 +289,20 @@ void init_8259A(int auto_eoi)
        outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
 
        /*
-        * outb_p - this has to work on a wide range of PC hardware.
+        * outb_pic - this has to work on a wide range of PC hardware.
         */
-       outb_p(0x11, PIC_MASTER_CMD);   /* ICW1: select 8259A-1 init */
-       outb_p(0x20 + 0, PIC_MASTER_IMR);       /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-       outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);   /* 8259A-1 (the master) has a slave on IR2 */
+       outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
+       outb_pic(0x20 + 0, PIC_MASTER_IMR);     /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+       outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
        if (auto_eoi)   /* master does Auto EOI */
-               outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+               outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
        else            /* master expects normal EOI */
-               outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+               outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
 
-       outb_p(0x11, PIC_SLAVE_CMD);    /* ICW1: select 8259A-2 init */
-       outb_p(0x20 + 8, PIC_SLAVE_IMR);        /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-       outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR);  /* 8259A-2 is a slave on master's IR2 */
-       outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+       outb_pic(0x11, PIC_SLAVE_CMD);  /* ICW1: select 8259A-2 init */
+       outb_pic(0x20 + 8, PIC_SLAVE_IMR);      /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+       outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);        /* 8259A-2 is a slave on master's IR2 */
+       outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
        if (auto_eoi)
                /*
                 * In AEOI mode we just have to mask the interrupt
@@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
        outb(0,0xF0);
        if (ignore_fpu_irq || !boot_cpu_data.hard_math)
                return IRQ_NONE;
-       math_error((void __user *)get_irq_regs()->eip);
+       math_error((void __user *)get_irq_regs()->ip);
        return IRQ_HANDLED;
 }
 
index ba6d57286f5648ff299d10e9a2dcb88d0c563ceb..fa57a15685082294e7b2d33c5d0bb450b2348d56 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/delay.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
+#include <asm/i8259.h>
 
 /*
  * Common place to define all x86 IRQ vectors
@@ -48,7 +49,7 @@
  */
 
 /*
- * The IO-APIC gives us many more interrupt sources. Most of these 
+ * The IO-APIC gives us many more interrupt sources. Most of these
  * are unused but an SMP system is supposed to have enough memory ...
  * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
  * across the spectrum, so we really want to be prepared to get all
@@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
        IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
 
 /* for the irq vectors */
-static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
                                          IRQLIST_16(0x2), IRQLIST_16(0x3),
        IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
        IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = {
 /*
  * This contains the irq mask for both 8259A irq controllers,
  */
-static unsigned int cached_irq_mask = 0xffff;
-
-#define __byte(x,y)    (((unsigned char *)&(y))[x])
-#define cached_21      (__byte(0,cached_irq_mask))
-#define cached_A1      (__byte(1,cached_irq_mask))
+unsigned int cached_irq_mask = 0xffff;
 
 /*
  * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
@@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq)
        spin_lock_irqsave(&i8259A_lock, flags);
        cached_irq_mask |= mask;
        if (irq & 8)
-               outb(cached_A1,0xA1);
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
        else
-               outb(cached_21,0x21);
+               outb(cached_master_mask, PIC_MASTER_IMR);
        spin_unlock_irqrestore(&i8259A_lock, flags);
 }
 
@@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq)
        spin_lock_irqsave(&i8259A_lock, flags);
        cached_irq_mask &= mask;
        if (irq & 8)
-               outb(cached_A1,0xA1);
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
        else
-               outb(cached_21,0x21);
+               outb(cached_master_mask, PIC_MASTER_IMR);
        spin_unlock_irqrestore(&i8259A_lock, flags);
 }
 
@@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq)
 
        spin_lock_irqsave(&i8259A_lock, flags);
        if (irq < 8)
-               ret = inb(0x20) & mask;
+               ret = inb(PIC_MASTER_CMD) & mask;
        else
-               ret = inb(0xA0) & (mask >> 8);
+               ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
        spin_unlock_irqrestore(&i8259A_lock, flags);
 
        return ret;
@@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq)
        int irqmask = 1<<irq;
 
        if (irq < 8) {
-               outb(0x0B,0x20);                /* ISR register */
-               value = inb(0x20) & irqmask;
-               outb(0x0A,0x20);                /* back to the IRR register */
+               outb(0x0B,PIC_MASTER_CMD);      /* ISR register */
+               value = inb(PIC_MASTER_CMD) & irqmask;
+               outb(0x0A,PIC_MASTER_CMD);      /* back to the IRR register */
                return value;
        }
-       outb(0x0B,0xA0);                /* ISR register */
-       value = inb(0xA0) & (irqmask >> 8);
-       outb(0x0A,0xA0);                /* back to the IRR register */
+       outb(0x0B,PIC_SLAVE_CMD);       /* ISR register */
+       value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+       outb(0x0A,PIC_SLAVE_CMD);       /* back to the IRR register */
        return value;
 }
 
@@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq)
 
 handle_real_irq:
        if (irq & 8) {
-               inb(0xA1);              /* DUMMY - (do we need this?) */
-               outb(cached_A1,0xA1);
-               outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
-               outb(0x62,0x20);        /* 'Specific EOI' to master-IRQ2 */
+               inb(PIC_SLAVE_IMR);     /* DUMMY - (do we need this?) */
+               outb(cached_slave_mask, PIC_SLAVE_IMR);
+               /* 'Specific EOI' to slave */
+               outb(0x60+(irq&7),PIC_SLAVE_CMD);
+                /* 'Specific EOI' to master-IRQ2 */
+               outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
        } else {
-               inb(0x21);              /* DUMMY - (do we need this?) */
-               outb(cached_21,0x21);
-               outb(0x60+irq,0x20);    /* 'Specific EOI' to master */
+               inb(PIC_MASTER_IMR);    /* DUMMY - (do we need this?) */
+               outb(cached_master_mask, PIC_MASTER_IMR);
+               /* 'Specific EOI' to master */
+               outb(0x60+irq,PIC_MASTER_CMD);
        }
        spin_unlock_irqrestore(&i8259A_lock, flags);
        return;
@@ -270,7 +270,8 @@ spurious_8259A_irq:
                 * lets ACK and report it. [once per IRQ]
                 */
                if (!(spurious_irq_mask & irqmask)) {
-                       printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+                       printk(KERN_DEBUG
+                              "spurious 8259A interrupt: IRQ%d.\n", irq);
                        spurious_irq_mask |= irqmask;
                }
                atomic_inc(&irq_err_count);
@@ -283,51 +284,6 @@ spurious_8259A_irq:
        }
 }
 
-void init_8259A(int auto_eoi)
-{
-       unsigned long flags;
-
-       i8259A_auto_eoi = auto_eoi;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-
-       outb(0xff, 0x21);       /* mask all of 8259A-1 */
-       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
-
-       /*
-        * outb_p - this has to work on a wide range of PC hardware.
-        */
-       outb_p(0x11, 0x20);     /* ICW1: select 8259A-1 init */
-       outb_p(IRQ0_VECTOR, 0x21);      /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-       outb_p(0x04, 0x21);     /* 8259A-1 (the master) has a slave on IR2 */
-       if (auto_eoi)
-               outb_p(0x03, 0x21);     /* master does Auto EOI */
-       else
-               outb_p(0x01, 0x21);     /* master expects normal EOI */
-
-       outb_p(0x11, 0xA0);     /* ICW1: select 8259A-2 init */
-       outb_p(IRQ8_VECTOR, 0xA1);      /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
-       outb_p(0x02, 0xA1);     /* 8259A-2 is a slave on master's IR2 */
-       outb_p(0x01, 0xA1);     /* (slave's support for AEOI in flat mode
-                                   is to be investigated) */
-
-       if (auto_eoi)
-               /*
-                * in AEOI mode we just have to mask the interrupt
-                * when acking.
-                */
-               i8259A_chip.mask_ack = disable_8259A_irq;
-       else
-               i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-       udelay(100);            /* wait for 8259A to initialize */
-
-       outb(cached_21, 0x21);  /* restore master IRQ mask */
-       outb(cached_A1, 0xA1);  /* restore slave IRQ mask */
-
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
 static char irq_trigger[2];
 /**
  * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -364,8 +320,8 @@ static int i8259A_shutdown(struct sys_device *dev)
         * the kernel initialization code can get it
         * out of.
         */
-       outb(0xff, 0x21);       /* mask all of 8259A-1 */
-       outb(0xff, 0xA1);       /* mask all of 8259A-1 */
+       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
+       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-1 */
        return 0;
 }
 
@@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void)
 
 device_initcall(i8259A_init_sysfs);
 
+void init_8259A(int auto_eoi)
+{
+       unsigned long flags;
+
+       i8259A_auto_eoi = auto_eoi;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
+       outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
+
+       /*
+        * outb_pic - this has to work on a wide range of PC hardware.
+        */
+       outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
+       /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
+       outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+       /* 8259A-1 (the master) has a slave on IR2 */
+       outb_pic(0x04, PIC_MASTER_IMR);
+       if (auto_eoi)   /* master does Auto EOI */
+               outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+       else            /* master expects normal EOI */
+               outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+       outb_pic(0x11, PIC_SLAVE_CMD);  /* ICW1: select 8259A-2 init */
+       /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
+       outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+       /* 8259A-2 is a slave on master's IR2 */
+       outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+       /* (slave's support for AEOI in flat mode is to be investigated) */
+       outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
+       if (auto_eoi)
+               /*
+                * In AEOI mode we just have to mask the interrupt
+                * when acking.
+                */
+               i8259A_chip.mask_ack = disable_8259A_irq;
+       else
+               i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+       udelay(100);            /* wait for 8259A to initialize */
+
+       outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+       outb(cached_slave_mask, PIC_SLAVE_IMR);   /* restore slave IRQ mask */
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+
+
+
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
@@ -448,7 +456,9 @@ void __init init_ISA_irqs (void)
        }
 }
 
-void __init init_IRQ(void)
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
 {
        int i;
 
index 468c9c43784261823bbd018550d113c4a49df777..5b3ce7934363af3a5a3db53c471b889bcd4b834e 100644 (file)
@@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
-EXPORT_SYMBOL(init_mm);
 
 /*
  * Initial thread structure.
index ab77f1905469592b724ce9c6bdd7c3ea70f0e408..4ca548632c8dfa61e5a8a95be00a8cb9d1f97fca 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/htirq.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/jiffies.h>     /* time_after() */
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -48,8 +49,6 @@
 #include <mach_apic.h>
 #include <mach_apicdef.h>
 
-#include "io_ports.h"
-
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 atomic_t irq_mis_count;
 
@@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 # include <asm/processor.h>    /* kernel_thread() */
 # include <linux/kernel_stat.h>        /* kstat */
 # include <linux/slab.h>               /* kmalloc() */
-# include <linux/timer.h>      /* time_after() */
+# include <linux/timer.h>
  
 #define IRQBALANCE_CHECK_ARCH -999
 #define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
@@ -727,7 +726,7 @@ late_initcall(balanced_irq_init);
 #endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
-void fastcall send_IPI_self(int vector)
+void send_IPI_self(int vector)
 {
        unsigned int cfg;
 
@@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void)
         * might have cached one ExtINT interrupt.  Finally, at
         * least one tick may be lost due to delays.
         */
-       if (jiffies - t1 > 4)
+       if (time_after(jiffies, t1 + 4))
                return 1;
 
        return 0;
@@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = {
        .eoi            = ack_apic,
 };
 
-static void setup_nmi (void)
+static void __init setup_nmi(void)
 {
        /*
         * Dirty trick to enable the NMI watchdog ...
@@ -2093,7 +2092,7 @@ static void setup_nmi (void)
         */ 
        apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
 
-       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+       enable_NMI_through_LVT0();
 
        apic_printk(APIC_VERBOSE, " done.\n");
 }
index 23a3ac06a23ea1dc68d6d79f6cf7eb3f9f215f96..1627c0d53e0b4f63b927184d40e56145d0012e38 100644 (file)
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/dmar.h>
+#include <linux/jiffies.h>
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
+#include <linux/bootmem.h>
 
 #include <asm/idle.h>
 #include <asm/io.h>
@@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
        v = apic_read(APIC_LVR);
        printk(KERN_INFO "... APIC VERSION: %08x\n", v);
        ver = GET_APIC_VERSION(v);
-       maxlvt = get_maxlvt();
+       maxlvt = lapic_get_maxlvt();
 
        v = apic_read(APIC_TASKPRI);
        printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void)
 
 #endif  /*  0  */
 
-static void __init enable_IO_APIC(void)
+void __init enable_IO_APIC(void)
 {
        union IO_APIC_reg_01 reg_01;
        int i8259_apic, i8259_pin;
@@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void)
         */
 
        /* jiffies wrap? */
-       if (jiffies - t1 > 4)
+       if (time_after(jiffies, t1 + 4))
                return 1;
        return 0;
 }
@@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq)
        if (likely(!cfg->move_in_progress))
                return;
 
-       vector = ~get_irq_regs()->orig_rax;
+       vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
        if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
                cpumask_t cleanup_mask;
@@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq)
        int do_unmask_irq = 0;
 
        irq_complete_move(irq);
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
        /* If we are moving the irq we need to mask it */
        if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
                do_unmask_irq = 1;
@@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = {
        .end = end_lapic_irq,
 };
 
-static void setup_nmi (void)
+static void __init setup_nmi(void)
 {
        /*
         * Dirty trick to enable the NMI watchdog ...
@@ -1578,7 +1580,7 @@ static void setup_nmi (void)
         */ 
        printk(KERN_INFO "activating NMI Watchdog ...");
 
-       enable_NMI_through_LVT0(NULL);
+       enable_NMI_through_LVT0();
 
        printk(" done.\n");
 }
@@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void)
  *
  * FIXME: really need to revamp this for modern platforms only.
  */
-static inline void check_timer(void)
+static inline void __init check_timer(void)
 {
        struct irq_cfg *cfg = irq_cfg + 0;
        int apic1, pin1, apic2, pin2;
@@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck);
 
 void __init setup_IO_APIC(void)
 {
-       enable_IO_APIC();
+
+       /*
+        * the call to enable_IO_APIC() has been moved to setup_local_APIC() for the BP
+        */
 
        if (acpi_ioapic)
                io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
@@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void)
 }
 #endif
 
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(void)
+{
+       unsigned long n;
+       struct resource *res;
+       char *mem;
+       int i;
+
+       if (nr_ioapics <= 0)
+               return NULL;
+
+       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+       n *= nr_ioapics;
+
+       mem = alloc_bootmem(n);
+       res = (void *)mem;
+
+       if (mem != NULL) {
+               memset(mem, 0, n);
+               mem += sizeof(struct resource) * nr_ioapics;
+
+               for (i = 0; i < nr_ioapics; i++) {
+                       res[i].name = mem;
+                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       sprintf(mem,  "IOAPIC %u", i);
+                       mem += IOAPIC_RESOURCE_NAME_SIZE;
+               }
+       }
+
+       ioapic_resources = res;
+
+       return res;
+}
+
+void __init ioapic_init_mappings(void)
+{
+       unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+       struct resource *ioapic_res;
+       int i;
+
+       ioapic_res = ioapic_setup_resources();
+       for (i = 0; i < nr_ioapics; i++) {
+               if (smp_found_config) {
+                       ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+               } else {
+                       ioapic_phys = (unsigned long)
+                               alloc_bootmem_pages(PAGE_SIZE);
+                       ioapic_phys = __pa(ioapic_phys);
+               }
+               set_fixmap_nocache(idx, ioapic_phys);
+               apic_printk(APIC_VERBOSE,
+                           "mapped IOAPIC to %016lx (%016lx)\n",
+                           __fix_to_virt(idx), ioapic_phys);
+               idx++;
+
+               if (ioapic_res != NULL) {
+                       ioapic_res->start = ioapic_phys;
+                       ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+                       ioapic_res++;
+               }
+       }
+}
+
+static int __init ioapic_insert_resources(void)
+{
+       int i;
+       struct resource *r = ioapic_resources;
+
+       if (!r) {
+               printk(KERN_ERR
+                      "IO APIC resources could not be allocated.\n");
+               return -1;
+       }
+
+       for (i = 0; i < nr_ioapics; i++) {
+               insert_resource(&iomem_resource, r);
+               r++;
+       }
+
+       return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occurred to handle
+ * IO APICs that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644 (file)
index 0000000..bd49321
--- /dev/null
@@ -0,0 +1,114 @@
+/*
+ * I/O delay strategies for inb_p/outb_p
+ *
+ * Allow for a DMI-based override of port 0x80, needed for certain HP laptops
+ * and possibly other systems. Also allow for the gradual elimination of
+ * outb_p/inb_p API uses.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <asm/io.h>
+
+int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
+EXPORT_SYMBOL_GPL(io_delay_type);
+
+static int __initdata io_delay_override;
+
+/*
+ * Paravirt wants native_io_delay to be a constant.
+ */
+void native_io_delay(void)
+{
+       switch (io_delay_type) {
+       default:
+       case CONFIG_IO_DELAY_TYPE_0X80:
+               asm volatile ("outb %al, $0x80");
+               break;
+       case CONFIG_IO_DELAY_TYPE_0XED:
+               asm volatile ("outb %al, $0xed");
+               break;
+       case CONFIG_IO_DELAY_TYPE_UDELAY:
+               /*
+                * 2 usecs is an upper-bound for the outb delay but
+                * note that udelay doesn't have the bus-level
+                * side-effects that outb does, nor does udelay() have
+                * precise timings during very early bootup (the delays
+                * are shorter until calibrated):
+                */
+               udelay(2);
+       case CONFIG_IO_DELAY_TYPE_NONE:
+               break;
+       }
+}
+EXPORT_SYMBOL(native_io_delay);
+
+static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
+{
+       if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
+               printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
+                       id->ident);
+               io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+       }
+
+       return 0;
+}
+
+/*
+ * Quirk table for systems that misbehave (lock up, etc.) if port
+ * 0x80 is used:
+ */
+static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
+       {
+               .callback       = dmi_io_delay_0xed_port,
+               .ident          = "Compaq Presario V6000",
+               .matches        = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME, "30B7")
+               }
+       },
+       {
+               .callback       = dmi_io_delay_0xed_port,
+               .ident          = "HP Pavilion dv9000z",
+               .matches        = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME, "30B9")
+               }
+       },
+       {
+               .callback       = dmi_io_delay_0xed_port,
+               .ident          = "HP Pavilion tx1000",
+               .matches        = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME, "30BF")
+               }
+       },
+       { }
+};
+
+void __init io_delay_init(void)
+{
+       if (!io_delay_override)
+               dmi_check_system(io_delay_0xed_port_dmi_table);
+}
+
+static int __init io_delay_param(char *s)
+{
+       if (!strcmp(s, "0x80"))
+               io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
+       else if (!strcmp(s, "0xed"))
+               io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+       else if (!strcmp(s, "udelay"))
+               io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
+       else if (!strcmp(s, "none"))
+               io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
+       else
+               return -EINVAL;
+
+       io_delay_override = 1;
+       return 0;
+}
+
+early_param("io_delay", io_delay_param);
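io_delay_param() above maps the "io_delay=" boot string onto one of four strategies. Below is a hedged userspace sketch of the same string-to-enum dispatch; the enum values are illustrative, not the CONFIG_IO_DELAY_TYPE_* Kconfig constants.

#include <stdio.h>
#include <string.h>

/* illustrative stand-ins for the CONFIG_IO_DELAY_TYPE_* constants */
enum io_delay { IO_DELAY_0X80, IO_DELAY_0XED, IO_DELAY_UDELAY, IO_DELAY_NONE };

static int parse_io_delay(const char *s, enum io_delay *type)
{
        if (!strcmp(s, "0x80"))
                *type = IO_DELAY_0X80;
        else if (!strcmp(s, "0xed"))
                *type = IO_DELAY_0XED;
        else if (!strcmp(s, "udelay"))
                *type = IO_DELAY_UDELAY;
        else if (!strcmp(s, "none"))
                *type = IO_DELAY_NONE;
        else
                return -1;              /* unknown value, like -EINVAL above */
        return 0;
}

int main(void)
{
        enum io_delay t;

        if (parse_io_delay("udelay", &t) == 0)
                printf("io_delay type %d\n", t);
        return 0;
}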
similarity index 70%
rename from arch/x86/kernel/ioport_32.c
rename to arch/x86/kernel/ioport.c
index 4ed48dc8df1e76fa925f15e11febe5ebc79481fa..50e5e4a31c8500e94b277378b2d3860d673fc225 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
+ * by Linus. 32/64 bits code unification by Miguel Botón.
  */
 
 #include <linux/sched.h>
 #include <linux/syscalls.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+static void set_bitmap(unsigned long *bitmap, unsigned int base,
+                      unsigned int extent, int new_value)
 {
-       unsigned long mask;
-       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
-       unsigned int low_index = base & (BITS_PER_LONG-1);
-       int length = low_index + extent;
-
-       if (low_index != 0) {
-               mask = (~0UL << low_index);
-               if (length < BITS_PER_LONG)
-                       mask &= ~(~0UL << length);
-               if (new_value)
-                       *bitmap_base++ |= mask;
-               else
-                       *bitmap_base++ &= ~mask;
-               length -= BITS_PER_LONG;
-       }
-
-       mask = (new_value ? ~0UL : 0UL);
-       while (length >= BITS_PER_LONG) {
-               *bitmap_base++ = mask;
-               length -= BITS_PER_LONG;
-       }
+       unsigned int i;
 
-       if (length > 0) {
-               mask = ~(~0UL << length);
+       for (i = base; i < base + extent; i++) {
                if (new_value)
-                       *bitmap_base++ |= mask;
+                       __set_bit(i, bitmap);
                else
-                       *bitmap_base++ &= ~mask;
+                       __clear_bit(i, bitmap);
        }
 }
 
-
 /*
  * this changes the io permissions bitmap in the current task.
  */
 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 {
-       unsigned long i, max_long, bytes, bytes_updated;
        struct thread_struct * t = &current->thread;
        struct tss_struct * tss;
-       unsigned long *bitmap;
+       unsigned int i, max_long, bytes, bytes_updated;
 
        if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
                return -EINVAL;
@@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
         * this is why we delay this operation until now:
         */
        if (!t->io_bitmap_ptr) {
-               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
                if (!bitmap)
                        return -ENOMEM;
 
@@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
                if (t->io_bitmap_ptr[i] != ~0UL)
                        max_long = i;
 
-       bytes = (max_long + 1) * sizeof(long);
+       bytes = (max_long + 1) * sizeof(unsigned long);
        bytes_updated = max(bytes, t->io_bitmap_max);
 
        t->io_bitmap_max = bytes;
 
+#ifdef CONFIG_X86_32
        /*
         * Sets the lazy trigger so that the next I/O operation will
         * reload the correct bitmap.
@@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
         */
        tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
        tss->io_bitmap_owner = NULL;
+#else
+       /* Update the TSS: */
+       memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+#endif
 
        put_cpu();
 
@@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
  * beyond the 0x3ff range: to get the full 65536 ports bitmapped
  * you'd need 8kB of bitmaps/process, which is a bit excessive.
  *
- * Here we just change the eflags value on the stack: we allow
+ * Here we just change the flags value on the stack: we allow
  * only the super-user to do it. This depends on the stack-layout
  * on system-call entry - see also fork() and the signal handling
  * code.
  */
-
-asmlinkage long sys_iopl(unsigned long unused)
+static int do_iopl(unsigned int level, struct pt_regs *regs)
 {
-       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
-       unsigned int level = regs->ebx;
-       unsigned int old = (regs->eflags >> 12) & 3;
-       struct thread_struct *t = &current->thread;
+       unsigned int old = (regs->flags >> 12) & 3;
 
        if (level > 3)
                return -EINVAL;
@@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused)
                if (!capable(CAP_SYS_RAWIO))
                        return -EPERM;
        }
+       regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
+
+       return 0;
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage long sys_iopl(unsigned long regsp)
+{
+       struct pt_regs *regs = (struct pt_regs *)&regsp;
+       unsigned int level = regs->bx;
+       struct thread_struct *t = &current->thread;
+       int rc;
+
+       rc = do_iopl(level, regs);
+       if (rc < 0)
+               goto out;
+
        t->iopl = level << 12;
-       regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
        set_iopl_mask(t->iopl);
-       return 0;
+out:
+       return rc;
+}
+#else
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+       return do_iopl(level, regs);
 }
+#endif
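The unified sys_ioperm()/sys_iopl() above are what the userspace ioperm(2) and iopl(2) calls land on. A small sketch of requesting port access through glibc's wrapper follows; it assumes x86 Linux and CAP_SYS_RAWIO (typically root), and port 0x378 is only an illustrative choice.

/* Userspace view of the syscalls implemented above; x86 Linux only,
 * needs CAP_SYS_RAWIO (typically root). 0x378 is an illustrative port. */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        if (ioperm(0x378, 3, 1) < 0) {          /* ask for ports 0x378..0x37a */
                perror("ioperm");
                return 1;
        }
        outb(0x00, 0x378);                      /* now allowed, no #GP fault */
        ioperm(0x378, 3, 0);                    /* drop the permission again */
        return 0;
}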
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
deleted file mode 100644 (file)
index 5f62fad..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
-
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
-{
-       int i;
-               if (new_value)
-               for (i = base; i < base + extent; i++) 
-                       __set_bit(i, bitmap); 
-               else
-               for (i = base; i < base + extent; i++) 
-                       clear_bit(i, bitmap); 
-}
-
-/*
- * this changes the io permissions bitmap in the current task.
- */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
-{
-       unsigned int i, max_long, bytes, bytes_updated;
-       struct thread_struct * t = &current->thread;
-       struct tss_struct * tss;
-       unsigned long *bitmap;
-
-       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
-               return -EINVAL;
-       if (turn_on && !capable(CAP_SYS_RAWIO))
-               return -EPERM;
-
-       /*
-        * If it's the first ioperm() call in this thread's lifetime, set the
-        * IO bitmap up. ioperm() is much less timing critical than clone(),
-        * this is why we delay this operation until now:
-        */
-       if (!t->io_bitmap_ptr) {
-               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-               if (!bitmap)
-                       return -ENOMEM;
-
-               memset(bitmap, 0xff, IO_BITMAP_BYTES);
-               t->io_bitmap_ptr = bitmap;
-               set_thread_flag(TIF_IO_BITMAP);
-       }
-
-       /*
-        * do it in the per-thread copy and in the TSS ...
-        *
-        * Disable preemption via get_cpu() - we must not switch away
-        * because the ->io_bitmap_max value must match the bitmap
-        * contents:
-        */
-       tss = &per_cpu(init_tss, get_cpu());
-
-       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-       /*
-        * Search for a (possibly new) maximum. This is simple and stupid,
-        * to keep it obviously correct:
-        */
-       max_long = 0;
-       for (i = 0; i < IO_BITMAP_LONGS; i++)
-               if (t->io_bitmap_ptr[i] != ~0UL)
-                       max_long = i;
-
-       bytes = (max_long + 1) * sizeof(long);
-       bytes_updated = max(bytes, t->io_bitmap_max);
-
-       t->io_bitmap_max = bytes;
-
-       /* Update the TSS: */
-       memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
-
-       put_cpu();
-
-       return 0;
-}
-
-/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
- *
- * Here we just change the eflags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
- */
-
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
-{
-       unsigned int old = (regs->eflags >> 12) & 3;
-
-       if (level > 3)
-               return -EINVAL;
-       /* Trying to gain more privileges? */
-       if (level > old) {
-               if (!capable(CAP_SYS_RAWIO))
-                       return -EPERM;
-       }
-       regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
-       return 0;
-}
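Both the deleted 64-bit path above and the new common do_iopl() rewrite bits 12-13 of the flags word. A minimal sketch of that masking arithmetic on a plain value; the X86_EFLAGS_IOPL constant is reproduced here only for illustration.

#include <stdio.h>

#define X86_EFLAGS_IOPL 0x00003000UL            /* bits 12..13 of the flags word */

/* Return flags with the I/O privilege level field replaced by level (0..3). */
static unsigned long set_iopl_field(unsigned long flags, unsigned int level)
{
        return (flags & ~X86_EFLAGS_IOPL) | ((unsigned long)level << 12);
}

int main(void)
{
        unsigned long flags = 0x202;            /* IF set, IOPL == 0 */

        flags = set_iopl_field(flags, 3);
        printf("iopl is now %lu\n", (flags >> 12) & 3);
        return 0;
}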
index d3fde94f734557809925d1e73cc2e021cebd6342..cef054b09d27345c5f520ecb2a30eb87c3095b1c 100644 (file)
@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+unsigned int do_IRQ(struct pt_regs *regs)
 {      
        struct pt_regs *old_regs;
        /* high bit used in ret_from_ code */
-       int irq = ~regs->orig_eax;
+       int irq = ~regs->orig_ax;
        struct irq_desc *desc = irq_desc + irq;
 #ifdef CONFIG_4KSTACKS
        union irq_ctx *curctx, *irqctx;
@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
        /* Debugging check for stack overflow: is there less than 1KB free? */
        {
-               long esp;
+               long sp;
 
                __asm__ __volatile__("andl %%esp,%0" :
-                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
-               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+                                       "=r" (sp) : "0" (THREAD_SIZE - 1));
+               if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
                        printk("do_IRQ: stack overflow: %ld\n",
-                               esp - sizeof(struct thread_info));
+                               sp - sizeof(struct thread_info));
                        dump_stack();
                }
        }
@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
         * current stack (which is the irq stack already after all)
         */
        if (curctx != irqctx) {
-               int arg1, arg2, ebx;
+               int arg1, arg2, bx;
 
                /* build the stack frame on the IRQ stack */
                isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
                        (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
                asm volatile(
-                       "       xchgl  %%ebx,%%esp      \n"
-                       "       call   *%%edi           \n"
-                       "       movl   %%ebx,%%esp      \n"
-                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
+                       "       xchgl  %%ebx,%%esp    \n"
+                       "       call   *%%edi         \n"
+                       "       movl   %%ebx,%%esp    \n"
+                       : "=a" (arg1), "=d" (arg2), "=b" (bx)
                        :  "0" (irq),   "1" (desc),  "2" (isp),
                           "D" (desc->handle_irq)
                        : "memory", "cc"
index 6b5c730d67b9d36a184f8340e8c4fa2b5b55ede0..3aac15466a91f951df85fdcaccdc7e0dd2196d8b 100644 (file)
 
 atomic_t irq_err_count;
 
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * Each architecture has to answer this itself.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
+       /*
+        * Currently unexpected vectors happen only on SMP and APIC.
+        * We _must_ ack these because every local APIC has only N
+        * irq slots per priority level, and a 'hanging, unacked' IRQ
+        * holds up an irq slot - in excessive cases (when multiple
+        * unexpected vectors occur) that might lock up the APIC
+        * completely.
+        * But don't ack when the APIC is disabled. -AK
+        */
+       if (!disable_apic)
+               ack_APIC_irq();
+}
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 /*
  * Probabilistic stack overflow check:
@@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs)
        u64 curbase = (u64)task_stack_page(current);
        static unsigned long warned = -60*HZ;
 
-       if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
-           regs->rsp <  curbase + sizeof(struct thread_info) + 128 &&
+       if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
+           regs->sp <  curbase + sizeof(struct thread_info) + 128 &&
            time_after(jiffies, warned + 60*HZ)) {
-               printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
-                      current->comm, curbase, regs->rsp);
+               printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+                      current->comm, curbase, regs->sp);
                show_stack(NULL,NULL);
                warned = jiffies;
        }
@@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
        struct pt_regs *old_regs = set_irq_regs(regs);
 
        /* high bit used in ret_from_ code  */
-       unsigned vector = ~regs->orig_rax;
+       unsigned vector = ~regs->orig_ax;
        unsigned irq;
 
        exit_idle();
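stack_overflow_check() above warns when the saved sp has come within sizeof(struct thread_info) + 128 bytes of the stack base, rate-limited to one message per minute via time_after(). A rough userspace approximation of those two tests, with time() standing in for jiffies and a bare 128-byte margin:

#include <stdio.h>
#include <time.h>

#define THREAD_SIZE 8192UL
#define MARGIN      128UL       /* the kernel also adds sizeof(struct thread_info) */

static void stack_check(unsigned long curbase, unsigned long sp)
{
        static time_t warned;   /* last time a warning was printed */

        if (sp >= curbase && sp <= curbase + THREAD_SIZE &&
            sp < curbase + MARGIN && time(NULL) > warned + 60) {
                printf("near stack overflow (cur:%lx,sp:%lx)\n", curbase, sp);
                warned = time(NULL);
        }
}

int main(void)
{
        stack_check(0x10000, 0x10000 + 64);     /* in the danger zone: warns */
        stack_check(0x10000, 0x10000 + 64);     /* suppressed for a minute */
        return 0;
}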
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644 (file)
index 0000000..7335430
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ * Architecture specific debugfs files
+ *
+ * Copyright (C) 2007, Intel Corp.
+ *     Huang Ying <ying.huang@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+static struct debugfs_blob_wrapper boot_params_blob = {
+       .data = &boot_params,
+       .size = sizeof(boot_params),
+};
+
+static int __init boot_params_kdebugfs_init(void)
+{
+       int error;
+       struct dentry *dbp, *version, *data;
+
+       dbp = debugfs_create_dir("boot_params", NULL);
+       if (!dbp) {
+               error = -ENOMEM;
+               goto err_return;
+       }
+       version = debugfs_create_x16("version", S_IRUGO, dbp,
+                                    &boot_params.hdr.version);
+       if (!version) {
+               error = -ENOMEM;
+               goto err_dir;
+       }
+       data = debugfs_create_blob("data", S_IRUGO, dbp,
+                                  &boot_params_blob);
+       if (!data) {
+               error = -ENOMEM;
+               goto err_version;
+       }
+       return 0;
+err_version:
+       debugfs_remove(version);
+err_dir:
+       debugfs_remove(dbp);
+err_return:
+       return error;
+}
+#endif
+
+static int __init arch_kdebugfs_init(void)
+{
+       int error = 0;
+
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+       error = boot_params_kdebugfs_init();
+#endif
+
+       return error;
+}
+
+arch_initcall(arch_kdebugfs_init);
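boot_params_kdebugfs_init() above uses the usual kernel pattern of unwinding partially created objects through a chain of goto labels. A userspace sketch of the same shape, with two hypothetical allocations standing in for the debugfs directory and blob file:

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
        int error;
        char *dir, *blob;

        dir = malloc(16);                       /* stands in for the debugfs dir */
        if (!dir) {
                error = -1;
                goto err_return;
        }
        blob = malloc(16);                      /* stands in for the blob file */
        if (!blob) {
                error = -1;
                goto err_dir;
        }
        /* success: both objects deliberately stay allocated, as in the kernel */
        return 0;

err_dir:
        free(dir);                              /* undo only what was created */
err_return:
        return error;
}

int main(void)
{
        printf("setup() returned %d\n", setup());
        return 0;
}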
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
new file mode 100644 (file)
index 0000000..a99e764
--- /dev/null
@@ -0,0 +1,1066 @@
+/*
+ *  Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *             Probes initial implementation ( includes contributions from
+ *             Rusty Russell).
+ * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *             interface to access function arguments.
+ * 2004-Oct    Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *             <prasanna@in.ibm.com> adapted for x86_64 from i386.
+ * 2005-Mar    Roland McGrath <roland@redhat.com>
+ *             Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May    Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *             <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *             <prasanna@in.ibm.com> added function-return probes.
+ * 2005-May    Rusty Lynch <rusty.lynch@intel.com>
+ *             Added function return probes functionality
+ * 2006-Feb    Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
+ *             kprobe-booster and kretprobe-booster for i386.
+ * 2007-Dec    Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ *             and kretprobe-booster for x86-64
+ * 2007-Dec    Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
+ *             <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ *             unified x86 kprobes code.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+
+void jprobe_return_end(void);
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+#ifdef CONFIG_X86_64
+#define stack_addr(regs) ((unsigned long *)regs->sp)
+#else
+/*
+ * "&regs->sp" looks wrong, but it's correct for x86_32.  x86_32 CPUs
+ * don't save the ss and esp registers if the CPU is already in kernel
+ * mode when it traps.  So for kprobes, regs->sp and regs->ss are not
+ * the [nonexistent] saved stack pointer and ss register, but rather
+ * the top 8 bytes of the pre-int3 stack.  So &regs->sp happens to
+ * point to the top of the pre-int3 stack.
+ */
+#define stack_addr(regs) ((unsigned long *)&regs->sp)
+#endif
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+        << (row % 32))
+       /*
+        * Undefined/reserved opcodes, conditional jump, Opcode Extension
+        * Groups, and some special opcodes can not boost.
+        */
+static const u32 twobyte_is_boostable[256 / 32] = {
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+       /*      ----------------------------------------------          */
+       W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
+       W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
+       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
+       W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
+       W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
+       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+       W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
+       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
+       W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+       W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
+       W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
+       W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
+       /*      -----------------------------------------------         */
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+};
+static const u32 onebyte_has_modrm[256 / 32] = {
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+       /*      -----------------------------------------------         */
+       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
+       W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
+       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
+       W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
+       W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+       W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
+       W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
+       W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+       W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
+       W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
+       W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
+       W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
+       W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+       W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
+       W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
+       /*      -----------------------------------------------         */
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+};
+static const u32 twobyte_has_modrm[256 / 32] = {
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+       /*      -----------------------------------------------         */
+       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
+       W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
+       W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
+       W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
+       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
+       W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
+       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
+       W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
+       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
+       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
+       W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
+       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
+       W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
+       W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
+       W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
+       W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
+       /*      -----------------------------------------------         */
+       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
+};
+#undef W
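The W() macro above packs 16 one-bit flags per half-row into eight 32-bit words, so a property of any of the 256 opcodes can be tested with a shift and a mask (the kernel does the lookup via test_bit()). A small sketch of building and querying such a bitmap directly; the opcodes marked here are arbitrary, not the kprobes tables:

#include <stdio.h>
#include <stdint.h>

static uint32_t table[256 / 32];                /* one bit per possible opcode */

static void mark(uint8_t op)
{
        table[op >> 5] |= UINT32_C(1) << (op & 31);
}

static int is_marked(uint8_t op)
{
        return (table[op >> 5] >> (op & 31)) & 1;
}

int main(void)
{
        mark(0x0f);
        mark(0x90);
        printf("%d %d %d\n", is_marked(0x0f), is_marked(0x90), is_marked(0xcc));
        return 0;
}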
+
+struct kretprobe_blackpoint kretprobe_blacklist[] = {
+       {"__switch_to", }, /* This function switches only the current task,
+                             but doesn't switch the kernel stack. */
+       {NULL, NULL}    /* Terminator */
+};
+const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+static void __kprobes set_jmp_op(void *from, void *to)
+{
+       struct __arch_jmp_op {
+               char op;
+               s32 raddr;
+       } __attribute__((packed)) * jop;
+       jop = (struct __arch_jmp_op *)from;
+       jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
+       jop->op = RELATIVEJUMP_INSTRUCTION;
+}
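set_jmp_op() above emits a 5-byte relative jump: a one-byte opcode followed by a 32-bit displacement measured from the end of the jump itself, i.e. to - (from + 5). A sketch of just that encoding; 0xe9 is the customary jmp rel32 opcode standing in for RELATIVEJUMP_INSTRUCTION, and the addresses are made up:

#include <stdio.h>
#include <stdint.h>

struct jmp_op {
        uint8_t op;                     /* 0xe9: jmp rel32 */
        int32_t raddr;                  /* displacement from the end of the jump */
} __attribute__((packed));

static void encode_jmp(struct jmp_op *j, uintptr_t from, uintptr_t to)
{
        j->op = 0xe9;
        j->raddr = (int32_t)((intptr_t)to - ((intptr_t)from + 5));
}

int main(void)
{
        struct jmp_op j;

        encode_jmp(&j, 0x1000, 0x1080);         /* made-up addresses */
        printf("opcode=%#x rel32=%d\n", j.op, j.raddr); /* 0xe9, 123 */
        return 0;
}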
+
+/*
+ * Check for the REX prefix, which can only exist on X86_64.
+ * On X86_32 this always returns 0.
+ */
+static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
+{
+#ifdef CONFIG_X86_64
+       if ((*insn & 0xf0) == 0x40)
+               return 1;
+#endif
+       return 0;
+}
+
+/*
+ * Returns non-zero if opcode is boostable.
+ * RIP-relative instructions are adjusted at copying time in 64-bit mode.
+ */
+static int __kprobes can_boost(kprobe_opcode_t *opcodes)
+{
+       kprobe_opcode_t opcode;
+       kprobe_opcode_t *orig_opcodes = opcodes;
+
+retry:
+       if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+               return 0;
+       opcode = *(opcodes++);
+
+       /* 2nd-byte opcode */
+       if (opcode == 0x0f) {
+               if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+                       return 0;
+               return test_bit(*opcodes,
+                               (unsigned long *)twobyte_is_boostable);
+       }
+
+       switch (opcode & 0xf0) {
+#ifdef CONFIG_X86_64
+       case 0x40:
+               goto retry; /* REX prefix is boostable */
+#endif
+       case 0x60:
+               if (0x63 < opcode && opcode < 0x67)
+                       goto retry; /* prefixes */
+               /* can't boost Address-size override and bound */
+               return (opcode != 0x62 && opcode != 0x67);
+       case 0x70:
+               return 0; /* can't boost conditional jump */
+       case 0xc0:
+               /* can't boost software-interruptions */
+               return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+       case 0xd0:
+               /* can boost AA* and XLAT */
+               return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+       case 0xe0:
+               /* can boost in/out and absolute jmps */
+               return ((opcode & 0x04) || opcode == 0xea);
+       case 0xf0:
+               if ((opcode & 0x0c) == 0 && opcode != 0xf1)
+                       goto retry; /* lock/rep(ne) prefix */
+               /* clear and set flags are boostable */
+               return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+       default:
+               /* segment override prefixes are boostable */
+               if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
+                       goto retry; /* prefixes */
+               /* CS override prefix and call are not boostable */
+               return (opcode != 0x2e && opcode != 0x9a);
+       }
+}
+
+/*
+ * Returns non-zero if opcode modifies the interrupt flag.
+ */
+static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+{
+       switch (*insn) {
+       case 0xfa:              /* cli */
+       case 0xfb:              /* sti */
+       case 0xcf:              /* iret/iretd */
+       case 0x9d:              /* popf/popfd */
+               return 1;
+       }
+
+       /*
+        * on X86_64, 0x40-0x4f are REX prefixes so we need to look
+        * at the next byte instead, taking care not to recurse infinitely.
+        */
+       if (is_REX_prefix(insn))
+               return is_IF_modifier(++insn);
+
+       return 0;
+}
+
+/*
+ * Adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode: rewrite the 32-bit displacement word in place so that
+ * the copied instruction still reaches its original target.
+ * This is a no-op when the instruction has no %rip-relative operand.
+ * Only applicable to 64-bit x86.
+ */
+static void __kprobes fix_riprel(struct kprobe *p)
+{
+#ifdef CONFIG_X86_64
+       u8 *insn = p->ainsn.insn;
+       s64 disp;
+       int need_modrm;
+
+       /* Skip legacy instruction prefixes.  */
+       while (1) {
+               switch (*insn) {
+               case 0x66:
+               case 0x67:
+               case 0x2e:
+               case 0x3e:
+               case 0x26:
+               case 0x64:
+               case 0x65:
+               case 0x36:
+               case 0xf0:
+               case 0xf3:
+               case 0xf2:
+                       ++insn;
+                       continue;
+               }
+               break;
+       }
+
+       /* Skip REX instruction prefix.  */
+       if (is_REX_prefix(insn))
+               ++insn;
+
+       if (*insn == 0x0f) {
+               /* Two-byte opcode.  */
+               ++insn;
+               need_modrm = test_bit(*insn,
+                                     (unsigned long *)twobyte_has_modrm);
+       } else
+               /* One-byte opcode.  */
+               need_modrm = test_bit(*insn,
+                                     (unsigned long *)onebyte_has_modrm);
+
+       if (need_modrm) {
+               u8 modrm = *++insn;
+               if ((modrm & 0xc7) == 0x05) {
+                       /* %rip+disp32 addressing mode */
+                       /* Displacement follows ModRM byte.  */
+                       ++insn;
+                       /*
+                        * The copied instruction uses the %rip-relative
+                        * addressing mode.  Adjust the displacement for the
+                        * difference between the original location of this
+                        * instruction and the location of the copy that will
+                        * actually be run.  The tricky bit here is making sure
+                        * that the sign extension happens correctly in this
+                        * calculation, since we need a signed 32-bit result to
+                        * be sign-extended to 64 bits when it's added to the
+                        * %rip value and yield the same 64-bit result that the
+                        * sign-extension of the original signed 32-bit
+                        * displacement would have given.
+                        */
+                       disp = (u8 *) p->addr + *((s32 *) insn) -
+                              (u8 *) p->ainsn.insn;
+                       BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
+                       *(s32 *)insn = (s32) disp;
+               }
+       }
+#endif
+}
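fix_riprel() rebases a %rip-relative disp32 so the copied instruction still reaches the target it addressed at its original location: new_disp = original_address + old_disp - copy_address. A sketch of that arithmetic with illustrative addresses and a sanity check mirroring the BUG_ON above:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/*
 * A %rip-relative operand encodes target = next_insn_address + disp32, so
 * moving the instruction means rebasing the displacement by the difference
 * between the original address and the copy's address.
 */
static int32_t rebase_disp(int64_t orig_addr, int64_t copy_addr, int32_t disp)
{
        int64_t new_disp = orig_addr + disp - copy_addr;

        assert(new_disp == (int32_t)new_disp);  /* must still fit in 32 bits */
        return (int32_t)new_disp;
}

int main(void)
{
        int64_t orig = 0x400000, copy = 0x600000, len = 7;      /* illustrative */
        int32_t disp = 0x1234;
        int32_t nd = rebase_disp(orig, copy, disp);

        /* both placements now compute the same absolute target */
        printf("%d\n", orig + len + disp == copy + len + nd);
        return 0;
}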
+
+static void __kprobes arch_copy_kprobe(struct kprobe *p)
+{
+       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+
+       fix_riprel(p);
+
+       if (can_boost(p->addr))
+               p->ainsn.boostable = 0;
+       else
+               p->ainsn.boostable = -1;
+
+       p->opcode = *p->addr;
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+       /* insn: must be on special executable page on x86. */
+       p->ainsn.insn = get_insn_slot();
+       if (!p->ainsn.insn)
+               return -ENOMEM;
+       arch_copy_kprobe(p);
+       return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+       text_poke(p->addr, &p->opcode, 1);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+       mutex_lock(&kprobe_mutex);
+       free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+       mutex_unlock(&kprobe_mutex);
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       kcb->prev_kprobe.kp = kprobe_running();
+       kcb->prev_kprobe.status = kcb->kprobe_status;
+       kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
+       kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+       kcb->kprobe_status = kcb->prev_kprobe.status;
+       kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
+       kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+                               struct kprobe_ctlblk *kcb)
+{
+       __get_cpu_var(current_kprobe) = p;
+       kcb->kprobe_saved_flags = kcb->kprobe_old_flags
+               = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+       if (is_IF_modifier(p->ainsn.insn))
+               kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
+}
+
+static void __kprobes clear_btf(void)
+{
+       if (test_thread_flag(TIF_DEBUGCTLMSR))
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+}
+
+static void __kprobes restore_btf(void)
+{
+       if (test_thread_flag(TIF_DEBUGCTLMSR))
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr);
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+       clear_btf();
+       regs->flags |= X86_EFLAGS_TF;
+       regs->flags &= ~X86_EFLAGS_IF;
+       /* single step inline if the instruction is an int3 */
+       if (p->opcode == BREAKPOINT_INSTRUCTION)
+               regs->ip = (unsigned long)p->addr;
+       else
+               regs->ip = (unsigned long)p->ainsn.insn;
+}
+
+/* Called with kretprobe_lock held */
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+                                     struct pt_regs *regs)
+{
+       unsigned long *sara = stack_addr(regs);
+
+       ri->ret_addr = (kprobe_opcode_t *) *sara;
+
+       /* Replace the return addr with trampoline addr */
+       *sara = (unsigned long) &kretprobe_trampoline;
+}
+
+static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+                                      struct kprobe_ctlblk *kcb)
+{
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
+       if (p->ainsn.boostable == 1 && !p->post_handler) {
+               /* Boost up -- we can execute copied instructions directly */
+               reset_current_kprobe();
+               regs->ip = (unsigned long)p->ainsn.insn;
+               preempt_enable_no_resched();
+               return;
+       }
+#endif
+       prepare_singlestep(p, regs);
+       kcb->kprobe_status = KPROBE_HIT_SS;
+}
+
+/*
+ * We have reentered the kprobe_handler(), since another probe was hit while
+ * within the handler. We save the original kprobes variables and just single
+ * step on the instruction of the new probe without calling any user handlers.
+ */
+static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+                                   struct kprobe_ctlblk *kcb)
+{
+       switch (kcb->kprobe_status) {
+       case KPROBE_HIT_SSDONE:
+#ifdef CONFIG_X86_64
+               /* TODO: Provide re-entrancy from post_kprobes_handler() and
+                * avoid exception stack corruption while single-stepping on
+                * the instruction of the new probe.
+                */
+               arch_disarm_kprobe(p);
+               regs->ip = (unsigned long)p->addr;
+               reset_current_kprobe();
+               preempt_enable_no_resched();
+               break;
+#endif
+       case KPROBE_HIT_ACTIVE:
+               save_previous_kprobe(kcb);
+               set_current_kprobe(p, regs, kcb);
+               kprobes_inc_nmissed_count(p);
+               prepare_singlestep(p, regs);
+               kcb->kprobe_status = KPROBE_REENTER;
+               break;
+       case KPROBE_HIT_SS:
+               if (p == kprobe_running()) {
+                       regs->flags &= ~TF_MASK;
+                       regs->flags |= kcb->kprobe_saved_flags;
+                       return 0;
+               } else {
+                       /* A probe has been hit in the codepath leading up
+                        * to, or just after, single-stepping of a probed
+                        * instruction. This entire codepath should strictly
+                        * reside in .kprobes.text section. Raise a warning
+                        * to highlight this peculiar case.
+                        */
+               }
+       default:
+               /* impossible cases */
+               WARN_ON(1);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+       kprobe_opcode_t *addr;
+       struct kprobe *p;
+       struct kprobe_ctlblk *kcb;
+
+       addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
+       if (*addr != BREAKPOINT_INSTRUCTION) {
+               /*
+                * The breakpoint instruction was removed right
+                * after we hit it.  Another cpu has removed
+                * either a probepoint or a debugger breakpoint
+                * at this address.  In either case, no further
+                * handling of this interrupt is appropriate.
+                * Back up over the (now missing) int3 and run
+                * the original instruction.
+                */
+               regs->ip = (unsigned long)addr;
+               return 1;
+       }
+
+       /*
+        * We don't want to be preempted for the entire
+        * duration of kprobe processing. We conditionally
+        * re-enable preemption at the end of this function,
+        * and also in reenter_kprobe() and setup_singlestep().
+        */
+       preempt_disable();
+
+       kcb = get_kprobe_ctlblk();
+       p = get_kprobe(addr);
+
+       if (p) {
+               if (kprobe_running()) {
+                       if (reenter_kprobe(p, regs, kcb))
+                               return 1;
+               } else {
+                       set_current_kprobe(p, regs, kcb);
+                       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+                       /*
+                        * If we have no pre-handler or it returned 0, we
+                        * continue with normal processing.  If we have a
+                        * pre-handler and it returned non-zero, it prepped
+                        * for calling the break_handler below on re-entry
+                        * for jprobe processing, so get out doing nothing
+                        * more here.
+                        */
+                       if (!p->pre_handler || !p->pre_handler(p, regs))
+                               setup_singlestep(p, regs, kcb);
+                       return 1;
+               }
+       } else if (kprobe_running()) {
+               p = __get_cpu_var(current_kprobe);
+               if (p->break_handler && p->break_handler(p, regs)) {
+                       setup_singlestep(p, regs, kcb);
+                       return 1;
+               }
+       } /* else: not a kprobe fault; let the kernel handle it */
+
+       preempt_enable_no_resched();
+       return 0;
+}
+
+/*
+ * When a retprobed function returns, this code saves registers and
+ * calls trampoline_handler(), which in turn calls the kretprobe's handler.
+ */
+void __kprobes kretprobe_trampoline_holder(void)
+{
+       asm volatile (
+                       ".global kretprobe_trampoline\n"
+                       "kretprobe_trampoline: \n"
+#ifdef CONFIG_X86_64
+                       /* We don't bother saving the ss register */
+                       "       pushq %rsp\n"
+                       "       pushfq\n"
+                       /*
+                        * Skip cs, ip, orig_ax.
+                        * trampoline_handler() will plug in these values
+                        */
+                       "       subq $24, %rsp\n"
+                       "       pushq %rdi\n"
+                       "       pushq %rsi\n"
+                       "       pushq %rdx\n"
+                       "       pushq %rcx\n"
+                       "       pushq %rax\n"
+                       "       pushq %r8\n"
+                       "       pushq %r9\n"
+                       "       pushq %r10\n"
+                       "       pushq %r11\n"
+                       "       pushq %rbx\n"
+                       "       pushq %rbp\n"
+                       "       pushq %r12\n"
+                       "       pushq %r13\n"
+                       "       pushq %r14\n"
+                       "       pushq %r15\n"
+                       "       movq %rsp, %rdi\n"
+                       "       call trampoline_handler\n"
+                       /* Replace saved sp with true return address. */
+                       "       movq %rax, 152(%rsp)\n"
+                       "       popq %r15\n"
+                       "       popq %r14\n"
+                       "       popq %r13\n"
+                       "       popq %r12\n"
+                       "       popq %rbp\n"
+                       "       popq %rbx\n"
+                       "       popq %r11\n"
+                       "       popq %r10\n"
+                       "       popq %r9\n"
+                       "       popq %r8\n"
+                       "       popq %rax\n"
+                       "       popq %rcx\n"
+                       "       popq %rdx\n"
+                       "       popq %rsi\n"
+                       "       popq %rdi\n"
+                       /* Skip orig_ax, ip, cs */
+                       "       addq $24, %rsp\n"
+                       "       popfq\n"
+#else
+                       "       pushf\n"
+                       /*
+                        * Skip cs, ip, orig_ax.
+                        * trampoline_handler() will plug in these values
+                        */
+                       "       subl $12, %esp\n"
+                       "       pushl %fs\n"
+                       "       pushl %ds\n"
+                       "       pushl %es\n"
+                       "       pushl %eax\n"
+                       "       pushl %ebp\n"
+                       "       pushl %edi\n"
+                       "       pushl %esi\n"
+                       "       pushl %edx\n"
+                       "       pushl %ecx\n"
+                       "       pushl %ebx\n"
+                       "       movl %esp, %eax\n"
+                       "       call trampoline_handler\n"
+                       /* Move flags to cs */
+                       "       movl 52(%esp), %edx\n"
+                       "       movl %edx, 48(%esp)\n"
+                       /* Replace saved flags with true return address. */
+                       "       movl %eax, 52(%esp)\n"
+                       "       popl %ebx\n"
+                       "       popl %ecx\n"
+                       "       popl %edx\n"
+                       "       popl %esi\n"
+                       "       popl %edi\n"
+                       "       popl %ebp\n"
+                       "       popl %eax\n"
+                       /* Skip ip, orig_ax, es, ds, fs */
+                       "       addl $20, %esp\n"
+                       "       popf\n"
+#endif
+                       "       ret\n");
+}
+
+/*
+ * Called from kretprobe_trampoline
+ */
+void * __kprobes trampoline_handler(struct pt_regs *regs)
+{
+       struct kretprobe_instance *ri = NULL;
+       struct hlist_head *head, empty_rp;
+       struct hlist_node *node, *tmp;
+       unsigned long flags, orig_ret_address = 0;
+       unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+
+       INIT_HLIST_HEAD(&empty_rp);
+       spin_lock_irqsave(&kretprobe_lock, flags);
+       head = kretprobe_inst_table_head(current);
+       /* fixup registers */
+#ifdef CONFIG_X86_64
+       regs->cs = __KERNEL_CS;
+#else
+       regs->cs = __KERNEL_CS | get_kernel_rpl();
+#endif
+       regs->ip = trampoline_address;
+       regs->orig_ax = ~0UL;
+
+       /*
+        * It is possible to have multiple instances associated with a given
+        * task either because multiple functions in the call path have
+        * return probes installed on them, and/or more than one
+        * return probe was registered for a target function.
+        *
+        * We can handle this because:
+        *     - instances are always pushed into the head of the list
+        *     - when multiple return probes are registered for the same
+        *       function, the (chronologically) first instance's ret_addr
+        *       will be the real return address, and all the rest will
+        *       point to kretprobe_trampoline.
+        */
+       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               if (ri->rp && ri->rp->handler) {
+                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
+                       get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+                       ri->rp->handler(ri, regs);
+                       __get_cpu_var(current_kprobe) = NULL;
+               }
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
+               recycle_rp_inst(ri, &empty_rp);
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+       spin_unlock_irqrestore(&kretprobe_lock, flags);
+
+       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+               hlist_del(&ri->hlist);
+               kfree(ri);
+       }
+       return (void *)orig_ret_address;
+}
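The loop in trampoline_handler() depends on instances being pushed at the head of the per-task list, so the first entry for this task whose saved ret_addr is not the trampoline is the genuine return address. A minimal sketch of that scan, with a plain array standing in for the hlist:

#include <stdio.h>

#define TRAMPOLINE 0xdeadbeefUL /* stands in for &kretprobe_trampoline */

/* newest instance first, mirroring "pushed into the head of the list" */
static unsigned long saved_ret[] = { TRAMPOLINE, TRAMPOLINE, 0x400123UL };

int main(void)
{
        unsigned long orig_ret = 0;
        unsigned int i;

        for (i = 0; i < sizeof(saved_ret) / sizeof(saved_ret[0]); i++) {
                orig_ret = saved_ret[i];
                if (orig_ret != TRAMPOLINE)
                        break;  /* first non-trampoline entry is the real one */
        }
        printf("return to %#lx\n", orig_ret);
        return 0;
}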
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt.  We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new ip is relative to the copied instruction.  We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed flags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ *
+ * If this is the first time we've single-stepped the instruction at
+ * this probepoint, and the instruction is boostable, boost it: add a
+ * jump instruction after the copied instruction, that jumps to the next
+ * instruction after the probepoint.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+{
+       unsigned long *tos = stack_addr(regs);
+       unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+       unsigned long orig_ip = (unsigned long)p->addr;
+       kprobe_opcode_t *insn = p->ainsn.insn;
+
+       /*skip the REX prefix*/
+       if (is_REX_prefix(insn))
+               insn++;
+
+       regs->flags &= ~X86_EFLAGS_TF;
+       switch (*insn) {
+       case 0x9c:      /* pushfl */
+               *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
+               *tos |= kcb->kprobe_old_flags;
+               break;
+       case 0xc2:      /* iret/ret/lret */
+       case 0xc3:
+       case 0xca:
+       case 0xcb:
+       case 0xcf:
+       case 0xea:      /* jmp absolute -- ip is correct */
+               /* ip is already adjusted, no more changes required */
+               p->ainsn.boostable = 1;
+               goto no_change;
+       case 0xe8:      /* call relative - Fix return addr */
+               *tos = orig_ip + (*tos - copy_ip);
+               break;
+#ifdef CONFIG_X86_32
+       case 0x9a:      /* call absolute -- same as call absolute, indirect */
+               *tos = orig_ip + (*tos - copy_ip);
+               goto no_change;
+#endif
+       case 0xff:
+               if ((insn[1] & 0x30) == 0x10) {
+                       /*
+                        * call absolute, indirect
+                        * Fix return addr; ip is correct.
+                        * But this is not boostable
+                        */
+                       *tos = orig_ip + (*tos - copy_ip);
+                       goto no_change;
+               } else if (((insn[1] & 0x31) == 0x20) ||
+                          ((insn[1] & 0x31) == 0x21)) {
+                       /*
+                        * jmp near and far, absolute indirect
+                        * ip is correct. And this is boostable
+                        */
+                       p->ainsn.boostable = 1;
+                       goto no_change;
+               }
+       default:
+               break;
+       }
+
+       if (p->ainsn.boostable == 0) {
+               if ((regs->ip > copy_ip) &&
+                   (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
+                       /*
+                        * These instructions can be executed directly once a
+                        * jump back to the correct original address is added.
+                        */
+                       set_jmp_op((void *)regs->ip,
+                                  (void *)orig_ip + (regs->ip - copy_ip));
+                       p->ainsn.boostable = 1;
+               } else {
+                       p->ainsn.boostable = -1;
+               }
+       }
+
+       regs->ip += orig_ip - copy_ip;
+
+no_change:
+       restore_btf();
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       if (!cur)
+               return 0;
+
+       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+               kcb->kprobe_status = KPROBE_HIT_SSDONE;
+               cur->post_handler(cur, regs, 0);
+       }
+
+       resume_execution(cur, regs, kcb);
+       regs->flags |= kcb->kprobe_saved_flags;
+       trace_hardirqs_fixup_flags(regs->flags);
+
+       /* Restore back the original saved kprobes variables and continue. */
+       if (kcb->kprobe_status == KPROBE_REENTER) {
+               restore_previous_kprobe(kcb);
+               goto out;
+       }
+       reset_current_kprobe();
+out:
+       preempt_enable_no_resched();
+
+       /*
+        * if somebody else is singlestepping across a probe point, flags
+        * will have TF set, in which case, continue the remaining processing
+        * of do_debug, as if this is not a probe hit.
+        */
+       if (regs->flags & X86_EFLAGS_TF)
+               return 0;
+
+       return 1;
+}
+
+int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       switch (kcb->kprobe_status) {
+       case KPROBE_HIT_SS:
+       case KPROBE_REENTER:
+               /*
+                * We are here because the instruction being single
+                * stepped caused a page fault. We reset the current
+                * kprobe and the ip points back to the probe address
+                * and allow the page fault handler to continue as a
+                * normal page fault.
+                */
+               regs->ip = (unsigned long)cur->addr;
+               regs->flags |= kcb->kprobe_old_flags;
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       restore_previous_kprobe(kcb);
+               else
+                       reset_current_kprobe();
+               preempt_enable_no_resched();
+               break;
+       case KPROBE_HIT_ACTIVE:
+       case KPROBE_HIT_SSDONE:
+               /*
+                * We increment the nmissed count for accounting,
+                * we can also use npre/npostfault count for accounting
+                * these specific fault cases.
+                */
+               kprobes_inc_nmissed_count(cur);
+
+               /*
+                * We come here because instructions in the pre/post
+                * handler caused the page_fault, this could happen
+                * if handler tries to access user space by
+                * copy_from_user(), get_user() etc. Let the
+                * user-specified handler try to fix it first.
+                */
+               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+                       return 1;
+
+               /*
+                * In case the user-specified fault handler returned
+                * zero, try to fix up.
+                */
+               if (fixup_exception(regs))
+                       return 1;
+
+               /*
+                * fixup routine could not handle it,
+                * Let do_page_fault() fix it.
+                */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+                                      unsigned long val, void *data)
+{
+       struct die_args *args = data;
+       int ret = NOTIFY_DONE;
+
+       if (args->regs && user_mode_vm(args->regs))
+               return ret;
+
+       switch (val) {
+       case DIE_INT3:
+               if (kprobe_handler(args->regs))
+                       ret = NOTIFY_STOP;
+               break;
+       case DIE_DEBUG:
+               if (post_kprobe_handler(args->regs))
+                       ret = NOTIFY_STOP;
+               break;
+       case DIE_GPF:
+               /*
+                * To be potentially processing a kprobe fault and to
+                * trust the result from kprobe_running(), we have to
+                * be non-preemptible.
+                */
+               if (!preemptible() && kprobe_running() &&
+                   kprobe_fault_handler(args->regs, args->trapnr))
+                       ret = NOTIFY_STOP;
+               break;
+       default:
+               break;
+       }
+       return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+       unsigned long addr;
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       kcb->jprobe_saved_regs = *regs;
+       kcb->jprobe_saved_sp = stack_addr(regs);
+       addr = (unsigned long)(kcb->jprobe_saved_sp);
+
+       /*
+        * As Linus pointed out, gcc assumes that the callee
+        * owns the argument space and could overwrite it, e.g.
+        * tailcall optimization. So, to be absolutely safe
+        * we also save and restore enough stack bytes to cover
+        * the argument area.
+        */
+       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
+              MIN_STACK_SIZE(addr));
+       regs->flags &= ~X86_EFLAGS_IF;
+       trace_hardirqs_off();
+       regs->ip = (unsigned long)(jp->entry);
+       return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       asm volatile (
+#ifdef CONFIG_X86_64
+                       "       xchg   %%rbx,%%rsp      \n"
+#else
+                       "       xchgl   %%ebx,%%esp     \n"
+#endif
+                       "       int3                    \n"
+                       "       .globl jprobe_return_end\n"
+                       "       jprobe_return_end:      \n"
+                       "       nop                     \n"::"b"
+                       (kcb->jprobe_saved_sp):"memory");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       u8 *addr = (u8 *) (regs->ip - 1);
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+       if ((addr > (u8 *) jprobe_return) &&
+           (addr < (u8 *) jprobe_return_end)) {
+               if (stack_addr(regs) != kcb->jprobe_saved_sp) {
+                       struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
+                       printk(KERN_ERR
+                              "current sp %p does not match saved sp %p\n",
+                              stack_addr(regs), kcb->jprobe_saved_sp);
+                       printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
+                       show_registers(saved_regs);
+                       printk(KERN_ERR "Current registers\n");
+                       show_registers(regs);
+                       BUG();
+               }
+               *regs = kcb->jprobe_saved_regs;
+               memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
+                      kcb->jprobes_stack,
+                      MIN_STACK_SIZE(kcb->jprobe_saved_sp));
+               preempt_enable_no_resched();
+               return 1;
+       }
+       return 0;
+}
+
+int __init arch_init_kprobes(void)
+{
+       return 0;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+       return 0;
+}
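The arch hooks added above (kprobe_handler(), post_kprobe_handler(), kprobe_fault_handler(), setjmp_pre_handler(), ...) are only reached through the generic kprobes API in kernel/kprobes.c. For orientation, a minimal client module looks roughly like the sketch below; it is not part of this commit, and the probed symbol (do_fork), the handler names and the messages are placeholders only:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Invoked from kprobe_handler() before the probed instruction is single-stepped. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe pre_handler: hit at %p\n", p->addr);
        return 0;       /* 0 = proceed with single-stepping */
}

/* Invoked from post_kprobe_handler() once the single step has completed. */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
                         unsigned long flags)
{
        printk(KERN_INFO "kprobe post_handler: done at %p\n", p->addr);
}

static struct kprobe kp = {
        .symbol_name    = "do_fork",    /* example target only */
        .pre_handler    = handler_pre,
        .post_handler   = handler_post,
};

static int __init kprobe_example_init(void)
{
        return register_kprobe(&kp);
}

static void __exit kprobe_example_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");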
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
deleted file mode 100644 (file)
index 3a020f7..0000000
+++ /dev/null
@@ -1,756 +0,0 @@
-/*
- *  Kernel Probes (KProbes)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- *             Probes initial implementation ( includes contributions from
- *             Rusty Russell).
- * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- *             interface to access function arguments.
- * 2005-May    Hien Nguyen <hien@us.ibm.com>, Jim Keniston
- *             <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- *             <prasanna@in.ibm.com> added function-return probes.
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/preempt.h>
-#include <linux/kdebug.h>
-#include <asm/cacheflush.h>
-#include <asm/desc.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-
-void jprobe_return_end(void);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {
-       {"__switch_to", }, /* This function switches only current task, but
-                            doesn't switch kernel stack.*/
-       {NULL, NULL}    /* Terminator */
-};
-const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-
-/* insert a jmp code */
-static __always_inline void set_jmp_op(void *from, void *to)
-{
-       struct __arch_jmp_op {
-               char op;
-               long raddr;
-       } __attribute__((packed)) *jop;
-       jop = (struct __arch_jmp_op *)from;
-       jop->raddr = (long)(to) - ((long)(from) + 5);
-       jop->op = RELATIVEJUMP_INSTRUCTION;
-}
-
-/*
- * returns non-zero if opcodes can be boosted.
- */
-static __always_inline int can_boost(kprobe_opcode_t *opcodes)
-{
-#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
-       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
-         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
-         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
-         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
-        << (row % 32))
-       /*
-        * Undefined/reserved opcodes, conditional jumps, Opcode Extension
-        * Groups, and some special opcodes cannot be boosted.
-        */
-       static const unsigned long twobyte_is_boostable[256 / 32] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
-               W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
-               W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
-               W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
-               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
-               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
-               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
-               W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
-               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
-               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
-               W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
-               W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
-               W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
-               W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
-               W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
-               W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0)  /* f0 */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-#undef W
-       kprobe_opcode_t opcode;
-       kprobe_opcode_t *orig_opcodes = opcodes;
-retry:
-       if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-               return 0;
-       opcode = *(opcodes++);
-
-       /* 2nd-byte opcode */
-       if (opcode == 0x0f) {
-               if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-                       return 0;
-               return test_bit(*opcodes, twobyte_is_boostable);
-       }
-
-       switch (opcode & 0xf0) {
-       case 0x60:
-               if (0x63 < opcode && opcode < 0x67)
-                       goto retry; /* prefixes */
-               /* can't boost Address-size override and bound */
-               return (opcode != 0x62 && opcode != 0x67);
-       case 0x70:
-               return 0; /* can't boost conditional jump */
-       case 0xc0:
-               /* can't boost software interrupts */
-               return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
-       case 0xd0:
-               /* can boost AA* and XLAT */
-               return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
-       case 0xe0:
-               /* can boost in/out and absolute jmps */
-               return ((opcode & 0x04) || opcode == 0xea);
-       case 0xf0:
-               if ((opcode & 0x0c) == 0 && opcode != 0xf1)
-                       goto retry; /* lock/rep(ne) prefix */
-               /* flag clear/set instructions can be boosted */
-               return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
-       default:
-               if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
-                       goto retry; /* prefixes */
-               /* can't boost CS override and call */
-               return (opcode != 0x2e && opcode != 0x9a);
-       }
-}
-
-/*
- * returns non-zero if opcode modifies the interrupt flag.
- */
-static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
-{
-       switch (opcode) {
-       case 0xfa:              /* cli */
-       case 0xfb:              /* sti */
-       case 0xcf:              /* iret/iretd */
-       case 0x9d:              /* popf/popfd */
-               return 1;
-       }
-       return 0;
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-       /* insn: must be on special executable page on i386. */
-       p->ainsn.insn = get_insn_slot();
-       if (!p->ainsn.insn)
-               return -ENOMEM;
-
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-       p->opcode = *p->addr;
-       if (can_boost(p->addr)) {
-               p->ainsn.boostable = 0;
-       } else {
-               p->ainsn.boostable = -1;
-       }
-       return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, &p->opcode, 1);
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-       mutex_lock(&kprobe_mutex);
-       free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
-       mutex_unlock(&kprobe_mutex);
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       kcb->prev_kprobe.kp = kprobe_running();
-       kcb->prev_kprobe.status = kcb->kprobe_status;
-       kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
-       kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
-       kcb->kprobe_status = kcb->prev_kprobe.status;
-       kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
-       kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
-                               struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = p;
-       kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
-               = (regs->eflags & (TF_MASK | IF_MASK));
-       if (is_IF_modifier(p->opcode))
-               kcb->kprobe_saved_eflags &= ~IF_MASK;
-}
-
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       regs->eflags |= TF_MASK;
-       regs->eflags &= ~IF_MASK;
-       /*single step inline if the instruction is an int3*/
-       if (p->opcode == BREAKPOINT_INSTRUCTION)
-               regs->eip = (unsigned long)p->addr;
-       else
-               regs->eip = (unsigned long)p->ainsn.insn;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-                                     struct pt_regs *regs)
-{
-       unsigned long *sara = (unsigned long *)&regs->esp;
-
-       ri->ret_addr = (kprobe_opcode_t *) *sara;
-
-       /* Replace the return addr with trampoline addr */
-       *sara = (unsigned long) &kretprobe_trampoline;
-}
-
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled throughout this function.
- */
-static int __kprobes kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *p;
-       int ret = 0;
-       kprobe_opcode_t *addr;
-       struct kprobe_ctlblk *kcb;
-
-       addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
-
-       /*
-        * We don't want to be preempted for the entire
-        * duration of kprobe processing
-        */
-       preempt_disable();
-       kcb = get_kprobe_ctlblk();
-
-       /* Check we're not actually recursing */
-       if (kprobe_running()) {
-               p = get_kprobe(addr);
-               if (p) {
-                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
-                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
-                               regs->eflags &= ~TF_MASK;
-                               regs->eflags |= kcb->kprobe_saved_eflags;
-                               goto no_kprobe;
-                       }
-                       /* We have reentered the kprobe_handler(), since
-                        * another probe was hit while within the handler.
-                        * We here save the original kprobes variables and
-                        * just single step on the instruction of the new probe
-                        * without calling any user handlers.
-                        */
-                       save_previous_kprobe(kcb);
-                       set_current_kprobe(p, regs, kcb);
-                       kprobes_inc_nmissed_count(p);
-                       prepare_singlestep(p, regs);
-                       kcb->kprobe_status = KPROBE_REENTER;
-                       return 1;
-               } else {
-                       if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /* The breakpoint instruction was removed by
-                        * another cpu right after we hit, no further
-                        * handling of this interrupt is appropriate
-                        */
-                               regs->eip -= sizeof(kprobe_opcode_t);
-                               ret = 1;
-                               goto no_kprobe;
-                       }
-                       p = __get_cpu_var(current_kprobe);
-                       if (p->break_handler && p->break_handler(p, regs)) {
-                               goto ss_probe;
-                       }
-               }
-               goto no_kprobe;
-       }
-
-       p = get_kprobe(addr);
-       if (!p) {
-               if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /*
-                        * The breakpoint instruction was removed right
-                        * after we hit it.  Another cpu has removed
-                        * either a probepoint or a debugger breakpoint
-                        * at this address.  In either case, no further
-                        * handling of this interrupt is appropriate.
-                        * Back up over the (now missing) int3 and run
-                        * the original instruction.
-                        */
-                       regs->eip -= sizeof(kprobe_opcode_t);
-                       ret = 1;
-               }
-               /* Not one of ours: let kernel handle it */
-               goto no_kprobe;
-       }
-
-       set_current_kprobe(p, regs, kcb);
-       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-       if (p->pre_handler && p->pre_handler(p, regs))
-               /* handler has already set things up, so skip ss setup */
-               return 1;
-
-ss_probe:
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
-       if (p->ainsn.boostable == 1 && !p->post_handler){
-               /* Boost up -- we can execute copied instructions directly */
-               reset_current_kprobe();
-               regs->eip = (unsigned long)p->ainsn.insn;
-               preempt_enable_no_resched();
-               return 1;
-       }
-#endif
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
-       return 1;
-
-no_kprobe:
-       preempt_enable_no_resched();
-       return ret;
-}
-
-/*
- * For function-return probes, init_kprobes() establishes a probepoint
- * here. When a retprobed function returns, this probe is hit and
- * trampoline_probe_handler() runs, calling the kretprobe's handler.
- */
- void __kprobes kretprobe_trampoline_holder(void)
- {
-       asm volatile ( ".global kretprobe_trampoline\n"
-                       "kretprobe_trampoline: \n"
-                       "       pushf\n"
-                       /* skip cs, eip, orig_eax */
-                       "       subl $12, %esp\n"
-                       "       pushl %fs\n"
-                       "       pushl %ds\n"
-                       "       pushl %es\n"
-                       "       pushl %eax\n"
-                       "       pushl %ebp\n"
-                       "       pushl %edi\n"
-                       "       pushl %esi\n"
-                       "       pushl %edx\n"
-                       "       pushl %ecx\n"
-                       "       pushl %ebx\n"
-                       "       movl %esp, %eax\n"
-                       "       call trampoline_handler\n"
-                       /* move eflags to cs */
-                       "       movl 52(%esp), %edx\n"
-                       "       movl %edx, 48(%esp)\n"
-                       /* save true return address on eflags */
-                       "       movl %eax, 52(%esp)\n"
-                       "       popl %ebx\n"
-                       "       popl %ecx\n"
-                       "       popl %edx\n"
-                       "       popl %esi\n"
-                       "       popl %edi\n"
-                       "       popl %ebp\n"
-                       "       popl %eax\n"
-                       /* skip eip, orig_eax, es, ds, fs */
-                       "       addl $20, %esp\n"
-                       "       popf\n"
-                       "       ret\n");
-}
-
-/*
- * Called from kretprobe_trampoline
- */
-fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
-{
-       struct kretprobe_instance *ri = NULL;
-       struct hlist_head *head, empty_rp;
-       struct hlist_node *node, *tmp;
-       unsigned long flags, orig_ret_address = 0;
-       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
-
-       INIT_HLIST_HEAD(&empty_rp);
-       spin_lock_irqsave(&kretprobe_lock, flags);
-       head = kretprobe_inst_table_head(current);
-       /* fixup registers */
-       regs->xcs = __KERNEL_CS | get_kernel_rpl();
-       regs->eip = trampoline_address;
-       regs->orig_eax = 0xffffffff;
-
-       /*
-        * It is possible to have multiple instances associated with a given
-        * task either because multiple functions in the call path
-        * have a return probe installed on them, and/or more than one
-        * return probe was registered for a target function.
-        *
-        * We can handle this because:
-        *     - instances are always inserted at the head of the list
-        *     - when multiple return probes are registered for the same
-        *       function, the first instance's ret_addr will point to the
-        *       real return address, and all the rest will point to
-        *       kretprobe_trampoline
-        */
-       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-               if (ri->task != current)
-                       /* another task is sharing our hash bucket */
-                       continue;
-
-               if (ri->rp && ri->rp->handler){
-                       __get_cpu_var(current_kprobe) = &ri->rp->kp;
-                       get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
-                       ri->rp->handler(ri, regs);
-                       __get_cpu_var(current_kprobe) = NULL;
-               }
-
-               orig_ret_address = (unsigned long)ri->ret_addr;
-               recycle_rp_inst(ri, &empty_rp);
-
-               if (orig_ret_address != trampoline_address)
-                       /*
-                        * This is the real return address. Any other
-                        * instances associated with this task are for
-                        * other calls deeper on the call stack
-                        */
-                       break;
-       }
-
-       kretprobe_assert(ri, orig_ret_address, trampoline_address);
-       spin_unlock_irqrestore(&kretprobe_lock, flags);
-
-       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
-               hlist_del(&ri->hlist);
-               kfree(ri);
-       }
-       return (void*)orig_ret_address;
-}
-
-/*
- * Called after single-stepping.  p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction.  To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction.  The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt.  We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new eip is relative to the copied instruction.  We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed eflags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- *
- * This function also checks instruction size for preparing direct execution.
- */
-static void __kprobes resume_execution(struct kprobe *p,
-               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
-       unsigned long *tos = (unsigned long *)&regs->esp;
-       unsigned long copy_eip = (unsigned long)p->ainsn.insn;
-       unsigned long orig_eip = (unsigned long)p->addr;
-
-       regs->eflags &= ~TF_MASK;
-       switch (p->ainsn.insn[0]) {
-       case 0x9c:              /* pushfl */
-               *tos &= ~(TF_MASK | IF_MASK);
-               *tos |= kcb->kprobe_old_eflags;
-               break;
-       case 0xc2:              /* iret/ret/lret */
-       case 0xc3:
-       case 0xca:
-       case 0xcb:
-       case 0xcf:
-       case 0xea:              /* jmp absolute -- eip is correct */
-               /* eip is already adjusted, no more changes required */
-               p->ainsn.boostable = 1;
-               goto no_change;
-       case 0xe8:              /* call relative - Fix return addr */
-               *tos = orig_eip + (*tos - copy_eip);
-               break;
-       case 0x9a:              /* call absolute -- same as call absolute, indirect */
-               *tos = orig_eip + (*tos - copy_eip);
-               goto no_change;
-       case 0xff:
-               if ((p->ainsn.insn[1] & 0x30) == 0x10) {
-                       /*
-                        * call absolute, indirect
-                        * Fix return addr; eip is correct.
-                        * But this is not boostable
-                        */
-                       *tos = orig_eip + (*tos - copy_eip);
-                       goto no_change;
-               } else if (((p->ainsn.insn[1] & 0x31) == 0x20) ||       /* jmp near, absolute indirect */
-                          ((p->ainsn.insn[1] & 0x31) == 0x21)) {       /* jmp far, absolute indirect */
-                       /* eip is correct. And this is boostable */
-                       p->ainsn.boostable = 1;
-                       goto no_change;
-               }
-       default:
-               break;
-       }
-
-       if (p->ainsn.boostable == 0) {
-               if ((regs->eip > copy_eip) &&
-                   (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
-                       /*
-                        * These instructions can be executed directly if it
-                        * jumps back to correct address.
-                        */
-                       set_jmp_op((void *)regs->eip,
-                                  (void *)orig_eip + (regs->eip - copy_eip));
-                       p->ainsn.boostable = 1;
-               } else {
-                       p->ainsn.boostable = -1;
-               }
-       }
-
-       regs->eip = orig_eip + (regs->eip - copy_eip);
-
-no_change:
-       return;
-}
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled thoroughout this function.
- */
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       if (!cur)
-               return 0;
-
-       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-               kcb->kprobe_status = KPROBE_HIT_SSDONE;
-               cur->post_handler(cur, regs, 0);
-       }
-
-       resume_execution(cur, regs, kcb);
-       regs->eflags |= kcb->kprobe_saved_eflags;
-       trace_hardirqs_fixup_flags(regs->eflags);
-
-       /* Restore the original saved kprobes variables and continue. */
-       if (kcb->kprobe_status == KPROBE_REENTER) {
-               restore_previous_kprobe(kcb);
-               goto out;
-       }
-       reset_current_kprobe();
-out:
-       preempt_enable_no_resched();
-
-       /*
-        * if somebody else is singlestepping across a probe point, eflags
-        * will have TF set, in which case, continue the remaining processing
-        * of do_debug, as if this is not a probe hit.
-        */
-       if (regs->eflags & TF_MASK)
-               return 0;
-
-       return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       switch(kcb->kprobe_status) {
-       case KPROBE_HIT_SS:
-       case KPROBE_REENTER:
-               /*
-                * We are here because the instruction being single
-                * stepped caused a page fault. We reset the current
-                * kprobe and the eip points back to the probe address
-                * and allow the page fault handler to continue as a
-                * normal page fault.
-                */
-               regs->eip = (unsigned long)cur->addr;
-               regs->eflags |= kcb->kprobe_old_eflags;
-               if (kcb->kprobe_status == KPROBE_REENTER)
-                       restore_previous_kprobe(kcb);
-               else
-                       reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-       case KPROBE_HIT_ACTIVE:
-       case KPROBE_HIT_SSDONE:
-               /*
-                * We increment the nmissed count for accounting,
-                * we can also use npre/npostfault count for accounting
-                * these specific fault cases.
-                */
-               kprobes_inc_nmissed_count(cur);
-
-               /*
-                * We come here because instructions in the pre/post
-                * handler caused the page_fault, this could happen
-                * if handler tries to access user space by
-                * copy_from_user(), get_user() etc. Let the
-                * user-specified handler try to fix it first.
-                */
-               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-                       return 1;
-
-               /*
-                * In case the user-specified fault handler returned
-                * zero, try to fix up.
-                */
-               if (fixup_exception(regs))
-                       return 1;
-
-               /*
-                * fixup_exception() could not handle it,
-                * Let do_page_fault() fix it.
-                */
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/*
- * Wrapper routine for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-       struct die_args *args = (struct die_args *)data;
-       int ret = NOTIFY_DONE;
-
-       if (args->regs && user_mode_vm(args->regs))
-               return ret;
-
-       switch (val) {
-       case DIE_INT3:
-               if (kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_GPF:
-               /* kprobe_running() needs smp_processor_id() */
-               preempt_disable();
-               if (kprobe_running() &&
-                   kprobe_fault_handler(args->regs, args->trapnr))
-                       ret = NOTIFY_STOP;
-               preempt_enable();
-               break;
-       default:
-               break;
-       }
-       return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-       unsigned long addr;
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       kcb->jprobe_saved_regs = *regs;
-       kcb->jprobe_saved_esp = &regs->esp;
-       addr = (unsigned long)(kcb->jprobe_saved_esp);
-
-       /*
-        * TBD: As Linus pointed out, gcc assumes that the callee
-        * owns the argument space and could overwrite it, e.g.
-        * tailcall optimization. So, to be absolutely safe
-        * we also save and restore enough stack bytes to cover
-        * the argument area.
-        */
-       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
-                       MIN_STACK_SIZE(addr));
-       regs->eflags &= ~IF_MASK;
-       trace_hardirqs_off();
-       regs->eip = (unsigned long)(jp->entry);
-       return 1;
-}
-
-void __kprobes jprobe_return(void)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       asm volatile ("       xchgl   %%ebx,%%esp     \n"
-                     "       int3                      \n"
-                     "       .globl jprobe_return_end  \n"
-                     "       jprobe_return_end:        \n"
-                     "       nop                       \n"::"b"
-                     (kcb->jprobe_saved_esp):"memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       u8 *addr = (u8 *) (regs->eip - 1);
-       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-
-       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
-               if (&regs->esp != kcb->jprobe_saved_esp) {
-                       struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
-                       printk("current esp %p does not match saved esp %p\n",
-                              &regs->esp, kcb->jprobe_saved_esp);
-                       printk("Saved registers for jprobe %p\n", jp);
-                       show_registers(saved_regs);
-                       printk("Current registers\n");
-                       show_registers(regs);
-                       BUG();
-               }
-               *regs = kcb->jprobe_saved_regs;
-               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
-                      MIN_STACK_SIZE(stack_addr));
-               preempt_enable_no_resched();
-               return 1;
-       }
-       return 0;
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-       return 0;
-}
-
-int __init arch_init_kprobes(void)
-{
-       return 0;
-}
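Both deleted files implement the kretprobe trampoline scheme described in the comments above: arch_prepare_kretprobe() replaces the return address with kretprobe_trampoline, and the trampoline handler later restores the real return address and runs the user handler. A consumer of that mechanism registers a struct kretprobe, roughly as in this sketch (again not part of the commit; the probed symbol and names are placeholders):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Runs from the trampoline handler when the probed function returns. */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        printk(KERN_INFO "%s returned; real return address %p\n",
               ri->rp->kp.symbol_name, ri->ret_addr);
        return 0;
}

static struct kretprobe my_kretprobe = {
        .kp.symbol_name = "do_fork",    /* example target only */
        .handler        = ret_handler,
        .maxactive      = 20,           /* max instances tracked concurrently */
};

static int __init kretprobe_example_init(void)
{
        return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_example_exit(void)
{
        unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_example_init);
module_exit(kretprobe_example_exit);
MODULE_LICENSE("GPL");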
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
deleted file mode 100644 (file)
index 5df19a9..0000000
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- *  Kernel Probes (KProbes)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- *             Probes initial implementation ( includes contributions from
- *             Rusty Russell).
- * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- *             interface to access function arguments.
- * 2004-Oct    Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
- *             <prasanna@in.ibm.com> adapted for x86_64
- * 2005-Mar    Roland McGrath <roland@redhat.com>
- *             Fixed to handle %rip-relative addressing mode correctly.
- * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
- *              Added function return probes functionality
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-
-void jprobe_return_end(void);
-static void __kprobes arch_copy_kprobe(struct kprobe *p);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {
-       {"__switch_to", }, /* This function switches only current task, but
-                             doesn't switch kernel stack.*/
-       {NULL, NULL}    /* Terminator */
-};
-const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-
-/*
- * returns non-zero if opcode modifies the interrupt flag.
- */
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
-{
-       switch (*insn) {
-       case 0xfa:              /* cli */
-       case 0xfb:              /* sti */
-       case 0xcf:              /* iret/iretd */
-       case 0x9d:              /* popf/popfd */
-               return 1;
-       }
-
-       if (*insn  >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
-               return 1;
-       return 0;
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-       /* insn: must be on special executable page on x86_64. */
-       p->ainsn.insn = get_insn_slot();
-       if (!p->ainsn.insn) {
-               return -ENOMEM;
-       }
-       arch_copy_kprobe(p);
-       return 0;
-}
-
-/*
- * Determine if the instruction uses the %rip-relative addressing mode.
- * If it does, return the address of the 32-bit displacement word.
- * If not, return null.
- */
-static s32 __kprobes *is_riprel(u8 *insn)
-{
-#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
-       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
-         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
-         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
-         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
-        << (row % 64))
-       static const u64 onebyte_has_modrm[256 / 64] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
-               W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
-               W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
-               W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
-               W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
-               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
-               W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
-               W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
-               W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
-               W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
-               W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
-               W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
-               W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
-               W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
-               W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
-               W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1)  /* f0 */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-       static const u64 twobyte_has_modrm[256 / 64] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
-               W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
-               W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
-               W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
-               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
-               W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
-               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
-               W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
-               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
-               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
-               W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
-               W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
-               W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
-               W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
-               W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
-               W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* ff */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-#undef W
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
-               }
-               break;
-       }
-
-       /* Skip REX instruction prefix.  */
-       if ((*insn & 0xf0) == 0x40)
-               ++insn;
-
-       if (*insn == 0x0f) {    /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn, twobyte_has_modrm);
-       } else {                /* One-byte opcode.  */
-               need_modrm = test_bit(*insn, onebyte_has_modrm);
-       }
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       return (s32 *) ++insn;
-               }
-       }
-
-       /* No %rip-relative addressing mode here.  */
-       return NULL;
-}
-
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
-{
-       s32 *ripdisp;
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
-       ripdisp = is_riprel(p->ainsn.insn);
-       if (ripdisp) {
-               /*
-                * The copied instruction uses the %rip-relative
-                * addressing mode.  Adjust the displacement for the
-                * difference between the original location of this
-                * instruction and the location of the copy that will
-                * actually be run.  The tricky bit here is making sure
-                * that the sign extension happens correctly in this
-                * calculation, since we need a signed 32-bit result to
-                * be sign-extended to 64 bits when it's added to the
-                * %rip value and yield the same 64-bit result that the
-                * sign-extension of the original signed 32-bit
-                * displacement would have given.
-                */
-               s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
-               BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
-               *ripdisp = disp;
-       }
-       p->opcode = *p->addr;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, &p->opcode, 1);
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-       mutex_lock(&kprobe_mutex);
-       free_insn_slot(p->ainsn.insn, 0);
-       mutex_unlock(&kprobe_mutex);
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       kcb->prev_kprobe.kp = kprobe_running();
-       kcb->prev_kprobe.status = kcb->kprobe_status;
-       kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
-       kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
-       kcb->kprobe_status = kcb->prev_kprobe.status;
-       kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
-       kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
-                               struct kprobe_ctlblk *kcb)
-{
-       __get_cpu_var(current_kprobe) = p;
-       kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
-               = (regs->eflags & (TF_MASK | IF_MASK));
-       if (is_IF_modifier(p->ainsn.insn))
-               kcb->kprobe_saved_rflags &= ~IF_MASK;
-}
-
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       regs->eflags |= TF_MASK;
-       regs->eflags &= ~IF_MASK;
-       /*single step inline if the instruction is an int3*/
-       if (p->opcode == BREAKPOINT_INSTRUCTION)
-               regs->rip = (unsigned long)p->addr;
-       else
-               regs->rip = (unsigned long)p->ainsn.insn;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-                                     struct pt_regs *regs)
-{
-       unsigned long *sara = (unsigned long *)regs->rsp;
-
-       ri->ret_addr = (kprobe_opcode_t *) *sara;
-       /* Replace the return addr with trampoline addr */
-       *sara = (unsigned long) &kretprobe_trampoline;
-}
-
-int __kprobes kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *p;
-       int ret = 0;
-       kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
-       struct kprobe_ctlblk *kcb;
-
-       /*
-        * We don't want to be preempted for the entire
-        * duration of kprobe processing
-        */
-       preempt_disable();
-       kcb = get_kprobe_ctlblk();
-
-       /* Check we're not actually recursing */
-       if (kprobe_running()) {
-               p = get_kprobe(addr);
-               if (p) {
-                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
-                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
-                               regs->eflags &= ~TF_MASK;
-                               regs->eflags |= kcb->kprobe_saved_rflags;
-                               goto no_kprobe;
-                       } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
-                               /* TODO: Provide re-entrancy from
-                                * post_kprobes_handler() and avoid exception
-                                * stack corruption while single-stepping on
-                                * the instruction of the new probe.
-                                */
-                               arch_disarm_kprobe(p);
-                               regs->rip = (unsigned long)p->addr;
-                               reset_current_kprobe();
-                               ret = 1;
-                       } else {
-                               /* We have reentered the kprobe_handler(), since
-                                * another probe was hit while within the
-                                * handler. We here save the original kprobe
-                                * variables and just single step on instruction
-                                * of the new probe without calling any user
-                                * handlers.
-                                */
-                               save_previous_kprobe(kcb);
-                               set_current_kprobe(p, regs, kcb);
-                               kprobes_inc_nmissed_count(p);
-                               prepare_singlestep(p, regs);
-                               kcb->kprobe_status = KPROBE_REENTER;
-                               return 1;
-                       }
-               } else {
-                       if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /* The breakpoint instruction was removed by
-                        * another cpu right after we hit, no further
-                        * handling of this interrupt is appropriate
-                        */
-                               regs->rip = (unsigned long)addr;
-                               ret = 1;
-                               goto no_kprobe;
-                       }
-                       p = __get_cpu_var(current_kprobe);
-                       if (p->break_handler && p->break_handler(p, regs)) {
-                               goto ss_probe;
-                       }
-               }
-               goto no_kprobe;
-       }
-
-       p = get_kprobe(addr);
-       if (!p) {
-               if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /*
-                        * The breakpoint instruction was removed right
-                        * after we hit it.  Another cpu has removed
-                        * either a probepoint or a debugger breakpoint
-                        * at this address.  In either case, no further
-                        * handling of this interrupt is appropriate.
-                        * Back up over the (now missing) int3 and run
-                        * the original instruction.
-                        */
-                       regs->rip = (unsigned long)addr;
-                       ret = 1;
-               }
-               /* Not one of ours: let kernel handle it */
-               goto no_kprobe;
-       }
-
-       set_current_kprobe(p, regs, kcb);
-       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-       if (p->pre_handler && p->pre_handler(p, regs))
-               /* handler has already set things up, so skip ss setup */
-               return 1;
-
-ss_probe:
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
-       return 1;
-
-no_kprobe:
-       preempt_enable_no_resched();
-       return ret;
-}
-
-/*
- * For function-return probes, init_kprobes() establishes a probepoint
- * here. When a retprobed function returns, this probe is hit and
- * trampoline_probe_handler() runs, calling the kretprobe's handler.
- */
- void kretprobe_trampoline_holder(void)
- {
-       asm volatile (  ".global kretprobe_trampoline\n"
-                       "kretprobe_trampoline: \n"
-                       "nop\n");
- }
-
-/*
- * Called when we hit the probe point at kretprobe_trampoline
- */
-int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kretprobe_instance *ri = NULL;
-       struct hlist_head *head, empty_rp;
-       struct hlist_node *node, *tmp;
-       unsigned long flags, orig_ret_address = 0;
-       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
-
-       INIT_HLIST_HEAD(&empty_rp);
-       spin_lock_irqsave(&kretprobe_lock, flags);
-       head = kretprobe_inst_table_head(current);
-
-       /*
-        * It is possible to have multiple instances associated with a given
-        * task either because an multiple functions in the call path
-        * have a return probe installed on them, and/or more then one return
-        * return probe was registered for a target function.
-        *
-        * We can handle this because:
-        *     - instances are always inserted at the head of the list
-        *     - when multiple return probes are registered for the same
-        *       function, the first instance's ret_addr will point to the
-        *       real return address, and all the rest will point to
-        *       kretprobe_trampoline
-        */
-       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-               if (ri->task != current)
-                       /* another task is sharing our hash bucket */
-                       continue;
-
-               if (ri->rp && ri->rp->handler)
-                       ri->rp->handler(ri, regs);
-
-               orig_ret_address = (unsigned long)ri->ret_addr;
-               recycle_rp_inst(ri, &empty_rp);
-
-               if (orig_ret_address != trampoline_address)
-                       /*
-                        * This is the real return address. Any other
-                        * instances associated with this task are for
-                        * other calls deeper on the call stack
-                        */
-                       break;
-       }
-
-       kretprobe_assert(ri, orig_ret_address, trampoline_address);
-       regs->rip = orig_ret_address;
-
-       reset_current_kprobe();
-       spin_unlock_irqrestore(&kretprobe_lock, flags);
-       preempt_enable_no_resched();
-
-       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
-               hlist_del(&ri->hlist);
-               kfree(ri);
-       }
-       /*
-        * By returning a non-zero value, we are telling
-        * kprobe_handler() that we don't want the post_handler
-        * to run (and have re-enabled preemption)
-        */
-       return 1;
-}
-
-/*
- * Called after single-stepping.  p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction.  To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction.  The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt.  We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new rip is relative to the copied instruction.  We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed eflags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- */
-static void __kprobes resume_execution(struct kprobe *p,
-               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
-       unsigned long *tos = (unsigned long *)regs->rsp;
-       unsigned long copy_rip = (unsigned long)p->ainsn.insn;
-       unsigned long orig_rip = (unsigned long)p->addr;
-       kprobe_opcode_t *insn = p->ainsn.insn;
-
-       /*skip the REX prefix*/
-       if (*insn >= 0x40 && *insn <= 0x4f)
-               insn++;
-
-       regs->eflags &= ~TF_MASK;
-       switch (*insn) {
-       case 0x9c:      /* pushfl */
-               *tos &= ~(TF_MASK | IF_MASK);
-               *tos |= kcb->kprobe_old_rflags;
-               break;
-       case 0xc2:      /* iret/ret/lret */
-       case 0xc3:
-       case 0xca:
-       case 0xcb:
-       case 0xcf:
-       case 0xea:      /* jmp absolute -- ip is correct */
-               /* ip is already adjusted, no more changes required */
-               goto no_change;
-       case 0xe8:      /* call relative - Fix return addr */
-               *tos = orig_rip + (*tos - copy_rip);
-               break;
-       case 0xff:
-               if ((insn[1] & 0x30) == 0x10) {
-                       /* call absolute, indirect */
-                       /* Fix return addr; ip is correct. */
-                       *tos = orig_rip + (*tos - copy_rip);
-                       goto no_change;
-               } else if (((insn[1] & 0x31) == 0x20) ||        /* jmp near, absolute indirect */
-                          ((insn[1] & 0x31) == 0x21)) {        /* jmp far, absolute indirect */
-                       /* ip is correct. */
-                       goto no_change;
-               }
-       default:
-               break;
-       }
-
-       regs->rip = orig_rip + (regs->rip - copy_rip);
-no_change:
-
-       return;
-}
-
-int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       if (!cur)
-               return 0;
-
-       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-               kcb->kprobe_status = KPROBE_HIT_SSDONE;
-               cur->post_handler(cur, regs, 0);
-       }
-
-       resume_execution(cur, regs, kcb);
-       regs->eflags |= kcb->kprobe_saved_rflags;
-       trace_hardirqs_fixup_flags(regs->eflags);
-
-       /* Restore the original saved kprobes variables and continue. */
-       if (kcb->kprobe_status == KPROBE_REENTER) {
-               restore_previous_kprobe(kcb);
-               goto out;
-       }
-       reset_current_kprobe();
-out:
-       preempt_enable_no_resched();
-
-       /*
-        * if somebody else is singlestepping across a probe point, eflags
-        * will have TF set, in which case, continue the remaining processing
-        * of do_debug, as if this is not a probe hit.
-        */
-       if (regs->eflags & TF_MASK)
-               return 0;
-
-       return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       const struct exception_table_entry *fixup;
-
-       switch(kcb->kprobe_status) {
-       case KPROBE_HIT_SS:
-       case KPROBE_REENTER:
-               /*
-                * We are here because the instruction being single
-                * stepped caused a page fault. We reset the current
-                * kprobe and the rip points back to the probe address
-                * and allow the page fault handler to continue as a
-                * normal page fault.
-                */
-               regs->rip = (unsigned long)cur->addr;
-               regs->eflags |= kcb->kprobe_old_rflags;
-               if (kcb->kprobe_status == KPROBE_REENTER)
-                       restore_previous_kprobe(kcb);
-               else
-                       reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-       case KPROBE_HIT_ACTIVE:
-       case KPROBE_HIT_SSDONE:
-               /*
-                * We increment the nmissed count for accounting,
-                * we can also use npre/npostfault count for accounting
-                * these specific fault cases.
-                */
-               kprobes_inc_nmissed_count(cur);
-
-               /*
-                * We come here because instructions in the pre/post
-                * handler caused the page_fault, this could happen
-                * if handler tries to access user space by
-                * copy_from_user(), get_user() etc. Let the
-                * user-specified handler try to fix it first.
-                */
-               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-                       return 1;
-
-               /*
-                * In case the user-specified fault handler returned
-                * zero, try to fix up.
-                */
-               fixup = search_exception_tables(regs->rip);
-               if (fixup) {
-                       regs->rip = fixup->fixup;
-                       return 1;
-               }
-
-               /*
-                * fixup() could not handle it,
-                * Let do_page_fault() fix it.
-                */
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/*
- * Wrapper routine for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-       struct die_args *args = (struct die_args *)data;
-       int ret = NOTIFY_DONE;
-
-       if (args->regs && user_mode(args->regs))
-               return ret;
-
-       switch (val) {
-       case DIE_INT3:
-               if (kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_GPF:
-               /* kprobe_running() needs smp_processor_id() */
-               preempt_disable();
-               if (kprobe_running() &&
-                   kprobe_fault_handler(args->regs, args->trapnr))
-                       ret = NOTIFY_STOP;
-               preempt_enable();
-               break;
-       default:
-               break;
-       }
-       return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-       unsigned long addr;
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       kcb->jprobe_saved_regs = *regs;
-       kcb->jprobe_saved_rsp = (long *) regs->rsp;
-       addr = (unsigned long)(kcb->jprobe_saved_rsp);
-       /*
-        * As Linus pointed out, gcc assumes that the callee
-        * owns the argument space and could overwrite it, e.g.
-        * tailcall optimization. So, to be absolutely safe
-        * we also save and restore enough stack bytes to cover
-        * the argument area.
-        */
-       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
-                       MIN_STACK_SIZE(addr));
-       regs->eflags &= ~IF_MASK;
-       trace_hardirqs_off();
-       regs->rip = (unsigned long)(jp->entry);
-       return 1;
-}
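
The comment about gcc treating the argument area as callee-owned is why the handler saves a bounded slice of the stack before redirecting rip and restores it on the way back. A user-space toy of that save/clobber/restore sequence, where the 64-byte bound is an arbitrary stand-in for MIN_STACK_SIZE(), might look like:

#include <stdio.h>
#include <string.h>

#define FAKE_STACK_BYTES  256
#define SAVE_BYTES         64   /* arbitrary stand-in for MIN_STACK_SIZE(addr) */

static unsigned char fake_stack[FAKE_STACK_BYTES];
static unsigned char saved[SAVE_BYTES];

int main(void)
{
        unsigned char *sp = fake_stack + 128;   /* pretend stack pointer */

        memset(fake_stack, 0xaa, sizeof(fake_stack));

        /* 1. Before jumping to the handler, save the bytes it may clobber. */
        memcpy(saved, sp, SAVE_BYTES);

        /* 2. The handler runs and scribbles over the argument area. */
        memset(sp, 0x55, SAVE_BYTES);

        /* 3. On the way back, restore the original bytes before resuming. */
        memcpy(sp, saved, SAVE_BYTES);

        printf("restored byte: 0x%02x\n", sp[0]);  /* prints 0xaa */
        return 0;
}
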
-
-void __kprobes jprobe_return(void)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       asm volatile ("       xchg   %%rbx,%%rsp     \n"
-                     "       int3                      \n"
-                     "       .globl jprobe_return_end  \n"
-                     "       jprobe_return_end:        \n"
-                     "       nop                       \n"::"b"
-                     (kcb->jprobe_saved_rsp):"memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       u8 *addr = (u8 *) (regs->rip - 1);
-       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-
-       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
-               if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) {
-                       struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
-                       printk("current rsp %p does not match saved rsp %p\n",
-                              (long *)regs->rsp, kcb->jprobe_saved_rsp);
-                       printk("Saved registers for jprobe %p\n", jp);
-                       show_registers(saved_regs);
-                       printk("Current registers\n");
-                       show_registers(regs);
-                       BUG();
-               }
-               *regs = kcb->jprobe_saved_regs;
-               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
-                      MIN_STACK_SIZE(stack_addr));
-               preempt_enable_no_resched();
-               return 1;
-       }
-       return 0;
-}
-
-static struct kprobe trampoline_p = {
-       .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
-       .pre_handler = trampoline_probe_handler
-};
-
-int __init arch_init_kprobes(void)
-{
-       return register_kprobe(&trampoline_p);
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-       if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
-               return 1;
-
-       return 0;
-}
similarity index 61%
rename from arch/x86/kernel/ldt_32.c
rename to arch/x86/kernel/ldt.c
index 9ff90a27c45f75eee7423f5caaf66a2348f4d5a8..8a7660c8394a5a2ae2362e6d3d4ec032d5def37e 100644 (file)
@@ -1,6 +1,9 @@
 /*
  * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
  * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
  */
 
 #include <linux/errno.h>
@@ -9,7 +12,6 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
-#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -17,7 +19,7 @@
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 
-#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
+#ifdef CONFIG_SMP
 static void flush_ldt(void *null)
 {
        if (current->active_mm)
@@ -27,26 +29,31 @@ static void flush_ldt(void *null)
 
 static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 {
-       void *oldldt;
-       void *newldt;
+       void *oldldt, *newldt;
        int oldsize;
 
        if (mincount <= pc->size)
                return 0;
        oldsize = pc->size;
-       mincount = (mincount+511)&(~511);
-       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+       mincount = (mincount + 511) & (~511);
+       if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+               newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
        else
-               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+               newldt = (void *)__get_free_page(GFP_KERNEL);
 
        if (!newldt)
                return -ENOMEM;
 
        if (oldsize)
-               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+               memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
        oldldt = pc->ldt;
-       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+       memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+              (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+       /* CHECKME: Do we really need this ? */
+       wmb();
+#endif
        pc->ldt = newldt;
        wmb();
        pc->size = mincount;
@@ -55,6 +62,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
        if (reload) {
 #ifdef CONFIG_SMP
                cpumask_t mask;
+
                preempt_disable();
                load_LDT(pc);
                mask = cpumask_of_cpu(smp_processor_id());
@@ -66,10 +74,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #endif
        }
        if (oldsize) {
-               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+               if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
                        vfree(oldldt);
                else
-                       kfree(oldldt);
+                       put_page(virt_to_page(oldldt));
        }
        return 0;
 }
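
The interesting part of the hunk above is the allocation switch: LDTs that fit in one page now come straight from the page allocator instead of kmalloc(), larger ones from vmalloc(), and the free path must match (put_page() versus vfree()). A rough user-space analogue of that size-based switch, with malloc() and mmap() standing in for the two kernel allocators, is sketched below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* User-space analogue of the switch in alloc_ldt(): small tables come from a
 * single "page" (malloc here), large ones from a vmalloc-like mapping (mmap
 * here), and the free path must use the matching call. */
static void *table_alloc(size_t bytes, size_t page_size)
{
        if (bytes > page_size) {
                void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                return p == MAP_FAILED ? NULL : p;
        }
        return malloc(bytes);
}

static void table_free(void *p, size_t bytes, size_t page_size)
{
        if (bytes > page_size)
                munmap(p, bytes);
        else
                free(p);
}

int main(void)
{
        size_t page = (size_t)sysconf(_SC_PAGESIZE);
        void *small = table_alloc(512, page);
        void *big   = table_alloc(4 * page, page);

        if (small && big) {
                memset(small, 0, 512);
                memset(big, 0, 4 * page);
                printf("small=%p big=%p\n", small, big);
        }
        table_free(small, 512, page);
        table_free(big, 4 * page, page);
        return 0;
}
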
@@ -77,9 +85,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
 {
        int err = alloc_ldt(new, old->size, 0);
+
        if (err < 0)
                return err;
-       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+       memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
        return 0;
 }
 
@@ -89,7 +98,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
  */
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
-       struct mm_struct * old_mm;
+       struct mm_struct *old_mm;
        int retval = 0;
 
        mutex_init(&mm->context.lock);
@@ -105,33 +114,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 
 /*
  * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
  */
 void destroy_context(struct mm_struct *mm)
 {
        if (mm->context.size) {
+#ifdef CONFIG_X86_32
+               /* CHECKME: Can this ever happen ? */
                if (mm == current->active_mm)
                        clear_LDT();
-               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+#endif
+               if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
                        vfree(mm->context.ldt);
                else
-                       kfree(mm->context.ldt);
+                       put_page(virt_to_page(mm->context.ldt));
                mm->context.size = 0;
        }
 }
 
-static int read_ldt(void __user * ptr, unsigned long bytecount)
+static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
        int err;
        unsigned long size;
-       struct mm_struct * mm = current->mm;
+       struct mm_struct *mm = current->mm;
 
        if (!mm->context.size)
                return 0;
-       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+       if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+               bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
 
        mutex_lock(&mm->context.lock);
-       size = mm->context.size*LDT_ENTRY_SIZE;
+       size = mm->context.size * LDT_ENTRY_SIZE;
        if (size > bytecount)
                size = bytecount;
 
@@ -143,7 +157,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
                goto error_return;
        if (size != bytecount) {
                /* zero-fill the rest */
-               if (clear_user(ptr+size, bytecount-size) != 0) {
+               if (clear_user(ptr + size, bytecount - size) != 0) {
                        err = -EFAULT;
                        goto error_return;
                }
@@ -153,34 +167,32 @@ error_return:
        return err;
 }
 
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
 {
-       int err;
-       unsigned long size;
-
-       err = 0;
-       size = 5*sizeof(struct desc_struct);
-       if (size > bytecount)
-               size = bytecount;
-
-       err = size;
-       if (clear_user(ptr, size))
-               err = -EFAULT;
-
-       return err;
+       /* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+       unsigned long size = 5 * sizeof(struct desc_struct);
+#else
+       unsigned long size = 128;
+#endif
+       if (bytecount > size)
+               bytecount = size;
+       if (clear_user(ptr, bytecount))
+               return -EFAULT;
+       return bytecount;
 }
 
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 {
-       struct mm_struct * mm = current->mm;
-       __u32 entry_1, entry_2;
+       struct mm_struct *mm = current->mm;
+       struct desc_struct ldt;
        int error;
        struct user_desc ldt_info;
 
        error = -EINVAL;
        if (bytecount != sizeof(ldt_info))
                goto out;
-       error = -EFAULT;        
+       error = -EFAULT;
        if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
                goto out;
 
@@ -196,28 +208,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
 
        mutex_lock(&mm->context.lock);
        if (ldt_info.entry_number >= mm->context.size) {
-               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+               error = alloc_ldt(&current->mm->context,
+                                 ldt_info.entry_number + 1, 1);
                if (error < 0)
                        goto out_unlock;
        }
 
-       /* Allow LDTs to be cleared by the user. */
-       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+       /* Allow LDTs to be cleared by the user. */
+       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
                if (oldmode || LDT_empty(&ldt_info)) {
-                       entry_1 = 0;
-                       entry_2 = 0;
+                       memset(&ldt, 0, sizeof(ldt));
                        goto install;
                }
        }
 
-       entry_1 = LDT_entry_a(&ldt_info);
-       entry_2 = LDT_entry_b(&ldt_info);
+       fill_ldt(&ldt, &ldt_info);
        if (oldmode)
-               entry_2 &= ~(1 << 20);
+               ldt.avl = 0;
 
        /* Install the new entry ...  */
 install:
-       write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
+       write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
        error = 0;
 
 out_unlock:
@@ -226,7 +237,8 @@ out:
        return error;
 }
 
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+                             unsigned long bytecount)
 {
        int ret = -ENOSYS;
 
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
deleted file mode 100644 (file)
index 60e57ab..0000000
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2002 Andi Kleen
- * 
- * This handles calls from both 32bit and 64bit mode.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-
-#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
-static void flush_ldt(void *null)
-{
-       if (current->active_mm)
-               load_LDT(&current->active_mm->context);
-}
-#endif
-
-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
-{
-       void *oldldt;
-       void *newldt;
-       unsigned oldsize;
-
-       if (mincount <= (unsigned)pc->size)
-               return 0;
-       oldsize = pc->size;
-       mincount = (mincount+511)&(~511);
-       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
-       else
-               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
-
-       if (!newldt)
-               return -ENOMEM;
-
-       if (oldsize)
-               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
-       oldldt = pc->ldt;
-       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
-       wmb();
-       pc->ldt = newldt;
-       wmb();
-       pc->size = mincount;
-       wmb();
-       if (reload) {
-#ifdef CONFIG_SMP
-               cpumask_t mask;
-
-               preempt_disable();
-               mask = cpumask_of_cpu(smp_processor_id());
-               load_LDT(pc);
-               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-                       smp_call_function(flush_ldt, NULL, 1, 1);
-               preempt_enable();
-#else
-               load_LDT(pc);
-#endif
-       }
-       if (oldsize) {
-               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
-                       vfree(oldldt);
-               else
-                       kfree(oldldt);
-       }
-       return 0;
-}
-
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-       int err = alloc_ldt(new, old->size, 0);
-       if (err < 0)
-               return err;
-       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
-       return 0;
-}
-
-/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       struct mm_struct * old_mm;
-       int retval = 0;
-
-       mutex_init(&mm->context.lock);
-       mm->context.size = 0;
-       old_mm = current->mm;
-       if (old_mm && old_mm->context.size > 0) {
-               mutex_lock(&old_mm->context.lock);
-               retval = copy_ldt(&mm->context, &old_mm->context);
-               mutex_unlock(&old_mm->context.lock);
-       }
-       return retval;
-}
-
-/*
- * 
- * Don't touch the LDT register - we're already in the next thread.
- */
-void destroy_context(struct mm_struct *mm)
-{
-       if (mm->context.size) {
-               if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
-                       vfree(mm->context.ldt);
-               else
-                       kfree(mm->context.ldt);
-               mm->context.size = 0;
-       }
-}
-
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
-       int err;
-       unsigned long size;
-       struct mm_struct * mm = current->mm;
-
-       if (!mm->context.size)
-               return 0;
-       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
-
-       mutex_lock(&mm->context.lock);
-       size = mm->context.size*LDT_ENTRY_SIZE;
-       if (size > bytecount)
-               size = bytecount;
-
-       err = 0;
-       if (copy_to_user(ptr, mm->context.ldt, size))
-               err = -EFAULT;
-       mutex_unlock(&mm->context.lock);
-       if (err < 0)
-               goto error_return;
-       if (size != bytecount) {
-               /* zero-fill the rest */
-               if (clear_user(ptr+size, bytecount-size) != 0) {
-                       err = -EFAULT;
-                       goto error_return;
-               }
-       }
-       return bytecount;
-error_return:
-       return err;
-}
-
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
-       /* Arbitrary number */ 
-       /* x86-64 default LDT is all zeros */
-       if (bytecount > 128) 
-               bytecount = 128;        
-       if (clear_user(ptr, bytecount))
-               return -EFAULT;
-       return bytecount; 
-}
-
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
-{
-       struct task_struct *me = current;
-       struct mm_struct * mm = me->mm;
-       __u32 entry_1, entry_2, *lp;
-       int error;
-       struct user_desc ldt_info;
-
-       error = -EINVAL;
-
-       if (bytecount != sizeof(ldt_info))
-               goto out;
-       error = -EFAULT;        
-       if (copy_from_user(&ldt_info, ptr, bytecount))
-               goto out;
-
-       error = -EINVAL;
-       if (ldt_info.entry_number >= LDT_ENTRIES)
-               goto out;
-       if (ldt_info.contents == 3) {
-               if (oldmode)
-                       goto out;
-               if (ldt_info.seg_not_present == 0)
-                       goto out;
-       }
-
-       mutex_lock(&mm->context.lock);
-       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
-               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
-               if (error < 0)
-                       goto out_unlock;
-       }
-
-       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
-
-       /* Allow LDTs to be cleared by the user. */
-       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-               if (oldmode || LDT_empty(&ldt_info)) {
-                       entry_1 = 0;
-                       entry_2 = 0;
-                       goto install;
-               }
-       }
-
-       entry_1 = LDT_entry_a(&ldt_info);
-       entry_2 = LDT_entry_b(&ldt_info);
-       if (oldmode)
-               entry_2 &= ~(1 << 20);
-
-       /* Install the new entry ...  */
-install:
-       *lp     = entry_1;
-       *(lp+1) = entry_2;
-       error = 0;
-
-out_unlock:
-       mutex_unlock(&mm->context.lock);
-out:
-       return error;
-}
-
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
-{
-       int ret = -ENOSYS;
-
-       switch (func) {
-       case 0:
-               ret = read_ldt(ptr, bytecount);
-               break;
-       case 1:
-               ret = write_ldt(ptr, bytecount, 1);
-               break;
-       case 2:
-               ret = read_default_ldt(ptr, bytecount);
-               break;
-       case 0x11:
-               ret = write_ldt(ptr, bytecount, 0);
-               break;
-       }
-       return ret;
-}
index 11b935f4f886d34679dcf594ac8465ab2a57fe00..c1cfd60639d435f2dbe7358570c925e6a2395100 100644 (file)
@@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED;
 
 static void set_idt(void *newidt, __u16 limit)
 {
-       struct Xgt_desc_struct curidt;
+       struct desc_ptr curidt;
 
        /* ia32 supports unaligned loads & stores */
        curidt.size    = limit;
@@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit)
 
 static void set_gdt(void *newgdt, __u16 limit)
 {
-       struct Xgt_desc_struct curgdt;
+       struct desc_ptr curgdt;
 
        /* ia32 supports unaligned loads & stores */
        curgdt.size    = limit;
index aa3d2c8f7737851562923d37105ad7f719b8aa0b..a1fef42f8cdbccbd1664cc659c07781a2b82450a 100644 (file)
@@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 void arch_crash_save_vmcoreinfo(void)
 {
        VMCOREINFO_SYMBOL(init_level4_pgt);
-
-#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE
-       VMCOREINFO_SYMBOL(node_data);
-       VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
 }
 
index 3960ab7e149773aa1c3dd4b7f3160cf9b76d944b..219f86eb612301766771651eb56cb1e8f98189ac 100644 (file)
@@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s)
 }
 __setup("nomfgpt", mfgpt_disable);
 
+/* Reset the MFGPT timers. This is required by some broken BIOSes which already
+ * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
+ * affected at least (0.99 is OK with MFGPT workaround left to off).
+ */
+static int __init mfgpt_fix(char *s)
+{
+       u32 val, dummy;
+
+       /* The following undocumented bit resets the MFGPT timers */
+       val = 0xFF; dummy = 0;
+       wrmsr(0x5140002B, val, dummy);
+       return 1;
+}
+__setup("mfgptfix", mfgpt_fix);
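
The fix above is armed from the kernel command line with "mfgptfix", and the wrmsr() pokes a Geode-specific VSA register. Purely as a debugging illustration, the same write could probably be replayed from user space through the msr driver (an 8-byte write to /dev/cpu/0/msr at the MSR address); this is an assumption-laden sketch rather than a recommended procedure, since poking vendor MSRs by hand can wedge the machine, and the in-kernel boot parameter is the supported route.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Debug-only sketch: replay the MFGPT reset write through the msr driver.
 * Assumes the msr device exists and we run as root; the value mirrors the
 * wrmsr() above (EAX = 0xFF, EDX = 0). */
int main(void)
{
        uint64_t val = 0xFF;
        int fd = open("/dev/cpu/0/msr", O_WRONLY);

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        if (pwrite(fd, &val, sizeof(val), 0x5140002B) != (ssize_t)sizeof(val)) {
                perror("pwrite");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}
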
+
 /*
  * Check whether any MFGPTs are available for the kernel to use.  In most
  * cases, firmware that uses AMD's VSA code will claim all timers during
index 40cfd5488719b96fd6ccd8d3a890e54761d0c6c0..6ff447f9fda702398aaaaef9aa8f93143c56a12d 100644 (file)
@@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc)
                return 0;
        /* check extended signature checksum */
        for (i = 0; i < ext_sigcount; i++) {
-               ext_sig = (struct extended_signature *)((void *)ext_header
-                       + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+               ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+                         EXT_SIGNATURE_SIZE * i;
                sum = orig_sum
                        - (mc_header->sig + mc_header->pf + mc_header->cksum)
                        + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
@@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu)
        if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
                return 0;
 
-       ext_header = (struct extended_sigtable *)(mc +
-                       get_datasize(mc_header) + MC_HEADER_SIZE);
+       ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
        ext_sigcount = ext_header->count;
-       ext_sig = (struct extended_signature *)((void *)ext_header
-                       + EXT_HEADER_SIZE);
+       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
        for (i = 0; i < ext_sigcount; i++) {
                if (microcode_update_match(cpu, mc_header,
                                ext_sig->sig, ext_sig->pf))
@@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu)
                pr_debug("ucode data file %s load failed\n", name);
                return error;
        }
-       buf = (void *)firmware->data;
+       buf = firmware->data;
        size = firmware->size;
        while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
                        > 0) {
index 7a05a7f6099a4ee31b7bb72e6c11c956282588da..67009cdd5eca194bbf60fb9a7209a1b94550c23b 100644 (file)
@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-unsigned int __cpuinitdata num_processors;
+unsigned int num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
        if (!(m->mpc_flags & MPC_APIC_USABLE))
                return;
 
-       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
+       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
                m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
        if (nr_ioapics >= MAX_IO_APICS) {
                printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
@@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 
        mps_oem_check(mpc, oem, str);
 
-       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+       printk("APIC at: 0x%X\n", mpc->mpc_lapic);
 
-       /* 
+       /*
         * Save the local APIC address (it might be non-default) -- but only
         * if we're not using ACPI.
         */
@@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
        unsigned long *bp = phys_to_virt(base);
        struct intel_mp_floating *mpf;
 
-       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+       printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
        if (sizeof(*mpf) != 16)
                printk("Error: MPF size\n");
 
@@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
                                || (mpf->mpf_specification == 4)) ) {
 
                        smp_found_config = 1;
-                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
-                                               virt_to_phys(mpf));
+                       printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+                               mpf, virt_to_phys(mpf));
                        reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
                        if (mpf->mpf_physptr) {
                                /*
@@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
         */
        mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
        mp_ioapic_routing[idx].gsi_base = gsi_base;
-       mp_ioapic_routing[idx].gsi_end = gsi_base + 
+       mp_ioapic_routing[idx].gsi_end = gsi_base +
                io_apic_get_redir_entries(idx);
 
-       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
-               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
-               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-               mp_ioapic_routing[idx].gsi_base,
-               mp_ioapic_routing[idx].gsi_end);
+       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+              "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
+              mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+              mp_ioapic_routing[idx].gsi_base,
+              mp_ioapic_routing[idx].gsi_end);
 }
 
 void __init
@@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void)
 }
 
 #define MAX_GSI_NUM    4096
+#define IRQ_COMPRESSION_START  64
 
 int mp_register_gsi(u32 gsi, int triggering, int polarity)
 {
        int ioapic = -1;
        int ioapic_pin = 0;
        int idx, bit = 0;
-       static int pci_irq = 16;
+       static int pci_irq = IRQ_COMPRESSION_START;
        /*
-        * Mapping between Global System Interrups, which
+        * Mapping between Global System Interrupts, which
         * represent all possible interrupts, and IRQs
         * assigned to actual devices.
         */
@@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
        if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
                Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
                        mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-               return gsi_to_irq[gsi];
+               return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
        }
 
        mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
 
-       if (triggering == ACPI_LEVEL_SENSITIVE) {
+       /*
+        * For GSI >= 64, use IRQ compression
+        */
+       if ((gsi >= IRQ_COMPRESSION_START)
+               && (triggering == ACPI_LEVEL_SENSITIVE)) {
                /*
                 * For PCI devices assign IRQs in order, avoiding gaps
                 * due to unused I/O APIC pins.
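
In short, the hunk above keeps the identity GSI-to-IRQ mapping for GSIs below 64 and funnels only level-triggered GSIs at or above IRQ_COMPRESSION_START through the sequentially assigned PCI IRQ range. A deliberately simplified toy model of that policy (it ignores the real function's per-pin bookkeeping) follows.

#include <stdio.h>

#define MAX_GSI_NUM            4096
#define IRQ_COMPRESSION_START  64

/* Toy model of the GSI->IRQ policy in the hunk above: small GSIs map to
 * themselves, level-triggered GSIs >= 64 get the next compressed IRQ, and a
 * GSI that was seen before keeps its earlier assignment. */
static int gsi_to_irq[MAX_GSI_NUM];
static int next_pci_irq = IRQ_COMPRESSION_START;

static int register_gsi(int gsi, int level_triggered)
{
        if (gsi < IRQ_COMPRESSION_START || !level_triggered)
                return gsi;                     /* identity mapping */
        if (gsi_to_irq[gsi])
                return gsi_to_irq[gsi];         /* already programmed */
        gsi_to_irq[gsi] = next_pci_irq++;
        return gsi_to_irq[gsi];
}

int main(void)
{
        printf("GSI  20 -> IRQ %d\n", register_gsi(20, 1));
        printf("GSI 100 -> IRQ %d\n", register_gsi(100, 1));
        printf("GSI 200 -> IRQ %d\n", register_gsi(200, 1));
        printf("GSI 100 -> IRQ %d\n", register_gsi(100, 1));  /* stable */
        return 0;
}
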
index ef4aab123581a54ac17019d4441138a90b36b980..72ab1403fed7b8d471023d6632d64d8c6e67ad41 100644 (file)
@@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U;
 EXPORT_SYMBOL(boot_cpu_id);
 
 /* Internal processor count */
-unsigned int num_processors __cpuinitdata = 0;
+unsigned int num_processors;
 
 unsigned disabled_cpus __cpuinitdata;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
 
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
+                               = { [0 ... NR_CPUS-1] = BAD_APICID };
+void *x86_bios_cpu_apicid_early_ptr;
+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 
 /*
@@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
        physid_set(m->mpc_apicid, phys_cpu_present_map);
        if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
                /*
-                * bios_cpu_apicid is required to have processors listed
+                * x86_bios_cpu_apicid is required to have processors listed
                 * in same order as logical cpu numbers. Hence the first
                 * entry is BSP, and so on.
                 */
                cpu = 0;
        }
-       bios_cpu_apicid[cpu] = m->mpc_apicid;
-       /*
-        * We get called early in the the start_kernel initialization
-        * process when the per_cpu data area is not yet setup, so we
-        * use a static array that is removed after the per_cpu data
-        * area is created.
-        */
-       if (x86_cpu_to_apicid_ptr) {
-               u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
-               x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+       /* are we being called early in kernel startup? */
+       if (x86_cpu_to_apicid_early_ptr) {
+               u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
+               u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+
+               cpu_to_apicid[cpu] = m->mpc_apicid;
+               bios_cpu_apicid[cpu] = m->mpc_apicid;
        } else {
                per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
+               per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
        }
 
        cpu_set(cpu, cpu_possible_map);
index 4f4bfd3a88b6f89bb6bf080bb1e69787e320f2e7..edd413650b3b06954b65ab05664ae0d6f8e3a3e5 100644 (file)
@@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
 static int endflag __initdata = 0;
 
+#ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
  * CPUs during the test make them busy.
  */
 static __init void nmi_cpu_busy(void *data)
 {
-#ifdef CONFIG_SMP
        local_irq_enable_in_hardirq();
        /* Intentionally don't use cpu_relax here. This is
           to make sure that the performance counter really ticks,
@@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
           care if they get somewhat less cycles. */
        while (endflag == 0)
                mb();
-#endif
 }
+#endif
 
 static int __init check_nmi_watchdog(void)
 {
@@ -87,11 +87,13 @@ static int __init check_nmi_watchdog(void)
 
        printk(KERN_INFO "Testing NMI watchdog ... ");
 
+#ifdef CONFIG_SMP
        if (nmi_watchdog == NMI_LOCAL_APIC)
                smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+#endif
 
        for_each_possible_cpu(cpu)
-               prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
+               prev_nmi_count[cpu] = nmi_count(cpu);
        local_irq_enable();
        mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
@@ -237,10 +239,10 @@ void acpi_nmi_disable(void)
                on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
 }
 
-void setup_apic_nmi_watchdog (void *unused)
+void setup_apic_nmi_watchdog(void *unused)
 {
        if (__get_cpu_var(wd_enabled))
-               return;
+               return;
 
        /* cheap hack to support suspend/resume */
        /* if cpu0 is not active neither should the other cpus */
@@ -329,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
        unsigned int sum;
        int touched = 0;
        int cpu = smp_processor_id();
-       int rc=0;
+       int rc = 0;
 
        /* check for other users first */
        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
index c3d1476b6a1170673b6ab03a991c320f70af8ec8..fb99484d21cf6db94f7349d466a124515266b858 100644 (file)
@@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
  *  0: the lapic NMI watchdog is disabled, but can be enabled
  */
 atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
-int panic_on_timeout;
+static int panic_on_timeout;
 
 unsigned int nmi_watchdog = NMI_DEFAULT;
 static unsigned int nmi_hz = HZ;
@@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-int __init check_nmi_watchdog (void)
+int __init check_nmi_watchdog(void)
 {
-       int *counts;
+       int *prev_nmi_count;
        int cpu;
 
-       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 
+       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
                return 0;
 
        if (!atomic_read(&nmi_active))
                return 0;
 
-       counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
-       if (!counts)
+       prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+       if (!prev_nmi_count)
                return -1;
 
-       printk(KERN_INFO "testing NMI watchdog ... ");
+       printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
        if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void)
 #endif
 
        for (cpu = 0; cpu < NR_CPUS; cpu++)
-               counts[cpu] = cpu_pda(cpu)->__nmi_count;
+               prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
        local_irq_enable();
        mdelay((20*1000)/nmi_hz); // wait 20 ticks
 
        for_each_online_cpu(cpu) {
                if (!per_cpu(wd_enabled, cpu))
                        continue;
-               if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
+               if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
                        printk(KERN_WARNING "WARNING: CPU#%d: NMI "
                               "appears to be stuck (%d->%d)!\n",
-                              cpu,
-                              counts[cpu],
-                              cpu_pda(cpu)->__nmi_count);
+                               cpu,
+                               prev_nmi_count[cpu],
+                               cpu_pda(cpu)->__nmi_count);
                        per_cpu(wd_enabled, cpu) = 0;
                        atomic_dec(&nmi_active);
                }
        }
+       endflag = 1;
        if (!atomic_read(&nmi_active)) {
-               kfree(counts);
+               kfree(prev_nmi_count);
                atomic_set(&nmi_active, -1);
-               endflag = 1;
                return -1;
        }
-       endflag = 1;
        printk("OK.\n");
 
        /* now that we know it works we can reduce NMI frequency to
@@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void)
        if (nmi_watchdog == NMI_LOCAL_APIC)
                nmi_hz = lapic_adjust_nmi_hz(1);
 
-       kfree(counts);
+       kfree(prev_nmi_count);
        return 0;
 }
 
-int __init setup_nmi_watchdog(char *str)
+static int __init setup_nmi_watchdog(char *str)
 {
        int nmi;
 
@@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
-
-static void __acpi_nmi_disable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_enable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
 #ifdef CONFIG_PM
 
 static int nmi_pm_active; /* nmi_active before suspend */
@@ -217,7 +188,7 @@ static struct sysdev_class nmi_sysclass = {
 };
 
 static struct sys_device device_lapic_nmi = {
-       .id             = 0,
+       .id     = 0,
        .cls    = &nmi_sysclass,
 };
 
@@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void)
        if (nmi_watchdog != NMI_LOCAL_APIC)
                return 0;
 
-       if ( atomic_read(&nmi_active) < 0 )
+       if (atomic_read(&nmi_active) < 0)
                return 0;
 
        error = sysdev_class_register(&nmi_sysclass);
@@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs);
 
 #endif /* CONFIG_PM */
 
+static void __acpi_nmi_enable(void *__unused)
+{
+       apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
+{
+       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+               on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+
+static void __acpi_nmi_disable(void *__unused)
+{
+       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
+{
+       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+               on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
 void setup_apic_nmi_watchdog(void *unused)
 {
-       if (__get_cpu_var(wd_enabled) == 1)
+       if (__get_cpu_var(wd_enabled))
                return;
 
        /* cheap hack to support suspend/resume */
@@ -311,8 +310,9 @@ void touch_nmi_watchdog(void)
                }
        }
 
-       touch_softlockup_watchdog();
+       touch_softlockup_watchdog();
 }
+EXPORT_SYMBOL(touch_nmi_watchdog);
 
 int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
@@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void)
 
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(touch_nmi_watchdog);
index 9000d82c6dc0a784de020d1723ae75522bee9082..e65281b1634b790bd151c479fc96f3846febe87d 100644 (file)
@@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void)
 {
        if (num_online_nodes() > 1) {
                printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-               tsc_disable = 1;
+               setup_clear_cpu_cap(X86_FEATURE_TSC);
        }
        return 0;
 }
similarity index 83%
rename from arch/x86/kernel/paravirt_32.c
rename to arch/x86/kernel/paravirt.c
index f5000799f8efe4815caff23625eeee66c8d7c0bf..075962cc75ab60a3bc37823644f1050c64a3b9bf 100644 (file)
     You should have received a copy of the GNU General Public License
     along with this program; if not, write to the Free Software
     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
 */
+
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/efi.h>
@@ -55,59 +58,9 @@ char *memory_setup(void)
        extern const char start_##ops##_##name[], end_##ops##_##name[]; \
        asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
 
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
-DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
-
 /* Undefined instruction for dealing with missing ops pointers. */
 static const unsigned char ud2a[] = { 0x0f, 0x0b };
 
-static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
-                            unsigned long addr, unsigned len)
-{
-       const unsigned char *start, *end;
-       unsigned ret;
-
-       switch(type) {
-#define SITE(ops, x)                                           \
-       case PARAVIRT_PATCH(ops.x):                             \
-               start = start_##ops##_##x;                      \
-               end = end_##ops##_##x;                          \
-               goto patch_site
-
-       SITE(pv_irq_ops, irq_disable);
-       SITE(pv_irq_ops, irq_enable);
-       SITE(pv_irq_ops, restore_fl);
-       SITE(pv_irq_ops, save_fl);
-       SITE(pv_cpu_ops, iret);
-       SITE(pv_cpu_ops, irq_enable_sysexit);
-       SITE(pv_mmu_ops, read_cr2);
-       SITE(pv_mmu_ops, read_cr3);
-       SITE(pv_mmu_ops, write_cr3);
-       SITE(pv_cpu_ops, clts);
-       SITE(pv_cpu_ops, read_tsc);
-#undef SITE
-
-       patch_site:
-               ret = paravirt_patch_insns(ibuf, len, start, end);
-               break;
-
-       default:
-               ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
-               break;
-       }
-
-       return ret;
-}
-
 unsigned paravirt_patch_nop(void)
 {
        return 0;
@@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
                /* If the operation is a nop, then nop the callsite */
                ret = paravirt_patch_nop();
        else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
-                type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
+                type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
                /* If operation requires a jmp, then jmp */
                ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
        else
@@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr)
 
 /* These are in entry.S */
 extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
+extern void native_irq_enable_syscall_ret(void);
 
 static int __init print_banner(void)
 {
@@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-       BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
        BUG_ON(preemptible());
 
-       x86_write_percpu(paravirt_lazy_mode, mode);
+       __get_cpu_var(paravirt_lazy_mode) = mode;
 }
 
 void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
 {
-       BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
+       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
        BUG_ON(preemptible());
 
-       x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+       __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void)
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 {
-       return x86_read_percpu(paravirt_lazy_mode);
+       return __get_cpu_var(paravirt_lazy_mode);
 }
 
 struct pv_info pv_info = {
@@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = {
        .read_cr4 = native_read_cr4,
        .read_cr4_safe = native_read_cr4_safe,
        .write_cr4 = native_write_cr4,
+#ifdef CONFIG_X86_64
+       .read_cr8 = native_read_cr8,
+       .write_cr8 = native_write_cr8,
+#endif
        .wbinvd = native_wbinvd,
        .read_msr = native_read_msr_safe,
        .write_msr = native_write_msr_safe,
        .read_tsc = native_read_tsc,
        .read_pmc = native_read_pmc,
+       .read_tscp = native_read_tscp,
        .load_tr_desc = native_load_tr_desc,
        .set_ldt = native_set_ldt,
        .load_gdt = native_load_gdt,
@@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = {
        .store_idt = native_store_idt,
        .store_tr = native_store_tr,
        .load_tls = native_load_tls,
-       .write_ldt_entry = write_dt_entry,
-       .write_gdt_entry = write_dt_entry,
-       .write_idt_entry = write_dt_entry,
-       .load_esp0 = native_load_esp0,
+       .write_ldt_entry = native_write_ldt_entry,
+       .write_gdt_entry = native_write_gdt_entry,
+       .write_idt_entry = native_write_idt_entry,
+       .load_sp0 = native_load_sp0,
 
-       .irq_enable_sysexit = native_irq_enable_sysexit,
+       .irq_enable_syscall_ret = native_irq_enable_syscall_ret,
        .iret = native_iret,
+       .swapgs = native_swapgs,
 
        .set_iopl_mask = native_set_iopl_mask,
        .io_delay = native_io_delay,
@@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = {
 };
 
 struct pv_mmu_ops pv_mmu_ops = {
+#ifndef CONFIG_X86_64
        .pagetable_setup_start = native_pagetable_setup_start,
        .pagetable_setup_done = native_pagetable_setup_done,
+#endif
 
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
@@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = {
        .kmap_atomic_pte = kmap_atomic,
 #endif
 
+#if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
        .set_pte_atomic = native_set_pte_atomic,
        .set_pte_present = native_set_pte_present,
-       .set_pud = native_set_pud,
        .pte_clear = native_pte_clear,
        .pmd_clear = native_pmd_clear,
-
+#endif
+       .set_pud = native_set_pud,
        .pmd_val = native_pmd_val,
        .make_pmd = native_make_pmd,
+
+#if PAGETABLE_LEVELS == 4
+       .pud_val = native_pud_val,
+       .make_pud = native_make_pud,
+       .set_pgd = native_set_pgd,
 #endif
+#endif /* PAGETABLE_LEVELS >= 3 */
 
        .pte_val = native_pte_val,
        .pgd_val = native_pgd_val,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644 (file)
index 0000000..82fc5fc
--- /dev/null
@@ -0,0 +1,49 @@
+#include <asm/paravirt.h>
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(pv_cpu_ops, clts, "clts");
+DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+                     unsigned long addr, unsigned len)
+{
+       const unsigned char *start, *end;
+       unsigned ret;
+
+#define PATCH_SITE(ops, x)                                     \
+               case PARAVIRT_PATCH(ops.x):                     \
+                       start = start_##ops##_##x;              \
+                       end = end_##ops##_##x;                  \
+                       goto patch_site
+       switch(type) {
+               PATCH_SITE(pv_irq_ops, irq_disable);
+               PATCH_SITE(pv_irq_ops, irq_enable);
+               PATCH_SITE(pv_irq_ops, restore_fl);
+               PATCH_SITE(pv_irq_ops, save_fl);
+               PATCH_SITE(pv_cpu_ops, iret);
+               PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+               PATCH_SITE(pv_mmu_ops, read_cr2);
+               PATCH_SITE(pv_mmu_ops, read_cr3);
+               PATCH_SITE(pv_mmu_ops, write_cr3);
+               PATCH_SITE(pv_cpu_ops, clts);
+               PATCH_SITE(pv_cpu_ops, read_tsc);
+
+       patch_site:
+               ret = paravirt_patch_insns(ibuf, len, start, end);
+               break;
+
+       default:
+               ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+               break;
+       }
+#undef PATCH_SITE
+       return ret;
+}
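
Both new patch files share the same shape: DEF_NATIVE() captures a short snippet of native code per operation, and native_patch() copies the matching snippet over the paravirt call site, deferring to paravirt_patch_default() for everything else. A user-space caricature of that dispatch, with all names and opcodes invented for the example, is sketched below.

#include <stdio.h>
#include <string.h>

/* Toy analogue of native_patch(): a table of "native" byte sequences keyed by
 * a patch type.  patch_site() copies the sequence into the site buffer when it
 * fits, and otherwise leaves the site alone, standing in for the fallback to
 * the generic patcher.  Names and opcodes here are illustrative only. */
enum patch_type { PATCH_CLI, PATCH_STI, PATCH_NR };

static const unsigned char patch_cli[] = { 0xfa };      /* cli */
static const unsigned char patch_sti[] = { 0xfb };      /* sti */

struct snippet {
        const unsigned char *bytes;
        size_t len;
};

static const struct snippet snippets[PATCH_NR] = {
        [PATCH_CLI] = { patch_cli, sizeof(patch_cli) },
        [PATCH_STI] = { patch_sti, sizeof(patch_sti) },
};

static size_t patch_site(enum patch_type type, unsigned char *site, size_t room)
{
        const struct snippet *s = &snippets[type];

        if (s->len > room)
                return 0;       /* the real code would call the default patcher */
        memcpy(site, s->bytes, s->len);
        return s->len;
}

int main(void)
{
        unsigned char site[8] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 };
        size_t n = patch_site(PATCH_CLI, site, sizeof(site));

        printf("patched %zu byte(s); site now starts with 0x%02x\n", n, site[0]);
        return 0;
}
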
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644 (file)
index 0000000..7d904e1
--- /dev/null
@@ -0,0 +1,57 @@
+#include <asm/paravirt.h>
+#include <asm/asm-offsets.h>
+#include <linux/stringify.h>
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+DEF_NATIVE(pv_cpu_ops, iret, "iretq");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
+DEF_NATIVE(pv_cpu_ops, clts, "clts");
+DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+
+/* the three commands give us more control over how to return from a syscall */
+DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+                     unsigned long addr, unsigned len)
+{
+       const unsigned char *start, *end;
+       unsigned ret;
+
+#define PATCH_SITE(ops, x)                                     \
+               case PARAVIRT_PATCH(ops.x):                     \
+                       start = start_##ops##_##x;              \
+                       end = end_##ops##_##x;                  \
+                       goto patch_site
+       switch(type) {
+               PATCH_SITE(pv_irq_ops, restore_fl);
+               PATCH_SITE(pv_irq_ops, save_fl);
+               PATCH_SITE(pv_irq_ops, irq_enable);
+               PATCH_SITE(pv_irq_ops, irq_disable);
+               PATCH_SITE(pv_cpu_ops, iret);
+               PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
+               PATCH_SITE(pv_cpu_ops, swapgs);
+               PATCH_SITE(pv_mmu_ops, read_cr2);
+               PATCH_SITE(pv_mmu_ops, read_cr3);
+               PATCH_SITE(pv_mmu_ops, write_cr3);
+               PATCH_SITE(pv_cpu_ops, clts);
+               PATCH_SITE(pv_mmu_ops, flush_tlb_single);
+               PATCH_SITE(pv_cpu_ops, wbinvd);
+
+       patch_site:
+               ret = paravirt_patch_insns(ibuf, len, start, end);
+               break;
+
+       default:
+               ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+               break;
+       }
+#undef PATCH_SITE
+       return ret;
+}
index 6bf1f716909da5fddf6674adc4b0974f3fbd1ab6..21f34db2c03c1efe90fd7ef4dcedc2407dc17e6f 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/pci_ids.h>
 #include <linux/pci.h>
@@ -183,7 +182,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
 
 /* enable this to stress test the chip's TCE cache */
 #ifdef CONFIG_IOMMU_DEBUG
-int debugging __read_mostly = 1;
+static int debugging = 1;
 
 static inline unsigned long verify_bit_range(unsigned long* bitmap,
        int expected, unsigned long start, unsigned long end)
@@ -202,7 +201,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
        return ~0UL;
 }
 #else /* debugging is disabled */
-int debugging __read_mostly = 0;
+static int debugging;
 
 static inline unsigned long verify_bit_range(unsigned long* bitmap,
        int expected, unsigned long start, unsigned long end)
index 5552d23d23c27558ec32f64930f14f6870181ba5..a82473d192a31b830eea331c07fb043dd9849343 100644 (file)
@@ -13,7 +13,6 @@
 #include <asm/calgary.h>
 
 int iommu_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_merge);
 
 dma_addr_t bad_dma_address __read_mostly;
 EXPORT_SYMBOL(bad_dma_address);
@@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask);
  * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
  * documentation.
  */
-__init int iommu_setup(char *p)
+static __init int iommu_setup(char *p)
 {
        iommu_merge = 1;
 
index 06bcba536045ae5ca038ffa9ac3f6c16b6b26a2e..4d5cc718198229eda0913fa5a19c6d417fa6589e 100644 (file)
@@ -1,12 +1,12 @@
 /*
  * Dynamic DMA mapping support for AMD Hammer.
- * 
+ *
  * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
  * This allows to use PCI devices that only support 32bit addresses on systems
- * with more than 4GB. 
+ * with more than 4GB.
  *
  * See Documentation/DMA-mapping.txt for the interface specification.
- * 
+ *
  * Copyright 2002 Andi Kleen, SuSE Labs.
  * Subject to the GNU General Public License v2 only.
  */
 #include <asm/k8.h>
 
 static unsigned long iommu_bus_base;   /* GART remapping area (physical) */
-static unsigned long iommu_size;       /* size of remapping area bytes */
+static unsigned long iommu_size;       /* size of remapping area bytes */
 static unsigned long iommu_pages;      /* .. and in pages */
 
-static u32 *iommu_gatt_base;           /* Remapping table */
+static u32 *iommu_gatt_base;           /* Remapping table */
 
-/* If this is disabled the IOMMU will use an optimized flushing strategy
-   of only flushing when a mapping is reused. With it true the GART is flushed
-   for every mapping. Problem is that doing the lazy flush seems to trigger
-   bugs with some popular PCI cards, in particular 3ware (but has also
-   been seen with Qlogic at least). */
+/*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when a mapping is reused. With it true the GART is
+ * flushed for every mapping. Problem is that doing the lazy flush seems
+ * to trigger bugs with some popular PCI cards, in particular 3ware (but
+ * has also been seen with Qlogic at least).
+ */
 int iommu_fullflush = 1;
 
-/* Allocation bitmap for the remapping area */ 
+/* Allocation bitmap for the remapping area: */
 static DEFINE_SPINLOCK(iommu_bitmap_lock);
-static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+/* Guarded by iommu_bitmap_lock: */
+static unsigned long *iommu_gart_bitmap;
 
-static u32 gart_unmapped_entry; 
+static u32 gart_unmapped_entry;
 
 #define GPTE_VALID    1
 #define GPTE_COHERENT 2
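
The comment above describes the lazy flush strategy: the GART TLB is only flushed when the allocator wraps and stale entries could be reused, unless iommu_fullflush forces a flush for every mapping. A minimal standalone sketch of that pattern, with a toy slot counter standing in for the real allocation bitmap:

        /* Sketch (not kernel code) of the lazy-flush idea: set need_flush only
         * when the allocator wraps; "fullflush" forces it on every allocation. */
        #include <stdio.h>

        #define SLOTS 8

        static int next_slot;
        static int need_flush;
        static int fullflush;           /* analogous to iommu_fullflush */

        static int alloc_slot(void)
        {
                int slot = next_slot++;

                if (next_slot >= SLOTS) {       /* wrapped: old entries may be reused */
                        next_slot = 0;
                        need_flush = 1;
                }
                if (fullflush)
                        need_flush = 1;
                return slot;
        }

        static void flush_if_needed(void)
        {
                if (need_flush) {
                        printf("flush GART TLB\n");
                        need_flush = 0;
                }
        }

        int main(void)
        {
                for (int i = 0; i < 20; i++) {
                        alloc_slot();
                        flush_if_needed();      /* flushes only on wrap unless fullflush */
                }
                return 0;
        }
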
@@ -61,10 +64,10 @@ static u32 gart_unmapped_entry;
        (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
 #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
 
-#define to_pages(addr,size) \
+#define to_pages(addr, size) \
        (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
 
-#define EMERGENCY_PAGES 32 /* = 128KB */ 
+#define EMERGENCY_PAGES 32 /* = 128KB */
 
 #ifdef CONFIG_AGP
 #define AGPEXTERN extern
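
The macros above pack a physical address into a 32-bit GART PTE (GPTE_ENCODE/GPTE_DECODE) and compute how many pages a buffer spans (to_pages). A standalone sketch of the same arithmetic, assuming 4K pages and at most 40-bit, page-aligned physical addresses:

        /* Standalone sketch of the GART address arithmetic used above. */
        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_SHIFT      12
        #define PAGE_SIZE       (1UL << PAGE_SHIFT)
        #define GPTE_VALID      1
        #define GPTE_COHERENT   2

        /* pages spanned by a buffer starting at offset (addr % PAGE_SIZE) */
        static unsigned long to_pages(unsigned long addr, unsigned long size)
        {
                unsigned long span = (addr & (PAGE_SIZE - 1)) + size;

                return (span + PAGE_SIZE - 1) >> PAGE_SHIFT;
        }

        /* pack a 40-bit page-aligned physical address into a 32-bit PTE:
         * bits 12-31 stay in place, bits 32-39 move down into bits 4-11 */
        static uint32_t gpte_encode(uint64_t phys)
        {
                return (uint32_t)((phys & 0xfffff000) | ((phys >> 32) << 4)
                                  | GPTE_VALID | GPTE_COHERENT);
        }

        static uint64_t gpte_decode(uint32_t pte)
        {
                return (pte & 0xfffff000) | (((uint64_t)pte & 0xff0) << 28);
        }

        int main(void)
        {
                unsigned long addr = 0x1ff8, size = 0x2010;
                uint64_t phys = 0xab45678000ULL;
                uint32_t pte  = gpte_encode(phys);

                printf("to_pages(%#lx, %#lx) = %lu\n", addr, size,
                       to_pages(addr, size));
                printf("pte=%#x decodes back to %#llx\n", (unsigned)pte,
                       (unsigned long long)gpte_decode(pte));
                return 0;
        }
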
@@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved;
 AGPEXTERN __u32 *agp_gatt_table;
 
 static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
-static int need_flush;                 /* global flush state. set for each gart wrap */
+static int need_flush;         /* global flush state. set for each gart wrap */
 
-static unsigned long alloc_iommu(int size) 
-{      
+static unsigned long alloc_iommu(int size)
+{
        unsigned long offset, flags;
 
-       spin_lock_irqsave(&iommu_bitmap_lock, flags);   
-       offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);
+       offset = find_next_zero_string(iommu_gart_bitmap, next_bit,
+                                       iommu_pages, size);
        if (offset == -1) {
                need_flush = 1;
-               offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
+               offset = find_next_zero_string(iommu_gart_bitmap, 0,
+                                               iommu_pages, size);
        }
-       if (offset != -1) { 
-               set_bit_string(iommu_gart_bitmap, offset, size); 
-               next_bit = offset+size; 
-               if (next_bit >= iommu_pages) { 
+       if (offset != -1) {
+               set_bit_string(iommu_gart_bitmap, offset, size);
+               next_bit = offset+size;
+               if (next_bit >= iommu_pages) {
                        next_bit = 0;
                        need_flush = 1;
-               } 
-       } 
+               }
+       }
        if (iommu_fullflush)
                need_flush = 1;
-       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+
        return offset;
-} 
+}
 
 static void free_iommu(unsigned long offset, int size)
-{ 
+{
        unsigned long flags;
+
        spin_lock_irqsave(&iommu_bitmap_lock, flags);
        __clear_bit_string(iommu_gart_bitmap, offset, size);
        spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
-} 
+}
 
-/* 
+/*
  * Use global flush state to avoid races with multiple flushers.
  */
 static void flush_gart(void)
-{ 
+{
        unsigned long flags;
+
        spin_lock_irqsave(&iommu_bitmap_lock, flags);
        if (need_flush) {
                k8_flush_garts();
                need_flush = 0;
-       } 
+       }
        spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
-} 
+}
 
 #ifdef CONFIG_IOMMU_LEAK
 
-#define SET_LEAK(x) if (iommu_leak_tab) \
-                       iommu_leak_tab[x] = __builtin_return_address(0);
-#define CLEAR_LEAK(x) if (iommu_leak_tab) \
-                       iommu_leak_tab[x] = NULL;
+#define SET_LEAK(x)                                                    \
+       do {                                                            \
+               if (iommu_leak_tab)                                     \
+                       iommu_leak_tab[x] = __builtin_return_address(0);\
+       } while (0)
+
+#define CLEAR_LEAK(x)                                                  \
+       do {                                                            \
+               if (iommu_leak_tab)                                     \
+                       iommu_leak_tab[x] = NULL;                       \
+       } while (0)
 
 /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab; 
+static void **iommu_leak_tab;
 static int leak_trace;
 static int iommu_leak_pages = 20;
+
 static void dump_leak(void)
 {
        int i;
-       static int dump; 
-       if (dump || !iommu_leak_tab) return;
+       static int dump;
+
+       if (dump || !iommu_leak_tab)
+               return;
        dump = 1;
-       show_stack(NULL,NULL);
-       /* Very crude. dump some from the end of the table too */ 
-       printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 
-       for (i = 0; i < iommu_leak_pages; i+=2) {
-               printk("%lu: ", iommu_pages-i);
-               printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
-               printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 
-       } 
-       printk("\n");
+       show_stack(NULL, NULL);
+
+       /* Very crude. dump some from the end of the table too */
+       printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
+              iommu_leak_pages);
+       for (i = 0; i < iommu_leak_pages; i += 2) {
+               printk(KERN_DEBUG "%lu: ", iommu_pages-i);
+               printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
+               printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
+       }
+       printk(KERN_DEBUG "\n");
 }
 #else
-#define SET_LEAK(x)
-#define CLEAR_LEAK(x)
+# define SET_LEAK(x)
+# define CLEAR_LEAK(x)
 #endif
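
The SET_LEAK()/CLEAR_LEAK() rewrite above wraps the macro bodies in do { } while (0) so they behave as single statements. A small standalone illustration of why that matters (a bare if-body macro mis-pairs an else at the call site):

        #include <stdio.h>

        /* Without do/while(0): expanding BAD_LOG(x); under an if/else makes the
         * caller's 'else' bind to the macro's internal 'if' (dangling else). */
        #define BAD_LOG(x)  if (verbose) printf("log: %d\n", (x))

        /* With do/while(0): a single statement everywhere, as rewritten above. */
        #define GOOD_LOG(x)                                     \
                do {                                            \
                        if (verbose)                            \
                                printf("log: %d\n", (x));       \
                } while (0)

        static int verbose = 0;

        int main(void)
        {
                int ready = 1;

                if (ready)
                        GOOD_LOG(ready);
                else
                        printf("not ready\n");  /* pairs with if (ready) as intended */

                return 0;
        }
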
 
 static void iommu_full(struct device *dev, size_t size, int dir)
 {
-       /* 
+       /*
         * Ran out of IOMMU space for this operation. This is very bad.
         * Unfortunately the drivers cannot handle this operation properly.
-        * Return some non mapped prereserved space in the aperture and 
+        * Return some non mapped prereserved space in the aperture and
         * let the Northbridge deal with it. This will result in garbage
         * in the IO operation. When the size exceeds the prereserved space
-        * memory corruption will occur or random memory will be DMAed 
+        * memory corruption will occur or random memory will be DMAed
         * out. Hopefully no network devices use single mappings that big.
-        */ 
-       
-       printk(KERN_ERR 
-  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
-              size, dev->bus_id);
+        */
+
+       printk(KERN_ERR
+               "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+               size, dev->bus_id);
 
        if (size > PAGE_SIZE*EMERGENCY_PAGES) {
                if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
                        panic("PCI-DMA: Memory would be corrupted\n");
-               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
-                       panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
-       } 
-
+               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+                       panic(KERN_ERR
+                               "PCI-DMA: Random memory would be DMAed\n");
+       }
 #ifdef CONFIG_IOMMU_LEAK
-       dump_leak(); 
+       dump_leak();
 #endif
-} 
+}
 
-static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
-{ 
+static inline int
+need_iommu(struct device *dev, unsigned long addr, size_t size)
+{
        u64 mask = *dev->dma_mask;
        int high = addr + size > mask;
        int mmu = high;
-       if (force_iommu) 
-               mmu = 1; 
-       return mmu; 
+
+       if (force_iommu)
+               mmu = 1;
+
+       return mmu;
 }
 
-static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
-{ 
+static inline int
+nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{
        u64 mask = *dev->dma_mask;
        int high = addr + size > mask;
        int mmu = high;
-       return mmu; 
+
+       return mmu;
 }
 
 /* Map a single continuous physical area into the IOMMU.
@@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t
  */
 static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
                                size_t size, int dir)
-{ 
+{
        unsigned long npages = to_pages(phys_mem, size);
        unsigned long iommu_page = alloc_iommu(npages);
        int i;
+
        if (iommu_page == -1) {
                if (!nonforced_iommu(dev, phys_mem, size))
-                       return phys_mem; 
+                       return phys_mem;
                if (panic_on_overflow)
                        panic("dma_map_area overflow %lu bytes\n", size);
                iommu_full(dev, size, dir);
@@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
        return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
 }
 
-static dma_addr_t gart_map_simple(struct device *dev, char *buf,
-                                size_t size, int dir)
+static dma_addr_t
+gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
 {
        dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
+
        flush_gart();
+
        return map;
 }
 
 /* Map a single area into the IOMMU */
-static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+static dma_addr_t
+gart_map_single(struct device *dev, void *addr, size_t size, int dir)
 {
        unsigned long phys_mem, bus;
 
        if (!dev)
                dev = &fallback_dev;
 
-       phys_mem = virt_to_phys(addr); 
+       phys_mem = virt_to_phys(addr);
        if (!need_iommu(dev, phys_mem, size))
-               return phys_mem; 
+               return phys_mem;
 
        bus = gart_map_simple(dev, addr, size, dir);
-       return bus; 
+
+       return bus;
 }
 
 /*
  * Free a DMA mapping.
  */
 static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
-                     size_t size, int direction)
+                             size_t size, int direction)
 {
        unsigned long iommu_page;
        int npages;
@@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
        if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
            dma_addr >= iommu_bus_base + iommu_size)
                return;
+
        iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
        npages = to_pages(dma_addr, size);
        for (i = 0; i < npages; i++) {
@@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
 /*
  * Wrapper for pci_unmap_single working with scatterlists.
  */
-static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+static void
+gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 {
        struct scatterlist *s;
        int i;
@@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
 
        for_each_sg(sg, s, nents, i) {
                unsigned long addr = sg_phys(s);
-               if (nonforced_iommu(dev, addr, s->length)) { 
+
+               if (nonforced_iommu(dev, addr, s->length)) {
                        addr = dma_map_area(dev, addr, s->length, dir);
-                       if (addr == bad_dma_address) { 
-                               if (i > 0) 
+                       if (addr == bad_dma_address) {
+                               if (i > 0)
                                        gart_unmap_sg(dev, sg, i, dir);
-                               nents = 0; 
+                               nents = 0;
                                sg[0].dma_length = 0;
                                break;
                        }
@@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
                s->dma_length = s->length;
        }
        flush_gart();
+
        return nents;
 }
 
 /* Map multiple scatterlist entries continuous into the first. */
 static int __dma_map_cont(struct scatterlist *start, int nelems,
-                     struct scatterlist *sout, unsigned long pages)
+                         struct scatterlist *sout, unsigned long pages)
 {
        unsigned long iommu_start = alloc_iommu(pages);
-       unsigned long iommu_page = iommu_start; 
+       unsigned long iommu_page = iommu_start;
        struct scatterlist *s;
        int i;
 
@@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
        for_each_sg(start, s, nelems, i) {
                unsigned long pages, addr;
                unsigned long phys_addr = s->dma_address;
-               
+
                BUG_ON(s != start && s->offset);
                if (s == start) {
                        sout->dma_address = iommu_bus_base;
                        sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
                        sout->dma_length = s->length;
-               } else { 
-                       sout->dma_length += s->length; 
+               } else {
+                       sout->dma_length += s->length;
                }
 
                addr = phys_addr;
-               pages = to_pages(s->offset, s->length); 
-               while (pages--) { 
-                       iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
+               pages = to_pages(s->offset, s->length);
+               while (pages--) {
+                       iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
                        SET_LEAK(iommu_page);
                        addr += PAGE_SIZE;
                        iommu_page++;
                }
-       } 
-       BUG_ON(iommu_page - iommu_start != pages);      
+       }
+       BUG_ON(iommu_page - iommu_start != pages);
+
        return 0;
 }
 
-static inline int dma_map_cont(struct scatterlist *start, int nelems,
-                     struct scatterlist *sout,
-                     unsigned long pages, int need)
+static inline int
+dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout,
+            unsigned long pages, int need)
 {
        if (!need) {
                BUG_ON(nelems != 1);
@@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems,
        }
        return __dma_map_cont(start, nelems, sout, pages);
 }
-               
+
 /*
  * DMA map all entries in a scatterlist.
- * Merge chunks that have page aligned sizes into a continuous mapping. 
+ * Merge chunks that have page aligned sizes into a continuous mapping.
  */
-static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-                       int dir)
+static int
+gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 {
-       int i;
-       int out;
-       int start;
-       unsigned long pages = 0;
-       int need = 0, nextneed;
        struct scatterlist *s, *ps, *start_sg, *sgmap;
+       int need = 0, nextneed, i, out, start;
+       unsigned long pages = 0;
 
-       if (nents == 0) 
+       if (nents == 0)
                return 0;
 
        if (!dev)
@@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
        ps = NULL; /* shut up gcc */
        for_each_sg(sg, s, nents, i) {
                dma_addr_t addr = sg_phys(s);
+
                s->dma_address = addr;
-               BUG_ON(s->length == 0); 
+               BUG_ON(s->length == 0);
 
-               nextneed = need_iommu(dev, addr, s->length); 
+               nextneed = need_iommu(dev, addr, s->length);
 
                /* Handle the previous not yet processed entries */
                if (i > start) {
-                       /* Can only merge when the last chunk ends on a page 
-                          boundary and the new one doesn't have an offset. */
+                       /*
+                        * Can only merge when the last chunk ends on a
+                        * page boundary and the new one doesn't have an
+                        * offset.
+                        */
                        if (!iommu_merge || !nextneed || !need || s->offset ||
                            (ps->offset + ps->length) % PAGE_SIZE) {
                                if (dma_map_cont(start_sg, i - start, sgmap,
@@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 error:
        flush_gart();
        gart_unmap_sg(dev, sg, out, dir);
+
        /* When it was forced or merged try again in a dumb way */
        if (force_iommu || iommu_merge) {
                out = dma_map_sg_nonforce(dev, sg, nents, dir);
@@ -444,64 +481,68 @@ error:
        }
        if (panic_on_overflow)
                panic("dma_map_sg: overflow on %lu pages\n", pages);
+
        iommu_full(dev, pages << PAGE_SHIFT, dir);
        for_each_sg(sg, s, nents, i)
                s->dma_address = bad_dma_address;
        return 0;
-} 
+}
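
gart_map_sg() above merges scatterlist chunks into one IOMMU mapping only when the previous chunk ends exactly on a page boundary and the next one starts at offset zero. A standalone sketch of that merge test; the struct and field names are simplified stand-ins for struct scatterlist:

        #include <stdio.h>
        #include <stdbool.h>

        #define PAGE_SIZE 4096u

        struct chunk {
                unsigned int offset;    /* offset within its first page */
                unsigned int length;
        };

        static bool can_merge(const struct chunk *prev, const struct chunk *next,
                              bool iommu_merge, bool prev_need, bool next_need)
        {
                if (!iommu_merge || !prev_need || !next_need)
                        return false;
                if (next->offset)                       /* next must start on a page */
                        return false;
                /* previous chunk must end exactly on a page boundary */
                return (prev->offset + prev->length) % PAGE_SIZE == 0;
        }

        int main(void)
        {
                struct chunk a = { .offset = 0,   .length = 2 * PAGE_SIZE };
                struct chunk b = { .offset = 0,   .length = PAGE_SIZE };
                struct chunk c = { .offset = 128, .length = 512 };

                printf("a+b: %d\n", can_merge(&a, &b, true, true, true)); /* 1 */
                printf("b+c: %d\n", can_merge(&b, &c, true, true, true)); /* 0: c has an offset */
                return 0;
        }
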
 
 static int no_agp;
 
 static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
-{ 
-       unsigned long a; 
-       if (!iommu_size) { 
-               iommu_size = aper_size; 
-               if (!no_agp) 
-                       iommu_size /= 2; 
-       } 
-
-       a = aper + iommu_size; 
+{
+       unsigned long a;
+
+       if (!iommu_size) {
+               iommu_size = aper_size;
+               if (!no_agp)
+                       iommu_size /= 2;
+       }
+
+       a = aper + iommu_size;
        iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
 
-       if (iommu_size < 64*1024*1024) 
+       if (iommu_size < 64*1024*1024) {
                printk(KERN_WARNING
-  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
-       
+                       "PCI-DMA: Warning: Small IOMMU %luMB."
+                       " Consider increasing the AGP aperture in BIOS\n",
+                               iommu_size >> 20);
+       }
+
        return iommu_size;
-} 
+}
 
-static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 
-{ 
-       unsigned aper_size = 0, aper_base_32;
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+       unsigned aper_size = 0, aper_base_32, aper_order;
        u64 aper_base;
-       unsigned aper_order;
 
-       pci_read_config_dword(dev, 0x94, &aper_base_32); 
+       pci_read_config_dword(dev, 0x94, &aper_base_32);
        pci_read_config_dword(dev, 0x90, &aper_order);
-       aper_order = (aper_order >> 1) & 7;     
+       aper_order = (aper_order >> 1) & 7;
 
-       aper_base = aper_base_32 & 0x7fff; 
+       aper_base = aper_base_32 & 0x7fff;
        aper_base <<= 25;
 
-       aper_size = (32 * 1024 * 1024) << aper_order; 
-       if (aper_base + aper_size > 0x100000000UL || !aper_size)
+       aper_size = (32 * 1024 * 1024) << aper_order;
+       if (aper_base + aper_size > 0x100000000UL || !aper_size)
                aper_base = 0;
 
        *size = aper_size;
        return aper_base;
-} 
+}
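
read_aperture() above decodes the aperture from two northbridge config dwords: the size is 32MB shifted by the order field, and the base is stored in 32MB units. The same arithmetic, standalone, with made-up register values:

        /* Standalone sketch of the aperture decode done by read_aperture();
         * the sample dword values are invented for illustration. */
        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint32_t aper_ctl     = 0x00000006;     /* pretend value from offset 0x90 */
                uint32_t aper_base_32 = 0x00000020;     /* pretend value from offset 0x94 */
                unsigned order;
                uint64_t base, size;

                order = (aper_ctl >> 1) & 7;                    /* aperture order field */
                size  = (32ULL * 1024 * 1024) << order;         /* 32MB << order */
                base  = ((uint64_t)(aper_base_32 & 0x7fff)) << 25; /* base in 32MB units */

                printf("order=%u size=%lluMB base=%#llx\n",
                       order, (unsigned long long)(size >> 20),
                       (unsigned long long)base);
                return 0;
        }
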
 
-/* 
+/*
  * Private Northbridge GATT initialization in case we cannot use the
- * AGP driver for some reason.  
+ * AGP driver for some reason.
  */
 static __init int init_k8_gatt(struct agp_kern_info *info)
-{ 
+{
+       unsigned aper_size, gatt_size, new_aper_size;
+       unsigned aper_base, new_aper_base;
        struct pci_dev *dev;
        void *gatt;
-       unsigned aper_base, new_aper_base;
-       unsigned aper_size, gatt_size, new_aper_size;
        int i;
 
        printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
@@ -509,75 +550,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
        dev = NULL;
        for (i = 0; i < num_k8_northbridges; i++) {
                dev = k8_northbridges[i];
-               new_aper_base = read_aperture(dev, &new_aper_size); 
-               if (!new_aper_base) 
-                       goto nommu; 
-               
-               if (!aper_base) { 
+               new_aper_base = read_aperture(dev, &new_aper_size);
+               if (!new_aper_base)
+                       goto nommu;
+
+               if (!aper_base) {
                        aper_size = new_aper_size;
                        aper_base = new_aper_base;
-               } 
-               if (aper_size != new_aper_size || aper_base != new_aper_base) 
+               }
+               if (aper_size != new_aper_size || aper_base != new_aper_base)
                        goto nommu;
        }
        if (!aper_base)
-               goto nommu; 
+               goto nommu;
        info->aper_base = aper_base;
-       info->aper_size = aper_size>>20; 
+       info->aper_size = aper_size >> 20;
 
-       gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
-       gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
-       if (!gatt) 
+       gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
+       gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
+       if (!gatt)
                panic("Cannot allocate GATT table");
-       if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
+       if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
                panic("Could not set GART PTEs to uncacheable pages");
-       global_flush_tlb();
 
-       memset(gatt, 0, gatt_size); 
+       memset(gatt, 0, gatt_size);
        agp_gatt_table = gatt;
 
        for (i = 0; i < num_k8_northbridges; i++) {
-               u32 ctl; 
-               u32 gatt_reg; 
+               u32 gatt_reg;
+               u32 ctl;
 
                dev = k8_northbridges[i];
-               gatt_reg = __pa(gatt) >> 12; 
-               gatt_reg <<= 4; 
+               gatt_reg = __pa(gatt) >> 12;
+               gatt_reg <<= 4;
                pci_write_config_dword(dev, 0x98, gatt_reg);
-               pci_read_config_dword(dev, 0x90, &ctl); 
+               pci_read_config_dword(dev, 0x90, &ctl);
 
                ctl |= 1;
                ctl &= ~((1<<4) | (1<<5));
 
-               pci_write_config_dword(dev, 0x90, ctl); 
+               pci_write_config_dword(dev, 0x90, ctl);
        }
        flush_gart();
-       
-       printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
+
+       printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
+              aper_base, aper_size>>10);
        return 0;
 
  nommu:
-       /* Should not happen anymore */
+       /* Should not happen anymore */
        printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
               KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
-       return -1; 
-} 
+       return -1;
+}
 
 extern int agp_amd64_init(void);
 
 static const struct dma_mapping_ops gart_dma_ops = {
-       .mapping_error = NULL,
-       .map_single = gart_map_single,
-       .map_simple = gart_map_simple,
-       .unmap_single = gart_unmap_single,
-       .sync_single_for_cpu = NULL,
-       .sync_single_for_device = NULL,
-       .sync_single_range_for_cpu = NULL,
-       .sync_single_range_for_device = NULL,
-       .sync_sg_for_cpu = NULL,
-       .sync_sg_for_device = NULL,
-       .map_sg = gart_map_sg,
-       .unmap_sg = gart_unmap_sg,
+       .mapping_error                  = NULL,
+       .map_single                     = gart_map_single,
+       .map_simple                     = gart_map_simple,
+       .unmap_single                   = gart_unmap_single,
+       .sync_single_for_cpu            = NULL,
+       .sync_single_for_device         = NULL,
+       .sync_single_range_for_cpu      = NULL,
+       .sync_single_range_for_device   = NULL,
+       .sync_sg_for_cpu                = NULL,
+       .sync_sg_for_device             = NULL,
+       .map_sg                         = gart_map_sg,
+       .unmap_sg                       = gart_unmap_sg,
 };
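
gart_dma_ops above is a table of function pointers selected at runtime (dma_ops = &gart_dma_ops later in the file), and unneeded hooks stay NULL thanks to designated initializers. A minimal standalone sketch of that ops-table pattern, with invented names:

        #include <stdio.h>
        #include <stddef.h>

        struct mapping_ops {
                unsigned long (*map_single)(void *addr, size_t size);
                void          (*unmap_single)(unsigned long handle, size_t size);
        };

        static unsigned long demo_map_single(void *addr, size_t size)
        {
                printf("map %zu bytes at %p\n", size, addr);
                return (unsigned long)addr;     /* identity "mapping" */
        }

        static void demo_unmap_single(unsigned long handle, size_t size)
        {
                printf("unmap %zu bytes at %#lx\n", size, handle);
        }

        /* Designated initializers keep the table readable and let unused
         * hooks default to NULL, as with the .sync_* entries above. */
        static const struct mapping_ops demo_ops = {
                .map_single     = demo_map_single,
                .unmap_single   = demo_unmap_single,
        };

        int main(void)
        {
                char buf[64];
                unsigned long h = demo_ops.map_single(buf, sizeof(buf));

                demo_ops.unmap_single(h, sizeof(buf));
                return 0;
        }
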
 
 void gart_iommu_shutdown(void)
@@ -588,23 +629,23 @@ void gart_iommu_shutdown(void)
        if (no_agp && (dma_ops != &gart_dma_ops))
                return;
 
-        for (i = 0; i < num_k8_northbridges; i++) {
-                u32 ctl;
+       for (i = 0; i < num_k8_northbridges; i++) {
+               u32 ctl;
 
-                dev = k8_northbridges[i];
-                pci_read_config_dword(dev, 0x90, &ctl);
+               dev = k8_northbridges[i];
+               pci_read_config_dword(dev, 0x90, &ctl);
 
-                ctl &= ~1;
+               ctl &= ~1;
 
-                pci_write_config_dword(dev, 0x90, ctl);
-        }
+               pci_write_config_dword(dev, 0x90, ctl);
+       }
 }
 
 void __init gart_iommu_init(void)
-{ 
+{
        struct agp_kern_info info;
-       unsigned long aper_size;
        unsigned long iommu_start;
+       unsigned long aper_size;
        unsigned long scratch;
        long i;
 
@@ -614,14 +655,14 @@ void __init gart_iommu_init(void)
        }
 
 #ifndef CONFIG_AGP_AMD64
-       no_agp = 1; 
+       no_agp = 1;
 #else
        /* Makefile puts PCI initialization via subsys_initcall first. */
        /* Add other K8 AGP bridge drivers here */
-       no_agp = no_agp || 
-               (agp_amd64_init() < 0) || 
+       no_agp = no_agp ||
+               (agp_amd64_init() < 0) ||
                (agp_copy_info(agp_bridge, &info) < 0);
-#endif 
+#endif
 
        if (swiotlb)
                return;
@@ -643,77 +684,78 @@ void __init gart_iommu_init(void)
        }
 
        printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
-       aper_size = info.aper_size * 1024 * 1024;       
-       iommu_size = check_iommu_size(info.aper_base, aper_size); 
-       iommu_pages = iommu_size >> PAGE_SHIFT; 
-
-       iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
-                                                   get_order(iommu_pages/8)); 
-       if (!iommu_gart_bitmap) 
-               panic("Cannot allocate iommu bitmap\n"); 
+       aper_size = info.aper_size * 1024 * 1024;
+       iommu_size = check_iommu_size(info.aper_base, aper_size);
+       iommu_pages = iommu_size >> PAGE_SHIFT;
+
+       iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
+                                                     get_order(iommu_pages/8));
+       if (!iommu_gart_bitmap)
+               panic("Cannot allocate iommu bitmap\n");
        memset(iommu_gart_bitmap, 0, iommu_pages/8);
 
 #ifdef CONFIG_IOMMU_LEAK
-       if (leak_trace) { 
-               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
+       if (leak_trace) {
+               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
                                  get_order(iommu_pages*sizeof(void *)));
-               if (iommu_leak_tab) 
-                       memset(iommu_leak_tab, 0, iommu_pages * 8); 
+               if (iommu_leak_tab)
+                       memset(iommu_leak_tab, 0, iommu_pages * 8);
                else
-                       printk("PCI-DMA: Cannot allocate leak trace area\n"); 
-       } 
+                       printk(KERN_DEBUG
+                              "PCI-DMA: Cannot allocate leak trace area\n");
+       }
 #endif
 
-       /* 
+       /*
         * Out of IOMMU space handling.
-        * Reserve some invalid pages at the beginning of the GART. 
-        */ 
-       set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
+        * Reserve some invalid pages at the beginning of the GART.
+        */
+       set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
 
-       agp_memory_reserved = iommu_size;       
+       agp_memory_reserved = iommu_size;
        printk(KERN_INFO
               "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
-              iommu_size>>20); 
+              iommu_size >> 20);
 
-       iommu_start = aper_size - iommu_size;   
-       iommu_bus_base = info.aper_base + iommu_start; 
+       iommu_start = aper_size - iommu_size;
+       iommu_bus_base = info.aper_base + iommu_start;
        bad_dma_address = iommu_bus_base;
        iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
 
-       /* 
+       /*
         * Unmap the IOMMU part of the GART. The alias of the page is
         * always mapped with cache enabled and there is no full cache
         * coherency across the GART remapping. The unmapping avoids
         * automatic prefetches from the CPU allocating cache lines in
         * there. All CPU accesses are done via the direct mapping to
         * the backing memory. The GART address is only used by PCI
-        * devices. 
+        * devices.
         */
        clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
 
-       /* 
-        * Try to workaround a bug (thanks to BenH) 
-        * Set unmapped entries to a scratch page instead of 0. 
+       /*
+        * Try to workaround a bug (thanks to BenH)
+        * Set unmapped entries to a scratch page instead of 0.
         * Any prefetches that hit unmapped entries won't get an bus abort
         * then.
         */
-       scratch = get_zeroed_page(GFP_KERNEL); 
-       if (!scratch) 
+       scratch = get_zeroed_page(GFP_KERNEL);
+       if (!scratch)
                panic("Cannot allocate iommu scratch page");
        gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
-       for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
+       for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
                iommu_gatt_base[i] = gart_unmapped_entry;
 
        flush_gart();
        dma_ops = &gart_dma_ops;
-} 
+}
 
 void __init gart_parse_options(char *p)
 {
        int arg;
 
 #ifdef CONFIG_IOMMU_LEAK
-       if (!strncmp(p,"leak",4)) {
+       if (!strncmp(p, "leak", 4)) {
                leak_trace = 1;
                p += 4;
                if (*p == '=') ++p;
@@ -723,18 +765,18 @@ void __init gart_parse_options(char *p)
 #endif
        if (isdigit(*p) && get_option(&p, &arg))
                iommu_size = arg;
-       if (!strncmp(p, "fullflush",8))
+       if (!strncmp(p, "fullflush", 8))
                iommu_fullflush = 1;
-       if (!strncmp(p, "nofullflush",11))
+       if (!strncmp(p, "nofullflush", 11))
                iommu_fullflush = 0;
-       if (!strncmp(p,"noagp",5))
+       if (!strncmp(p, "noagp", 5))
                no_agp = 1;
-       if (!strncmp(p, "noaperture",10))
+       if (!strncmp(p, "noaperture", 10))
                fix_aperture = 0;
        /* duplicated from pci-dma.c */
-       if (!strncmp(p,"force",5))
+       if (!strncmp(p, "force", 5))
                gart_iommu_aperture_allowed = 1;
-       if (!strncmp(p,"allowed",7))
+       if (!strncmp(p, "allowed", 7))
                gart_iommu_aperture_allowed = 1;
        if (!strncmp(p, "memaper", 7)) {
                fallback_aper_force = 1;
index 102866d729a5a244f368b258150ec2718087467c..82a0a674a003f815b5d98cd540e56e26f4c332df 100644 (file)
@@ -10,7 +10,6 @@
 #include <asm/dma.h>
 
 int swiotlb __read_mostly;
-EXPORT_SYMBOL(swiotlb);
 
 const struct dma_mapping_ops swiotlb_dma_ops = {
        .mapping_error = swiotlb_dma_mapping_error,
index ae8f91214f1564e510cf63c49efe079eb504853a..b112406f19961932f1505aeafa09d402a2ca45b1 100644 (file)
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/acpi_pmtmr.h>
+
 #include <asm/io.h>
 #include <asm/proto.h>
 #include <asm/msr.h>
 #include <asm/vsyscall.h>
 
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
-
 static inline u32 cyc2us(u32 cycles)
 {
        /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
index 46d391d49de8a82b750647787e0cd80186533e68..968371ab223a250b59117d4fad4df43c247c104a 100644 (file)
@@ -55,6 +55,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
+#include <asm/kdebug.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-       return ((unsigned long *)tsk->thread.esp)[3];
+       return ((unsigned long *)tsk->thread.sp)[3];
 }
 
 /*
@@ -113,10 +114,19 @@ void default_idle(void)
                smp_mb();
 
                local_irq_disable();
-               if (!need_resched())
+               if (!need_resched()) {
+                       ktime_t t0, t1;
+                       u64 t0n, t1n;
+
+                       t0 = ktime_get();
+                       t0n = ktime_to_ns(t0);
                        safe_halt();    /* enables interrupts racelessly */
-               else
-                       local_irq_enable();
+                       local_irq_disable();
+                       t1 = ktime_get();
+                       t1n = ktime_to_ns(t1);
+                       sched_clock_idle_wakeup_event(t1n - t0n);
+               }
+               local_irq_enable();
                current_thread_info()->status |= TS_POLLING;
        } else {
                /* loop is done by the caller */
@@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle);
  * to poll the ->work.need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+static void poll_idle(void)
 {
        cpu_relax();
 }
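
The default_idle() change above brackets safe_halt() with ktime_get() and feeds the slept interval to sched_clock_idle_wakeup_event(). A rough user-space analogue of that measurement, with sleep() standing in for the halt:

        #include <stdio.h>
        #include <time.h>
        #include <unistd.h>

        static long long ts_to_ns(const struct timespec *ts)
        {
                return ts->tv_sec * 1000000000LL + ts->tv_nsec;
        }

        int main(void)
        {
                struct timespec t0, t1;

                clock_gettime(CLOCK_MONOTONIC, &t0);
                sleep(1);                               /* the "idle" period */
                clock_gettime(CLOCK_MONOTONIC, &t1);

                /* the kernel hands this delta to sched_clock_idle_wakeup_event() */
                printf("idle for %lld ns\n", ts_to_ns(&t1) - ts_to_ns(&t0));
                return 0;
        }
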
@@ -188,6 +198,9 @@ void cpu_idle(void)
                        rmb();
                        idle = pm_idle;
 
+                       if (rcu_pending(cpu))
+                               rcu_check_callbacks(cpu, 0);
+
                        if (!idle)
                                idle = default_idle;
 
@@ -255,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  * New with Core Duo processors, MWAIT can take some hints based on CPU
  * capability.
  */
-void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
-                       __mwait(eax, ecx);
+                       __mwait(ax, cx);
        }
 }
 
@@ -272,19 +285,37 @@ static void mwait_idle(void)
        mwait_idle_with_hints(0, 0);
 }
 
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+       if (force_mwait)
+               return 1;
+       /* Any C1 states supported? */
+       return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
+}
+
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
-       if (cpu_has(c, X86_FEATURE_MWAIT)) {
-               printk("monitor/mwait feature present.\n");
+       static int selected;
+
+       if (selected)
+               return;
+#ifdef CONFIG_X86_SMP
+       if (pm_idle == poll_idle && smp_num_siblings > 1) {
+               printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+                       " performance may degrade.\n");
+       }
+#endif
+       if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => All CPUs supports mwait
                 */
                if (!pm_idle) {
-                       printk("using mwait in idle threads.\n");
+                       printk(KERN_INFO "using mwait in idle threads.\n");
                        pm_idle = mwait_idle;
                }
        }
+       selected = 1;
 }
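
mwait_usable() above only trusts MWAIT for idle when CPUID leaf 5 reports at least one C1 sub-state in bits 7:4 of EDX. A user-space check of the same bits, assuming GCC/Clang's <cpuid.h> on x86:

        #include <stdio.h>
        #include <cpuid.h>

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx;

                /* __get_cpuid() returns 0 if leaf 5 is not supported */
                if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx)) {
                        printf("CPUID leaf 5 not available\n");
                        return 1;
                }

                /* bits 7:4 of EDX: number of C1 sub-states MWAIT supports */
                printf("C1 sub-states: %u\n", (edx >> 4) & 0xf);
                return 0;
        }
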
 
 static int __init idle_setup(char *str)
@@ -292,10 +323,6 @@ static int __init idle_setup(char *str)
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
-#ifdef CONFIG_X86_SMP
-               if (smp_num_siblings > 1)
-                       printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
-#endif
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
        else
@@ -310,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all)
 {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
        unsigned long d0, d1, d2, d3, d6, d7;
-       unsigned long esp;
+       unsigned long sp;
        unsigned short ss, gs;
 
        if (user_mode_vm(regs)) {
-               esp = regs->esp;
-               ss = regs->xss & 0xffff;
+               sp = regs->sp;
+               ss = regs->ss & 0xffff;
                savesegment(gs, gs);
        } else {
-               esp = (unsigned long) (&regs->esp);
+               sp = (unsigned long) (&regs->sp);
                savesegment(ss, ss);
                savesegment(gs, gs);
        }
@@ -331,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all)
                        init_utsname()->version);
 
        printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
-                       0xffff & regs->xcs, regs->eip, regs->eflags,
+                       0xffff & regs->cs, regs->ip, regs->flags,
                        smp_processor_id());
-       print_symbol("EIP is at %s\n", regs->eip);
+       print_symbol("EIP is at %s\n", regs->ip);
 
        printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
-               regs->eax, regs->ebx, regs->ecx, regs->edx);
+               regs->ax, regs->bx, regs->cx, regs->dx);
        printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
-               regs->esi, regs->edi, regs->ebp, esp);
+               regs->si, regs->di, regs->bp, sp);
        printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
-              regs->xds & 0xffff, regs->xes & 0xffff,
-              regs->xfs & 0xffff, gs, ss);
+              regs->ds & 0xffff, regs->es & 0xffff,
+              regs->fs & 0xffff, gs, ss);
 
        if (!all)
                return;
@@ -369,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all)
 void show_regs(struct pt_regs *regs)
 {
        __show_registers(regs, 1);
-       show_trace(NULL, regs, &regs->esp);
+       show_trace(NULL, regs, &regs->sp, regs->bp);
 }
 
 /*
- * This gets run with %ebx containing the
- * function to call, and %edx containing
+ * This gets run with %bx containing the
+ * function to call, and %dx containing
  * the "args".
  */
 extern void kernel_thread_helper(void);
@@ -388,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 
        memset(&regs, 0, sizeof(regs));
 
-       regs.ebx = (unsigned long) fn;
-       regs.edx = (unsigned long) arg;
+       regs.bx = (unsigned long) fn;
+       regs.dx = (unsigned long) arg;
 
-       regs.xds = __USER_DS;
-       regs.xes = __USER_DS;
-       regs.xfs = __KERNEL_PERCPU;
-       regs.orig_eax = -1;
-       regs.eip = (unsigned long) kernel_thread_helper;
-       regs.xcs = __KERNEL_CS | get_kernel_rpl();
-       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+       regs.ds = __USER_DS;
+       regs.es = __USER_DS;
+       regs.fs = __KERNEL_PERCPU;
+       regs.orig_ax = -1;
+       regs.ip = (unsigned long) kernel_thread_helper;
+       regs.cs = __KERNEL_CS | get_kernel_rpl();
+       regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
        /* Ok, create the new process.. */
        return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -435,7 +462,12 @@ void flush_thread(void)
 {
        struct task_struct *tsk = current;
 
-       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
+       tsk->thread.debugreg0 = 0;
+       tsk->thread.debugreg1 = 0;
+       tsk->thread.debugreg2 = 0;
+       tsk->thread.debugreg3 = 0;
+       tsk->thread.debugreg6 = 0;
+       tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
        clear_tsk_thread_flag(tsk, TIF_DEBUG);
        /*
@@ -460,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk)
        unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
        unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
 {
@@ -470,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 
        childregs = task_pt_regs(p);
        *childregs = *regs;
-       childregs->eax = 0;
-       childregs->esp = esp;
+       childregs->ax = 0;
+       childregs->sp = sp;
 
-       p->thread.esp = (unsigned long) childregs;
-       p->thread.esp0 = (unsigned long) (childregs+1);
+       p->thread.sp = (unsigned long) childregs;
+       p->thread.sp0 = (unsigned long) (childregs+1);
 
-       p->thread.eip = (unsigned long) ret_from_fork;
+       p->thread.ip = (unsigned long) ret_from_fork;
 
-       savesegment(gs,p->thread.gs);
+       savesegment(gs, p->thread.gs);
 
        tsk = current;
        if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -491,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }
 
+       err = 0;
+
        /*
         * Set a new TLS for the child thread?
         */
-       if (clone_flags & CLONE_SETTLS) {
-               struct desc_struct *desc;
-               struct user_desc info;
-               int idx;
-
-               err = -EFAULT;
-               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
-                       goto out;
-               err = -EINVAL;
-               if (LDT_empty(&info))
-                       goto out;
-
-               idx = info.entry_number;
-               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-                       goto out;
-
-               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
+       if (clone_flags & CLONE_SETTLS)
+               err = do_set_thread_area(p, -1,
+                       (struct user_desc __user *)childregs->si, 0);
 
-       err = 0;
- out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
@@ -529,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
  */
 void dump_thread(struct pt_regs * regs, struct user * dump)
 {
-       int i;
+       u16 gs;
 
 /* changed the size calculations - should hopefully work better. lbt */
        dump->magic = CMAGIC;
        dump->start_code = 0;
-       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+       dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
        dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
        dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
        dump->u_dsize -= dump->u_tsize;
        dump->u_ssize = 0;
-       for (i = 0; i < 8; i++)
-               dump->u_debugreg[i] = current->thread.debugreg[i];  
+       dump->u_debugreg[0] = current->thread.debugreg0;
+       dump->u_debugreg[1] = current->thread.debugreg1;
+       dump->u_debugreg[2] = current->thread.debugreg2;
+       dump->u_debugreg[3] = current->thread.debugreg3;
+       dump->u_debugreg[4] = 0;
+       dump->u_debugreg[5] = 0;
+       dump->u_debugreg[6] = current->thread.debugreg6;
+       dump->u_debugreg[7] = current->thread.debugreg7;
 
        if (dump->start_stack < TASK_SIZE)
                dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
 
-       dump->regs.ebx = regs->ebx;
-       dump->regs.ecx = regs->ecx;
-       dump->regs.edx = regs->edx;
-       dump->regs.esi = regs->esi;
-       dump->regs.edi = regs->edi;
-       dump->regs.ebp = regs->ebp;
-       dump->regs.eax = regs->eax;
-       dump->regs.ds = regs->xds;
-       dump->regs.es = regs->xes;
-       dump->regs.fs = regs->xfs;
-       savesegment(gs,dump->regs.gs);
-       dump->regs.orig_eax = regs->orig_eax;
-       dump->regs.eip = regs->eip;
-       dump->regs.cs = regs->xcs;
-       dump->regs.eflags = regs->eflags;
-       dump->regs.esp = regs->esp;
-       dump->regs.ss = regs->xss;
+       dump->regs.bx = regs->bx;
+       dump->regs.cx = regs->cx;
+       dump->regs.dx = regs->dx;
+       dump->regs.si = regs->si;
+       dump->regs.di = regs->di;
+       dump->regs.bp = regs->bp;
+       dump->regs.ax = regs->ax;
+       dump->regs.ds = (u16)regs->ds;
+       dump->regs.es = (u16)regs->es;
+       dump->regs.fs = (u16)regs->fs;
+       savesegment(gs,gs);
+       dump->regs.orig_ax = regs->orig_ax;
+       dump->regs.ip = regs->ip;
+       dump->regs.cs = (u16)regs->cs;
+       dump->regs.flags = regs->flags;
+       dump->regs.sp = regs->sp;
+       dump->regs.ss = (u16)regs->ss;
 
        dump->u_fpvalid = dump_fpu (regs, &dump->i387);
 }
 EXPORT_SYMBOL(dump_thread);
 
-/* 
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-       struct pt_regs ptregs = *task_pt_regs(tsk);
-       ptregs.xcs &= 0xffff;
-       ptregs.xds &= 0xffff;
-       ptregs.xes &= 0xffff;
-       ptregs.xss &= 0xffff;
-
-       elf_core_copy_regs(regs, &ptregs);
-
-       return 1;
-}
-
 #ifdef CONFIG_SECCOMP
-void hard_disable_TSC(void)
+static void hard_disable_TSC(void)
 {
        write_cr4(read_cr4() | X86_CR4_TSD);
 }
@@ -599,7 +604,7 @@ void disable_TSC(void)
                hard_disable_TSC();
        preempt_enable();
 }
-void hard_enable_TSC(void)
+static void hard_enable_TSC(void)
 {
        write_cr4(read_cr4() & ~X86_CR4_TSD);
 }
@@ -609,18 +614,32 @@ static noinline void
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                 struct tss_struct *tss)
 {
-       struct thread_struct *next;
+       struct thread_struct *prev, *next;
+       unsigned long debugctl;
 
+       prev = &prev_p->thread;
        next = &next_p->thread;
 
+       debugctl = prev->debugctlmsr;
+       if (next->ds_area_msr != prev->ds_area_msr) {
+               /* we clear debugctl to make sure DS
+                * is not in use when we change it */
+               debugctl = 0;
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+               wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
+       }
+
+       if (next->debugctlmsr != debugctl)
+               wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
+
        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-               set_debugreg(next->debugreg[0], 0);
-               set_debugreg(next->debugreg[1], 1);
-               set_debugreg(next->debugreg[2], 2);
-               set_debugreg(next->debugreg[3], 3);
+               set_debugreg(next->debugreg0, 0);
+               set_debugreg(next->debugreg1, 1);
+               set_debugreg(next->debugreg2, 2);
+               set_debugreg(next->debugreg3, 3);
                /* no 4 and 5 */
-               set_debugreg(next->debugreg[6], 6);
-               set_debugreg(next->debugreg[7], 7);
+               set_debugreg(next->debugreg6, 6);
+               set_debugreg(next->debugreg7, 7);
        }
 
 #ifdef CONFIG_SECCOMP
@@ -634,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
        }
 #endif
 
+       if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+               ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+       if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+               ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+
+
        if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Disable the bitmap via an invalid offset. We still cache
@@ -687,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
  * More important, however, is the fact that this allows us much
  * more flexibility.
  *
- * The return value (in %eax) will be the "prev" task after
+ * The return value (in %ax) will be the "prev" task after
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
@@ -710,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
        /*
         * Reload esp0.
         */
-       load_esp0(tss, next);
+       load_sp0(tss, next);
 
        /*
         * Save away %gs. No need to save %fs, as it was saved on the
@@ -774,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 
 asmlinkage int sys_fork(struct pt_regs regs)
 {
-       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+       return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
 }
 
 asmlinkage int sys_clone(struct pt_regs regs)
@@ -783,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs)
        unsigned long newsp;
        int __user *parent_tidptr, *child_tidptr;
 
-       clone_flags = regs.ebx;
-       newsp = regs.ecx;
-       parent_tidptr = (int __user *)regs.edx;
-       child_tidptr = (int __user *)regs.edi;
+       clone_flags = regs.bx;
+       newsp = regs.cx;
+       parent_tidptr = (int __user *)regs.dx;
+       child_tidptr = (int __user *)regs.di;
        if (!newsp)
-               newsp = regs.esp;
+               newsp = regs.sp;
        return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
 }
 
@@ -804,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs)
  */
 asmlinkage int sys_vfork(struct pt_regs regs)
 {
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
 }
 
 /*
@@ -815,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs)
        int error;
        char * filename;
 
-       filename = getname((char __user *) regs.ebx);
+       filename = getname((char __user *) regs.bx);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                goto out;
        error = do_execve(filename,
-                       (char __user * __user *) regs.ecx,
-                       (char __user * __user *) regs.edx,
+                       (char __user * __user *) regs.cx,
+                       (char __user * __user *) regs.dx,
                        &regs);
        if (error == 0) {
-               task_lock(current);
-               current->ptrace &= ~PT_DTRACE;
-               task_unlock(current);
                /* Make sure we don't return using sysenter.. */
                set_thread_flag(TIF_IRET);
        }
@@ -840,145 +863,37 @@ out:
 
 unsigned long get_wchan(struct task_struct *p)
 {
-       unsigned long ebp, esp, eip;
+       unsigned long bp, sp, ip;
        unsigned long stack_page;
        int count = 0;
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack_page = (unsigned long)task_stack_page(p);
-       esp = p->thread.esp;
-       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
+       sp = p->thread.sp;
+       if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
                return 0;
-       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
-       ebp = *(unsigned long *) esp;
+       /* include/asm-i386/system.h:switch_to() pushes bp last. */
+       bp = *(unsigned long *) sp;
        do {
-               if (ebp < stack_page || ebp > top_ebp+stack_page)
+               if (bp < stack_page || bp > top_ebp+stack_page)
                        return 0;
-               eip = *(unsigned long *) (ebp+4);
-               if (!in_sched_functions(eip))
-                       return eip;
-               ebp = *(unsigned long *) ebp;
+               ip = *(unsigned long *) (bp+4);
+               if (!in_sched_functions(ip))
+                       return ip;
+               bp = *(unsigned long *) bp;
        } while (count++ < 16);
        return 0;
 }
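
get_wchan() above walks the sleeping task's saved frame pointers: it loads bp from the stack, reads the return address just above it, and follows the chain while it stays inside the stack page. A user-space sketch of the same walk for the current thread, assuming x86 frame layout with frame pointers kept (compile with -fno-omit-frame-pointer):

        /* [bp] holds the caller's bp, [bp + sizeof(long)] the return address. */
        #include <stdio.h>

        static void __attribute__((noinline)) walk(void)
        {
                unsigned long *bp = __builtin_frame_address(0);
                int depth = 0;

                while (bp && depth++ < 16) {
                        unsigned long ip   = bp[1];     /* saved return address */
                        unsigned long next = bp[0];     /* caller's frame pointer */

                        printf("frame %d: return address %#lx\n", depth, ip);
                        if (next <= (unsigned long)bp)  /* frames must move up the stack */
                                break;
                        bp = (unsigned long *)next;
                }
        }

        static void __attribute__((noinline)) middle(void)
        {
                walk();
        }

        int main(void)
        {
                middle();
                return 0;
        }
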
 
-/*
- * sys_alloc_thread_area: get a yet unused TLS descriptor index.
- */
-static int get_free_idx(void)
-{
-       struct thread_struct *t = &current->thread;
-       int idx;
-
-       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
-               if (desc_empty(t->tls_array + idx))
-                       return idx + GDT_ENTRY_TLS_MIN;
-       return -ESRCH;
-}
-
-/*
- * Set a given TLS descriptor:
- */
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
-{
-       struct thread_struct *t = &current->thread;
-       struct user_desc info;
-       struct desc_struct *desc;
-       int cpu, idx;
-
-       if (copy_from_user(&info, u_info, sizeof(info)))
-               return -EFAULT;
-       idx = info.entry_number;
-
-       /*
-        * index -1 means the kernel should try to find and
-        * allocate an empty descriptor:
-        */
-       if (idx == -1) {
-               idx = get_free_idx();
-               if (idx < 0)
-                       return idx;
-               if (put_user(idx, &u_info->entry_number))
-                       return -EFAULT;
-       }
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       /*
-        * We must not get preempted while modifying the TLS.
-        */
-       cpu = get_cpu();
-
-       if (LDT_empty(&info)) {
-               desc->a = 0;
-               desc->b = 0;
-       } else {
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-       load_TLS(t, cpu);
-
-       put_cpu();
-
-       return 0;
-}
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-       (((desc)->a >> 16) & 0x0000ffff) | \
-       (((desc)->b << 16) & 0x00ff0000) | \
-       ( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-       ((desc)->a & 0x0ffff) | \
-        ((desc)->b & 0xf0000) )
-       
-#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
-
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-       int idx;
-
-       if (get_user(idx, &u_info->entry_number))
-               return -EFAULT;
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       memset(&info, 0, sizeof(info));
-
-       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       info.entry_number = idx;
-       info.base_addr = GET_BASE(desc);
-       info.limit = GET_LIMIT(desc);
-       info.seg_32bit = GET_32BIT(desc);
-       info.contents = GET_CONTENTS(desc);
-       info.read_exec_only = !GET_WRITABLE(desc);
-       info.limit_in_pages = GET_LIMIT_PAGES(desc);
-       info.seg_not_present = !GET_PRESENT(desc);
-       info.useable = GET_USEABLE(desc);
-
-       if (copy_to_user(u_info, &info, sizeof(info)))
-               return -EFAULT;
-       return 0;
-}
-
 unsigned long arch_align_stack(unsigned long sp)
 {
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       unsigned long range_end = mm->brk + 0x02000000;
+       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
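
arch_randomize_brk() above moves the heap start to a random page within a 32MB (0x02000000) window above the current brk, using the kernel's randomize_range() helper. A standalone sketch of picking such a value; rand() here is only for illustration:

        #include <stdio.h>
        #include <stdlib.h>
        #include <time.h>

        #define PAGE_SIZE 4096UL

        int main(void)
        {
                unsigned long brk   = 0x08600000UL;     /* pretend current brk */
                unsigned long range = 0x02000000UL;     /* 32MB window */
                unsigned long pick;

                srand((unsigned)time(NULL));
                pick = brk + ((unsigned long)rand() % range);
                pick &= ~(PAGE_SIZE - 1);               /* page-align the result */
                if (pick < brk)                         /* keep it at or above brk */
                        pick = brk;

                printf("randomized brk: %#lx\n", pick);
                return 0;
        }
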
index ab79e1dfa02311bb75c81c44aa137ddd450c5922..137a86171c393ac64fd0e6df392d40caad951b9f 100644 (file)
@@ -3,7 +3,7 @@
  *
  *  Pentium III FXSR, SSE support
  *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
+ *
  *  X86-64 port
  *     Andi Kleen.
  *
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/fs.h>
 #include <linux/elfcore.h>
 #include <linux/smp.h>
 #include <linux/slab.h>
 #include <linux/user.h>
-#include <linux/module.h>
 #include <linux/a.out.h>
 #include <linux/interrupt.h>
+#include <linux/utsname.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <linux/ptrace.h>
-#include <linux/utsname.h>
 #include <linux/random.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
@@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n)
 {
        atomic_notifier_chain_register(&idle_notifier, n);
 }
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-       atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL(idle_notifier_unregister);
 
 void enter_idle(void)
 {
@@ -106,7 +99,7 @@ void exit_idle(void)
  * We use this if we don't have any better
  * idle routine..
  */
-static void default_idle(void)
+void default_idle(void)
 {
        current_thread_info()->status &= ~TS_POLLING;
        /*
@@ -116,11 +109,18 @@ static void default_idle(void)
        smp_mb();
        local_irq_disable();
        if (!need_resched()) {
-               /* Enables interrupts one instruction before HLT.
-                  x86 special cases this so there is no race. */
-               safe_halt();
-       } else
-               local_irq_enable();
+               ktime_t t0, t1;
+               u64 t0n, t1n;
+
+               t0 = ktime_get();
+               t0n = ktime_to_ns(t0);
+               safe_halt();    /* enables interrupts racelessly */
+               local_irq_disable();
+               t1 = ktime_get();
+               t1n = ktime_to_ns(t1);
+               sched_clock_idle_wakeup_event(t1n - t0n);
+       }
+       local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
 }
 
@@ -129,54 +129,12 @@ static void default_idle(void)
  * to poll the ->need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+static void poll_idle(void)
 {
        local_irq_enable();
        cpu_relax();
 }
 
-static void do_nothing(void *unused)
-{
-}
-
-void cpu_idle_wait(void)
-{
-       unsigned int cpu, this_cpu = get_cpu();
-       cpumask_t map, tmp = current->cpus_allowed;
-
-       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-       put_cpu();
-
-       cpus_clear(map);
-       for_each_online_cpu(cpu) {
-               per_cpu(cpu_idle_state, cpu) = 1;
-               cpu_set(cpu, map);
-       }
-
-       __get_cpu_var(cpu_idle_state) = 0;
-
-       wmb();
-       do {
-               ssleep(1);
-               for_each_online_cpu(cpu) {
-                       if (cpu_isset(cpu, map) &&
-                                       !per_cpu(cpu_idle_state, cpu))
-                               cpu_clear(cpu, map);
-               }
-               cpus_and(map, map, cpu_online_map);
-               /*
-                * We waited 1 sec, if a CPU still did not call idle
-                * it may be because it is in idle and not waking up
-                * because it has nothing to do.
-                * Give all the remaining CPUS a kick.
-                */
-               smp_call_function_mask(map, do_nothing, 0, 0);
-       } while (!cpus_empty(map));
-
-       set_cpus_allowed(current, tmp);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
 #ifdef CONFIG_HOTPLUG_CPU
 DECLARE_PER_CPU(int, cpu_state);
 
@@ -207,19 +165,18 @@ static inline void play_dead(void)
  * low exit latency (ie sit in a loop waiting for
  * somebody to say that they'd like to reschedule)
  */
-void cpu_idle (void)
+void cpu_idle(void)
 {
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
+               tick_nohz_stop_sched_tick();
                while (!need_resched()) {
                        void (*idle)(void);
 
                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;
 
-                       tick_nohz_stop_sched_tick();
-
                        rmb();
                        idle = pm_idle;
                        if (!idle)
@@ -247,6 +204,47 @@ void cpu_idle (void)
        }
 }
 
+static void do_nothing(void *unused)
+{
+}
+
+void cpu_idle_wait(void)
+{
+       unsigned int cpu, this_cpu = get_cpu();
+       cpumask_t map, tmp = current->cpus_allowed;
+
+       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+       put_cpu();
+
+       cpus_clear(map);
+       for_each_online_cpu(cpu) {
+               per_cpu(cpu_idle_state, cpu) = 1;
+               cpu_set(cpu, map);
+       }
+
+       __get_cpu_var(cpu_idle_state) = 0;
+
+       wmb();
+       do {
+               ssleep(1);
+               for_each_online_cpu(cpu) {
+                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+                               cpu_clear(cpu, map);
+               }
+               cpus_and(map, map, cpu_online_map);
+               /*
+                * We waited 1 sec, if a CPU still did not call idle
+                * it may be because it is in idle and not waking up
+                * because it has nothing to do.
+                * Give all the remaining CPUS a kick.
+                */
+               smp_call_function_mask(map, do_nothing, 0, 0);
+       } while (!cpus_empty(map));
+
+       set_cpus_allowed(current, tmp);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
 /*
  * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
  * which can obviate IPI to trigger checking of need_resched.
@@ -257,13 +255,13 @@ void cpu_idle (void)
  * New with Core Duo processors, MWAIT can take some hints based on CPU
  * capability.
  */
-void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
-                       __mwait(eax, ecx);
+                       __mwait(ax, cx);
        }
 }
 
@@ -282,25 +280,41 @@ static void mwait_idle(void)
        }
 }
 
+
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+       if (force_mwait)
+               return 1;
+       /* Any C1 states supported? */
+       return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
+}
+
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
-       static int printed;
-       if (cpu_has(c, X86_FEATURE_MWAIT)) {
+       static int selected;
+
+       if (selected)
+               return;
+#ifdef CONFIG_X86_SMP
+       if (pm_idle == poll_idle && smp_num_siblings > 1) {
+               printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+                       " performance may degrade.\n");
+       }
+#endif
+       if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => All CPUs supports mwait
                 */
                if (!pm_idle) {
-                       if (!printed) {
-                               printk(KERN_INFO "using mwait in idle threads.\n");
-                               printed = 1;
-                       }
+                       printk(KERN_INFO "using mwait in idle threads.\n");
                        pm_idle = mwait_idle;
                }
        }
+       selected = 1;
 }
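/*
 * Illustrative sketch, not part of the patch above: mwait_usable() keys off
 * CPUID leaf 5, whose EDX reports in 4-bit fields how many MWAIT sub-states
 * each C-state supports; bits 7:4 cover C1.  A user-space version of the same
 * check, assuming GCC/Clang on x86 with <cpuid.h> (__get_cpuid() returns 0
 * for unsupported leaves, mirroring the cpuid_level >= 5 test):
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 5 not available");
                return 1;
        }
        printf("C1 MWAIT sub-states: %u\n", (edx >> 4) & 0xf);
        return 0;
}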
 
-static int __init idle_setup (char *str)
+static int __init idle_setup(char *str)
 {
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
@@ -315,13 +329,13 @@ static int __init idle_setup (char *str)
 }
 early_param("idle", idle_setup);
 
-/* Prints also some state that isn't saved in the pt_regs */ 
+/* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs * regs)
 {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
-       unsigned int fsindex,gsindex;
-       unsigned int ds,cs,es; 
+       unsigned int fsindex, gsindex;
+       unsigned int ds, cs, es;
 
        printk("\n");
        print_modules();
@@ -330,16 +344,16 @@ void __show_regs(struct pt_regs * regs)
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
-       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
-       printk_address(regs->rip); 
-       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
-               regs->eflags);
+       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+       printk_address(regs->ip, 1);
+       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
+               regs->flags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
-              regs->rax, regs->rbx, regs->rcx);
+              regs->ax, regs->bx, regs->cx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
-              regs->rdx, regs->rsi, regs->rdi); 
+              regs->dx, regs->si, regs->di);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
-              regs->rbp, regs->r8, regs->r9); 
+              regs->bp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12); 
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
@@ -379,7 +393,7 @@ void show_regs(struct pt_regs *regs)
 {
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
-       show_trace(NULL, regs, (void *)(regs + 1));
+       show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
 }
 
 /*
@@ -390,7 +404,7 @@ void exit_thread(void)
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
 
-       if (me->thread.io_bitmap_ptr) { 
+       if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 
                kfree(t->io_bitmap_ptr);
@@ -426,7 +440,7 @@ void flush_thread(void)
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
-       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
+       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
@@ -449,26 +463,21 @@ void release_thread(struct task_struct *dead_task)
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
 {
-       struct user_desc ud = { 
+       struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
-       struct n_desc_struct *desc = (void *)t->thread.tls_array;
+       struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
-       desc->a = LDT_entry_a(&ud); 
-       desc->b = LDT_entry_b(&ud); 
+       fill_ldt(desc, &ud);
 }
 
 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 {
-       struct desc_struct *desc = (void *)t->thread.tls_array;
-       desc += tls;
-       return desc->base0 | 
-               (((u32)desc->base1) << 16) | 
-               (((u32)desc->base2) << 24);
+       return get_desc_base(&t->thread.tls_array[tls]);
 }
 
 /*
@@ -480,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk)
        unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
+int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
 {
@@ -492,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;
 
-       childregs->rax = 0;
-       childregs->rsp = rsp;
-       if (rsp == ~0UL)
-               childregs->rsp = (unsigned long)childregs;
+       childregs->ax = 0;
+       childregs->sp = sp;
+       if (sp == ~0UL)
+               childregs->sp = (unsigned long)childregs;
 
-       p->thread.rsp = (unsigned long) childregs;
-       p->thread.rsp0 = (unsigned long) (childregs+1);
-       p->thread.userrsp = me->thread.userrsp; 
+       p->thread.sp = (unsigned long) childregs;
+       p->thread.sp0 = (unsigned long) (childregs+1);
+       p->thread.usersp = me->thread.usersp;
 
        set_tsk_thread_flag(p, TIF_FORK);
 
@@ -520,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
-       } 
+       }
 
        /*
         * Set a new TLS for the child thread?
@@ -528,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
        if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
-                       err = ia32_child_tls(p, childregs); 
+                       err = do_set_thread_area(p, -1,
+                               (struct user_desc __user *)childregs->si, 0);
                else                    
 #endif  
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
@@ -547,17 +557,30 @@ out:
 /*
  * This special macro can be used to load a debugging register
  */
-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
 
 static inline void __switch_to_xtra(struct task_struct *prev_p,
-                                   struct task_struct *next_p,
-                                   struct tss_struct *tss)
+                                   struct task_struct *next_p,
+                                   struct tss_struct *tss)
 {
        struct thread_struct *prev, *next;
+       unsigned long debugctl;
 
        prev = &prev_p->thread,
        next = &next_p->thread;
 
+       debugctl = prev->debugctlmsr;
+       if (next->ds_area_msr != prev->ds_area_msr) {
+               /* we clear debugctl to make sure DS
+                * is not in use when we change it */
+               debugctl = 0;
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+               wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+       }
+
+       if (next->debugctlmsr != debugctl)
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
+
        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
@@ -581,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
+
+       if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
+               ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
+
+       if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
+               ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
 }
 
 /*
  *     switch_to(x,y) should switch tasks from x to y.
  *
- * This could still be optimized: 
+ * This could still be optimized:
  * - fold all the options into a flag word and test it with a single test.
  * - could test fs/gs bitsliced
  *
@@ -597,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
-       int cpu = smp_processor_id();  
+       int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
 
        /* we're going to use this soon, after a few expensive things */
@@ -607,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /*
         * Reload esp0, LDT and the page table pointer:
         */
-       tss->rsp0 = next->rsp0;
+       load_sp0(tss, next);
 
        /* 
         * Switch DS and ES.
@@ -666,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /* 
         * Switch the PDA and FPU contexts.
         */
-       prev->userrsp = read_pda(oldrsp); 
-       write_pda(oldrsp, next->userrsp); 
+       prev->usersp = read_pda(oldrsp);
+       write_pda(oldrsp, next->usersp);
        write_pda(pcurrent, next_p); 
 
        write_pda(kernelstack,
@@ -684,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
-       if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
-           || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+       if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+                    task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);
 
        /* If the task has used fpu the last 5 timeslices, just do a full
@@ -700,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage 
+asmlinkage
 long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
 {
@@ -712,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv,
        if (IS_ERR(filename)) 
                return error;
        error = do_execve(filename, argv, envp, &regs); 
-       if (error == 0) {
-               task_lock(current);
-               current->ptrace &= ~PT_DTRACE;
-               task_unlock(current);
-       }
        putname(filename);
        return error;
 }
@@ -726,18 +750,18 @@ void set_personality_64bit(void)
        /* inherit personality from parent */
 
        /* Make sure to be in 64bit mode */
-       clear_thread_flag(TIF_IA32); 
+       clear_thread_flag(TIF_IA32);
 
        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
-          32bit childs are affected again. */
+          32bit childs are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
 }
 
 asmlinkage long sys_fork(struct pt_regs *regs)
 {
-       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
+       return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
 }
 
 asmlinkage long
@@ -745,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
 {
        if (!newsp)
-               newsp = regs->rsp;
+               newsp = regs->sp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
 }
 
@@ -761,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
  */
 asmlinkage long sys_vfork(struct pt_regs *regs)
 {
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
                    NULL, NULL);
 }
 
 unsigned long get_wchan(struct task_struct *p)
 {
        unsigned long stack;
-       u64 fp,rip;
+       u64 fp,ip;
        int count = 0;
 
        if (!p || p == current || p->state==TASK_RUNNING)
                return 0; 
        stack = (unsigned long)task_stack_page(p);
-       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
+       if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
                return 0;
-       fp = *(u64 *)(p->thread.rsp);
+       fp = *(u64 *)(p->thread.sp);
        do { 
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0; 
-               rip = *(u64 *)(fp+8); 
-               if (!in_sched_functions(rip))
-                       return rip; 
+               ip = *(u64 *)(fp+8);
+               if (!in_sched_functions(ip))
+                       return ip;
                fp = *(u64 *)fp; 
        } while (count++ < 16); 
        return 0;
@@ -824,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
-                       return -EPERM; 
+                       return -EPERM;
                cpu = get_cpu();
-               /* handle small bases via the GDT because that's faster to 
+               /* handle small bases via the GDT because that's faster to
                   switch. */
-               if (addr <= 0xffffffff) { 
+               if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
-                       if (doit) { 
-                               load_TLS(&task->thread, cpu); 
+                       if (doit) {
+                               load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
-               } else { 
+               } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
@@ -848,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                }
                put_cpu();
                break;
-       case ARCH_GET_FS: { 
-               unsigned long base; 
+       case ARCH_GET_FS: {
+               unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
-               ret = put_user(base, (unsigned long __user *)addr); 
-               break; 
+               ret = put_user(base, (unsigned long __user *)addr);
+               break;
        }
-       case ARCH_GET_GS: { 
+       case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
-                       asm("movl %%gs,%0" : "=r" (gsindex));
+                       asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
@@ -873,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
                }
                else
                        base = task->thread.gs;
-               ret = put_user(base, (unsigned long __user *)addr); 
+               ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
 
        default:
                ret = -EINVAL;
                break;
-       } 
+       }
 
-       return ret;     
-} 
+       return ret;
+}
 
 long sys_arch_prctl(int code, unsigned long addr)
 {
        return do_arch_prctl(current, code, addr);
-} 
-
-/* 
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-       struct pt_regs *pp, ptregs;
-
-       pp = task_pt_regs(tsk);
-
-       ptregs = *pp; 
-       ptregs.cs &= 0xffff;
-       ptregs.ss &= 0xffff;
-
-       elf_core_copy_regs(regs, &ptregs);
-       return 1;
 }
 
 unsigned long arch_align_stack(unsigned long sp)
@@ -914,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       unsigned long range_end = mm->brk + 0x02000000;
+       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644 (file)
index 0000000..96286df
--- /dev/null
@@ -0,0 +1,1545 @@
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * BTS tracing
+ *     Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/regset.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/ds.h>
+
+#include "tls.h"
+
+enum x86_regset {
+       REGSET_GENERAL,
+       REGSET_FP,
+       REGSET_XFP,
+       REGSET_TLS,
+};
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ */
+#define FLAG_MASK_32           ((unsigned long)                        \
+                                (X86_EFLAGS_CF | X86_EFLAGS_PF |       \
+                                 X86_EFLAGS_AF | X86_EFLAGS_ZF |       \
+                                 X86_EFLAGS_SF | X86_EFLAGS_TF |       \
+                                 X86_EFLAGS_DF | X86_EFLAGS_OF |       \
+                                 X86_EFLAGS_RF | X86_EFLAGS_AC))
+
+/*
+ * Determines whether a value may be installed in a segment register.
+ */
+static inline bool invalid_selector(u16 value)
+{
+       return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
+}
+
+#ifdef CONFIG_X86_32
+
+#define FLAG_MASK              FLAG_MASK_32
+
+static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+{
+       BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
+       regno >>= 2;
+       if (regno > FS)
+               --regno;
+       return &regs->bx + regno;
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+       /*
+        * Returning the value truncates it to 16 bits.
+        */
+       unsigned int retval;
+       if (offset != offsetof(struct user_regs_struct, gs))
+               retval = *pt_regs_access(task_pt_regs(task), offset);
+       else {
+               retval = task->thread.gs;
+               if (task == current)
+                       savesegment(gs, retval);
+       }
+       return retval;
+}
+
+static int set_segment_reg(struct task_struct *task,
+                          unsigned long offset, u16 value)
+{
+       /*
+        * The value argument was already truncated to 16 bits.
+        */
+       if (invalid_selector(value))
+               return -EIO;
+
+       if (offset != offsetof(struct user_regs_struct, gs))
+               *pt_regs_access(task_pt_regs(task), offset) = value;
+       else {
+               task->thread.gs = value;
+               if (task == current)
+                       /*
+                        * The user-mode %gs is not affected by
+                        * kernel entry, so we must update the CPU.
+                        */
+                       loadsegment(gs, value);
+       }
+
+       return 0;
+}
+
+static unsigned long debugreg_addr_limit(struct task_struct *task)
+{
+       return TASK_SIZE - 3;
+}
+
+#else  /* CONFIG_X86_64 */
+
+#define FLAG_MASK              (FLAG_MASK_32 | X86_EFLAGS_NT)
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
+{
+       BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
+       return &regs->r15 + (offset / sizeof(regs->r15));
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+       /*
+        * Returning the value truncates it to 16 bits.
+        */
+       unsigned int seg;
+
+       switch (offset) {
+       case offsetof(struct user_regs_struct, fs):
+               if (task == current) {
+                       /* Older gas can't assemble movq %?s,%r?? */
+                       asm("movl %%fs,%0" : "=r" (seg));
+                       return seg;
+               }
+               return task->thread.fsindex;
+       case offsetof(struct user_regs_struct, gs):
+               if (task == current) {
+                       asm("movl %%gs,%0" : "=r" (seg));
+                       return seg;
+               }
+               return task->thread.gsindex;
+       case offsetof(struct user_regs_struct, ds):
+               if (task == current) {
+                       asm("movl %%ds,%0" : "=r" (seg));
+                       return seg;
+               }
+               return task->thread.ds;
+       case offsetof(struct user_regs_struct, es):
+               if (task == current) {
+                       asm("movl %%es,%0" : "=r" (seg));
+                       return seg;
+               }
+               return task->thread.es;
+
+       case offsetof(struct user_regs_struct, cs):
+       case offsetof(struct user_regs_struct, ss):
+               break;
+       }
+       return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int set_segment_reg(struct task_struct *task,
+                          unsigned long offset, u16 value)
+{
+       /*
+        * The value argument was already truncated to 16 bits.
+        */
+       if (invalid_selector(value))
+               return -EIO;
+
+       switch (offset) {
+       case offsetof(struct user_regs_struct,fs):
+               /*
+                * If this is setting fs as for normal 64-bit use but
+                * setting fs_base has implicitly changed it, leave it.
+                */
+               if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
+                    task->thread.fs != 0) ||
+                   (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
+                    task->thread.fs == 0))
+                       break;
+               task->thread.fsindex = value;
+               if (task == current)
+                       loadsegment(fs, task->thread.fsindex);
+               break;
+       case offsetof(struct user_regs_struct,gs):
+               /*
+                * If this is setting gs as for normal 64-bit use but
+                * setting gs_base has implicitly changed it, leave it.
+                */
+               if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
+                    task->thread.gs != 0) ||
+                   (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
+                    task->thread.gs == 0))
+                       break;
+               task->thread.gsindex = value;
+               if (task == current)
+                       load_gs_index(task->thread.gsindex);
+               break;
+       case offsetof(struct user_regs_struct,ds):
+               task->thread.ds = value;
+               if (task == current)
+                       loadsegment(ds, task->thread.ds);
+               break;
+       case offsetof(struct user_regs_struct,es):
+               task->thread.es = value;
+               if (task == current)
+                       loadsegment(es, task->thread.es);
+               break;
+
+               /*
+                * Can't actually change these in 64-bit mode.
+                */
+       case offsetof(struct user_regs_struct,cs):
+#ifdef CONFIG_IA32_EMULATION
+               if (test_tsk_thread_flag(task, TIF_IA32))
+                       task_pt_regs(task)->cs = value;
+#endif
+               break;
+       case offsetof(struct user_regs_struct,ss):
+#ifdef CONFIG_IA32_EMULATION
+               if (test_tsk_thread_flag(task, TIF_IA32))
+                       task_pt_regs(task)->ss = value;
+#endif
+               break;
+       }
+
+       return 0;
+}
+
+static unsigned long debugreg_addr_limit(struct task_struct *task)
+{
+#ifdef CONFIG_IA32_EMULATION
+       if (test_tsk_thread_flag(task, TIF_IA32))
+               return IA32_PAGE_OFFSET - 3;
+#endif
+       return TASK_SIZE64 - 7;
+}
+
+#endif /* CONFIG_X86_32 */
+
+static unsigned long get_flags(struct task_struct *task)
+{
+       unsigned long retval = task_pt_regs(task)->flags;
+
+       /*
+        * If the debugger set TF, hide it from the readout.
+        */
+       if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+               retval &= ~X86_EFLAGS_TF;
+
+       return retval;
+}
+
+static int set_flags(struct task_struct *task, unsigned long value)
+{
+       struct pt_regs *regs = task_pt_regs(task);
+
+       /*
+        * If the user value contains TF, mark that
+        * it was not "us" (the debugger) that set it.
+        * If not, make sure it stays set if we had.
+        */
+       if (value & X86_EFLAGS_TF)
+               clear_tsk_thread_flag(task, TIF_FORCED_TF);
+       else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+               value |= X86_EFLAGS_TF;
+
+       regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
+
+       return 0;
+}
+
+static int putreg(struct task_struct *child,
+                 unsigned long offset, unsigned long value)
+{
+       switch (offset) {
+       case offsetof(struct user_regs_struct, cs):
+       case offsetof(struct user_regs_struct, ds):
+       case offsetof(struct user_regs_struct, es):
+       case offsetof(struct user_regs_struct, fs):
+       case offsetof(struct user_regs_struct, gs):
+       case offsetof(struct user_regs_struct, ss):
+               return set_segment_reg(child, offset, value);
+
+       case offsetof(struct user_regs_struct, flags):
+               return set_flags(child, value);
+
+#ifdef CONFIG_X86_64
+       case offsetof(struct user_regs_struct,fs_base):
+               if (value >= TASK_SIZE_OF(child))
+                       return -EIO;
+               /*
+                * When changing the segment base, use do_arch_prctl
+                * to set either thread.fs or thread.fsindex and the
+                * corresponding GDT slot.
+                */
+               if (child->thread.fs != value)
+                       return do_arch_prctl(child, ARCH_SET_FS, value);
+               return 0;
+       case offsetof(struct user_regs_struct,gs_base):
+               /*
+                * Exactly the same here as the %fs handling above.
+                */
+               if (value >= TASK_SIZE_OF(child))
+                       return -EIO;
+               if (child->thread.gs != value)
+                       return do_arch_prctl(child, ARCH_SET_GS, value);
+               return 0;
+#endif
+       }
+
+       *pt_regs_access(task_pt_regs(child), offset) = value;
+       return 0;
+}
+
+static unsigned long getreg(struct task_struct *task, unsigned long offset)
+{
+       switch (offset) {
+       case offsetof(struct user_regs_struct, cs):
+       case offsetof(struct user_regs_struct, ds):
+       case offsetof(struct user_regs_struct, es):
+       case offsetof(struct user_regs_struct, fs):
+       case offsetof(struct user_regs_struct, gs):
+       case offsetof(struct user_regs_struct, ss):
+               return get_segment_reg(task, offset);
+
+       case offsetof(struct user_regs_struct, flags):
+               return get_flags(task);
+
+#ifdef CONFIG_X86_64
+       case offsetof(struct user_regs_struct, fs_base): {
+               /*
+                * do_arch_prctl may have used a GDT slot instead of
+                * the MSR.  To userland, it appears the same either
+                * way, except the %fs segment selector might not be 0.
+                */
+               unsigned int seg = task->thread.fsindex;
+               if (task->thread.fs != 0)
+                       return task->thread.fs;
+               if (task == current)
+                       asm("movl %%fs,%0" : "=r" (seg));
+               if (seg != FS_TLS_SEL)
+                       return 0;
+               return get_desc_base(&task->thread.tls_array[FS_TLS]);
+       }
+       case offsetof(struct user_regs_struct, gs_base): {
+               /*
+                * Exactly the same here as the %fs handling above.
+                */
+               unsigned int seg = task->thread.gsindex;
+               if (task->thread.gs != 0)
+                       return task->thread.gs;
+               if (task == current)
+                       asm("movl %%gs,%0" : "=r" (seg));
+               if (seg != GS_TLS_SEL)
+                       return 0;
+               return get_desc_base(&task->thread.tls_array[GS_TLS]);
+       }
+#endif
+       }
+
+       return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int genregs_get(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      void *kbuf, void __user *ubuf)
+{
+       if (kbuf) {
+               unsigned long *k = kbuf;
+               while (count > 0) {
+                       *k++ = getreg(target, pos);
+                       count -= sizeof(*k);
+                       pos += sizeof(*k);
+               }
+       } else {
+               unsigned long __user *u = ubuf;
+               while (count > 0) {
+                       if (__put_user(getreg(target, pos), u++))
+                               return -EFAULT;
+                       count -= sizeof(*u);
+                       pos += sizeof(*u);
+               }
+       }
+
+       return 0;
+}
+
+static int genregs_set(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      const void *kbuf, const void __user *ubuf)
+{
+       int ret = 0;
+       if (kbuf) {
+               const unsigned long *k = kbuf;
+               while (count > 0 && !ret) {
+                       ret = putreg(target, pos, *k++);
+                       count -= sizeof(*k);
+                       pos += sizeof(*k);
+               }
+       } else {
+               const unsigned long  __user *u = ubuf;
+               while (count > 0 && !ret) {
+                       unsigned long word;
+                       ret = __get_user(word, u++);
+                       if (ret)
+                               break;
+                       ret = putreg(target, pos, word);
+                       count -= sizeof(*u);
+                       pos += sizeof(*u);
+               }
+       }
+       return ret;
+}
+
+/*
+ * This function is trivial and will be inlined by the compiler.
+ * Having it separates the implementation details of debug
+ * registers from the interface details of ptrace.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+{
+       switch (n) {
+       case 0:         return child->thread.debugreg0;
+       case 1:         return child->thread.debugreg1;
+       case 2:         return child->thread.debugreg2;
+       case 3:         return child->thread.debugreg3;
+       case 6:         return child->thread.debugreg6;
+       case 7:         return child->thread.debugreg7;
+       }
+       return 0;
+}
+
+static int ptrace_set_debugreg(struct task_struct *child,
+                              int n, unsigned long data)
+{
+       int i;
+
+       if (unlikely(n == 4 || n == 5))
+               return -EIO;
+
+       if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
+               return -EIO;
+
+       switch (n) {
+       case 0:         child->thread.debugreg0 = data; break;
+       case 1:         child->thread.debugreg1 = data; break;
+       case 2:         child->thread.debugreg2 = data; break;
+       case 3:         child->thread.debugreg3 = data; break;
+
+       case 6:
+               if ((data & ~0xffffffffUL) != 0)
+                       return -EIO;
+               child->thread.debugreg6 = data;
+               break;
+
+       case 7:
+               /*
+                * Sanity-check data. Take one half-byte at once with
+                * check = (val >> (16 + 4*i)) & 0xf. It contains the
+                * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
+                * 2 and 3 are LENi. Given a list of invalid values,
+                * we do mask |= 1 << invalid_value, so that
+                * (mask >> check) & 1 is a correct test for invalid
+                * values.
+                *
+                * R/Wi contains the type of the breakpoint /
+                * watchpoint, LENi contains the length of the watched
+                * data in the watchpoint case.
+                *
+                * The invalid values are:
+                * - LENi == 0x10 (undefined), so mask |= 0x0f00.       [32-bit]
+                * - R/Wi == 0x10 (break on I/O reads or writes), so
+                *   mask |= 0x4444.
+                * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
+                *   0x1110.
+                *
+                * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
+                *
+                * See the Intel Manual "System Programming Guide",
+                * 15.2.4
+                *
+                * Note that LENi == 0x10 is defined on x86_64 in long
+                * mode (i.e. even for 32-bit userspace software, but
+                * 64-bit kernel), so the x86_64 mask value is 0x5554.
+                * See the AMD manual no. 24593 (AMD64 System Programming)
+                */
+#ifdef CONFIG_X86_32
+#define        DR7_MASK        0x5f54
+#else
+#define        DR7_MASK        0x5554
+#endif
+               data &= ~DR_CONTROL_RESERVED;
+               for (i = 0; i < 4; i++)
+                       if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
+                               return -EIO;
+               child->thread.debugreg7 = data;
+               if (data)
+                       set_tsk_thread_flag(child, TIF_DEBUG);
+               else
+                       clear_tsk_thread_flag(child, TIF_DEBUG);
+               break;
+       }
+
+       return 0;
+}
+
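/*
 * Illustrative sketch, not part of the patch above: the DR7 sanity check
 * can be exercised stand-alone.  For each of the four slots,
 * check = (dr7 >> (16 + 4*i)) & 0xf combines R/Wi (bits 0-1) and LENi
 * (bits 2-3), and (mask >> check) & 1 flags the invalid combinations
 * (0x5f54 on 32-bit, 0x5554 in long mode).  DR_CONTROL_RESERVED masking
 * is omitted here for brevity.
 */
#include <stdio.h>

static int dr7_invalid(unsigned long dr7, unsigned int mask)
{
        int i;

        for (i = 0; i < 4; i++)
                if ((mask >> ((dr7 >> (16 + 4 * i)) & 0xf)) & 1)
                        return 1;
        return 0;
}

int main(void)
{
        unsigned long ok  = 0xdUL << 16;  /* slot 0: write, 4 bytes -> valid   */
        unsigned long bad = 0x2UL << 16;  /* slot 0: I/O breakpoint -> invalid */

        printf("%d %d\n", dr7_invalid(ok, 0x5554), dr7_invalid(bad, 0x5554));
        return 0;
}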
+static int ptrace_bts_get_size(struct task_struct *child)
+{
+       if (!child->thread.ds_area_msr)
+               return -ENXIO;
+
+       return ds_get_bts_index((void *)child->thread.ds_area_msr);
+}
+
+static int ptrace_bts_read_record(struct task_struct *child,
+                                 long index,
+                                 struct bts_struct __user *out)
+{
+       struct bts_struct ret;
+       int retval;
+       int bts_end;
+       int bts_index;
+
+       if (!child->thread.ds_area_msr)
+               return -ENXIO;
+
+       if (index < 0)
+               return -EINVAL;
+
+       bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
+       if (bts_end <= index)
+               return -EINVAL;
+
+       /* translate the ptrace bts index into the ds bts index */
+       bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
+       bts_index -= (index + 1);
+       if (bts_index < 0)
+               bts_index += bts_end;
+
+       retval = ds_read_bts((void *)child->thread.ds_area_msr,
+                            bts_index, &ret);
+       if (retval < 0)
+               return retval;
+
+       if (copy_to_user(out, &ret, sizeof(ret)))
+               return -EFAULT;
+
+       return sizeof(ret);
+}
+
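/*
 * Illustrative sketch, not part of the patch above: the index translation in
 * ptrace_bts_read_record() treats the DS buffer as a ring.  ptrace index 0 is
 * the most recently written record, so its slot is "write position - (index+1)",
 * wrapped by the buffer size.  In miniature:
 */
#include <stdio.h>

static int bts_slot(int write_pos, int capacity, int records_ago)
{
        int slot = write_pos - (records_ago + 1);

        if (slot < 0)
                slot += capacity;
        return slot;
}

int main(void)
{
        /* capacity 8, next write at slot 3: record 0 lives in slot 2,
         * record 4 wraps around to slot 6. */
        printf("%d %d\n", bts_slot(3, 8, 0), bts_slot(3, 8, 4));
        return 0;
}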
+static int ptrace_bts_write_record(struct task_struct *child,
+                                  const struct bts_struct *in)
+{
+       int retval;
+
+       if (!child->thread.ds_area_msr)
+               return -ENXIO;
+
+       retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
+       if (retval)
+               return retval;
+
+       return sizeof(*in);
+}
+
+static int ptrace_bts_clear(struct task_struct *child)
+{
+       if (!child->thread.ds_area_msr)
+               return -ENXIO;
+
+       return ds_clear((void *)child->thread.ds_area_msr);
+}
+
+static int ptrace_bts_drain(struct task_struct *child,
+                           long size,
+                           struct bts_struct __user *out)
+{
+       int end, i;
+       void *ds = (void *)child->thread.ds_area_msr;
+
+       if (!ds)
+               return -ENXIO;
+
+       end = ds_get_bts_index(ds);
+       if (end <= 0)
+               return end;
+
+       if (size < (end * sizeof(struct bts_struct)))
+               return -EIO;
+
+       for (i = 0; i < end; i++, out++) {
+               struct bts_struct ret;
+               int retval;
+
+               retval = ds_read_bts(ds, i, &ret);
+               if (retval < 0)
+                       return retval;
+
+               if (copy_to_user(out, &ret, sizeof(ret)))
+                       return -EFAULT;
+       }
+
+       ds_clear(ds);
+
+       return end;
+}
+
+static int ptrace_bts_realloc(struct task_struct *child,
+                             int size, int reduce_size)
+{
+       unsigned long rlim, vm;
+       int ret, old_size;
+
+       if (size < 0)
+               return -EINVAL;
+
+       old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
+       if (old_size < 0)
+               return old_size;
+
+       ret = ds_free((void **)&child->thread.ds_area_msr);
+       if (ret < 0)
+               goto out;
+
+       size >>= PAGE_SHIFT;
+       old_size >>= PAGE_SHIFT;
+
+       current->mm->total_vm  -= old_size;
+       current->mm->locked_vm -= old_size;
+
+       if (size == 0)
+               goto out;
+
+       rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+       vm = current->mm->total_vm  + size;
+       if (rlim < vm) {
+               ret = -ENOMEM;
+
+               if (!reduce_size)
+                       goto out;
+
+               size = rlim - current->mm->total_vm;
+               if (size <= 0)
+                       goto out;
+       }
+
+       rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+       vm = current->mm->locked_vm  + size;
+       if (rlim < vm) {
+               ret = -ENOMEM;
+
+               if (!reduce_size)
+                       goto out;
+
+               size = rlim - current->mm->locked_vm;
+               if (size <= 0)
+                       goto out;
+       }
+
+       ret = ds_allocate((void **)&child->thread.ds_area_msr,
+                         size << PAGE_SHIFT);
+       if (ret < 0)
+               goto out;
+
+       current->mm->total_vm  += size;
+       current->mm->locked_vm += size;
+
+out:
+       if (child->thread.ds_area_msr)
+               set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+       else
+               clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+
+       return ret;
+}
+
+static int ptrace_bts_config(struct task_struct *child,
+                            long cfg_size,
+                            const struct ptrace_bts_config __user *ucfg)
+{
+       struct ptrace_bts_config cfg;
+       int bts_size, ret = 0;
+       void *ds;
+
+       if (cfg_size < sizeof(cfg))
+               return -EIO;
+
+       if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
+               return -EFAULT;
+
+       if ((int)cfg.size < 0)
+               return -EINVAL;
+
+       bts_size = 0;
+       ds = (void *)child->thread.ds_area_msr;
+       if (ds) {
+               bts_size = ds_get_bts_size(ds);
+               if (bts_size < 0)
+                       return bts_size;
+       }
+       cfg.size = PAGE_ALIGN(cfg.size);
+
+       if (bts_size != cfg.size) {
+               ret = ptrace_bts_realloc(child, cfg.size,
+                                        cfg.flags & PTRACE_BTS_O_CUT_SIZE);
+               if (ret < 0)
+                       goto errout;
+
+               ds = (void *)child->thread.ds_area_msr;
+       }
+
+       if (cfg.flags & PTRACE_BTS_O_SIGNAL)
+               ret = ds_set_overflow(ds, DS_O_SIGNAL);
+       else
+               ret = ds_set_overflow(ds, DS_O_WRAP);
+       if (ret < 0)
+               goto errout;
+
+       if (cfg.flags & PTRACE_BTS_O_TRACE)
+               child->thread.debugctlmsr |= ds_debugctl_mask();
+       else
+               child->thread.debugctlmsr &= ~ds_debugctl_mask();
+
+       if (cfg.flags & PTRACE_BTS_O_SCHED)
+               set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+       else
+               clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+
+       ret = sizeof(cfg);
+
+out:
+       if (child->thread.debugctlmsr)
+               set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+       else
+               clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+       return ret;
+
+errout:
+       child->thread.debugctlmsr &= ~ds_debugctl_mask();
+       clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+       goto out;
+}
+
+static int ptrace_bts_status(struct task_struct *child,
+                            long cfg_size,
+                            struct ptrace_bts_config __user *ucfg)
+{
+       void *ds = (void *)child->thread.ds_area_msr;
+       struct ptrace_bts_config cfg;
+
+       if (cfg_size < sizeof(cfg))
+               return -EIO;
+
+       memset(&cfg, 0, sizeof(cfg));
+
+       if (ds) {
+               cfg.size = ds_get_bts_size(ds);
+
+               if (ds_get_overflow(ds) == DS_O_SIGNAL)
+                       cfg.flags |= PTRACE_BTS_O_SIGNAL;
+
+               if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+                   child->thread.debugctlmsr & ds_debugctl_mask())
+                       cfg.flags |= PTRACE_BTS_O_TRACE;
+
+               if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+                       cfg.flags |= PTRACE_BTS_O_SCHED;
+       }
+
+       cfg.bts_size = sizeof(struct bts_struct);
+
+       if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
+               return -EFAULT;
+
+       return sizeof(cfg);
+}
+
+void ptrace_bts_take_timestamp(struct task_struct *tsk,
+                              enum bts_qualifier qualifier)
+{
+       struct bts_struct rec = {
+               .qualifier = qualifier,
+               .variant.jiffies = jiffies_64
+       };
+
+       ptrace_bts_write_record(tsk, &rec);
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+       user_disable_single_step(child);
+#ifdef TIF_SYSCALL_EMU
+       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+       if (child->thread.ds_area_msr) {
+               ptrace_bts_realloc(child, 0, 0);
+               child->thread.debugctlmsr &= ~ds_debugctl_mask();
+               if (!child->thread.debugctlmsr)
+                       clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+               clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+       }
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static const struct user_regset_view user_x86_32_view; /* Initialized below. */
+#endif
+
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+{
+       int ret;
+       unsigned long __user *datap = (unsigned long __user *)data;
+
+       switch (request) {
+       /* read the word at location addr in the USER area. */
+       case PTRACE_PEEKUSR: {
+               unsigned long tmp;
+
+               ret = -EIO;
+               if ((addr & (sizeof(data) - 1)) || addr < 0 ||
+                   addr >= sizeof(struct user))
+                       break;
+
+               tmp = 0;  /* Default return condition */
+               if (addr < sizeof(struct user_regs_struct))
+                       tmp = getreg(child, addr);
+               else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+                        addr <= offsetof(struct user, u_debugreg[7])) {
+                       addr -= offsetof(struct user, u_debugreg[0]);
+                       tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+               }
+               ret = put_user(tmp, datap);
+               break;
+       }
+
+       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+               ret = -EIO;
+               if ((addr & (sizeof(data) - 1)) || addr < 0 ||
+                   addr >= sizeof(struct user))
+                       break;
+
+               if (addr < sizeof(struct user_regs_struct))
+                       ret = putreg(child, addr, data);
+               else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+                        addr <= offsetof(struct user, u_debugreg[7])) {
+                       addr -= offsetof(struct user, u_debugreg[0]);
+                       ret = ptrace_set_debugreg(child,
+                                                 addr / sizeof(data), data);
+               }
+               break;
+
+       case PTRACE_GETREGS:    /* Get all gp regs from the child. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_GENERAL,
+                                          0, sizeof(struct user_regs_struct),
+                                          datap);
+
+       case PTRACE_SETREGS:    /* Set all gp regs in the child. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_GENERAL,
+                                            0, sizeof(struct user_regs_struct),
+                                            datap);
+
+       case PTRACE_GETFPREGS:  /* Get the child FPU state. */
+               return copy_regset_to_user(child,
+                                          task_user_regset_view(current),
+                                          REGSET_FP,
+                                          0, sizeof(struct user_i387_struct),
+                                          datap);
+
+       case PTRACE_SETFPREGS:  /* Set the child FPU state. */
+               return copy_regset_from_user(child,
+                                            task_user_regset_view(current),
+                                            REGSET_FP,
+                                            0, sizeof(struct user_i387_struct),
+                                            datap);
+
+#ifdef CONFIG_X86_32
+       case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+               return copy_regset_to_user(child, &user_x86_32_view,
+                                          REGSET_XFP,
+                                          0, sizeof(struct user_fxsr_struct),
+                                          datap);
+
+       case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+               return copy_regset_from_user(child, &user_x86_32_view,
+                                            REGSET_XFP,
+                                            0, sizeof(struct user_fxsr_struct),
+                                            datap);
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+       case PTRACE_GET_THREAD_AREA:
+               if (addr < 0)
+                       return -EIO;
+               ret = do_get_thread_area(child, addr,
+                                        (struct user_desc __user *) data);
+               break;
+
+       case PTRACE_SET_THREAD_AREA:
+               if (addr < 0)
+                       return -EIO;
+               ret = do_set_thread_area(child, addr,
+                                        (struct user_desc __user *) data, 0);
+               break;
+#endif
+
+#ifdef CONFIG_X86_64
+               /* normal 64bit interface to access TLS data.
+                  Works just like arch_prctl, except that the arguments
+                  are reversed. */
+       case PTRACE_ARCH_PRCTL:
+               ret = do_arch_prctl(child, data, addr);
+               break;
+#endif
+
+       case PTRACE_BTS_CONFIG:
+               ret = ptrace_bts_config
+                       (child, data, (struct ptrace_bts_config __user *)addr);
+               break;
+
+       case PTRACE_BTS_STATUS:
+               ret = ptrace_bts_status
+                       (child, data, (struct ptrace_bts_config __user *)addr);
+               break;
+
+       case PTRACE_BTS_SIZE:
+               ret = ptrace_bts_get_size(child);
+               break;
+
+       case PTRACE_BTS_GET:
+               ret = ptrace_bts_read_record
+                       (child, data, (struct bts_struct __user *) addr);
+               break;
+
+       case PTRACE_BTS_CLEAR:
+               ret = ptrace_bts_clear(child);
+               break;
+
+       case PTRACE_BTS_DRAIN:
+               ret = ptrace_bts_drain
+                       (child, data, (struct bts_struct __user *) addr);
+               break;
+
+       default:
+               ret = ptrace_request(child, request, addr, data);
+               break;
+       }
+
+       return ret;
+}
+
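/*
 * Illustrative sketch, not part of the patch above: PTRACE_PEEKUSR addresses
 * the USER area by byte offset, registers first, then u_debugreg[0..7].
 * A minimal tracer using that interface from user space (Linux-specific,
 * error handling elided):
 */
#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);         /* stop until the tracer looks at us */
                _exit(0);
        }
        waitpid(child, NULL, 0);

        long dr0 = ptrace(PTRACE_PEEKUSER, child,
                          (void *)offsetof(struct user, u_debugreg[0]), NULL);
        long dr7 = ptrace(PTRACE_PEEKUSER, child,
                          (void *)offsetof(struct user, u_debugreg[7]), NULL);
        printf("dr0=%#lx dr7=%#lx\n", dr0, dr7);

        ptrace(PTRACE_DETACH, child, NULL, NULL);
        return 0;
}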
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <asm/ia32.h>
+#include <asm/user32.h>
+
+#define R32(l,q)                                                       \
+       case offsetof(struct user32, regs.l):                           \
+               regs->q = value; break
+
+#define SEG32(rs)                                                      \
+       case offsetof(struct user32, regs.rs):                          \
+               return set_segment_reg(child,                           \
+                                      offsetof(struct user_regs_struct, rs), \
+                                      value);                          \
+               break
+
+static int putreg32(struct task_struct *child, unsigned regno, u32 value)
+{
+       struct pt_regs *regs = task_pt_regs(child);
+
+       switch (regno) {
+
+       SEG32(cs);
+       SEG32(ds);
+       SEG32(es);
+       SEG32(fs);
+       SEG32(gs);
+       SEG32(ss);
+
+       R32(ebx, bx);
+       R32(ecx, cx);
+       R32(edx, dx);
+       R32(edi, di);
+       R32(esi, si);
+       R32(ebp, bp);
+       R32(eax, ax);
+       R32(orig_eax, orig_ax);
+       R32(eip, ip);
+       R32(esp, sp);
+
+       case offsetof(struct user32, regs.eflags):
+               return set_flags(child, value);
+
+       case offsetof(struct user32, u_debugreg[0]) ...
+               offsetof(struct user32, u_debugreg[7]):
+               regno -= offsetof(struct user32, u_debugreg[0]);
+               return ptrace_set_debugreg(child, regno / 4, value);
+
+       default:
+               if (regno > sizeof(struct user32) || (regno & 3))
+                       return -EIO;
+
+               /*
+                * Other dummy fields in the virtual user structure
+                * are ignored
+                */
+               break;
+       }
+       return 0;
+}
+
+#undef R32
+#undef SEG32
+
+#define R32(l,q)                                                       \
+       case offsetof(struct user32, regs.l):                           \
+               *val = regs->q; break
+
+#define SEG32(rs)                                                      \
+       case offsetof(struct user32, regs.rs):                          \
+               *val = get_segment_reg(child,                           \
+                                      offsetof(struct user_regs_struct, rs)); \
+               break
+
+static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
+{
+       struct pt_regs *regs = task_pt_regs(child);
+
+       switch (regno) {
+
+       SEG32(ds);
+       SEG32(es);
+       SEG32(fs);
+       SEG32(gs);
+
+       R32(cs, cs);
+       R32(ss, ss);
+       R32(ebx, bx);
+       R32(ecx, cx);
+       R32(edx, dx);
+       R32(edi, di);
+       R32(esi, si);
+       R32(ebp, bp);
+       R32(eax, ax);
+       R32(orig_eax, orig_ax);
+       R32(eip, ip);
+       R32(esp, sp);
+
+       case offsetof(struct user32, regs.eflags):
+               *val = get_flags(child);
+               break;
+
+       case offsetof(struct user32, u_debugreg[0]) ...
+               offsetof(struct user32, u_debugreg[7]):
+               regno -= offsetof(struct user32, u_debugreg[0]);
+               *val = ptrace_get_debugreg(child, regno / 4);
+               break;
+
+       default:
+               if (regno > sizeof(struct user32) || (regno & 3))
+                       return -EIO;
+
+               /*
+                * Other dummy fields in the virtual user structure
+                * are ignored
+                */
+               *val = 0;
+               break;
+       }
+       return 0;
+}
+
+#undef R32
+#undef SEG32
+
+static int genregs32_get(struct task_struct *target,
+                        const struct user_regset *regset,
+                        unsigned int pos, unsigned int count,
+                        void *kbuf, void __user *ubuf)
+{
+       if (kbuf) {
+               compat_ulong_t *k = kbuf;
+               while (count > 0) {
+                       getreg32(target, pos, k++);
+                       count -= sizeof(*k);
+                       pos += sizeof(*k);
+               }
+       } else {
+               compat_ulong_t __user *u = ubuf;
+               while (count > 0) {
+                       compat_ulong_t word;
+                       getreg32(target, pos, &word);
+                       if (__put_user(word, u++))
+                               return -EFAULT;
+                       count -= sizeof(*u);
+                       pos += sizeof(*u);
+               }
+       }
+
+       return 0;
+}
+
+static int genregs32_set(struct task_struct *target,
+                        const struct user_regset *regset,
+                        unsigned int pos, unsigned int count,
+                        const void *kbuf, const void __user *ubuf)
+{
+       int ret = 0;
+       if (kbuf) {
+               const compat_ulong_t *k = kbuf;
+               while (count > 0 && !ret) {
+                       ret = putreg(target, pos, *k++);
+                       count -= sizeof(*k);
+                       pos += sizeof(*k);
+               }
+       } else {
+               const compat_ulong_t __user *u = ubuf;
+               while (count > 0 && !ret) {
+                       compat_ulong_t word;
+                       ret = __get_user(word, u++);
+                       if (ret)
+                               break;
+                       ret = putreg(target, pos, word);
+                       count -= sizeof(*u);
+                       pos += sizeof(*u);
+               }
+       }
+       return ret;
+}
+
+static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
+{
+       siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
+       compat_siginfo_t __user *si32 = compat_ptr(data);
+       siginfo_t ssi;
+       int ret;
+
+       if (request == PTRACE_SETSIGINFO) {
+               memset(&ssi, 0, sizeof(siginfo_t));
+               ret = copy_siginfo_from_user32(&ssi, si32);
+               if (ret)
+                       return ret;
+               if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
+                       return -EFAULT;
+       }
+       ret = sys_ptrace(request, pid, addr, (unsigned long)si);
+       if (ret)
+               return ret;
+       if (request == PTRACE_GETSIGINFO) {
+               if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
+                       return -EFAULT;
+               ret = copy_siginfo_to_user32(si32, &ssi);
+       }
+       return ret;
+}
+
+asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
+{
+       struct task_struct *child;
+       struct pt_regs *childregs;
+       void __user *datap = compat_ptr(data);
+       int ret;
+       __u32 val;
+
+       switch (request) {
+       case PTRACE_TRACEME:
+       case PTRACE_ATTACH:
+       case PTRACE_KILL:
+       case PTRACE_CONT:
+       case PTRACE_SINGLESTEP:
+       case PTRACE_SINGLEBLOCK:
+       case PTRACE_DETACH:
+       case PTRACE_SYSCALL:
+       case PTRACE_OLDSETOPTIONS:
+       case PTRACE_SETOPTIONS:
+       case PTRACE_SET_THREAD_AREA:
+       case PTRACE_GET_THREAD_AREA:
+       case PTRACE_BTS_CONFIG:
+       case PTRACE_BTS_STATUS:
+       case PTRACE_BTS_SIZE:
+       case PTRACE_BTS_GET:
+       case PTRACE_BTS_CLEAR:
+       case PTRACE_BTS_DRAIN:
+               return sys_ptrace(request, pid, addr, data);
+
+       default:
+               return -EINVAL;
+
+       case PTRACE_PEEKTEXT:
+       case PTRACE_PEEKDATA:
+       case PTRACE_POKEDATA:
+       case PTRACE_POKETEXT:
+       case PTRACE_POKEUSR:
+       case PTRACE_PEEKUSR:
+       case PTRACE_GETREGS:
+       case PTRACE_SETREGS:
+       case PTRACE_SETFPREGS:
+       case PTRACE_GETFPREGS:
+       case PTRACE_SETFPXREGS:
+       case PTRACE_GETFPXREGS:
+       case PTRACE_GETEVENTMSG:
+               break;
+
+       case PTRACE_SETSIGINFO:
+       case PTRACE_GETSIGINFO:
+               return ptrace32_siginfo(request, pid, addr, data);
+       }
+
+       child = ptrace_get_task_struct(pid);
+       if (IS_ERR(child))
+               return PTR_ERR(child);
+
+       ret = ptrace_check_attach(child, request == PTRACE_KILL);
+       if (ret < 0)
+               goto out;
+
+       childregs = task_pt_regs(child);
+
+       switch (request) {
+       case PTRACE_PEEKUSR:
+               ret = getreg32(child, addr, &val);
+               if (ret == 0)
+                       ret = put_user(val, (__u32 __user *)datap);
+               break;
+
+       case PTRACE_POKEUSR:
+               ret = putreg32(child, addr, data);
+               break;
+
+       case PTRACE_GETREGS:    /* Get all gp regs from the child. */
+               return copy_regset_to_user(child, &user_x86_32_view,
+                                          REGSET_GENERAL,
+                                          0, sizeof(struct user_regs_struct32),
+                                          datap);
+
+       case PTRACE_SETREGS:    /* Set all gp regs in the child. */
+               return copy_regset_from_user(child, &user_x86_32_view,
+                                            REGSET_GENERAL, 0,
+                                            sizeof(struct user_regs_struct32),
+                                            datap);
+
+       case PTRACE_GETFPREGS:  /* Get the child FPU state. */
+               return copy_regset_to_user(child, &user_x86_32_view,
+                                          REGSET_FP, 0,
+                                          sizeof(struct user_i387_ia32_struct),
+                                          datap);
+
+       case PTRACE_SETFPREGS:  /* Set the child FPU state. */
+               return copy_regset_from_user(
+                       child, &user_x86_32_view, REGSET_FP,
+                       0, sizeof(struct user_i387_ia32_struct), datap);
+
+       case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+               return copy_regset_to_user(child, &user_x86_32_view,
+                                          REGSET_XFP, 0,
+                                          sizeof(struct user32_fxsr_struct),
+                                          datap);
+
+       case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+               return copy_regset_from_user(child, &user_x86_32_view,
+                                            REGSET_XFP, 0,
+                                            sizeof(struct user32_fxsr_struct),
+                                            datap);
+
+       default:
+               return compat_ptrace_request(child, request, addr, data);
+       }
+
+ out:
+       put_task_struct(child);
+       return ret;
+}
+
+#endif /* CONFIG_IA32_EMULATION */
+
+#ifdef CONFIG_X86_64
+
+static const struct user_regset x86_64_regsets[] = {
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS,
+               .n = sizeof(struct user_regs_struct) / sizeof(long),
+               .size = sizeof(long), .align = sizeof(long),
+               .get = genregs_get, .set = genregs_set
+       },
+       [REGSET_FP] = {
+               .core_note_type = NT_PRFPREG,
+               .n = sizeof(struct user_i387_struct) / sizeof(long),
+               .size = sizeof(long), .align = sizeof(long),
+               .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+       },
+};
+
+static const struct user_regset_view user_x86_64_view = {
+       .name = "x86_64", .e_machine = EM_X86_64,
+       .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
+};
+
+#else  /* CONFIG_X86_32 */
+
+#define user_regs_struct32     user_regs_struct
+#define genregs32_get          genregs_get
+#define genregs32_set          genregs_set
+
+#endif /* CONFIG_X86_64 */
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static const struct user_regset x86_32_regsets[] = {
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS,
+               .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+               .size = sizeof(u32), .align = sizeof(u32),
+               .get = genregs32_get, .set = genregs32_set
+       },
+       [REGSET_FP] = {
+               .core_note_type = NT_PRFPREG,
+               .n = sizeof(struct user_i387_struct) / sizeof(u32),
+               .size = sizeof(u32), .align = sizeof(u32),
+               .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+       },
+       [REGSET_XFP] = {
+               .core_note_type = NT_PRXFPREG,
+               .n = sizeof(struct user_i387_struct) / sizeof(u32),
+               .size = sizeof(u32), .align = sizeof(u32),
+               .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+       },
+       [REGSET_TLS] = {
+               .core_note_type = NT_386_TLS,
+               .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
+               .size = sizeof(struct user_desc),
+               .align = sizeof(struct user_desc),
+               .active = regset_tls_active,
+               .get = regset_tls_get, .set = regset_tls_set
+       },
+};
+
+static const struct user_regset_view user_x86_32_view = {
+       .name = "i386", .e_machine = EM_386,
+       .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
+};
+#endif
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+#ifdef CONFIG_IA32_EMULATION
+       if (test_tsk_thread_flag(task, TIF_IA32))
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+               return &user_x86_32_view;
+#endif
+#ifdef CONFIG_X86_64
+       return &user_x86_64_view;
+#endif
+}
+
+#ifdef CONFIG_X86_32
+
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+{
+       struct siginfo info;
+
+       tsk->thread.trap_no = 1;
+       tsk->thread.error_code = error_code;
+
+       memset(&info, 0, sizeof(info));
+       info.si_signo = SIGTRAP;
+       info.si_code = TRAP_BRKPT;
+
+       /* User-mode ip? */
+       info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
+
+       /* Send us the fake SIGTRAP */
+       force_sig_info(SIGTRAP, &info, tsk);
+}
+
+/* notification of system call entry/exit
+ * - triggered by current->work.syscall_trace
+ */
+__attribute__((regparm(3)))
+int do_syscall_trace(struct pt_regs *regs, int entryexit)
+{
+       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
+       /*
+        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
+        * interception
+        */
+       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
+       int ret = 0;
+
+       /* do the secure computing check first */
+       if (!entryexit)
+               secure_computing(regs->orig_ax);
+
+       if (unlikely(current->audit_context)) {
+               if (entryexit)
+                       audit_syscall_exit(AUDITSC_RESULT(regs->ax),
+                                               regs->ax);
+               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
+                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
+                * not used, entry.S will call us only on syscall exit, not
+                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
+                * calling send_sigtrap() on syscall entry.
+                *
+                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
+                * is_singlestep is false, despite its name, so we will still do
+                * the correct thing.
+                */
+               else if (is_singlestep)
+                       goto out;
+       }
+
+       if (!(current->ptrace & PT_PTRACED))
+               goto out;
+
+       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
+        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
+        * here. We have to check this and return */
+       if (is_sysemu && entryexit)
+               return 0;
+
+       /* Fake a debug trap */
+       if (is_singlestep)
+               send_sigtrap(current, regs, 0);
+
+       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
+               goto out;
+
+       /* the 0x80 provides a way for the tracing parent to distinguish
+          between a syscall stop and SIGTRAP delivery */
+       /* Note that the debugger could change the result of test_thread_flag!*/
+       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
+
+       /*
+        * this isn't the same as continuing with a signal, but it will do
+        * for normal use.  strace only continues with a signal if the
+        * stopping signal is not SIGTRAP.  -brl
+        */
+       if (current->exit_code) {
+               send_sig(current->exit_code, current, 1);
+               current->exit_code = 0;
+       }
+       ret = is_sysemu;
+out:
+       if (unlikely(current->audit_context) && !entryexit)
+               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
+                                   regs->bx, regs->cx, regs->dx, regs->si);
+       if (ret == 0)
+               return 0;
+
+       regs->orig_ax = -1; /* force skip of syscall restarting */
+       if (unlikely(current->audit_context))
+               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+       return 1;
+}
+
+#else  /* CONFIG_X86_64 */
+
+static void syscall_trace(struct pt_regs *regs)
+{
+
+#if 0
+       printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
+              current->comm,
+              regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
+              current_thread_info()->flags, current->ptrace);
+#endif
+
+       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+                               ? 0x80 : 0));
+       /*
+        * this isn't the same as continuing with a signal, but it will do
+        * for normal use.  strace only continues with a signal if the
+        * stopping signal is not SIGTRAP.  -brl
+        */
+       if (current->exit_code) {
+               send_sig(current->exit_code, current, 1);
+               current->exit_code = 0;
+       }
+}
+
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+       /* do the secure computing check first */
+       secure_computing(regs->orig_ax);
+
+       if (test_thread_flag(TIF_SYSCALL_TRACE)
+           && (current->ptrace & PT_PTRACED))
+               syscall_trace(regs);
+
+       if (unlikely(current->audit_context)) {
+               if (test_thread_flag(TIF_IA32)) {
+                       audit_syscall_entry(AUDIT_ARCH_I386,
+                                           regs->orig_ax,
+                                           regs->bx, regs->cx,
+                                           regs->dx, regs->si);
+               } else {
+                       audit_syscall_entry(AUDIT_ARCH_X86_64,
+                                           regs->orig_ax,
+                                           regs->di, regs->si,
+                                           regs->dx, regs->r10);
+               }
+       }
+}
+
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+       if (unlikely(current->audit_context))
+               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+
+       if ((test_thread_flag(TIF_SYSCALL_TRACE)
+            || test_thread_flag(TIF_SINGLESTEP))
+           && (current->ptrace & PT_PTRACED))
+               syscall_trace(regs);
+}
+
+#endif /* CONFIG_X86_32 */
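
The unified arch_ptrace()/sys32_ptrace() above now serves PTRACE_GETREGS, PTRACE_SETREGS and the FP/XFP variants through the user_regset machinery (copy_regset_to_user()/copy_regset_from_user()) rather than the per-arch open-coded register loops removed below. As a rough illustration of the userspace side of that path (a hypothetical tracer, not part of this commit), the call sequence looks roughly like this:

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {
                /* Child: request tracing, then exec a throwaway target. */
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                execl("/bin/true", "true", (char *)NULL);
                _exit(1);
        }

        /* Parent: wait for the exec stop, then fetch the GP registers. */
        waitpid(pid, NULL, 0);

        struct user_regs_struct regs;
        if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) == -1) {
                perror("PTRACE_GETREGS");
                return 1;
        }
#ifdef __x86_64__
        printf("child ip=%llx sp=%llx\n",
               (unsigned long long)regs.rip, (unsigned long long)regs.rsp);
#else
        printf("child ip=%lx sp=%lx\n",
               (unsigned long)regs.eip, (unsigned long)regs.esp);
#endif

        ptrace(PTRACE_CONT, pid, NULL, NULL);
        waitpid(pid, NULL, 0);
        return 0;
}

On earlier kernels the same request was answered by the hand-rolled loops in the ptrace_32.c/ptrace_64.c files deleted below; the userspace-visible behaviour is meant to stay the same.
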
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
deleted file mode 100644 (file)
index ff5431c..0000000
+++ /dev/null
@@ -1,717 +0,0 @@
-/* By Ross Biro 1/23/92 */
-/*
- * Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/debugreg.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-
-/*
- * does not yet catch signals sent when the child dies
- * in exit.c or in signal.c.
- */
-
-/*
- * Determines which flags the user has access to [1 = access, 0 = no access].
- * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
- * Also masks reserved bits (31-22, 15, 5, 3, 1).
- */
-#define FLAG_MASK 0x00050dd5
-
-/* sets the trap flag. */
-#define TRAP_FLAG 0x100
-
-/*
- * Offset of eflags on child stack..
- */
-#define EFL_OFFSET offsetof(struct pt_regs, eflags)
-
-static inline struct pt_regs *get_child_regs(struct task_struct *task)
-{
-       void *stack_top = (void *)task->thread.esp0;
-       return stack_top - sizeof(struct pt_regs);
-}
-
-/*
- * This routine will get a word off of the process's privileged stack.
- * The offset is the byte offset into the pt_regs structure on the stack.
- * This routine assumes that all the privileged stacks are in our
- * data space.
- */   
-static inline int get_stack_long(struct task_struct *task, int offset)
-{
-       unsigned char *stack;
-
-       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
-       stack += offset;
-       return (*((int *)stack));
-}
-
-/*
- * This routine will put a word on the process's privileged stack.
- * The offset is the byte offset into the pt_regs structure on the stack.
- * This routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline int put_stack_long(struct task_struct *task, int offset,
-       unsigned long data)
-{
-       unsigned char * stack;
-
-       stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
-       stack += offset;
-       *(unsigned long *) stack = data;
-       return 0;
-}
-
-static int putreg(struct task_struct *child,
-       unsigned long regno, unsigned long value)
-{
-       switch (regno >> 2) {
-               case GS:
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.gs = value;
-                       return 0;
-               case DS:
-               case ES:
-               case FS:
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-               case SS:
-               case CS:
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-               case EFL:
-                       value &= FLAG_MASK;
-                       value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
-                       break;
-       }
-       if (regno > FS*4)
-               regno -= 1*4;
-       put_stack_long(child, regno, value);
-       return 0;
-}
-
-static unsigned long getreg(struct task_struct *child,
-       unsigned long regno)
-{
-       unsigned long retval = ~0UL;
-
-       switch (regno >> 2) {
-               case GS:
-                       retval = child->thread.gs;
-                       break;
-               case DS:
-               case ES:
-               case FS:
-               case SS:
-               case CS:
-                       retval = 0xffff;
-                       /* fall through */
-               default:
-                       if (regno > FS*4)
-                               regno -= 1*4;
-                       retval &= get_stack_long(child, regno);
-       }
-       return retval;
-}
-
-#define LDT_SEGMENT 4
-
-static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
-{
-       unsigned long addr, seg;
-
-       addr = regs->eip;
-       seg = regs->xcs & 0xffff;
-       if (regs->eflags & VM_MASK) {
-               addr = (addr & 0xffff) + (seg << 4);
-               return addr;
-       }
-
-       /*
-        * We'll assume that the code segments in the GDT
-        * are all zero-based. That is largely true: the
-        * TLS segments are used for data, and the PNPBIOS
-        * and APM bios ones we just ignore here.
-        */
-       if (seg & LDT_SEGMENT) {
-               u32 *desc;
-               unsigned long base;
-
-               seg &= ~7UL;
-
-               mutex_lock(&child->mm->context.lock);
-               if (unlikely((seg >> 3) >= child->mm->context.size))
-                       addr = -1L; /* bogus selector, access would fault */
-               else {
-                       desc = child->mm->context.ldt + seg;
-                       base = ((desc[0] >> 16) |
-                               ((desc[1] & 0xff) << 16) |
-                               (desc[1] & 0xff000000));
-
-                       /* 16-bit code segment? */
-                       if (!((desc[1] >> 22) & 1))
-                               addr &= 0xffff;
-                       addr += base;
-               }
-               mutex_unlock(&child->mm->context.lock);
-       }
-       return addr;
-}
-
-static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
-{
-       int i, copied;
-       unsigned char opcode[15];
-       unsigned long addr = convert_eip_to_linear(child, regs);
-
-       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
-       for (i = 0; i < copied; i++) {
-               switch (opcode[i]) {
-               /* popf and iret */
-               case 0x9d: case 0xcf:
-                       return 1;
-               /* opcode and address size prefixes */
-               case 0x66: case 0x67:
-                       continue;
-               /* irrelevant prefixes (segment overrides and repeats) */
-               case 0x26: case 0x2e:
-               case 0x36: case 0x3e:
-               case 0x64: case 0x65:
-               case 0xf0: case 0xf2: case 0xf3:
-                       continue;
-
-               /*
-                * pushf: NOTE! We should probably not let
-                * the user see the TF bit being set. But
-                * it's more pain than it's worth to avoid
-                * it, and a debugger could emulate this
-                * all in user space if it _really_ cares.
-                */
-               case 0x9c:
-               default:
-                       return 0;
-               }
-       }
-       return 0;
-}
-
-static void set_singlestep(struct task_struct *child)
-{
-       struct pt_regs *regs = get_child_regs(child);
-
-       /*
-        * Always set TIF_SINGLESTEP - this guarantees that 
-        * we single-step system calls etc..  This will also
-        * cause us to set TF when returning to user mode.
-        */
-       set_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->eflags & TRAP_FLAG)
-               return;
-
-       /* Set TF on the kernel stack.. */
-       regs->eflags |= TRAP_FLAG;
-
-       /*
-        * ..but if TF is changed by the instruction we will trace,
-        * don't mark it as being "us" that set it, so that we
-        * won't clear it by hand later.
-        */
-       if (is_setting_trap_flag(child, regs))
-               return;
-       
-       child->ptrace |= PT_DTRACE;
-}
-
-static void clear_singlestep(struct task_struct *child)
-{
-       /* Always clear TIF_SINGLESTEP... */
-       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /* But touch TF only if it was set by us.. */
-       if (child->ptrace & PT_DTRACE) {
-               struct pt_regs *regs = get_child_regs(child);
-               regs->eflags &= ~TRAP_FLAG;
-               child->ptrace &= ~PT_DTRACE;
-       }
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void ptrace_disable(struct task_struct *child)
-{ 
-       clear_singlestep(child);
-       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-}
-
-/*
- * Perform get_thread_area on behalf of the traced child.
- */
-static int
-ptrace_get_thread_area(struct task_struct *child,
-                      int idx, struct user_desc __user *user_desc)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-
-/*
- * Get the current Thread-Local Storage area:
- */
-
-#define GET_BASE(desc) ( \
-       (((desc)->a >> 16) & 0x0000ffff) | \
-       (((desc)->b << 16) & 0x00ff0000) | \
-       ( (desc)->b        & 0xff000000)   )
-
-#define GET_LIMIT(desc) ( \
-       ((desc)->a & 0x0ffff) | \
-        ((desc)->b & 0xf0000) )
-
-#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
-#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
-#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
-#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
-#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
-#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-
-       info.entry_number = idx;
-       info.base_addr = GET_BASE(desc);
-       info.limit = GET_LIMIT(desc);
-       info.seg_32bit = GET_32BIT(desc);
-       info.contents = GET_CONTENTS(desc);
-       info.read_exec_only = !GET_WRITABLE(desc);
-       info.limit_in_pages = GET_LIMIT_PAGES(desc);
-       info.seg_not_present = !GET_PRESENT(desc);
-       info.useable = GET_USEABLE(desc);
-
-       if (copy_to_user(user_desc, &info, sizeof(info)))
-               return -EFAULT;
-
-       return 0;
-}
-
-/*
- * Perform set_thread_area on behalf of the traced child.
- */
-static int
-ptrace_set_thread_area(struct task_struct *child,
-                      int idx, struct user_desc __user *user_desc)
-{
-       struct user_desc info;
-       struct desc_struct *desc;
-
-       if (copy_from_user(&info, user_desc, sizeof(info)))
-               return -EFAULT;
-
-       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
-               return -EINVAL;
-
-       desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
-       if (LDT_empty(&info)) {
-               desc->a = 0;
-               desc->b = 0;
-       } else {
-               desc->a = LDT_entry_a(&info);
-               desc->b = LDT_entry_b(&info);
-       }
-
-       return 0;
-}
-
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
-{
-       struct user * dummy = NULL;
-       int i, ret;
-       unsigned long __user *datap = (unsigned long __user *)data;
-
-       switch (request) {
-       /* when I and D space are separate, these will need to be fixed. */
-       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-       case PTRACE_PEEKDATA:
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
-       /* read the word at location addr in the USER area. */
-       case PTRACE_PEEKUSR: {
-               unsigned long tmp;
-
-               ret = -EIO;
-               if ((addr & 3) || addr < 0 || 
-                   addr > sizeof(struct user) - 3)
-                       break;
-
-               tmp = 0;  /* Default return condition */
-               if(addr < FRAME_SIZE*sizeof(long))
-                       tmp = getreg(child, addr);
-               if(addr >= (long) &dummy->u_debugreg[0] &&
-                  addr <= (long) &dummy->u_debugreg[7]){
-                       addr -= (long) &dummy->u_debugreg[0];
-                       addr = addr >> 2;
-                       tmp = child->thread.debugreg[addr];
-               }
-               ret = put_user(tmp, datap);
-               break;
-       }
-
-       /* when I and D space are separate, this will have to be fixed. */
-       case PTRACE_POKETEXT: /* write the word at location addr. */
-       case PTRACE_POKEDATA:
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
-       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
-               ret = -EIO;
-               if ((addr & 3) || addr < 0 || 
-                   addr > sizeof(struct user) - 3)
-                       break;
-
-               if (addr < FRAME_SIZE*sizeof(long)) {
-                       ret = putreg(child, addr, data);
-                       break;
-               }
-               /* We need to be very careful here.  We implicitly
-                  want to modify a portion of the task_struct, and we
-                  have to be selective about what portions we allow someone
-                  to modify. */
-
-                 ret = -EIO;
-                 if(addr >= (long) &dummy->u_debugreg[0] &&
-                    addr <= (long) &dummy->u_debugreg[7]){
-
-                         if(addr == (long) &dummy->u_debugreg[4]) break;
-                         if(addr == (long) &dummy->u_debugreg[5]) break;
-                         if(addr < (long) &dummy->u_debugreg[4] &&
-                            ((unsigned long) data) >= TASK_SIZE-3) break;
-                         
-                         /* Sanity-check data. Take one half-byte at a time with
-                          * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                          * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                          * 2 and 3 are LENi. Given a list of invalid values,
-                          * we do mask |= 1 << invalid_value, so that
-                          * (mask >> check) & 1 is a correct test for invalid
-                          * values.
-                          *
-                          * R/Wi contains the type of the breakpoint /
-                          * watchpoint, LENi contains the length of the watched
-                          * data in the watchpoint case.
-                          *
-                          * The invalid values are:
-                          * - LENi == 0x10 (undefined), so mask |= 0x0f00.
-                          * - R/Wi == 0x10 (break on I/O reads or writes), so
-                          *   mask |= 0x4444.
-                          * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                          *   0x1110.
-                          *
-                          * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                          *
-                          * See the Intel Manual "System Programming Guide",
-                          * 15.2.4
-                          *
-                          * Note that LENi == 0x10 is defined on x86_64 in long
-                          * mode (i.e. even for 32-bit userspace software, but
-                          * 64-bit kernel), so the x86_64 mask value is 0x5554.
-                          * See the AMD manual no. 24593 (AMD64 System
-                          * Programming)*/
-
-                         if(addr == (long) &dummy->u_debugreg[7]) {
-                                 data &= ~DR_CONTROL_RESERVED;
-                                 for(i=0; i<4; i++)
-                                         if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                                                 goto out_tsk;
-                                 if (data)
-                                         set_tsk_thread_flag(child, TIF_DEBUG);
-                                 else
-                                         clear_tsk_thread_flag(child, TIF_DEBUG);
-                         }
-                         addr -= (long) &dummy->u_debugreg;
-                         addr = addr >> 2;
-                         child->thread.debugreg[addr] = data;
-                         ret = 0;
-                 }
-                 break;
-
-       case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
-       case PTRACE_SYSCALL:    /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT:       /* restart after signal. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSEMU) {
-                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               } else if (request == PTRACE_SYSCALL) {
-                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-               } else {
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               }
-               child->exit_code = data;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-/*
- * make the child exit.  Best I can do is send it a sigkill. 
- * perhaps it should be put in the status that it wants to 
- * exit.
- */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               child->exit_code = SIGKILL;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
-       case PTRACE_SINGLESTEP: /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-
-               if (request == PTRACE_SYSEMU_SINGLESTEP)
-                       set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-               else
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-
-               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               set_singlestep(child);
-               child->exit_code = data;
-               /* give it a chance to run. */
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-               if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
-                       ret = -EIO;
-                       break;
-               }
-               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
-                       __put_user(getreg(child, i), datap);
-                       datap++;
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-               if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
-                       ret = -EIO;
-                       break;
-               }
-               for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
-                       __get_user(tmp, datap);
-                       putreg(child, i, tmp);
-                       datap++;
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child FPU state. */
-               if (!access_ok(VERIFY_WRITE, datap,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               if (!tsk_used_math(child))
-                       init_fpu(child);
-               get_fpregs((struct user_i387_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child FPU state. */
-               if (!access_ok(VERIFY_READ, datap,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               set_fpregs(child, (struct user_i387_struct __user *)data);
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
-               if (!access_ok(VERIFY_WRITE, datap,
-                              sizeof(struct user_fxsr_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               if (!tsk_used_math(child))
-                       init_fpu(child);
-               ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
-               if (!access_ok(VERIFY_READ, datap,
-                              sizeof(struct user_fxsr_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
-               break;
-       }
-
-       case PTRACE_GET_THREAD_AREA:
-               ret = ptrace_get_thread_area(child, addr,
-                                       (struct user_desc __user *) data);
-               break;
-
-       case PTRACE_SET_THREAD_AREA:
-               ret = ptrace_set_thread_area(child, addr,
-                                       (struct user_desc __user *) data);
-               break;
-
-       default:
-               ret = ptrace_request(child, request, addr, data);
-               break;
-       }
- out_tsk:
-       return ret;
-}
-
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
-{
-       struct siginfo info;
-
-       tsk->thread.trap_no = 1;
-       tsk->thread.error_code = error_code;
-
-       memset(&info, 0, sizeof(info));
-       info.si_signo = SIGTRAP;
-       info.si_code = TRAP_BRKPT;
-
-       /* User-mode eip? */
-       info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
-
-       /* Send us the fake SIGTRAP */
-       force_sig_info(SIGTRAP, &info, tsk);
-}
-
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-__attribute__((regparm(3)))
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-       int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-       /*
-        * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-        * interception
-        */
-       int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-       int ret = 0;
-
-       /* do the secure computing check first */
-       if (!entryexit)
-               secure_computing(regs->orig_eax);
-
-       if (unlikely(current->audit_context)) {
-               if (entryexit)
-                       audit_syscall_exit(AUDITSC_RESULT(regs->eax),
-                                               regs->eax);
-               /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-                * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-                * not used, entry.S will call us only on syscall exit, not
-                * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-                * calling send_sigtrap() on syscall entry.
-                *
-                * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-                * is_singlestep is false, despite its name, so we will still do
-                * the correct thing.
-                */
-               else if (is_singlestep)
-                       goto out;
-       }
-
-       if (!(current->ptrace & PT_PTRACED))
-               goto out;
-
-       /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-        * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-        * here. We have to check this and return */
-       if (is_sysemu && entryexit)
-               return 0;
-
-       /* Fake a debug trap */
-       if (is_singlestep)
-               send_sigtrap(current, regs, 0);
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-               goto out;
-
-       /* the 0x80 provides a way for the tracing parent to distinguish
-          between a syscall stop and SIGTRAP delivery */
-       /* Note that the debugger could change the result of test_thread_flag!*/
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
-
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-       ret = is_sysemu;
-out:
-       if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
-                                   regs->ebx, regs->ecx, regs->edx, regs->esi);
-       if (ret == 0)
-               return 0;
-
-       regs->orig_eax = -1; /* force skip of syscall restarting */
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
-       return 1;
-}
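
For reference, the PTRACE_POKEUSR path deleted above vets writes to u_debugreg[7] with the mask whose derivation its comment spells out: 0x0f00 | 0x4444 | 0x1110 == 0x5f54, one bit per invalid 4-bit R/W+LEN field. A standalone sketch of that check (the helper name and the sample values are illustrative, not taken from the kernel) could read:

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the DR7 sanity check from the deleted 32-bit code: each of
 * the four debug-register slots owns a 4-bit field at bits 16 + 4*i,
 * holding R/Wi in the low two bits and LENi in the high two bits.  The
 * mask 0x5f54 has a 1 at every invalid field value, so testing
 * (mask >> field) & 1 rejects exactly those values.
 */
static bool dr7_fields_valid(unsigned long dr7)
{
        int i;

        for (i = 0; i < 4; i++) {
                unsigned int field = (dr7 >> (16 + 4 * i)) & 0xf;

                if ((0x5f54 >> field) & 1)
                        return false;
        }
        return true;
}

int main(void)
{
        /* field 0x1: R/W0 = 01 (data write), LEN0 = 00 (1 byte): valid */
        printf("%d\n", dr7_fields_valid(0x1UL << 16));
        /* field 0x2: R/W0 = 10 (I/O breakpoint): rejected */
        printf("%d\n", dr7_fields_valid(0x2UL << 16));
        return 0;
}

The 64-bit file below applies the same test with mask 0x5554, since an 8-byte length (LENi == 10b) is legal in long mode, while I/O breakpoints and a non-zero length on a disabled slot remain invalid.
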
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
deleted file mode 100644 (file)
index 607085f..0000000
+++ /dev/null
@@ -1,621 +0,0 @@
-/* By Ross Biro 1/23/92 */
-/*
- * Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
- * x86-64 port 2000-2002 Andi Kleen
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/debugreg.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-#include <asm/ia32.h>
-
-/*
- * does not yet catch signals sent when the child dies
- * in exit.c or in signal.c.
- */
-
-/*
- * Determines which flags the user has access to [1 = access, 0 = no access].
- * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
- * Also masks reserved bits (63-22, 15, 5, 3, 1).
- */
-#define FLAG_MASK 0x54dd5UL
-
-/* sets the trap flag. */
-#define TRAP_FLAG 0x100UL
-
-/*
- * eflags and offset of eflags on child stack..
- */
-#define EFLAGS offsetof(struct pt_regs, eflags)
-#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
-
-/*
- * this routine will get a word off of the process's privileged stack.
- * the offset is how far from the base address stored in the TSS.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */   
-static inline unsigned long get_stack_long(struct task_struct *task, int offset)
-{
-       unsigned char *stack;
-
-       stack = (unsigned char *)task->thread.rsp0;
-       stack += offset;
-       return (*((unsigned long *)stack));
-}
-
-/*
- * this routine will put a word on the process's privileged stack.
- * the offset is how far from the base address stored in the TSS.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline long put_stack_long(struct task_struct *task, int offset,
-       unsigned long data)
-{
-       unsigned char * stack;
-
-       stack = (unsigned char *) task->thread.rsp0;
-       stack += offset;
-       *(unsigned long *) stack = data;
-       return 0;
-}
-
-#define LDT_SEGMENT 4
-
-unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
-{
-       unsigned long addr, seg;
-
-       addr = regs->rip;
-       seg = regs->cs & 0xffff;
-
-       /*
-        * We'll assume that the code segments in the GDT
-        * are all zero-based. That is largely true: the
-        * TLS segments are used for data, and the PNPBIOS
-        * and APM bios ones we just ignore here.
-        */
-       if (seg & LDT_SEGMENT) {
-               u32 *desc;
-               unsigned long base;
-
-               seg &= ~7UL;
-
-               mutex_lock(&child->mm->context.lock);
-               if (unlikely((seg >> 3) >= child->mm->context.size))
-                       addr = -1L; /* bogus selector, access would fault */
-               else {
-                       desc = child->mm->context.ldt + seg;
-                       base = ((desc[0] >> 16) |
-                               ((desc[1] & 0xff) << 16) |
-                               (desc[1] & 0xff000000));
-
-                       /* 16-bit code segment? */
-                       if (!((desc[1] >> 22) & 1))
-                               addr &= 0xffff;
-                       addr += base;
-               }
-               mutex_unlock(&child->mm->context.lock);
-       }
-
-       return addr;
-}
-
-static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
-{
-       int i, copied;
-       unsigned char opcode[15];
-       unsigned long addr = convert_rip_to_linear(child, regs);
-
-       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
-       for (i = 0; i < copied; i++) {
-               switch (opcode[i]) {
-               /* popf and iret */
-               case 0x9d: case 0xcf:
-                       return 1;
-
-                       /* CHECKME: 64 65 */
-
-               /* opcode and address size prefixes */
-               case 0x66: case 0x67:
-                       continue;
-               /* irrelevant prefixes (segment overrides and repeats) */
-               case 0x26: case 0x2e:
-               case 0x36: case 0x3e:
-               case 0x64: case 0x65:
-               case 0xf2: case 0xf3:
-                       continue;
-
-               case 0x40 ... 0x4f:
-                       if (regs->cs != __USER_CS)
-                               /* 32-bit mode: register increment */
-                               return 0;
-                       /* 64-bit mode: REX prefix */
-                       continue;
-
-                       /* CHECKME: f2, f3 */
-
-               /*
-                * pushf: NOTE! We should probably not let
-                * the user see the TF bit being set. But
-                * it's more pain than it's worth to avoid
-                * it, and a debugger could emulate this
-                * all in user space if it _really_ cares.
-                */
-               case 0x9c:
-               default:
-                       return 0;
-               }
-       }
-       return 0;
-}
-
-static void set_singlestep(struct task_struct *child)
-{
-       struct pt_regs *regs = task_pt_regs(child);
-
-       /*
-        * Always set TIF_SINGLESTEP - this guarantees that
-        * we single-step system calls etc..  This will also
-        * cause us to set TF when returning to user mode.
-        */
-       set_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->eflags & TRAP_FLAG)
-               return;
-
-       /* Set TF on the kernel stack.. */
-       regs->eflags |= TRAP_FLAG;
-
-       /*
-        * ..but if TF is changed by the instruction we will trace,
-        * don't mark it as being "us" that set it, so that we
-        * won't clear it by hand later.
-        */
-       if (is_setting_trap_flag(child, regs))
-               return;
-
-       child->ptrace |= PT_DTRACE;
-}
-
-static void clear_singlestep(struct task_struct *child)
-{
-       /* Always clear TIF_SINGLESTEP... */
-       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /* But touch TF only if it was set by us.. */
-       if (child->ptrace & PT_DTRACE) {
-               struct pt_regs *regs = task_pt_regs(child);
-               regs->eflags &= ~TRAP_FLAG;
-               child->ptrace &= ~PT_DTRACE;
-       }
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void ptrace_disable(struct task_struct *child)
-{ 
-       clear_singlestep(child);
-}
-
-static int putreg(struct task_struct *child,
-       unsigned long regno, unsigned long value)
-{
-       unsigned long tmp; 
-       
-       switch (regno) {
-               case offsetof(struct user_regs_struct,fs):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.fsindex = value & 0xffff; 
-                       return 0;
-               case offsetof(struct user_regs_struct,gs):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.gsindex = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,ds):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.ds = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,es): 
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.es = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,ss):
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,fs_base):
-                       if (value >= TASK_SIZE_OF(child))
-                               return -EIO;
-                       child->thread.fs = value;
-                       return 0;
-               case offsetof(struct user_regs_struct,gs_base):
-                       if (value >= TASK_SIZE_OF(child))
-                               return -EIO;
-                       child->thread.gs = value;
-                       return 0;
-               case offsetof(struct user_regs_struct, eflags):
-                       value &= FLAG_MASK;
-                       tmp = get_stack_long(child, EFL_OFFSET); 
-                       tmp &= ~FLAG_MASK; 
-                       value |= tmp;
-                       break;
-               case offsetof(struct user_regs_struct,cs): 
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-       }
-       put_stack_long(child, regno - sizeof(struct pt_regs), value);
-       return 0;
-}
-
-static unsigned long getreg(struct task_struct *child, unsigned long regno)
-{
-       unsigned long val;
-       switch (regno) {
-               case offsetof(struct user_regs_struct, fs):
-                       return child->thread.fsindex;
-               case offsetof(struct user_regs_struct, gs):
-                       return child->thread.gsindex;
-               case offsetof(struct user_regs_struct, ds):
-                       return child->thread.ds;
-               case offsetof(struct user_regs_struct, es):
-                       return child->thread.es; 
-               case offsetof(struct user_regs_struct, fs_base):
-                       return child->thread.fs;
-               case offsetof(struct user_regs_struct, gs_base):
-                       return child->thread.gs;
-               default:
-                       regno = regno - sizeof(struct pt_regs);
-                       val = get_stack_long(child, regno);
-                       if (test_tsk_thread_flag(child, TIF_IA32))
-                               val &= 0xffffffff;
-                       return val;
-       }
-
-}
-
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
-{
-       long i, ret;
-       unsigned ui;
-
-       switch (request) {
-       /* when I and D space are separate, these will need to be fixed. */
-       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-       case PTRACE_PEEKDATA:
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
-       /* read the word at location addr in the USER area. */
-       case PTRACE_PEEKUSR: {
-               unsigned long tmp;
-
-               ret = -EIO;
-               if ((addr & 7) ||
-                   addr > sizeof(struct user) - 7)
-                       break;
-
-               switch (addr) { 
-               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
-                       tmp = getreg(child, addr);
-                       break;
-               case offsetof(struct user, u_debugreg[0]):
-                       tmp = child->thread.debugreg0;
-                       break;
-               case offsetof(struct user, u_debugreg[1]):
-                       tmp = child->thread.debugreg1;
-                       break;
-               case offsetof(struct user, u_debugreg[2]):
-                       tmp = child->thread.debugreg2;
-                       break;
-               case offsetof(struct user, u_debugreg[3]):
-                       tmp = child->thread.debugreg3;
-                       break;
-               case offsetof(struct user, u_debugreg[6]):
-                       tmp = child->thread.debugreg6;
-                       break;
-               case offsetof(struct user, u_debugreg[7]):
-                       tmp = child->thread.debugreg7;
-                       break;
-               default:
-                       tmp = 0;
-                       break;
-               }
-               ret = put_user(tmp,(unsigned long __user *) data);
-               break;
-       }
-
-       /* when I and D space are separate, this will have to be fixed. */
-       case PTRACE_POKETEXT: /* write the word at location addr. */
-       case PTRACE_POKEDATA:
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
-       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
-       {
-               int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
-               ret = -EIO;
-               if ((addr & 7) ||
-                   addr > sizeof(struct user) - 7)
-                       break;
-
-               switch (addr) { 
-               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
-                       ret = putreg(child, addr, data);
-                       break;
-               /* Disallow setting a breakpoint in the vsyscall area */
-               case offsetof(struct user, u_debugreg[0]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg0 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[1]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg1 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[2]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg2 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[3]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg3 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[6]):
-                                 if (data >> 32)
-                               break; 
-                       child->thread.debugreg6 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[7]):
-                       /* See arch/i386/kernel/ptrace.c for an explanation of
-                        * this awkward check.*/
-                       data &= ~DR_CONTROL_RESERVED;
-                       for(i=0; i<4; i++)
-                               if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                                       break;
-                       if (i == 4) {
-                         child->thread.debugreg7 = data;
-                         if (data)
-                               set_tsk_thread_flag(child, TIF_DEBUG);
-                         else
-                               clear_tsk_thread_flag(child, TIF_DEBUG);
-                         ret = 0;
-                       }
-                 break;
-               }
-               break;
-       }
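The 0x5554 constant above is easier to read once the DR7 layout is spelled out; a short sketch (not part of the patch, reconstructed from the i386 code the comment refers to) of what the per-register check rejects:

/* For debug register i, DR7 bits 16+4*i .. 19+4*i hold (len << 2) | rw.
 * A set bit in 0x5554 marks a rejected nibble: rw == 2 (I/O breakpoints)
 * is never allowed, and rw == 0 (execute) is only allowed with len == 0. */
static int dr7_field_ok(unsigned int nibble)
{
        return !((0x5554 >> (nibble & 0xf)) & 1);
}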
-       case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT:    /* restart after signal. */
-
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSCALL)
-                       set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               else
-                       clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               child->exit_code = data;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-#ifdef CONFIG_IA32_EMULATION
-               /* This only makes sense for 32-bit programs.  It also lets a
-                  64-bit debugger examine them fully.  Do not use it on
-                  64-bit processes; use PTRACE_ARCH_PRCTL instead. */
-       case PTRACE_SET_THREAD_AREA: {
-               struct user_desc __user *p;
-               int old; 
-               p = (struct user_desc __user *)data;
-               get_user(old,  &p->entry_number); 
-               put_user(addr, &p->entry_number);
-               ret = do_set_thread_area(&child->thread, p);
-               put_user(old,  &p->entry_number); 
-               break;
-       case PTRACE_GET_THREAD_AREA:
-               p = (struct user_desc __user *)data;
-               get_user(old,  &p->entry_number); 
-               put_user(addr, &p->entry_number);
-               ret = do_get_thread_area(&child->thread, p);
-               put_user(old,  &p->entry_number); 
-               break;
-       } 
-#endif
-               /* normal 64bit interface to access TLS data. 
-                  Works just like arch_prctl, except that the arguments
-                  are reversed. */
-       case PTRACE_ARCH_PRCTL: 
-               ret = do_arch_prctl(child, data, addr);
-               break;
-
-/*
- * Make the child exit.  The best we can do is send it a SIGKILL;
- * perhaps it should be recorded in the status that it wants to
- * exit.
- */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               child->exit_code = SIGKILL;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SINGLESTEP:    /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               set_singlestep(child);
-               child->exit_code = data;
-               /* give it a chance to run. */
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
-                              sizeof(struct user_regs_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
-                       ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
-                       data += sizeof(long);
-               }
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
-                              sizeof(struct user_regs_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
-                       ret = __get_user(tmp, (unsigned long __user *) data);
-                       if (ret)
-                               break;
-                       ret = putreg(child, ui, tmp);
-                       if (ret)
-                               break;
-                       data += sizeof(long);
-               }
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
-               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = get_fpregs((struct user_i387_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
-               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               ret = set_fpregs(child, (struct user_i387_struct __user *)data);
-               break;
-       }
-
-       default:
-               ret = ptrace_request(child, request, addr, data);
-               break;
-       }
-       return ret;
-}
-
-static void syscall_trace(struct pt_regs *regs)
-{
-
-#if 0
-       printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
-              current->comm,
-              regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
-              current_thread_info()->flags, current->ptrace); 
-#endif
-
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-                               ? 0x80 : 0));
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-}
-
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
-{
-       /* do the secure computing check first */
-       secure_computing(regs->orig_rax);
-
-       if (test_thread_flag(TIF_SYSCALL_TRACE)
-           && (current->ptrace & PT_PTRACED))
-               syscall_trace(regs);
-
-       if (unlikely(current->audit_context)) {
-               if (test_thread_flag(TIF_IA32)) {
-                       audit_syscall_entry(AUDIT_ARCH_I386,
-                                           regs->orig_rax,
-                                           regs->rbx, regs->rcx,
-                                           regs->rdx, regs->rsi);
-               } else {
-                       audit_syscall_entry(AUDIT_ARCH_X86_64,
-                                           regs->orig_rax,
-                                           regs->rdi, regs->rsi,
-                                           regs->rdx, regs->r10);
-               }
-       }
-}
-
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
-{
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
-
-       if ((test_thread_flag(TIF_SYSCALL_TRACE)
-            || test_thread_flag(TIF_SINGLESTEP))
-           && (current->ptrace & PT_PTRACED))
-               syscall_trace(regs);
-}
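For context, a hedged userspace sketch (not part of this patch; helper name and output are illustrative) of the other side of syscall_trace(): a tracer that sets PTRACE_O_TRACESYSGOOD sees syscall stops as SIGTRAP | 0x80, which is exactly the value passed to ptrace_notify() above.

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static void trace_syscalls(pid_t child)
{
        int status;

        waitpid(child, &status, 0);             /* wait for the initial stop */
        ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACESYSGOOD);

        for (;;) {
                ptrace(PTRACE_SYSCALL, child, NULL, NULL);  /* run to syscall */
                waitpid(child, &status, 0);
                if (WIFEXITED(status))
                        break;
                if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
                        printf("syscall entry/exit stop\n");
        }
}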
index fab30e1348361f94ca581aa20b4cbf645bdb8570..150ba29a0d331a486e2cea33c3ce3d4aec1464d7 100644 (file)
@@ -162,6 +162,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
                         ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
                         ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+                        ich_force_enable_hpet);
 
 
 static struct pci_dev *cached_dev;
similarity index 72%
rename from arch/x86/kernel/reboot_32.c
rename to arch/x86/kernel/reboot.c
index bb1a0f889c5ebaed7a42864a9a98ea86e9848e52..5818dc28167d71bee82c305a85410192ef59e2b1 100644 (file)
@@ -1,64 +1,94 @@
-#include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/delay.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/efi.h>
-#include <linux/dmi.h>
-#include <linux/ctype.h>
-#include <linux/pm.h>
 #include <linux/reboot.h>
-#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/efi.h>
+#include <acpi/reboot.h>
+#include <asm/io.h>
 #include <asm/apic.h>
-#include <asm/hpet.h>
 #include <asm/desc.h>
-#include "mach_reboot.h"
+#include <asm/hpet.h>
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
 
+#ifdef CONFIG_X86_32
+# include <linux/dmi.h>
+# include <linux/ctype.h>
+# include <linux/mc146818rtc.h>
+# include <asm/pgtable.h>
+#else
+# include <asm/iommu.h>
+#endif
+
 /*
  * Power off function, if any
  */
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
+static long no_idt[3];
 static int reboot_mode;
-static int reboot_thru_bios;
+enum reboot_type reboot_type = BOOT_KBD;
+int reboot_force;
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
 static int reboot_cpu = -1;
 #endif
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+   warm   Don't set the cold reboot flag
+   cold   Set the cold reboot flag
+   bios   Reboot by jumping through the BIOS (only for X86_32)
+   smp    Reboot by executing reset on BSP or other CPU (only for X86_32)
+   triple Force a triple fault (init)
+   kbd    Use the keyboard controller. cold reset (default)
+   acpi   Use the RESET_REG in the FADT
+   efi    Use efi reset_system runtime service
+   force  Avoid anything that could hang.
+ */
 static int __init reboot_setup(char *str)
 {
-       while(1) {
+       for (;;) {
                switch (*str) {
-               case 'w': /* "warm" reboot (no memory testing etc) */
+               case 'w':
                        reboot_mode = 0x1234;
                        break;
-               case 'c': /* "cold" reboot (with memory testing etc) */
-                       reboot_mode = 0x0;
-                       break;
-               case 'b': /* "bios" reboot by jumping through the BIOS */
-                       reboot_thru_bios = 1;
-                       break;
-               case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
-                       reboot_thru_bios = 0;
+
+               case 'c':
+                       reboot_mode = 0;
                        break;
+
+#ifdef CONFIG_X86_32
 #ifdef CONFIG_SMP
-               case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
+               case 's':
                        if (isdigit(*(str+1))) {
                                reboot_cpu = (int) (*(str+1) - '0');
                                if (isdigit(*(str+2)))
                                        reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
                        }
-                               /* we will leave sorting out the final value 
-                               when we are ready to reboot, since we might not
-                               have set up boot_cpu_id or smp_num_cpu */
+                               /* we will leave sorting out the final value
+                                  when we are ready to reboot, since we might not
+                                  have set up boot_cpu_id or smp_num_cpu */
                        break;
+#endif /* CONFIG_SMP */
+
+               case 'b':
 #endif
+               case 'a':
+               case 'k':
+               case 't':
+               case 'e':
+                       reboot_type = *str;
+                       break;
+
+               case 'f':
+                       reboot_force = 1;
+                       break;
                }
-               if((str = strchr(str,',')) != NULL)
+
+               str = strchr(str, ',');
+               if (str)
                        str++;
                else
                        break;
@@ -68,18 +98,21 @@ static int __init reboot_setup(char *str)
 
 __setup("reboot=", reboot_setup);
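Purely as a usage illustration (not part of the patch), the unified parser above accepts comma-separated keywords on the kernel command line, matched by their first letter as documented in the comment before reboot_setup(); for example:

        reboot=kbd              keyboard-controller reset, cold flag (default)
        reboot=acpi,warm        use the FADT RESET_REG, keep the warm-boot flag
        reboot=bios             jump through the BIOS (32-bit kernels only)
        reboot=triple,force     force a triple fault and skip the shutdown path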
 
+
+#ifdef CONFIG_X86_32
 /*
  * Reboot options and system auto-detection code provided by
  * Dell Inc. so their systems "just work". :-)
  */
 
 /*
- * Some machines require the "reboot=b"  commandline option, this quirk makes that automatic.
+ * Some machines require the "reboot=b"  commandline option,
+ * this quirk makes that automatic.
  */
 static int __init set_bios_reboot(const struct dmi_system_id *d)
 {
-       if (!reboot_thru_bios) {
-               reboot_thru_bios = 1;
+       if (reboot_type != BOOT_BIOS) {
+               reboot_type = BOOT_BIOS;
                printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
        }
        return 0;
@@ -143,7 +176,6 @@ static int __init reboot_init(void)
        dmi_check_system(reboot_dmi_table);
        return 0;
 }
-
 core_initcall(reboot_init);
 
 /* The following code and data reboots the machine by switching to real
@@ -152,7 +184,6 @@ core_initcall(reboot_init);
    controller to pulse the CPU reset line, which is more thorough, but
    doesn't work with at least one type of 486 motherboard.  It is easy
    to stop this code working; hence the copious comments. */
-
 static unsigned long long
 real_mode_gdt_entries [3] =
 {
@@ -161,11 +192,9 @@ real_mode_gdt_entries [3] =
        0x000092000100ffffULL   /* 16-bit real-mode 64k data at 0x00000100 */
 };
 
-static struct Xgt_desc_struct
+static struct desc_ptr
 real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, 0 },
-no_idt = { 0, 0 };
-
+real_mode_idt = { 0x3ff, 0 };
 
 /* This is 16-bit protected mode code to disable paging and the cache,
    switch to real mode and jump to the BIOS reset code.
@@ -185,7 +214,6 @@ no_idt = { 0, 0 };
 
    More could be done here to set up the registers as if a CPU reset had
    occurred; hopefully real BIOSs don't assume much. */
-
 static unsigned char real_mode_switch [] =
 {
        0x66, 0x0f, 0x20, 0xc0,                 /*    movl  %cr0,%eax        */
@@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length)
           `outb_p' is needed instead of just `outb'.  Use it to be on the
           safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
         */
-
        spin_lock(&rtc_lock);
        CMOS_WRITE(0x00, 0x8f);
        spin_unlock(&rtc_lock);
@@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length)
        /* Remap the kernel at virtual address zero, as well as offset zero
           from the kernel segment.  This assumes the kernel segment starts at
           virtual address PAGE_OFFSET. */
-
-       memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-               sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
+       memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+               sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
 
        /*
         * Use `swapper_pg_dir' as our page directory.
@@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length)
           boot)".  This seems like a fairly standard thing that gets set by
           REBOOT.COM programs, and the previous reset routine did this
           too. */
-
        *((unsigned short *)0x472) = reboot_mode;
 
        /* For the switch to real mode, copy some code to low memory.  It has
@@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length)
           has to have the same physical and virtual address, because it turns
           off paging.  Copy it near the end of the first page, out of the way
           of BIOS variables. */
-
-       memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
+       memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
                real_mode_switch, sizeof (real_mode_switch));
-       memcpy ((void *) (0x1000 - 100), code, length);
+       memcpy((void *)(0x1000 - 100), code, length);
 
        /* Set up the IDT for real mode. */
-
        load_idt(&real_mode_idt);
 
        /* Set up a GDT from which we can load segment descriptors for real
           mode.  The GDT is not used in real mode; it is just needed here to
           prepare the descriptors. */
-
        load_gdt(&real_mode_gdt);
 
        /* Load the data segment registers, and thus the descriptors ready for
@@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length)
           selector value being loaded here.  This is so that the segment
           registers don't have to be reloaded after switching to real mode:
           the values are consistent for real mode operation already. */
-
        __asm__ __volatile__ ("movl $0x0010,%%eax\n"
                                "\tmovl %%eax,%%ds\n"
                                "\tmovl %%eax,%%es\n"
@@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length)
        /* Jump to the 16-bit code that we copied earlier.  It disables paging
           and the cache, switches to real mode, and jumps to the BIOS reset
           entry point. */
-
        __asm__ __volatile__ ("ljmp $0x0008,%0"
                                :
-                               : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
+                               : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(machine_real_restart);
 #endif
 
-static void native_machine_shutdown(void)
+#endif /* CONFIG_X86_32 */
+
+static inline void kb_wait(void)
+{
+       int i;
+
+       for (i = 0; i < 0x10000; i++) {
+               if ((inb(0x64) & 0x02) == 0)
+                       break;
+               udelay(2);
+       }
+}
+
+void machine_emergency_restart(void)
+{
+       int i;
+
+       /* Tell the BIOS if we want cold or warm reboot */
+       *((unsigned short *)__va(0x472)) = reboot_mode;
+
+       for (;;) {
+               /* Could also try the reset bit in the Hammer NB */
+               switch (reboot_type) {
+               case BOOT_KBD:
+                       for (i = 0; i < 10; i++) {
+                               kb_wait();
+                               udelay(50);
+                               outb(0xfe, 0x64); /* pulse reset low */
+                               udelay(50);
+                       }
+
+               case BOOT_TRIPLE:
+                       load_idt((const struct desc_ptr *)&no_idt);
+                       __asm__ __volatile__("int3");
+
+                       reboot_type = BOOT_KBD;
+                       break;
+
+#ifdef CONFIG_X86_32
+               case BOOT_BIOS:
+                       machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
+
+                       reboot_type = BOOT_KBD;
+                       break;
+#endif
+
+               case BOOT_ACPI:
+                       acpi_reboot();
+                       reboot_type = BOOT_KBD;
+                       break;
+
+
+               case BOOT_EFI:
+                       if (efi_enabled)
+                               efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+                                                EFI_SUCCESS, 0, NULL);
+
+                       reboot_type = BOOT_KBD;
+                       break;
+               }
+       }
+}
+
+void machine_shutdown(void)
 {
+       /* Stop the cpus and apics */
 #ifdef CONFIG_SMP
        int reboot_cpu_id;
 
        /* The boot cpu is always logical cpu 0 */
        reboot_cpu_id = 0;
 
+#ifdef CONFIG_X86_32
        /* See if there has been given a command line override */
        if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
-               cpu_isset(reboot_cpu, cpu_online_map)) {
+               cpu_isset(reboot_cpu, cpu_online_map))
                reboot_cpu_id = reboot_cpu;
-       }
+#endif
 
-       /* Make certain the cpu I'm rebooting on is online */
-       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+       /* Make certain the cpu I'm about to reboot on is online */
+       if (!cpu_isset(reboot_cpu_id, cpu_online_map))
                reboot_cpu_id = smp_processor_id();
-       }
 
        /* Make certain I only run on the appropriate processor */
        set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
 
-       /* O.K. Now that I'm on the appropriate processor, stop
-        * all of the others, and disable their local APICs.
+       /* O.K.  Now that I'm on the appropriate processor,
+        * stop all of the others.
         */
-
        smp_send_stop();
-#endif /* CONFIG_SMP */
+#endif
 
        lapic_shutdown();
 
 #ifdef CONFIG_X86_IO_APIC
        disable_IO_APIC();
 #endif
+
 #ifdef CONFIG_HPET_TIMER
        hpet_disable();
 #endif
-}
 
-void __attribute__((weak)) mach_reboot_fixups(void)
-{
+#ifdef CONFIG_X86_64
+       pci_iommu_shutdown();
+#endif
 }
 
-static void native_machine_emergency_restart(void)
+void machine_restart(char *__unused)
 {
-       if (!reboot_thru_bios) {
-               if (efi_enabled) {
-                       efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
-                       load_idt(&no_idt);
-                       __asm__ __volatile__("int3");
-               }
-               /* rebooting needs to touch the page at absolute addr 0 */
-               *((unsigned short *)__va(0x472)) = reboot_mode;
-               for (;;) {
-                       mach_reboot_fixups(); /* for board specific fixups */
-                       mach_reboot();
-                       /* That didn't work - force a triple fault.. */
-                       load_idt(&no_idt);
-                       __asm__ __volatile__("int3");
-               }
-       }
-       if (efi_enabled)
-               efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
+       printk("machine restart\n");
 
-       machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
-}
-
-static void native_machine_restart(char * __unused)
-{
-       machine_shutdown();
+       if (!reboot_force)
+               machine_shutdown();
        machine_emergency_restart();
 }
 
-static void native_machine_halt(void)
+void machine_halt(void)
 {
 }
 
-static void native_machine_power_off(void)
+void machine_power_off(void)
 {
        if (pm_power_off) {
-               machine_shutdown();
+               if (!reboot_force)
+                       machine_shutdown();
                pm_power_off();
        }
 }
 
-
 struct machine_ops machine_ops = {
-       .power_off = native_machine_power_off,
-       .shutdown = native_machine_shutdown,
-       .emergency_restart = native_machine_emergency_restart,
-       .restart = native_machine_restart,
-       .halt = native_machine_halt,
+       .power_off = machine_power_off,
+       .shutdown = machine_shutdown,
+       .emergency_restart = machine_emergency_restart,
+       .restart = machine_restart,
+       .halt = machine_halt
 };
-
-void machine_power_off(void)
-{
-       machine_ops.power_off();
-}
-
-void machine_shutdown(void)
-{
-       machine_ops.shutdown();
-}
-
-void machine_emergency_restart(void)
-{
-       machine_ops.emergency_restart();
-}
-
-void machine_restart(char *cmd)
-{
-       machine_ops.restart(cmd);
-}
-
-void machine_halt(void)
-{
-       machine_ops.halt();
-}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
deleted file mode 100644 (file)
index 53620a9..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Various gunk just to reboot the machine. */ 
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/string.h>
-#include <linux/pm.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/hw_irq.h>
-#include <asm/system.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/apic.h>
-#include <asm/hpet.h>
-#include <asm/gart.h>
-
-/*
- * Power off function, if any
- */
-void (*pm_power_off)(void);
-EXPORT_SYMBOL(pm_power_off);
-
-static long no_idt[3];
-static enum { 
-       BOOT_TRIPLE = 't',
-       BOOT_KBD = 'k'
-} reboot_type = BOOT_KBD;
-static int reboot_mode = 0;
-int reboot_force;
-
-/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
-   warm   Don't set the cold reboot flag
-   cold   Set the cold reboot flag
-   triple Force a triple fault (init)
-   kbd    Use the keyboard controller. cold reset (default)
-   force  Avoid anything that could hang.
- */ 
-static int __init reboot_setup(char *str)
-{
-       for (;;) {
-               switch (*str) {
-               case 'w': 
-                       reboot_mode = 0x1234;
-                       break;
-
-               case 'c':
-                       reboot_mode = 0;
-                       break;
-
-               case 't':
-               case 'b':
-               case 'k':
-                       reboot_type = *str;
-                       break;
-               case 'f':
-                       reboot_force = 1;
-                       break;
-               }
-               if((str = strchr(str,',')) != NULL)
-                       str++;
-               else
-                       break;
-       }
-       return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-static inline void kb_wait(void)
-{
-       int i;
-
-       for (i=0; i<0x10000; i++)
-               if ((inb_p(0x64) & 0x02) == 0)
-                       break;
-}
-
-void machine_shutdown(void)
-{
-       unsigned long flags;
-
-       /* Stop the cpus and apics */
-#ifdef CONFIG_SMP
-       int reboot_cpu_id;
-
-       /* The boot cpu is always logical cpu 0 */
-       reboot_cpu_id = 0;
-
-       /* Make certain the cpu I'm about to reboot on is online */
-       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
-               reboot_cpu_id = smp_processor_id();
-       }
-
-       /* Make certain I only run on the appropriate processor */
-       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
-
-       /* O.K Now that I'm on the appropriate processor,
-        * stop all of the others.
-        */
-       smp_send_stop();
-#endif
-
-       local_irq_save(flags);
-
-#ifndef CONFIG_SMP
-       disable_local_APIC();
-#endif
-
-       disable_IO_APIC();
-
-#ifdef CONFIG_HPET_TIMER
-       hpet_disable();
-#endif
-       local_irq_restore(flags);
-
-       pci_iommu_shutdown();
-}
-
-void machine_emergency_restart(void)
-{
-       int i;
-
-       /* Tell the BIOS if we want cold or warm reboot */
-       *((unsigned short *)__va(0x472)) = reboot_mode;
-       
-       for (;;) {
-               /* Could also try the reset bit in the Hammer NB */
-               switch (reboot_type) { 
-               case BOOT_KBD:
-               for (i=0; i<10; i++) {
-                       kb_wait();
-                       udelay(50);
-                       outb(0xfe,0x64);         /* pulse reset low */
-                       udelay(50);
-               }
-
-               case BOOT_TRIPLE: 
-                       load_idt((const struct desc_ptr *)&no_idt);
-                       __asm__ __volatile__("int3");
-
-                       reboot_type = BOOT_KBD;
-                       break;
-               }      
-       }      
-}
-
-void machine_restart(char * __unused)
-{
-       printk("machine restart\n");
-
-       if (!reboot_force) {
-               machine_shutdown();
-       }
-       machine_emergency_restart();
-}
-
-void machine_halt(void)
-{
-}
-
-void machine_power_off(void)
-{
-       if (pm_power_off) {
-               if (!reboot_force) {
-                       machine_shutdown();
-               }
-               pm_power_off();
-       }
-}
-
index f452726c0fe276c2482e79b70ac1a1caedec7c5d..dec0b5ec25c2fb18000af506d5493a623c40ad8a 100644 (file)
@@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev)
        udelay(50); /* shouldn't get here but be safe and spin a while */
 }
 
+static void rdc321x_reset(struct pci_dev *dev)
+{
+       unsigned i;
+       /* Voluntary reset the watchdog timer */
+       outl(0x80003840, 0xCF8);
+       /* Generate a CPU reset on next tick */
+       i = inl(0xCFC);
+       /* Use the minimum timer resolution */
+       i |= 0x1600;
+       outl(i, 0xCFC);
+       outb(1, 0x92);
+}
+
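A brief aside (not part of the patch) on the magic constant above: 0x80003840 is a PCI configuration mechanism #1 address, built from the enable bit plus bus/device/function/register fields written to port 0xCF8, with the data then accessed through 0xCFC. A sketch of the encoding, with the decode of this particular value:

/* Mechanism #1 config address: enable bit | bus | device | function | register. */
static unsigned int pci_conf1_addr(unsigned int bus, unsigned int dev,
                                   unsigned int fn, unsigned int reg)
{
        return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc);
}

/* pci_conf1_addr(0, 7, 0, 0x40) == 0x80003840, i.e. the quirk above programs
 * register 0x40 of device 00:07.0 before writing the reset bit in port 0x92. */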
 struct device_fixup {
        unsigned int vendor;
        unsigned int device;
@@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = {
 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
 { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
 };
 
 /*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644 (file)
index 0000000..eb9b1a1
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * RTC related functions
+ */
+#include <linux/acpi.h>
+#include <linux/bcd.h>
+#include <linux/mc146818rtc.h>
+
+#include <asm/time.h>
+#include <asm/vsyscall.h>
+
+#ifdef CONFIG_X86_32
+# define CMOS_YEARS_OFFS 1900
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with.  It is required for NMI access to the
+ * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+#else
+/*
+ * x86-64 systems have only existed since 2002.
+ * This will work up to Dec 31, 2100
+ */
+# define CMOS_YEARS_OFFS 2000
+#endif
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/*
+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * called 500 ms after the second nowtime has started, because when
+ * nowtime is written into the registers of the CMOS clock, it will
+ * jump to the next second precisely 500 ms later. Check the Motorola
+ * MC146818A or Dallas DS12887 data sheet for details.
+ *
+ * BUG: This routine does not handle hour overflow properly; it just
+ *      sets the minutes. Usually you'll only notice that after reboot!
+ */
+int mach_set_rtc_mmss(unsigned long nowtime)
+{
+       int retval = 0;
+       int real_seconds, real_minutes, cmos_minutes;
+       unsigned char save_control, save_freq_select;
+
+        /* tell the clock it's being set */
+       save_control = CMOS_READ(RTC_CONTROL);
+       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
+
+       /* stop and reset prescaler */
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
+
+       cmos_minutes = CMOS_READ(RTC_MINUTES);
+       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
+               BCD_TO_BIN(cmos_minutes);
+
+       /*
+        * since we're only adjusting minutes and seconds,
+        * don't interfere with hour overflow. This avoids
+        * messing with unknown time zones but requires your
+        * RTC not to be off by more than 15 minutes
+        */
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+       /* correct for half hour time zone */
+       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
+               real_minutes += 30;
+       real_minutes %= 60;
+
+       if (abs(real_minutes - cmos_minutes) < 30) {
+               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
+                       BIN_TO_BCD(real_seconds);
+                       BIN_TO_BCD(real_minutes);
+               }
+               CMOS_WRITE(real_seconds,RTC_SECONDS);
+               CMOS_WRITE(real_minutes,RTC_MINUTES);
+       } else {
+               printk(KERN_WARNING
+                      "set_rtc_mmss: can't update from %d to %d\n",
+                      cmos_minutes, real_minutes);
+               retval = -1;
+       }
+
+       /* The following flags have to be released exactly in this order,
+        * otherwise the DS12887 (popular MC146818A clone with integrated
+        * battery and quartz) will not reset the oscillator and will not
+        * update precisely 500 ms later. You won't find this mentioned in
+        * the Dallas Semiconductor data sheets, but who believes data
+        * sheets anyway ...                           -- Markus Kuhn
+        */
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+
+       return retval;
+}
+
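The RTC stores most registers in BCD unless RTC_DM_BINARY is set, which is why the routine above converts before comparing minutes. A minimal sketch (not part of the patch) of what the BCD_TO_BIN/BIN_TO_BCD macros from <linux/bcd.h> compute:

static unsigned int bcd_to_bin(unsigned int val)
{
        return (val & 0x0f) + (val >> 4) * 10;  /* e.g. 0x59 -> 59 */
}

static unsigned int bin_to_bcd(unsigned int val)
{
        return ((val / 10) << 4) + val % 10;    /* e.g. 59 -> 0x59 */
}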
+unsigned long mach_get_cmos_time(void)
+{
+       unsigned int year, mon, day, hour, min, sec, century = 0;
+
+       /*
+        * If UIP is clear, then we have >= 244 microseconds before
+        * RTC registers will be updated.  Spec sheet says that this
+        * is the reliable way to read RTC - registers. If UIP is set
+        * then the register access might be invalid.
+        */
+       while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
+               cpu_relax();
+
+       sec = CMOS_READ(RTC_SECONDS);
+       min = CMOS_READ(RTC_MINUTES);
+       hour = CMOS_READ(RTC_HOURS);
+       day = CMOS_READ(RTC_DAY_OF_MONTH);
+       mon = CMOS_READ(RTC_MONTH);
+       year = CMOS_READ(RTC_YEAR);
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64)
+       /* CHECKME: Is this really 64bit only ??? */
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century)
+               century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+
+       if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) {
+               BCD_TO_BIN(sec);
+               BCD_TO_BIN(min);
+               BCD_TO_BIN(hour);
+               BCD_TO_BIN(day);
+               BCD_TO_BIN(mon);
+               BCD_TO_BIN(year);
+       }
+
+       if (century) {
+               BCD_TO_BIN(century);
+               year += century * 100;
+               printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
+       } else {
+               year += CMOS_YEARS_OFFS;
+               if (year < 1970)
+                       year += 100;
+       }
+
+       return mktime(year, mon, day, hour, min, sec);
+}
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+       unsigned char val;
+
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));
+       val = inb_p(RTC_PORT(1));
+       lock_cmos_suffix(addr);
+       return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));
+       outb_p(val, RTC_PORT(1));
+       lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+static int set_rtc_mmss(unsigned long nowtime)
+{
+       int retval;
+       unsigned long flags;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+       retval = set_wallclock(nowtime);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       return retval;
+}
+
+/* not static: needed by APM */
+unsigned long read_persistent_clock(void)
+{
+       unsigned long retval, flags;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+       retval = get_wallclock();
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       return retval;
+}
+
+int update_persistent_clock(struct timespec now)
+{
+       return set_rtc_mmss(now.tv_sec);
+}
+
+unsigned long long native_read_tsc(void)
+{
+       return __native_read_tsc();
+}
+EXPORT_SYMBOL(native_read_tsc);
+
index 3558ac78c9265517a4eb22628708f9774ded1f8b..309366f8f6030f100322cdc2ca63f001d6bc3ac5 100644 (file)
 #include <asm/sections.h>
 #include <asm/setup.h>
 
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
@@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
 
 unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
 static int do_not_nx __cpuinitdata = 0;
 
 /* noexec=on|off
@@ -79,6 +85,43 @@ static int __init nonx32_setup(char *str)
 }
 __setup("noexec32=", nonx32_setup);
 
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas.  These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+               if (per_cpu_offset(cpu)) {
+#endif
+                       per_cpu(x86_cpu_to_apicid, cpu) =
+                                               x86_cpu_to_apicid_init[cpu];
+                       per_cpu(x86_bios_cpu_apicid, cpu) =
+                                               x86_bios_cpu_apicid_init[cpu];
+#ifdef CONFIG_NUMA
+                       per_cpu(x86_cpu_to_node_map, cpu) =
+                                               x86_cpu_to_node_map_init[cpu];
+#endif
+#ifdef CONFIG_SMP
+               }
+               else
+                       printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
+                                                                       cpu);
+#endif
+       }
+
+       /* indicate the early static arrays will soon be gone */
+       x86_cpu_to_apicid_early_ptr = NULL;
+       x86_bios_cpu_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
+       x86_cpu_to_node_map_early_ptr = NULL;
+#endif
+}
+
 /*
  * Great future plan:
  * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void)
        for_each_cpu_mask (i, cpu_possible_map) {
                char *ptr;
 
-               if (!NODE_DATA(cpu_to_node(i))) {
+               if (!NODE_DATA(early_cpu_to_node(i))) {
                        printk("cpu with no node %d, num_online_nodes %d\n",
                               i, num_online_nodes());
                        ptr = alloc_bootmem_pages(size);
                } else { 
-                       ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
+                       ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size);
                }
                if (!ptr)
                        panic("Cannot allocate cpu data for CPU %d\n", i);
                cpu_pda(i)->data_offset = ptr - __per_cpu_start;
                memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
        }
+
+       /* setup percpu data maps early */
+       setup_per_cpu_maps();
 } 
 
 void pda_init(int cpu)
@@ -169,7 +215,8 @@ void syscall_init(void)
 #endif
 
        /* Flags to clear on syscall */
-       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
+       wrmsrl(MSR_SYSCALL_MASK,
+              X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
 }
 
 void __cpuinit check_efer(void)
@@ -227,7 +274,7 @@ void __cpuinit cpu_init (void)
         * and set up the GDT descriptor:
         */
        if (cpu)
-               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
+               memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
 
        cpu_gdt_descr[cpu].size = GDT_SIZE;
        load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
@@ -257,10 +304,10 @@ void __cpuinit cpu_init (void)
                                      v, cpu); 
                }
                estacks += PAGE_SIZE << order[v];
-               orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
+               orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
        }
 
-       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
        /*
         * <= is required because the CPU will access up to
         * 8 bits beyond the end of the IO permission bitmap.
index 9c24b45b513c83e5e1354e2b92d340f86d476f07..62adc5f20be5bf87cce61b42c17f5aebf367bdd6 100644 (file)
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
 #include <linux/pfn.h>
+#include <linux/pci.h>
+#include <linux/init_ohci1394_dma.h>
 
 #include <video/edid.h>
 
+#include <asm/mtrr.h>
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
    address, and must not be in the .bss segment! */
 unsigned long init_pg_tables_end __initdata = ~0UL;
 
-int disable_pse __cpuinitdata = 0;
-
 /*
  * Machine setup..
  */
-extern struct resource code_resource;
-extern struct resource data_resource;
-extern struct resource bss_resource;
+static struct resource data_resource = {
+       .name   = "Kernel data",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+       .name   = "Kernel code",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+       .name   = "Kernel bss",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource video_ram_resource = {
+       .name   = "Video RAM area",
+       .start  = 0xa0000,
+       .end    = 0xbffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+       .name   = "dma1",
+       .start  = 0x0000,
+       .end    = 0x001f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic1",
+       .start  = 0x0020,
+       .end    = 0x0021,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer0",
+       .start  = 0x0040,
+       .end    = 0x0043,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer1",
+       .start  = 0x0050,
+       .end    = 0x0053,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "keyboard",
+       .start  = 0x0060,
+       .end    = 0x006f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma page reg",
+       .start  = 0x0080,
+       .end    = 0x008f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic2",
+       .start  = 0x00a0,
+       .end    = 0x00a1,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma2",
+       .start  = 0x00c0,
+       .end    = 0x00df,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "fpu",
+       .start  = 0x00f0,
+       .end    = 0x00ff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+} };
 
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -116,13 +188,17 @@ extern int root_mountflags;
 
 unsigned long saved_videomode;
 
-#define RAMDISK_IMAGE_START_MASK       0x07FF
+#define RAMDISK_IMAGE_START_MASK       0x07FF
 #define RAMDISK_PROMPT_FLAG            0x8000
-#define RAMDISK_LOAD_FLAG              0x4000  
+#define RAMDISK_LOAD_FLAG              0x4000
 
 static char __initdata command_line[COMMAND_LINE_SIZE];
 
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
 struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
 
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
@@ -166,8 +242,7 @@ static int __init parse_mem(char *arg)
                return -EINVAL;
 
        if (strcmp(arg, "nopentium") == 0) {
-               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-               disable_pse = 1;
+               setup_clear_cpu_cap(X86_FEATURE_PSE);
        } else {
                /* If the user specifies memory size, we
                 * limit the BIOS-provided memory map to
@@ -176,7 +251,7 @@ static int __init parse_mem(char *arg)
                 * trim the existing memory map.
                 */
                unsigned long long mem_size;
+
                mem_size = memparse(arg, &arg);
                limit_regions(mem_size);
                user_defined_memmap = 1;
@@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void)
        unsigned int addr;
        addr = get_bios_ebda();
        if (addr)
-               reserve_bootmem(addr, PAGE_SIZE);       
+               reserve_bootmem(addr, PAGE_SIZE);
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
+#ifdef CONFIG_BLK_DEV_INITRD
+
+static bool do_relocate_initrd = false;
+
+static void __init reserve_initrd(void)
+{
+       unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+       unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
+       unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+       unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+       unsigned long ramdisk_here;
+
+       initrd_start = 0;
+
+       if (!boot_params.hdr.type_of_loader ||
+           !ramdisk_image || !ramdisk_size)
+               return;         /* No initrd provided by bootloader */
+
+       if (ramdisk_end < ramdisk_image) {
+               printk(KERN_ERR "initrd wraps around end of memory, "
+                      "disabling initrd\n");
+               return;
+       }
+       if (ramdisk_size >= end_of_lowmem/2) {
+               printk(KERN_ERR "initrd too large to handle, "
+                      "disabling initrd\n");
+               return;
+       }
+       if (ramdisk_end <= end_of_lowmem) {
+               /* All in lowmem, easy case */
+               reserve_bootmem(ramdisk_image, ramdisk_size);
+               initrd_start = ramdisk_image + PAGE_OFFSET;
+               initrd_end = initrd_start+ramdisk_size;
+               return;
+       }
+
+       /* We need to move the initrd down into lowmem */
+       ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+
+       /* Note: this includes all the lowmem currently occupied by
+          the initrd, we rely on that fact to keep the data intact. */
+       reserve_bootmem(ramdisk_here, ramdisk_size);
+       initrd_start = ramdisk_here + PAGE_OFFSET;
+       initrd_end   = initrd_start + ramdisk_size;
+
+       do_relocate_initrd = true;
+}
+
+#define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+static void __init relocate_initrd(void)
+{
+       unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+       unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
+       unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+       unsigned long ramdisk_here;
+       unsigned long slop, clen, mapaddr;
+       char *p, *q;
+
+       if (!do_relocate_initrd)
+               return;
+
+       ramdisk_here = initrd_start - PAGE_OFFSET;
+
+       q = (char *)initrd_start;
+
+       /* Copy any lowmem portion of the initrd */
+       if (ramdisk_image < end_of_lowmem) {
+               clen = end_of_lowmem - ramdisk_image;
+               p = (char *)__va(ramdisk_image);
+               memcpy(q, p, clen);
+               q += clen;
+               ramdisk_image += clen;
+               ramdisk_size  -= clen;
+       }
+
+       /* Copy the highmem portion of the initrd */
+       while (ramdisk_size) {
+               slop = ramdisk_image & ~PAGE_MASK;
+               clen = ramdisk_size;
+               if (clen > MAX_MAP_CHUNK-slop)
+                       clen = MAX_MAP_CHUNK-slop;
+               mapaddr = ramdisk_image & PAGE_MASK;
+               p = early_ioremap(mapaddr, clen+slop);
+               memcpy(q, p+slop, clen);
+               early_iounmap(p, clen+slop);
+               q += clen;
+               ramdisk_image += clen;
+               ramdisk_size  -= clen;
+       }
+}
+
+#endif /* CONFIG_BLK_DEV_INITRD */
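A small sketch (not part of the patch) of the chunking arithmetic relocate_initrd() uses for the highmem portion: each pass maps a page-aligned window with early_ioremap(), skips the sub-page offset ("slop"), and copies at most MAX_MAP_CHUNK - slop bytes.

/* Bytes to copy in one pass, given the current source address and the
 * amount still left; mirrors the loop in relocate_initrd() above. */
static unsigned long initrd_chunk_len(unsigned long image, unsigned long left)
{
        unsigned long slop = image & ~PAGE_MASK;        /* offset within page */
        unsigned long clen = left;

        if (clen > MAX_MAP_CHUNK - slop)
                clen = MAX_MAP_CHUNK - slop;
        return clen;
}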
+
 void __init setup_bootmem_allocator(void)
 {
        unsigned long bootmap_size;
@@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void)
         */
        find_smp_config();
 #endif
-       numa_kva_reserve();
 #ifdef CONFIG_BLK_DEV_INITRD
-       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-               unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-               unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-               unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-               unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-
-               if (ramdisk_end <= end_of_lowmem) {
-                       reserve_bootmem(ramdisk_image, ramdisk_size);
-                       initrd_start = ramdisk_image + PAGE_OFFSET;
-                       initrd_end = initrd_start+ramdisk_size;
-               } else {
-                       printk(KERN_ERR "initrd extends beyond end of memory "
-                              "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-                              ramdisk_end, end_of_lowmem);
-                       initrd_start = 0;
-               }
-       }
+       reserve_initrd();
 #endif
+       numa_kva_reserve();
        reserve_crashkernel();
 }
 
@@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p)
        memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
        pre_setup_arch_hook();
        early_cpu_init();
+       early_ioremap_init();
 
-       /*
-        * FIXME: This isn't an official loader_type right
-        * now but does currently work with elilo.
-        * If we were configured as an EFI kernel, check to make
-        * sure that we were loaded correctly from elilo and that
-        * the system table is valid.  If not, then initialize normally.
-        */
 #ifdef CONFIG_EFI
-       if ((boot_params.hdr.type_of_loader == 0x50) &&
-           boot_params.efi_info.efi_systab)
+       if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+                    "EL32", 4))
                efi_enabled = 1;
 #endif
 
@@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p)
        rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
        ARCH_SETUP
-       if (efi_enabled)
-               efi_init();
-       else {
-               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-               print_memory_map(memory_setup());
-       }
+
+       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+       print_memory_map(memory_setup());
 
        copy_edd();
 
@@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p)
        strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
        *cmdline_p = command_line;
 
+       if (efi_enabled)
+               efi_init();
+
        max_low_pfn = setup_memory();
 
+       /* update e820 for memory not covered by WB MTRRs */
+       mtrr_bp_init();
+       if (mtrr_trim_uncached_memory(max_pfn))
+               max_low_pfn = setup_memory();
+
 #ifdef CONFIG_VMI
        /*
         * Must be after max_low_pfn is determined, and before kernel
@@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p)
        smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
        paging_init();
+
+       /*
+        * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+        */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       if (init_ohci1394_dma_early)
+               init_ohci1394_dma_on_all_controllers();
+#endif
+
        remapped_pgdat_init();
        sparse_init();
        zone_sizes_init();
@@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p)
         * NOTE: at this point the bootmem allocator is fully available.
         */
 
+#ifdef CONFIG_BLK_DEV_INITRD
+       relocate_initrd();
+#endif
+
        paravirt_post_allocator_init();
 
        dmi_scan_machine();
 
+       io_delay_init();
+
 #ifdef CONFIG_X86_GENERICARCH
        generic_apic_probe();
-#endif 
-       if (efi_enabled)
-               efi_map_memmap();
+#endif
 
 #ifdef CONFIG_ACPI
        /*
@@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p)
        acpi_boot_table_init();
 #endif
 
-#ifdef CONFIG_PCI
        early_quirks();
-#endif
 
 #ifdef CONFIG_ACPI
        acpi_boot_init();
@@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 }
+
+/*
+ * Request address space for all standard resources
+ *
+ * This is called just before pcibios_init(), which is also a
+ * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
+ */
+static int __init request_standard_resources(void)
+{
+       int i;
+
+       printk(KERN_INFO "Setting up standard PCI resources\n");
+       init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+
+       request_resource(&iomem_resource, &video_ram_resource);
+
+       /* request I/O space for devices used on all i[345]86 PCs */
+       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+               request_resource(&ioport_resource, &standard_io_resources[i]);
+       return 0;
+}
+
+subsys_initcall(request_standard_resources);
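
request_standard_resources() above follows a common kernel pattern: keep the legacy I/O ranges in a static table and register them in a loop sized with ARRAY_SIZE(). A minimal user-space sketch of that pattern follows; the names and port ranges are invented for the example and are not the kernel's standard_io_resources[] table.

#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/* Minimal stand-in for a resource table (illustrative values only). */
struct io_range { const char *name; unsigned long start, end; };

static struct io_range legacy_io[] = {
        { "dma1",  0x00, 0x1f },
        { "pic1",  0x20, 0x21 },
        { "timer", 0x40, 0x43 },
};

int main(void)
{
        /* Same shape as the loop in request_standard_resources() above:
         * walk a fixed table and register every entry. */
        for (unsigned int i = 0; i < ARRAY_SIZE(legacy_io); i++)
                printf("request %s: %#lx-%#lx\n", legacy_io[i].name,
                       legacy_io[i].start, legacy_io[i].end);
        return 0;
}
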
index 30d94d1d5f5f38e222b8ac7e4a869a6e0ee91b01..77fb87bf6e5a6fc12971be48212fd545422a68cf 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/crash_dump.h>
 #include <linux/root_dev.h>
 #include <linux/pci.h>
+#include <linux/efi.h>
 #include <linux/acpi.h>
 #include <linux/kallsyms.h>
 #include <linux/edd.h>
 #include <linux/dmi.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <linux/init_ohci1394_dma.h>
 
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
+#include <asm/vsyscall.h>
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/msr.h>
@@ -50,6 +54,7 @@
 #include <video/edid.h>
 #include <asm/e820.h>
 #include <asm/dma.h>
+#include <asm/gart.h>
 #include <asm/mpspec.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/sections.h>
 #include <asm/dmi.h>
 #include <asm/cacheflush.h>
+#include <asm/mce.h>
+#include <asm/ds.h>
+#include <asm/topology.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define ARCH_SETUP
+#endif
 
 /*
  * Machine setup..
@@ -67,6 +81,8 @@
 struct cpuinfo_x86 boot_cpu_data __read_mostly;
 EXPORT_SYMBOL(boot_cpu_data);
 
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
 unsigned long mmu_cr4_features;
 
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
@@ -76,7 +92,7 @@ unsigned long saved_video_mode;
 
 int force_mwait __cpuinitdata;
 
-/* 
+/*
  * Early DMI memory
  */
 int dmi_alloc_index;
@@ -122,25 +138,27 @@ struct resource standard_io_resources[] = {
 
 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
 
-struct resource data_resource = {
+static struct resource data_resource = {
        .name = "Kernel data",
        .start = 0,
        .end = 0,
        .flags = IORESOURCE_RAM,
 };
-struct resource code_resource = {
+static struct resource code_resource = {
        .name = "Kernel code",
        .start = 0,
        .end = 0,
        .flags = IORESOURCE_RAM,
 };
-struct resource bss_resource = {
+static struct resource bss_resource = {
        .name = "Kernel bss",
        .start = 0,
        .end = 0,
        .flags = IORESOURCE_RAM,
 };
 
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
+
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
@@ -166,12 +184,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
        bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
        if (bootmap == -1L)
-               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+               panic("Cannot find bootmem map of size %ld\n", bootmap_size);
        bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
        e820_register_active_regions(0, start_pfn, end_pfn);
        free_bootmem_with_active_regions(0, end_pfn);
        reserve_bootmem(bootmap, bootmap_size);
-} 
+}
 #endif
 
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -205,7 +223,8 @@ static void __init reserve_crashkernel(void)
        unsigned long long crash_size, crash_base;
        int ret;
 
-       free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
+       free_mem =
+               ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
 
        ret = parse_crashkernel(boot_command_line, free_mem,
                        &crash_size, &crash_base);
@@ -229,33 +248,21 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-#define EBDA_ADDR_POINTER 0x40E
-
-unsigned __initdata ebda_addr;
-unsigned __initdata ebda_size;
-
-static void discover_ebda(void)
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) __init memory_setup(void)
 {
-       /*
-        * there is a real-mode segmented pointer pointing to the 
-        * 4K EBDA area at 0x40E
-        */
-       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
-       ebda_addr <<= 4;
-
-       ebda_size = *(unsigned short *)__va(ebda_addr);
-
-       /* Round EBDA up to pages */
-       if (ebda_size == 0)
-               ebda_size = 1;
-       ebda_size <<= 10;
-       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
-       if (ebda_size > 64*1024)
-               ebda_size = 64*1024;
+       machine_specific_memory_setup();
 }
 
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
 void __init setup_arch(char **cmdline_p)
 {
+       unsigned i;
+
        printk(KERN_INFO "Command line: %s\n", boot_command_line);
 
        ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -269,7 +276,15 @@ void __init setup_arch(char **cmdline_p)
        rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
        rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
-       setup_memory_region();
+#ifdef CONFIG_EFI
+       if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+                    "EL64", 4))
+               efi_enabled = 1;
+#endif
+
+       ARCH_SETUP
+
+       memory_setup();
        copy_edd();
 
        if (!boot_params.hdr.root_flags)
@@ -293,27 +308,47 @@ void __init setup_arch(char **cmdline_p)
 
        parse_early_param();
 
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       if (init_ohci1394_dma_early)
+               init_ohci1394_dma_on_all_controllers();
+#endif
+
        finish_e820_parsing();
 
+       early_gart_iommu_check();
+
        e820_register_active_regions(0, 0, -1UL);
        /*
         * partially used pages are not usable - thus
         * we are rounding upwards:
         */
        end_pfn = e820_end_of_ram();
+       /* update e820 for memory not covered by WB MTRRs */
+       mtrr_bp_init();
+       if (mtrr_trim_uncached_memory(end_pfn)) {
+               e820_register_active_regions(0, 0, -1UL);
+               end_pfn = e820_end_of_ram();
+       }
+
        num_physpages = end_pfn;
 
        check_efer();
 
-       discover_ebda();
-
        init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+       if (efi_enabled)
+               efi_init();
 
        dmi_scan_machine();
 
+       io_delay_init();
+
 #ifdef CONFIG_SMP
-       /* setup to use the static apicid table during kernel startup */
-       x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
+       /* setup to use the early static init tables during kernel startup */
+       x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+       x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+       x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
 #endif
 
 #ifdef CONFIG_ACPI
@@ -340,48 +375,26 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 #ifdef CONFIG_NUMA
-       numa_initmem_init(0, end_pfn); 
+       numa_initmem_init(0, end_pfn);
 #else
        contig_initmem_init(0, end_pfn);
 #endif
 
-       /* Reserve direct mapping */
-       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
-                               (table_end - table_start) << PAGE_SHIFT);
-
-       /* reserve kernel */
-       reserve_bootmem_generic(__pa_symbol(&_text),
-                               __pa_symbol(&_end) - __pa_symbol(&_text));
+       early_res_to_bootmem();
 
+#ifdef CONFIG_ACPI_SLEEP
        /*
-        * reserve physical page 0 - it's a special BIOS page on many boxes,
-        * enabling clean reboots, SMP operation, laptop functions.
+        * Reserve low memory region for sleep support.
         */
-       reserve_bootmem_generic(0, PAGE_SIZE);
-
-       /* reserve ebda region */
-       if (ebda_addr)
-               reserve_bootmem_generic(ebda_addr, ebda_size);
-#ifdef CONFIG_NUMA
-       /* reserve nodemap region */
-       if (nodemap_addr)
-               reserve_bootmem_generic(nodemap_addr, nodemap_size);
+       acpi_reserve_bootmem();
 #endif
 
-#ifdef CONFIG_SMP
-       /* Reserve SMP trampoline */
-       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
-#endif
+       if (efi_enabled)
+               efi_reserve_bootmem();
 
-#ifdef CONFIG_ACPI_SLEEP
        /*
-        * Reserve low memory region for sleep support.
-        */
-       acpi_reserve_bootmem();
-#endif
-       /*
-        * Find and reserve possible boot-time SMP configuration:
-        */
+       * Find and reserve possible boot-time SMP configuration:
+       */
        find_smp_config();
 #ifdef CONFIG_BLK_DEV_INITRD
        if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
@@ -395,6 +408,8 @@ void __init setup_arch(char **cmdline_p)
                        initrd_start = ramdisk_image + PAGE_OFFSET;
                        initrd_end = initrd_start+ramdisk_size;
                } else {
+                       /* Assumes everything on node 0 */
+                       free_bootmem(ramdisk_image, ramdisk_size);
                        printk(KERN_ERR "initrd extends beyond end of memory "
                               "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
                               ramdisk_end, end_of_mem);
@@ -404,17 +419,10 @@ void __init setup_arch(char **cmdline_p)
 #endif
        reserve_crashkernel();
        paging_init();
+       map_vsyscall();
 
-#ifdef CONFIG_PCI
        early_quirks();
-#endif
 
-       /*
-        * set this early, so we dont allocate cpu0
-        * if MADT list doesnt list BSP first
-        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
-        */
-       cpu_set(0, cpu_present_map);
 #ifdef CONFIG_ACPI
        /*
         * Read APIC and some other early information from ACPI tables.
@@ -430,25 +438,24 @@ void __init setup_arch(char **cmdline_p)
        if (smp_found_config)
                get_smp_config();
        init_apic_mappings();
+       ioapic_init_mappings();
 
        /*
         * We trust e820 completely. No explicit ROM probing in memory.
-        */
-       e820_reserve_resources(); 
+        */
+       e820_reserve_resources(&code_resource, &data_resource, &bss_resource);
        e820_mark_nosave_regions();
 
-       {
-       unsigned i;
        /* request I/O space for devices used on all i[345]86 PCs */
        for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
                request_resource(&ioport_resource, &standard_io_resources[i]);
-       }
 
        e820_setup_gap();
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
-       conswitchp = &vga_con;
+       if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+               conswitchp = &vga_con;
 #elif defined(CONFIG_DUMMY_CONSOLE)
        conswitchp = &dummy_con;
 #endif
@@ -479,9 +486,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 
        if (n >= 0x80000005) {
                cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
-                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-               c->x86_cache_size=(ecx>>24)+(edx>>24);
+               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
+                      "D cache %dK (%d bytes/line)\n",
+                      edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+               c->x86_cache_size = (ecx>>24) + (edx>>24);
                /* On K8 L1 TLB is inclusive, so don't count it */
                c->x86_tlbsize = 0;
        }
@@ -495,11 +503,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
                printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
                c->x86_cache_size, ecx & 0xFF);
        }
-
-       if (n >= 0x80000007)
-               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
        if (n >= 0x80000008) {
-               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
+               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
                c->x86_virt_bits = (eax >> 8) & 0xff;
                c->x86_phys_bits = eax & 0xff;
        }
@@ -508,14 +513,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 static int nearby_node(int apicid)
 {
-       int i;
+       int i, node;
+
        for (i = apicid - 1; i >= 0; i--) {
-               int node = apicid_to_node[i];
+               node = apicid_to_node[i];
                if (node != NUMA_NO_NODE && node_online(node))
                        return node;
        }
        for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-               int node = apicid_to_node[i];
+               node = apicid_to_node[i];
                if (node != NUMA_NO_NODE && node_online(node))
                        return node;
        }
@@ -527,7 +533,7 @@ static int nearby_node(int apicid)
  * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
  * Assumes number of cores is a power of two.
  */
-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
        unsigned bits;
@@ -536,7 +542,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
        int node = 0;
        unsigned apicid = hard_smp_processor_id();
 #endif
-       unsigned ecx = cpuid_ecx(0x80000008);
+       bits = c->x86_coreid_bits;
+
+       /* Low order bits define the core id (index of core in socket) */
+       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
+       /* Convert the APIC ID into the socket ID */
+       c->phys_proc_id = phys_pkg_id(bits);
+
+#ifdef CONFIG_NUMA
+       node = c->phys_proc_id;
+       if (apicid_to_node[apicid] != NUMA_NO_NODE)
+               node = apicid_to_node[apicid];
+       if (!node_online(node)) {
+               /* Two possibilities here:
+                  - The CPU is missing memory and no node was created.
+                  In that case try picking one from a nearby CPU
+                  - The APIC IDs differ from the HyperTransport node IDs
+                  which the K8 northbridge parsing fills in.
+                  Assume they are all increased by a constant offset,
+                  but in the same order as the HT nodeids.
+                  If that doesn't result in a usable node fall back to the
+                  path for the previous case.  */
+
+               int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
+
+               if (ht_nodeid >= 0 &&
+                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+                       node = apicid_to_node[ht_nodeid];
+               /* Pick a nearby node */
+               if (!node_online(node))
+                       node = nearby_node(apicid);
+       }
+       numa_set_node(cpu, node);
+
+       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+       unsigned bits, ecx;
+
+       /* Multi core CPU? */
+       if (c->extended_cpuid_level < 0x80000008)
+               return;
+
+       ecx = cpuid_ecx(0x80000008);
 
        c->x86_max_cores = (ecx & 0xff) + 1;
 
@@ -549,37 +602,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
                        bits++;
        }
 
-       /* Low order bits define the core id (index of core in socket) */
-       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
-       /* Convert the APIC ID into the socket ID */
-       c->phys_proc_id = phys_pkg_id(bits);
-
-#ifdef CONFIG_NUMA
-       node = c->phys_proc_id;
-       if (apicid_to_node[apicid] != NUMA_NO_NODE)
-               node = apicid_to_node[apicid];
-       if (!node_online(node)) {
-               /* Two possibilities here:
-                  - The CPU is missing memory and no node was created.
-                  In that case try picking one from a nearby CPU
-                  - The APIC IDs differ from the HyperTransport node IDs
-                  which the K8 northbridge parsing fills in.
-                  Assume they are all increased by a constant offset,
-                  but in the same order as the HT nodeids.
-                  If that doesn't result in a usable node fall back to the
-                  path for the previous case.  */
-               int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
-               if (ht_nodeid >= 0 &&
-                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-                       node = apicid_to_node[ht_nodeid];
-               /* Pick a nearby node */
-               if (!node_online(node))
-                       node = nearby_node(apicid);
-       }
-       numa_set_node(cpu, node);
+       c->x86_coreid_bits = bits;
 
-       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
 #endif
 }
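
The reshuffle above moves the CPUID probing into early_init_amd_mc(), which caches the number of core-ID bits in c->x86_coreid_bits; amd_detect_cmp() then only has to slice the APIC ID into a core index (low bits) and a package/node ID (remaining bits). A rough user-space sketch of that arithmetic, with made-up values:

#include <stdio.h>

/* Rough sketch of the core/package split done in amd_detect_cmp() above.
 * The low "bits" of the APIC ID index the core within a package; the
 * remaining bits identify the package.  APIC ID and core count are invented. */
static unsigned core_bits(unsigned max_cores)
{
        unsigned bits = 0;

        while ((1u << bits) < max_cores)   /* smallest power of two >= max_cores */
                bits++;
        return bits;
}

int main(void)
{
        unsigned apicid = 0x13, max_cores = 4;
        unsigned bits = core_bits(max_cores);

        printf("core id %u, package id %u\n",
               apicid & ((1u << bits) - 1), apicid >> bits);
        return 0;
}
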
 
@@ -595,8 +619,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
 static __cpuinit int amd_apic_timer_broken(void)
 {
-       u32 lo, hi;
-       u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+       u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+
        switch (eax & CPUID_XFAM) {
        case CPUID_XFAM_K8:
                if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -614,6 +638,15 @@ static __cpuinit int amd_apic_timer_broken(void)
        return 0;
 }
 
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+       early_init_amd_mc(c);
+
+       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+       if (c->x86_power & (1<<8))
+               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
        unsigned level;
@@ -624,7 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
        /*
         * Disable TLB flush filter by setting HWCR.FFDIS on K8
         * bit 6 of msr C001_0015
-        *
+        *
         * Errata 63 for SH-B3 steppings
         * Errata 122 for all steppings (F+ have it disabled by default)
         */
@@ -637,35 +670,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
        /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
           3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-       clear_bit(0*32+31, &c->x86_capability);
-       
+       clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
+
        /* On C+ stepping K8 rep microcode works well for copy/memset */
        level = cpuid_eax(1);
-       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+                            level >= 0x0f58))
+               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
        if (c->x86 == 0x10 || c->x86 == 0x11)
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
        /* Enable workaround for FXSAVE leak */
        if (c->x86 >= 6)
-               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
+               set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
        level = get_model_name(c);
        if (!level) {
-               switch (c->x86) { 
+               switch (c->x86) {
                case 15:
                        /* Should distinguish Models here, but this is only
                           a fallback anyways. */
                        strcpy(c->x86_model_id, "Hammer");
-                       break; 
-               } 
-       } 
+                       break;
+               }
+       }
        display_cacheinfo(c);
 
-       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-       if (c->x86_power & (1<<8))
-               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-
        /* Multi core CPU? */
        if (c->extended_cpuid_level >= 0x80000008)
                amd_detect_cmp(c);
@@ -677,41 +707,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                num_cache_leaves = 3;
 
        if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
-               set_bit(X86_FEATURE_K8, &c->x86_capability);
-
-       /* RDTSC can be speculated around */
-       clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+               set_cpu_cap(c, X86_FEATURE_K8);
 
-       /* Family 10 doesn't support C states in MWAIT so don't use it */
-       if (c->x86 == 0x10 && !force_mwait)
-               clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
+       /* MFENCE stops RDTSC speculation */
+       set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
 
        if (amd_apic_timer_broken())
                disable_apic_timer = 1;
 }
 
-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-       u32     eax, ebx, ecx, edx;
-       int     index_msb, core_bits;
+       u32 eax, ebx, ecx, edx;
+       int index_msb, core_bits;
 
        cpuid(1, &eax, &ebx, &ecx, &edx);
 
 
        if (!cpu_has(c, X86_FEATURE_HT))
                return;
-       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
                goto out;
 
        smp_num_siblings = (ebx & 0xff0000) >> 16;
 
        if (smp_num_siblings == 1) {
                printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1 ) {
+       } else if (smp_num_siblings > 1) {
 
                if (smp_num_siblings > NR_CPUS) {
-                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+                       printk(KERN_WARNING "CPU: Unsupported number of "
+                              "siblings %d", smp_num_siblings);
                        smp_num_siblings = 1;
                        return;
                }
@@ -721,7 +748,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
                smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-               index_msb = get_count_order(smp_num_siblings) ;
+               index_msb = get_count_order(smp_num_siblings);
 
                core_bits = get_count_order(c->x86_max_cores);
 
@@ -730,8 +757,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
        }
 out:
        if ((c->x86_max_cores * smp_num_siblings) > 1) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
-               printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
+               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+                      c->phys_proc_id);
+               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                      c->cpu_core_id);
        }
 
 #endif
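
detect_ht() derives the topology from CPUID leaf 1: EBX[23:16], which the code treats as the logical CPU count per package, is combined with x86_max_cores to split the APIC ID into thread, core and package fields. The sketch below reproduces that arithmetic in user space; count_order() approximates get_count_order() (ceil(log2(n))) and all input values are invented.

#include <stdio.h>

/* User-space sketch of the topology arithmetic in detect_ht() above. */
static int count_order(unsigned n)
{
        int order = 0;

        while ((1u << order) < n)
                order++;
        return order;
}

int main(void)
{
        unsigned apicid = 0x0b;    /* initial APIC ID of this CPU (invented) */
        unsigned siblings = 4;     /* logical CPUs per package, CPUID.1 EBX[23:16] */
        unsigned max_cores = 2;    /* cores per package */
        int pkg_shift = count_order(siblings);
        int index_msb = count_order(siblings / max_cores);
        int core_bits = count_order(max_cores);

        printf("physical id %u, core id %u\n",
               apicid >> pkg_shift,
               (apicid >> index_msb) & ((1u << core_bits) - 1));
        return 0;
}
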
@@ -773,28 +802,39 @@ static void srat_detect_node(void)
 #endif
 }
 
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+           (c->x86 == 0x6 && c->x86_model >= 0x0e))
+               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+}
+
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
        /* Cache sizes */
        unsigned n;
 
        init_intel_cacheinfo(c);
-       if (c->cpuid_level > 9 ) {
+       if (c->cpuid_level > 9) {
                unsigned eax = cpuid_eax(10);
                /* Check for version and the number of counters */
                if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-                       set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
+                       set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
        }
 
        if (cpu_has_ds) {
                unsigned int l1, l2;
                rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
                if (!(l1 & (1<<11)))
-                       set_bit(X86_FEATURE_BTS, c->x86_capability);
+                       set_cpu_cap(c, X86_FEATURE_BTS);
                if (!(l1 & (1<<12)))
-                       set_bit(X86_FEATURE_PEBS, c->x86_capability);
+                       set_cpu_cap(c, X86_FEATURE_PEBS);
        }
 
+
+       if (cpu_has_bts)
+               ds_init_intel(c);
+
        n = c->extended_cpuid_level;
        if (n >= 0x80000008) {
                unsigned eax = cpuid_eax(0x80000008);
@@ -811,14 +851,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                c->x86_cache_alignment = c->x86_clflush_size * 2;
        if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
            (c->x86 == 0x6 && c->x86_model >= 0x0e))
-               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        if (c->x86 == 6)
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-       if (c->x86 == 15)
-               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-       else
-               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-       c->x86_max_cores = intel_num_cpu_cores(c);
+               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+       set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+       c->x86_max_cores = intel_num_cpu_cores(c);
 
        srat_detect_node();
 }
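
Both vendor paths now get an early_init_*() hook; the early_init_amd() hook added earlier and the x86_power read added below both consume CPUID leaf 0x80000007, whose EDX bit 8 is the invariant/constant-TSC flag. The snippet below queries the same leaf from user space on an x86 host via GCC/Clang's <cpuid.h>; it is a host-side illustration, not kernel code.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x80000007 EDX is what early_identify_cpu() stores in
         * c->x86_power; bit 8 gates X86_FEATURE_CONSTANT_TSC above. */
        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
                puts("leaf 0x80000007 not available");
                return 1;
        }
        printf("x86_power = %#x, constant TSC: %s\n",
               edx, (edx & (1u << 8)) ? "yes" : "no");
        return 0;
}
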
@@ -835,18 +872,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
                c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
 
-struct cpu_model_info {
-       int vendor;
-       int family;
-       char *model_names[16];
-};
-
 /* Do some early cpuid on the boot CPU to get some parameters that are
    needed before check_bugs. Everything advanced is in identify_cpu
    below. */
-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 {
-       u32 tfms;
+       u32 tfms, xlvl;
 
        c->loops_per_jiffy = loops_per_jiffy;
        c->x86_cache_size = -1;
@@ -857,6 +888,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
        c->x86_clflush_size = 64;
        c->x86_cache_alignment = c->x86_clflush_size;
        c->x86_max_cores = 1;
+       c->x86_coreid_bits = 0;
        c->extended_cpuid_level = 0;
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
@@ -865,7 +897,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
              (unsigned int *)&c->x86_vendor_id[0],
              (unsigned int *)&c->x86_vendor_id[8],
              (unsigned int *)&c->x86_vendor_id[4]);
-               
+
        get_cpu_vendor(c);
 
        /* Initialize the standard set of capabilities */
@@ -883,7 +915,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
                        c->x86 += (tfms >> 20) & 0xff;
                if (c->x86 >= 0x6)
                        c->x86_model += ((tfms >> 16) & 0xF) << 4;
-               if (c->x86_capability[0] & (1<<19)) 
+               if (c->x86_capability[0] & (1<<19))
                        c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
        } else {
                /* Have CPUID level 0 only - unheard of */
@@ -893,18 +925,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_SMP
        c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-       int i;
-       u32 xlvl;
-
-       early_identify_cpu(c);
-
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
@@ -925,6 +945,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        c->x86_capability[2] = cpuid_edx(0x80860001);
        }
 
+       c->extended_cpuid_level = cpuid_eax(0x80000000);
+       if (c->extended_cpuid_level >= 0x80000007)
+               c->x86_power = cpuid_edx(0x80000007);
+
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               early_init_amd(c);
+               break;
+       case X86_VENDOR_INTEL:
+               early_init_intel(c);
+               break;
+       }
+
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+       int i;
+
+       early_identify_cpu(c);
+
        init_scattered_cpuid_features(c);
 
        c->apicid = phys_pkg_id(0);
@@ -954,8 +998,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                break;
        }
 
-       select_idle_routine(c);
-       detect_ht(c); 
+       detect_ht(c);
 
        /*
         * On SMP, boot_cpu_data holds the common feature set between
@@ -965,32 +1008,56 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
         */
        if (c != &boot_cpu_data) {
                /* AND the already accumulated flags with these */
-               for (i = 0 ; i < NCAPINTS ; i++)
+               for (i = 0; i < NCAPINTS; i++)
                        boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
        }
 
+       /* Clear all flags overridden by options */
+       for (i = 0; i < NCAPINTS; i++)
+               c->x86_capability[i] ^= cleared_cpu_caps[i];
+
 #ifdef CONFIG_X86_MCE
        mcheck_init(c);
 #endif
+       select_idle_routine(c);
+
        if (c != &boot_cpu_data)
                mtrr_ap_init();
 #ifdef CONFIG_NUMA
        numa_add_cpu(smp_processor_id());
 #endif
+
+}
+
+static __init int setup_noclflush(char *arg)
+{
+       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+       return 1;
 }
+__setup("noclflush", setup_noclflush);
 
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
        if (c->x86_model_id[0])
-               printk("%s", c->x86_model_id);
+               printk(KERN_INFO "%s", c->x86_model_id);
 
-       if (c->x86_mask || c->cpuid_level >= 0) 
-               printk(" stepping %02x\n", c->x86_mask);
+       if (c->x86_mask || c->cpuid_level >= 0)
+               printk(KERN_CONT " stepping %02x\n", c->x86_mask);
        else
-               printk("\n");
+               printk(KERN_CONT "\n");
 }
 
+static __init int setup_disablecpuid(char *arg)
+{
+       int bit;
+       if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+               setup_clear_cpu_cap(bit);
+       else
+               return 0;
+       return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
 /*
  *     Get CPU information for use by the procfs.
  */
@@ -998,9 +1065,9 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
        struct cpuinfo_x86 *c = v;
-       int cpu = 0;
+       int cpu = 0, i;
 
-       /* 
+       /*
         * These flag bits must match the definitions in <asm/cpufeature.h>.
         * NULL means this bit is undefined or reserved; either way it doesn't
         * have meaning as far as Linux is concerned.  Note that it's important
@@ -1010,10 +1077,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
         */
        static const char *const x86_cap_flags[] = {
                /* Intel-defined */
-               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
+               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
+               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
+               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
 
                /* AMD-defined */
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1080,34 +1147,35 @@ static int show_cpuinfo(struct seq_file *m, void *v)
        cpu = c->cpu_index;
 #endif
 
-       seq_printf(m,"processor\t: %u\n"
-                    "vendor_id\t: %s\n"
-                    "cpu family\t: %d\n"
-                    "model\t\t: %d\n"
-                    "model name\t: %s\n",
-                    (unsigned)cpu,
-                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
-                    c->x86,
-                    (int)c->x86_model,
-                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
-       
+       seq_printf(m, "processor\t: %u\n"
+                  "vendor_id\t: %s\n"
+                  "cpu family\t: %d\n"
+                  "model\t\t: %d\n"
+                  "model name\t: %s\n",
+                  (unsigned)cpu,
+                  c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+                  c->x86,
+                  (int)c->x86_model,
+                  c->x86_model_id[0] ? c->x86_model_id : "unknown");
+
        if (c->x86_mask || c->cpuid_level >= 0)
                seq_printf(m, "stepping\t: %d\n", c->x86_mask);
        else
                seq_printf(m, "stepping\t: unknown\n");
-       
-       if (cpu_has(c,X86_FEATURE_TSC)) {
+
+       if (cpu_has(c, X86_FEATURE_TSC)) {
                unsigned int freq = cpufreq_quick_get((unsigned)cpu);
+
                if (!freq)
                        freq = cpu_khz;
                seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-                            freq / 1000, (freq % 1000));
+                          freq / 1000, (freq % 1000));
        }
 
        /* Cache size */
-       if (c->x86_cache_size >= 0) 
+       if (c->x86_cache_size >= 0)
                seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-       
+
 #ifdef CONFIG_SMP
        if (smp_num_siblings * c->x86_max_cores > 1) {
                seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
@@ -1116,48 +1184,43 @@ static int show_cpuinfo(struct seq_file *m, void *v)
                seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
                seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
        }
-#endif 
+#endif
 
        seq_printf(m,
-               "fpu\t\t: yes\n"
-               "fpu_exception\t: yes\n"
-               "cpuid level\t: %d\n"
-               "wp\t\t: yes\n"
-               "flags\t\t:",
+                  "fpu\t\t: yes\n"
+                  "fpu_exception\t: yes\n"
+                  "cpuid level\t: %d\n"
+                  "wp\t\t: yes\n"
+                  "flags\t\t:",
                   c->cpuid_level);
 
-       { 
-               int i; 
-               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
-                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
-                               seq_printf(m, " %s", x86_cap_flags[i]);
-       }
-               
+       for (i = 0; i < 32*NCAPINTS; i++)
+               if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+                       seq_printf(m, " %s", x86_cap_flags[i]);
+
        seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
                   c->loops_per_jiffy/(500000/HZ),
                   (c->loops_per_jiffy/(5000/HZ)) % 100);
 
-       if (c->x86_tlbsize > 0) 
+       if (c->x86_tlbsize > 0)
                seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
        seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
        seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
 
-       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
+       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
                   c->x86_phys_bits, c->x86_virt_bits);
 
        seq_printf(m, "power management:");
-       {
-               unsigned i;
-               for (i = 0; i < 32; i++) 
-                       if (c->x86_power & (1 << i)) {
-                               if (i < ARRAY_SIZE(x86_power_flags) &&
-                                       x86_power_flags[i])
-                                       seq_printf(m, "%s%s",
-                                               x86_power_flags[i][0]?" ":"",
-                                               x86_power_flags[i]);
-                               else
-                                       seq_printf(m, " [%d]", i);
-                       }
+       for (i = 0; i < 32; i++) {
+               if (c->x86_power & (1 << i)) {
+                       if (i < ARRAY_SIZE(x86_power_flags) &&
+                           x86_power_flags[i])
+                               seq_printf(m, "%s%s",
+                                          x86_power_flags[i][0]?" ":"",
+                                          x86_power_flags[i]);
+                       else
+                               seq_printf(m, " [%d]", i);
+               }
        }
 
        seq_printf(m, "\n\n");
@@ -1184,8 +1247,8 @@ static void c_stop(struct seq_file *m, void *v)
 {
 }
 
-struct seq_operations cpuinfo_op = {
-       .start =c_start,
+const struct seq_operations cpuinfo_op = {
+       .start = c_start,
        .next = c_next,
        .stop = c_stop,
        .show = show_cpuinfo,
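
The new clearcpuid= parameter added earlier in this file takes a single capability bit number and masks it out of every CPU via cleared_cpu_caps[]. Below is a user-space sketch of the argument validation it performs; get_option() is replaced by strtol(), and NCAPINTS is assumed to be 8 (8 x 32 capability bits) purely for the example.

#include <stdio.h>
#include <stdlib.h>

#define NCAPINTS 8   /* assumed value for this sketch */

static int parse_clearcpuid(const char *arg, int *bit)
{
        char *end;
        long val = strtol(arg, &end, 0);

        if (end == arg || *end != '\0' || val < 0 || val >= NCAPINTS * 32)
                return 0;       /* reject: not a usable capability bit number */
        *bit = (int)val;
        return 1;
}

int main(void)
{
        int bit;

        if (parse_clearcpuid("154", &bit))
                printf("would clear capability bit %d on every CPU\n", bit);
        return 0;
}
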
index 20f29e4c1d332ea682e14acbc570ddfa19924ff7..caee1f002fed39476d54e61b3aa6762b28e92c31 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/vdso.h>
 #include "sigframe_32.h"
 
 #define DEBUG_SIG 0
@@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
 }
 
 asmlinkage int
-sys_sigaltstack(unsigned long ebx)
+sys_sigaltstack(unsigned long bx)
 {
        /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
-       struct pt_regs *regs = (struct pt_regs *)&ebx;
-       const stack_t __user *uss = (const stack_t __user *)ebx;
-       stack_t __user *uoss = (stack_t __user *)regs->ecx;
+       struct pt_regs *regs = (struct pt_regs *)&bx;
+       const stack_t __user *uss = (const stack_t __user *)bx;
+       stack_t __user *uoss = (stack_t __user *)regs->cx;
 
-       return do_sigaltstack(uss, uoss, regs->esp);
+       return do_sigaltstack(uss, uoss, regs->sp);
 }
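
Most of this file is a mechanical rename: the i386 pt_regs members lose their e/x prefixes (esp -> sp, xcs -> cs, ...), so the sigcontext copy macros in the following hunks no longer need the x##seg token pasting, because the register and sigcontext field names now line up. A toy stand-in (fake types, extra parameters added so it compiles on its own) showing why plain substitution is enough once the names match:

#include <stdio.h>

/* Not kernel code: fake register and sigcontext structures with matching
 * member names, only to illustrate the macro pattern. */
struct fake_sigcontext { unsigned short fs, es, ds; };
struct fake_pt_regs    { unsigned long  fs, es, ds; };

#define COPY_SEG(regs, sc, seg)  ((regs)->seg = (sc)->seg)

int main(void)
{
        struct fake_sigcontext sc = { 0x33, 0x2b, 0x2b };
        struct fake_pt_regs regs = { 0, 0, 0 };

        COPY_SEG(&regs, &sc, fs);
        COPY_SEG(&regs, &sc, ds);
        printf("fs=%#lx ds=%#lx\n", regs.fs, regs.ds);
        return 0;
}
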
 
 
@@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
 #define COPY_SEG(seg)                                                  \
        { unsigned short tmp;                                           \
          err |= __get_user(tmp, &sc->seg);                             \
-         regs->x##seg = tmp; }
+         regs->seg = tmp; }
 
 #define COPY_SEG_STRICT(seg)                                           \
        { unsigned short tmp;                                           \
          err |= __get_user(tmp, &sc->seg);                             \
-         regs->x##seg = tmp|3; }
+         regs->seg = tmp|3; }
 
 #define GET_SEG(seg)                                                   \
        { unsigned short tmp;                                           \
@@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
        COPY_SEG(fs);
        COPY_SEG(es);
        COPY_SEG(ds);
-       COPY(edi);
-       COPY(esi);
-       COPY(ebp);
-       COPY(esp);
-       COPY(ebx);
-       COPY(edx);
-       COPY(ecx);
-       COPY(eip);
+       COPY(di);
+       COPY(si);
+       COPY(bp);
+       COPY(sp);
+       COPY(bx);
+       COPY(dx);
+       COPY(cx);
+       COPY(ip);
        COPY_SEG_STRICT(cs);
        COPY_SEG_STRICT(ss);
        
        {
                unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->eflags);
-               regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-               regs->orig_eax = -1;            /* disable syscall checks */
+               err |= __get_user(tmpflags, &sc->flags);
+               regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+               regs->orig_ax = -1;             /* disable syscall checks */
        }
 
        {
@@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
                }
        }
 
-       err |= __get_user(*peax, &sc->eax);
+       err |= __get_user(*peax, &sc->ax);
        return err;
 
 badframe:
@@ -174,9 +175,9 @@ badframe:
 asmlinkage int sys_sigreturn(unsigned long __unused)
 {
        struct pt_regs *regs = (struct pt_regs *) &__unused;
-       struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
+       struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8);
        sigset_t set;
-       int eax;
+       int ax;
 
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
@@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        
-       if (restore_sigcontext(regs, &frame->sc, &eax))
+       if (restore_sigcontext(regs, &frame->sc, &ax))
                goto badframe;
-       return eax;
+       return ax;
 
 badframe:
-       if (show_unhandled_signals && printk_ratelimit())
-               printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
-                      " esp:%lx oeax:%lx\n",
+       if (show_unhandled_signals && printk_ratelimit()) {
+               printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx"
+                      " sp:%lx oeax:%lx",
                    task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
-                   current->comm, task_pid_nr(current), frame, regs->eip,
-                   regs->esp, regs->orig_eax);
+                   current->comm, task_pid_nr(current), frame, regs->ip,
+                   regs->sp, regs->orig_ax);
+               print_vma_addr(" in ", regs->ip);
+               printk("\n");
+       }
 
        force_sig(SIGSEGV, current);
        return 0;
@@ -211,9 +215,9 @@ badframe:
 asmlinkage int sys_rt_sigreturn(unsigned long __unused)
 {
        struct pt_regs *regs = (struct pt_regs *) &__unused;
-       struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
+       struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4);
        sigset_t set;
-       int eax;
+       int ax;
 
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
                goto badframe;
@@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
                goto badframe;
 
-       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
+       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
                goto badframe;
 
-       return eax;
+       return ax;
 
 badframe:
        force_sig(SIGSEGV, current);
@@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 {
        int tmp, err = 0;
 
-       err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
+       err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
        savesegment(gs, tmp);
        err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
 
-       err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
-       err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
-       err |= __put_user(regs->edi, &sc->edi);
-       err |= __put_user(regs->esi, &sc->esi);
-       err |= __put_user(regs->ebp, &sc->ebp);
-       err |= __put_user(regs->esp, &sc->esp);
-       err |= __put_user(regs->ebx, &sc->ebx);
-       err |= __put_user(regs->edx, &sc->edx);
-       err |= __put_user(regs->ecx, &sc->ecx);
-       err |= __put_user(regs->eax, &sc->eax);
+       err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
+       err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
+       err |= __put_user(regs->di, &sc->di);
+       err |= __put_user(regs->si, &sc->si);
+       err |= __put_user(regs->bp, &sc->bp);
+       err |= __put_user(regs->sp, &sc->sp);
+       err |= __put_user(regs->bx, &sc->bx);
+       err |= __put_user(regs->dx, &sc->dx);
+       err |= __put_user(regs->cx, &sc->cx);
+       err |= __put_user(regs->ax, &sc->ax);
        err |= __put_user(current->thread.trap_no, &sc->trapno);
        err |= __put_user(current->thread.error_code, &sc->err);
-       err |= __put_user(regs->eip, &sc->eip);
-       err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
-       err |= __put_user(regs->eflags, &sc->eflags);
-       err |= __put_user(regs->esp, &sc->esp_at_signal);
-       err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+       err |= __put_user(regs->ip, &sc->ip);
+       err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
+       err |= __put_user(regs->flags, &sc->flags);
+       err |= __put_user(regs->sp, &sc->sp_at_signal);
+       err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
 
        tmp = save_i387(fpstate);
        if (tmp < 0)
@@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 static inline void __user *
 get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
 {
-       unsigned long esp;
+       unsigned long sp;
 
        /* Default to using normal stack */
-       esp = regs->esp;
+       sp = regs->sp;
+
+       /*
+        * If we are on the alternate signal stack and would overflow it, don't.
+        * Return an always-bogus address instead so we will die with SIGSEGV.
+        */
+       if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
+               return (void __user *) -1L;
 
        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(esp) == 0)
-                       esp = current->sas_ss_sp + current->sas_ss_size;
+               if (sas_ss_flags(sp) == 0)
+                       sp = current->sas_ss_sp + current->sas_ss_size;
        }
 
        /* This is the legacy signal stack switching. */
-       else if ((regs->xss & 0xffff) != __USER_DS &&
+       else if ((regs->ss & 0xffff) != __USER_DS &&
                 !(ka->sa.sa_flags & SA_RESTORER) &&
                 ka->sa.sa_restorer) {
-               esp = (unsigned long) ka->sa.sa_restorer;
+               sp = (unsigned long) ka->sa.sa_restorer;
        }
 
-       esp -= frame_size;
+       sp -= frame_size;
        /* Align the stack pointer according to the i386 ABI,
         * i.e. so that on function entry ((sp + 4) & 15) == 0. */
-       esp = ((esp + 4) & -16ul) - 4;
-       return (void __user *) esp;
+       sp = ((sp + 4) & -16ul) - 4;
+       return (void __user *) sp;
 }
 
 /* These symbols are defined with the addresses in the vsyscall page.
@@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka,
        }
 
        if (current->binfmt->hasvdso)
-               restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
+               restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
        else
-               restorer = (void *)&frame->retcode;
+               restorer = &frame->retcode;
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
 
@@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka,
                goto give_sigsegv;
 
        /* Set up registers for signal handler */
-       regs->esp = (unsigned long) frame;
-       regs->eip = (unsigned long) ka->sa.sa_handler;
-       regs->eax = (unsigned long) sig;
-       regs->edx = (unsigned long) 0;
-       regs->ecx = (unsigned long) 0;
+       regs->sp = (unsigned long) frame;
+       regs->ip = (unsigned long) ka->sa.sa_handler;
+       regs->ax = (unsigned long) sig;
+       regs->dx = (unsigned long) 0;
+       regs->cx = (unsigned long) 0;
 
-       regs->xds = __USER_DS;
-       regs->xes = __USER_DS;
-       regs->xss = __USER_DS;
-       regs->xcs = __USER_CS;
+       regs->ds = __USER_DS;
+       regs->es = __USER_DS;
+       regs->ss = __USER_DS;
+       regs->cs = __USER_CS;
 
        /*
         * Clear TF when entering the signal handler, but
@@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka,
         * The tracer may want to single-step inside the
         * handler too.
         */
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~TF_MASK;
        if (test_thread_flag(TIF_SINGLESTEP))
                ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
        printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
-               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+               current->comm, current->pid, frame, regs->ip, frame->pretcode);
 #endif
 
        return 0;
@@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        err |= __put_user(0, &frame->uc.uc_flags);
        err |= __put_user(0, &frame->uc.uc_link);
        err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
-       err |= __put_user(sas_ss_flags(regs->esp),
+       err |= __put_user(sas_ss_flags(regs->sp),
                          &frame->uc.uc_stack.ss_flags);
        err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
        err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
@@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                goto give_sigsegv;
 
        /* Set up to return from userspace.  */
-       restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
+       restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
        err |= __put_user(restorer, &frame->pretcode);
         
        /*
-        * This is movl $,%eax ; int $0x80
+        * This is movl $,%ax ; int $0x80
         *
         * WE DO NOT USE IT ANY MORE! It's only left here for historical
         * reasons and because gdb uses it as a signature to notice
@@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                goto give_sigsegv;
 
        /* Set up registers for signal handler */
-       regs->esp = (unsigned long) frame;
-       regs->eip = (unsigned long) ka->sa.sa_handler;
-       regs->eax = (unsigned long) usig;
-       regs->edx = (unsigned long) &frame->info;
-       regs->ecx = (unsigned long) &frame->uc;
+       regs->sp = (unsigned long) frame;
+       regs->ip = (unsigned long) ka->sa.sa_handler;
+       regs->ax = (unsigned long) usig;
+       regs->dx = (unsigned long) &frame->info;
+       regs->cx = (unsigned long) &frame->uc;
 
-       regs->xds = __USER_DS;
-       regs->xes = __USER_DS;
-       regs->xss = __USER_DS;
-       regs->xcs = __USER_CS;
+       regs->ds = __USER_DS;
+       regs->es = __USER_DS;
+       regs->ss = __USER_DS;
+       regs->cs = __USER_CS;
 
        /*
         * Clear TF when entering the signal handler, but
@@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
         * The tracer may want to single-step inside the
         * handler too.
         */
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~TF_MASK;
        if (test_thread_flag(TIF_SINGLESTEP))
                ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
        printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
-               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+               current->comm, current->pid, frame, regs->ip, frame->pretcode);
 #endif
 
        return 0;
@@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
        int ret;
 
        /* Are we from a system call? */
-       if (regs->orig_eax >= 0) {
+       if (regs->orig_ax >= 0) {
                /* If so, check system call restarting.. */
-               switch (regs->eax) {
+               switch (regs->ax) {
                        case -ERESTART_RESTARTBLOCK:
                        case -ERESTARTNOHAND:
-                               regs->eax = -EINTR;
+                               regs->ax = -EINTR;
                                break;
 
                        case -ERESTARTSYS:
                                if (!(ka->sa.sa_flags & SA_RESTART)) {
-                                       regs->eax = -EINTR;
+                                       regs->ax = -EINTR;
                                        break;
                                }
                        /* fallthrough */
                        case -ERESTARTNOINTR:
-                               regs->eax = regs->orig_eax;
-                               regs->eip -= 2;
+                               regs->ax = regs->orig_ax;
+                               regs->ip -= 2;
                }
        }
 
        /*
-        * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
-        * that register information in the sigcontext is correct.
+        * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
+        * flag so that register information in the sigcontext is correct.
         */
-       if (unlikely(regs->eflags & TF_MASK)
-           && likely(current->ptrace & PT_DTRACE)) {
-               current->ptrace &= ~PT_DTRACE;
-               regs->eflags &= ~TF_MASK;
-       }
+       if (unlikely(regs->flags & X86_EFLAGS_TF) &&
+           likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
+               regs->flags &= ~X86_EFLAGS_TF;
 
        /* Set up the stack frame */
        if (ka->sa.sa_flags & SA_SIGINFO)
@@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void fastcall do_signal(struct pt_regs *regs)
+static void do_signal(struct pt_regs *regs)
 {
        siginfo_t info;
        int signr;
@@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs)
                 * have been cleared if the watchpoint triggered
                 * inside the kernel.
                 */
-               if (unlikely(current->thread.debugreg[7]))
-                       set_debugreg(current->thread.debugreg[7], 7);
+               if (unlikely(current->thread.debugreg7))
+                       set_debugreg(current->thread.debugreg7, 7);
 
                /* Whee!  Actually deliver the signal.  */
                if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
@@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs)
        }
 
        /* Did we come from a system call? */
-       if (regs->orig_eax >= 0) {
+       if (regs->orig_ax >= 0) {
                /* Restart the system call - no handlers present */
-               switch (regs->eax) {
+               switch (regs->ax) {
                case -ERESTARTNOHAND:
                case -ERESTARTSYS:
                case -ERESTARTNOINTR:
-                       regs->eax = regs->orig_eax;
-                       regs->eip -= 2;
+                       regs->ax = regs->orig_ax;
+                       regs->ip -= 2;
                        break;
 
                case -ERESTART_RESTARTBLOCK:
-                       regs->eax = __NR_restart_syscall;
-                       regs->eip -= 2;
+                       regs->ax = __NR_restart_syscall;
+                       regs->ip -= 2;
                        break;
                }
        }
@@ -651,7 +660,7 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
 {
        /* Pending single-step? */
        if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->eflags |= TF_MASK;
+               regs->flags |= TF_MASK;
                clear_thread_flag(TIF_SINGLESTEP);
        }
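
Besides the rename, the get_sigframe() hunk earlier in this file gains an overflow check for the alternate signal stack and keeps the i386 ABI alignment rule: after the frame is carved out, the stack pointer is nudged so that ((sp + 4) & 15) == 0 on entry to the handler. A small user-space demonstration of that arithmetic; the starting stack pointer and frame size are made up.

#include <stdio.h>

static unsigned long align_sigframe(unsigned long sp, unsigned long frame_size)
{
        /* Same adjustment as in get_sigframe() above. */
        sp -= frame_size;
        sp = ((sp + 4) & -16ul) - 4;
        return sp;
}

int main(void)
{
        unsigned long sp = align_sigframe(0xbfffff30ul, 0x2a0);

        printf("sp = %#lx, ((sp + 4) & 15) = %lu\n", sp, (sp + 4) & 15);
        return 0;
}
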
 
index 38d806467c0f3ee8ad7e1935a7a5e0b18de9cd26..7347bb14e306bb3ff66385beee9cc1e5c6a15113 100644 (file)
@@ -39,7 +39,7 @@ asmlinkage long
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
                struct pt_regs *regs)
 {
-       return do_sigaltstack(uss, uoss, regs->rsp);
+       return do_sigaltstack(uss, uoss, regs->sp);
 }
 
 
@@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
 
 #define COPY(x)                err |= __get_user(regs->x, &sc->x)
 
-       COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
-       COPY(rdx); COPY(rcx); COPY(rip);
+       COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+       COPY(dx); COPY(cx); COPY(ip);
        COPY(r8);
        COPY(r9);
        COPY(r10);
@@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
 
        {
                unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->eflags);
-               regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
-               regs->orig_rax = -1;            /* disable syscall checks */
+               err |= __get_user(tmpflags, &sc->flags);
+               regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+               regs->orig_ax = -1;             /* disable syscall checks */
        }
 
        {
@@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
                }
        }
 
-       err |= __get_user(*prax, &sc->rax);
+       err |= __get_user(*prax, &sc->ax);
        return err;
 
 badframe:
@@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
        sigset_t set;
-       unsigned long eax;
+       unsigned long ax;
 
-       frame = (struct rt_sigframe __user *)(regs->rsp - 8);
+       frame = (struct rt_sigframe __user *)(regs->sp - 8);
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
                goto badframe;
        } 
@@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
                goto badframe;
 
 #ifdef DEBUG_SIG
-       printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
+       printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax);
 #endif
 
-       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
+       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
                goto badframe;
 
-       return eax;
+       return ax;
 
 badframe:
        signal_fault(regs,frame,"sigreturn");
@@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
        err |= __put_user(0, &sc->gs);
        err |= __put_user(0, &sc->fs);
 
-       err |= __put_user(regs->rdi, &sc->rdi);
-       err |= __put_user(regs->rsi, &sc->rsi);
-       err |= __put_user(regs->rbp, &sc->rbp);
-       err |= __put_user(regs->rsp, &sc->rsp);
-       err |= __put_user(regs->rbx, &sc->rbx);
-       err |= __put_user(regs->rdx, &sc->rdx);
-       err |= __put_user(regs->rcx, &sc->rcx);
-       err |= __put_user(regs->rax, &sc->rax);
+       err |= __put_user(regs->di, &sc->di);
+       err |= __put_user(regs->si, &sc->si);
+       err |= __put_user(regs->bp, &sc->bp);
+       err |= __put_user(regs->sp, &sc->sp);
+       err |= __put_user(regs->bx, &sc->bx);
+       err |= __put_user(regs->dx, &sc->dx);
+       err |= __put_user(regs->cx, &sc->cx);
+       err |= __put_user(regs->ax, &sc->ax);
        err |= __put_user(regs->r8, &sc->r8);
        err |= __put_user(regs->r9, &sc->r9);
        err |= __put_user(regs->r10, &sc->r10);
@@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
        err |= __put_user(regs->r15, &sc->r15);
        err |= __put_user(me->thread.trap_no, &sc->trapno);
        err |= __put_user(me->thread.error_code, &sc->err);
-       err |= __put_user(regs->rip, &sc->rip);
-       err |= __put_user(regs->eflags, &sc->eflags);
+       err |= __put_user(regs->ip, &sc->ip);
+       err |= __put_user(regs->flags, &sc->flags);
        err |= __put_user(mask, &sc->oldmask);
        err |= __put_user(me->thread.cr2, &sc->cr2);
 
@@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
 static void __user *
 get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
 {
-       unsigned long rsp;
+       unsigned long sp;
 
        /* Default to using normal stack - redzone*/
-       rsp = regs->rsp - 128;
+       sp = regs->sp - 128;
 
        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(rsp) == 0)
-                       rsp = current->sas_ss_sp + current->sas_ss_size;
+               if (sas_ss_flags(sp) == 0)
+                       sp = current->sas_ss_sp + current->sas_ss_size;
        }
 
-       return (void __user *)round_down(rsp - size, 16); 
+       return (void __user *)round_down(sp - size, 16);
 }
 
 static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
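
get_stack() above only switches away from the user stack when the handler was registered with SA_ONSTACK and sas_ss_flags() says the task is not already on the alternate stack; the 128-byte redzone skip and the 16-byte rounding are internal details a program never sees. A hedged user-space sketch of that SA_ONSTACK path:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static char altstack[64 * 1024];        /* registered via sigaltstack() below */

static void handler(int sig)
{
        char probe;     /* lives on whichever stack get_stack() picked */

        printf("signal %d: handler stack at %p, altstack spans [%p..%p)\n",
               sig, (void *)&probe, (void *)altstack,
               (void *)(altstack + sizeof(altstack)));
}

int main(void)
{
        stack_t ss;
        struct sigaction sa;

        ss.ss_sp = altstack;
        ss.ss_size = sizeof(altstack);
        ss.ss_flags = 0;
        sigaltstack(&ss, NULL);         /* becomes current->sas_ss_sp/sas_ss_size */

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sa.sa_flags = SA_ONSTACK;       /* ask for the alternate stack */
        sigaction(SIGUSR1, &sa, NULL);

        raise(SIGUSR1);                 /* handler address falls inside altstack */
        return 0;
}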
@@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        err |= __put_user(0, &frame->uc.uc_flags);
        err |= __put_user(0, &frame->uc.uc_link);
        err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
-       err |= __put_user(sas_ss_flags(regs->rsp),
+       err |= __put_user(sas_ss_flags(regs->sp),
                          &frame->uc.uc_stack.ss_flags);
        err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
        err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
@@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                goto give_sigsegv;
 
 #ifdef DEBUG_SIG
-       printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
+       printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax);
 #endif
 
        /* Set up registers for signal handler */
-       regs->rdi = sig;
+       regs->di = sig;
        /* In case the signal handler was declared without prototypes */ 
-       regs->rax = 0;  
+       regs->ax = 0;
 
        /* This also works for non SA_SIGINFO handlers because they expect the
           next argument after the signal number on the stack. */
-       regs->rsi = (unsigned long)&frame->info; 
-       regs->rdx = (unsigned long)&frame->uc; 
-       regs->rip = (unsigned long) ka->sa.sa_handler;
+       regs->si = (unsigned long)&frame->info;
+       regs->dx = (unsigned long)&frame->uc;
+       regs->ip = (unsigned long) ka->sa.sa_handler;
 
-       regs->rsp = (unsigned long)frame;
+       regs->sp = (unsigned long)frame;
 
        /* Set up the CS register to run signal handlers in 64-bit mode,
           even if the handler happens to be interrupting 32-bit code. */
@@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
           see include/asm-x86_64/uaccess.h for details. */
        set_fs(USER_DS);
 
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~X86_EFLAGS_TF;
        if (test_thread_flag(TIF_SINGLESTEP))
                ptrace_notify(SIGTRAP);
 #ifdef DEBUG_SIG
        printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
-               current->comm, current->pid, frame, regs->rip, frame->pretcode);
+               current->comm, current->pid, frame, regs->ip, frame->pretcode);
 #endif
 
        return 0;
@@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
        int ret;
 
 #ifdef DEBUG_SIG
-       printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
+       printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n",
                current->pid, sig,
-               regs->rip, regs->rsp, regs);
+               regs->ip, regs->sp, regs);
 #endif
 
        /* Are we from a system call? */
-       if ((long)regs->orig_rax >= 0) {
+       if ((long)regs->orig_ax >= 0) {
                /* If so, check system call restarting.. */
-               switch (regs->rax) {
+               switch (regs->ax) {
                        case -ERESTART_RESTARTBLOCK:
                        case -ERESTARTNOHAND:
-                               regs->rax = -EINTR;
+                               regs->ax = -EINTR;
                                break;
 
                        case -ERESTARTSYS:
                                if (!(ka->sa.sa_flags & SA_RESTART)) {
-                                       regs->rax = -EINTR;
+                                       regs->ax = -EINTR;
                                        break;
                                }
                                /* fallthrough */
                        case -ERESTARTNOINTR:
-                               regs->rax = regs->orig_rax;
-                               regs->rip -= 2;
+                               regs->ax = regs->orig_ax;
+                               regs->ip -= 2;
                                break;
                }
        }
 
        /*
-        * If TF is set due to a debugger (PT_DTRACE), clear the TF
-        * flag so that register information in the sigcontext is
-        * correct.
+        * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
+        * flag so that register information in the sigcontext is correct.
         */
-       if (unlikely(regs->eflags & TF_MASK)) {
-               if (likely(current->ptrace & PT_DTRACE)) {
-                       current->ptrace &= ~PT_DTRACE;
-                       regs->eflags &= ~TF_MASK;
-               }
-       }
+       if (unlikely(regs->flags & X86_EFLAGS_TF) &&
+           likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
+               regs->flags &= ~X86_EFLAGS_TF;
 
 #ifdef CONFIG_IA32_EMULATION
        if (test_thread_flag(TIF_IA32)) {
@@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs)
        }
 
        /* Did we come from a system call? */
-       if ((long)regs->orig_rax >= 0) {
+       if ((long)regs->orig_ax >= 0) {
                /* Restart the system call - no handlers present */
-               long res = regs->rax;
+               long res = regs->ax;
                switch (res) {
                case -ERESTARTNOHAND:
                case -ERESTARTSYS:
                case -ERESTARTNOINTR:
-                       regs->rax = regs->orig_rax;
-                       regs->rip -= 2;
+                       regs->ax = regs->orig_ax;
+                       regs->ip -= 2;
                        break;
                case -ERESTART_RESTARTBLOCK:
-                       regs->rax = test_thread_flag(TIF_IA32) ?
+                       regs->ax = test_thread_flag(TIF_IA32) ?
                                        __NR_ia32_restart_syscall :
                                        __NR_restart_syscall;
-                       regs->rip -= 2;
+                       regs->ip -= 2;
                        break;
                }
        }
@@ -461,13 +457,13 @@ void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
 #ifdef DEBUG_SIG
-       printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
-              thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 
+       printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n",
+              thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current));
 #endif
               
        /* Pending single-step? */
        if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->eflags |= TF_MASK;
+               regs->flags |= X86_EFLAGS_TF;
                clear_thread_flag(TIF_SINGLESTEP);
        }
 
@@ -488,9 +484,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 { 
        struct task_struct *me = current; 
-       if (show_unhandled_signals && printk_ratelimit())
-               printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
-              me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 
+       if (show_unhandled_signals && printk_ratelimit()) {
+               printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+              me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+               print_vma_addr(" in ", regs->ip);
+               printk("\n");
+       }
 
        force_sig(SIGSEGV, me); 
 } 
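
The register setup in setup_rt_frame() above (di = signal number, si = &frame->info, dx = &frame->uc) is simply the x86-64 calling convention for the three arguments of an SA_SIGINFO handler. A small sketch, assuming an x86-64 Linux target, that inspects those arguments from user space:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *info, void *uc_void)
{
        ucontext_t *uc = uc_void;

        /* sig arrived in %rdi, info in %rsi and uc in %rdx, exactly as
         * loaded by setup_rt_frame() before jumping to the handler */
        printf("sig=%d si_signo=%d interrupted rip=%#llx\n",
               sig, info->si_signo,
               (unsigned long long)uc->uc_mcontext.gregs[REG_RIP]);
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGUSR1, &sa, NULL);

        raise(SIGUSR1);
        return 0;
}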
index fcaa026eb80729f99ef68e8b7695ed59a8498842..dc0cde9d16fb38088587b664f02b91f878176628 100644 (file)
@@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
        apic_write_around(APIC_ICR, cfg);
 }
 
-void fastcall send_IPI_self(int vector)
+void send_IPI_self(int vector)
 {
        __send_IPI_shortcut(APIC_DEST_SELF, vector);
 }
@@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
         */ 
 
        local_irq_save(flags);
-       for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
+       for_each_possible_cpu(query_cpu) {
                if (cpu_isset(query_cpu, mask)) {
                        __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
                                              vector);
@@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-void leave_mm(unsigned long cpu)
+void leave_mm(int cpu)
 {
        if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
                BUG();
        cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
        load_cr3(swapper_pg_dir);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 /*
  *
@@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu)
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
-fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+void smp_invalidate_interrupt(struct pt_regs *regs)
 {
        unsigned long cpu;
 
@@ -638,13 +639,13 @@ static void native_smp_send_stop(void)
  * all the work is done automatically when
  * we return from the interrupt.
  */
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+void smp_reschedule_interrupt(struct pt_regs *regs)
 {
        ack_APIC_irq();
        __get_cpu_var(irq_stat).irq_resched_count++;
 }
 
-fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+void smp_call_function_interrupt(struct pt_regs *regs)
 {
        void (*func) (void *info) = call_data->func;
        void *info = call_data->info;
@@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id)
 {
        int i;
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for_each_possible_cpu(i) {
                if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
                        return i;
        }
index 03fa6ed559c637f5bf3b240184db97c16642aa65..2fd74b06db67093cc3b7789534f30d81b6912346 100644 (file)
@@ -29,7 +29,7 @@
 #include <asm/idle.h>
 
 /*
- *     Smarter SMP flushing macros. 
+ *     Smarter SMP flushing macros.
  *             c/o Linus Torvalds.
  *
  *     These mean you can really definitely utterly forget about
  *
  *     Optimizations Manfred Spraul <manfred@colorfullife.com>
  *
- *     More scalable flush, from Andi Kleen
+ *     More scalable flush, from Andi Kleen
  *
- *     To avoid global state use 8 different call vectors.
- *     Each CPU uses a specific vector to trigger flushes on other
- *     CPUs. Depending on the received vector the target CPUs look into
+ *     To avoid global state use 8 different call vectors.
+ *     Each CPU uses a specific vector to trigger flushes on other
+ *     CPUs. Depending on the received vector the target CPUs look into
  *     the right per cpu variable for the flush data.
  *
- *     With more than 8 CPUs they are hashed to the 8 available
- *     vectors. The limited global vector space forces us to this right now.
+ *     With more than 8 CPUs they are hashed to the 8 available
+ *     vectors. The limited global vector space forces us to this right now.
  *     In future when interrupts are split into per CPU domains this could be
  *     fixed, at the cost of triggering multiple IPIs in some cases.
  */
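
In other words, senders are hashed onto the eight invalidate vectors with a plain modulo (smp_invalidate_interrupt() below recovers the sender from ~regs->orig_ax), and each vector owns its own flush_state slot and lock. A stand-alone sketch of just that hashing, assuming the same NUM_INVALIDATE_TLB_VECTORS of 8:

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8    /* limited vector space, as noted above */

int main(void)
{
        int cpu, ncpus = 32;            /* pretend 32 possible CPUs */

        /* Each CPU always sends on the same vector, so at most eight
         * flush_state slots (and spinlocks) exist; CPUs that hash to the
         * same slot simply serialize on its lock. */
        for (cpu = 0; cpu < ncpus; cpu++)
                printf("cpu %2d -> flush vector slot %d\n",
                       cpu, cpu % NUM_INVALIDATE_TLB_VECTORS);
        return 0;
}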
@@ -55,7 +55,6 @@ union smp_flush_state {
                cpumask_t flush_cpumask;
                struct mm_struct *flush_mm;
                unsigned long flush_va;
-#define FLUSH_ALL      -1ULL
                spinlock_t tlbstate_lock;
        };
        char pad[SMP_CACHE_BYTES];
@@ -67,16 +66,17 @@ union smp_flush_state {
 static DEFINE_PER_CPU(union smp_flush_state, flush_state);
 
 /*
- * We cannot call mmdrop() because we are in interrupt context, 
+ * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm(int cpu)
+void leave_mm(int cpu)
 {
        if (read_pda(mmu_state) == TLBSTATE_OK)
                BUG();
        cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
        load_cr3(swapper_pg_dir);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 /*
  *
@@ -85,25 +85,25 @@ static inline void leave_mm(int cpu)
  * 1) switch_mm() either 1a) or 1b)
  * 1a) thread switch to a different mm
  * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *     Stop ipi delivery for the old mm. This is not synchronized with
- *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *     for the wrong mm, and in the worst case we perform a superfluous
- *     tlb flush.
+ *     Stop ipi delivery for the old mm. This is not synchronized with
+ *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *     for the wrong mm, and in the worst case we perform a superfluous
+ *     tlb flush.
  * 1a2) set cpu mmu_state to TLBSTATE_OK
- *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
  *     was in lazy tlb mode.
  * 1a3) update cpu active_mm
- *     Now cpu0 accepts tlb flushes for the new mm.
+ *     Now cpu0 accepts tlb flushes for the new mm.
  * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *     Now the other cpus will send tlb flush ipis.
+ *     Now the other cpus will send tlb flush ipis.
  * 1a4) change cr3.
  * 1b) thread switch without mm change
  *     cpu active_mm is correct, cpu0 already handles
  *     flush ipis.
  * 1b1) set cpu mmu_state to TLBSTATE_OK
  * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *     Atomically set the bit [other cpus will start sending flush ipis],
- *     and test the bit.
+ *     Atomically set the bit [other cpus will start sending flush ipis],
+ *     and test the bit.
  * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
  * 2) switch %%esp, ie current
  *
@@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
         * orig_rax contains the negated interrupt vector.
         * Use that to determine where the sender put the data.
         */
-       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
+       sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
        f = &per_cpu(flush_state, sender);
 
        if (!cpu_isset(cpu, f->flush_cpumask))
                goto out;
-               /* 
+               /*
                 * This was a BUG() but until someone can quote me the
                 * line from the intel manual that guarantees an IPI to
                 * multiple CPUs is retried _only_ on the erroring CPUs
@@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
                 *
                 * BUG();
                 */
-                
+
        if (f->flush_mm == read_pda(active_mm)) {
                if (read_pda(mmu_state) == TLBSTATE_OK) {
-                       if (f->flush_va == FLUSH_ALL)
+                       if (f->flush_va == TLB_FLUSH_ALL)
                                local_flush_tlb();
                        else
                                __flush_tlb_one(f->flush_va);
@@ -166,19 +166,22 @@ out:
        add_pda(irq_tlb_count, 1);
 }
 
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
-                                               unsigned long va)
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+                            unsigned long va)
 {
        int sender;
        union smp_flush_state *f;
+       cpumask_t cpumask = *cpumaskp;
 
        /* Caller has disabled preemption */
        sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
        f = &per_cpu(flush_state, sender);
 
-       /* Could avoid this lock when
-          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-          probably not worth checking this for a cache-hot lock. */
+       /*
+        * Could avoid this lock when
+        * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+        * probably not worth checking this for a cache-hot lock.
+        */
        spin_lock(&f->tlbstate_lock);
 
        f->flush_mm = mm;
@@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 int __cpuinit init_smp_flush(void)
 {
        int i;
+
        for_each_cpu_mask(i, cpu_possible_map) {
                spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
        }
        return 0;
 }
-
 core_initcall(init_smp_flush);
-       
+
 void flush_tlb_current_task(void)
 {
        struct mm_struct *mm = current->mm;
@@ -221,10 +224,9 @@ void flush_tlb_current_task(void)
 
        local_flush_tlb();
        if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
        preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_current_task);
 
 void flush_tlb_mm (struct mm_struct * mm)
 {
@@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm)
                        leave_mm(smp_processor_id());
        }
        if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+               flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 
        preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_mm);
 
 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 {
@@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
        if (current->active_mm == mm) {
                if(current->mm)
                        __flush_tlb_one(va);
-                else
-                       leave_mm(smp_processor_id());
+               else
+                       leave_mm(smp_processor_id());
        }
 
        if (!cpus_empty(cpu_mask))
@@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 
        preempt_enable();
 }
-EXPORT_SYMBOL(flush_tlb_page);
 
 static void do_flush_tlb_all(void* info)
 {
@@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void)
  * this function sends a 'generic call function' IPI to all other CPU
  * of the system defined in the mask.
  */
-
-static int
-__smp_call_function_mask(cpumask_t mask,
-                        void (*func)(void *), void *info,
-                        int wait)
+static int __smp_call_function_mask(cpumask_t mask,
+                                   void (*func)(void *), void *info,
+                                   int wait)
 {
        struct call_data_struct data;
        cpumask_t allbutself;
@@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
  */
 
 int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-       int nonatomic, int wait)
+                             int nonatomic, int wait)
 {
        /* prevent preemption and reschedule on another processor */
-       int ret;
-       int me = get_cpu();
+       int ret, me = get_cpu();
 
        /* Can deadlock when called with interrupts disabled */
        WARN_ON(irqs_disabled());
@@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy)
         */
        cpu_clear(smp_processor_id(), cpu_online_map);
        disable_local_APIC();
-       for (;;) 
+       for (;;)
                halt();
-} 
+}
 
 void smp_send_stop(void)
 {
index 4ea80cbe52e58916798a9e7bd962c90ebd4e5a3f..5787a0c3e2960919f6dc82f0b034a5ba1d19735c 100644 (file)
@@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map);
 
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-EXPORT_SYMBOL(cpu_callout_map);
 cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
@@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask;
 DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
-/*
- * The following static array is used during kernel startup
- * and the x86_cpu_to_apicid_ptr contains the address of the
- * array during this time.  Is it zeroed when the per_cpu
- * data area is removed.
- */
+/* which logical CPU number maps to which CPU (physical APIC ID) */
 u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
                        { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_ptr;
+void *x86_cpu_to_apicid_early_ptr;
 DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
 EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 
@@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID];
 extern const unsigned char trampoline_data [];
 extern const unsigned char trampoline_end  [];
 static unsigned char *trampoline_base;
-static int trampoline_exec;
 
 static void map_cpu_to_logical_apicid(void);
 
@@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void)
  */
 void __init smp_alloc_memory(void)
 {
-       trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
+       trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
        /*
         * Has to be in very low memory so we can execute
         * real-mode AP code.
         */
        if (__pa(trampoline_base) >= 0x9F000)
                BUG();
-       /*
-        * Make the SMP trampoline executable:
-        */
-       trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
 }
 
 /*
@@ -405,7 +394,7 @@ static void __cpuinit start_secondary(void *unused)
        setup_secondary_clock();
        if (nmi_watchdog == NMI_IO_APIC) {
                disable_8259A_irq(0);
-               enable_NMI_through_LVT0(NULL);
+               enable_NMI_through_LVT0();
                enable_8259A_irq(0);
        }
        /*
@@ -448,38 +437,38 @@ void __devinit initialize_secondary(void)
 {
        /*
         * We don't actually need to load the full TSS,
-        * basically just the stack pointer and the eip.
+        * basically just the stack pointer and the ip.
         */
 
        asm volatile(
                "movl %0,%%esp\n\t"
                "jmp *%1"
                :
-               :"m" (current->thread.esp),"m" (current->thread.eip));
+               :"m" (current->thread.sp),"m" (current->thread.ip));
 }
 
 /* Static state in head.S used to set up a CPU */
 extern struct {
-       void * esp;
+       void * sp;
        unsigned short ss;
 } stack_start;
 
 #ifdef CONFIG_NUMA
 
 /* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
+cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
                                { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-EXPORT_SYMBOL(node_2_cpu_mask);
+EXPORT_SYMBOL(node_to_cpumask_map);
 /* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_2_node);
+int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_to_node_map);
 
 /* set up a mapping between cpu and node. */
 static inline void map_cpu_to_node(int cpu, int node)
 {
        printk("Mapping cpu %d to node %d\n", cpu, node);
-       cpu_set(cpu, node_2_cpu_mask[node]);
-       cpu_2_node[cpu] = node;
+       cpu_set(cpu, node_to_cpumask_map[node]);
+       cpu_to_node_map[cpu] = node;
 }
 
 /* undo a mapping between cpu and node. */
@@ -489,8 +478,8 @@ static inline void unmap_cpu_to_node(int cpu)
 
        printk("Unmapping cpu %d from all nodes\n", cpu);
        for (node = 0; node < MAX_NUMNODES; node ++)
-               cpu_clear(cpu, node_2_cpu_mask[node]);
-       cpu_2_node[cpu] = 0;
+               cpu_clear(cpu, node_to_cpumask_map[node]);
+       cpu_to_node_map[cpu] = 0;
 }
 #else /* !CONFIG_NUMA */
 
@@ -668,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
         * target processor state.
         */
        startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-                        (unsigned long) stack_start.esp);
+                        (unsigned long) stack_start.sp);
 
        /*
         * Run STARTUP IPI loop.
@@ -754,7 +743,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
                /* initialize thread_struct.  we really want to avoid destroy
                 * idle tread
                 */
-               idle->thread.esp = (unsigned long)task_pt_regs(idle);
+               idle->thread.sp = (unsigned long)task_pt_regs(idle);
                init_idle(idle, cpu);
                return idle;
        }
@@ -799,7 +788,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
        per_cpu(current_task, cpu) = idle;
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 
-       idle->thread.eip = (unsigned long) start_secondary;
+       idle->thread.ip = (unsigned long) start_secondary;
        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline();
 
@@ -807,9 +796,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
        alternatives_smp_switch(1);
 
        /* So we see what's up   */
-       printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+       printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
        /* Stack for startup_32 can be just as for start_secondary onwards */
-       stack_start.esp = (void *) idle->thread.esp;
+       stack_start.sp = (void *) idle->thread.sp;
 
        irq_ctx_init(cpu);
 
@@ -1091,7 +1080,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
         * Allow the user to impress friends.
         */
        Dprintk("Before bogomips.\n");
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
+       for_each_possible_cpu(cpu)
                if (cpu_isset(cpu, cpu_callout_map))
                        bogosum += cpu_data(cpu).loops_per_jiffy;
        printk(KERN_INFO
@@ -1122,7 +1111,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
         * construct cpu_sibling_map, so that we can tell sibling CPUs
         * efficiently.
         */
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+       for_each_possible_cpu(cpu) {
                cpus_clear(per_cpu(cpu_sibling_map, cpu));
                cpus_clear(per_cpu(cpu_core_map, cpu));
        }
@@ -1296,12 +1285,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
        setup_ioapic_dest();
 #endif
        zap_low_mappings();
-#ifndef CONFIG_HOTPLUG_CPU
-       /*
-        * Disable executability of the SMP trampoline:
-        */
-       set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
-#endif
 }
 
 void __init smp_intr_init(void)
index aaf4e1291217ef120d98cb3d7d98e55dd0debb79..cc64b8085c2aec9f0196dd83320b1f11db0d2afa 100644 (file)
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
 
 /* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
 
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
@@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map);
  */
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-EXPORT_SYMBOL(cpu_callout_map);
-
 cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 
@@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
  * a new thread. Also avoids complicated thread destroy functionality
  * for idle threads.
  */
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
+ * removed after init for !CONFIG_HOTPLUG_CPU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
+#define get_idle_for_cpu(x)     (per_cpu(idle_thread_array, x))
+#define set_idle_for_cpu(x,p)   (per_cpu(idle_thread_array, x) = (p))
+#else
 struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-
 #define get_idle_for_cpu(x)     (idle_thread_array[(x)])
 #define set_idle_for_cpu(x,p)   (idle_thread_array[(x)] = (p))
+#endif
+
 
 /*
  * Currently trivial. Write the real->protected mode
@@ -212,6 +220,7 @@ void __cpuinit smp_callin(void)
 
        Dprintk("CALLIN, before setup_local_APIC().\n");
        setup_local_APIC();
+       end_local_APIC_setup();
 
        /*
         * Get our bogomips.
@@ -338,7 +347,7 @@ void __cpuinit start_secondary(void)
 
        if (nmi_watchdog == NMI_IO_APIC) {
                disable_8259A_irq(0);
-               enable_NMI_through_LVT0(NULL);
+               enable_NMI_through_LVT0();
                enable_8259A_irq(0);
        }
 
@@ -370,7 +379,7 @@ void __cpuinit start_secondary(void)
 
        unlock_ipi_call_lock();
 
-       setup_secondary_APIC_clock();
+       setup_secondary_clock();
 
        cpu_idle();
 }
@@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid)
        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
        int timeout;
-       unsigned int status;
+       u32 status;
 
        printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
 
        for (i = 0; i < ARRAY_SIZE(regs); i++) {
-               printk("... APIC #%d %s: ", apicid, names[i]);
+               printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
 
                /*
                 * Wait for idle.
                 */
                status = safe_apic_wait_icr_idle();
                if (status)
-                       printk("a previous APIC delivery may have failed\n");
+                       printk(KERN_CONT
+                              "a previous APIC delivery may have failed\n");
 
                apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
                apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid)
                switch (status) {
                case APIC_ICR_RR_VALID:
                        status = apic_read(APIC_RRR);
-                       printk("%08x\n", status);
+                       printk(KERN_CONT "%08x\n", status);
                        break;
                default:
-                       printk("failed\n");
+                       printk(KERN_CONT "failed\n");
                }
        }
 }
@@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
         */
        Dprintk("#startup loops: %d.\n", num_starts);
 
-       maxlvt = get_maxlvt();
+       maxlvt = lapic_get_maxlvt();
 
        for (j = 1; j <= num_starts; j++) {
                Dprintk("Sending STARTUP #%d.\n",j);
@@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
        c_idle.idle = get_idle_for_cpu(cpu);
 
        if (c_idle.idle) {
-               c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+               c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
                        (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
                init_idle(c_idle.idle, cpu);
                goto do_rest;
@@ -613,8 +623,8 @@ do_rest:
 
        start_rip = setup_trampoline();
 
-       init_rsp = c_idle.idle->thread.rsp;
-       per_cpu(init_tss,cpu).rsp0 = init_rsp;
+       init_rsp = c_idle.idle->thread.sp;
+       load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
        initial_code = start_secondary;
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 
@@ -691,7 +701,7 @@ do_rest:
        }
        if (boot_error) {
                cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
-               clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+               clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
                clear_node_cpumask(cpu); /* was set by numa_add_cpu */
                cpu_clear(cpu, cpu_present_map);
                cpu_clear(cpu, cpu_possible_map);
@@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus)
        return 0;
 }
 
-/*
- * Copy apicid's found by MP_processor_info from initial array to the per cpu
- * data area.  The x86_cpu_to_apicid_init array is then expendable and the
- * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no
- * longer available.
- */
-void __init smp_set_apicids(void)
+static void __init smp_cpu_index_default(void)
 {
-       int cpu;
+       int i;
+       struct cpuinfo_x86 *c;
 
-       for_each_cpu_mask(cpu, cpu_possible_map) {
-               if (per_cpu_offset(cpu))
-                       per_cpu(x86_cpu_to_apicid, cpu) =
-                                               x86_cpu_to_apicid_init[cpu];
+       for_each_cpu_mask(i, cpu_possible_map) {
+               c = &cpu_data(i);
+               /* mark all to hotplug */
+               c->cpu_index = NR_CPUS;
        }
-
-       /* indicate the static array will be going away soon */
-       x86_cpu_to_apicid_ptr = NULL;
 }
 
 /*
@@ -868,9 +870,9 @@ void __init smp_set_apicids(void)
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
        nmi_watchdog_default();
+       smp_cpu_index_default();
        current_cpu_data = boot_cpu_data;
        current_thread_info()->cpu = 0;  /* needed? */
-       smp_set_apicids();
        set_cpu_sibling_map(0);
 
        if (smp_sanity_check(max_cpus) < 0) {
@@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
         */
        setup_local_APIC();
 
+       /*
+        * Enable IO APIC before setting up error vector
+        */
+       if (!skip_ioapic_setup && nr_ioapics)
+               enable_IO_APIC();
+       end_local_APIC_setup();
+
        if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
                      GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
@@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
         * Set up local APIC timer on boot CPU.
         */
 
-       setup_boot_APIC_clock();
+       setup_boot_clock();
 }
 
 /*
@@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 void __init smp_prepare_boot_cpu(void)
 {
        int me = smp_processor_id();
-       cpu_set(me, cpu_online_map);
+       /* already set me in cpu_online_map in boot_cpu_init() */
        cpu_set(me, cpu_callout_map);
        per_cpu(cpu_state, me) = CPU_ONLINE;
 }
@@ -1016,7 +1025,7 @@ void remove_cpu_from_maps(void)
 
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
-       clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+       clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
        clear_node_cpumask(cpu);
 }
 
index bbfe85a0f699b198008445182430f2848843699c..8bc38af29aef54aabc6f5b909273e0d13915dd9f 100644 (file)
@@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu)
 {
        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
 
-       pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
-                       (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+       pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
                        __per_cpu_offset[cpu], 0xFFFFF,
-                       0x80 | DESCTYPE_S | 0x2, 0x8);
+                       0x2 | DESCTYPE_S, 0x8);
+
+       gdt[GDT_ENTRY_PERCPU].s = 1;
 
        per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
        per_cpu(cpu_number, cpu) = cpu;
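
The reworked call above builds the per-CPU segment directly in a struct desc_struct: base __per_cpu_offset[cpu], limit 0xFFFFF with 4K granularity (the 0x8 flag nibble), and a writable data type, with the S bit set separately. A hedged stand-alone sketch of how the classic 8-byte descriptor packs those fields (illustration only; the real kernel helper and struct layout differ in detail):

#include <stdint.h>
#include <stdio.h>

/* Pack an x86 segment descriptor from base, 20-bit limit, access byte
 * (P/DPL/S/type) and the G/DB/L/AVL flag nibble. */
static uint64_t pack_descriptor(uint32_t base, uint32_t limit,
                                unsigned int access, unsigned int flags)
{
        uint64_t lo, hi;

        lo = ((uint64_t)(base & 0xffff) << 16) | (limit & 0xffff);
        hi = (base & 0xff000000) |
             ((uint64_t)(flags & 0xf) << 20) |
             (limit & 0x000f0000) |
             ((uint64_t)(access & 0xff) << 8) |
             ((base >> 16) & 0xff);
        return (hi << 32) | lo;
}

int main(void)
{
        /* present, DPL 0, S=1, read/write data; limit 0xFFFFF, 4K granular,
         * 32-bit default size (roughly what init_gdt() above asks for) */
        uint64_t d = pack_descriptor(0x12340000, 0xFFFFF, 0x92, 0xc);

        printf("descriptor = %#018llx\n", (unsigned long long)d);
        return 0;
}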
index 2a8713ec0f9aa7823ad0ec9a5cc53eadea4e8ecb..2bf6903cb4448d8152f0c671f3c0d2bdae0afae7 100644 (file)
@@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
 static int num_memory_chunks;          /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
-extern void * boot_ioremap(unsigned long, unsigned long);
-
 /* Identify CPU proximity domains */
 static void __init parse_cpu_affinity_structure(char *p)
 {
@@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void)
        }
 
        rsdt = (struct acpi_table_rsdt *)
-           boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
+           early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
 
        if (!rsdt) {
                printk(KERN_WARNING
@@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void)
        for (i = 0; i < tables; i++) {
                /* Map in header, then map in full table length. */
                header = (struct acpi_table_header *)
-                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
+                       early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
                if (!header)
                        break;
                header = (struct acpi_table_header *)
-                       boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
+                       early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
                if (!header)
                        break;
 
index 55771fd7e545c2875c64ad274ed8609f1298931a..02f0f61f5b1131a511851546393ef06665787507 100644 (file)
@@ -22,9 +22,9 @@ static int save_stack_stack(void *data, char *name)
        return -1;
 }
 
-static void save_stack_address(void *data, unsigned long addr)
+static void save_stack_address(void *data, unsigned long addr, int reliable)
 {
-       struct stack_trace *trace = (struct stack_trace *)data;
+       struct stack_trace *trace = data;
        if (trace->skip > 0) {
                trace->skip--;
                return;
@@ -33,7 +33,8 @@ static void save_stack_address(void *data, unsigned long addr)
                trace->entries[trace->nr_entries++] = addr;
 }
 
-static void save_stack_address_nosched(void *data, unsigned long addr)
+static void
+save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 {
        struct stack_trace *trace = (struct stack_trace *)data;
        if (in_sched_functions(addr))
@@ -65,15 +66,14 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
  */
 void save_stack_trace(struct stack_trace *trace)
 {
-       dump_trace(current, NULL, NULL, &save_stack_ops, trace);
+       dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
-EXPORT_SYMBOL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-       dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
+       dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
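
save_stack_trace() above walks the current stack through dump_trace() and records return addresses into a caller-supplied struct stack_trace, honouring skip/max_entries and terminating the list with ULONG_MAX. A loose user-space analogue (glibc's backtrace(), not the kernel API) of the same capture-into-an-array pattern:

#include <execinfo.h>
#include <stdio.h>

#define MAX_ENTRIES 16

static void capture(void)
{
        void *entries[MAX_ENTRIES];
        int n, i;

        /* walk the current call stack and store return addresses,
         * much like dump_trace() feeding save_stack_address() */
        n = backtrace(entries, MAX_ENTRIES);
        for (i = 0; i < n; i++)
                printf("#%d %p\n", i, entries[i]);
}

static void middle(void)
{
        capture();
}

int main(void)
{
        middle();
        return 0;
}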
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644 (file)
index 0000000..2ef1a5f
--- /dev/null
@@ -0,0 +1,203 @@
+/*
+ * x86 single-step support code, common to 32-bit and 64-bit.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/ptrace.h>
+
+unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+       unsigned long addr, seg;
+
+       addr = regs->ip;
+       seg = regs->cs & 0xffff;
+       if (v8086_mode(regs)) {
+               addr = (addr & 0xffff) + (seg << 4);
+               return addr;
+       }
+
+       /*
+        * We'll assume that the code segments in the GDT
+        * are all zero-based. That is largely true: the
+        * TLS segments are used for data, and the PNPBIOS
+        * and APM bios ones we just ignore here.
+        */
+       if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+               u32 *desc;
+               unsigned long base;
+
+               seg &= ~7UL;
+
+               mutex_lock(&child->mm->context.lock);
+               if (unlikely((seg >> 3) >= child->mm->context.size))
+                       addr = -1L; /* bogus selector, access would fault */
+               else {
+                       desc = child->mm->context.ldt + seg;
+                       base = ((desc[0] >> 16) |
+                               ((desc[1] & 0xff) << 16) |
+                               (desc[1] & 0xff000000));
+
+                       /* 16-bit code segment? */
+                       if (!((desc[1] >> 22) & 1))
+                               addr &= 0xffff;
+                       addr += base;
+               }
+               mutex_unlock(&child->mm->context.lock);
+       }
+
+       return addr;
+}
+
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+       int i, copied;
+       unsigned char opcode[15];
+       unsigned long addr = convert_ip_to_linear(child, regs);
+
+       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+       for (i = 0; i < copied; i++) {
+               switch (opcode[i]) {
+               /* popf and iret */
+               case 0x9d: case 0xcf:
+                       return 1;
+
+                       /* CHECKME: 64 65 */
+
+               /* opcode and address size prefixes */
+               case 0x66: case 0x67:
+                       continue;
+               /* irrelevant prefixes (segment overrides and repeats) */
+               case 0x26: case 0x2e:
+               case 0x36: case 0x3e:
+               case 0x64: case 0x65:
+               case 0xf0: case 0xf2: case 0xf3:
+                       continue;
+
+#ifdef CONFIG_X86_64
+               case 0x40 ... 0x4f:
+                       if (regs->cs != __USER_CS)
+                               /* 32-bit mode: register increment */
+                               return 0;
+                       /* 64-bit mode: REX prefix */
+                       continue;
+#endif
+
+                       /* CHECKME: f2, f3 */
+
+               /*
+                * pushf: NOTE! We should probably not let
+                * the user see the TF bit being set. But
+                * it's more pain than it's worth to avoid
+                * it, and a debugger could emulate this
+                * all in user space if it _really_ cares.
+                */
+               case 0x9c:
+               default:
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Enable single-stepping.  Return nonzero if user mode is not using TF itself.
+ */
+static int enable_single_step(struct task_struct *child)
+{
+       struct pt_regs *regs = task_pt_regs(child);
+
+       /*
+        * Always set TIF_SINGLESTEP - this guarantees that
+        * we single-step system calls etc..  This will also
+        * cause us to set TF when returning to user mode.
+        */
+       set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       /*
+        * If TF was already set, don't do anything else
+        */
+       if (regs->flags & X86_EFLAGS_TF)
+               return 0;
+
+       /* Set TF on the kernel stack.. */
+       regs->flags |= X86_EFLAGS_TF;
+
+       /*
+        * ..but if TF is changed by the instruction we will trace,
+        * don't mark it as being "us" that set it, so that we
+        * won't clear it by hand later.
+        */
+       if (is_setting_trap_flag(child, regs))
+               return 0;
+
+       set_tsk_thread_flag(child, TIF_FORCED_TF);
+
+       return 1;
+}
+
+/*
+ * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
+ */
+static void write_debugctlmsr(struct task_struct *child, unsigned long val)
+{
+       child->thread.debugctlmsr = val;
+
+       if (child != current)
+               return;
+
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+/*
+ * Enable single or block step.
+ */
+static void enable_step(struct task_struct *child, bool block)
+{
+       /*
+        * Make sure block stepping (BTF) is not enabled unless it should be.
+        * Note that we don't try to worry about any is_setting_trap_flag()
+        * instructions after the first when using block stepping.
+        * So no one should try to use debugger block stepping in a program
+        * that uses user-mode single stepping itself.
+        */
+       if (enable_single_step(child) && block) {
+               set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+               write_debugctlmsr(child,
+                                 child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
+       } else {
+           write_debugctlmsr(child,
+                             child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+
+           if (!child->thread.debugctlmsr)
+                   clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+       }
+}
+
+void user_enable_single_step(struct task_struct *child)
+{
+       enable_step(child, 0);
+}
+
+void user_enable_block_step(struct task_struct *child)
+{
+       enable_step(child, 1);
+}
+
+void user_disable_single_step(struct task_struct *child)
+{
+       /*
+        * Make sure block stepping (BTF) is disabled.
+        */
+       write_debugctlmsr(child,
+                         child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+
+       if (!child->thread.debugctlmsr)
+               clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+       /* Always clear TIF_SINGLESTEP... */
+       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       /* But touch TF only if it was set by us.. */
+       if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
+               task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
+}
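
user_enable_single_step() above is the arch helper behind a PTRACE_SINGLESTEP request: it sets TIF_SINGLESTEP, forces X86_EFLAGS_TF in the tracee's pt_regs, and remembers via TIF_FORCED_TF that the debugger (not the program) set TF, so the flag can later be cleared cleanly. A user-space sketch that drives it through ptrace(2), assuming an x86-64 Linux host:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                /* tracee: allow tracing, stop, then do a little work */
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);
                volatile int x = 0;
                for (int i = 0; i < 5; i++)
                        x += i;
                _exit(x);
        }

        int status, i;
        struct user_regs_struct regs;

        waitpid(child, &status, 0);             /* initial SIGSTOP */
        for (i = 0; i < 5; i++) {
                /* each request ends up in user_enable_single_step() */
                ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);
                waitpid(child, &status, 0);
                if (WIFEXITED(status))
                        break;
                ptrace(PTRACE_GETREGS, child, NULL, &regs);
                printf("step %d: rip = %#llx\n", i,
                       (unsigned long long)regs.rip);
        }
        ptrace(PTRACE_CONT, child, NULL, NULL);
        waitpid(child, &status, 0);
        return 0;
}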
index 2e5efaaf8800ee2d402aca5b0b9bb1a00061458e..09199511c25623057b8f3c60c96069c646ee27b8 100644 (file)
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
+static void fix_processor_context(void);
+
 struct saved_context saved_context;
 
-void __save_processor_state(struct saved_context *ctxt)
+/**
+ *     __save_processor_state - save CPU registers before creating a
+ *             hibernation image and before restoring the memory state from it
+ *     @ctxt - structure to store the registers contents in
+ *
+ *     NOTE: If there is a CPU register the modification of which by the
+ *     boot kernel (ie. the kernel used for loading the hibernation image)
+ *     might affect the operations of the restored target kernel (ie. the one
+ *     saved in the hibernation image), then its contents must be saved by this
+ *     function.  In other words, if kernel A is hibernated and different
+ *     kernel B is used for loading the hibernation image into memory, the
+ *     kernel A's __save_processor_state() function must save all registers
+ *     needed by kernel A, so that it can operate correctly after the resume
+ *     regardless of what kernel B does in the meantime.
+ */
+static void __save_processor_state(struct saved_context *ctxt)
 {
        kernel_fpu_begin();
 
@@ -69,7 +86,12 @@ static void do_fpu_end(void)
        kernel_fpu_end();
 }
 
-void __restore_processor_state(struct saved_context *ctxt)
+/**
+ *     __restore_processor_state - restore the contents of CPU registers saved
+ *             by __save_processor_state()
+ *     @ctxt - structure to load the registers contents from
+ */
+static void __restore_processor_state(struct saved_context *ctxt)
 {
        /*
         * control registers
@@ -113,14 +135,14 @@ void restore_processor_state(void)
        __restore_processor_state(&saved_context);
 }
 
-void fix_processor_context(void)
+static void fix_processor_context(void)
 {
        int cpu = smp_processor_id();
        struct tss_struct *t = &per_cpu(init_tss, cpu);
 
        set_tss_desc(cpu,t);    /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
 
-       cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
+       get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
 
        syscall_init();                         /* This sets MSR_*STAR and related */
        load_TR_desc();                         /* This does ltr */
index 72f952103e50252f025c62232bf86b0b8b55d4a1..aeb9a4d7681e5b8c93157c9e146f3b7f950ed47e 100644 (file)
 
 ENTRY(swsusp_arch_suspend)
        movq    $saved_context, %rax
-       movq    %rsp, pt_regs_rsp(%rax)
-       movq    %rbp, pt_regs_rbp(%rax)
-       movq    %rsi, pt_regs_rsi(%rax)
-       movq    %rdi, pt_regs_rdi(%rax)
-       movq    %rbx, pt_regs_rbx(%rax)
-       movq    %rcx, pt_regs_rcx(%rax)
-       movq    %rdx, pt_regs_rdx(%rax)
+       movq    %rsp, pt_regs_sp(%rax)
+       movq    %rbp, pt_regs_bp(%rax)
+       movq    %rsi, pt_regs_si(%rax)
+       movq    %rdi, pt_regs_di(%rax)
+       movq    %rbx, pt_regs_bx(%rax)
+       movq    %rcx, pt_regs_cx(%rax)
+       movq    %rdx, pt_regs_dx(%rax)
        movq    %r8, pt_regs_r8(%rax)
        movq    %r9, pt_regs_r9(%rax)
        movq    %r10, pt_regs_r10(%rax)
@@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend)
        movq    %r14, pt_regs_r14(%rax)
        movq    %r15, pt_regs_r15(%rax)
        pushfq
-       popq    pt_regs_eflags(%rax)
+       popq    pt_regs_flags(%rax)
 
        /* save the address of restore_registers */
        movq    $restore_registers, %rax
@@ -115,13 +115,13 @@ ENTRY(restore_registers)
 
        /* We don't restore %rax, it must be 0 anyway */
        movq    $saved_context, %rax
-       movq    pt_regs_rsp(%rax), %rsp
-       movq    pt_regs_rbp(%rax), %rbp
-       movq    pt_regs_rsi(%rax), %rsi
-       movq    pt_regs_rdi(%rax), %rdi
-       movq    pt_regs_rbx(%rax), %rbx
-       movq    pt_regs_rcx(%rax), %rcx
-       movq    pt_regs_rdx(%rax), %rdx
+       movq    pt_regs_sp(%rax), %rsp
+       movq    pt_regs_bp(%rax), %rbp
+       movq    pt_regs_si(%rax), %rsi
+       movq    pt_regs_di(%rax), %rdi
+       movq    pt_regs_bx(%rax), %rbx
+       movq    pt_regs_cx(%rax), %rcx
+       movq    pt_regs_dx(%rax), %rdx
        movq    pt_regs_r8(%rax), %r8
        movq    pt_regs_r9(%rax), %r9
        movq    pt_regs_r10(%rax), %r10
@@ -130,7 +130,7 @@ ENTRY(restore_registers)
        movq    pt_regs_r13(%rax), %r13
        movq    pt_regs_r14(%rax), %r14
        movq    pt_regs_r15(%rax), %r15
-       pushq   pt_regs_eflags(%rax)
+       pushq   pt_regs_flags(%rax)
        popfq
 
        xorq    %rax, %rax
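
The assembly above spills every general-purpose register plus the flags into saved_context, and restore_registers later reloads them so execution resumes where the state was captured. A loose user-space analogue of the same save-now/resume-later idea using getcontext()/setcontext() (illustration only; the real code must also preserve control registers, descriptor tables and MSRs in its C counterpart above):

#include <stdio.h>
#include <ucontext.h>

int main(void)
{
        ucontext_t saved;
        volatile int resumed = 0;

        /* like swsusp_arch_suspend(): capture the register state here */
        getcontext(&saved);

        if (!resumed) {
                resumed = 1;
                printf("state saved, doing some work, then resuming...\n");
                /* like restore_registers: reload the saved registers and
                 * continue right after the getcontext() call above */
                setcontext(&saved);
        }

        printf("execution resumed from the saved context\n");
        return 0;
}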
index 907942ee6e7660bcf3a404ebfeedd12a46c9c334..bd802a5e1aa344680971c51d65444c019ed9fc87 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/file.h>
 #include <linux/utsname.h>
 #include <linux/personality.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/ia32.h>
@@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
                           unsigned long *end)
 {
        if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
+               unsigned long new_begin;
                /* This is usually used needed to map code in small
                   model, so it needs to be in the first 31bit. Limit
                   it to that.  This means we need to move the
@@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
                   of playground for now. -AK */ 
                *begin = 0x40000000; 
                *end = 0x80000000;              
+               if (current->flags & PF_RANDOMIZE) {
+                       new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
+                       if (new_begin)
+                               *begin = new_begin;
+               }
        } else {
                *begin = TASK_UNMAPPED_BASE;
                *end = TASK_SIZE; 
@@ -143,6 +150,97 @@ full_search:
        }
 }
 
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+                         const unsigned long len, const unsigned long pgoff,
+                         const unsigned long flags)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm = current->mm;
+       unsigned long addr = addr0;
+
+       /* requested length too big for entire address space */
+       if (len > TASK_SIZE)
+               return -ENOMEM;
+
+       if (flags & MAP_FIXED)
+               return addr;
+
+       /* for MAP_32BIT mappings we force the legacy mmap base */
+       if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+               goto bottomup;
+
+       /* requesting a specific address */
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (TASK_SIZE - len >= addr &&
+                               (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+
+       /* check if free_area_cache is useful for us */
+       if (len <= mm->cached_hole_size) {
+               mm->cached_hole_size = 0;
+               mm->free_area_cache = mm->mmap_base;
+       }
+
+       /* either no address requested or can't fit in requested address hole */
+       addr = mm->free_area_cache;
+
+       /* make sure it can fit in the remaining address space */
+       if (addr > len) {
+               vma = find_vma(mm, addr-len);
+               if (!vma || addr <= vma->vm_start)
+                       /* remember the address as a hint for next time */
+                       return (mm->free_area_cache = addr-len);
+       }
+
+       if (mm->mmap_base < len)
+               goto bottomup;
+
+       addr = mm->mmap_base-len;
+
+       do {
+               /*
+                * Lookup failure means no vma is above this address,
+                * else if new region fits below vma->vm_start,
+                * return with success:
+                */
+               vma = find_vma(mm, addr);
+               if (!vma || addr+len <= vma->vm_start)
+                       /* remember the address as a hint for next time */
+                       return (mm->free_area_cache = addr);
+
+               /* remember the largest hole we saw so far */
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
+
+               /* try just below the current vma->vm_start */
+               addr = vma->vm_start-len;
+       } while (len < vma->vm_start);
+
+bottomup:
+       /*
+        * A failed mmap() very likely causes application failure,
+        * so fall back to the bottom-up function here. This scenario
+        * can happen with large stack limits and large mmap()
+        * allocations.
+        */
+       mm->cached_hole_size = ~0UL;
+       mm->free_area_cache = TASK_UNMAPPED_BASE;
+       addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+       /*
+        * Restore the topdown base:
+        */
+       mm->free_area_cache = mm->mmap_base;
+       mm->cached_hole_size = ~0UL;
+
+       return addr;
+}
+
+
 asmlinkage long sys_uname(struct new_utsname __user * name)
 {
        int err;
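
For context on the two hunks above: find_start_end() confines MAP_32BIT mappings to the 0x40000000-0x80000000 window, now shifted by up to 0x02000000 when PF_RANDOMIZE is set, and the new arch_get_unmapped_area_topdown() searches downward from mm->mmap_base, falling back to the bottom-up allocator on failure. The sketch below is a plain user-space C program, not part of this commit (names and output format are illustrative), that simply shows where MAP_32BIT anonymous mappings land:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Request a few small anonymous mappings forced below 2 GB. */
	for (int i = 0; i < 4; i++) {
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* With the randomization added above, the first address
		 * should vary from run to run when ASLR is enabled. */
		printf("MAP_32BIT mapping %d at %p\n", i, p);
	}
	return 0;
}
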
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
new file mode 100644 (file)
index 0000000..6d7ef11
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * test_nx.c: functional test for NX functionality
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+
+extern int rodata_test_data;
+
+/*
+ * This file checks 4 things:
+ * 1) Check if the stack is not executable
+ * 2) Check if kmalloc memory is not executable
+ * 3) Check if the .rodata section is not executable
+ * 4) Check if the .data section of a module is not executable
+ *
+ * To do this, the test code tries to execute memory in stack/kmalloc/etc,
+ * and then checks if the expected trap happens.
+ *
+ * Sadly, this implies having a dynamic exception handling table entry.
+ * ... which can be done (and will make Rusty cry)... but it can only
+ * be done in a stand-alone module with only 1 entry total.
+ * (otherwise we'd have to sort and that's just too messy)
+ */
+
+
+
+/*
+ * We want to set up an exception handling point on our stack,
+ * which means a variable value. This function is rather dirty
+ * and walks the exception table of the module, looking for a magic
+ * marker and replacing it with a specific function.
+ */
+static void fudze_exception_table(void *marker, void *new)
+{
+       struct module *mod = THIS_MODULE;
+       struct exception_table_entry *extable;
+
+       /*
+        * Note: This module has only 1 exception table entry,
+        * so searching and sorting is not needed. If that changes,
+        * this would be the place to search and re-sort the exception
+        * table.
+        */
+       if (mod->num_exentries > 1) {
+               printk(KERN_ERR "test_nx: too many exception table entries!\n");
+               printk(KERN_ERR "test_nx: test results are not reliable.\n");
+               return;
+       }
+       extable = (struct exception_table_entry *)mod->extable;
+       extable[0].insn = (unsigned long)new;
+}
+
+
+/*
+ * exception tables get their symbols translated so we need
+ * to use a fake function to put in there, which we can then
+ * replace at runtime.
+ */
+void foo_label(void);
+
+/*
+ * returns 0 for not-executable, negative for executable
+ *
+ * Note: we cannot allow this function to be inlined, because
+ * that would give us more than 1 exception table entry.
+ * This in turn would break the assumptions above.
+ */
+static noinline int test_address(void *address)
+{
+       unsigned long result;
+
+       /* Set up an exception table entry for our address */
+       fudze_exception_table(&foo_label, address);
+       result = 1;
+       asm volatile(
+               "foo_label:\n"
+               "0:     call *%[fake_code]\n"
+               "1:\n"
+               ".section .fixup,\"ax\"\n"
+               "2:     mov %[zero], %[rslt]\n"
+               "       ret\n"
+               ".previous\n"
+               ".section __ex_table,\"a\"\n"
+               "       .align 8\n"
+               "       .quad 0b\n"
+               "       .quad 2b\n"
+               ".previous\n"
+               : [rslt] "=r" (result)
+               : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
+       );
+       /* change the exception table back for the next round */
+       fudze_exception_table(address, &foo_label);
+
+       if (result)
+               return -ENODEV;
+       return 0;
+}
+
+static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
+
+static int test_NX(void)
+{
+       int ret = 0;
+       /* 0xC3 is the opcode for "ret" */
+       char stackcode[] = {0xC3, 0x90, 0 };
+       char *heap;
+
+       test_data = 0xC3;
+
+       printk(KERN_INFO "Testing NX protection\n");
+
+       /* Test 1: check if the stack is not executable */
+       if (test_address(&stackcode)) {
+               printk(KERN_ERR "test_nx: stack was executable\n");
+               ret = -ENODEV;
+       }
+
+
+       /* Test 2: Check if the heap is executable */
+       heap = kmalloc(64, GFP_KERNEL);
+       if (!heap)
+               return -ENOMEM;
+       heap[0] = 0xC3; /* opcode for "ret" */
+
+       if (test_address(heap)) {
+               printk(KERN_ERR "test_nx: heap was executable\n");
+               ret = -ENODEV;
+       }
+       kfree(heap);
+
+       /*
+        * The following 2 tests currently fail; this needs to get fixed.
+        * Until then, don't run them to avoid too many people getting scared
+        * by the error message
+        */
+#if 0
+
+#ifdef CONFIG_DEBUG_RODATA
+       /* Test 3: Check if the .rodata section is executable */
+       if (rodata_test_data != 0xC3) {
+               printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
+               ret = -ENODEV;
+       } else if (test_address(&rodata_test_data)) {
+               printk(KERN_ERR "test_nx: .rodata section is executable\n");
+               ret = -ENODEV;
+       }
+#endif
+
+       /* Test 4: Check if the .data section of a module is executable */
+       if (test_address(&test_data)) {
+               printk(KERN_ERR "test_nx: .data section is executable\n");
+               ret = -ENODEV;
+       }
+
+#endif
+       return 0;
+}
+
+static void test_exit(void)
+{
+}
+
+module_init(test_NX);
+module_exit(test_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Testcase for the NX infrastructure");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
new file mode 100644 (file)
index 0000000..4c16377
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * test_rodata.c: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/module.h>
+#include <asm/sections.h>
+extern int rodata_test_data;
+
+int rodata_test(void)
+{
+       unsigned long result;
+       unsigned long start, end;
+
+       /* test 1: read the value */
+       /* If this test fails, some previous testrun has clobbered the state */
+       if (!rodata_test_data) {
+               printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
+               return -ENODEV;
+       }
+
+       /* test 2: write to the variable; this should fault */
+       /*
+        * If this test fails, we managed to overwrite the data
+        *
+        * This is written in assembly to be able to catch the
+        * exception that is supposed to happen in the correct
+        * case
+        */
+
+       result = 1;
+       asm volatile(
+               "0:     mov %[zero],(%[rodata_test])\n"
+               "       mov %[zero], %[rslt]\n"
+               "1:\n"
+               ".section .fixup,\"ax\"\n"
+               "2:     jmp 1b\n"
+               ".previous\n"
+               ".section __ex_table,\"a\"\n"
+               "       .align 16\n"
+#ifdef CONFIG_X86_32
+               "       .long 0b,2b\n"
+#else
+               "       .quad 0b,2b\n"
+#endif
+               ".previous"
+               : [rslt] "=r" (result)
+               : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
+       );
+
+
+       if (!result) {
+               printk(KERN_ERR "rodata_test: test data was not read only\n");
+               return -ENODEV;
+       }
+
+       /* test 3: check the value hasn't changed */
+       /* If this test fails, we managed to overwrite the data */
+       if (!rodata_test_data) {
+               printk(KERN_ERR "rodata_test: test 3 fails (end data)\n");
+               return -ENODEV;
+       }
+       /* test 4: check if the rodata section is 4Kb aligned */
+       start = (unsigned long)__start_rodata;
+       end = (unsigned long)__end_rodata;
+       if (start & (PAGE_SIZE - 1)) {
+               printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
+               return -ENODEV;
+       }
+       if (end & (PAGE_SIZE - 1)) {
+               printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
index 8a322c96bc23fdb9465e3b59e7d035b0094e0132..1a89e93f3f1ccfad3696921f25492e60f2bfdfbc 100644 (file)
  *     serialize accesses to xtime/lost_ticks).
  */
 
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
+#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/bcd.h>
-#include <linux/efi.h>
 #include <linux/mca.h>
 
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/msr.h>
-#include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/time.h>
-
-#include "mach_time.h"
-
-#include <linux/timex.h>
-
-#include <asm/hpet.h>
-
 #include <asm/arch_hooks.h>
-
-#include "io_ports.h"
-
-#include <asm/i8259.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
 
 #include "do_timer.h"
 
 unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);
 
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
-/*
- * This is a special lock that is owned by the CPU and holds the index
- * register we are working with.  It is required for NMI access to the
- * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
- */
-volatile unsigned long cmos_lock = 0;
-EXPORT_SYMBOL(cmos_lock);
-
-/* Routines for accessing the CMOS RAM/RTC. */
-unsigned char rtc_cmos_read(unsigned char addr)
-{
-       unsigned char val;
-       lock_cmos_prefix(addr);
-       outb_p(addr, RTC_PORT(0));
-       val = inb_p(RTC_PORT(1));
-       lock_cmos_suffix(addr);
-       return val;
-}
-EXPORT_SYMBOL(rtc_cmos_read);
-
-void rtc_cmos_write(unsigned char val, unsigned char addr)
-{
-       lock_cmos_prefix(addr);
-       outb_p(addr, RTC_PORT(0));
-       outb_p(val, RTC_PORT(1));
-       lock_cmos_suffix(addr);
-}
-EXPORT_SYMBOL(rtc_cmos_write);
-
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       int retval;
-       unsigned long flags;
-
-       /* gets recalled with irq locally disabled */
-       /* XXX - does irqsave resolve this? -johnstul */
-       spin_lock_irqsave(&rtc_lock, flags);
-       retval = set_wallclock(nowtime);
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return retval;
-}
-
-
 int timer_ack;
 
 unsigned long profile_pc(struct pt_regs *regs)
@@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs)
        unsigned long pc = instruction_pointer(regs);
 
 #ifdef CONFIG_SMP
-       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+       if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
            in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
-               return *(unsigned long *)(regs->ebp + 4);
+               return *(unsigned long *)(regs->bp + 4);
 #else
-               unsigned long *sp = (unsigned long *)&regs->esp;
+               unsigned long *sp = (unsigned long *)&regs->sp;
 
                /* Return address is either directly at stack pointer
-                  or above a saved eflags. Eflags has bits 22-31 zero,
+                  or above saved flags. Eflags has bits 22-31 zero,
                   kernel addresses don't. */
-               if (sp[0] >> 22)
+               if (sp[0] >> 22)
                        return sp[0];
                if (sp[1] >> 22)
                        return sp[1];
@@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-/* not static: needed by APM */
-unsigned long read_persistent_clock(void)
-{
-       unsigned long retval;
-       unsigned long flags;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-
-       retval = get_wallclock();
-
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
-
 extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 void __init hpet_time_init(void)
index 368b1942b39aba509cfafe7fae629ef400a7a243..0380795121a601e620d41e4b83eb5838c7d6ad8b 100644 (file)
  *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
  */
 
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
+#include <linux/clockchips.h>
 #include <linux/init.h>
-#include <linux/mc146818rtc.h>
-#include <linux/time.h>
-#include <linux/ioport.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
-#include <linux/device.h>
-#include <linux/sysdev.h>
-#include <linux/bcd.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/kallsyms.h>
-#include <linux/acpi.h>
-#include <linux/clockchips.h>
+#include <linux/time.h>
 
-#ifdef CONFIG_ACPI
-#include <acpi/achware.h>      /* for PM timer frequency */
-#include <acpi/acpi_bus.h>
-#endif
 #include <asm/i8253.h>
-#include <asm/pgtable.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/proto.h>
-#include <asm/hpet.h>
-#include <asm/sections.h>
-#include <linux/hpet.h>
-#include <asm/apic.h>
 #include <asm/hpet.h>
-#include <asm/mpspec.h>
 #include <asm/nmi.h>
 #include <asm/vgtod.h>
-
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
+#include <asm/time.h>
+#include <asm/timer.h>
 
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
 
@@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs)
        unsigned long pc = instruction_pointer(regs);
 
        /* Assume the lock function has either no stack frame or a copy
-          of eflags from PUSHF
+          of flags from PUSHF
           Eflags always has bits 22 and up cleared unlike kernel addresses. */
        if (!user_mode(regs) && in_lock_functions(pc)) {
-               unsigned long *sp = (unsigned long *)regs->rsp;
+               unsigned long *sp = (unsigned long *)regs->sp;
                if (sp[0] >> 22)
                        return sp[0];
                if (sp[1] >> 22)
@@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs)
 }
 EXPORT_SYMBOL(profile_pc);
 
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
- * ms after the second nowtime has started, because when nowtime is written
- * into the registers of the CMOS clock, it will jump to the next second
- * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
- * sheet for details.
- */
-
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       int retval = 0;
-       int real_seconds, real_minutes, cmos_minutes;
-       unsigned char control, freq_select;
-       unsigned long flags;
-
-/*
- * set_rtc_mmss is called when irqs are enabled, so disable irqs here
- */
-       spin_lock_irqsave(&rtc_lock, flags);
-/*
- * Tell the clock it's being set and stop it.
- */
-       control = CMOS_READ(RTC_CONTROL);
-       CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
-
-       freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
-
-       cmos_minutes = CMOS_READ(RTC_MINUTES);
-               BCD_TO_BIN(cmos_minutes);
-
-/*
- * since we're only adjusting minutes and seconds, don't interfere with hour
- * overflow. This avoids messing with unknown time zones but requires your RTC
- * not to be off by more than 15 minutes. Since we're calling it only when
- * our clock is externally synchronized using NTP, this shouldn't be a problem.
- */
-
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-       if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
-               real_minutes += 30;             /* correct for half hour time zone */
-       real_minutes %= 60;
-
-       if (abs(real_minutes - cmos_minutes) >= 30) {
-               printk(KERN_WARNING "time.c: can't update CMOS clock "
-                      "from %d to %d\n", cmos_minutes, real_minutes);
-               retval = -1;
-       } else {
-               BIN_TO_BCD(real_seconds);
-               BIN_TO_BCD(real_minutes);
-               CMOS_WRITE(real_seconds, RTC_SECONDS);
-               CMOS_WRITE(real_minutes, RTC_MINUTES);
-       }
-
-/*
- * The following flags have to be released exactly in this order, otherwise the
- * DS12887 (popular MC146818A clone with integrated battery and quartz) will
- * not reset the oscillator and will not update precisely 500 ms later. You
- * won't find this mentioned in the Dallas Semiconductor data sheets, but who
- * believes data sheets anyway ... -- Markus Kuhn
- */
-
-       CMOS_WRITE(control, RTC_CONTROL);
-       CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
-
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
-
 static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
 {
        add_pda(irq0_irqs, 1);
@@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-unsigned long read_persistent_clock(void)
-{
-       unsigned int year, mon, day, hour, min, sec;
-       unsigned long flags;
-       unsigned century = 0;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-       /*
-        * if UIP is clear, then we have >= 244 microseconds before RTC
-        * registers will be updated.  Spec sheet says that this is the
-        * reliable way to read RTC - registers invalid (off bus) during update
-        */
-       while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
-               cpu_relax();
-
-
-       /* now read all RTC registers while stable with interrupts disabled */
-       sec = CMOS_READ(RTC_SECONDS);
-       min = CMOS_READ(RTC_MINUTES);
-       hour = CMOS_READ(RTC_HOURS);
-       day = CMOS_READ(RTC_DAY_OF_MONTH);
-       mon = CMOS_READ(RTC_MONTH);
-       year = CMOS_READ(RTC_YEAR);
-#ifdef CONFIG_ACPI
-       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-                               acpi_gbl_FADT.century)
-               century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       /*
-        * We know that x86-64 always uses BCD format, no need to check the
-        * config register.
-        */
-
-       BCD_TO_BIN(sec);
-       BCD_TO_BIN(min);
-       BCD_TO_BIN(hour);
-       BCD_TO_BIN(day);
-       BCD_TO_BIN(mon);
-       BCD_TO_BIN(year);
-
-       if (century) {
-               BCD_TO_BIN(century);
-               year += century * 100;
-               printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
-       } else {
-               /*
-                * x86-64 systems only exists since 2002.
-                * This will work up to Dec 31, 2100
-                */
-               year += 2000;
-       }
-
-       return mktime(year, mon, day, hour, min, sec);
-}
-
 /* calibrate_cpu is used on systems with fixed rate TSCs to determine
  * processor frequency */
 #define TICK_COUNT 100000000
-static unsigned int __init tsc_calibrate_cpu_khz(void)
+unsigned long __init native_calculate_cpu_khz(void)
 {
        int tsc_start, tsc_now;
        int i, no_ctr_free;
@@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void)
        rdtscl(tsc_start);
        do {
                rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-               tsc_now = get_cycles_sync();
+               tsc_now = get_cycles();
        } while ((tsc_now - tsc_start) < TICK_COUNT);
 
        local_irq_restore(flags);
@@ -264,20 +106,22 @@ static struct irqaction irq0 = {
        .name           = "timer"
 };
 
-void __init time_init(void)
+void __init hpet_time_init(void)
 {
        if (!hpet_enable())
                setup_pit_timer();
 
        setup_irq(0, &irq0);
+}
 
+void __init time_init(void)
+{
        tsc_calibrate();
 
        cpu_khz = tsc_khz;
        if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-               boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-               boot_cpu_data.x86 == 16)
-               cpu_khz = tsc_calibrate_cpu_khz();
+               (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
+               cpu_khz = calculate_cpu_khz();
 
        if (unsynchronized_tsc())
                mark_tsc_unstable("TSCs unsynchronized");
@@ -290,4 +134,5 @@ void __init time_init(void)
        printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
                cpu_khz / 1000, cpu_khz % 1000);
        init_tsc_clocksource();
+       late_time_init = choose_time_init();
 }
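
native_calculate_cpu_khz() above counts both a fixed-rate performance counter and the TSC over TICK_COUNT events to derive the CPU frequency. A crude user-space sketch of the same idea (illustrative only; it assumes a constant-rate TSC and an x86 compiler that provides __rdtsc) counts TSC cycles across a measured clock_gettime() interval:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <x86intrin.h>

int main(void)
{
	struct timespec t0, t1;
	uint64_t c0, c1;
	double ns;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	c0 = __rdtsc();

	/* busy-wait roughly 100 ms of wall-clock time */
	do {
		clock_gettime(CLOCK_MONOTONIC, &t1);
		ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	} while (ns < 1e8);

	c1 = __rdtsc();
	/* cycles per nanosecond * 1e6 gives kHz */
	printf("approx. TSC rate: %.0f kHz\n", (c1 - c0) / ns * 1e6);
	return 0;
}
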
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644 (file)
index 0000000..6dfd4e7
--- /dev/null
@@ -0,0 +1,213 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/user.h>
+#include <linux/regset.h>
+
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+
+#include "tls.h"
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+       struct thread_struct *t = &current->thread;
+       int idx;
+
+       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+               if (desc_empty(&t->tls_array[idx]))
+                       return idx + GDT_ENTRY_TLS_MIN;
+       return -ESRCH;
+}
+
+static void set_tls_desc(struct task_struct *p, int idx,
+                        const struct user_desc *info, int n)
+{
+       struct thread_struct *t = &p->thread;
+       struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
+       int cpu;
+
+       /*
+        * We must not get preempted while modifying the TLS.
+        */
+       cpu = get_cpu();
+
+       while (n-- > 0) {
+               if (LDT_empty(info))
+                       desc->a = desc->b = 0;
+               else
+                       fill_ldt(desc, info);
+               ++info;
+               ++desc;
+       }
+
+       if (t == &current->thread)
+               load_TLS(t, cpu);
+
+       put_cpu();
+}
+
+/*
+ * Set a given TLS descriptor:
+ */
+int do_set_thread_area(struct task_struct *p, int idx,
+                      struct user_desc __user *u_info,
+                      int can_allocate)
+{
+       struct user_desc info;
+
+       if (copy_from_user(&info, u_info, sizeof(info)))
+               return -EFAULT;
+
+       if (idx == -1)
+               idx = info.entry_number;
+
+       /*
+        * index -1 means the kernel should try to find and
+        * allocate an empty descriptor:
+        */
+       if (idx == -1 && can_allocate) {
+               idx = get_free_idx();
+               if (idx < 0)
+                       return idx;
+               if (put_user(idx, &u_info->entry_number))
+                       return -EFAULT;
+       }
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       set_tls_desc(p, idx, &info, 1);
+
+       return 0;
+}
+
+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+{
+       return do_set_thread_area(current, -1, u_info, 1);
+}
+
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+static void fill_user_desc(struct user_desc *info, int idx,
+                          const struct desc_struct *desc)
+
+{
+       memset(info, 0, sizeof(*info));
+       info->entry_number = idx;
+       info->base_addr = get_desc_base(desc);
+       info->limit = get_desc_limit(desc);
+       info->seg_32bit = desc->d;
+       info->contents = desc->type >> 2;
+       info->read_exec_only = !(desc->type & 2);
+       info->limit_in_pages = desc->g;
+       info->seg_not_present = !desc->p;
+       info->useable = desc->avl;
+#ifdef CONFIG_X86_64
+       info->lm = desc->l;
+#endif
+}
+
+int do_get_thread_area(struct task_struct *p, int idx,
+                      struct user_desc __user *u_info)
+{
+       struct user_desc info;
+
+       if (idx == -1 && get_user(idx, &u_info->entry_number))
+               return -EFAULT;
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       fill_user_desc(&info, idx,
+                      &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
+
+       if (copy_to_user(u_info, &info, sizeof(info)))
+               return -EFAULT;
+       return 0;
+}
+
+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+{
+       return do_get_thread_area(current, -1, u_info);
+}
+
+int regset_tls_active(struct task_struct *target,
+                     const struct user_regset *regset)
+{
+       struct thread_struct *t = &target->thread;
+       int n = GDT_ENTRY_TLS_ENTRIES;
+       while (n > 0 && desc_empty(&t->tls_array[n - 1]))
+               --n;
+       return n;
+}
+
+int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
+                  unsigned int pos, unsigned int count,
+                  void *kbuf, void __user *ubuf)
+{
+       const struct desc_struct *tls;
+
+       if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+           (pos % sizeof(struct user_desc)) != 0 ||
+           (count % sizeof(struct user_desc)) != 0)
+               return -EINVAL;
+
+       pos /= sizeof(struct user_desc);
+       count /= sizeof(struct user_desc);
+
+       tls = &target->thread.tls_array[pos];
+
+       if (kbuf) {
+               struct user_desc *info = kbuf;
+               while (count-- > 0)
+                       fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
+                                      tls++);
+       } else {
+               struct user_desc __user *u_info = ubuf;
+               while (count-- > 0) {
+                       struct user_desc info;
+                       fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
+                       if (__copy_to_user(u_info++, &info, sizeof(info)))
+                               return -EFAULT;
+               }
+       }
+
+       return 0;
+}
+
+int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
+                  unsigned int pos, unsigned int count,
+                  const void *kbuf, const void __user *ubuf)
+{
+       struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
+       const struct user_desc *info;
+
+       if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+           (pos % sizeof(struct user_desc)) != 0 ||
+           (count % sizeof(struct user_desc)) != 0)
+               return -EINVAL;
+
+       if (kbuf)
+               info = kbuf;
+       else if (__copy_from_user(infobuf, ubuf, count))
+               return -EFAULT;
+       else
+               info = infobuf;
+
+       set_tls_desc(target,
+                    GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
+                    info, count / sizeof(struct user_desc));
+
+       return 0;
+}
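
do_set_thread_area()/do_get_thread_area() above manage the per-thread GDT descriptors that user-space TLS is built on. A small illustration, not part of this commit, of the end result as seen from C: each thread gets its own copy of a __thread variable, addressed through the segment the C library typically sets up with set_thread_area() on 32-bit x86 (or arch_prctl() on x86-64). Build with -pthread:

#include <pthread.h>
#include <stdio.h>

static __thread int counter;		/* one instance per thread */

static void *worker(void *arg)
{
	counter = (int)(long)arg;	/* writes only this thread's copy */
	printf("thread %ld sees counter = %d\n", (long)arg, counter);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, (void *)1L);
	pthread_create(&b, NULL, worker, (void *)2L);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("main thread still sees counter = %d\n", counter);
	return 0;
}
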
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644 (file)
index 0000000..2f083a2
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Internal declarations for x86 TLS implementation functions.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ */
+
+#ifndef _ARCH_X86_KERNEL_TLS_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_tls_active;
+extern user_regset_get_fn regset_tls_get;
+extern user_regset_set_fn regset_tls_set;
+
+#endif /* _ARCH_X86_KERNEL_TLS_H */
index 7e16d675eb850530a72796adba4aca48806d3001..78cbb655aa79d1201954a18c542ec9ce85596c5d 100644 (file)
 #include <linux/mmzone.h>
 #include <asm/cpu.h>
 
-static struct i386_cpu cpu_devices[NR_CPUS];
+static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
 
-int __cpuinit arch_register_cpu(int num)
+#ifdef CONFIG_HOTPLUG_CPU
+int arch_register_cpu(int num)
 {
        /*
         * CPU0 cannot be offlined due to several
@@ -44,21 +45,23 @@ int __cpuinit arch_register_cpu(int num)
         * Also, certain PCI quirks require not enabling hotplug control
         * for all CPUs.
         */
-#ifdef CONFIG_HOTPLUG_CPU
        if (num)
-               cpu_devices[num].cpu.hotpluggable = 1;
-#endif
-
-       return register_cpu(&cpu_devices[num].cpu, num);
+               per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+       return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
+EXPORT_SYMBOL(arch_register_cpu);
 
-#ifdef CONFIG_HOTPLUG_CPU
 void arch_unregister_cpu(int num)
 {
-       return unregister_cpu(&cpu_devices[num].cpu);
+       return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
 }
-EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
+#else
+int arch_register_cpu(int num)
+{
+       return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+}
+EXPORT_SYMBOL(arch_register_cpu);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 static int __init topology_init(void)
index 02d1e1e58e819211fed1813113d945b0baeb0e9d..3cf72977d01292bbea4492568d85277a8ba6680f 100644 (file)
@@ -76,7 +76,8 @@ char ignore_fpu_irq = 0;
  * F0 0F bug workaround. We have a special link segment
  * for this.
  */
-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+gate_desc idt_table[256]
+       __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 
 asmlinkage void divide_error(void);
 asmlinkage void debug(void);
@@ -101,6 +102,34 @@ asmlinkage void machine_check(void);
 int kstack_depth_to_print = 24;
 static unsigned int code_bytes = 64;
 
+void printk_address(unsigned long address, int reliable)
+{
+#ifdef CONFIG_KALLSYMS
+       unsigned long offset = 0, symsize;
+       const char *symname;
+       char *modname;
+       char *delim = ":";
+       char namebuf[128];
+       char reliab[4] = "";
+
+       symname = kallsyms_lookup(address, &symsize, &offset,
+                                       &modname, namebuf);
+       if (!symname) {
+               printk(" [<%08lx>]\n", address);
+               return;
+       }
+       if (!reliable)
+               strcpy(reliab, "? ");
+
+       if (!modname)
+               modname = delim = "";
+       printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+               address, reliab, delim, modname, delim, symname, offset, symsize);
+#else
+       printk(" [<%08lx>]\n", address);
+#endif
+}
+
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
 {
        return  p > (void *)tinfo &&
@@ -114,48 +143,35 @@ struct stack_frame {
 };
 
 static inline unsigned long print_context_stack(struct thread_info *tinfo,
-                               unsigned long *stack, unsigned long ebp,
+                               unsigned long *stack, unsigned long bp,
                                const struct stacktrace_ops *ops, void *data)
 {
-#ifdef CONFIG_FRAME_POINTER
-       struct stack_frame *frame = (struct stack_frame *)ebp;
-       while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
-               struct stack_frame *next;
-               unsigned long addr;
+       struct stack_frame *frame = (struct stack_frame *)bp;
 
-               addr = frame->return_address;
-               ops->address(data, addr);
-               /*
-                * break out of recursive entries (such as
-                * end_of_stack_stop_unwind_function). Also,
-                * we can never allow a frame pointer to
-                * move downwards!
-                */
-               next = frame->next_frame;
-               if (next <= frame)
-                       break;
-               frame = next;
-       }
-#else
        while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
                unsigned long addr;
 
-               addr = *stack++;
-               if (__kernel_text_address(addr))
-                       ops->address(data, addr);
+               addr = *stack;
+               if (__kernel_text_address(addr)) {
+                       if ((unsigned long) stack == bp + 4) {
+                               ops->address(data, addr, 1);
+                               frame = frame->next_frame;
+                               bp = (unsigned long) frame;
+                       } else {
+                               ops->address(data, addr, bp == 0);
+                       }
+               }
+               stack++;
        }
-#endif
-       return ebp;
+       return bp;
 }
 
 #define MSG(msg) ops->warning(data, msg)
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack,
+               unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
 {
-       unsigned long ebp = 0;
-
        if (!task)
                task = current;
 
@@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                unsigned long dummy;
                stack = &dummy;
                if (task != current)
-                       stack = (unsigned long *)task->thread.esp;
+                       stack = (unsigned long *)task->thread.sp;
        }
 
 #ifdef CONFIG_FRAME_POINTER
-       if (!ebp) {
+       if (!bp) {
                if (task == current) {
-                       /* Grab ebp right from our regs */
-                       asm ("movl %%ebp, %0" : "=r" (ebp) : );
+                       /* Grab bp right from our regs */
+                       asm ("movl %%ebp, %0" : "=r" (bp) : );
                } else {
-                       /* ebp is the last reg pushed by switch_to */
-                       ebp = *(unsigned long *) task->thread.esp;
+                       /* bp is the last reg pushed by switch_to */
+                       bp = *(unsigned long *) task->thread.sp;
                }
        }
 #endif
@@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                struct thread_info *context;
                context = (struct thread_info *)
                        ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-               ebp = print_context_stack(context, stack, ebp, ops, data);
+               bp = print_context_stack(context, stack, bp, ops, data);
                /* Should be after the line below, but somewhere
                   in early boot context comes out corrupted and we
                   can't reference it -AK */
@@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name)
 /*
  * Print one address/symbol entries per line.
  */
-static void print_trace_address(void *data, unsigned long addr)
+static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
        printk("%s [<%08lx>] ", (char *)data, addr);
+       if (!reliable)
+               printk("? ");
        print_symbol("%s\n", addr);
        touch_nmi_watchdog();
 }
@@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = {
 
 static void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long * stack, char *log_lvl)
+               unsigned long *stack, unsigned long bp, char *log_lvl)
 {
-       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
        printk("%s =======================\n", log_lvl);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long * stack)
+               unsigned long *stack, unsigned long bp)
 {
-       show_trace_log_lvl(task, regs, stack, "");
+       show_trace_log_lvl(task, regs, stack, bp, "");
 }
 
 static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                              unsigned long *esp, char *log_lvl)
+                      unsigned long *sp, unsigned long bp, char *log_lvl)
 {
        unsigned long *stack;
        int i;
 
-       if (esp == NULL) {
+       if (sp == NULL) {
                if (task)
-                       esp = (unsigned long*)task->thread.esp;
+                       sp = (unsigned long*)task->thread.sp;
                else
-                       esp = (unsigned long *)&esp;
+                       sp = (unsigned long *)&sp;
        }
 
-       stack = esp;
+       stack = sp;
        for(i = 0; i < kstack_depth_to_print; i++) {
                if (kstack_end(stack))
                        break;
@@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                printk("%08lx ", *stack++);
        }
        printk("\n%sCall Trace:\n", log_lvl);
-       show_trace_log_lvl(task, regs, esp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *esp)
+void show_stack(struct task_struct *task, unsigned long *sp)
 {
        printk("       ");
-       show_stack_log_lvl(task, NULL, esp, "");
+       show_stack_log_lvl(task, NULL, sp, 0, "");
 }
 
 /*
@@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp)
 void dump_stack(void)
 {
        unsigned long stack;
+       unsigned long bp = 0;
+
+#ifdef CONFIG_FRAME_POINTER
+       if (!bp)
+               asm("movl %%ebp, %0" : "=r" (bp):);
+#endif
 
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
-       show_trace(current, NULL, &stack);
+       show_trace(current, NULL, &stack, bp);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs)
         * time of the fault..
         */
        if (!user_mode_vm(regs)) {
-               u8 *eip;
+               u8 *ip;
                unsigned int code_prologue = code_bytes * 43 / 64;
                unsigned int code_len = code_bytes;
                unsigned char c;
 
                printk("\n" KERN_EMERG "Stack: ");
-               show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
+               show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
 
                printk(KERN_EMERG "Code: ");
 
-               eip = (u8 *)regs->eip - code_prologue;
-               if (eip < (u8 *)PAGE_OFFSET ||
-                       probe_kernel_address(eip, c)) {
+               ip = (u8 *)regs->ip - code_prologue;
+               if (ip < (u8 *)PAGE_OFFSET ||
+                       probe_kernel_address(ip, c)) {
                        /* try starting at EIP */
-                       eip = (u8 *)regs->eip;
+                       ip = (u8 *)regs->ip;
                        code_len = code_len - code_prologue + 1;
                }
-               for (i = 0; i < code_len; i++, eip++) {
-                       if (eip < (u8 *)PAGE_OFFSET ||
-                               probe_kernel_address(eip, c)) {
+               for (i = 0; i < code_len; i++, ip++) {
+                       if (ip < (u8 *)PAGE_OFFSET ||
+                               probe_kernel_address(ip, c)) {
                                printk(" Bad EIP value.");
                                break;
                        }
-                       if (eip == (u8 *)regs->eip)
+                       if (ip == (u8 *)regs->ip)
                                printk("<%02x> ", c);
                        else
                                printk("%02x ", c);
@@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs)
        printk("\n");
 }      
 
-int is_valid_bugaddr(unsigned long eip)
+int is_valid_bugaddr(unsigned long ip)
 {
        unsigned short ud2;
 
-       if (eip < PAGE_OFFSET)
+       if (ip < PAGE_OFFSET)
                return 0;
-       if (probe_kernel_address((unsigned short *)eip, ud2))
+       if (probe_kernel_address((unsigned short *)ip, ud2))
                return 0;
 
        return ud2 == 0x0b0f;
 }
 
+static int die_counter;
+
+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+{
+       unsigned long sp;
+       unsigned short ss;
+
+       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+       printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+       printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk("DEBUG_PAGEALLOC");
+#endif
+       printk("\n");
+
+       if (notify_die(DIE_OOPS, str, regs, err,
+                               current->thread.trap_no, SIGSEGV) !=
+                       NOTIFY_STOP) {
+               show_registers(regs);
+               /* Executive summary in case the oops scrolled away */
+               sp = (unsigned long) (&regs->sp);
+               savesegment(ss, ss);
+               if (user_mode(regs)) {
+                       sp = regs->sp;
+                       ss = regs->ss & 0xffff;
+               }
+               printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+               print_symbol("%s", regs->ip);
+               printk(" SS:ESP %04x:%08lx\n", ss, sp);
+               return 0;
+       } else {
+               return 1;
+       }
+}
+
 /*
  * This is gone through when something in the kernel has done something bad and
  * is about to be terminated.
@@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err)
                .lock_owner =           -1,
                .lock_owner_depth =     0
        };
-       static int die_counter;
        unsigned long flags;
 
        oops_enter();
@@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err)
                raw_local_irq_save(flags);
 
        if (++die.lock_owner_depth < 3) {
-               unsigned long esp;
-               unsigned short ss;
+               report_bug(regs->ip, regs);
 
-               report_bug(regs->eip, regs);
-
-               printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
-                      ++die_counter);
-#ifdef CONFIG_PREEMPT
-               printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-               printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-               printk("DEBUG_PAGEALLOC");
-#endif
-               printk("\n");
-
-               if (notify_die(DIE_OOPS, str, regs, err,
-                                       current->thread.trap_no, SIGSEGV) !=
-                               NOTIFY_STOP) {
-                       show_registers(regs);
-                       /* Executive summary in case the oops scrolled away */
-                       esp = (unsigned long) (&regs->esp);
-                       savesegment(ss, ss);
-                       if (user_mode(regs)) {
-                               esp = regs->esp;
-                               ss = regs->xss & 0xffff;
-                       }
-                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
-                       print_symbol("%s", regs->eip);
-                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
-               }
-               else
+               if (__die(str, regs, err))
                        regs = NULL;
-       } else
+       } else {
                printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+       }
 
        bust_spinlocks(0);
        die.lock_owner = -1;
@@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
 {
        struct task_struct *tsk = current;
 
-       if (regs->eflags & VM_MASK) {
+       if (regs->flags & VM_MASK) {
                if (vm86)
                        goto vm86_trap;
                goto trap_signal;
@@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
 }
 
 #define DO_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
                                                == NOTIFY_STOP) \
@@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
 }
 
 #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
        siginfo_t info; \
        if (irq) \
@@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
 }
 
 #define DO_VM86_ERROR(trapnr, signr, str, name) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
                                                == NOTIFY_STOP) \
@@ -534,7 +566,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
 }
 
 #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-fastcall void do_##name(struct pt_regs * regs, long error_code) \
+void do_##name(struct pt_regs * regs, long error_code) \
 { \
        siginfo_t info; \
        info.si_signo = signr; \
@@ -548,13 +580,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
        do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
 }
 
-DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
+DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 #ifndef CONFIG_KPROBES
 DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
 #endif
 DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
@@ -562,7 +594,7 @@ DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
 
-fastcall void __kprobes do_general_protection(struct pt_regs * regs,
+void __kprobes do_general_protection(struct pt_regs * regs,
                                              long error_code)
 {
        int cpu = get_cpu();
@@ -596,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
        }
        put_cpu();
 
-       if (regs->eflags & VM_MASK)
+       if (regs->flags & VM_MASK)
                goto gp_in_vm86;
 
        if (!user_mode(regs))
@@ -605,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
        current->thread.error_code = error_code;
        current->thread.trap_no = 13;
        if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
-           printk_ratelimit())
+           printk_ratelimit()) {
                printk(KERN_INFO
-                   "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
+                   "%s[%d] general protection ip:%lx sp:%lx error:%lx",
                    current->comm, task_pid_nr(current),
-                   regs->eip, regs->esp, error_code);
+                   regs->ip, regs->sp, error_code);
+               print_vma_addr(" in ", regs->ip);
+               printk("\n");
+       }
 
        force_sig(SIGSEGV, current);
        return;
@@ -705,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
        */
        bust_spinlocks(1);
        printk(KERN_EMERG "%s", msg);
-       printk(" on CPU%d, eip %08lx, registers:\n",
-               smp_processor_id(), regs->eip);
+       printk(" on CPU%d, ip %08lx, registers:\n",
+               smp_processor_id(), regs->ip);
        show_registers(regs);
        console_silent();
        spin_unlock(&nmi_print_lock);
@@ -763,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
 
 static int ignore_nmis;
 
-fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+__kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
        int cpu;
 
@@ -792,7 +827,7 @@ void restart_nmi(void)
 }
 
 #ifdef CONFIG_KPROBES
-fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
+void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
        trace_hardirqs_fixup();
 
@@ -828,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
  * find every occurrence of the TF bit that could be saved away even
  * by user code)
  */
-fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
+void __kprobes do_debug(struct pt_regs * regs, long error_code)
 {
        unsigned int condition;
        struct task_struct *tsk = current;
@@ -837,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
 
        get_debugreg(condition, 6);
 
+       /*
+        * The processor cleared BTF, so don't mark that we need it set.
+        */
+       clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+       tsk->thread.debugctlmsr = 0;
+
        if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
                                        SIGTRAP) == NOTIFY_STOP)
                return;
        /* It's safe to allow irq's after DR6 has been saved */
-       if (regs->eflags & X86_EFLAGS_IF)
+       if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
 
        /* Mask out spurious debug traps due to lazy DR7 setting */
        if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-               if (!tsk->thread.debugreg[7])
+               if (!tsk->thread.debugreg7)
                        goto clear_dr7;
        }
 
-       if (regs->eflags & VM_MASK)
+       if (regs->flags & VM_MASK)
                goto debug_vm86;
 
        /* Save debug status register where ptrace can see it */
-       tsk->thread.debugreg[6] = condition;
+       tsk->thread.debugreg6 = condition;
 
        /*
         * Single-stepping through TF: make sure we ignore any events in
@@ -886,7 +927,7 @@ debug_vm86:
 
 clear_TF_reenable:
        set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~TF_MASK;
        return;
 }
 
@@ -895,7 +936,7 @@ clear_TF_reenable:
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *eip)
+void math_error(void __user *ip)
 {
        struct task_struct * task;
        siginfo_t info;
@@ -911,7 +952,7 @@ void math_error(void __user *eip)
        info.si_signo = SIGFPE;
        info.si_errno = 0;
        info.si_code = __SI_FAULT;
-       info.si_addr = eip;
+       info.si_addr = ip;
        /*
         * (~cwd & swd) will mask out exceptions that are not set to unmasked
         * status.  0x3f is the exception bits in these regs, 0x200 is the
@@ -954,13 +995,13 @@ void math_error(void __user *eip)
        force_sig_info(SIGFPE, &info, task);
 }
 
-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
+void do_coprocessor_error(struct pt_regs * regs, long error_code)
 {
        ignore_fpu_irq = 1;
-       math_error((void __user *)regs->eip);
+       math_error((void __user *)regs->ip);
 }
 
-static void simd_math_error(void __user *eip)
+static void simd_math_error(void __user *ip)
 {
        struct task_struct * task;
        siginfo_t info;
@@ -976,7 +1017,7 @@ static void simd_math_error(void __user *eip)
        info.si_signo = SIGFPE;
        info.si_errno = 0;
        info.si_code = __SI_FAULT;
-       info.si_addr = eip;
+       info.si_addr = ip;
        /*
         * The SIMD FPU exceptions are handled a little differently, as there
         * is only a single status/control register.  Thus, to determine which
@@ -1008,19 +1049,19 @@ static void simd_math_error(void __user *eip)
        force_sig_info(SIGFPE, &info, task);
 }
 
-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+void do_simd_coprocessor_error(struct pt_regs * regs,
                                          long error_code)
 {
        if (cpu_has_xmm) {
                /* Handle SIMD FPU exceptions on PIII+ processors. */
                ignore_fpu_irq = 1;
-               simd_math_error((void __user *)regs->eip);
+               simd_math_error((void __user *)regs->ip);
        } else {
                /*
                 * Handle strange cache flush from user space exception
                 * in all other cases.  This is undocumented behaviour.
                 */
-               if (regs->eflags & VM_MASK) {
+               if (regs->flags & VM_MASK) {
                        handle_vm86_fault((struct kernel_vm86_regs *)regs,
                                          error_code);
                        return;
@@ -1032,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
        }
 }
 
-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
+void do_spurious_interrupt_bug(struct pt_regs * regs,
                                          long error_code)
 {
 #if 0
@@ -1041,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
 #endif
 }
 
-fastcall unsigned long patch_espfix_desc(unsigned long uesp,
+unsigned long patch_espfix_desc(unsigned long uesp,
                                          unsigned long kesp)
 {
        struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
@@ -1095,51 +1136,17 @@ asmlinkage void math_emulate(long arg)
 
 #endif /* CONFIG_MATH_EMULATION */
 
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-void set_intr_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
-}
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
-}
-
-static void __init set_trap_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
-}
-
-static void __init set_system_gate(unsigned int n, void *addr)
-{
-       _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
-}
-
-static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
-       _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
-}
-
 
 void __init trap_init(void)
 {
        int i;
 
 #ifdef CONFIG_EISA
-       void __iomem *p = ioremap(0x0FFFD9, 4);
+       void __iomem *p = early_ioremap(0x0FFFD9, 4);
        if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
                EISA_bus = 1;
        }
-       iounmap(p);
+       early_iounmap(p, 4);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
index cc68b92316cd3813fc049874562f28eed81d9c60..efc66df728b61cfd3a0641345836a0d2e3deef9b 100644 (file)
@@ -74,22 +74,24 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 
+static unsigned int code_bytes = 64;
+
 static inline void conditional_sti(struct pt_regs *regs)
 {
-       if (regs->eflags & X86_EFLAGS_IF)
+       if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
 }
 
 static inline void preempt_conditional_sti(struct pt_regs *regs)
 {
        preempt_disable();
-       if (regs->eflags & X86_EFLAGS_IF)
+       if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
 }
 
 static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
-       if (regs->eflags & X86_EFLAGS_IF)
+       if (regs->flags & X86_EFLAGS_IF)
                local_irq_disable();
        /* Make sure to not schedule here because we could be running
           on an exception stack. */
@@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 
 int kstack_depth_to_print = 12;
 
-#ifdef CONFIG_KALLSYMS
-void printk_address(unsigned long address)
+void printk_address(unsigned long address, int reliable)
 {
+#ifdef CONFIG_KALLSYMS
        unsigned long offset = 0, symsize;
        const char *symname;
        char *modname;
        char *delim = ":";
-       char namebuf[128];
+       char namebuf[KSYM_NAME_LEN];
+       char reliab[4] = "";
 
        symname = kallsyms_lookup(address, &symsize, &offset,
                                        &modname, namebuf);
@@ -113,17 +116,17 @@ void printk_address(unsigned long address)
                printk(" [<%016lx>]\n", address);
                return;
        }
+       if (!reliable)
+               strcpy(reliab, "? ");
+
        if (!modname)
-               modname = delim = "";           
-       printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
-               address, delim, modname, delim, symname, offset, symsize);
-}
+               modname = delim = "";
+       printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+               address, reliab, delim, modname, delim, symname, offset, symsize);
 #else
-void printk_address(unsigned long address)
-{
        printk(" [<%016lx>]\n", address);
-}
 #endif
+}
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                                        unsigned *usedp, char **idp)
@@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+                       void *p, unsigned int size, void *end)
+{
+       void *t = tinfo;
+       if (end) {
+               if (p < end && p >= (end-THREAD_SIZE))
+                       return 1;
+               else
+                       return 0;
+       }
+       return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+       struct stack_frame *next_frame;
+       unsigned long return_address;
+};
+
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+                               unsigned long *stack, unsigned long bp,
+                               const struct stacktrace_ops *ops, void *data,
+                               unsigned long *end)
 {
-       void *t = (void *)tinfo;
-        return p > t && p < t + THREAD_SIZE - 3;
+       struct stack_frame *frame = (struct stack_frame *)bp;
+
+       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+               unsigned long addr;
+
+               addr = *stack;
+               if (__kernel_text_address(addr)) {
+                       if ((unsigned long) stack == bp + 8) {
+                               ops->address(data, addr, 1);
+                               frame = frame->next_frame;
+                               bp = (unsigned long) frame;
+                       } else {
+                               ops->address(data, addr, bp == 0);
+                       }
+               }
+               stack++;
+       }
+       return bp;
 }
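
The bp + 8 check in print_context_stack() relies on the standard x86-64 frame layout: the word at the saved frame pointer holds the caller's %rbp and the word right above it (at %rbp + 8) holds the return address. A minimal userspace sketch of the same walk, assuming the program is built with frame pointers (e.g. gcc -O0 or -fno-omit-frame-pointer); the function names are invented for the example:

    #include <stdio.h>
    #include <stdint.h>

    struct stack_frame {
        struct stack_frame *next_frame;  /* saved caller %rbp at [%rbp]   */
        uintptr_t return_address;        /* return address at [%rbp + 8]  */
    };

    static void __attribute__((noinline)) show_backtrace(void)
    {
        struct stack_frame *frame = __builtin_frame_address(0);

        /* Stop after a few frames so we never walk past main()'s frame. */
        for (int depth = 0; frame && depth < 4; depth++) {
            printf("frame %d: return address %#lx\n",
                   depth, (unsigned long)frame->return_address);
            frame = frame->next_frame;
        }
    }

    static void __attribute__((noinline)) level2(void) { show_backtrace(); }
    static void __attribute__((noinline)) level1(void) { level2(); }

    int main(void)
    {
        level1();
        return 0;
    }
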
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-               unsigned long *stack,
+               unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
 {
        const unsigned cpu = get_cpu();
@@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 
        if (!tsk)
                tsk = current;
+       tinfo = task_thread_info(tsk);
 
        if (!stack) {
                unsigned long dummy;
                stack = &dummy;
                if (tsk && tsk != current)
-                       stack = (unsigned long *)tsk->thread.rsp;
+                       stack = (unsigned long *)tsk->thread.sp;
        }
 
-       /*
-        * Print function call entries within a stack. 'cond' is the
-        * "end of stackframe" condition, that the 'stack++'
-        * iteration will eventually trigger.
-        */
-#define HANDLE_STACK(cond) \
-       do while (cond) { \
-               unsigned long addr = *stack++; \
-               /* Use unlocked access here because except for NMIs     \
-                  we should be already protected against module unloads */ \
-               if (__kernel_text_address(addr)) { \
-                       /* \
-                        * If the address is either in the text segment of the \
-                        * kernel, or in the region which contains vmalloc'ed \
-                        * memory, it *may* be the address of a calling \
-                        * routine; if so, print it so that someone tracing \
-                        * down the cause of the crash will be able to figure \
-                        * out the call path that was taken. \
-                        */ \
-                       ops->address(data, addr);   \
-               } \
-       } while (0)
+#ifdef CONFIG_FRAME_POINTER
+       if (!bp) {
+               if (tsk == current) {
+                       /* Grab bp right from our regs */
+                       asm("movq %%rbp, %0" : "=r" (bp):);
+               } else {
+                       /* bp is the last reg pushed by switch_to */
+                       bp = *(unsigned long *) tsk->thread.sp;
+               }
+       }
+#endif
+
+
 
        /*
         * Print function call entries in all stacks, starting at the
@@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
                if (estack_end) {
                        if (ops->stack(data, id) < 0)
                                break;
-                       HANDLE_STACK (stack < estack_end);
+
+                       bp = print_context_stack(tinfo, stack, bp, ops,
+                                                       data, estack_end);
                        ops->stack(data, "<EOE>");
                        /*
                         * We link to the next stack via the
@@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
                        if (stack >= irqstack && stack < irqstack_end) {
                                if (ops->stack(data, "IRQ") < 0)
                                        break;
-                               HANDLE_STACK (stack < irqstack_end);
+                               bp = print_context_stack(tinfo, stack, bp,
+                                               ops, data, irqstack_end);
                                /*
                                 * We link to the next stack (which would be
                                 * the process stack normally) the last
@@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
        /*
         * This handles the process stack:
         */
-       tinfo = task_thread_info(tsk);
-       HANDLE_STACK (valid_stack_ptr(tinfo, stack));
-#undef HANDLE_STACK
+       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
        put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
@@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name)
        return 0;
 }
 
-static void print_trace_address(void *data, unsigned long addr)
+static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
        touch_nmi_watchdog();
-       printk_address(addr);
+       printk_address(addr, reliable);
 }
 
 static const struct stacktrace_ops print_trace_ops = {
@@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = {
 };
 
 void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
+               unsigned long bp)
 {
        printk("\nCall Trace:\n");
-       dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
+       dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
        printk("\n");
 }
 
 static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
+                                                       unsigned long bp)
 {
        unsigned long *stack;
        int i;
@@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
        // debugging aid: "show_stack(NULL, NULL);" prints the
        // back trace for this cpu.
 
-       if (rsp == NULL) {
+       if (sp == NULL) {
                if (tsk)
-                       rsp = (unsigned long *)tsk->thread.rsp;
+                       sp = (unsigned long *)tsk->thread.sp;
                else
-                       rsp = (unsigned long *)&rsp;
+                       sp = (unsigned long *)&sp;
        }
 
-       stack = rsp;
+       stack = sp;
        for(i=0; i < kstack_depth_to_print; i++) {
                if (stack >= irqstack && stack <= irqstack_end) {
                        if (stack == irqstack_end) {
@@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
                printk(" %016lx", *stack++);
                touch_nmi_watchdog();
        }
-       show_trace(tsk, regs, rsp);
+       show_trace(tsk, regs, sp, bp);
 }
 
-void show_stack(struct task_struct *tsk, unsigned long * rsp)
+void show_stack(struct task_struct *tsk, unsigned long * sp)
 {
-       _show_stack(tsk, NULL, rsp);
+       _show_stack(tsk, NULL, sp, 0);
 }
 
 /*
@@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
 void dump_stack(void)
 {
        unsigned long dummy;
+       unsigned long bp = 0;
+
+#ifdef CONFIG_FRAME_POINTER
+       if (!bp)
+               asm("movq %%rbp, %0" : "=r" (bp):);
+#endif
 
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
-       show_trace(NULL, NULL, &dummy);
+       show_trace(NULL, NULL, &dummy, bp);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack);
 void show_registers(struct pt_regs *regs)
 {
        int i;
-       int in_kernel = !user_mode(regs);
-       unsigned long rsp;
+       unsigned long sp;
        const int cpu = smp_processor_id();
        struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+       u8 *ip;
+       unsigned int code_prologue = code_bytes * 43 / 64;
+       unsigned int code_len = code_bytes;
 
-       rsp = regs->rsp;
+       sp = regs->sp;
+       ip = (u8 *) regs->ip - code_prologue;
        printk("CPU %d ", cpu);
        __show_regs(regs);
        printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs)
         * When in-kernel, we also print out the stack and code at the
         * time of the fault..
         */
-       if (in_kernel) {
+       if (!user_mode(regs)) {
+               unsigned char c;
                printk("Stack: ");
-               _show_stack(NULL, regs, (unsigned long*)rsp);
-
-               printk("\nCode: ");
-               if (regs->rip < PAGE_OFFSET)
-                       goto bad;
-
-               for (i=0; i<20; i++) {
-                       unsigned char c;
-                       if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
-bad:
+               _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+               printk("\n");
+
+               printk(KERN_EMERG "Code: ");
+               if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+                       /* try starting at RIP */
+                       ip = (u8 *) regs->ip;
+                       code_len = code_len - code_prologue + 1;
+               }
+               for (i = 0; i < code_len; i++, ip++) {
+                       if (ip < (u8 *)PAGE_OFFSET ||
+                                       probe_kernel_address(ip, c)) {
                                printk(" Bad RIP value.");
                                break;
                        }
-                       printk("%02x ", c);
+                       if (ip == (u8 *)regs->ip)
+                               printk("<%02x> ", c);
+                       else
+                               printk("%02x ", c);
                }
        }
        printk("\n");
 }      
 
-int is_valid_bugaddr(unsigned long rip)
+int is_valid_bugaddr(unsigned long ip)
 {
        unsigned short ud2;
 
-       if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
+       if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
                return 0;
 
        return ud2 == 0x0b0f;
 }
 
-#ifdef CONFIG_BUG
-void out_of_line_bug(void)
-{ 
-       BUG(); 
-} 
-EXPORT_SYMBOL(out_of_line_bug);
-#endif
-
 static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
 static int die_owner = -1;
 static unsigned int die_nest_count;
@@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void)
        return flags;
 }
 
-void __kprobes oops_end(unsigned long flags)
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 { 
        die_owner = -1;
        bust_spinlocks(0);
@@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags)
                /* Nest count reaches zero, release the lock. */
                __raw_spin_unlock(&die_lock);
        raw_local_irq_restore(flags);
+       if (!regs) {
+               oops_exit();
+               return;
+       }
        if (panic_on_oops)
                panic("Fatal exception");
        oops_exit();
+       do_exit(signr);
 }
 
-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char * str, struct pt_regs * regs, long err)
 {
        static int die_counter;
        printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
@@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
        printk("DEBUG_PAGEALLOC");
 #endif
        printk("\n");
-       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
+       if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+               return 1;
        show_registers(regs);
        add_taint(TAINT_DIE);
        /* Executive summary in case the oops scrolled away */
        printk(KERN_ALERT "RIP ");
-       printk_address(regs->rip); 
-       printk(" RSP <%016lx>\n", regs->rsp); 
+       printk_address(regs->ip, 1);
+       printk(" RSP <%016lx>\n", regs->sp);
        if (kexec_should_crash(current))
                crash_kexec(regs);
+       return 0;
 }
 
 void die(const char * str, struct pt_regs * regs, long err)
@@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err)
        unsigned long flags = oops_begin();
 
        if (!user_mode(regs))
-               report_bug(regs->rip, regs);
+               report_bug(regs->ip, regs);
 
-       __die(str, regs, err);
-       oops_end(flags);
-       do_exit(SIGSEGV); 
+       if (__die(str, regs, err))
+               regs = NULL;
+       oops_end(flags, regs, SIGSEGV);
 }
 
 void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
@@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
                crash_kexec(regs);
        if (do_panic || panic_on_oops)
                panic("Non maskable interrupt");
-       oops_end(flags);
+       oops_end(flags, NULL, SIGBUS);
        nmi_exit();
        local_irq_enable();
-       do_exit(SIGSEGV);
+       do_exit(SIGBUS);
 }
 
 static void __kprobes do_trap(int trapnr, int signr, char *str,
@@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
                tsk->thread.trap_no = trapnr;
 
                if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-                   printk_ratelimit())
+                   printk_ratelimit()) {
                        printk(KERN_INFO
-                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
+                              "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
                               tsk->comm, tsk->pid, str,
-                              regs->rip, regs->rsp, error_code); 
+                              regs->ip, regs->sp, error_code);
+                       print_vma_addr(" in ", regs->ip);
+                       printk("\n");
+               }
 
                if (info)
                        force_sig_info(signr, info, tsk);
@@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
        }
 
 
-       /* kernel trap */ 
-       {            
-               const struct exception_table_entry *fixup;
-               fixup = search_exception_tables(regs->rip);
-               if (fixup)
-                       regs->rip = fixup->fixup;
-               else {
-                       tsk->thread.error_code = error_code;
-                       tsk->thread.trap_no = trapnr;
-                       die(str, regs, error_code);
-               }
-               return;
+       if (!fixup_exception(regs)) {
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_no = trapnr;
+               die(str, regs, error_code);
        }
+       return;
 }
 
 #define DO_ERROR(trapnr, signr, str, name) \
@@ -643,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
        do_trap(trapnr, signr, str, regs, error_code, &info); \
 }
 
-DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
+DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
 DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -694,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
                tsk->thread.trap_no = 13;
 
                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit())
+                   printk_ratelimit()) {
                        printk(KERN_INFO
-                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
+                      "%s[%d] general protection ip:%lx sp:%lx error:%lx",
                               tsk->comm, tsk->pid,
-                              regs->rip, regs->rsp, error_code); 
+                              regs->ip, regs->sp, error_code);
+                       print_vma_addr(" in ", regs->ip);
+                       printk("\n");
+               }
 
                force_sig(SIGSEGV, tsk);
                return;
        } 
 
-       /* kernel gp */
-       {
-               const struct exception_table_entry *fixup;
-               fixup = search_exception_tables(regs->rip);
-               if (fixup) {
-                       regs->rip = fixup->fixup;
-                       return;
-               }
+       if (fixup_exception(regs))
+               return;
 
-               tsk->thread.error_code = error_code;
-               tsk->thread.trap_no = 13;
-               if (notify_die(DIE_GPF, "general protection fault", regs,
-                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
-                       return;
-               die("general protection fault", regs, error_code);
-       }
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 13;
+       if (notify_die(DIE_GPF, "general protection fault", regs,
+                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
+               return;
+       die("general protection fault", regs, error_code);
 }
 
 static __kprobes void
@@ -832,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
        struct pt_regs *regs = eregs;
        /* Did already sync */
-       if (eregs == (struct pt_regs *)eregs->rsp)
+       if (eregs == (struct pt_regs *)eregs->sp)
                ;
        /* Exception from user space */
        else if (user_mode(eregs))
                regs = task_pt_regs(current);
        /* Exception from kernel and interrupts are enabled. Move to
           kernel process stack. */
-       else if (eregs->eflags & X86_EFLAGS_IF)
-               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
+       else if (eregs->flags & X86_EFLAGS_IF)
+               regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
        if (eregs != regs)
                *regs = *eregs;
        return regs;
@@ -858,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 
        get_debugreg(condition, 6);
 
+       /*
+        * The processor cleared BTF, so don't mark that we need it set.
+        */
+       clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
+       tsk->thread.debugctlmsr = 0;
+
        if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
                                                SIGTRAP) == NOTIFY_STOP)
                return;
@@ -873,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
 
        tsk->thread.debugreg6 = condition;
 
-       /* Mask out spurious TF errors due to lazy TF clearing */
+
+       /*
+        * Single-stepping through TF: make sure we ignore any events in
+        * kernel space (but re-enable TF when returning to user mode).
+        */
        if (condition & DR_STEP) {
-               /*
-                * The TF error should be masked out only if the current
-                * process is not traced and if the TRAP flag has been set
-                * previously by a tracing process (condition detected by
-                * the PT_DTRACE flag); remember that the i386 TRAP flag
-                * can be modified by the process itself in user mode,
-                * allowing programs to debug themselves without the ptrace()
-                * interface.
-                */
                 if (!user_mode(regs))
                        goto clear_TF_reenable;
-               /*
-                * Was the TF flag set by a debugger? If so, clear it now,
-                * so that register information is correct.
-                */
-               if (tsk->ptrace & PT_DTRACE) {
-                       regs->eflags &= ~TF_MASK;
-                       tsk->ptrace &= ~PT_DTRACE;
-               }
        }
 
        /* Ok, finally something we can handle */
@@ -902,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
        info.si_signo = SIGTRAP;
        info.si_errno = 0;
        info.si_code = TRAP_BRKPT;
-       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
+       info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
        force_sig_info(SIGTRAP, &info, tsk);
 
 clear_dr7:
@@ -912,18 +948,15 @@ clear_dr7:
 
 clear_TF_reenable:
        set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->eflags &= ~TF_MASK;
+       regs->flags &= ~X86_EFLAGS_TF;
        preempt_conditional_cli(regs);
 }
 
 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
 {
-       const struct exception_table_entry *fixup;
-       fixup = search_exception_tables(regs->rip);
-       if (fixup) {
-               regs->rip = fixup->fixup;
+       if (fixup_exception(regs))
                return 1;
-       }
+
        notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
        /* Illegal floating point operation in the kernel */
        current->thread.trap_no = trapnr;
@@ -938,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  */
 asmlinkage void do_coprocessor_error(struct pt_regs *regs)
 {
-       void __user *rip = (void __user *)(regs->rip);
+       void __user *ip = (void __user *)(regs->ip);
        struct task_struct * task;
        siginfo_t info;
        unsigned short cwd, swd;
@@ -958,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
        info.si_signo = SIGFPE;
        info.si_errno = 0;
        info.si_code = __SI_FAULT;
-       info.si_addr = rip;
+       info.si_addr = ip;
        /*
         * (~cwd & swd) will mask out exceptions that are not set to unmasked
         * status.  0x3f is the exception bits in these regs, 0x200 is the
@@ -1007,7 +1040,7 @@ asmlinkage void bad_intr(void)
 
 asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
 {
-       void __user *rip = (void __user *)(regs->rip);
+       void __user *ip = (void __user *)(regs->ip);
        struct task_struct * task;
        siginfo_t info;
        unsigned short mxcsr;
@@ -1027,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
        info.si_signo = SIGFPE;
        info.si_errno = 0;
        info.si_code = __SI_FAULT;
-       info.si_addr = rip;
+       info.si_addr = ip;
        /*
         * The SIMD FPU exceptions are handled a little differently, as there
         * is only a single status/control register.  Thus, to determine which
@@ -1089,6 +1122,7 @@ asmlinkage void math_state_restore(void)
        task_thread_info(me)->status |= TS_USEDFPU;
        me->fpu_counter++;
 }
+EXPORT_SYMBOL_GPL(math_state_restore);
 
 void __init trap_init(void)
 {
@@ -1144,3 +1178,14 @@ static int __init kstack_setup(char *s)
        return 0;
 }
 early_param("kstack", kstack_setup);
+
+
+static int __init code_bytes_setup(char *s)
+{
+       code_bytes = simple_strtoul(s, NULL, 0);
+       if (code_bytes > 8192)
+               code_bytes = 8192;
+
+       return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
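
For reference, code_bytes controls how much of the Code: line show_registers() prints: with the default of 64 bytes the dump starts 64 * 43 / 64 = 43 bytes before the faulting RIP and covers the remaining 21 bytes from the faulting instruction onward, and booting with, say, code_bytes=128 widens the window proportionally (anything above 8192 is clamped to 8192).
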
index 9ebc0dab66b4ced5720818ce600b2dd14ccf0e18..43517e324be83e73096fe28a39ec5275c9a378e0 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/jiffies.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
+#include <linux/percpu.h>
 
 #include <asm/delay.h>
 #include <asm/tsc.h>
@@ -23,8 +24,6 @@ static int tsc_enabled;
 unsigned int tsc_khz;
 EXPORT_SYMBOL_GPL(tsc_khz);
 
-int tsc_disable;
-
 #ifdef CONFIG_X86_TSC
 static int __init tsc_setup(char *str)
 {
@@ -39,8 +38,7 @@ static int __init tsc_setup(char *str)
  */
 static int __init tsc_setup(char *str)
 {
-       tsc_disable = 1;
-
+       setup_clear_cpu_cap(X86_FEATURE_TSC);
        return 1;
 }
 #endif
@@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
  *
  *                     -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-unsigned long cyc2ns_scale __read_mostly;
 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
-       cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+       unsigned long flags, prev_scale, *scale;
+       unsigned long long tsc_now, ns_now;
+
+       local_irq_save(flags);
+       sched_clock_idle_sleep_event();
+
+       scale = &per_cpu(cyc2ns, cpu);
+
+       rdtscll(tsc_now);
+       ns_now = __cycles_2_ns(tsc_now);
+
+       prev_scale = *scale;
+       if (cpu_khz)
+               *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+       /*
+        * Start smoothly with the new frequency:
+        */
+       sched_clock_idle_wakeup_event(0);
+       local_irq_restore(flags);
 }
 
 /*
@@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
                                                ref_freq, freq->new);
                        if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
                                tsc_khz = cpu_khz;
-                               set_cyc2ns_scale(cpu_khz);
+                               preempt_disable();
+                               set_cyc2ns_scale(cpu_khz, smp_processor_id());
+                               preempt_enable();
                                /*
                                 * TSC based sched_clock turns
                                 * to junk w/ cpufreq
@@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void)
 {
        if (!cpu_has_tsc || tsc_unstable)
                return 1;
+
+       /* Anything with constant TSC should be synchronized */
+       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+               return 0;
+
        /*
         * Intel systems are normally all synchronized.
         * Exceptions must mark TSC as unstable:
@@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { }
 
 void __init tsc_init(void)
 {
-       if (!cpu_has_tsc || tsc_disable)
+       int cpu;
+
+       if (!cpu_has_tsc)
                goto out_no_tsc;
 
        cpu_khz = calculate_cpu_khz();
@@ -380,7 +405,15 @@ void __init tsc_init(void)
                                (unsigned long)cpu_khz / 1000,
                                (unsigned long)cpu_khz % 1000);
 
-       set_cyc2ns_scale(cpu_khz);
+       /*
+        * Secondary CPUs do not run through tsc_init(), so set up
+        * all the scale factors for all CPUs, assuming the same
+        * speed as the bootup CPU. (cpufreq notifiers will fix this
+        * up if their speed diverges)
+        */
+       for_each_possible_cpu(cpu)
+               set_cyc2ns_scale(cpu_khz, cpu);
+
        use_tsc_delay();
 
        /* Check and install the TSC clocksource */
@@ -403,10 +436,5 @@ void __init tsc_init(void)
        return;
 
 out_no_tsc:
-       /*
-        * Set the tsc_disable flag if there's no TSC support, this
-        * makes it a fast flag for the kernel to see whether it
-        * should be using the TSC.
-        */
-       tsc_disable = 1;
+       setup_clear_cpu_cap(X86_FEATURE_TSC);
 }
index 9c70af45b42bcbe7683cf2c37b593f4da7b793ee..947554ddabb6c7779f9e9adedfc07d7891dba283 100644 (file)
@@ -10,6 +10,7 @@
 
 #include <asm/hpet.h>
 #include <asm/timex.h>
+#include <asm/timer.h>
 
 static int notsc __initdata = 0;
 
@@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz);
 unsigned int tsc_khz;
 EXPORT_SYMBOL(tsc_khz);
 
-static unsigned int cyc2ns_scale __read_mostly;
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *             ns = cycles / (freq / ns_per_sec)
+ *             ns = cycles * (ns_per_sec / freq)
+ *             ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *             ns = cycles * (10^6 / cpu_khz)
+ *
+ *     Then we use scaling math (suggested by george@mvista.com) to get:
+ *             ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *             ns = cycles * cyc2ns_scale / SC
+ *
+ *     And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better precision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *                     -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+DEFINE_PER_CPU(unsigned long, cyc2ns);
 
-static inline void set_cyc2ns_scale(unsigned long khz)
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 {
-       cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
-}
+       unsigned long flags, prev_scale, *scale;
+       unsigned long long tsc_now, ns_now;
 
-static unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-       return (cyc * cyc2ns_scale) >> NS_SCALE;
+       local_irq_save(flags);
+       sched_clock_idle_sleep_event();
+
+       scale = &per_cpu(cyc2ns, cpu);
+
+       rdtscll(tsc_now);
+       ns_now = __cycles_2_ns(tsc_now);
+
+       prev_scale = *scale;
+       if (cpu_khz)
+               *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
+
+       sched_clock_idle_wakeup_event(0);
+       local_irq_restore(flags);
 }
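
To make the scaling above concrete, here is a minimal userspace version of the same arithmetic, using the CYC2NS_SCALE_FACTOR of 10 (SC = 2^10) seen in the 32-bit code and a hypothetical 2 GHz CPU:

    #include <stdio.h>

    #define CYC2NS_SCALE_FACTOR 10          /* SC = 2^10, assumed here */
    #define NSEC_PER_MSEC       1000000UL

    int main(void)
    {
        unsigned long cpu_khz = 2000000;    /* hypothetical 2 GHz CPU          */
        unsigned long scale   = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
        unsigned long long cycles = 3000000000ULL;
        unsigned long long ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR;

        printf("cyc2ns scale = %lu\n", scale);           /* 512               */
        printf("%llu cycles -> %llu ns\n", cycles, ns);  /* 1500000000 ns     */
        return 0;
    }
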
 
-unsigned long long sched_clock(void)
+unsigned long long native_sched_clock(void)
 {
        unsigned long a = 0;
 
@@ -44,12 +77,27 @@ unsigned long long sched_clock(void)
        return cycles_2_ns(a);
 }
 
+/* We need to define a real function for sched_clock, to override the
+   weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+       return paravirt_sched_clock();
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+#endif
+
+
 static int tsc_unstable;
 
-inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
 {
        return tsc_unstable;
 }
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
 #ifdef CONFIG_CPU_FREQ
 
 /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                        mark_tsc_unstable("cpufreq changes");
        }
 
-       set_cyc2ns_scale(tsc_khz_ref);
+       preempt_disable();
+       set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
+       preempt_enable();
 
        return 0;
 }
@@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
        int i;
 
        for (i = 0; i < MAX_RETRIES; i++) {
-               t1 = get_cycles_sync();
+               t1 = get_cycles();
                if (hpet)
                        *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
                else
                        *pm = acpi_pm_read_early();
-               t2 = get_cycles_sync();
+               t2 = get_cycles();
                if ((t2 - t1) < SMI_TRESHOLD)
                        return t2;
        }
@@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
 void __init tsc_calibrate(void)
 {
        unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
-       int hpet = is_hpet_enabled();
+       int hpet = is_hpet_enabled(), cpu;
 
        local_irq_save(flags);
 
@@ -162,9 +212,9 @@ void __init tsc_calibrate(void)
        outb(0xb0, 0x43);
        outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
        outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-       tr1 = get_cycles_sync();
+       tr1 = get_cycles();
        while ((inb(0x61) & 0x20) == 0);
-       tr2 = get_cycles_sync();
+       tr2 = get_cycles();
 
        tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
 
@@ -206,7 +256,9 @@ void __init tsc_calibrate(void)
        }
 
        tsc_khz = tsc2 / tsc1;
-       set_cyc2ns_scale(tsc_khz);
+
+       for_each_possible_cpu(cpu)
+               set_cyc2ns_scale(tsc_khz, cpu);
 }
 
 /*
@@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void)
        if (apic_is_clustered_box())
                return 1;
 #endif
-       /* Most intel systems have synchronized TSCs except for
-          multi node systems */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-#ifdef CONFIG_ACPI
-               /* But TSC doesn't tick in C3 so don't use it there */
-               if (acpi_gbl_FADT.header.length > 0 &&
-                   acpi_gbl_FADT.C3latency < 1000)
-                       return 1;
-#endif
+
+       if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                return 0;
-       }
 
        /* Assume multi socket systems are not synchronized */
        return num_present_cpus() > 1;
@@ -250,13 +294,13 @@ __setup("notsc", notsc_setup);
 /* clock source code: */
 static cycle_t read_tsc(void)
 {
-       cycle_t ret = (cycle_t)get_cycles_sync();
+       cycle_t ret = (cycle_t)get_cycles();
        return ret;
 }
 
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
-       cycle_t ret = (cycle_t)get_cycles_sync();
+       cycle_t ret = (cycle_t)vget_cycles();
        return ret;
 }
 
index 9125efe66a06bf1b572dc3b136ef92452c03245e..0577825cf89bad5ecdac456fa13d12f4b8e2fa0e 100644 (file)
@@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void)
        cycles_t start, now, prev, end;
        int i;
 
-       start = get_cycles_sync();
+       start = get_cycles();
        /*
         * The measurement runs for 20 msecs:
         */
@@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void)
                 */
                __raw_spin_lock(&sync_lock);
                prev = last_tsc;
-               now = get_cycles_sync();
+               now = get_cycles();
                last_tsc = now;
                __raw_spin_unlock(&sync_lock);
 
                /*
                 * Be nice every now and then (and also check whether
-                * measurement is done [we also insert a 100 million
+                * measurement is done [we also insert a 10 million
                 * loops safety exit, so we dont lock up in case the
                 * TSC readout is totally broken]):
                 */
                if (unlikely(!(i & 7))) {
-                       if (now > end || i > 100000000)
+                       if (now > end || i > 10000000)
                                break;
                        cpu_relax();
                        touch_nmi_watchdog();
@@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void)
                        nr_warps++;
                        __raw_spin_unlock(&sync_lock);
                }
-
+       }
+       if (!(now-start)) {
+               printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
+                       now-start, end-start);
+               WARN_ON(1);
        }
 }
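
The warp check above has the two CPUs take turns reading the TSC under a lock and flags a "warp" whenever a later read comes back smaller than the previous one. A rough userspace analogue with two threads is sketched below; it is only illustrative (the kernel version runs on two specific CPUs, uses a raw spinlock and touches the NMI watchdog), and a meaningful test would additionally pin the threads to different CPUs, e.g. with pthread_setaffinity_np:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdint.h>
    #include <x86intrin.h>

    static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t last_tsc;
    static uint64_t max_warp;

    static void *warp_check(void *arg)
    {
        (void)arg;
        for (long i = 0; i < 1000000; i++) {
            pthread_mutex_lock(&sync_lock);
            uint64_t prev = last_tsc;
            uint64_t now  = __rdtsc();
            last_tsc = now;
            /* A later read that is smaller than the previous one is a warp. */
            if (prev > now && prev - now > max_warp)
                max_warp = prev - now;
            pthread_mutex_unlock(&sync_lock);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, warp_check, NULL);
        warp_check(NULL);
        pthread_join(t, NULL);
        printf("max observed TSC warp: %llu cycles\n",
               (unsigned long long)max_warp);
        return 0;
    }
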
 
@@ -129,23 +133,23 @@ void __cpuinit check_tsc_sync_source(int cpu)
        while (atomic_read(&stop_count) != cpus-1)
                cpu_relax();
 
-       /*
-        * Reset it - just in case we boot another CPU later:
-        */
-       atomic_set(&start_count, 0);
-
        if (nr_warps) {
                printk("\n");
                printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
                                    " turning off TSC clock.\n", max_warp);
                mark_tsc_unstable("check_tsc_sync_source failed");
-               nr_warps = 0;
-               max_warp = 0;
-               last_tsc = 0;
        } else {
                printk(" passed.\n");
        }
 
+       /*
+        * Reset it - just in case we boot another CPU later:
+        */
+       atomic_set(&start_count, 0);
+       nr_warps = 0;
+       max_warp = 0;
+       last_tsc = 0;
+
        /*
         * Let the target continue with the bootup:
         */
index 157e4bedd3c5ae359bbc924f3d18c9ed6170bb23..738c2104df30753bfb04a36fd68269e6564587c5 100644 (file)
 /*
  * 8- and 16-bit register defines..
  */
-#define AL(regs)       (((unsigned char *)&((regs)->pt.eax))[0])
-#define AH(regs)       (((unsigned char *)&((regs)->pt.eax))[1])
-#define IP(regs)       (*(unsigned short *)&((regs)->pt.eip))
-#define SP(regs)       (*(unsigned short *)&((regs)->pt.esp))
+#define AL(regs)       (((unsigned char *)&((regs)->pt.ax))[0])
+#define AH(regs)       (((unsigned char *)&((regs)->pt.ax))[1])
+#define IP(regs)       (*(unsigned short *)&((regs)->pt.ip))
+#define SP(regs)       (*(unsigned short *)&((regs)->pt.sp))
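
The AL/AH accessors work because x86 is little-endian: byte 0 of the saved ax is the low byte (AL) and byte 1 the high byte (AH). A tiny standalone illustration (pt_regs is stood in for by a one-field struct, and the value is invented):

    #include <stdio.h>

    struct fake_pt_regs { unsigned long ax; };   /* stand-in for the real pt_regs */

    #define AL(regs) (((unsigned char *)&((regs)->ax))[0])
    #define AH(regs) (((unsigned char *)&((regs)->ax))[1])

    int main(void)
    {
        struct fake_pt_regs regs = { .ax = 0x1234 };

        /* On little-endian x86: AL = 0x34, AH = 0x12. */
        printf("AL=%#x AH=%#x\n", AL(&regs), AH(&regs));
        return 0;
    }
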
 
 /*
  * virtual flags (16 and 32-bit versions)
@@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
 {
        int ret = 0;
 
-       /* kernel_vm86_regs is missing xgs, so copy everything up to
+       /* kernel_vm86_regs is missing gs, so copy everything up to
           (but not including) orig_eax, and then rest including orig_eax. */
-       ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
-       ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
+       ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
+       ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
                            sizeof(struct kernel_vm86_regs) -
-                           offsetof(struct kernel_vm86_regs, pt.orig_eax));
+                           offsetof(struct kernel_vm86_regs, pt.orig_ax));
 
        return ret;
 }
@@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
 {
        int ret = 0;
 
-       /* copy eax-xfs inclusive */
-       ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
-       /* copy orig_eax-__gsh+extra */
-       ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
+       /* copy ax-fs inclusive */
+       ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
+       /* copy orig_ax-__gsh+extra */
+       ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
                              sizeof(struct kernel_vm86_regs) -
-                             offsetof(struct kernel_vm86_regs, pt.orig_eax) +
+                             offsetof(struct kernel_vm86_regs, pt.orig_ax) +
                              extra);
        return ret;
 }
 
-struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
-struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
+struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
 {
        struct tss_struct *tss;
        struct pt_regs *ret;
@@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
                printk("no vm86_info: BAD\n");
                do_exit(SIGSEGV);
        }
-       set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+       set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask);
        tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
        tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
        if (tmp) {
@@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
        }
 
        tss = &per_cpu(init_tss, get_cpu());
-       current->thread.esp0 = current->thread.saved_esp0;
+       current->thread.sp0 = current->thread.saved_sp0;
        current->thread.sysenter_cs = __KERNEL_CS;
-       load_esp0(tss, &current->thread);
-       current->thread.saved_esp0 = 0;
+       load_sp0(tss, &current->thread);
+       current->thread.saved_sp0 = 0;
        put_cpu();
 
        ret = KVM86->regs32;
 
-       ret->xfs = current->thread.saved_fs;
+       ret->fs = current->thread.saved_fs;
        loadsegment(gs, current->thread.saved_gs);
 
        return ret;
@@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
 asmlinkage int sys_vm86old(struct pt_regs regs)
 {
-       struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
+       struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
        struct kernel_vm86_struct info; /* declare this _on top_,
                                         * this avoids wasting of stack space.
                                         * This remains on the stack until we
@@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
        int tmp, ret = -EPERM;
 
        tsk = current;
-       if (tsk->thread.saved_esp0)
+       if (tsk->thread.saved_sp0)
                goto out;
        tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
                                       offsetof(struct kernel_vm86_struct, vm86plus) -
@@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
        struct vm86plus_struct __user *v86;
 
        tsk = current;
-       switch (regs.ebx) {
+       switch (regs.bx) {
                case VM86_REQUEST_IRQ:
                case VM86_FREE_IRQ:
                case VM86_GET_IRQ_BITS:
                case VM86_GET_AND_RESET_IRQ:
-                       ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
+                       ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
                        goto out;
                case VM86_PLUS_INSTALL_CHECK:
                        /* NOTE: on old vm86 stuff this will return the error
@@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
 
        /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
        ret = -EPERM;
-       if (tsk->thread.saved_esp0)
+       if (tsk->thread.saved_sp0)
                goto out;
-       v86 = (struct vm86plus_struct __user *)regs.ecx;
+       v86 = (struct vm86plus_struct __user *)regs.cx;
        tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
                                       offsetof(struct kernel_vm86_struct, regs32) -
                                       sizeof(info.regs));
@@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 /*
  * make sure the vm86() system call doesn't try to do anything silly
  */
-       info->regs.pt.xds = 0;
-       info->regs.pt.xes = 0;
-       info->regs.pt.xfs = 0;
+       info->regs.pt.ds = 0;
+       info->regs.pt.es = 0;
+       info->regs.pt.fs = 0;
 
 /* we are clearing gs later just before "jmp resume_userspace",
  * because it is not saved/restored.
  */
 
 /*
- * The eflags register is also special: we cannot trust that the user
+ * The flags register is also special: we cannot trust that the user
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  */
-       VEFLAGS = info->regs.pt.eflags;
-       info->regs.pt.eflags &= SAFE_MASK;
-       info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
-       info->regs.pt.eflags |= VM_MASK;
+       VEFLAGS = info->regs.pt.flags;
+       info->regs.pt.flags &= SAFE_MASK;
+       info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
+       info->regs.pt.flags |= VM_MASK;
 
        switch (info->cpu_type) {
                case CPU_286:
@@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        }
 
 /*
- * Save old state, set default return value (%eax) to 0
+ * Save old state, set default return value (%ax) to 0
  */
-       info->regs32->eax = 0;
-       tsk->thread.saved_esp0 = tsk->thread.esp0;
-       tsk->thread.saved_fs = info->regs32->xfs;
+       info->regs32->ax = 0;
+       tsk->thread.saved_sp0 = tsk->thread.sp0;
+       tsk->thread.saved_fs = info->regs32->fs;
        savesegment(gs, tsk->thread.saved_gs);
 
        tss = &per_cpu(init_tss, get_cpu());
-       tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
+       tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
        if (cpu_has_sep)
                tsk->thread.sysenter_cs = 0;
-       load_esp0(tss, &tsk->thread);
+       load_sp0(tss, &tsk->thread);
        put_cpu();
 
        tsk->thread.screen_bitmap = info->screen_bitmap;
@@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
        struct pt_regs * regs32;
 
        regs32 = save_v86_state(regs16);
-       regs32->eax = retval;
+       regs32->ax = retval;
        __asm__ __volatile__("movl %0,%%esp\n\t"
                "movl %1,%%ebp\n\t"
                "jmp resume_userspace"
@@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
 
 static inline void clear_TF(struct kernel_vm86_regs * regs)
 {
-       regs->pt.eflags &= ~TF_MASK;
+       regs->pt.flags &= ~TF_MASK;
 }
 
 static inline void clear_AC(struct kernel_vm86_regs * regs)
 {
-       regs->pt.eflags &= ~AC_MASK;
+       regs->pt.flags &= ~AC_MASK;
 }
 
 /* It is correct to call set_IF(regs) from the set_vflags_*
  * functions. However someone forgot to call clear_IF(regs)
  * in the opposite case.
  * After the command sequence CLI PUSHF STI POPF you should
- * end up with interrups disabled, but you ended up with
+ * end up with interrupts disabled, but you ended up with
  * interrupts enabled.
  *  ( I was testing my own changes, but the only bug I
  *    could find was in a function I had not changed. )
  * [KD]
  */
 
-static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
+static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs)
 {
-       set_flags(VEFLAGS, eflags, current->thread.v86mask);
-       set_flags(regs->pt.eflags, eflags, SAFE_MASK);
-       if (eflags & IF_MASK)
+       set_flags(VEFLAGS, flags, current->thread.v86mask);
+       set_flags(regs->pt.flags, flags, SAFE_MASK);
+       if (flags & IF_MASK)
                set_IF(regs);
        else
                clear_IF(regs);
@@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
 {
        set_flags(VFLAGS, flags, current->thread.v86mask);
-       set_flags(regs->pt.eflags, flags, SAFE_MASK);
+       set_flags(regs->pt.flags, flags, SAFE_MASK);
        if (flags & IF_MASK)
                set_IF(regs);
        else
@@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
 
 static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
 {
-       unsigned long flags = regs->pt.eflags & RETURN_MASK;
+       unsigned long flags = regs->pt.flags & RETURN_MASK;
 
        if (VEFLAGS & VIF_MASK)
                flags |= IF_MASK;
@@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
        unsigned long __user *intr_ptr;
        unsigned long segoffs;
 
-       if (regs->pt.xcs == BIOSSEG)
+       if (regs->pt.cs == BIOSSEG)
                goto cannot_handle;
        if (is_revectored(i, &KVM86->int_revectored))
                goto cannot_handle;
@@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
        if ((segoffs >> 16) == BIOSSEG)
                goto cannot_handle;
        pushw(ssp, sp, get_vflags(regs), cannot_handle);
-       pushw(ssp, sp, regs->pt.xcs, cannot_handle);
+       pushw(ssp, sp, regs->pt.cs, cannot_handle);
        pushw(ssp, sp, IP(regs), cannot_handle);
-       regs->pt.xcs = segoffs >> 16;
+       regs->pt.cs = segoffs >> 16;
        SP(regs) -= 6;
        IP(regs) = segoffs & 0xffff;
        clear_TF(regs);
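
The real-mode vector fetched above packs the target as segment:offset, segment in the high 16 bits and offset in the low 16, which is why it is split with >> 16 and & 0xffff. A trivial sketch of the split (the entry value is invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned long segoffs = 0x9abc1234;        /* hypothetical IVT entry   */
        unsigned short cs = segoffs >> 16;         /* 0x9abc                   */
        unsigned short ip = segoffs & 0xffff;      /* 0x1234                   */

        printf("vectoring to %04x:%04x\n", cs, ip);
        return 0;
    }
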
@@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
        if (VMPI.is_vm86pus) {
                if ( (trapno==3) || (trapno==1) )
                        return_to_32bit(regs, VM86_TRAP + (trapno << 8));
-               do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
+               do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
                return 0;
        }
        if (trapno !=1)
@@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
                handle_vm86_trap(regs, 0, 1); \
        return; } while (0)
 
-       orig_flags = *(unsigned short *)&regs->pt.eflags;
+       orig_flags = *(unsigned short *)&regs->pt.flags;
 
-       csp = (unsigned char __user *) (regs->pt.xcs << 4);
-       ssp = (unsigned char __user *) (regs->pt.xss << 4);
+       csp = (unsigned char __user *) (regs->pt.cs << 4);
+       ssp = (unsigned char __user *) (regs->pt.ss << 4);
        sp = SP(regs);
        ip = IP(regs);
 
@@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
                        SP(regs) += 6;
                }
                IP(regs) = newip;
-               regs->pt.xcs = newcs;
+               regs->pt.cs = newcs;
                CHECK_IF_IN_TRAP;
                if (data32) {
                        set_vflags_long(newflags, regs);
index f02bad68abaaa01658b62d0cda3bd9c5be647c3f..4525bc2c2e19bd88fe00f1b399f2a05efa29454f 100644 (file)
@@ -62,7 +62,10 @@ static struct {
        void (*cpuid)(void /* non-c */);
        void (*_set_ldt)(u32 selector);
        void (*set_tr)(u32 selector);
-       void (*set_kernel_stack)(u32 selector, u32 esp0);
+       void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
+       void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
+       void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
+       void (*set_kernel_stack)(u32 selector, u32 sp0);
        void (*allocate_page)(u32, u32, u32, u32, u32);
        void (*release_page)(u32, u32);
        void (*set_pte)(pte_t, pte_t *, unsigned);
@@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops;
 #define IRQ_PATCH_DISABLE  5
 
 static inline void patch_offset(void *insnbuf,
-                               unsigned long eip, unsigned long dest)
+                               unsigned long ip, unsigned long dest)
 {
-        *(unsigned long *)(insnbuf+1) = dest-eip-5;
+        *(unsigned long *)(insnbuf+1) = dest-ip-5;
 }
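
patch_offset() writes the rel32 displacement of a 5-byte near CALL or JMP, which x86 measures from the end of the instruction, hence the dest - ip - 5. A small sketch of the encoding (0xe8 is the architectural CALL rel32 opcode; the actual MNEM_CALL define lives outside this hunk, and the addresses are invented):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned char insnbuf[5];
        uint32_t ip = 0x1000, dest = 0x2000;
        uint32_t rel = dest - ip - 5;              /* relative to the next insn */

        insnbuf[0] = 0xe8;                         /* CALL rel32                */
        memcpy(insnbuf + 1, &rel, sizeof(rel));

        printf("call at %#x -> %#x encodes rel32 = %#x\n", ip, dest, rel);
        return 0;
    }
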
 
 static unsigned patch_internal(int call, unsigned len, void *insnbuf,
-                              unsigned long eip)
+                              unsigned long ip)
 {
        u64 reloc;
        struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
@@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
                case VMI_RELOCATION_CALL_REL:
                        BUG_ON(len < 5);
                        *(char *)insnbuf = MNEM_CALL;
-                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+                       patch_offset(insnbuf, ip, (unsigned long)rel->eip);
                        return 5;
 
                case VMI_RELOCATION_JUMP_REL:
                        BUG_ON(len < 5);
                        *(char *)insnbuf = MNEM_JMP;
-                       patch_offset(insnbuf, eip, (unsigned long)rel->eip);
+                       patch_offset(insnbuf, ip, (unsigned long)rel->eip);
                        return 5;
 
                case VMI_RELOCATION_NOP:
@@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
  * sequence.  The callee does nop padding for us.
  */
 static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
-                         unsigned long eip, unsigned len)
+                         unsigned long ip, unsigned len)
 {
        switch (type) {
                case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
                        return patch_internal(VMI_CALL_DisableInterrupts, len,
-                                             insns, eip);
+                                             insns, ip);
                case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
                        return patch_internal(VMI_CALL_EnableInterrupts, len,
-                                             insns, eip);
+                                             insns, ip);
                case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
                        return patch_internal(VMI_CALL_SetInterruptMask, len,
-                                             insns, eip);
+                                             insns, ip);
                case PARAVIRT_PATCH(pv_irq_ops.save_fl):
                        return patch_internal(VMI_CALL_GetInterruptMask, len,
-                                             insns, eip);
+                                             insns, ip);
                case PARAVIRT_PATCH(pv_cpu_ops.iret):
-                       return patch_internal(VMI_CALL_IRET, len, insns, eip);
-               case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
-                       return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
+                       return patch_internal(VMI_CALL_IRET, len, insns, ip);
+               case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
+                       return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
                default:
                        break;
        }
@@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
 }
 
 /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
-                               unsigned int *ecx, unsigned int *edx)
+static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
+                               unsigned int *cx, unsigned int *dx)
 {
        int override = 0;
-       if (*eax == 1)
+       if (*ax == 1)
                override = 1;
         asm volatile ("call *%6"
-                      : "=a" (*eax),
-                        "=b" (*ebx),
-                        "=c" (*ecx),
-                        "=d" (*edx)
-                      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+                      : "=a" (*ax),
+                        "=b" (*bx),
+                        "=c" (*cx),
+                        "=d" (*dx)
+                      : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
        if (override) {
                if (disable_pse)
-                       *edx &= ~X86_FEATURE_PSE;
+                       *dx &= ~X86_FEATURE_PSE;
                if (disable_pge)
-                       *edx &= ~X86_FEATURE_PGE;
+                       *dx &= ~X86_FEATURE_PGE;
                if (disable_sep)
-                       *edx &= ~X86_FEATURE_SEP;
+                       *dx &= ~X86_FEATURE_SEP;
                if (disable_tsc)
-                       *edx &= ~X86_FEATURE_TSC;
+                       *dx &= ~X86_FEATURE_TSC;
                if (disable_mtrr)
-                       *edx &= ~X86_FEATURE_MTRR;
+                       *dx &= ~X86_FEATURE_MTRR;
        }
 }
 
 static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
 {
        if (gdt[nr].a != new->a || gdt[nr].b != new->b)
-               write_gdt_entry(gdt, nr, new->a, new->b);
+               write_gdt_entry(gdt, nr, new, 0);
 }
 
 static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
@@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
 static void vmi_set_ldt(const void *addr, unsigned entries)
 {
        unsigned cpu = smp_processor_id();
-       u32 low, high;
+       struct desc_struct desc;
 
-       pack_descriptor(&low, &high, (unsigned long)addr,
+       pack_descriptor(&desc, (unsigned long)addr,
                        entries * sizeof(struct desc_struct) - 1,
-                       DESCTYPE_LDT, 0);
-       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+                       DESC_LDT, 0);
+       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
        vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
 }
 
@@ -214,17 +217,37 @@ static void vmi_set_tr(void)
        vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
 }
 
-static void vmi_load_esp0(struct tss_struct *tss,
+static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+{
+       u32 *idt_entry = (u32 *)g;
+       vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]);
+}
+
+static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
+                               const void *desc, int type)
+{
+       u32 *gdt_entry = (u32 *)desc;
+       vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]);
+}
+
+static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
+                               const void *desc)
+{
+       u32 *ldt_entry = (u32 *)desc;
+       vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[2]);
+}
+
+static void vmi_load_sp0(struct tss_struct *tss,
                                   struct thread_struct *thread)
 {
-       tss->x86_tss.esp0 = thread->esp0;
+       tss->x86_tss.sp0 = thread->sp0;
 
        /* This can only happen when SEP is enabled, no need to test "SEP"arately */
        if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
                tss->x86_tss.ss1 = thread->sysenter_cs;
                wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
        }
-       vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
+       vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
 }
 
 static void vmi_flush_tlb_user(void)
@@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
        vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
 }
 
-static void vmi_allocate_pd(u32 pfn)
+static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
 {
        /*
         * This call comes in very early, before mem_map is setup.
@@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep
 static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 #ifdef CONFIG_X86_PAE
-       const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+       const pte_t pte = { .pte = pmdval.pmd };
        vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
 #else
        const pte_t pte = { pmdval.pud.pgd.pgd };
@@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t
 static void vmi_set_pud(pud_t *pudp, pud_t pudval)
 {
        /* Um, eww */
-       const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+       const pte_t pte = { .pte = pudval.pgd.pgd };
        vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
        vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
 }
 
 static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-       const pte_t pte = { 0 };
+       const pte_t pte = { .pte = 0 };
        vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
        vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
 }
 
 static void vmi_pmd_clear(pmd_t *pmd)
 {
-       const pte_t pte = { 0 };
+       const pte_t pte = { .pte = 0 };
        vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
        vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
 }
@@ -790,10 +813,13 @@ static inline int __init activate_vmi(void)
        para_fill(pv_cpu_ops.store_idt, GetIDT);
        para_fill(pv_cpu_ops.store_tr, GetTR);
        pv_cpu_ops.load_tls = vmi_load_tls;
-       para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
-       para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
-       para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
-       para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
+       para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
+                 write_ldt_entry, WriteLDTEntry);
+       para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
+                 write_gdt_entry, WriteGDTEntry);
+       para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
+                 write_idt_entry, WriteIDTEntry);
+       para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
        para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
        para_fill(pv_cpu_ops.io_delay, IODelay);
 
@@ -870,7 +896,7 @@ static inline int __init activate_vmi(void)
         * the backend.  They are performance critical anyway, so requiring
         * a patch is not a big problem.
         */
-       pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+       pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
        pv_cpu_ops.iret = (void *)0xbadbab0;
 
 #ifdef CONFIG_SMP
@@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg)
                return -EINVAL;
 
        if (!strcmp(arg, "disable_pge")) {
-               clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
                disable_pge = 1;
        } else if (!strcmp(arg, "disable_pse")) {
-               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
                disable_pse = 1;
        } else if (!strcmp(arg, "disable_sep")) {
-               clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
                disable_sep = 1;
        } else if (!strcmp(arg, "disable_tsc")) {
-               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
                disable_tsc = 1;
        } else if (!strcmp(arg, "disable_mtrr")) {
-               clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
                disable_mtrr = 1;
        } else if (!strcmp(arg, "disable_timer")) {
                disable_vmi_timer = 1;
index b1b5ab08b26eaaa26a955eb93f9502f7ed8e89ce..a2b030780aa9c1ce461384da18b20c287ac85b7c 100644 (file)
@@ -35,7 +35,6 @@
 #include <asm/i8253.h>
 
 #include <irq_vectors.h>
-#include "io_ports.h"
 
 #define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
 #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void)
 void __init vmi_time_init(void)
 {
        /* Disable PIT: BIOSes start PIT CH0 with an 18.2Hz periodic tick. */
-       outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+       outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
 
        vmi_time_init_clockevent();
        setup_irq(0, &vmi_clock_action);
index 84c913f38f980b621e6549ccca86f3a01a8666af..f1148ac8abe307ae425127e5275f78dd31db5531 100644 (file)
@@ -8,12 +8,6 @@
  * put it inside the section definition.
  */
 
-/* Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
 #define LOAD_OFFSET __PAGE_OFFSET
 
 #include <asm-generic/vmlinux.lds.h>
@@ -44,6 +38,8 @@ SECTIONS
 
   /* read-only */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
+       . = ALIGN(4096); /* not really needed, already page aligned */
+       *(.text.page_aligned)
        TEXT_TEXT
        SCHED_TEXT
        LOCK_TEXT
index ea5386944e67e75da4e88712272ed251cad93761..0992b9946c6fd1b931ce19bbbb511465844ea3d1 100644 (file)
@@ -37,16 +37,15 @@ SECTIONS
        KPROBES_TEXT
        *(.fixup)
        *(.gnu.warning)
-       } :text = 0x9090
-                               /* out-of-line lock text */
-  .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
-
-  _etext = .;                  /* End of text section */
+       _etext = .;                     /* End of text section */
+  } :text = 0x9090
 
   . = ALIGN(16);               /* Exception table */
-  __start___ex_table = .;
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-  __stop___ex_table = .;
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+       __start___ex_table = .;
+        *(__ex_table)
+       __stop___ex_table = .;
+  }
 
   NOTES :text :note
 
@@ -179,6 +178,14 @@ SECTIONS
   }
   __con_initcall_end = .;
   SECURITY_INIT
+
+  . = ALIGN(8);
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+  __parainstructions = .;
+       *(.parainstructions)
+  __parainstructions_end = .;
+  }
+
   . = ALIGN(8);
   __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
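The new .parainstructions output section gathers one patch-site record per paravirt call site between the __parainstructions and __parainstructions_end symbols, so early boot code can walk the whole range and let the active hypervisor backend rewrite each site. A minimal sketch of such a walk, assuming a simplified record layout and a hypothetical apply_one() callback (the real kernel keeps struct paravirt_patch_site entries there and hands them to apply_paravirt()):

/* Simplified stand-in for the records placed in .parainstructions. */
struct patch_site {
        void *addr;             /* call site to rewrite */
        unsigned char type;     /* which pv_ops slot it calls */
        unsigned char len;      /* bytes available at the site */
};

/* Start/end symbols defined by the linker script section above. */
extern struct patch_site __parainstructions[], __parainstructions_end[];

static void patch_all_sites(unsigned (*apply_one)(unsigned char type,
                                                  void *addr, unsigned len))
{
        struct patch_site *site;

        for (site = __parainstructions; site < __parainstructions_end; site++)
                apply_one(site->type, site->addr, site->len);
}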
index 414caf0c5f9ae96b67bafea7589b532fa821b361..d971210a6d367d4d171338279e44eb2ce44cef7d 100644 (file)
@@ -25,21 +25,24 @@ static int __init vsmp_init(void)
                return 0;
 
        /* Check if we are running on a ScaleMP vSMP box */
-       if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
-           (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
+       if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) !=
+            PCI_VENDOR_ID_SCALEMP) ||
+           (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) !=
+            PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
                return 0;
 
        /* set vSMP magic bits to indicate vSMP capable kernel */
        address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
        cap = readl(address);
        ctl = readl(address + 4);
-       printk("vSMP CTL: capabilities:0x%08x  control:0x%08x\n", cap, ctl);
+       printk(KERN_INFO "vSMP CTL: capabilities:0x%08x  control:0x%08x\n",
+              cap, ctl);
        if (cap & ctl & (1 << 4)) {
                /* Turn on vSMP IRQ fastpath handling (see system.h) */
                ctl &= ~(1 << 4);
                writel(ctl, address + 4);
                ctl = readl(address + 4);
-               printk("vSMP CTL: control set to:0x%08x\n", ctl);
+               printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
        }
 
        iounmap(address);
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
deleted file mode 100644 (file)
index a5ab3dc..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <linux/init.h>
-
-__INITDATA
-
-       .globl vsyscall_int80_start, vsyscall_int80_end
-vsyscall_int80_start:
-       .incbin "arch/x86/kernel/vsyscall-int80_32.so"
-vsyscall_int80_end:
-
-       .globl vsyscall_sysenter_start, vsyscall_sysenter_end
-vsyscall_sysenter_start:
-       .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
-vsyscall_sysenter_end:
-
-__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
deleted file mode 100644 (file)
index 4a8b0ed..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
- * object prelinked to its virtual address, and with only one read-only
- * segment (that fits in one page).  This script controls its layout.
- */
-#include <asm/asm-offsets.h>
-
-SECTIONS
-{
-  . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
-
-  .hash           : { *(.hash) }               :text
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-
-  /* This linker script is used both with -r and with -shared.
-     For the layouts to match, we need to skip more than enough
-     space for the dynamic symbol table et al.  If this amount
-     is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VDSO_PRELINK_asm + 0x400;
-
-  .text           : { *(.text) }               :text =0x90909090
-  .note                  : { *(.note.*) }              :text :note
-  .eh_frame_hdr   : { *(.eh_frame_hdr) }       :text :eh_frame_hdr
-  .eh_frame       : { KEEP (*(.eh_frame)) }    :text
-  .dynamic        : { *(.dynamic) }            :text :dynamic
-  .useless        : {
-       *(.got.plt) *(.got)
-       *(.data .data.* .gnu.linkonce.d.*)
-       *(.dynbss)
-       *(.bss .bss.* .gnu.linkonce.b.*)
-  }                                            :text
-}
-
-/*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
- */
-PHDRS
-{
-  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
-  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
-  note PT_NOTE FLAGS(4); /* PF_R */
-  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
-}
-
-/*
- * This controls what symbols we export from the DSO.
- */
-VERSION
-{
-  LINUX_2.5 {
-    global:
-       __kernel_vsyscall;
-       __kernel_sigreturn;
-       __kernel_rt_sigreturn;
-
-    local: *;
-  };
-}
-
-/* The ELF entry point can be used to set the AT_SYSINFO value.  */
-ENTRY(__kernel_vsyscall);
index ad4005c6d4a1113d47c289a06d46328743fd2b2b..3f82427745802f0e30ff19ff5ed5aee7e50d7fa2 100644 (file)
@@ -43,7 +43,7 @@
 #include <asm/vgtod.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define __syscall_clobber "r11","rcx","memory"
+#define __syscall_clobber "r11","cx","memory"
 #define __pa_vsymbol(x)                        \
        ({unsigned long v;              \
        extern char __vsyscall_0;       \
@@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t)
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-       unsigned int dummy, p;
+       unsigned int p;
        unsigned long j = 0;
 
        /* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
                p = tcache->blob[1];
        } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
                /* Load per CPU data from RDTSCP */
-               rdtscp(dummy, dummy, p);
+               native_read_tscp(&p);
        } else {
                /* Load per CPU data from GDT */
                asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
        /* Store cpu number in limit so that it can be loaded quickly
           in user space in vgetcpu.
           12 bits for the CPU and 8 bits for the node. */
-       d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+       d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
        *d = 0x0f40000000000ULL;
        *d |= cpu;
        *d |= (node & 0xf) << 12;
@@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
        return NOTIFY_DONE;
 }
 
-static void __init map_vsyscall(void)
+void __init map_vsyscall(void)
 {
        extern char __vsyscall_0;
        unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
@@ -335,7 +335,6 @@ static int __init vsyscall_init(void)
        BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
        BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
        BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-       map_vsyscall();
 #ifdef CONFIG_SYSCTL
        register_sysctl_table(kernel_root_table2);
 #endif
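vgetcpu() above relies on the CPU and node numbers having been packed into a single per-CPU value at boot: either the RDTSCP auxiliary value read through native_read_tscp(), or the segment limit of the GDT_ENTRY_PER_CPU descriptor loaded with LSL. Per the vsyscall_set_cpu() comment, the low 12 bits carry the CPU and the bits above carry the node. A minimal sketch of that pack/unpack; the helper names are illustrative, and in the GDT-limit variant part of the node may live in bits not shown in this hunk:

/* 12 bits for the CPU, node in the bits above, as described in vsyscall_set_cpu(). */
static inline unsigned int pack_cpu_node(unsigned int cpu, unsigned int node)
{
        return (cpu & 0xfff) | ((node & 0xff) << 12);
}

static inline void unpack_cpu_node(unsigned int p,
                                   unsigned int *cpu, unsigned int *node)
{
        if (cpu)
                *cpu = p & 0xfff;
        if (node)
                *node = (p >> 12) & 0xff;
}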
index 77c25b307635f6c521506b04f51d728ff7ac1b13..a66e9c1a05373d3c53870abdd96c9b7cf03d7d0f 100644 (file)
@@ -8,6 +8,7 @@
 #include <asm/processor.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/desc.h>
 
 EXPORT_SYMBOL(kernel_thread);
 
@@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic);
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
-#ifdef CONFIG_SMP
-extern void  __write_lock_failed(rwlock_t *rw);
-extern void  __read_lock_failed(rwlock_t *rw);
-EXPORT_SYMBOL(__write_lock_failed);
-EXPORT_SYMBOL(__read_lock_failed);
-#endif
-
 /* Export string functions. We normally rely on gcc builtins for most of these,
    but gcc sometimes decides not to inline them. */
 #undef memcpy
@@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt);
 EXPORT_SYMBOL(load_gs_index);
 
 EXPORT_SYMBOL(_proxy_pda);
+
+#ifdef CONFIG_PARAVIRT
+/* Virtualized guests may want to use it */
+EXPORT_SYMBOL_GPL(cpu_gdt_descr);
+#endif
similarity index 94%
rename from drivers/kvm/Kconfig
rename to arch/x86/kvm/Kconfig
index 656920636cb2f6dafba5ffbd0dc41c7c177e601e..c83e1c9b5129b3cc96ecfcb8992e378cd7d4119e 100644 (file)
@@ -1,9 +1,12 @@
 #
 # KVM configuration
 #
+config HAVE_KVM
+       bool
+
 menuconfig VIRTUALIZATION
        bool "Virtualization"
-       depends on X86
+       depends on HAVE_KVM || X86
        default y
        ---help---
          Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
-       depends on X86 && EXPERIMENTAL
+       depends on HAVE_KVM && EXPERIMENTAL
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        ---help---
similarity index 51%
rename from drivers/kvm/Makefile
rename to arch/x86/kvm/Makefile
index e5a8f4d3e97386f0ba73e629b8d2a13341fc5dd0..ffdd0b310784059527a4837ca040400c170db0d2 100644 (file)
@@ -2,7 +2,11 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
similarity index 98%
rename from drivers/kvm/i8259.c
rename to arch/x86/kvm/i8259.c
index a679157bc599ae0a54979ca9e75f9a4e6ff610dc..ab29cf2def47cc0180c903491ef71ea52684934f 100644 (file)
@@ -28,6 +28,8 @@
 #include <linux/mm.h>
 #include "irq.h"
 
+#include <linux/kvm_host.h>
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
        return intno;
 }
 
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-       struct kvm_kpic_state *s = opaque;
-
        s->last_irr = 0;
        s->irr = 0;
        s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
        addr &= 1;
        if (addr == 0) {
                if (val & 0x10) {
-                       pic_reset(s);   /* init */
+                       kvm_pic_reset(s);       /* init */
                        /*
                         * deassert a pending interrupt
                         */
similarity index 81%
rename from drivers/kvm/irq.c
rename to arch/x86/kvm/irq.c
index 7628c7ff628ff1db90f3a9a4d20df792bb59f4ad..e5714759e97fcee29989eda9a262fef7cd5183fb 100644 (file)
@@ -20,8 +20,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/kvm_host.h>
 
-#include "kvm.h"
 #include "irq.h"
 
 /*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-       printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-       int ipi_pcpu = vcpu->cpu;
-
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
-               ++vcpu->stat.halt_wakeup;
-       }
-       if (vcpu->guest_mode)
-               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
        kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644 (file)
index 0000000..fa5ed5d
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+       u8 last_irr;    /* edge detection */
+       u8 irr;         /* interrupt request register */
+       u8 imr;         /* interrupt mask register */
+       u8 isr;         /* interrupt service register */
+       u8 priority_add;        /* highest irq priority */
+       u8 irq_base;
+       u8 read_reg_select;
+       u8 poll;
+       u8 special_mask;
+       u8 init_state;
+       u8 auto_eoi;
+       u8 rotate_on_auto_eoi;
+       u8 special_fully_nested_mode;
+       u8 init4;               /* true if 4 byte init */
+       u8 elcr;                /* PIIX edge/trigger selection */
+       u8 elcr_mask;
+       struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+       struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+       irq_request_func *irq_request;
+       void *irq_request_opaque;
+       int output;             /* intr from master PIC */
+       struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+       return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
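Everything declared here is only meaningful when the interrupt controller is emulated inside the kernel, which is exactly what irqchip_in_kernel() tests by checking whether a PIC was created for the VM. A minimal usage sketch, assuming this header is included; example_set_irq() is purely illustrative and the fallback path is left to userspace:

/* Illustrative only: route an IRQ through the in-kernel PIC when present. */
static void example_set_irq(struct kvm *kvm, int irq, int level)
{
        if (irqchip_in_kernel(kvm))
                kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
        /* else: userspace models the interrupt controller and injects itself */
}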
similarity index 96%
rename from drivers/kvm/kvm_svm.h
rename to arch/x86/kvm/kvm_svm.h
index a0e415daef5b0142ba00cf2b5cec9b9e12b53d5b..ecdfe97e4635393eb0df2ba755ffd0150443e7dd 100644 (file)
@@ -4,10 +4,10 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kvm_host.h>
 #include <asm/msr.h>
 
 #include "svm.h"
-#include "kvm.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
similarity index 83%
rename from drivers/kvm/lapic.c
rename to arch/x86/kvm/lapic.c
index 238fcad3ceceee4fc32f8ebc2396ed455b4107ae..2cbee9479ce423850a99df39290e3efbbe51ae48 100644 (file)
@@ -17,7 +17,7 @@
  * the COPYING file in the top-level directory.
  */
 
-#include "kvm.h"
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -56,6 +56,7 @@
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
        return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 
 static inline int apic_hw_enabled(struct kvm_lapic *apic)
 {
-       return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+       return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
 }
 
 static inline int  apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
        if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
 {
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
        if (!apic_test_and_set_irr(vec, apic)) {
                /* a new pending irq is set in IRR */
                if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                           int short_hand, int dest, int dest_mode)
 {
        int result = 0;
-       struct kvm_lapic *target = vcpu->apic;
+       struct kvm_lapic *target = vcpu->arch.apic;
 
        apic_debug("target %p, source %p, dest 0x%x, "
                   "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                } else
                        apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-               if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
                        kvm_vcpu_kick(vcpu);
-               else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
-                       vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+               else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+                       vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
        case APIC_DM_INIT:
                if (level) {
-                       if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+                       if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
                                printk(KERN_DEBUG
                                       "INIT on a runnable vcpu %d\n",
                                       vcpu->vcpu_id);
-                       vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+                       vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
                        kvm_vcpu_kick(vcpu);
                } else {
                        printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_STARTUP:
                printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
                       vcpu->vcpu_id, vector);
-               if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-                       vcpu->sipi_vector = vector;
-                       vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+               if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+                       vcpu->arch.sipi_vector = vector;
+                       vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
                        if (waitqueue_active(&vcpu->wq))
                                wake_up_interruptible(&vcpu->wq);
                }
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        return result;
 }
 
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
                                       unsigned long bitmap)
 {
-       int vcpu_id;
        int last;
        int next;
-       struct kvm_lapic *apic;
+       struct kvm_lapic *apic = NULL;
 
-       last = kvm->round_robin_prev_vcpu;
+       last = kvm->arch.round_robin_prev_vcpu;
        next = last;
 
        do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
                        next = 0;
                if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
                        continue;
-               apic = kvm->vcpus[next]->apic;
+               apic = kvm->vcpus[next]->arch.apic;
                if (apic && apic_enabled(apic))
                        break;
                apic = NULL;
        } while (next != last);
-       kvm->round_robin_prev_vcpu = next;
+       kvm->arch.round_robin_prev_vcpu = next;
 
-       if (!apic) {
-               vcpu_id = ffs(bitmap) - 1;
-               if (vcpu_id < 0) {
-                       vcpu_id = 0;
-                       printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-               }
-               apic = kvm->vcpus[vcpu_id]->apic;
-       }
+       if (!apic)
+               printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
 
        return apic;
 }
 
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+               unsigned long bitmap)
+{
+       struct kvm_lapic *apic;
+
+       apic = kvm_apic_round_robin(kvm, vector, bitmap);
+       if (apic)
+               return apic->vcpu;
+       return NULL;
+}
+
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
        int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
        unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
        unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-       struct kvm_lapic *target;
+       struct kvm_vcpu *target;
        struct kvm_vcpu *vcpu;
        unsigned long lpr_map = 0;
        int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
                if (!vcpu)
                        continue;
 
-               if (vcpu->apic &&
+               if (vcpu->arch.apic &&
                    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
                        if (delivery_mode == APIC_DM_LOWEST)
                                set_bit(vcpu->vcpu_id, &lpr_map);
                        else
-                               __apic_accept_irq(vcpu->apic, delivery_mode,
+                               __apic_accept_irq(vcpu->arch.apic, delivery_mode,
                                                  vector, level, trig_mode);
                }
        }
 
        if (delivery_mode == APIC_DM_LOWEST) {
-               target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+               target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
                if (target != NULL)
-                       __apic_accept_irq(target, delivery_mode,
+                       __apic_accept_irq(target->arch.apic, delivery_mode,
                                          vector, level, trig_mode);
        }
 }
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
        return tmcct;
 }
 
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       struct kvm_run *run = vcpu->run;
+
+       set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+       kvm_x86_ops->cache_regs(vcpu);
+       run->tpr_access.rip = vcpu->arch.rip;
+       run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+       if (apic->vcpu->arch.tpr_access_reporting)
+               __report_tpr_access(apic, write);
+}
+
 static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 {
        u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
                val = apic_get_tmcct(apic);
                break;
 
+       case APIC_TASKPRI:
+               report_tpr_access(apic, false);
+               /* fall thru */
        default:
                apic_update_ppr(apic);
                val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
                break;
 
        case APIC_TASKPRI:
+               report_tpr_access(apic, true);
                apic_set_tpr(apic, val & 0xff);
                break;
 
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
        return ret;
 }
 
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
-       if (!apic)
+       if (!vcpu->arch.apic)
                return;
 
-       hrtimer_cancel(&apic->timer.dev);
+       hrtimer_cancel(&vcpu->arch.apic->timer.dev);
 
-       if (apic->regs_page) {
-               __free_page(apic->regs_page);
-               apic->regs_page = 0;
-       }
+       if (vcpu->arch.apic->regs_page)
+               __free_page(vcpu->arch.apic->regs_page);
 
-       kfree(apic);
+       kfree(vcpu->arch.apic);
 }
 
 /*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
 
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (!apic)
                return;
-       apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+       apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+                    | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        u64 tpr;
 
        if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (!apic) {
                value |= MSR_IA32_APICBASE_BSP;
-               vcpu->apic_base = value;
+               vcpu->arch.apic_base = value;
                return;
        }
        if (apic->vcpu->vcpu_id)
                value &= ~MSR_IA32_APICBASE_BSP;
 
-       vcpu->apic_base = value;
-       apic->base_address = apic->vcpu->apic_base &
+       vcpu->arch.apic_base = value;
+       apic->base_address = apic->vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
 
        /* with FSB delivery interrupt, we can restart APIC functionality */
        apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-                  "0x%lx.\n", apic->apic_base, apic->base_address);
+                  "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
 
 }
 
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
 {
-       return vcpu->apic_base;
+       return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
 
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        apic_debug("%s\n", __FUNCTION__);
 
        ASSERT(vcpu);
-       apic = vcpu->apic;
+       apic = vcpu->arch.apic;
        ASSERT(apic != NULL);
 
        /* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        update_divide_count(apic);
        atomic_set(&apic->timer.pending, 0);
        if (vcpu->vcpu_id == 0)
-               vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+               vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        apic_update_ppr(apic);
 
        apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
                   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
                   vcpu, kvm_apic_id(apic),
-                  vcpu->apic_base, apic->base_address);
+                  vcpu->arch.apic_base, apic->base_address);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int ret = 0;
 
        if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
        wait_queue_head_t *q = &apic->vcpu->wq;
 
        atomic_inc(&apic->timer.pending);
-       if (waitqueue_active(q))
-       {
-               apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+       if (waitqueue_active(q)) {
+               apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
                wake_up_interruptible(q);
        }
        if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        if (!apic)
                goto nomem;
 
-       vcpu->apic = apic;
+       vcpu->arch.apic = apic;
 
        apic->regs_page = alloc_page(GFP_KERNEL);
        if (apic->regs_page == NULL) {
                printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
                       vcpu->vcpu_id);
-               goto nomem;
+               goto nomem_free_apic;
        }
        apic->regs = page_address(apic->regs_page);
        memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        apic->timer.dev.function = apic_timer_fn;
        apic->base_address = APIC_DEFAULT_PHYS_BASE;
-       vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+       vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
        kvm_lapic_reset(vcpu);
        apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        apic->dev.private = apic;
 
        return 0;
+nomem_free_apic:
+       kfree(apic);
 nomem:
-       kvm_free_apic(apic);
        return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(kvm_create_lapic);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
        if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
-       u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+       u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
        int r = 0;
 
        if (vcpu->vcpu_id == 0) {
-               if (!apic_hw_enabled(vcpu->apic))
+               if (!apic_hw_enabled(vcpu->arch.apic))
                        r = 1;
                if ((lvt0 & APIC_LVT_MASKED) == 0 &&
                    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
                atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 
 void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
                apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
        int vector = kvm_apic_has_interrupt(vcpu);
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
        if (vector == -1)
                return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       apic->base_address = vcpu->apic_base &
+       apic->base_address = vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
        apic_set_reg(apic, APIC_LVR, APIC_VERSION);
        apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
        start_apic_timer(apic);
 }
 
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvm_lapic *apic = vcpu->apic;
+       struct kvm_lapic *apic = vcpu->arch.apic;
        struct hrtimer *timer;
 
        if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
        if (hrtimer_cancel(timer))
                hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 }
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+       u32 data;
+       void *vapic;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+               return;
+
+       vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+       data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+       kunmap_atomic(vapic, KM_USER0);
+
+       apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+       u32 data, tpr;
+       int max_irr, max_isr;
+       struct kvm_lapic *apic;
+       void *vapic;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+               return;
+
+       apic = vcpu->arch.apic;
+       tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+       max_irr = apic_find_highest_irr(apic);
+       if (max_irr < 0)
+               max_irr = 0;
+       max_isr = apic_find_highest_isr(apic);
+       if (max_isr < 0)
+               max_isr = 0;
+       data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+       vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+       *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+       kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return;
+
+       vcpu->arch.apic->vapic_addr = vapic_addr;
+}
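kvm_lapic_sync_to_vapic() above condenses the APIC's interrupt state into one 32-bit word that the guest can read without exiting: the task priority in the low byte, the high nibble of the highest in-service vector in bits 8-15, and the highest pending vector in the top byte. A minimal sketch of that packing and the matching unpack, with illustrative helper names and the kernel's u32 type assumed:

#include <linux/types.h>

static inline u32 vapic_pack(u32 tpr, int max_isr, int max_irr)
{
        if (max_isr < 0)
                max_isr = 0;
        if (max_irr < 0)
                max_irr = 0;
        /* Same expression as in kvm_lapic_sync_to_vapic(). */
        return (tpr & 0xff) | ((max_isr & 0xf0) << 8) | ((u32)max_irr << 24);
}

static inline u32 vapic_tpr(u32 data)    { return data & 0xff; }        /* low byte */
static inline u32 vapic_isr_hi(u32 data) { return (data >> 8) & 0xf0; } /* ISR high nibble */
static inline u32 vapic_irr(u32 data)    { return data >> 24; }         /* top byte */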
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644 (file)
index 0000000..676c396
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+       unsigned long base_address;
+       struct kvm_io_device dev;
+       struct {
+               atomic_t pending;
+               s64 period;     /* unit: ns */
+               u32 divide_count;
+               ktime_t last_update;
+               struct hrtimer dev;
+       } timer;
+       struct kvm_vcpu *vcpu;
+       struct page *regs_page;
+       void *regs;
+       gpa_t vapic_addr;
+       struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644 (file)
index 0000000..8efdcdb
--- /dev/null
@@ -0,0 +1,1885 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x)                                                      \
+       if (!(x)) {                                                     \
+               printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
+                      __FILE__, __LINE__, #x);                         \
+       }
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+       (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+               (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+               (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+               (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+                       | PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+       u64 *shadow_ptes[RMAP_EXT];
+       struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+       shadow_trap_nonpresent_pte = trap_pte;
+       shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+       return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+       return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+       pte &= ~PT_SHADOW_IO_MARK;
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+       return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+       return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+       return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+       return pte != shadow_trap_nonpresent_pte
+               && pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+       return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+       set_64bit((unsigned long *)sptep, spte);
+#else
+       set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+                                 struct kmem_cache *base_cache, int min)
+{
+       void *obj;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+               if (!obj)
+                       return -ENOMEM;
+               cache->objects[cache->nobjs++] = obj;
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+                                      int min)
+{
+       struct page *page;
+
+       if (cache->nobjs >= min)
+               return 0;
+       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+               page = alloc_page(GFP_KERNEL);
+               if (!page)
+                       return -ENOMEM;
+               set_page_private(page, 0);
+               cache->objects[cache->nobjs++] = page_address(page);
+       }
+       return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+       while (mc->nobjs)
+               free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+                                  pte_chain_cache, 4);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+                                  rmap_desc_cache, 1);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+       if (r)
+               goto out;
+       r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+                                  mmu_page_header_cache, 4);
+out:
+       return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+       mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+       mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+       mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+                                   size_t size)
+{
+       void *p;
+
+       BUG_ON(!mc->nobjs);
+       p = mc->objects[--mc->nobjs];
+       memset(p, 0, size);
+       return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+                                     sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+       kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+       return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+                                     sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+       kfree(rd);
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function gets called
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       slot = gfn_to_memslot(kvm, gfn);
+       return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, then (*rmapp & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ */
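+/*
+ * Illustrative sketch of the encoding used by rmap_add() below:
+ *
+ *   one mapping:    *rmapp = (unsigned long)spte;       bit 0 clear
+ *   many mappings:  *rmapp = (unsigned long)desc | 1;   bit 0 set,
+ *                   desc->shadow_ptes[] holds the sptes and desc->more
+ *                   chains further descriptors.
+ */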
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_rmap_desc *desc;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       gfn = unalias_gfn(vcpu->kvm, gfn);
+       sp = page_header(__pa(spte));
+       sp->gfns[spte - sp->spt] = gfn;
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+       if (!*rmapp) {
+               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+               *rmapp = (unsigned long)spte;
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+               desc = mmu_alloc_rmap_desc(vcpu);
+               desc->shadow_ptes[0] = (u64 *)*rmapp;
+               desc->shadow_ptes[1] = spte;
+               *rmapp = (unsigned long)desc | 1;
+       } else {
+               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+                       desc = desc->more;
+               if (desc->shadow_ptes[RMAP_EXT-1]) {
+                       desc->more = mmu_alloc_rmap_desc(vcpu);
+                       desc = desc->more;
+               }
+               for (i = 0; desc->shadow_ptes[i]; ++i)
+                       ;
+               desc->shadow_ptes[i] = spte;
+       }
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+                                  struct kvm_rmap_desc *desc,
+                                  int i,
+                                  struct kvm_rmap_desc *prev_desc)
+{
+       int j;
+
+       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+               ;
+       desc->shadow_ptes[i] = desc->shadow_ptes[j];
+       desc->shadow_ptes[j] = NULL;
+       if (j != 0)
+               return;
+       if (!prev_desc && !desc->more)
+               *rmapp = (unsigned long)desc->shadow_ptes[0];
+       else
+               if (prev_desc)
+                       prev_desc->more = desc->more;
+               else
+                       *rmapp = (unsigned long)desc->more | 1;
+       mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       struct kvm_mmu_page *sp;
+       struct page *page;
+       unsigned long *rmapp;
+       int i;
+
+       if (!is_rmap_pte(*spte))
+               return;
+       sp = page_header(__pa(spte));
+       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+       mark_page_accessed(page);
+       if (is_writeble_pte(*spte))
+               kvm_release_page_dirty(page);
+       else
+               kvm_release_page_clean(page);
+       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+       if (!*rmapp) {
+               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+               BUG();
+       } else if (!(*rmapp & 1)) {
+               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
+               if ((u64 *)*rmapp != spte) {
+                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
+                              spte, *spte);
+                       BUG();
+               }
+               *rmapp = 0;
+       } else {
+               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
+               desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+               prev_desc = NULL;
+               while (desc) {
+                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+                               if (desc->shadow_ptes[i] == spte) {
+                                       rmap_desc_remove_entry(rmapp,
+                                                              desc, i,
+                                                              prev_desc);
+                                       return;
+                               }
+                       prev_desc = desc;
+                       desc = desc->more;
+               }
+               BUG();
+       }
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+       struct kvm_rmap_desc *desc;
+       struct kvm_rmap_desc *prev_desc;
+       u64 *prev_spte;
+       int i;
+
+       if (!*rmapp)
+               return NULL;
+       else if (!(*rmapp & 1)) {
+               if (!spte)
+                       return (u64 *)*rmapp;
+               return NULL;
+       }
+       desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+       prev_desc = NULL;
+       prev_spte = NULL;
+       while (desc) {
+               for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+                       if (prev_spte == spte)
+                               return desc->shadow_ptes[i];
+                       prev_spte = desc->shadow_ptes[i];
+               }
+               desc = desc->more;
+       }
+       return NULL;
+}
+
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+       unsigned long *rmapp;
+       u64 *spte;
+       int write_protected = 0;
+
+       gfn = unalias_gfn(kvm, gfn);
+       rmapp = gfn_to_rmap(kvm, gfn);
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               BUG_ON(!spte);
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+               if (is_writeble_pte(*spte)) {
+                       set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+                       write_protected = 1;
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+       if (write_protected)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+       u64 *pos;
+       u64 *end;
+
+       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+               if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
+                       printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+                              pos, *pos);
+                       return 0;
+               }
+       return 1;
+}
+#endif
+
+static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       ASSERT(is_empty_shadow_page(sp->spt));
+       list_del(&sp->link);
+       __free_page(virt_to_page(sp->spt));
+       __free_page(virt_to_page(sp->gfns));
+       kfree(sp);
+       ++kvm->arch.n_free_mmu_pages;
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+       return gfn;
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+                                              u64 *parent_pte)
+{
+       struct kvm_mmu_page *sp;
+
+       sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
+       sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+       set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+       list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       ASSERT(is_empty_shadow_page(sp->spt));
+       sp->slot_bitmap = 0;
+       sp->multimapped = 0;
+       sp->parent_pte = parent_pte;
+       --vcpu->kvm->arch.n_free_mmu_pages;
+       return sp;
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!parent_pte)
+               return;
+       if (!sp->multimapped) {
+               u64 *old = sp->parent_pte;
+
+               if (!old) {
+                       sp->parent_pte = parent_pte;
+                       return;
+               }
+               sp->multimapped = 1;
+               pte_chain = mmu_alloc_pte_chain(vcpu);
+               INIT_HLIST_HEAD(&sp->parent_ptes);
+               hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+               pte_chain->parent_ptes[0] = old;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
+               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+                       continue;
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+                       if (!pte_chain->parent_ptes[i]) {
+                               pte_chain->parent_ptes[i] = parent_pte;
+                               return;
+                       }
+       }
+       pte_chain = mmu_alloc_pte_chain(vcpu);
+       BUG_ON(!pte_chain);
+       hlist_add_head(&pte_chain->link, &sp->parent_ptes);
+       pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+                                      u64 *parent_pte)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!sp->multimapped) {
+               BUG_ON(sp->parent_pte != parent_pte);
+               sp->parent_pte = NULL;
+               return;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                       if (!pte_chain->parent_ptes[i])
+                               break;
+                       if (pte_chain->parent_ptes[i] != parent_pte)
+                               continue;
+                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
+                               && pte_chain->parent_ptes[i + 1]) {
+                               pte_chain->parent_ptes[i]
+                                       = pte_chain->parent_ptes[i + 1];
+                               ++i;
+                       }
+                       pte_chain->parent_ptes[i] = NULL;
+                       if (i == 0) {
+                               hlist_del(&pte_chain->link);
+                               mmu_free_pte_chain(pte_chain);
+                               if (hlist_empty(&sp->parent_ptes)) {
+                                       sp->multimapped = 0;
+                                       sp->parent_pte = NULL;
+                               }
+                       }
+                       return;
+               }
+       BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: found role %x\n",
+                                __FUNCTION__, sp->role.word);
+                       return sp;
+               }
+       return NULL;
+}
+
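+/*
+ * Look up (or create) the shadow page for a given gfn and role.  Existing
+ * pages are found via the gfn hash; for 32-bit guests, role.quadrant
+ * distinguishes the parts of a guest page table that are spread over
+ * several shadow pages.
+ */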
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+                                            gfn_t gfn,
+                                            gva_t gaddr,
+                                            unsigned level,
+                                            int metaphysical,
+                                            unsigned access,
+                                            u64 *parent_pte,
+                                            bool *new_page)
+{
+       union kvm_mmu_page_role role;
+       unsigned index;
+       unsigned quadrant;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node;
+
+       role.word = 0;
+       role.glevels = vcpu->arch.mmu.root_level;
+       role.level = level;
+       role.metaphysical = metaphysical;
+       role.access = access;
+       if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+               quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+               role.quadrant = quadrant;
+       }
+       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+                gfn, role.word);
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry(sp, node, bucket, hash_link)
+               if (sp->gfn == gfn && sp->role.word == role.word) {
+                       mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+                       pgprintk("%s: found\n", __FUNCTION__);
+                       return sp;
+               }
+       ++vcpu->kvm->stat.mmu_cache_miss;
+       sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+       if (!sp)
+               return sp;
+       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+       sp->gfn = gfn;
+       sp->role = role;
+       hlist_add_head(&sp->hash_link, bucket);
+       vcpu->arch.mmu.prefetch_page(vcpu, sp);
+       if (!metaphysical)
+               rmap_write_protect(vcpu->kvm, gfn);
+       if (new_page)
+               *new_page = 1;
+       return sp;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+                                        struct kvm_mmu_page *sp)
+{
+       unsigned i;
+       u64 *pt;
+       u64 ent;
+
+       pt = sp->spt;
+
+       if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       if (is_shadow_present_pte(pt[i]))
+                               rmap_remove(kvm, &pt[i]);
+                       pt[i] = shadow_trap_nonpresent_pte;
+               }
+               kvm_flush_remote_tlbs(kvm);
+               return;
+       }
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               ent = pt[i];
+
+               pt[i] = shadow_trap_nonpresent_pte;
+               if (!is_shadow_present_pte(ent))
+                       continue;
+               ent &= PT64_BASE_ADDR_MASK;
+               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+       }
+       kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+       mmu_page_remove_parent_pte(sp, parent_pte);
+}
+
+static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
+{
+       int i;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm->vcpus[i]->arch.last_pte_updated = NULL;
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       u64 *parent_pte;
+
+       ++kvm->stat.mmu_shadow_zapped;
+       while (sp->multimapped || sp->parent_pte) {
+               if (!sp->multimapped)
+                       parent_pte = sp->parent_pte;
+               else {
+                       struct kvm_pte_chain *chain;
+
+                       chain = container_of(sp->parent_ptes.first,
+                                            struct kvm_pte_chain, link);
+                       parent_pte = chain->parent_ptes[0];
+               }
+               BUG_ON(!parent_pte);
+               kvm_mmu_put_page(sp, parent_pte);
+               set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+       }
+       kvm_mmu_page_unlink_children(kvm, sp);
+       if (!sp->root_count) {
+               hlist_del(&sp->hash_link);
+               kvm_mmu_free_page(kvm, sp);
+       } else
+               list_move(&sp->link, &kvm->arch.active_mmu_pages);
+       kvm_mmu_reset_last_pte_updated(kvm);
+}
+
+/*
+ * Change the number of mmu pages allocated to the vm.
+ * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
+{
+       /*
+        * If we set the number of mmu pages to be smaller than the
+        * number of active pages, we must free some mmu pages before we
+        * change the value.
+        */
+
+       if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
+           kvm_nr_mmu_pages) {
+               int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
+                                      - kvm->arch.n_free_mmu_pages;
+
+               while (n_used_mmu_pages > kvm_nr_mmu_pages) {
+                       struct kvm_mmu_page *page;
+
+                       page = container_of(kvm->arch.active_mmu_pages.prev,
+                                           struct kvm_mmu_page, link);
+                       kvm_mmu_zap_page(kvm, page);
+                       n_used_mmu_pages--;
+               }
+               kvm->arch.n_free_mmu_pages = 0;
+       }
+       else
+               kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
+                                        - kvm->arch.n_alloc_mmu_pages;
+
+       kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+}
+
+static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       int r;
+
+       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+       r = 0;
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
+               if (sp->gfn == gfn && !sp->role.metaphysical) {
+                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+                                sp->role.word);
+                       kvm_mmu_zap_page(kvm, sp);
+                       r = 1;
+               }
+       return r;
+}
+
+static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_mmu_page *sp;
+
+       while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
+               pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+               kvm_mmu_zap_page(kvm, sp);
+       }
+}
+
+static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
+{
+       int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
+       struct kvm_mmu_page *sp = page_header(__pa(pte));
+
+       __set_bit(slot, &sp->slot_bitmap);
+}
+
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+
+       if (gpa == UNMAPPED_GVA)
+               return NULL;
+       return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
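+/*
+ * Compute and install a shadow pte for a guest mapping.  Write permission
+ * may be dropped here if the target gfn is itself a shadowed page table, so
+ * that guest writes to it keep trapping into the mmu.
+ */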
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+                        unsigned pt_access, unsigned pte_access,
+                        int user_fault, int write_fault, int dirty,
+                        int *ptwrite, gfn_t gfn, struct page *page)
+{
+       u64 spte;
+       int was_rmapped = is_rmap_pte(*shadow_pte);
+       int was_writeble = is_writeble_pte(*shadow_pte);
+
+       pgprintk("%s: spte %llx access %x write_fault %d"
+                " user_fault %d gfn %lx\n",
+                __FUNCTION__, *shadow_pte, pt_access,
+                write_fault, user_fault, gfn);
+
+       /*
+        * We don't set the accessed bit, since we sometimes want to see
+        * whether the guest actually used the pte (in order to detect
+        * demand paging).
+        */
+       spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+       if (!dirty)
+               pte_access &= ~ACC_WRITE_MASK;
+       if (!(pte_access & ACC_EXEC_MASK))
+               spte |= PT64_NX_MASK;
+
+       spte |= PT_PRESENT_MASK;
+       if (pte_access & ACC_USER_MASK)
+               spte |= PT_USER_MASK;
+
+       if (is_error_page(page)) {
+               set_shadow_pte(shadow_pte,
+                              shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+               kvm_release_page_clean(page);
+               return;
+       }
+
+       spte |= page_to_phys(page);
+
+       if ((pte_access & ACC_WRITE_MASK)
+           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+               struct kvm_mmu_page *shadow;
+
+               spte |= PT_WRITABLE_MASK;
+               if (user_fault) {
+                       mmu_unshadow(vcpu->kvm, gfn);
+                       goto unshadowed;
+               }
+
+               shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+               if (shadow) {
+                       pgprintk("%s: found shadow page for %lx, marking ro\n",
+                                __FUNCTION__, gfn);
+                       pte_access &= ~ACC_WRITE_MASK;
+                       if (is_writeble_pte(spte)) {
+                               spte &= ~PT_WRITABLE_MASK;
+                               kvm_x86_ops->tlb_flush(vcpu);
+                       }
+                       if (write_fault)
+                               *ptwrite = 1;
+               }
+       }
+
+unshadowed:
+
+       if (pte_access & ACC_WRITE_MASK)
+               mark_page_dirty(vcpu->kvm, gfn);
+
+       pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+       set_shadow_pte(shadow_pte, spte);
+       page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+       if (!was_rmapped) {
+               rmap_add(vcpu, shadow_pte, gfn);
+               if (!is_rmap_pte(*shadow_pte))
+                       kvm_release_page_clean(page);
+       } else {
+               if (was_writeble)
+                       kvm_release_page_dirty(page);
+               else
+                       kvm_release_page_clean(page);
+       }
+       if (!ptwrite || !*ptwrite)
+               vcpu->arch.last_pte_updated = shadow_pte;
+}
+
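+/*
+ * With the nonpaging mmu there are no guest page tables to track, so a cr3
+ * change needs no shadow maintenance.
+ */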
+static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
+                          gfn_t gfn, struct page *page)
+{
+       int level = PT32E_ROOT_LEVEL;
+       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+       int pt_write = 0;
+
+       for (; ; level--) {
+               u32 index = PT64_INDEX(v, level);
+               u64 *table;
+
+               ASSERT(VALID_PAGE(table_addr));
+               table = __va(table_addr);
+
+               if (level == 1) {
+                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+                                    0, write, 1, &pt_write, gfn, page);
+                       return pt_write || is_io_pte(table[index]);
+               }
+
+               if (table[index] == shadow_trap_nonpresent_pte) {
+                       struct kvm_mmu_page *new_table;
+                       gfn_t pseudo_gfn;
+
+                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+                               >> PAGE_SHIFT;
+                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+                                                    v, level - 1,
+                                                    1, ACC_ALL, &table[index],
+                                                    NULL);
+                       if (!new_table) {
+                               pgprintk("nonpaging_map: ENOMEM\n");
+                               kvm_release_page_clean(page);
+                               return -ENOMEM;
+                       }
+
+                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
+                               | PT_WRITABLE_MASK | PT_USER_MASK;
+               }
+               table_addr = table[index] & PT64_BASE_ADDR_MASK;
+       }
+}
+
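+/*
+ * Resolve the gfn to a host page before taking mmu_lock: gfn_to_page() may
+ * fault in user memory and sleep, which is not allowed under the spinlock.
+ */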
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+{
+       int r;
+
+       struct page *page;
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, gfn);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       r = __nonpaging_map(vcpu, v, write, gfn, page);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+
+       up_read(&current->mm->mmap_sem);
+
+       return r;
+}
+
+
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+                                   struct kvm_mmu_page *sp)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+               sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       spin_lock(&vcpu->kvm->mmu_lock);
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               sp = page_header(root);
+               --sp->root_count;
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               if (root) {
+                       root &= PT64_BASE_ADDR_MASK;
+                       sp = page_header(root);
+                       --sp->root_count;
+               }
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+       }
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       gfn_t root_gfn;
+       struct kvm_mmu_page *sp;
+
+       root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+
+#ifdef CONFIG_X86_64
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+
+               ASSERT(!VALID_PAGE(root));
+               sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+                                     PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.root_hpa = root;
+               return;
+       }
+#endif
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               ASSERT(!VALID_PAGE(root));
+               if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+                       if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+                               vcpu->arch.mmu.pae_root[i] = 0;
+                               continue;
+                       }
+                       root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+               } else if (vcpu->arch.mmu.root_level == 0)
+                       root_gfn = 0;
+               sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+                                     PT32_ROOT_LEVEL, !is_paging(vcpu),
+                                     ACC_ALL, NULL, NULL);
+               root = __pa(sp->spt);
+               ++sp->root_count;
+               vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+       }
+       vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       return vaddr;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+                               u32 error_code)
+{
+       gfn_t gfn;
+       int r;
+
+       pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       ASSERT(vcpu);
+       ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       gfn = gva >> PAGE_SHIFT;
+
+       return nonpaging_map(vcpu, gva & PAGE_MASK,
+                            error_code & PFERR_WRITE_MASK, gfn);
+}
+
+static void nonpaging_free(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static int nonpaging_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = nonpaging_new_cr3;
+       context->page_fault = nonpaging_page_fault;
+       context->gva_to_gpa = nonpaging_gva_to_gpa;
+       context->free = nonpaging_free;
+       context->prefetch_page = nonpaging_prefetch_page;
+       context->root_level = 0;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       kvm_x86_ops->tlb_flush(vcpu);
+}
+
+static void paging_new_cr3(struct kvm_vcpu *vcpu)
+{
+       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+       mmu_free_roots(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+                             u64 addr,
+                             u32 err_code)
+{
+       kvm_inject_page_fault(vcpu, addr, err_code);
+}
+
+static void paging_free(struct kvm_vcpu *vcpu)
+{
+       nonpaging_free(vcpu);
+}
+
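+/*
+ * paging_tmpl.h is parameterized on PTTYPE and is included twice here,
+ * generating the paging64_* walker (PAE and long mode guests) and the
+ * paging32_* walker (legacy 2-level guests).
+ */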
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       ASSERT(is_pae(vcpu));
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging64_page_fault;
+       context->gva_to_gpa = paging64_gva_to_gpa;
+       context->prefetch_page = paging64_prefetch_page;
+       context->free = paging_free;
+       context->root_level = level;
+       context->shadow_root_level = level;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
+static int paging32_init_context(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu *context = &vcpu->arch.mmu;
+
+       context->new_cr3 = paging_new_cr3;
+       context->page_fault = paging32_page_fault;
+       context->gva_to_gpa = paging32_gva_to_gpa;
+       context->free = paging_free;
+       context->prefetch_page = paging32_prefetch_page;
+       context->root_level = PT32_ROOT_LEVEL;
+       context->shadow_root_level = PT32E_ROOT_LEVEL;
+       context->root_hpa = INVALID_PAGE;
+       return 0;
+}
+
+static int paging32E_init_context(struct kvm_vcpu *vcpu)
+{
+       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
+}
+
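+/*
+ * Select the mmu flavour matching the guest's current paging mode:
+ * unpaged/real mode, 64-bit long mode, PAE, or legacy 32-bit paging.
+ */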
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       if (!is_paging(vcpu))
+               return nonpaging_init_context(vcpu);
+       else if (is_long_mode(vcpu))
+               return paging64_init_context(vcpu);
+       else if (is_pae(vcpu))
+               return paging32E_init_context(vcpu);
+       else
+               return paging32_init_context(vcpu);
+}
+
+static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+               vcpu->arch.mmu.free(vcpu);
+               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       }
+}
+
+int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+       destroy_kvm_mmu(vcpu);
+       return init_kvm_mmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       mmu_alloc_roots(vcpu);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+       kvm_mmu_flush_tlb(vcpu);
+out:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+       mmu_free_roots(vcpu);
+}
+
+static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte)
+{
+       u64 pte;
+       struct kvm_mmu_page *child;
+
+       pte = *spte;
+       if (is_shadow_present_pte(pte)) {
+               if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+                       rmap_remove(vcpu->kvm, spte);
+               else {
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       mmu_page_remove_parent_pte(child, spte);
+               }
+       }
+       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+}
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp,
+                                 u64 *spte,
+                                 const void *new, int bytes,
+                                 int offset_in_pte)
+{
+       if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+               ++vcpu->kvm->stat.mmu_pde_zapped;
+               return;
+       }
+
+       ++vcpu->kvm->stat.mmu_pte_updated;
+       if (sp->role.glevels == PT32_ROOT_LEVEL)
+               paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+       else
+               paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+}
+
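+/*
+ * A remote TLB flush is only needed if the old spte was present and the new
+ * one either removes it, changes the frame it points to, or takes away
+ * permissions; pure permission grants can rely on a local flush.
+ */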
+static bool need_remote_flush(u64 old, u64 new)
+{
+       if (!is_shadow_present_pte(old))
+               return false;
+       if (!is_shadow_present_pte(new))
+               return true;
+       if ((old ^ new) & PT64_BASE_ADDR_MASK)
+               return true;
+       old ^= PT64_NX_MASK;
+       new ^= PT64_NX_MASK;
+       return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+{
+       if (need_remote_flush(old, new))
+               kvm_flush_remote_tlbs(vcpu->kvm);
+       else
+               kvm_mmu_flush_tlb(vcpu);
+}
+
+static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+{
+       u64 *spte = vcpu->arch.last_pte_updated;
+
+       return !!(spte && (*spte & PT_ACCESSED_MASK));
+}
+
+static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                         const u8 *new, int bytes)
+{
+       gfn_t gfn;
+       int r;
+       u64 gpte = 0;
+
+       if (bytes != 4 && bytes != 8)
+               return;
+
+       /*
+        * Assume that the pte write is on a page table of the same type
+        * as the current vcpu paging mode.  This is nearly always true
+        * (might be false while changing modes).  Note it is verified later
+        * by update_pte().
+        */
+       if (is_pae(vcpu)) {
+               /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+               if ((bytes == 4) && (gpa % 4 == 0)) {
+                       r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
+                       if (r)
+                               return;
+                       memcpy((void *)&gpte + (gpa % 8), new, 4);
+               } else if ((bytes == 8) && (gpa % 8 == 0)) {
+                       memcpy((void *)&gpte, new, 8);
+               }
+       } else {
+               if ((bytes == 4) && (gpa % 4 == 0))
+                       memcpy((void *)&gpte, new, 4);
+       }
+       if (!is_present_pte(gpte))
+               return;
+       gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+       vcpu->arch.update_pte.gfn = gfn;
+       vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                      const u8 *new, int bytes)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       struct kvm_mmu_page *sp;
+       struct hlist_node *node, *n;
+       struct hlist_head *bucket;
+       unsigned index;
+       u64 entry;
+       u64 *spte;
+       unsigned offset = offset_in_page(gpa);
+       unsigned pte_size;
+       unsigned page_offset;
+       unsigned misaligned;
+       unsigned quadrant;
+       int level;
+       int flooded = 0;
+       int npte;
+
+       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+       mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       ++vcpu->kvm->stat.mmu_pte_write;
+       kvm_mmu_audit(vcpu, "pre pte write");
+       if (gfn == vcpu->arch.last_pt_write_gfn
+           && !last_updated_pte_accessed(vcpu)) {
+               ++vcpu->arch.last_pt_write_count;
+               if (vcpu->arch.last_pt_write_count >= 3)
+                       flooded = 1;
+       } else {
+               vcpu->arch.last_pt_write_gfn = gfn;
+               vcpu->arch.last_pt_write_count = 1;
+               vcpu->arch.last_pte_updated = NULL;
+       }
+       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
+               if (sp->gfn != gfn || sp->role.metaphysical)
+                       continue;
+               pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+               misaligned |= bytes < 4;
+               if (misaligned || flooded) {
+                       /*
+                        * Misaligned accesses are too much trouble to fix
+                        * up; also, they usually indicate a page is not used
+                        * as a page table.
+                        *
+                        * If we're seeing too many writes to a page,
+                        * it may no longer be a page table, or we may be
+                        * forking, in which case it is better to unmap the
+                        * page.
+                        */
+                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+                                gpa, bytes, sp->role.word);
+                       kvm_mmu_zap_page(vcpu->kvm, sp);
+                       ++vcpu->kvm->stat.mmu_flooded;
+                       continue;
+               }
+               page_offset = offset;
+               level = sp->role.level;
+               npte = 1;
+               if (sp->role.glevels == PT32_ROOT_LEVEL) {
+                       page_offset <<= 1;      /* 32->64 */
+                       /*
+                        * A 32-bit pde maps 4MB while the shadow pdes map
+                        * only 2MB.  So we need to double the offset again
+                        * and zap two pdes instead of one.
+                        */
+                       if (level == PT32_ROOT_LEVEL) {
+                               page_offset &= ~7; /* kill rounding error */
+                               page_offset <<= 1;
+                               npte = 2;
+                       }
+                       quadrant = page_offset >> PAGE_SHIFT;
+                       page_offset &= ~PAGE_MASK;
+                       if (quadrant != sp->role.quadrant)
+                               continue;
+               }
+               spte = &sp->spt[page_offset / sizeof(*spte)];
+               while (npte--) {
+                       entry = *spte;
+                       mmu_pte_write_zap_pte(vcpu, sp, spte);
+                       mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
+                                             page_offset & (pte_size - 1));
+                       mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+                       ++spte;
+               }
+       }
+       kvm_mmu_audit(vcpu, "post pte write");
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (vcpu->arch.update_pte.page) {
+               kvm_release_page_clean(vcpu->arch.update_pte.page);
+               vcpu->arch.update_pte.page = NULL;
+       }
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       gpa_t gpa;
+       int r;
+
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+       up_read(&current->mm->mmap_sem);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       return r;
+}
+
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+               struct kvm_mmu_page *sp;
+
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+               ++vcpu->kvm->stat.mmu_recycled;
+       }
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+{
+       int r;
+       enum emulation_result er;
+
+       r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+       if (r < 0)
+               goto out;
+
+       if (!r) {
+               r = 1;
+               goto out;
+       }
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               goto out;
+
+       er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
+
+       switch (er) {
+       case EMULATE_DONE:
+               return 1;
+       case EMULATE_DO_MMIO:
+               ++vcpu->stat.mmio_exits;
+               return 0;
+       case EMULATE_FAIL:
+               kvm_report_emulation_failure(vcpu, "pagetable");
+               return 1;
+       default:
+               BUG();
+       }
+out:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+
+       while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
+               sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
+                                 struct kvm_mmu_page, link);
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+       }
+       free_page((unsigned long)vcpu->arch.mmu.pae_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+       struct page *page;
+       int i;
+
+       ASSERT(vcpu);
+
+       if (vcpu->kvm->arch.n_requested_mmu_pages)
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_requested_mmu_pages;
+       else
+               vcpu->kvm->arch.n_free_mmu_pages =
+                                       vcpu->kvm->arch.n_alloc_mmu_pages;
+       /*
+        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+        * Therefore we need to allocate shadow page tables in the first
+        * 4GB of memory, which happens to fit the DMA32 zone.
+        */
+       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+       if (!page)
+               goto error_1;
+       vcpu->arch.mmu.pae_root = page_address(page);
+       for (i = 0; i < 4; ++i)
+               vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+       return 0;
+
+error_1:
+       free_mmu_pages(vcpu);
+       return -ENOMEM;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return alloc_mmu_pages(vcpu);
+}
+
+int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+       ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+       return init_kvm_mmu(vcpu);
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       ASSERT(vcpu);
+
+       destroy_kvm_mmu(vcpu);
+       free_mmu_pages(vcpu);
+       mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+{
+       struct kvm_mmu_page *sp;
+
+       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
+               int i;
+               u64 *pt;
+
+               if (!test_bit(slot, &sp->slot_bitmap))
+                       continue;
+
+               pt = sp->spt;
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+                       /* avoid RMW */
+                       if (pt[i] & PT_WRITABLE_MASK)
+                               pt[i] &= ~PT_WRITABLE_MASK;
+       }
+}
+
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+       struct kvm_mmu_page *sp, *node;
+
+       spin_lock(&kvm->mmu_lock);
+       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+               kvm_mmu_zap_page(kvm, sp);
+       spin_unlock(&kvm->mmu_lock);
+
+       kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_module_exit(void)
+{
+       if (pte_chain_cache)
+               kmem_cache_destroy(pte_chain_cache);
+       if (rmap_desc_cache)
+               kmem_cache_destroy(rmap_desc_cache);
+       if (mmu_page_header_cache)
+               kmem_cache_destroy(mmu_page_header_cache);
+}
+
+int kvm_mmu_module_init(void)
+{
+       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
+                                           sizeof(struct kvm_pte_chain),
+                                           0, 0, NULL);
+       if (!pte_chain_cache)
+               goto nomem;
+       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
+                                           sizeof(struct kvm_rmap_desc),
+                                           0, 0, NULL);
+       if (!rmap_desc_cache)
+               goto nomem;
+
+       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+                                                 sizeof(struct kvm_mmu_page),
+                                                 0, 0, NULL);
+       if (!mmu_page_header_cache)
+               goto nomem;
+
+       return 0;
+
+nomem:
+       kvm_mmu_module_exit();
+       return -ENOMEM;
+}
+
+/*
+ * Calculate the number of mmu pages needed for the vm.
+ */
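+/*
+ * Illustrative only, assuming KVM_PERMILLE_MMU_PAGES expresses the ratio in
+ * per-mille: a guest with 262144 pages (1GB of 4KB pages) and a 20/1000
+ * ratio would be sized at roughly 5242 shadow pages, subject to the
+ * KVM_MIN_ALLOC_MMU_PAGES floor.
+ */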
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+       int i;
+       unsigned int nr_mmu_pages;
+       unsigned int  nr_pages = 0;
+
+       for (i = 0; i < kvm->nmemslots; i++)
+               nr_pages += kvm->memslots[i].npages;
+
+       nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+       nr_mmu_pages = max(nr_mmu_pages,
+                       (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+
+       return nr_mmu_pages;
+}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+       gva = (long long)(gva << 16) >> 16;
+#endif
+       return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+                               gva_t va, int level)
+{
+       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+       int i;
+       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+               u64 ent = pt[i];
+
+               if (ent == shadow_trap_nonpresent_pte)
+                       continue;
+
+               va = canonicalize(va);
+               if (level > 1) {
+                       if (ent == shadow_notrap_nonpresent_pte)
+                               printk(KERN_ERR "audit: (%s) nontrapping pte"
+                                      " in nonleaf level: levels %d gva %lx"
+                                      " level %d pte %llx\n", audit_msg,
+                                      vcpu->arch.mmu.root_level, va, level, ent);
+
+                       audit_mappings_page(vcpu, ent, va, level - 1);
+               } else {
+                       gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
+                       struct page *page = gpa_to_page(vcpu, gpa);
+                       hpa_t hpa = page_to_phys(page);
+
+                       if (is_shadow_present_pte(ent)
+                           && (ent & PT64_BASE_ADDR_MASK) != hpa)
+                               printk(KERN_ERR "xx audit error: (%s) levels %d"
+                                      " gva %lx gpa %llx hpa %llx ent %llx %d\n",
+                                      audit_msg, vcpu->arch.mmu.root_level,
+                                      va, gpa, hpa, ent,
+                                      is_shadow_present_pte(ent));
+                       else if (ent == shadow_notrap_nonpresent_pte
+                                && !is_error_hpa(hpa))
+                               printk(KERN_ERR "audit: (%s) notrap shadow,"
+                                      " valid guest gva %lx\n", audit_msg, va);
+                       kvm_release_page_clean(page);
+
+               }
+       }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+       unsigned i;
+
+       if (vcpu->arch.mmu.root_level == 4)
+               audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+       else
+               for (i = 0; i < 4; ++i)
+                       if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+                               audit_mappings_page(vcpu,
+                                                   vcpu->arch.mmu.pae_root[i],
+                                                   i << 30,
+                                                   2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       int i, j, k;
+
+       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+               struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+               struct kvm_rmap_desc *d;
+
+               for (j = 0; j < m->npages; ++j) {
+                       unsigned long *rmapp = &m->rmap[j];
+
+                       if (!*rmapp)
+                               continue;
+                       if (!(*rmapp & 1)) {
+                               ++nmaps;
+                               continue;
+                       }
+                       d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+                       while (d) {
+                               for (k = 0; k < RMAP_EXT; ++k)
+                                       if (d->shadow_ptes[k])
+                                               ++nmaps;
+                                       else
+                                               break;
+                               d = d->more;
+                       }
+               }
+       }
+       return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+       int nmaps = 0;
+       struct kvm_mmu_page *sp;
+       int i;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               u64 *pt = sp->spt;
+
+               if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+                       continue;
+
+               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                       u64 ent = pt[i];
+
+                       if (!(ent & PT_PRESENT_MASK))
+                               continue;
+                       if (!(ent & PT_WRITABLE_MASK))
+                               continue;
+                       ++nmaps;
+               }
+       }
+       return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+       int n_rmap = count_rmaps(vcpu);
+       int n_actual = count_writable_mappings(vcpu);
+
+       if (n_rmap != n_actual)
+               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+                      __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+       struct kvm_mmu_page *sp;
+       struct kvm_memory_slot *slot;
+       unsigned long *rmapp;
+       gfn_t gfn;
+
+       list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+               if (sp->role.metaphysical)
+                       continue;
+
+               slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+               gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+               rmapp = &slot->rmap[gfn - slot->base_gfn];
+               if (*rmapp)
+                       printk(KERN_ERR "%s: (%s) shadow page has writable"
+                              " mappings: gfn %lx role %x\n",
+                              __FUNCTION__, audit_msg, sp->gfn,
+                              sp->role.word);
+       }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+       int olddbg = dbg;
+
+       dbg = 0;
+       audit_msg = msg;
+       audit_rmap(vcpu);
+       audit_write_protection(vcpu);
+       audit_mappings(vcpu);
+       dbg = olddbg;
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644 (file)
index 0000000..1fce19e
--- /dev/null
@@ -0,0 +1,44 @@
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+
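+/*
+ * Inline helpers shared between the mmu and the rest of the x86 kvm code;
+ * the mode checks below read guest control register state cached in
+ * vcpu->arch.
+ */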
+static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+       if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+               __kvm_mmu_free_some_pages(vcpu);
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+       if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+               return 0;
+
+       return kvm_mmu_load(vcpu);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+       return vcpu->arch.shadow_efer & EFER_LME;
+#else
+       return 0;
+#endif
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PAE;
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr4 & X86_CR4_PSE;
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr0 & X86_CR0_PG;
+}
+
+#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644 (file)
index 0000000..03ba860
--- /dev/null
@@ -0,0 +1,484 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+       #define pt_element_t u64
+       #define guest_walker guest_walker64
+       #define FNAME(name) paging##64_##name
+       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT64_LEVEL_BITS
+       #ifdef CONFIG_X86_64
+       #define PT_MAX_FULL_LEVELS 4
+       #define CMPXCHG cmpxchg
+       #else
+       #define CMPXCHG cmpxchg64
+       #define PT_MAX_FULL_LEVELS 2
+       #endif
+#elif PTTYPE == 32
+       #define pt_element_t u32
+       #define guest_walker guest_walker32
+       #define FNAME(name) paging##32_##name
+       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+       #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+       #define PT_LEVEL_BITS PT32_LEVEL_BITS
+       #define PT_MAX_FULL_LEVELS 2
+       #define CMPXCHG cmpxchg
+#else
+       #error Invalid PTTYPE value
+#endif
+
+#define gpte_to_gfn FNAME(gpte_to_gfn)
+#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+       int level;
+       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+       pt_element_t ptes[PT_MAX_FULL_LEVELS];
+       gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+       unsigned pt_access;
+       unsigned pte_access;
+       gfn_t gfn;
+       u32 error_code;
+};
+
+static gfn_t gpte_to_gfn(pt_element_t gpte)
+{
+       return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
+{
+       return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+                        gfn_t table_gfn, unsigned index,
+                        pt_element_t orig_pte, pt_element_t new_pte)
+{
+       pt_element_t ret;
+       pt_element_t *table;
+       struct page *page;
+
+       page = gfn_to_page(kvm, table_gfn);
+       table = kmap_atomic(page, KM_USER0);
+
+       ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+       kunmap_atomic(table, KM_USER0);
+
+       kvm_release_page_dirty(page);
+
+       return (ret != orig_pte);
+}
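+
+/*
+ * cmpxchg_gpte() is used by walk_addr() below to set the accessed and dirty
+ * bits atomically with respect to the guest; a true return means the gpte
+ * changed under us, and the caller restarts the walk (the "goto walk" paths).
+ */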
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+{
+       unsigned access;
+
+       access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+#if PTTYPE == 64
+       if (is_nx(vcpu))
+               access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
+       return access;
+}
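+
+/*
+ * Worked example, assuming ACC_EXEC_MASK is bit 0 (which the shift by
+ * PT64_NX_SHIFT above relies on): a gpte with the writable and user bits set
+ * and NX clear grants write, user and execute access; with NX set (and the
+ * guest running with EFER.NX), the execute permission is masked off again.
+ */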
+
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static int FNAME(walk_addr)(struct guest_walker *walker,
+                           struct kvm_vcpu *vcpu, gva_t addr,
+                           int write_fault, int user_fault, int fetch_fault)
+{
+       pt_element_t pte;
+       gfn_t table_gfn;
+       unsigned index, pt_access, pte_access;
+       gpa_t pte_gpa;
+
+       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
+       walker->level = vcpu->arch.mmu.root_level;
+       pte = vcpu->arch.cr3;
+#if PTTYPE == 64
+       if (!is_long_mode(vcpu)) {
+               pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
+               if (!is_present_pte(pte))
+                       goto not_present;
+               --walker->level;
+       }
+#endif
+       ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
+              (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+
+       pt_access = ACC_ALL;
+
+       for (;;) {
+               index = PT_INDEX(addr, walker->level);
+
+               table_gfn = gpte_to_gfn(pte);
+               pte_gpa = gfn_to_gpa(table_gfn);
+               pte_gpa += index * sizeof(pt_element_t);
+               walker->table_gfn[walker->level - 1] = table_gfn;
+               walker->pte_gpa[walker->level - 1] = pte_gpa;
+               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+                        walker->level - 1, table_gfn);
+
+               kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+               if (!is_present_pte(pte))
+                       goto not_present;
+
+               if (write_fault && !is_writeble_pte(pte))
+                       if (user_fault || is_write_protection(vcpu))
+                               goto access_error;
+
+               if (user_fault && !(pte & PT_USER_MASK))
+                       goto access_error;
+
+#if PTTYPE == 64
+               if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+                       goto access_error;
+#endif
+
+               if (!(pte & PT_ACCESSED_MASK)) {
+                       mark_page_dirty(vcpu->kvm, table_gfn);
+                       if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+                           index, pte, pte|PT_ACCESSED_MASK))
+                               goto walk;
+                       pte |= PT_ACCESSED_MASK;
+               }
+
+               pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
+
+               walker->ptes[walker->level - 1] = pte;
+
+               if (walker->level == PT_PAGE_TABLE_LEVEL) {
+                       walker->gfn = gpte_to_gfn(pte);
+                       break;
+               }
+
+               if (walker->level == PT_DIRECTORY_LEVEL
+                   && (pte & PT_PAGE_SIZE_MASK)
+                   && (PTTYPE == 64 || is_pse(vcpu))) {
+                       walker->gfn = gpte_to_gfn_pde(pte);
+                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+                       if (PTTYPE == 32 && is_cpuid_PSE36())
+                               walker->gfn += pse36_gfn_delta(pte);
+                       break;
+               }
+
+               pt_access = pte_access;
+               --walker->level;
+       }
+
+       if (write_fault && !is_dirty_pte(pte)) {
+               bool ret;
+
+               mark_page_dirty(vcpu->kvm, table_gfn);
+               ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+                           pte|PT_DIRTY_MASK);
+               if (ret)
+                       goto walk;
+               pte |= PT_DIRTY_MASK;
+               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+               walker->ptes[walker->level - 1] = pte;
+       }
+
+       walker->pt_access = pt_access;
+       walker->pte_access = pte_access;
+       pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+                __FUNCTION__, (u64)pte, pte_access, pt_access);
+       return 1;
+
+not_present:
+       walker->error_code = 0;
+       goto err;
+
+access_error:
+       walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+       if (write_fault)
+               walker->error_code |= PFERR_WRITE_MASK;
+       if (user_fault)
+               walker->error_code |= PFERR_USER_MASK;
+       if (fetch_fault)
+               walker->error_code |= PFERR_FETCH_MASK;
+       return 0;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+                             u64 *spte, const void *pte, int bytes,
+                             int offset_in_pte)
+{
+       pt_element_t gpte;
+       unsigned pte_access;
+       struct page *npage;
+
+       gpte = *(const pt_element_t *)pte;
+       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+               if (!offset_in_pte && !is_present_pte(gpte))
+                       set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+               return;
+       }
+       if (bytes < sizeof(pt_element_t))
+               return;
+       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+       pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+       if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
+               return;
+       npage = vcpu->arch.update_pte.page;
+       if (!npage)
+               return;
+       get_page(npage);
+       mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+                    gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ */
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+                        struct guest_walker *walker,
+                        int user_fault, int write_fault, int *ptwrite,
+                        struct page *page)
+{
+       hpa_t shadow_addr;
+       int level;
+       u64 *shadow_ent;
+       unsigned access = walker->pt_access;
+
+       if (!is_present_pte(walker->ptes[walker->level - 1]))
+               return NULL;
+
+       shadow_addr = vcpu->arch.mmu.root_hpa;
+       level = vcpu->arch.mmu.shadow_root_level;
+       if (level == PT32E_ROOT_LEVEL) {
+               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr &= PT64_BASE_ADDR_MASK;
+               --level;
+       }
+
+       for (; ; level--) {
+               u32 index = SHADOW_PT_INDEX(addr, level);
+               struct kvm_mmu_page *shadow_page;
+               u64 shadow_pte;
+               int metaphysical;
+               gfn_t table_gfn;
+               bool new_page = 0;
+
+               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+               if (level == PT_PAGE_TABLE_LEVEL)
+                       break;
+               if (is_shadow_present_pte(*shadow_ent)) {
+                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
+                       continue;
+               }
+
+               if (level - 1 == PT_PAGE_TABLE_LEVEL
+                   && walker->level == PT_DIRECTORY_LEVEL) {
+                       metaphysical = 1;
+                       if (!is_dirty_pte(walker->ptes[level - 1]))
+                               access &= ~ACC_WRITE_MASK;
+                       table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
+               } else {
+                       metaphysical = 0;
+                       table_gfn = walker->table_gfn[level - 2];
+               }
+               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+                                              metaphysical, access,
+                                              shadow_ent, &new_page);
+               if (new_page && !metaphysical) {
+                       int r;
+                       pt_element_t curr_pte;
+                       r = kvm_read_guest_atomic(vcpu->kvm,
+                                                 walker->pte_gpa[level - 2],
+                                                 &curr_pte, sizeof(curr_pte));
+                       if (r || curr_pte != walker->ptes[level - 2]) {
+                               kvm_release_page_clean(page);
+                               return NULL;
+                       }
+               }
+               shadow_addr = __pa(shadow_page->spt);
+               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+                       | PT_WRITABLE_MASK | PT_USER_MASK;
+               *shadow_ent = shadow_pte;
+       }
+
+       mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
+                    user_fault, write_fault,
+                    walker->ptes[walker->level-1] & PT_DIRTY_MASK,
+                    ptwrite, walker->gfn, page);
+
+       return shadow_ent;
+}
+
+/*
+ * Page fault handler.  There are several causes for a page fault:
+ *   - there is no shadow pte for the guest pte
+ *   - write access through a shadow pte marked read only so that we can set
+ *     the dirty bit
+ *   - write access to a shadow pte marked read only so we can update the page
+ *     dirty bitmap, when userspace requests it
+ *   - mmio access; in this case we will never install a present shadow pte
+ *   - normal guest page fault due to the guest pte marked not present, not
+ *     writable, or not executable
+ *
+ *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ *           a negative value on error.
+ */
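+/*
+ * Roughly how the return value is expected to be used by the caller
+ * (kvm_mmu_page_fault): 1 asks it to emulate the faulting instruction, 0
+ * means the fault was handled (or reflected to the guest) and the guest can
+ * simply be resumed, and a negative value is propagated as an error; see
+ * pf_interception() in svm.c later in this diff, which now just returns
+ * kvm_mmu_page_fault().
+ */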
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+                              u32 error_code)
+{
+       int write_fault = error_code & PFERR_WRITE_MASK;
+       int user_fault = error_code & PFERR_USER_MASK;
+       int fetch_fault = error_code & PFERR_FETCH_MASK;
+       struct guest_walker walker;
+       u64 *shadow_pte;
+       int write_pt = 0;
+       int r;
+       struct page *page;
+
+       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+       kvm_mmu_audit(vcpu, "pre page fault");
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       down_read(&current->mm->mmap_sem);
+       /*
+        * Look up the shadow pte for the faulting address.
+        */
+       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+                            fetch_fault);
+
+       /*
+        * The page is not mapped by the guest.  Let the guest handle it.
+        */
+       if (!r) {
+               pgprintk("%s: guest page fault\n", __FUNCTION__);
+               inject_page_fault(vcpu, addr, walker.error_code);
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+               up_read(&current->mm->mmap_sem);
+               return 0;
+       }
+
+       page = gfn_to_page(vcpu->kvm, walker.gfn);
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_free_some_pages(vcpu);
+       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+                                 &write_pt, page);
+       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+                shadow_pte, *shadow_pte, write_pt);
+
+       if (!write_pt)
+               vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+
+       /*
+        * mmio: emulate if accessible, otherwise it's a guest fault.
+        */
+       if (shadow_pte && is_io_pte(*shadow_pte)) {
+               spin_unlock(&vcpu->kvm->mmu_lock);
+               up_read(&current->mm->mmap_sem);
+               return 1;
+       }
+
+       ++vcpu->stat.pf_fixed;
+       kvm_mmu_audit(vcpu, "post page fault (fixed)");
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
+
+       return write_pt;
+}
+
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
+{
+       struct guest_walker walker;
+       gpa_t gpa = UNMAPPED_GVA;
+       int r;
+
+       r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
+
+       if (r) {
+               gpa = gfn_to_gpa(walker.gfn);
+               gpa |= vaddr & ~PAGE_MASK;
+       }
+
+       return gpa;
+}
+
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+                                struct kvm_mmu_page *sp)
+{
+       int i, offset = 0, r = 0;
+       pt_element_t pt;
+
+       if (sp->role.metaphysical
+           || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
+               nonpaging_prefetch_page(vcpu, sp);
+               return;
+       }
+
+       if (PTTYPE == 32)
+               offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
+               pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+               r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
+                                         sizeof(pt_element_t));
+               if (r || is_present_pte(pt))
+                       sp->spt[i] = shadow_trap_nonpresent_pte;
+               else
+                       sp->spt[i] = shadow_notrap_nonpresent_pte;
+       }
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef SHADOW_PT_INDEX
+#undef PT_LEVEL_MASK
+#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_pde
+#undef CMPXCHG
similarity index 53%
rename from drivers/kvm/segment_descriptor.h
rename to arch/x86/kvm/segment_descriptor.h
index 71fdf458619a001092c9bba6ddaf4338b482372a..56fc4c8733894db1554e6c81ac5ea321f1228e4b 100644 (file)
@@ -1,3 +1,6 @@
+#ifndef __SEGMENT_DESCRIPTOR_H
+#define __SEGMENT_DESCRIPTOR_H
+
 struct segment_descriptor {
        u16 limit_low;
        u16 base_low;
@@ -14,4 +17,13 @@ struct segment_descriptor {
        u8  base_high;
 } __attribute__((packed));
 
+#ifdef CONFIG_X86_64
+/* LDT or TSS descriptor in the GDT. 16 bytes. */
+struct segment_descriptor_64 {
+       struct segment_descriptor s;
+       u32 base_higher;
+       u32 pad_zero;
+};
 
+#endif
+#endif
similarity index 84%
rename from drivers/kvm/svm.c
rename to arch/x86/kvm/svm.c
index 4e04e49a2f1c35f626fa24d3d5d3f1ea5d115869..de755cb1431dcef84617b04e29eacb5a06fc59d0 100644 (file)
  * the COPYING file in the top-level directory.
  *
  */
+#include <linux/kvm_host.h>
 
 #include "kvm_svm.h"
-#include "x86_emulate.h"
 #include "irq.h"
+#include "mmu.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define KVM_EFER_LMA (1 << 10)
-#define KVM_EFER_LME (1 << 8)
-
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
 
 static inline u8 pop_irq(struct kvm_vcpu *vcpu)
 {
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        return irq;
 }
 
 static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
 {
-       set_bit(irq, vcpu->irq_pending);
-       set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+       set_bit(irq, vcpu->arch.irq_pending);
+       set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
 }
 
 static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-       if (!(efer & KVM_EFER_LMA))
-               efer &= ~KVM_EFER_LME;
+       if (!(efer & EFER_LMA))
+               efer &= ~EFER_LME;
 
        to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
-       vcpu->shadow_efer = efer;
+       vcpu->arch.shadow_efer = efer;
 }
 
-static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vmcb->control.event_inj =          SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_VALID_ERR |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               GP_VECTOR;
+       svm->vmcb->control.event_inj = nr
+               | SVM_EVTINJ_VALID
+               | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+               | SVM_EVTINJ_TYPE_EXEPT;
        svm->vmcb->control.event_inj_err = error_code;
 }
 
-static void inject_ud(struct kvm_vcpu *vcpu)
+static bool svm_exception_injected(struct kvm_vcpu *vcpu)
 {
-       to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               UD_VECTOR;
-}
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-static int is_page_fault(uint32_t info)
-{
-       info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
-       return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
+       return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
 }
 
 static int is_external_interrupt(u32 info)
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
                printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
                return;
        }
-       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
+       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
                printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
                       __FUNCTION__,
                       svm->vmcb->save.rip,
                       svm->next_rip);
-       }
 
-       vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
+       vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
        svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
-       vcpu->interrupt_window_open = 1;
+       vcpu->arch.interrupt_window_open = 1;
 }
 
 static int has_svm(void)
@@ -290,7 +282,7 @@ static void svm_hardware_enable(void *garbage)
 #ifdef CONFIG_X86_64
        struct desc_ptr gdt_descr;
 #else
-       struct Xgt_desc_struct gdt_descr;
+       struct desc_ptr gdt_descr;
 #endif
        struct desc_struct *gdt;
        int me = raw_smp_processor_id();
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
        svm_data->next_asid = svm_data->max_asid + 1;
        svm_features = cpuid_edx(SVM_CPUID_FUNC);
 
-       asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
+       asm volatile ("sgdt %0" : "=m"(gdt_descr));
        gdt = (struct desc_struct *)gdt_descr.address;
        svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
 
        control->intercept_cr_read =    INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK;
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
 
        control->intercept_cr_write =   INTERCEPT_CR0_MASK |
                                        INTERCEPT_CR3_MASK |
-                                       INTERCEPT_CR4_MASK;
+                                       INTERCEPT_CR4_MASK |
+                                       INTERCEPT_CR8_MASK;
 
        control->intercept_dr_read =    INTERCEPT_DR0_MASK |
                                        INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
                                        INTERCEPT_DR5_MASK |
                                        INTERCEPT_DR7_MASK;
 
-       control->intercept_exceptions = 1 << PF_VECTOR;
+       control->intercept_exceptions = (1 << PF_VECTOR) |
+                                       (1 << UD_VECTOR);
 
 
        control->intercept =    (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
        save->efer = MSR_EFER_SVME_MASK;
-
-        save->dr6 = 0xffff0ff0;
+       save->dr6 = 0xffff0ff0;
        save->dr7 = 0x400;
        save->rflags = 2;
        save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
        /* rdx = ?? */
 }
 
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
        if (vcpu->vcpu_id != 0) {
                svm->vmcb->save.rip = 0;
-               svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
-               svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
+               svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
+               svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
        }
+
+       return 0;
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_svm;
 
-       if (irqchip_in_kernel(kvm)) {
-               err = kvm_create_lapic(&svm->vcpu);
-               if (err < 0)
-                       goto free_svm;
-       }
-
        page = alloc_page(GFP_KERNEL);
        if (!page) {
                err = -ENOMEM;
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
        fx_init(&svm->vcpu);
        svm->vcpu.fpu_active = 1;
-       svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
        if (svm->vcpu.vcpu_id == 0)
-               svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
+               svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
        return &svm->vcpu;
 
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * increasing TSC.
                 */
                rdtscll(tsc_this);
-               delta = vcpu->host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - tsc_this;
                svm->vmcb->control.tsc_offset += delta;
                vcpu->cpu = cpu;
                kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
        int i;
 
+       ++vcpu->stat.host_state_reload;
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
-       rdtscll(vcpu->host_tsc);
-       kvm_put_guest_fpu(vcpu);
+       rdtscll(vcpu->arch.host_tsc);
 }
 
 static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->rip = svm->vmcb->save.rip;
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.rip = svm->vmcb->save.rip;
 }
 
 static void svm_decache_regs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
-       svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
-       svm->vmcb->save.rip = vcpu->rip;
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       svm->vmcb->save.rip = vcpu->arch.rip;
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        struct vcpu_svm *svm = to_svm(vcpu);
 
 #ifdef CONFIG_X86_64
-       if (vcpu->shadow_efer & KVM_EFER_LME) {
+       if (vcpu->arch.shadow_efer & EFER_LME) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-                       vcpu->shadow_efer |= KVM_EFER_LMA;
-                       svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
+                       vcpu->arch.shadow_efer |= EFER_LMA;
+                       svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
                }
 
-               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
-                       vcpu->shadow_efer &= ~KVM_EFER_LMA;
-                       svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
+               if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+                       vcpu->arch.shadow_efer &= ~EFER_LMA;
+                       svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
                }
        }
 #endif
-       if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
+       if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
                svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
                vcpu->fpu_active = 1;
        }
 
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
        cr0 |= X86_CR0_PG | X86_CR0_WP;
        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
        svm->vmcb->save.cr0 = cr0;
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-       vcpu->cr4 = cr4;
+       vcpu->arch.cr4 = cr4;
        to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
 }
 
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
                svm->db_regs[dr] = value;
                return;
        case 4 ... 5:
-               if (vcpu->cr4 & X86_CR4_DE) {
+               if (vcpu->arch.cr4 & X86_CR4_DE) {
                        *exception = UD_VECTOR;
                        return;
                }
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        struct kvm *kvm = svm->vcpu.kvm;
        u64 fault_address;
        u32 error_code;
-       enum emulation_result er;
-       int r;
 
        if (!irqchip_in_kernel(kvm) &&
                is_external_interrupt(exit_int_info))
                push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 
-       mutex_lock(&kvm->lock);
-
        fault_address  = svm->vmcb->control.exit_info_2;
        error_code = svm->vmcb->control.exit_info_1;
-       r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-       if (r < 0) {
-               mutex_unlock(&kvm->lock);
-               return r;
-       }
-       if (!r) {
-               mutex_unlock(&kvm->lock);
-               return 1;
-       }
-       er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
-                                error_code);
-       mutex_unlock(&kvm->lock);
+       return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+}
 
-       switch (er) {
-       case EMULATE_DONE:
-               return 1;
-       case EMULATE_DO_MMIO:
-               ++svm->vcpu.stat.mmio_exits;
-               return 0;
-       case EMULATE_FAIL:
-               kvm_report_emulation_failure(&svm->vcpu, "pagetable");
-               break;
-       default:
-               BUG();
-       }
+static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       int er;
 
-       kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-       return 0;
+       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+       if (er != EMULATE_DONE)
+               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
 }
 
 static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
-       if (!(svm->vcpu.cr0 & X86_CR0_TS))
+       if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
                svm->vmcb->save.cr0 &= ~X86_CR0_TS;
        svm->vcpu.fpu_active = 1;
 
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
+       u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, down, in, string, rep;
        unsigned port;
 
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
 
        if (string) {
-               if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+               if (emulate_instruction(&svm->vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        svm->next_rip = svm->vmcb->save.rip + 3;
        skip_emulated_instruction(&svm->vcpu);
-       return kvm_hypercall(&svm->vcpu, kvm_run);
+       kvm_emulate_hypercall(&svm->vcpu);
+       return 1;
 }
 
 static int invalid_op_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
-       inject_ud(&svm->vcpu);
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int emulate_on_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
-       if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
        return 1;
 }
 
+static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+       if (irqchip_in_kernel(svm->vcpu.kvm))
+               return 1;
+       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       return 0;
+}
+
 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data;
 
        if (svm_get_msr(&svm->vcpu, ecx, &data))
-               svm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(&svm->vcpu, 0);
        else {
                svm->vmcb->save.rax = data & 0xffffffff;
-               svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
+               svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
                svm->next_rip = svm->vmcb->save.rip + 2;
                skip_emulated_instruction(&svm->vcpu);
        }
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
        case MSR_IA32_SYSENTER_ESP:
                svm->vmcb->save.sysenter_esp = data;
                break;
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               /*
+                * Only writes of 0 to the performance counters are supported
+                * for now, to keep Windows happy; this should be replaced by
+                * real performance counter emulation later.
+                */
+               if (data != 0)
+                       goto unhandled;
+               break;
        default:
+       unhandled:
                return kvm_set_msr_common(vcpu, ecx, data);
        }
        return 0;
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
+       u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data = (svm->vmcb->save.rax & -1u)
-               | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
+               | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
        svm->next_rip = svm->vmcb->save.rip + 2;
        if (svm_set_msr(&svm->vcpu, ecx, data))
-               svm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(&svm->vcpu, 0);
        else
                skip_emulated_instruction(&svm->vcpu);
        return 1;
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
         * possible
         */
        if (kvm_run->request_interrupt_window &&
-           !svm->vcpu.irq_summary) {
+           !svm->vcpu.arch.irq_summary) {
                ++svm->vcpu.stat.irq_window_exits;
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                return 0;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
+       [SVM_EXIT_READ_CR8]                     = emulate_on_interception,
        /* for now: */
        [SVM_EXIT_WRITE_CR0]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_CR3]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_CR4]                    = emulate_on_interception,
+       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
        [SVM_EXIT_READ_DR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_DR1]                     = emulate_on_interception,
        [SVM_EXIT_READ_DR2]                     = emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_WRITE_DR3]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_DR5]                    = emulate_on_interception,
        [SVM_EXIT_WRITE_DR7]                    = emulate_on_interception,
+       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
        [SVM_EXIT_INTR]                         = nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                       exit_code);
 
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-           || svm_exit_handlers[exit_code] == 0) {
+           || !svm_exit_handlers[exit_code]) {
                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
                kvm_run->hw.hardware_exit_reason = exit_code;
                return 0;
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
        int cpu = raw_smp_processor_id();
 
        struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
-       svm_data->tss_desc->type = 9; //available 32/64-bit TSS
+       svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
        load_TR_desc();
 }
 
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
        struct vmcb *vmcb = svm->vmcb;
        int intr_vector = -1;
 
-       kvm_inject_pending_timer_irqs(vcpu);
        if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
            ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
                intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
                push_irq(&svm->vcpu, control->int_vector);
        }
 
-       svm->vcpu.interrupt_window_open =
+       svm->vcpu.arch.interrupt_window_open =
                !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        svm_inject_irq(svm, irq);
 }
 
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
 
-       svm->vcpu.interrupt_window_open =
+       svm->vcpu.arch.interrupt_window_open =
                (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
                 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
 
-       if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
+       if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
                /*
                 * If interrupts are enabled and not blocked by sti or mov ss, good.
                 */
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        /*
         * Interrupts blocked.  Wait for unblock.
         */
-       if (!svm->vcpu.interrupt_window_open &&
-           (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
+       if (!svm->vcpu.arch.interrupt_window_open &&
+           (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
                control->intercept |= 1ULL << INTERCEPT_VINTR;
-       } else
+        else
                control->intercept &= ~(1ULL << INTERCEPT_VINTR);
 }
 
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       return 0;
+}
+
 static void save_db_regs(unsigned long *db_regs)
 {
        asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        svm->host_cr2 = kvm_read_cr2();
        svm->host_dr6 = read_dr6();
        svm->host_dr7 = read_dr7();
-       svm->vmcb->save.cr2 = vcpu->cr2;
+       svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
        if (svm->vmcb->save.dr7 & 0xff) {
                write_dr7(0);
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        asm volatile (
 #ifdef CONFIG_X86_64
-               "push %%rbx; push %%rcx; push %%rdx;"
-               "push %%rsi; push %%rdi; push %%rbp;"
-               "push %%r8;  push %%r9;  push %%r10; push %%r11;"
-               "push %%r12; push %%r13; push %%r14; push %%r15;"
+               "push %%rbp; \n\t"
 #else
-               "push %%ebx; push %%ecx; push %%edx;"
-               "push %%esi; push %%edi; push %%ebp;"
+               "push %%ebp; \n\t"
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
 
-               "pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-               "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-               "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-               "pop  %%rdx; pop  %%rcx; pop  %%rbx; \n\t"
+               "pop  %%rbp; \n\t"
 #else
                "mov %%ebx, %c[rbx](%[svm]) \n\t"
                "mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%edi, %c[rdi](%[svm]) \n\t"
                "mov %%ebp, %c[rbp](%[svm]) \n\t"
 
-               "pop  %%ebp; pop  %%edi; pop  %%esi;"
-               "pop  %%edx; pop  %%ecx; pop  %%ebx; \n\t"
+               "pop  %%ebp; \n\t"
 #endif
                :
                : [svm]"a"(svm),
                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-                 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
-                 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
-                 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
-                 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
-                 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
-                 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
+                 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+                 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+                 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+                 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+                 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+                 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
 #ifdef CONFIG_X86_64
-                 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
-                 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
-                 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
-                 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
-                 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
-                 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
-                 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
-                 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
+                 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+                 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+                 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+                 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+                 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+                 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+                 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+                 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
-               : "cc", "memory" );
+               : "cc", "memory"
+#ifdef CONFIG_X86_64
+               , "rbx", "rcx", "rdx", "rsi", "rdi"
+               , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "ecx", "edx" , "esi", "edi"
+#endif
+               );
 
        if ((svm->vmcb->save.dr7 & 0xff))
                load_db_regs(svm->host_db_regs);
 
-       vcpu->cr2 = svm->vmcb->save.cr2;
+       vcpu->arch.cr2 = svm->vmcb->save.cr2;
 
        write_dr6(svm->host_dr6);
        write_dr7(svm->host_dr7);
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
        }
 }
 
-static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
-                                 unsigned long  addr,
-                                 uint32_t err_code)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
-
-       ++vcpu->stat.pf_guest;
-
-       if (is_page_fault(exit_int_info)) {
-
-               svm->vmcb->control.event_inj_err = 0;
-               svm->vmcb->control.event_inj =  SVM_EVTINJ_VALID |
-                                               SVM_EVTINJ_VALID_ERR |
-                                               SVM_EVTINJ_TYPE_EXEPT |
-                                               DF_VECTOR;
-               return;
-       }
-       vcpu->cr2 = addr;
-       svm->vmcb->save.cr2 = addr;
-       svm->vmcb->control.event_inj =  SVM_EVTINJ_VALID |
-                                       SVM_EVTINJ_VALID_ERR |
-                                       SVM_EVTINJ_TYPE_EXEPT |
-                                       PF_VECTOR;
-       svm->vmcb->control.event_inj_err = err_code;
-}
-
-
 static int is_disabled(void)
 {
        u64 vm_cr;
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xd9;
-       hypercall[3] = 0xc3;
 }
 
 static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
        *(int *)rtn = 0;
 }
 
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+       return false;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .check_processor_compatibility = svm_check_processor_compat,
        .hardware_enable = svm_hardware_enable,
        .hardware_disable = svm_hardware_disable,
+       .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
 
        .vcpu_create = svm_create_vcpu,
        .vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
        .set_rflags = svm_set_rflags,
 
        .tlb_flush = svm_flush_tlb,
-       .inject_page_fault = svm_inject_page_fault,
-
-       .inject_gp = svm_inject_gp,
 
        .run = svm_vcpu_run,
        .handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
        .patch_hypercall = svm_patch_hypercall,
        .get_irq = svm_get_irq,
        .set_irq = svm_set_irq,
+       .queue_exception = svm_queue_exception,
+       .exception_injected = svm_exception_injected,
        .inject_pending_irq = svm_intr_assist,
        .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = svm_set_tss_addr,
 };
 
 static int __init svm_init(void)
 {
-       return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
+       return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
                              THIS_MODULE);
 }
 
 static void __exit svm_exit(void)
 {
-       kvm_exit_x86();
+       kvm_exit();
 }
 
 module_init(svm_init)
similarity index 98%
rename from drivers/kvm/svm.h
rename to arch/x86/kvm/svm.h
index 3b1b0f35b6cba172ac5fbb58cdceec93cdc7221c..5fd50491b55505fd1f02a033c3f244e134efdf5b 100644 (file)
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define INTERCEPT_CR0_MASK 1
 #define INTERCEPT_CR3_MASK (1 << 3)
 #define INTERCEPT_CR4_MASK (1 << 4)
+#define INTERCEPT_CR8_MASK (1 << 8)
 
 #define INTERCEPT_DR0_MASK 1
 #define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXIT_ERR           -1
 
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
+#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
 
 #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
 #define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
similarity index 75%
rename from drivers/kvm/vmx.c
rename to arch/x86/kvm/vmx.c
index bb56ae3f89b601f9c2ae428dd92498f35b8aa181..ad36447e696e6c80bbf70ce53e0b88a7e2c2bbe8 100644 (file)
  *
  */
 
-#include "kvm.h"
-#include "x86_emulate.h"
 #include "irq.h"
 #include "vmx.h"
 #include "segment_descriptor.h"
+#include "mmu.h"
 
+#include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/moduleparam.h>
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -33,6 +34,9 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static int bypass_guest_pf = 1;
+module_param(bypass_guest_pf, bool, 0);
+
 struct vmcs {
        u32 revision_id;
        u32 abort;
@@ -43,6 +47,7 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        int                   launched;
        u8                    fail;
+       u32                   idt_vectoring_info;
        struct kvm_msr_entry *guest_msrs;
        struct kvm_msr_entry *host_msrs;
        int                   nmsrs;
@@ -57,8 +62,15 @@ struct vcpu_vmx {
                u16           fs_sel, gs_sel, ldt_sel;
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
-       }host_state;
-
+               int           guest_efer_loaded;
+       } host_state;
+       struct {
+               struct {
+                       bool pending;
+                       u8 vector;
+                       unsigned rip;
+               } irq;
+       } rmode;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static struct page *vmx_io_bitmap_a;
 static struct page *vmx_io_bitmap_b;
 
-#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
-
 static struct vmcs_config {
        int size;
        int order;
        u32 revision_id;
        u32 pin_based_exec_ctrl;
        u32 cpu_based_exec_ctrl;
+       u32 cpu_based_2nd_exec_ctrl;
        u32 vmexit_ctrl;
        u32 vmentry_ctrl;
 } vmcs_config;
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
                rdmsrl(e[i].index, e[i].data);
 }
 
-static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
-{
-       return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
-}
-
-static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
-{
-       int efer_offset = vmx->msr_offset_efer;
-       return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
-               msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-}
-
 static inline int is_page_fault(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
                (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
+static inline int is_invalid_opcode(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+                            INTR_INFO_VALID_MASK)) ==
+               (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+}
+
 static inline int is_external_interrupt(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
        return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
 }
 
+static inline int cpu_has_secondary_exec_ctrls(void)
+{
+       return (vmcs_config.cpu_based_exec_ctrl &
+               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+       return (vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+{
+       return ((cpu_has_vmx_virtualize_apic_accesses()) &&
+               (irqchip_in_kernel(kvm)));
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
        int i;
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
                vmcs_clear(vmx->vmcs);
        if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       rdtscll(vmx->vcpu.host_tsc);
+       rdtscll(vmx->vcpu.arch.host_tsc);
 }
 
 static void vcpu_clear(struct vcpu_vmx *vmx)
 {
-       if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
-               smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
-                                        vmx, 0, 1);
-       else
-               __vcpu_clear(vmx);
+       if (vmx->vcpu.cpu == -1)
+               return;
+       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
        vmx->launched = 0;
 }
 
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
        u8 error;
 
        asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
-                      : "=q"(error) : "a"(value), "d"(field) : "cc" );
+                      : "=q"(error) : "a"(value), "d"(field) : "cc");
        if (unlikely(error))
                vmwrite_error(field, value);
 }
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
        u32 eb;
 
-       eb = 1u << PF_VECTOR;
+       eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
        if (!vcpu->fpu_active)
                eb |= 1u << NM_VECTOR;
        if (vcpu->guest_debug.enabled)
                eb |= 1u << 1;
-       if (vcpu->rmode.active)
+       if (vcpu->arch.rmode.active)
                eb = ~0;
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -344,16 +366,42 @@ static void reload_tss(void)
 
 static void load_transition_efer(struct vcpu_vmx *vmx)
 {
-       u64 trans_efer;
        int efer_offset = vmx->msr_offset_efer;
+       u64 host_efer = vmx->host_msrs[efer_offset].data;
+       u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+       u64 ignore_bits;
 
-       trans_efer = vmx->host_msrs[efer_offset].data;
-       trans_efer &= ~EFER_SAVE_RESTORE_BITS;
-       trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
-       wrmsrl(MSR_EFER, trans_efer);
+       if (efer_offset < 0)
+               return;
+       /*
+        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
+        * outside long mode
+        */
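+       /*
+        * For example, host and guest EFER values that differ only in bits
+        * placed in ignore_bits (such as EFER_NX) compare equal below, so no
+        * EFER write is needed on every guest entry/exit in that common case.
+        */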
+       ignore_bits = EFER_NX | EFER_SCE;
+#ifdef CONFIG_X86_64
+       ignore_bits |= EFER_LMA | EFER_LME;
+       /* SCE is meaningful only in long mode on Intel */
+       if (guest_efer & EFER_LMA)
+               ignore_bits &= ~(u64)EFER_SCE;
+#endif
+       if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
+               return;
+
+       vmx->host_state.guest_efer_loaded = 1;
+       guest_efer &= ~ignore_bits;
+       guest_efer |= host_efer & ignore_bits;
+       wrmsrl(MSR_EFER, guest_efer);
        vmx->vcpu.stat.efer_reload++;
 }
 
+static void reload_host_efer(struct vcpu_vmx *vmx)
+{
+       if (vmx->host_state.guest_efer_loaded) {
+               vmx->host_state.guest_efer_loaded = 0;
+               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+       }
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu)) {
+       if (is_long_mode(&vmx->vcpu))
                save_msrs(vmx->host_msrs +
                          vmx->msr_offset_kernel_gs_base, 1);
-       }
+
 #endif
        load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-       if (msr_efer_need_save_restore(vmx))
-               load_transition_efer(vmx);
+       load_transition_efer(vmx);
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        if (!vmx->host_state.loaded)
                return;
 
+       ++vmx->vcpu.stat.host_state_reload;
        vmx->host_state.loaded = 0;
        if (vmx->host_state.fs_reload_needed)
                load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        reload_tss();
        save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
        load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-       if (msr_efer_need_save_restore(vmx))
-               load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
+       reload_host_efer(vmx);
 }
 
 /*
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * Make sure the time stamp counter is monotonous.
                 */
                rdtscll(tsc_this);
-               delta = vcpu->host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - tsc_this;
                vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
        }
 }
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        vmx_load_host_state(to_vmx(vcpu));
-       kvm_put_guest_fpu(vcpu);
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
                return;
        vcpu->fpu_active = 1;
        vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
-       if (vcpu->cr0 & X86_CR0_TS)
+       if (vcpu->arch.cr0 & X86_CR0_TS)
                vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
        update_exception_bitmap(vcpu);
 }
@@ -523,8 +569,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-       if (vcpu->rmode.active)
-               rflags |= IOPL_MASK | X86_EFLAGS_VM;
+       if (vcpu->arch.rmode.active)
+               rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
        vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (interruptibility & 3)
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                             interruptibility & ~3);
-       vcpu->interrupt_window_open = 1;
+       vcpu->arch.interrupt_window_open = 1;
 }
 
-static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code)
 {
-       printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
-              vmcs_readl(GUEST_RIP));
-       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    GP_VECTOR |
-                    INTR_TYPE_EXCEPTION |
-                    INTR_INFO_DELIEVER_CODE_MASK |
-                    INTR_INFO_VALID_MASK);
+                    nr | INTR_TYPE_EXCEPTION
+                    | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+                    | INTR_INFO_VALID_MASK);
+       if (has_error_code)
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+}
+
+static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
 }
 
 /*
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                 * if efer.sce is enabled.
                 */
                index = __find_msr_index(vmx, MSR_K6_STAR);
-               if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
+               if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
                        move_msr_up(vmx, index, save_nmsrs++);
        }
 #endif
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 #ifdef CONFIG_X86_64
        case MSR_EFER:
                ret = kvm_set_msr_common(vcpu, msr_index, data);
-               if (vmx->host_state.loaded)
+               if (vmx->host_state.loaded) {
+                       reload_host_efer(vmx);
                        load_transition_efer(vmx);
+               }
                break;
        case MSR_FS_BASE:
                vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
 /*
  * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->regs.
+ * registers to be accessed by indexing vcpu->arch.regs.
  */
 static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
 {
-       vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-       vcpu->rip = vmcs_readl(GUEST_RIP);
+       vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+       vcpu->arch.rip = vmcs_readl(GUEST_RIP);
 }
 
 /*
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
  */
 static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
 {
-       vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
-       vmcs_writel(GUEST_RIP, vcpu->rip);
+       vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+       vmcs_writel(GUEST_RIP, vcpu->arch.rip);
 }
 
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 idtv_info_field;
 
-       idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       idtv_info_field = vmx->idt_vectoring_info;
        if (idtv_info_field & INTR_INFO_VALID_MASK) {
                if (is_external_interrupt(idtv_info_field))
                        return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
                else
-                       printk("pending exception: not handled yet\n");
+                       printk(KERN_DEBUG "pending exception: not handled yet\n");
        }
        return -1;
 }
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-                                     u32 msr, u32* result)
+                                     u32 msr, u32 *result)
 {
        u32 vmx_msr_low, vmx_msr_high;
        u32 ctl = ctl_min | ctl_opt;
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        u32 min, opt;
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
+       u32 _cpu_based_2nd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
 
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_USE_IO_BITMAPS |
              CPU_BASED_MOV_DR_EXITING |
              CPU_BASED_USE_TSC_OFFSETING;
-#ifdef CONFIG_X86_64
-       opt = CPU_BASED_TPR_SHADOW;
-#else
-       opt = 0;
-#endif
+       opt = CPU_BASED_TPR_SHADOW |
+             CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control) < 0)
                return -EIO;
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
                                           ~CPU_BASED_CR8_STORE_EXITING;
 #endif
+       if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+               min = 0;
+               opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                       SECONDARY_EXEC_WBINVD_EXITING;
+               if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
+                                       &_cpu_based_2nd_exec_control) < 0)
+                       return -EIO;
+       }
+#ifndef CONFIG_X86_64
+       if (!(_cpu_based_2nd_exec_control &
+                               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+               _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
 
        min = 0;
 #ifdef CONFIG_X86_64
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+       vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
 
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
 
-       vcpu->rmode.active = 0;
+       vcpu->arch.rmode.active = 0;
 
-       vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
-       vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
-       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
+       vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
+       vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
+       vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
 
        flags = vmcs_readl(GUEST_RFLAGS);
-       flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
-       flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
+       flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+       flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
        vmcs_writel(GUEST_RFLAGS, flags);
 
        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
        update_exception_bitmap(vcpu);
 
-       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
-       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
-       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
-       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
+       fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
        vmcs_write16(GUEST_SS_SELECTOR, 0);
        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 }
 
-static gva_t rmode_tss_base(struct kvm* kvm)
+static gva_t rmode_tss_base(struct kvm *kvm)
 {
-       gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
-       return base_gfn << PAGE_SHIFT;
+       if (!kvm->arch.tss_addr) {
+               gfn_t base_gfn = kvm->memslots[0].base_gfn +
+                                kvm->memslots[0].npages - 3;
+               return base_gfn << PAGE_SHIFT;
+       }
+       return kvm->arch.tss_addr;
 }
 
 static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
        save->base = vmcs_readl(sf->base);
        save->limit = vmcs_read32(sf->limit);
        save->ar = vmcs_read32(sf->ar_bytes);
-       vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
+       vmcs_write16(sf->selector, save->base >> 4);
+       vmcs_write32(sf->base, save->base & 0xfffff);
        vmcs_write32(sf->limit, 0xffff);
        vmcs_write32(sf->ar_bytes, 0xf3);
 }
@@ -1095,21 +1167,22 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
 
-       vcpu->rmode.active = 1;
+       vcpu->arch.rmode.active = 1;
 
-       vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
+       vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
 
-       vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
+       vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
 
-       vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
+       vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
        flags = vmcs_readl(GUEST_RFLAGS);
-       vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
+       vcpu->arch.rmode.save_iopl
+               = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
 
-       flags |= IOPL_MASK | X86_EFLAGS_VM;
+       flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 
        vmcs_writel(GUEST_RFLAGS, flags);
        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
                vmcs_writel(GUEST_CS_BASE, 0xf0000);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
 
-       fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
-       fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
-       fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
-       fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
+       fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
+       fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
+       fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
+       fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
        kvm_mmu_reset_context(vcpu);
        init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
                             | AR_TYPE_BUSY_64_TSS);
        }
 
-       vcpu->shadow_efer |= EFER_LMA;
+       vcpu->arch.shadow_efer |= EFER_LMA;
 
        find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
        vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
 
 static void exit_lmode(struct kvm_vcpu *vcpu)
 {
-       vcpu->shadow_efer &= ~EFER_LMA;
+       vcpu->arch.shadow_efer &= ~EFER_LMA;
 
        vmcs_write32(VM_ENTRY_CONTROLS,
                     vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 {
-       vcpu->cr4 &= KVM_GUEST_CR4_MASK;
-       vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
+       vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
+       vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        vmx_fpu_deactivate(vcpu);
 
-       if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
+       if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
                enter_pmode(vcpu);
 
-       if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
+       if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
                enter_rmode(vcpu);
 
 #ifdef CONFIG_X86_64
-       if (vcpu->shadow_efer & EFER_LME) {
+       if (vcpu->arch.shadow_efer & EFER_LME) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
                        enter_lmode(vcpu);
                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0,
                    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
 
        if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
                vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        vmcs_writel(GUEST_CR3, cr3);
-       if (vcpu->cr0 & X86_CR0_PE)
+       if (vcpu->arch.cr0 & X86_CR0_PE)
                vmx_fpu_deactivate(vcpu);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        vmcs_writel(CR4_READ_SHADOW, cr4);
-       vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
+       vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
-       vcpu->cr4 = cr4;
+       vcpu->arch.cr4 = cr4;
 }
 
 #ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 
-       vcpu->shadow_efer = efer;
+       vcpu->arch.shadow_efer = efer;
        if (efer & EFER_LMA) {
                vmcs_write32(VM_ENTRY_CONTROLS,
                                     vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        u32 ar;
 
-       if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
-               vcpu->rmode.tr.selector = var->selector;
-               vcpu->rmode.tr.base = var->base;
-               vcpu->rmode.tr.limit = var->limit;
-               vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
+       if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+               vcpu->arch.rmode.tr.selector = var->selector;
+               vcpu->arch.rmode.tr.base = var->base;
+               vcpu->arch.rmode.tr.limit = var->limit;
+               vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
                return;
        }
        vmcs_writel(sf->base, var->base);
        vmcs_write32(sf->limit, var->limit);
        vmcs_write16(sf->selector, var->selector);
-       if (vcpu->rmode.active && var->s) {
+       if (vcpu->arch.rmode.active && var->s) {
                /*
                 * Hack real-mode segments into vm86 compatibility.
                 */
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
        vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
-static int init_rmode_tss(struct kvm* kvm)
+static int init_rmode_tss(struct kvm *kvm)
 {
-       struct page *p1, *p2, *p3;
        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-       char *page;
-
-       p1 = gfn_to_page(kvm, fn++);
-       p2 = gfn_to_page(kvm, fn++);
-       p3 = gfn_to_page(kvm, fn);
-
-       if (!p1 || !p2 || !p3) {
-               kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
-               return 0;
-       }
-
-       page = kmap_atomic(p1, KM_USER0);
-       clear_page(page);
-       *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-       kunmap_atomic(page, KM_USER0);
-
-       page = kmap_atomic(p2, KM_USER0);
-       clear_page(page);
-       kunmap_atomic(page, KM_USER0);
+       u16 data = 0;
+       int ret = 0;
+       int r;
 
-       page = kmap_atomic(p3, KM_USER0);
-       clear_page(page);
-       *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
-       kunmap_atomic(page, KM_USER0);
+       down_read(&current->mm->mmap_sem);
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+       r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+       if (r < 0)
+               goto out;
+       r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+       if (r < 0)
+               goto out;
+       data = ~0;
+       r = kvm_write_guest_page(kvm, fn, &data,
+                                RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+                                sizeof(u8));
+       if (r < 0)
+               goto out;
 
-       return 1;
+       ret = 1;
+out:
+       up_read(&current->mm->mmap_sem);
+       return ret;
 }
 
 static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
        vmcs_write32(sf->ar_bytes, 0x93);
 }
 
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+       struct kvm_userspace_memory_region kvm_userspace_mem;
+       int r = 0;
+
+       down_write(&current->mm->mmap_sem);
+       if (kvm->arch.apic_access_page)
+               goto out;
+       kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+       kvm_userspace_mem.flags = 0;
+       kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+       kvm_userspace_mem.memory_size = PAGE_SIZE;
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
+       if (r)
+               goto out;
+       kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+out:
+       up_write(&current->mm->mmap_sem);
+       return r;
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        unsigned long a;
        struct descriptor_table dt;
        int i;
-       int ret = 0;
        unsigned long kvm_vmx_return;
-       u64 msr;
        u32 exec_control;
 
-       if (!init_rmode_tss(vmx->vcpu.kvm)) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       vmx->vcpu.rmode.active = 0;
-
-       vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-       set_cr8(&vmx->vcpu, 0);
-       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-       if (vmx->vcpu.vcpu_id == 0)
-               msr |= MSR_IA32_APICBASE_BSP;
-       kvm_set_apic_base(&vmx->vcpu, msr);
-
-       fx_init(&vmx->vcpu);
-
-       /*
-        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
-        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
-        */
-       if (vmx->vcpu.vcpu_id == 0) {
-               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
-       } else {
-               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
-               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
-       }
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
-
-       seg_setup(VCPU_SREG_DS);
-       seg_setup(VCPU_SREG_ES);
-       seg_setup(VCPU_SREG_FS);
-       seg_setup(VCPU_SREG_GS);
-       seg_setup(VCPU_SREG_SS);
-
-       vmcs_write16(GUEST_TR_SELECTOR, 0);
-       vmcs_writel(GUEST_TR_BASE, 0);
-       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
-       vmcs_writel(GUEST_LDTR_BASE, 0);
-       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
-       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
-       vmcs_write32(GUEST_SYSENTER_CS, 0);
-       vmcs_writel(GUEST_SYSENTER_ESP, 0);
-       vmcs_writel(GUEST_SYSENTER_EIP, 0);
-
-       vmcs_writel(GUEST_RFLAGS, 0x02);
-       if (vmx->vcpu.vcpu_id == 0)
-               vmcs_writel(GUEST_RIP, 0xfff0);
-       else
-               vmcs_writel(GUEST_RIP, 0);
-       vmcs_writel(GUEST_RSP, 0);
-
-       //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
-       vmcs_writel(GUEST_DR7, 0x400);
-
-       vmcs_writel(GUEST_GDTR_BASE, 0);
-       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
-       vmcs_writel(GUEST_IDTR_BASE, 0);
-       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
-       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
-       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-
        /* I/O */
        vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
        vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
 
-       guest_write_tsc(0);
-
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
-       /* Special registers */
-       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
        /* Control */
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
                vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        }
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
-       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+       if (cpu_has_secondary_exec_ctrls()) {
+               exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+               if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+                       exec_control &=
+                               ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+       }
+
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
+       vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
        vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        get_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
 
-       asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
+       asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
        vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       setup_msrs(vmx);
-
        vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 
        /* 22.2.1, 20.8.1 */
        vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
 
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
-
-#ifdef CONFIG_X86_64
-       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-       if (vm_need_tpr_shadow(vmx->vcpu.kvm))
-               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-                            page_to_phys(vmx->vcpu.apic->regs_page));
-       vmcs_write32(TPR_THRESHOLD, 0);
-#endif
-
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
 
-       vmx->vcpu.cr0 = 0x60000010;
-       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
-       vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
-       vmx_set_efer(&vmx->vcpu, 0);
-#endif
-       vmx_fpu_activate(&vmx->vcpu);
-       update_exception_bitmap(&vmx->vcpu);
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
+                       return -ENOMEM;
 
        return 0;
-
-out:
-       return ret;
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 msr;
+       int ret;
 
-       vmx_vcpu_setup(vmx);
-}
-
-static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
-{
-       u16 ent[2];
-       u16 cs;
-       u16 ip;
-       unsigned long flags;
-       unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
-       u16 sp =  vmcs_readl(GUEST_RSP);
-       u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
-
-       if (sp > ss_limit || sp < 6 ) {
-               vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
-                           __FUNCTION__,
-                           vmcs_readl(GUEST_RSP),
-                           vmcs_readl(GUEST_SS_BASE),
-                           vmcs_read32(GUEST_SS_LIMIT));
-               return;
+       if (!init_rmode_tss(vmx->vcpu.kvm)) {
+               ret = -ENOMEM;
+               goto out;
        }
 
-       if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
-                                                       X86EMUL_CONTINUE) {
-               vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
-               return;
+       vmx->vcpu.arch.rmode.active = 0;
+
+       vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       set_cr8(&vmx->vcpu, 0);
+       msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+       if (vmx->vcpu.vcpu_id == 0)
+               msr |= MSR_IA32_APICBASE_BSP;
+       kvm_set_apic_base(&vmx->vcpu, msr);
+
+       fx_init(&vmx->vcpu);
+
+       /*
+        * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
+        * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
+        */
+       if (vmx->vcpu.vcpu_id == 0) {
+               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+               vmcs_writel(GUEST_CS_BASE, 0x000f0000);
+       } else {
+               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
+               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
        }
+       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+
+       seg_setup(VCPU_SREG_DS);
+       seg_setup(VCPU_SREG_ES);
+       seg_setup(VCPU_SREG_FS);
+       seg_setup(VCPU_SREG_GS);
+       seg_setup(VCPU_SREG_SS);
+
+       vmcs_write16(GUEST_TR_SELECTOR, 0);
+       vmcs_writel(GUEST_TR_BASE, 0);
+       vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
-       flags =  vmcs_readl(GUEST_RFLAGS);
-       cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
-       ip =  vmcs_readl(GUEST_RIP);
+       vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+       vmcs_writel(GUEST_LDTR_BASE, 0);
+       vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+       vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
+       vmcs_write32(GUEST_SYSENTER_CS, 0);
+       vmcs_writel(GUEST_SYSENTER_ESP, 0);
+       vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
-       if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
-           emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
-           emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
-               vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
-               return;
+       vmcs_writel(GUEST_RFLAGS, 0x02);
+       if (vmx->vcpu.vcpu_id == 0)
+               vmcs_writel(GUEST_RIP, 0xfff0);
+       else
+               vmcs_writel(GUEST_RIP, 0);
+       vmcs_writel(GUEST_RSP, 0);
+
+       /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
+       vmcs_writel(GUEST_DR7, 0x400);
+
+       vmcs_writel(GUEST_GDTR_BASE, 0);
+       vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+       vmcs_writel(GUEST_IDTR_BASE, 0);
+       vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+       vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+       vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+
+       guest_write_tsc(0);
+
+       /* Special registers */
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       setup_msrs(vmx);
+
+       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+
+       if (cpu_has_vmx_tpr_shadow()) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+               if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+                       vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+                               page_to_phys(vmx->vcpu.arch.apic->regs_page));
+               vmcs_write32(TPR_THRESHOLD, 0);
        }
 
-       vmcs_writel(GUEST_RFLAGS, flags &
-                   ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
-       vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
-       vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
-       vmcs_writel(GUEST_RIP, ent[0]);
-       vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
+       if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+               vmcs_write64(APIC_ACCESS_ADDR,
+                            page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+
+       vmx->vcpu.arch.cr0 = 0x60000010;
+       vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
+       vmx_set_cr4(&vmx->vcpu, 0);
+#ifdef CONFIG_X86_64
+       vmx_set_efer(&vmx->vcpu, 0);
+#endif
+       vmx_fpu_activate(&vmx->vcpu);
+       update_exception_bitmap(&vmx->vcpu);
+
+       return 0;
+
+out:
+       return ret;
 }
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
-       if (vcpu->rmode.active) {
-               inject_rmode_irq(vcpu, irq);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (vcpu->arch.rmode.active) {
+               vmx->rmode.irq.pending = true;
+               vmx->rmode.irq.vector = irq;
+               vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
                return;
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
-       int word_index = __ffs(vcpu->irq_summary);
-       int bit_index = __ffs(vcpu->irq_pending[word_index]);
+       int word_index = __ffs(vcpu->arch.irq_summary);
+       int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
        int irq = word_index * BITS_PER_LONG + bit_index;
 
-       clear_bit(bit_index, &vcpu->irq_pending[word_index]);
-       if (!vcpu->irq_pending[word_index])
-               clear_bit(word_index, &vcpu->irq_summary);
+       clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+       if (!vcpu->arch.irq_pending[word_index])
+               clear_bit(word_index, &vcpu->arch.irq_summary);
        vmx_inject_irq(vcpu, irq);
 }
 
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 {
        u32 cpu_based_vm_exec_control;
 
-       vcpu->interrupt_window_open =
+       vcpu->arch.interrupt_window_open =
                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
-       if (vcpu->interrupt_window_open &&
-           vcpu->irq_summary &&
+       if (vcpu->arch.interrupt_window_open &&
+           vcpu->arch.irq_summary &&
            !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
                /*
                 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                kvm_do_inject_irq(vcpu);
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       if (!vcpu->interrupt_window_open &&
-           (vcpu->irq_summary || kvm_run->request_interrupt_window))
+       if (!vcpu->arch.interrupt_window_open &&
+           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
                /*
                 * Interrupts blocked.  Wait for unblock.
                 */
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+       int ret;
+       struct kvm_userspace_memory_region tss_mem = {
+               .slot = 8,
+               .guest_phys_addr = addr,
+               .memory_size = PAGE_SIZE * 3,
+               .flags = 0,
+       };
+
+       ret = kvm_set_memory_region(kvm, &tss_mem, 0);
+       if (ret)
+               return ret;
+       kvm->arch.tss_addr = addr;
+       return 0;
+}
+
 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 {
        struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                                  int vec, u32 err_code)
 {
-       if (!vcpu->rmode.active)
+       if (!vcpu->arch.rmode.active)
                return 0;
 
        /*
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
-               if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
+               if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
                        return 1;
        return 0;
 }
 
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info, error_code;
        unsigned long cr2, rip;
        u32 vect_info;
        enum emulation_result er;
-       int r;
 
-       vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       vect_info = vmx->idt_vectoring_info;
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-                                               !is_page_fault(intr_info)) {
+                                               !is_page_fault(intr_info))
                printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
                       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
-       }
 
        if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
                int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-               set_bit(irq, vcpu->irq_pending);
-               set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
+               set_bit(irq, vcpu->arch.irq_pending);
+               set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
        }
 
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                return 1;
        }
 
+       if (is_invalid_opcode(intr_info)) {
+               er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+               if (er != EMULATE_DONE)
+                       kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
        error_code = 0;
        rip = vmcs_readl(GUEST_RIP);
        if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
-
-               mutex_lock(&vcpu->kvm->lock);
-               r = kvm_mmu_page_fault(vcpu, cr2, error_code);
-               if (r < 0) {
-                       mutex_unlock(&vcpu->kvm->lock);
-                       return r;
-               }
-               if (!r) {
-                       mutex_unlock(&vcpu->kvm->lock);
-                       return 1;
-               }
-
-               er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
-               mutex_unlock(&vcpu->kvm->lock);
-
-               switch (er) {
-               case EMULATE_DONE:
-                       return 1;
-               case EMULATE_DO_MMIO:
-                       ++vcpu->stat.mmio_exits;
-                       return 0;
-                case EMULATE_FAIL:
-                       kvm_report_emulation_failure(vcpu, "pagetable");
-                       break;
-               default:
-                       BUG();
-               }
+               return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
 
-       if (vcpu->rmode.active &&
+       if (vcpu->arch.rmode.active &&
            handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
                                                                error_code)) {
-               if (vcpu->halt_request) {
-                       vcpu->halt_request = 0;
+               if (vcpu->arch.halt_request) {
+                       vcpu->arch.halt_request = 0;
                        return kvm_emulate_halt(vcpu);
                }
                return 1;
        }
 
-       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
+       if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
+           (INTR_TYPE_EXCEPTION | 1)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                return 0;
        }
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        string = (exit_qualification & 16) != 0;
 
        if (string) {
-               if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
+               if (emulate_instruction(vcpu,
+                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xc1;
-       hypercall[3] = 0xc3;
 }
 
 static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                switch (cr) {
                case 0:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr0(vcpu, vcpu->regs[reg]);
+                       set_cr0(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr3(vcpu, vcpu->regs[reg]);
+                       set_cr3(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 4:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr4(vcpu, vcpu->regs[reg]);
+                       set_cr4(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                       set_cr8(vcpu, vcpu->regs[reg]);
+                       set_cr8(vcpu, vcpu->arch.regs[reg]);
                        skip_emulated_instruction(vcpu);
+                       if (irqchip_in_kernel(vcpu->kvm))
+                               return 1;
                        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
                        return 0;
                };
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        case 2: /* clts */
                vcpu_load_rsp_rip(vcpu);
                vmx_fpu_deactivate(vcpu);
-               vcpu->cr0 &= ~X86_CR0_TS;
-               vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
                vmx_fpu_activate(vcpu);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                switch (cr) {
                case 3:
                        vcpu_load_rsp_rip(vcpu);
-                       vcpu->regs[reg] = vcpu->cr3;
+                       vcpu->arch.regs[reg] = vcpu->arch.cr3;
                        vcpu_put_rsp_rip(vcpu);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
                        vcpu_load_rsp_rip(vcpu);
-                       vcpu->regs[reg] = get_cr8(vcpu);
+                       vcpu->arch.regs[reg] = get_cr8(vcpu);
                        vcpu_put_rsp_rip(vcpu);
                        skip_emulated_instruction(vcpu);
                        return 1;
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                default:
                        val = 0;
                }
-               vcpu->regs[reg] = val;
+               vcpu->arch.regs[reg] = val;
        } else {
                /* mov to dr */
        }
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       u32 ecx = vcpu->regs[VCPU_REGS_RCX];
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
        u64 data;
 
        if (vmx_get_msr(vcpu, ecx, &data)) {
-               vmx_inject_gp(vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        /* FIXME: handling of bits 32:63 of rax, rdx */
-       vcpu->regs[VCPU_REGS_RAX] = data & -1u;
-       vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+       vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
+       vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
        skip_emulated_instruction(vcpu);
        return 1;
 }
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       u32 ecx = vcpu->regs[VCPU_REGS_RCX];
-       u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
-               | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
+       u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+       u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+               | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
        if (vmx_set_msr(vcpu, ecx, data) != 0) {
-               vmx_inject_gp(vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
         * possible
         */
        if (kvm_run->request_interrupt_window &&
-           !vcpu->irq_summary) {
+           !vcpu->arch.irq_summary) {
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
                ++vcpu->stat.irq_window_exits;
                return 0;
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        skip_emulated_instruction(vcpu);
-       return kvm_hypercall(vcpu, kvm_run);
+       kvm_emulate_hypercall(vcpu);
+       return 1;
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       skip_emulated_instruction(vcpu);
+       /* TODO: Add support for VT-d/pass-through device */
+       return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u64 exit_qualification;
+       enum emulation_result er;
+       unsigned long offset;
+
+       exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+       offset = exit_qualification & 0xffful;
+
+       er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+       if (er !=  EMULATE_DONE) {
+               printk(KERN_ERR
+                      "Fail to handle apic access vmexit! Offset is 0x%lx\n",
+                      offset);
+               return -ENOTSUPP;
+       }
+       return 1;
 }
 
 /*
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = handle_halt,
        [EXIT_REASON_VMCALL]                  = handle_vmcall,
-       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold
+       [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
+       [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
+       [EXIT_REASON_WBINVD]                  = handle_wbinvd,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
  */
 static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-       u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 vectoring_info = vmx->idt_vectoring_info;
 
        if (unlikely(vmx->fail)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                               exit_reason != EXIT_REASON_EXCEPTION_NMI )
+       if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                               exit_reason != EXIT_REASON_EXCEPTION_NMI)
                printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
                       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
        if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 idtv_info_field, intr_info_field;
        int has_ext_irq, interrupt_window_open;
        int vector;
 
-       kvm_inject_pending_timer_irqs(vcpu);
        update_tpr_threshold(vcpu);
 
        has_ext_irq = kvm_cpu_has_interrupt(vcpu);
        intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-       idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       idtv_info_field = vmx->idt_vectoring_info;
        if (intr_info_field & INTR_INFO_VALID_MASK) {
                if (idtv_info_field & INTR_INFO_VALID_MASK) {
                        /* TODO: fault when IDT_Vectoring */
-                       printk(KERN_ERR "Fault when IDT_Vectoring\n");
+                       if (printk_ratelimit())
+                               printk(KERN_ERR "Fault when IDT_Vectoring\n");
                }
                if (has_ext_irq)
                        enable_irq_window(vcpu);
                return;
        }
        if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
+               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
+                   == INTR_TYPE_EXT_INTR
+                   && vcpu->arch.rmode.active) {
+                       u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
+
+                       vmx_inject_irq(vcpu, vect);
+                       if (unlikely(has_ext_irq))
+                               enable_irq_window(vcpu);
+                       return;
+               }
+
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
                enable_irq_window(vcpu);
 }
 
+/*
+ * Failure to inject an interrupt should give us the information
+ * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
+ * when fetching the interrupt redirection bitmap in the real-mode
+ * tss, this doesn't happen.  So we do it ourselves.
+ */
+static void fixup_rmode_irq(struct vcpu_vmx *vmx)
+{
+       vmx->rmode.irq.pending = 0;
+       if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+               return;
+       vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+       if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+               vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
+               vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
+               return;
+       }
+       vmx->idt_vectoring_info =
+               VECTORING_INFO_VALID_MASK
+               | INTR_TYPE_EXT_INTR
+               | vmx->rmode.irq.vector;
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         */
        vmcs_writel(HOST_CR0, read_cr0());
 
-       asm (
+       asm(
                /* Store host registers */
 #ifdef CONFIG_X86_64
-               "push %%rax; push %%rbx; push %%rdx;"
-               "push %%rsi; push %%rdi; push %%rbp;"
-               "push %%r8;  push %%r9;  push %%r10; push %%r11;"
-               "push %%r12; push %%r13; push %%r14; push %%r15;"
+               "push %%rdx; push %%rbp;"
                "push %%rcx \n\t"
-               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
 #else
-               "pusha; push %%ecx \n\t"
-               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
+               "push %%edx; push %%ebp;"
+               "push %%ecx \n\t"
 #endif
+               ASM_VMX_VMWRITE_RSP_RDX "\n\t"
                /* Check if vmlaunch of vmresume is needed */
-               "cmp $0, %1 \n\t"
+               "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
 #ifdef CONFIG_X86_64
-               "mov %c[cr2](%3), %%rax \n\t"
+               "mov %c[cr2](%0), %%rax \n\t"
                "mov %%rax, %%cr2 \n\t"
-               "mov %c[rax](%3), %%rax \n\t"
-               "mov %c[rbx](%3), %%rbx \n\t"
-               "mov %c[rdx](%3), %%rdx \n\t"
-               "mov %c[rsi](%3), %%rsi \n\t"
-               "mov %c[rdi](%3), %%rdi \n\t"
-               "mov %c[rbp](%3), %%rbp \n\t"
-               "mov %c[r8](%3),  %%r8  \n\t"
-               "mov %c[r9](%3),  %%r9  \n\t"
-               "mov %c[r10](%3), %%r10 \n\t"
-               "mov %c[r11](%3), %%r11 \n\t"
-               "mov %c[r12](%3), %%r12 \n\t"
-               "mov %c[r13](%3), %%r13 \n\t"
-               "mov %c[r14](%3), %%r14 \n\t"
-               "mov %c[r15](%3), %%r15 \n\t"
-               "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
+               "mov %c[rax](%0), %%rax \n\t"
+               "mov %c[rbx](%0), %%rbx \n\t"
+               "mov %c[rdx](%0), %%rdx \n\t"
+               "mov %c[rsi](%0), %%rsi \n\t"
+               "mov %c[rdi](%0), %%rdi \n\t"
+               "mov %c[rbp](%0), %%rbp \n\t"
+               "mov %c[r8](%0),  %%r8  \n\t"
+               "mov %c[r9](%0),  %%r9  \n\t"
+               "mov %c[r10](%0), %%r10 \n\t"
+               "mov %c[r11](%0), %%r11 \n\t"
+               "mov %c[r12](%0), %%r12 \n\t"
+               "mov %c[r13](%0), %%r13 \n\t"
+               "mov %c[r14](%0), %%r14 \n\t"
+               "mov %c[r15](%0), %%r15 \n\t"
+               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
 #else
-               "mov %c[cr2](%3), %%eax \n\t"
+               "mov %c[cr2](%0), %%eax \n\t"
                "mov %%eax,   %%cr2 \n\t"
-               "mov %c[rax](%3), %%eax \n\t"
-               "mov %c[rbx](%3), %%ebx \n\t"
-               "mov %c[rdx](%3), %%edx \n\t"
-               "mov %c[rsi](%3), %%esi \n\t"
-               "mov %c[rdi](%3), %%edi \n\t"
-               "mov %c[rbp](%3), %%ebp \n\t"
-               "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
+               "mov %c[rax](%0), %%eax \n\t"
+               "mov %c[rbx](%0), %%ebx \n\t"
+               "mov %c[rdx](%0), %%edx \n\t"
+               "mov %c[rsi](%0), %%esi \n\t"
+               "mov %c[rdi](%0), %%edi \n\t"
+               "mov %c[rbp](%0), %%ebp \n\t"
+               "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
 #endif
                /* Enter guest mode */
                "jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
 #ifdef CONFIG_X86_64
-               "xchg %3,     (%%rsp) \n\t"
-               "mov %%rax, %c[rax](%3) \n\t"
-               "mov %%rbx, %c[rbx](%3) \n\t"
-               "pushq (%%rsp); popq %c[rcx](%3) \n\t"
-               "mov %%rdx, %c[rdx](%3) \n\t"
-               "mov %%rsi, %c[rsi](%3) \n\t"
-               "mov %%rdi, %c[rdi](%3) \n\t"
-               "mov %%rbp, %c[rbp](%3) \n\t"
-               "mov %%r8,  %c[r8](%3) \n\t"
-               "mov %%r9,  %c[r9](%3) \n\t"
-               "mov %%r10, %c[r10](%3) \n\t"
-               "mov %%r11, %c[r11](%3) \n\t"
-               "mov %%r12, %c[r12](%3) \n\t"
-               "mov %%r13, %c[r13](%3) \n\t"
-               "mov %%r14, %c[r14](%3) \n\t"
-               "mov %%r15, %c[r15](%3) \n\t"
+               "xchg %0,     (%%rsp) \n\t"
+               "mov %%rax, %c[rax](%0) \n\t"
+               "mov %%rbx, %c[rbx](%0) \n\t"
+               "pushq (%%rsp); popq %c[rcx](%0) \n\t"
+               "mov %%rdx, %c[rdx](%0) \n\t"
+               "mov %%rsi, %c[rsi](%0) \n\t"
+               "mov %%rdi, %c[rdi](%0) \n\t"
+               "mov %%rbp, %c[rbp](%0) \n\t"
+               "mov %%r8,  %c[r8](%0) \n\t"
+               "mov %%r9,  %c[r9](%0) \n\t"
+               "mov %%r10, %c[r10](%0) \n\t"
+               "mov %%r11, %c[r11](%0) \n\t"
+               "mov %%r12, %c[r12](%0) \n\t"
+               "mov %%r13, %c[r13](%0) \n\t"
+               "mov %%r14, %c[r14](%0) \n\t"
+               "mov %%r15, %c[r15](%0) \n\t"
                "mov %%cr2, %%rax   \n\t"
-               "mov %%rax, %c[cr2](%3) \n\t"
-               "mov (%%rsp), %3 \n\t"
+               "mov %%rax, %c[cr2](%0) \n\t"
 
-               "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
-               "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
-               "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
-               "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
+               "pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
 #else
-               "xchg %3, (%%esp) \n\t"
-               "mov %%eax, %c[rax](%3) \n\t"
-               "mov %%ebx, %c[rbx](%3) \n\t"
-               "pushl (%%esp); popl %c[rcx](%3) \n\t"
-               "mov %%edx, %c[rdx](%3) \n\t"
-               "mov %%esi, %c[rsi](%3) \n\t"
-               "mov %%edi, %c[rdi](%3) \n\t"
-               "mov %%ebp, %c[rbp](%3) \n\t"
+               "xchg %0, (%%esp) \n\t"
+               "mov %%eax, %c[rax](%0) \n\t"
+               "mov %%ebx, %c[rbx](%0) \n\t"
+               "pushl (%%esp); popl %c[rcx](%0) \n\t"
+               "mov %%edx, %c[rdx](%0) \n\t"
+               "mov %%esi, %c[rsi](%0) \n\t"
+               "mov %%edi, %c[rdi](%0) \n\t"
+               "mov %%ebp, %c[rbp](%0) \n\t"
                "mov %%cr2, %%eax  \n\t"
-               "mov %%eax, %c[cr2](%3) \n\t"
-               "mov (%%esp), %3 \n\t"
+               "mov %%eax, %c[cr2](%0) \n\t"
 
-               "pop %%ecx; popa \n\t"
+               "pop %%ebp; pop %%ebp; pop %%edx \n\t"
+#endif
+               "setbe %c[fail](%0) \n\t"
+             : : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+               [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+               [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+               [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+               [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+               [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+               [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+               [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+               [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+               [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+               [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+               [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+               [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+               [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+               [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
 #endif
-               "setbe %0 \n\t"
-             : "=q" (vmx->fail)
-             : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
-               "c"(vcpu),
-               [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
-               [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
-               [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
-               [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
-               [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
-               [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
-               [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
+               [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+             : "cc", "memory"
 #ifdef CONFIG_X86_64
-               [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
-               [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
-               [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
-               [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
-               [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
-               [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
-               [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
-               [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
+               , "rbx", "rdi", "rsi"
+               , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+               , "ebx", "edi", "rsi"
 #endif
-               [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
-             : "cc", "memory" );
+             );
+
+       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+       if (vmx->rmode.irq.pending)
+               fixup_rmode_irq(vmx);
 
-       vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
+       vcpu->arch.interrupt_window_open =
+               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
 
-       asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
+       asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->launched = 1;
 
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                asm("int $2");
 }
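
For reference, the addressing used in the register save/restore block above combines GCC named asm operands with offsetof(): each [name]"i"(offsetof(...)) operand is a compile-time byte offset, and %c[name](%0) folds that constant into a plain displacement off the pointer held in operand 0 (here, the vcpu_vmx pointer in ecx/rcx). A minimal, self-contained userspace sketch of the same idiom (not part of this patch; the struct and function names are made up for illustration):

#include <stddef.h>
#include <stdio.h>

struct toy_regs {
	unsigned long rax;
	unsigned long rbx;
};

/* Mirrors the "mov %c[rbx](%0), ..." pattern above: the offset of .rbx is
 * an immediate operand, printed without the '$' by the %c modifier. */
static unsigned long read_rbx_slot(struct toy_regs *r)
{
	unsigned long val;

	asm("mov %c[rbx](%[base]), %[out]"
	    : [out] "=r" (val)
	    : [base] "r" (r),
	      [rbx] "i" (offsetof(struct toy_regs, rbx)));
	return val;
}

int main(void)
{
	struct toy_regs r = { .rax = 1, .rbx = 2 };

	printf("%lu\n", read_rbx_slot(&r));	/* prints 2 */
	return 0;
}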
 
-static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
-                                 unsigned long addr,
-                                 u32 err_code)
-{
-       u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-       ++vcpu->stat.pf_guest;
-
-       if (is_page_fault(vect_info)) {
-               printk(KERN_DEBUG "inject_page_fault: "
-                      "double fault 0x%lx @ 0x%lx\n",
-                      addr, vmcs_readl(GUEST_RIP));
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                            DF_VECTOR |
-                            INTR_TYPE_EXCEPTION |
-                            INTR_INFO_DELIEVER_CODE_MASK |
-                            INTR_INFO_VALID_MASK);
-               return;
-       }
-       vcpu->cr2 = addr;
-       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
-       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                    PF_VECTOR |
-                    INTR_TYPE_EXCEPTION |
-                    INTR_INFO_DELIEVER_CODE_MASK |
-                    INTR_INFO_VALID_MASK);
-
-}
-
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        if (err)
                goto free_vcpu;
 
-       if (irqchip_in_kernel(kvm)) {
-               err = kvm_create_lapic(&vmx->vcpu);
-               if (err < 0)
-                       goto free_vcpu;
-       }
-
        vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!vmx->guest_msrs) {
                err = -ENOMEM;
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .check_processor_compatibility = vmx_check_processor_compat,
        .hardware_enable = hardware_enable,
        .hardware_disable = hardware_disable,
+       .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .set_rflags = vmx_set_rflags,
 
        .tlb_flush = vmx_flush_tlb,
-       .inject_page_fault = vmx_inject_page_fault,
-
-       .inject_gp = vmx_inject_gp,
 
        .run = vmx_vcpu_run,
        .handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .patch_hypercall = vmx_patch_hypercall,
        .get_irq = vmx_get_irq,
        .set_irq = vmx_inject_irq,
+       .queue_exception = vmx_queue_exception,
+       .exception_injected = vmx_exception_injected,
        .inject_pending_irq = vmx_intr_assist,
        .inject_pending_vectors = do_interrupt_requests,
+
+       .set_tss_addr = vmx_set_tss_addr,
 };
 
 static int __init vmx_init(void)
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
        memset(iova, 0xff, PAGE_SIZE);
        kunmap(vmx_io_bitmap_b);
 
-       r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
        if (r)
                goto out1;
 
+       if (bypass_guest_pf)
+               kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+
        return 0;
 
 out1:
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
        __free_page(vmx_io_bitmap_b);
        __free_page(vmx_io_bitmap_a);
 
-       kvm_exit_x86();
+       kvm_exit();
 }
 
 module_init(vmx_init)
similarity index 96%
rename from drivers/kvm/vmx.h
rename to arch/x86/kvm/vmx.h
index fd4e14666088098f7af7d337bc5bc66bbd5e177c..d52ae8d7303de2586351d80662fd385366331ec5 100644 (file)
@@ -25,6 +25,9 @@
  *
  */
 
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
 #define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
 #define CPU_BASED_USE_TSC_OFFSETING             0x00000008
 #define CPU_BASED_HLT_EXITING                   0x00000080
 #define CPU_BASED_MONITOR_EXITING               0x20000000
 #define CPU_BASED_PAUSE_EXITING                 0x40000000
 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
@@ -54,8 +63,6 @@
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
 
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-
 /* VMCS Encodings */
 enum vmcs_field {
        GUEST_ES_SELECTOR               = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
        TSC_OFFSET_HIGH                 = 0x00002011,
        VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
        VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
+       APIC_ACCESS_ADDR                = 0x00002014,
+       APIC_ACCESS_ADDR_HIGH           = 0x00002015,
        VMCS_LINK_POINTER               = 0x00002800,
        VMCS_LINK_POINTER_HIGH          = 0x00002801,
        GUEST_IA32_DEBUGCTL             = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_WBINVD             54
 
 /*
  * Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
 
 #define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
 #define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
 
 /*
  * Exit Qualifications for MOV for Control Register Access
  */
-#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
 #define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose register */
+#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
 #define LMSW_SOURCE_DATA_SHIFT 16
 #define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
 #define REG_EAX                         (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
 /*
  * Exit Qualifications for MOV for Debug Register Access
  */
-#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
 #define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
 #define TYPE_MOV_TO_DR                  (0 << 4)
 #define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose register */
+#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
 
 
 /* segment AR */
@@ -307,4 +319,6 @@ enum vmcs_field {
 #define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
 
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
+
 #endif
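
A note on the control-word definitions above: the SECONDARY_EXEC_* bits only take effect when CPU_BASED_ACTIVATE_SECONDARY_CONTROLS is set in the primary controls, which is why the two groups are kept distinct. A small, self-contained sketch of that relationship (the helper name is hypothetical and only illustrates the check; it is not part of this patch):

#include <stdbool.h>
#include <stdint.h>

#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001

/* True only when the primary "activate secondary controls" bit and the
 * secondary "virtualize APIC accesses" bit are both enabled. */
static bool apic_access_virtualization_enabled(uint32_t primary_exec_ctl,
					       uint32_t secondary_exec_ctl)
{
	if (!(primary_exec_ctl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
		return false;
	return secondary_exec_ctl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}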
similarity index 52%
rename from drivers/kvm/kvm_main.c
rename to arch/x86/kvm/x86.c
index c0f372f1d761312bc648d9deccd8e74609fbc513..8f94a0b89dffd51e5c5387be1b0bb6b1653762bc 100644 (file)
@@ -1,8 +1,7 @@
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
  *
  *
  */
 
-#include "kvm.h"
-#include "x86_emulate.h"
+#include <linux/kvm_host.h>
 #include "segment_descriptor.h"
 #include "irq.h"
+#include "mmu.h"
 
 #include <linux/kvm.h>
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/percpu.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
+#include <linux/fs.h>
 #include <linux/vmalloc.h>
-#include <linux/reboot.h>
-#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/mman.h>
 #include <linux/highmem.h>
-#include <linux/file.h>
-#include <linux/sysdev.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/smp.h>
-#include <linux/anon_inodes.h>
-#include <linux/profile.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static DEFINE_SPINLOCK(kvm_lock);
-static LIST_HEAD(vm_list);
-
-static cpumask_t cpus_hardware_enabled;
-
-struct kvm_x86_ops *kvm_x86_ops;
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
-
-static __read_mostly struct preempt_ops kvm_preempt_ops;
-
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
-
-static struct kvm_stats_debugfs_item {
-       const char *name;
-       int offset;
-       struct dentry *dentry;
-} debugfs_entries[] = {
-       { "pf_fixed", STAT_OFFSET(pf_fixed) },
-       { "pf_guest", STAT_OFFSET(pf_guest) },
-       { "tlb_flush", STAT_OFFSET(tlb_flush) },
-       { "invlpg", STAT_OFFSET(invlpg) },
-       { "exits", STAT_OFFSET(exits) },
-       { "io_exits", STAT_OFFSET(io_exits) },
-       { "mmio_exits", STAT_OFFSET(mmio_exits) },
-       { "signal_exits", STAT_OFFSET(signal_exits) },
-       { "irq_window", STAT_OFFSET(irq_window_exits) },
-       { "halt_exits", STAT_OFFSET(halt_exits) },
-       { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
-       { "request_irq", STAT_OFFSET(request_irq_exits) },
-       { "irq_exits", STAT_OFFSET(irq_exits) },
-       { "light_exits", STAT_OFFSET(light_exits) },
-       { "efer_reload", STAT_OFFSET(efer_reload) },
-       { NULL }
-};
 
-static struct dentry *debugfs_dir;
+#include <asm/uaccess.h>
+#include <asm/msr.h>
 
 #define MAX_IO_MSRS 256
-
 #define CR0_RESERVED_BITS                                              \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir;
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
 
-#ifdef CONFIG_X86_64
-// LDT or TSS descriptor in the GDT. 16 bytes.
-struct segment_descriptor_64 {
-       struct segment_descriptor s;
-       u32 base_higher;
-       u32 pad_zero;
-};
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
-#endif
+struct kvm_x86_ops *kvm_x86_ops;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+       { "pf_fixed", VCPU_STAT(pf_fixed) },
+       { "pf_guest", VCPU_STAT(pf_guest) },
+       { "tlb_flush", VCPU_STAT(tlb_flush) },
+       { "invlpg", VCPU_STAT(invlpg) },
+       { "exits", VCPU_STAT(exits) },
+       { "io_exits", VCPU_STAT(io_exits) },
+       { "mmio_exits", VCPU_STAT(mmio_exits) },
+       { "signal_exits", VCPU_STAT(signal_exits) },
+       { "irq_window", VCPU_STAT(irq_window_exits) },
+       { "halt_exits", VCPU_STAT(halt_exits) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+       { "request_irq", VCPU_STAT(request_irq_exits) },
+       { "irq_exits", VCPU_STAT(irq_exits) },
+       { "host_state_reload", VCPU_STAT(host_state_reload) },
+       { "efer_reload", VCPU_STAT(efer_reload) },
+       { "fpu_reload", VCPU_STAT(fpu_reload) },
+       { "insn_emulation", VCPU_STAT(insn_emulation) },
+       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+       { "mmu_flooded", VM_STAT(mmu_flooded) },
+       { "mmu_recycled", VM_STAT(mmu_recycled) },
+       { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+       { NULL }
+};
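
The VM_STAT()/VCPU_STAT() wrappers above each expand to two initializer values, a byte offset plus a stat kind, which is what lets a single debugfs table mix per-VM and per-vcpu counters. A simplified, self-contained model of that expansion (the struct layouts here are abridged for illustration and are not the real kvm definitions):

#include <stddef.h>
#include <stdio.h>

enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU };

struct vm_stat   { unsigned long mmu_pte_write; };
struct vcpu_stat { unsigned long pf_fixed; };

struct kvm      { struct vm_stat stat; };
struct kvm_vcpu { struct vcpu_stat stat; };

/* Each macro supplies both the offset and the kind, so one brace pair
 * fills two consecutive fields of a table entry. */
#define VM_STAT(x)   offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

struct stats_item {
	const char *name;
	size_t offset;
	enum kvm_stat_kind kind;
};

static const struct stats_item items[] = {
	{ "pf_fixed",      VCPU_STAT(pf_fixed) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
};

int main(void)
{
	printf("%s: offset %zu, kind %d\n",
	       items[0].name, items[0].offset, items[0].kind);
	return 0;
}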
 
-static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
-                          unsigned long arg);
 
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
        struct segment_descriptor *d;
        unsigned long table_base;
-       typedef unsigned long ul;
        unsigned long v;
 
        if (selector == 0)
                return 0;
 
-       asm ("sgdt %0" : "=m"(gdt));
+       asm("sgdt %0" : "=m"(gdt));
        table_base = gdt.base;
 
        if (selector & 4) {           /* from ldt */
                u16 ldt_selector;
 
-               asm ("sldt %0" : "=g"(ldt_selector));
+               asm("sldt %0" : "=g"(ldt_selector));
                table_base = segment_base(ldt_selector);
        }
        d = (struct segment_descriptor *)(table_base + (selector & ~7));
-       v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
+       v = d->base_low | ((unsigned long)d->base_mid << 16) |
+               ((unsigned long)d->base_high << 24);
 #ifdef CONFIG_X86_64
-       if (d->system == 0
-           && (d->type == 2 || d->type == 9 || d->type == 11))
-               v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
+       if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long) \
+                     ((struct segment_descriptor_64 *)d)->base_higher) << 32;
 #endif
        return v;
 }
 EXPORT_SYMBOL_GPL(segment_base);
 
-static inline int valid_vcpu(int n)
-{
-       return likely(n >= 0 && n < KVM_MAX_VCPUS);
-}
-
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 1;
-       fx_save(&vcpu->host_fx_image);
-       fx_restore(&vcpu->guest_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
-
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (!vcpu->guest_fpu_loaded)
-               return;
-
-       vcpu->guest_fpu_loaded = 0;
-       fx_save(&vcpu->guest_fx_image);
-       fx_restore(&vcpu->host_fx_image);
-}
-EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put()
- */
-static void vcpu_load(struct kvm_vcpu *vcpu)
-{
-       int cpu;
-
-       mutex_lock(&vcpu->mutex);
-       cpu = get_cpu();
-       preempt_notifier_register(&vcpu->preempt_notifier);
-       kvm_x86_ops->vcpu_load(vcpu, cpu);
-       put_cpu();
-}
-
-static void vcpu_put(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-       kvm_x86_ops->vcpu_put(vcpu);
-       preempt_notifier_unregister(&vcpu->preempt_notifier);
-       preempt_enable();
-       mutex_unlock(&vcpu->mutex);
-}
-
-static void ack_flush(void *_completed)
-{
-}
-
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-       int i, cpu;
-       cpumask_t cpus;
-       struct kvm_vcpu *vcpu;
-
-       cpus_clear(cpus);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               vcpu = kvm->vcpus[i];
-               if (!vcpu)
-                       continue;
-               if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
-                       continue;
-               cpu = vcpu->cpu;
-               if (cpu != -1 && cpu != raw_smp_processor_id())
-                       cpu_set(cpu, cpus);
-       }
-       smp_call_function_mask(cpus, ack_flush, NULL, 1);
-}
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
-       struct page *page;
-       int r;
-
-       mutex_init(&vcpu->mutex);
-       vcpu->cpu = -1;
-       vcpu->mmu.root_hpa = INVALID_PAGE;
-       vcpu->kvm = kvm;
-       vcpu->vcpu_id = id;
-       if (!irqchip_in_kernel(kvm) || id == 0)
-               vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+       if (irqchip_in_kernel(vcpu->kvm))
+               return vcpu->arch.apic_base;
        else
-               vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
-       init_waitqueue_head(&vcpu->wq);
-
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail;
-       }
-       vcpu->run = page_address(page);
-
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail_free_run;
-       }
-       vcpu->pio_data = page_address(page);
-
-       r = kvm_mmu_create(vcpu);
-       if (r < 0)
-               goto fail_free_pio_data;
-
-       return 0;
-
-fail_free_pio_data:
-       free_page((unsigned long)vcpu->pio_data);
-fail_free_run:
-       free_page((unsigned long)vcpu->run);
-fail:
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
-
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
-       kvm_mmu_destroy(vcpu);
-       if (vcpu->apic)
-               hrtimer_cancel(&vcpu->apic->timer.dev);
-       kvm_free_apic(vcpu->apic);
-       free_page((unsigned long)vcpu->pio_data);
-       free_page((unsigned long)vcpu->run);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
-
-static struct kvm *kvm_create_vm(void)
-{
-       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
-
-       kvm_io_bus_init(&kvm->pio_bus);
-       mutex_init(&kvm->lock);
-       INIT_LIST_HEAD(&kvm->active_mmu_pages);
-       kvm_io_bus_init(&kvm->mmio_bus);
-       spin_lock(&kvm_lock);
-       list_add(&kvm->vm_list, &vm_list);
-       spin_unlock(&kvm_lock);
-       return kvm;
-}
-
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
-                                 struct kvm_memory_slot *dont)
-{
-       int i;
-
-       if (!dont || free->phys_mem != dont->phys_mem)
-               if (free->phys_mem) {
-                       for (i = 0; i < free->npages; ++i)
-                               if (free->phys_mem[i])
-                                       __free_page(free->phys_mem[i]);
-                       vfree(free->phys_mem);
-               }
-
-       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-               vfree(free->dirty_bitmap);
-
-       free->phys_mem = NULL;
-       free->npages = 0;
-       free->dirty_bitmap = NULL;
-}
-
-static void kvm_free_physmem(struct kvm *kvm)
-{
-       int i;
-
-       for (i = 0; i < kvm->nmemslots; ++i)
-               kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+               return vcpu->arch.apic_base;
 }
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 {
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
-               if (vcpu->pio.guest_pages[i]) {
-                       __free_page(vcpu->pio.guest_pages[i]);
-                       vcpu->pio.guest_pages[i] = NULL;
-               }
+       /* TODO: reserve bits check */
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_lapic_set_base(vcpu, data);
+       else
+               vcpu->arch.apic_base = data;
 }
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-       vcpu_load(vcpu);
-       kvm_mmu_unload(vcpu);
-       vcpu_put(vcpu);
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = false;
+       vcpu->arch.exception.nr = nr;
 }
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
-static void kvm_free_vcpus(struct kvm *kvm)
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+                          u32 error_code)
 {
-       unsigned int i;
-
-       /*
-        * Unpin any mmu pages first.
-        */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
+       ++vcpu->stat.pf_guest;
+       if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
+               printk(KERN_DEBUG "kvm: inject_page_fault:"
+                      " double fault 0x%lx\n", addr);
+               vcpu->arch.exception.nr = DF_VECTOR;
+               vcpu->arch.exception.error_code = 0;
+               return;
        }
-
-}
-
-static void kvm_destroy_vm(struct kvm *kvm)
-{
-       spin_lock(&kvm_lock);
-       list_del(&kvm->vm_list);
-       spin_unlock(&kvm_lock);
-       kvm_io_bus_destroy(&kvm->pio_bus);
-       kvm_io_bus_destroy(&kvm->mmio_bus);
-       kfree(kvm->vpic);
-       kfree(kvm->vioapic);
-       kvm_free_vcpus(kvm);
-       kvm_free_physmem(kvm);
-       kfree(kvm);
+       vcpu->arch.cr2 = addr;
+       kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
 }
 
-static int kvm_vm_release(struct inode *inode, struct file *filp)
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-       struct kvm *kvm = filp->private_data;
-
-       kvm_destroy_vm(kvm);
-       return 0;
+       WARN_ON(vcpu->arch.exception.pending);
+       vcpu->arch.exception.pending = true;
+       vcpu->arch.exception.has_error_code = true;
+       vcpu->arch.exception.nr = nr;
+       vcpu->arch.exception.error_code = error_code;
 }
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-static void inject_gp(struct kvm_vcpu *vcpu)
+static void __queue_exception(struct kvm_vcpu *vcpu)
 {
-       kvm_x86_ops->inject_gp(vcpu, 0);
+       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                    vcpu->arch.exception.has_error_code,
+                                    vcpu->arch.exception.error_code);
 }
 
 /*
  * Load the pae pdptrs.  Return true if they are all valid.
  */
-static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
        int i;
-       u64 *pdpt;
        int ret;
-       struct page *page;
-       u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-       mutex_lock(&vcpu->kvm->lock);
-       page = gfn_to_page(vcpu->kvm, pdpt_gfn);
-       if (!page) {
+       down_read(&current->mm->mmap_sem);
+       ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
+                                 offset * sizeof(u64), sizeof(pdpte));
+       if (ret < 0) {
                ret = 0;
                goto out;
        }
-
-       pdpt = kmap_atomic(page, KM_USER0);
-       memcpy(pdpte, pdpt+offset, sizeof(pdpte));
-       kunmap_atomic(pdpt, KM_USER0);
-
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
                        ret = 0;
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
        ret = 1;
 
-       memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
+       memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 
        return ret;
 }
 
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+       u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
+       bool changed = true;
+       int r;
+
+       if (is_long_mode(vcpu) || !is_pae(vcpu))
+               return false;
+
+       down_read(&current->mm->mmap_sem);
+       r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
+       if (r < 0)
+               goto out;
+       changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
+out:
+       up_read(&current->mm->mmap_sem);
+
+       return changed;
+}
+
 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        if (cr0 & CR0_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
-                      cr0, vcpu->cr0);
-               inject_gp(vcpu);
+                      cr0, vcpu->arch.cr0);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
                printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
                printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
                       "and a clear PE flag\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
-               if ((vcpu->shadow_efer & EFER_LME)) {
+               if ((vcpu->arch.shadow_efer & EFER_LME)) {
                        int cs_db, cs_l;
 
                        if (!is_pae(vcpu)) {
                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
                                       "in long mode while PAE is disabled\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
                        if (cs_l) {
                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
                                       "in long mode while CS.L == 1\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
 
                        }
                } else
 #endif
-               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
+               if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
                               "reserved bits\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
 
        }
 
        kvm_x86_ops->set_cr0(vcpu, cr0);
-       vcpu->cr0 = cr0;
+       vcpu->arch.cr0 = cr0;
 
-       mutex_lock(&vcpu->kvm->lock);
        kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
        return;
 }
 EXPORT_SYMBOL_GPL(set_cr0);
 
 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-       set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
+       set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(lmsw);
 
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        if (cr4 & CR4_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                if (!(cr4 & X86_CR4_PAE)) {
                        printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
                               "in long mode\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
-                  && !load_pdptrs(vcpu, vcpu->cr3)) {
+                  && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
 
        if (cr4 & X86_CR4_VMXE) {
                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
        kvm_x86_ops->set_cr4(vcpu, cr4);
-       vcpu->cr4 = cr4;
-       mutex_lock(&vcpu->kvm->lock);
+       vcpu->arch.cr4 = cr4;
        kvm_mmu_reset_context(vcpu);
-       mutex_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr4);
 
 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+               kvm_mmu_flush_tlb(vcpu);
+               return;
+       }
+
        if (is_long_mode(vcpu)) {
                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        return;
                }
        } else {
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
                        if (cr3 & CR3_PAE_RESERVED_BITS) {
                                printk(KERN_DEBUG
                                       "set_cr3: #GP, reserved bits\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
                                printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
                                       "reserved bits\n");
-                               inject_gp(vcpu);
-                               return;
-                       }
-               } else {
-                       if (cr3 & CR3_NONPAE_RESERVED_BITS) {
-                               printk(KERN_DEBUG
-                                      "set_cr3: #GP, reserved bits\n");
-                               inject_gp(vcpu);
+                               kvm_inject_gp(vcpu, 0);
                                return;
                        }
                }
+               /*
+                * We don't check reserved bits in nonpae mode, because
+                * this isn't enforced, and VMware depends on this.
+                */
        }
 
-       mutex_lock(&vcpu->kvm->lock);
+       down_read(&current->mm->mmap_sem);
        /*
         * Does the new cr3 value map to physical memory? (Note, we
         * catch an invalid cr3 even in real-mode, because it would
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
         * to debug) behavior on the guest side.
         */
        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
        else {
-               vcpu->cr3 = cr3;
-               vcpu->mmu.new_cr3(vcpu);
+               vcpu->arch.cr3 = cr3;
+               vcpu->arch.mmu.new_cr3(vcpu);
        }
-       mutex_unlock(&vcpu->kvm->lock);
+       up_read(&current->mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
        if (cr8 & CR8_RESERVED_BITS) {
                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return;
        }
        if (irqchip_in_kernel(vcpu->kvm))
                kvm_lapic_set_tpr(vcpu, cr8);
        else
-               vcpu->cr8 = cr8;
+               vcpu->arch.cr8 = cr8;
 }
 EXPORT_SYMBOL_GPL(set_cr8);
 
@@ -602,1157 +395,1589 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
        if (irqchip_in_kernel(vcpu->kvm))
                return kvm_lapic_get_cr8(vcpu);
        else
-               return vcpu->cr8;
+               return vcpu->arch.cr8;
 }
 EXPORT_SYMBOL_GPL(get_cr8);
 
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
-{
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->apic_base;
-       else
-               return vcpu->apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
-
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
-{
-       /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->apic_base = data;
-}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-
-void fx_init(struct kvm_vcpu *vcpu)
-{
-       unsigned after_mxcsr_mask;
-
-       /* Initialize guest FPU by resetting ours and saving into guest's */
-       preempt_disable();
-       fx_save(&vcpu->host_fx_image);
-       fpu_init();
-       fx_save(&vcpu->guest_fx_image);
-       fx_restore(&vcpu->host_fx_image);
-       preempt_enable();
-
-       vcpu->cr0 |= X86_CR0_ET;
-       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-       vcpu->guest_fx_image.mxcsr = 0x1f80;
-       memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
-              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
 /*
- * Allocate some memory and give it an address in the guest physical address
- * space.
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
- * Discontiguous memory is allowed, mostly for framebuffers.
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu.
  */
-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                         struct kvm_memory_region *mem)
-{
-       int r;
-       gfn_t base_gfn;
-       unsigned long npages;
-       unsigned long i;
-       struct kvm_memory_slot *memslot;
-       struct kvm_memory_slot old, new;
-
-       r = -EINVAL;
-       /* General sanity checks */
-       if (mem->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (mem->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-       if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-               goto out;
-
-       memslot = &kvm->memslots[mem->slot];
-       base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-       npages = mem->memory_size >> PAGE_SHIFT;
-
-       if (!npages)
-               mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+static u32 msrs_to_save[] = {
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+       MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+       MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+       MSR_IA32_TIME_STAMP_COUNTER,
+};
 
-       mutex_lock(&kvm->lock);
+static unsigned num_msrs_to_save;
 
-       new = old = *memslot;
+static u32 emulated_msrs[] = {
+       MSR_IA32_MISC_ENABLE,
+};
 
-       new.base_gfn = base_gfn;
-       new.npages = npages;
-       new.flags = mem->flags;
+#ifdef CONFIG_X86_64
 
-       /* Disallow changing a memory slot's size. */
-       r = -EINVAL;
-       if (npages && old.npages && npages != old.npages)
-               goto out_unlock;
-
-       /* Check for overlaps */
-       r = -EEXIST;
-       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-               struct kvm_memory_slot *s = &kvm->memslots[i];
+static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (efer & EFER_RESERVED_BITS) {
+               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
+                      efer);
+               kvm_inject_gp(vcpu, 0);
+               return;
+       }
 
-               if (s == memslot)
-                       continue;
-               if (!((base_gfn + npages <= s->base_gfn) ||
-                     (base_gfn >= s->base_gfn + s->npages)))
-                       goto out_unlock;
+       if (is_paging(vcpu)
+           && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
+               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
+               kvm_inject_gp(vcpu, 0);
+               return;
        }
 
-       /* Deallocate if slot is being removed */
-       if (!npages)
-               new.phys_mem = NULL;
+       kvm_x86_ops->set_efer(vcpu, efer);
 
-       /* Free page dirty bitmap if unneeded */
-       if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
-               new.dirty_bitmap = NULL;
+       efer &= ~EFER_LMA;
+       efer |= vcpu->arch.shadow_efer & EFER_LMA;
 
-       r = -ENOMEM;
+       vcpu->arch.shadow_efer = efer;
+}
 
-       /* Allocate if a slot is being created */
-       if (npages && !new.phys_mem) {
-               new.phys_mem = vmalloc(npages * sizeof(struct page *));
+#endif
 
-               if (!new.phys_mem)
-                       goto out_unlock;
+/*
+ * Writes msr value into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+}
 
-               memset(new.phys_mem, 0, npages * sizeof(struct page *));
-               for (i = 0; i < npages; ++i) {
-                       new.phys_mem[i] = alloc_page(GFP_HIGHUSER
-                                                    | __GFP_ZERO);
-                       if (!new.phys_mem[i])
-                               goto out_unlock;
-                       set_page_private(new.phys_mem[i],0);
-               }
-       }
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+       return kvm_set_msr(vcpu, index, *data);
+}
 
-       /* Allocate page dirty bitmap if needed */
-       if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-               unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
 
-               new.dirty_bitmap = vmalloc(dirty_bytes);
-               if (!new.dirty_bitmap)
-                       goto out_unlock;
-               memset(new.dirty_bitmap, 0, dirty_bytes);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       switch (msr) {
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               set_efer(vcpu, data);
+               break;
+#endif
+       case MSR_IA32_MC0_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
+                      __FUNCTION__, data);
+               break;
+       case MSR_IA32_MCG_STATUS:
+               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
+                       __FUNCTION__, data);
+               break;
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_UCODE_WRITE:
+       case 0x200 ... 0x2ff: /* MTRRs */
+               break;
+       case MSR_IA32_APICBASE:
+               kvm_set_apic_base(vcpu, data);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               vcpu->arch.ia32_misc_enable_msr = data;
+               break;
+       default:
+               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
+               return 1;
        }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 
-       if (mem->slot >= kvm->nmemslots)
-               kvm->nmemslots = mem->slot + 1;
-
-       *memslot = new;
 
-       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-       kvm_flush_remote_tlbs(kvm);
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+       return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+}
 
-       mutex_unlock(&kvm->lock);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
 
-       kvm_free_physmem_slot(&old, &new);
+       switch (msr) {
+       case 0xc0010010: /* SYSCFG */
+       case 0xc0010015: /* HWCR */
+       case MSR_IA32_PLATFORM_ID:
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MC0_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MC0_MISC:
+       case MSR_IA32_MC0_MISC+4:
+       case MSR_IA32_MC0_MISC+8:
+       case MSR_IA32_MC0_MISC+12:
+       case MSR_IA32_MC0_MISC+16:
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_PERF_STATUS:
+       case MSR_IA32_EBL_CR_POWERON:
+               /* MTRR registers */
+       case 0xfe:
+       case 0x200 ... 0x2ff:
+               data = 0;
+               break;
+       case 0xcd: /* fsb frequency */
+               data = 3;
+               break;
+       case MSR_IA32_APICBASE:
+               data = kvm_get_apic_base(vcpu);
+               break;
+       case MSR_IA32_MISC_ENABLE:
+               data = vcpu->arch.ia32_misc_enable_msr;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_EFER:
+               data = vcpu->arch.shadow_efer;
+               break;
+#endif
+       default:
+               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+               return 1;
+       }
+       *pdata = data;
        return 0;
-
-out_unlock:
-       mutex_unlock(&kvm->lock);
-       kvm_free_physmem_slot(&new, &old);
-out:
-       return r;
 }
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 
 /*
- * Get (and clear) the dirty memory log for a memory slot.
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
  */
-static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                                     struct kvm_dirty_log *log)
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+                   struct kvm_msr_entry *entries,
+                   int (*do_msr)(struct kvm_vcpu *vcpu,
+                                 unsigned index, u64 *data))
 {
-       struct kvm_memory_slot *memslot;
-       int r, i;
-       int n;
-       unsigned long any = 0;
-
-       mutex_lock(&kvm->lock);
-
-       r = -EINVAL;
-       if (log->slot >= KVM_MEMORY_SLOTS)
-               goto out;
-
-       memslot = &kvm->memslots[log->slot];
-       r = -ENOENT;
-       if (!memslot->dirty_bitmap)
-               goto out;
-
-       n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
-
-       for (i = 0; !any && i < n/sizeof(long); ++i)
-               any = memslot->dirty_bitmap[i];
+       int i;
 
-       r = -EFAULT;
-       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
-               goto out;
+       vcpu_load(vcpu);
 
-       /* If nothing is dirty, don't bother messing with page tables. */
-       if (any) {
-               kvm_mmu_slot_remove_write_access(kvm, log->slot);
-               kvm_flush_remote_tlbs(kvm);
-               memset(memslot->dirty_bitmap, 0, n);
-       }
+       for (i = 0; i < msrs->nmsrs; ++i)
+               if (do_msr(vcpu, entries[i].index, &entries[i].data))
+                       break;
 
-       r = 0;
+       vcpu_put(vcpu);
 
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
+       return i;
 }
 
 /*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
  */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-                                        struct kvm_memory_alias *alias)
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+                 int (*do_msr)(struct kvm_vcpu *vcpu,
+                               unsigned index, u64 *data),
+                 int writeback)
 {
+       struct kvm_msrs msrs;
+       struct kvm_msr_entry *entries;
        int r, n;
-       struct kvm_mem_alias *p;
+       unsigned size;
 
-       r = -EINVAL;
-       /* General sanity checks */
-       if (alias->memory_size & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-               goto out;
-       if (alias->slot >= KVM_ALIAS_SLOTS)
-               goto out;
-       if (alias->guest_phys_addr + alias->memory_size
-           < alias->guest_phys_addr)
-               goto out;
-       if (alias->target_phys_addr + alias->memory_size
-           < alias->target_phys_addr)
+       r = -EFAULT;
+       if (copy_from_user(&msrs, user_msrs, sizeof msrs))
                goto out;
 
-       mutex_lock(&kvm->lock);
+       r = -E2BIG;
+       if (msrs.nmsrs >= MAX_IO_MSRS)
+               goto out;
 
-       p = &kvm->aliases[alias->slot];
-       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-       p->npages = alias->memory_size >> PAGE_SHIFT;
-       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
+       r = -ENOMEM;
+       size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+       entries = vmalloc(size);
+       if (!entries)
+               goto out;
 
-       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-               if (kvm->aliases[n - 1].npages)
-                       break;
-       kvm->naliases = n;
+       r = -EFAULT;
+       if (copy_from_user(entries, user_msrs->entries, size))
+               goto out_free;
 
-       kvm_mmu_zap_all(kvm);
+       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+       if (r < 0)
+               goto out_free;
 
-       mutex_unlock(&kvm->lock);
+       r = -EFAULT;
+       if (writeback && copy_to_user(user_msrs->entries, entries, size))
+               goto out_free;
 
-       return 0;
+       r = n;
 
+out_free:
+       vfree(entries);
 out:
        return r;
 }
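
msr_io() above is the kernel half of the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls; the ioctl return value is the number of MSRs actually processed. Roughly, the userspace side looks like this sketch (it assumes the struct kvm_msrs / struct kvm_msr_entry uapi definitions from <linux/kvm.h> and an already-created vcpu file descriptor; error handling is abbreviated):

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Reads MSR_IA32_APICBASE (0x1b) from one vcpu via KVM_GET_MSRS. */
static int read_apic_base(int vcpu_fd, unsigned long long *value)
{
	struct kvm_msrs *msrs;
	int processed;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs)
		return -1;

	msrs->nmsrs = 1;
	msrs->entries[0].index = 0x0000001b;	/* MSR_IA32_APICBASE */

	processed = ioctl(vcpu_fd, KVM_GET_MSRS, msrs);
	if (processed == 1)
		*value = msrs->entries[0].data;

	free(msrs);
	return processed == 1 ? 0 : -1;
}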
 
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+void decache_vcpus_on_cpu(int cpu)
+{
+       struct kvm *vm;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(vm, &vm_list, vm_list)
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = vm->vcpus[i];
+                       if (!vcpu)
+                               continue;
+                       /*
+                        * If the vcpu is locked, then it is running on some
+                        * other cpu and therefore it is not cached on the
+                        * cpu in question.
+                        *
+                        * If it's not locked, check the last cpu it executed
+                        * on.
+                        */
+                       if (mutex_trylock(&vcpu->mutex)) {
+                               if (vcpu->cpu == cpu) {
+                                       kvm_x86_ops->vcpu_decache(vcpu);
+                                       vcpu->cpu = -1;
+                               }
+                               mutex_unlock(&vcpu->mutex);
+                       }
+               }
+       spin_unlock(&kvm_lock);
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
 {
        int r;
 
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy (&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[0],
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy (&chip->chip.pic,
-                       &pic_irqchip(kvm)->pics[1],
-                       sizeof(struct kvm_pic_state));
+       switch (ext) {
+       case KVM_CAP_IRQCHIP:
+       case KVM_CAP_HLT:
+       case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SET_TSS_ADDR:
+       case KVM_CAP_EXT_CPUID:
+               r = 1;
                break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy (&chip->chip.ioapic,
-                       ioapic_irqchip(kvm),
-                       sizeof(struct kvm_ioapic_state));
+       case KVM_CAP_VAPIC:
+               r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                break;
        default:
-               r = -EINVAL;
+               r = 0;
                break;
        }
        return r;
+
 }
 
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
 {
-       int r;
+       void __user *argp = (void __user *)arg;
+       long r;
 
-       r = 0;
-       switch (chip->chip_id) {
-       case KVM_IRQCHIP_PIC_MASTER:
-               memcpy (&pic_irqchip(kvm)->pics[0],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_PIC_SLAVE:
-               memcpy (&pic_irqchip(kvm)->pics[1],
-                       &chip->chip.pic,
-                       sizeof(struct kvm_pic_state));
-               break;
-       case KVM_IRQCHIP_IOAPIC:
-               memcpy (ioapic_irqchip(kvm),
-                       &chip->chip.ioapic,
-                       sizeof(struct kvm_ioapic_state));
+       switch (ioctl) {
+       case KVM_GET_MSR_INDEX_LIST: {
+               struct kvm_msr_list __user *user_msr_list = argp;
+               struct kvm_msr_list msr_list;
+               unsigned n;
+
+               r = -EFAULT;
+               if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+                       goto out;
+               n = msr_list.nmsrs;
+               msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+               if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+                       goto out;
+               r = -E2BIG;
+               if (n < num_msrs_to_save)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+                                num_msrs_to_save * sizeof(u32)))
+                       goto out;
+               if (copy_to_user(user_msr_list->indices
+                                + num_msrs_to_save * sizeof(u32),
+                                &emulated_msrs,
+                                ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+                       goto out;
+               r = 0;
                break;
+       }
        default:
                r = -EINVAL;
-               break;
        }
-       kvm_pic_update_irq(pic_irqchip(kvm));
+out:
        return r;
 }
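
As the KVM_GET_MSR_INDEX_LIST branch above shows, the handler always writes the full count back into nmsrs and returns -E2BIG when the caller's array is too small, so userspace usually probes with nmsrs = 0 first and then retries with a properly sized buffer. A sketch of that two-step call (assuming the struct kvm_msr_list uapi definition from <linux/kvm.h> and an open /dev/kvm file descriptor):

#include <errno.h>
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Returns a malloc()ed, kernel-filled kvm_msr_list, or NULL on failure. */
static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* First call is deliberately too small; the kernel fills in nmsrs. */
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe) < 0 && errno != E2BIG)
		return NULL;

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;

	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}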
 
-static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-       int i;
-       struct kvm_mem_alias *alias;
+       kvm_x86_ops->vcpu_load(vcpu, cpu);
+}
 
-       for (i = 0; i < kvm->naliases; ++i) {
-               alias = &kvm->aliases[i];
-               if (gfn >= alias->base_gfn
-                   && gfn < alias->base_gfn + alias->npages)
-                       return alias->target_gfn + gfn - alias->base_gfn;
-       }
-       return gfn;
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->vcpu_put(vcpu);
+       kvm_put_guest_fpu(vcpu);
 }
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+static int is_efer_nx(void)
 {
-       int i;
+       u64 efer;
+
+       rdmsrl(MSR_EFER, efer);
+       return efer & EFER_NX;
+}
 
-       for (i = 0; i < kvm->nmemslots; ++i) {
-               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_cpuid_entry2 *e, *entry;
 
-               if (gfn >= memslot->base_gfn
-                   && gfn < memslot->base_gfn + memslot->npages)
-                       return memslot;
+       entry = NULL;
+       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+               e = &vcpu->arch.cpuid_entries[i];
+               if (e->function == 0x80000001) {
+                       entry = e;
+                       break;
+               }
+       }
+       if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+               entry->edx &= ~(1 << 20);
+               printk(KERN_INFO "kvm: guest NX capability removed\n");
        }
-       return NULL;
 }
 
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+/* when an old userspace process fills a new kernel module */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid *cpuid,
+                                   struct kvm_cpuid_entry __user *entries)
 {
-       gfn = unalias_gfn(kvm, gfn);
-       return __gfn_to_memslot(kvm, gfn);
+       int r, i;
+       struct kvm_cpuid_entry *cpuid_entries;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+               goto out_free;
+       for (i = 0; i < cpuid->nent; i++) {
+               vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+               vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+               vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+               vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+               vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+               vcpu->arch.cpuid_entries[i].index = 0;
+               vcpu->arch.cpuid_entries[i].flags = 0;
+               vcpu->arch.cpuid_entries[i].padding[0] = 0;
+               vcpu->arch.cpuid_entries[i].padding[1] = 0;
+               vcpu->arch.cpuid_entries[i].padding[2] = 0;
+       }
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       cpuid_fix_nx_cap(vcpu);
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
 }
 
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
-       struct kvm_memory_slot *slot;
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       vcpu->arch.cpuid_nent = cpuid->nent;
+       return 0;
 
-       gfn = unalias_gfn(kvm, gfn);
-       slot = __gfn_to_memslot(kvm, gfn);
-       if (!slot)
-               return NULL;
-       return slot->phys_mem[gfn - slot->base_gfn];
+out:
+       return r;
 }
-EXPORT_SYMBOL_GPL(gfn_to_page);
 
-/* WARNING: Does not work on aliased pages. */
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
-       struct kvm_memory_slot *memslot;
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent < vcpu->arch.cpuid_nent)
+               goto out;
+       r = -EFAULT;
+       if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       return 0;
 
-       memslot = __gfn_to_memslot(kvm, gfn);
-       if (memslot && memslot->dirty_bitmap) {
-               unsigned long rel_gfn = gfn - memslot->base_gfn;
+out:
+       cpuid->nent = vcpu->arch.cpuid_nent;
+       return r;
+}
 
-               /* avoid RMW */
-               if (!test_bit(rel_gfn, memslot->dirty_bitmap))
-                       set_bit(rel_gfn, memslot->dirty_bitmap);
+static inline u32 bit(int bitno)
+{
+       return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                         u32 index)
+{
+       entry->function = function;
+       entry->index = index;
+       cpuid_count(entry->function, entry->index,
+               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+       entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                        u32 index, int *nent, int maxnent)
+{
+       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+               bit(X86_FEATURE_SYSCALL) |
+               (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+#ifdef CONFIG_X86_64
+               bit(X86_FEATURE_LM) |
+#endif
+               bit(X86_FEATURE_MMXEXT) |
+               bit(X86_FEATURE_3DNOWEXT) |
+               bit(X86_FEATURE_3DNOW);
+       const u32 kvm_supported_word3_x86_features =
+               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+       const u32 kvm_supported_word6_x86_features =
+               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+       /* all cpuid_count() calls for function 2 should be made on the same cpu */
+       get_cpu();
+       do_cpuid_1_ent(entry, function, index);
+       ++*nent;
+
+       switch (function) {
+       case 0:
+               entry->eax = min(entry->eax, (u32)0xb);
+               break;
+       case 1:
+               entry->edx &= kvm_supported_word0_x86_features;
+               entry->ecx &= kvm_supported_word3_x86_features;
+               break;
+       /*
+        * Function 2 entries are STATEFUL.  That is, repeated cpuid commands
+        * may return different values.  This forces us to get_cpu() before
+        * issuing the first command, and also to emulate this annoying
+        * behavior in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT.
+        */
+       case 2: {
+               int t, times = entry->eax & 0xff;
+
+               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+               for (t = 1; t < times && *nent < maxnent; ++t) {
+                       do_cpuid_1_ent(&entry[t], function, 0);
+                       entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+                       ++*nent;
+               }
+               break;
+       }
+       /* functions 4 and 0xb have an additional index. */
+       case 4: {
+               int index, cache_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until cache_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       cache_type = entry[index - 1].eax & 0x1f;
+                       if (!cache_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
        }
+       case 0xb: {
+               int index, level_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until level_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       level_type = entry[index - 1].ecx & 0xff;
+                       if (!level_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
+       case 0x80000000:
+               entry->eax = min(entry->eax, 0x8000001a);
+               break;
+       case 0x80000001:
+               entry->edx &= kvm_supported_word1_x86_features;
+               entry->ecx &= kvm_supported_word6_x86_features;
+               break;
+       }
+       put_cpu();
 }
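/*
 * Illustrative sketch (not part of this patch): do_cpuid_ent() above walks
 * the indexed leaves the same way bare-metal software enumerates them,
 * querying successive sub-leaves until the type field reads zero.  The same
 * walk for leaf 4 (cache parameters) on the host, using GCC's <cpuid.h>:
 */
#include <stdio.h>
#include <cpuid.h>

static void enumerate_cache_leaves(void)
{
	unsigned int eax, ebx, ecx, edx, index;

	for (index = 0; ; index++) {
		__cpuid_count(4, index, eax, ebx, ecx, edx);
		if (!(eax & 0x1f))		/* cache type 0: no more sub-leaves */
			break;
		printf("leaf 4, index %u: type %u, level %u\n",
		       index, eax & 0x1f, (eax >> 5) & 0x7);
	}
}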
 
-int emulator_read_std(unsigned long addr,
-                            void *val,
-                            unsigned int bytes,
-                            struct kvm_vcpu *vcpu)
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
-       void *data = val;
-
-       while (bytes) {
-               gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-               unsigned offset = addr & (PAGE_SIZE-1);
-               unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
-               unsigned long pfn;
-               struct page *page;
-               void *page_virt;
-
-               if (gpa == UNMAPPED_GVA)
-                       return X86EMUL_PROPAGATE_FAULT;
-               pfn = gpa >> PAGE_SHIFT;
-               page = gfn_to_page(vcpu->kvm, pfn);
-               if (!page)
-                       return X86EMUL_UNHANDLEABLE;
-               page_virt = kmap_atomic(page, KM_USER0);
+       struct kvm_cpuid_entry2 *cpuid_entries;
+       int limit, nent = 0, r = -E2BIG;
+       u32 func;
 
-               memcpy(data, page_virt + offset, tocopy);
+       if (cpuid->nent < 1)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
 
-               kunmap_atomic(page_virt, KM_USER0);
+       do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[0].eax;
+       for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                               &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
 
-               bytes -= tocopy;
-               data += tocopy;
-               addr += tocopy;
-       }
+       do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[nent - 1].eax;
+       for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                              &nent, cpuid->nent);
+       r = -EFAULT;
+       if (copy_to_user(entries, cpuid_entries,
+                       nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out_free;
+       cpuid->nent = nent;
+       r = 0;
 
-       return X86EMUL_CONTINUE;
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
 }
-EXPORT_SYMBOL_GPL(emulator_read_std);
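/*
 * Illustrative sketch (not part of this patch): the intended flow is for
 * userspace to ask which CPUID features the kernel can expose and then to
 * install a copy into each vcpu.  At this point in the tree
 * KVM_GET_SUPPORTED_CPUID is a VM ioctl (see kvm_arch_vm_ioctl() below);
 * vm_fd/vcpu_fd are assumed to be open, and a fixed entry count is used
 * for brevity.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define NENT 100	/* arbitrary upper bound for this sketch */

static int install_guest_cpuid(int vm_fd, int vcpu_fd)
{
	struct kvm_cpuid2 *cpuid;
	int r;

	cpuid = calloc(1, sizeof(*cpuid) + NENT * sizeof(struct kvm_cpuid_entry2));
	if (!cpuid)
		return -1;
	cpuid->nent = NENT;

	r = ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
	if (!r)
		r = ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
	free(cpuid);
	return r;
}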
 
-static int emulator_write_std(unsigned long addr,
-                             const void *val,
-                             unsigned int bytes,
-                             struct kvm_vcpu *vcpu)
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+                                   struct kvm_lapic_state *s)
 {
-       pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
-       return X86EMUL_UNHANDLEABLE;
+       vcpu_load(vcpu);
+       memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
+       vcpu_put(vcpu);
+
+       return 0;
 }
 
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr)
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+                                   struct kvm_lapic_state *s)
 {
-       struct kvm_io_device *dev;
+       vcpu_load(vcpu);
+       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+       kvm_apic_post_state_restore(vcpu);
+       vcpu_put(vcpu);
 
-       if (vcpu->apic) {
-               dev = &vcpu->apic->dev;
-               if (dev->in_range(dev, addr))
-                       return dev;
-       }
-       return NULL;
+       return 0;
 }
 
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr)
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+                                   struct kvm_interrupt *irq)
 {
-       struct kvm_io_device *dev;
+       if (irq->irq < 0 || irq->irq >= 256)
+               return -EINVAL;
+       if (irqchip_in_kernel(vcpu->kvm))
+               return -ENXIO;
+       vcpu_load(vcpu);
 
-       dev = vcpu_find_pervcpu_dev(vcpu, addr);
-       if (dev == NULL)
-               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
-       return dev;
+       set_bit(irq->irq, vcpu->arch.irq_pending);
+       set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+
+       vcpu_put(vcpu);
+
+       return 0;
 }
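/*
 * Illustrative sketch (not part of this patch): with the interrupt
 * controllers emulated in userspace, an external interrupt vector is queued
 * into the vcpu's irq_pending bitmap through KVM_INTERRUPT; the handler
 * above rejects the call with -ENXIO once an in-kernel irqchip exists.
 * vcpu_fd is assumed to be an open vcpu fd.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int queue_external_interrupt(int vcpu_fd, unsigned int vector)
{
	struct kvm_interrupt irq = { .irq = vector };

	return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
}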
 
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-                                              gpa_t addr)
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+                                          struct kvm_tpr_access_ctl *tac)
 {
-       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+       if (tac->flags)
+               return -EINVAL;
+       vcpu->arch.tpr_access_reporting = !!tac->enabled;
+       return 0;
 }
 
-static int emulator_read_emulated(unsigned long addr,
-                                 void *val,
-                                 unsigned int bytes,
-                                 struct kvm_vcpu *vcpu)
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
 {
-       struct kvm_io_device *mmio_dev;
-       gpa_t                 gpa;
-
-       if (vcpu->mmio_read_completed) {
-               memcpy(val, vcpu->mmio_data, bytes);
-               vcpu->mmio_read_completed = 0;
-               return X86EMUL_CONTINUE;
-       } else if (emulator_read_std(addr, val, bytes, vcpu)
-                  == X86EMUL_CONTINUE)
-               return X86EMUL_CONTINUE;
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r;
 
-       gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
-       if (gpa == UNMAPPED_GVA)
-               return X86EMUL_PROPAGATE_FAULT;
+       switch (ioctl) {
+       case KVM_GET_LAPIC: {
+               struct kvm_lapic_state lapic;
 
-       /*
-        * Is this MMIO handled locally?
-        */
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-       if (mmio_dev) {
-               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-               return X86EMUL_CONTINUE;
+               memset(&lapic, 0, sizeof lapic);
+               r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &lapic, sizeof lapic))
+                       goto out;
+               r = 0;
+               break;
        }
+       case KVM_SET_LAPIC: {
+               struct kvm_lapic_state lapic;
 
-       vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 0;
-
-       return X86EMUL_UNHANDLEABLE;
-}
-
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-                              const void *val, int bytes)
-{
-       struct page *page;
-       void *virt;
-
-       if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
-               return 0;
-       page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       if (!page)
-               return 0;
-       mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
-       virt = kmap_atomic(page, KM_USER0);
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
-       memcpy(virt + offset_in_page(gpa), val, bytes);
-       kunmap_atomic(virt, KM_USER0);
-       return 1;
-}
+               r = -EFAULT;
+               if (copy_from_user(&lapic, argp, sizeof lapic))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_INTERRUPT: {
+               struct kvm_interrupt irq;
 
-static int emulator_write_emulated_onepage(unsigned long addr,
-                                          const void *val,
-                                          unsigned int bytes,
-                                          struct kvm_vcpu *vcpu)
-{
-       struct kvm_io_device *mmio_dev;
-       gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+               r = -EFAULT;
+               if (copy_from_user(&irq, argp, sizeof irq))
+                       goto out;
+               r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_CPUID: {
+               struct kvm_cpuid __user *cpuid_arg = argp;
+               struct kvm_cpuid cpuid;
 
-       if (gpa == UNMAPPED_GVA) {
-               kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
-               return X86EMUL_PROPAGATE_FAULT;
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+               if (r)
+                       goto out;
+               break;
        }
+       case KVM_SET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
 
-       if (emulator_write_phys(vcpu, gpa, val, bytes))
-               return X86EMUL_CONTINUE;
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_GET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
 
-       /*
-        * Is this MMIO handled locally?
-        */
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
-       if (mmio_dev) {
-               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-               return X86EMUL_CONTINUE;
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
        }
+       case KVM_GET_MSRS:
+               r = msr_io(vcpu, argp, kvm_get_msr, 1);
+               break;
+       case KVM_SET_MSRS:
+               r = msr_io(vcpu, argp, do_set_msr, 0);
+               break;
+       case KVM_TPR_ACCESS_REPORTING: {
+               struct kvm_tpr_access_ctl tac;
 
-       vcpu->mmio_needed = 1;
-       vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->mmio_is_write = 1;
-       memcpy(vcpu->mmio_data, val, bytes);
+               r = -EFAULT;
+               if (copy_from_user(&tac, argp, sizeof tac))
+                       goto out;
+               r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &tac, sizeof tac))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_VAPIC_ADDR: {
+               struct kvm_vapic_addr va;
 
-       return X86EMUL_CONTINUE;
+               r = -EINVAL;
+               if (!irqchip_in_kernel(vcpu->kvm))
+                       goto out;
+               r = -EFAULT;
+               if (copy_from_user(&va, argp, sizeof va))
+                       goto out;
+               r = 0;
+               kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+               break;
+       }
+       default:
+               r = -EINVAL;
+       }
+out:
+       return r;
 }
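/*
 * Illustrative sketch (not part of this patch): KVM_GET_LAPIC/KVM_SET_LAPIC
 * above exist so userspace can snapshot and restore the in-kernel local
 * APIC, e.g. across a reset or for save/restore.  vcpu_fd is assumed to be
 * open and the in-kernel irqchip enabled.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int snapshot_lapic(int vcpu_fd, struct kvm_lapic_state *state)
{
	return ioctl(vcpu_fd, KVM_GET_LAPIC, state);
}

static int restore_lapic(int vcpu_fd, struct kvm_lapic_state *state)
{
	return ioctl(vcpu_fd, KVM_SET_LAPIC, state);
}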
 
-int emulator_write_emulated(unsigned long addr,
-                                  const void *val,
-                                  unsigned int bytes,
-                                  struct kvm_vcpu *vcpu)
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 {
-       /* Crossing a page boundary? */
-       if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-               int rc, now;
+       int ret;
 
-               now = -addr & ~PAGE_MASK;
-               rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
-               addr += now;
-               val += now;
-               bytes -= now;
-       }
-       return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+       if (addr > (unsigned int)(-3 * PAGE_SIZE))
+               return -1;
+       ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+       return ret;
 }
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
-static int emulator_cmpxchg_emulated(unsigned long addr,
-                                    const void *old,
-                                    const void *new,
-                                    unsigned int bytes,
-                                    struct kvm_vcpu *vcpu)
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+                                         u32 kvm_nr_mmu_pages)
 {
-       static int reported;
+       if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+               return -EINVAL;
 
-       if (!reported) {
-               reported = 1;
-               printk(KERN_WARNING "kvm: emulating exchange as write\n");
-       }
-       return emulator_write_emulated(addr, new, bytes, vcpu);
-}
+       down_write(&current->mm->mmap_sem);
 
-static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-       return kvm_x86_ops->get_segment_base(vcpu, seg);
-}
+       kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+       kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
-{
-       return X86EMUL_CONTINUE;
+       up_write(&current->mm->mmap_sem);
+       return 0;
 }
 
-int emulate_clts(struct kvm_vcpu *vcpu)
+static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 {
-       kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
-       return X86EMUL_CONTINUE;
+       return kvm->arch.n_alloc_mmu_pages;
 }
 
-int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
-       struct kvm_vcpu *vcpu = ctxt->vcpu;
+       int i;
+       struct kvm_mem_alias *alias;
 
-       switch (dr) {
-       case 0 ... 3:
-               *dest = kvm_x86_ops->get_dr(vcpu, dr);
-               return X86EMUL_CONTINUE;
-       default:
-               pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
-               return X86EMUL_UNHANDLEABLE;
+       for (i = 0; i < kvm->arch.naliases; ++i) {
+               alias = &kvm->arch.aliases[i];
+               if (gfn >= alias->base_gfn
+                   && gfn < alias->base_gfn + alias->npages)
+                       return alias->target_gfn + gfn - alias->base_gfn;
        }
+       return gfn;
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+/*
+ * Set a new alias region.  Aliases map a portion of physical memory into
+ * another portion.  This is useful for memory windows, for example the PC
+ * VGA region.
+ */
+static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
+                                        struct kvm_memory_alias *alias)
 {
-       unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-       int exception;
+       int r, n;
+       struct kvm_mem_alias *p;
 
-       kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
-       if (exception) {
-               /* FIXME: better handling */
-               return X86EMUL_UNHANDLEABLE;
-       }
-       return X86EMUL_CONTINUE;
-}
+       r = -EINVAL;
+       /* General sanity checks */
+       if (alias->memory_size & (PAGE_SIZE - 1))
+               goto out;
+       if (alias->guest_phys_addr & (PAGE_SIZE - 1))
+               goto out;
+       if (alias->slot >= KVM_ALIAS_SLOTS)
+               goto out;
+       if (alias->guest_phys_addr + alias->memory_size
+           < alias->guest_phys_addr)
+               goto out;
+       if (alias->target_phys_addr + alias->memory_size
+           < alias->target_phys_addr)
+               goto out;
 
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
-       static int reported;
-       u8 opcodes[4];
-       unsigned long rip = vcpu->rip;
-       unsigned long rip_linear;
+       down_write(&current->mm->mmap_sem);
 
-       rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+       p = &kvm->arch.aliases[alias->slot];
+       p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
+       p->npages = alias->memory_size >> PAGE_SHIFT;
+       p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
 
-       if (reported)
-               return;
+       for (n = KVM_ALIAS_SLOTS; n > 0; --n)
+               if (kvm->arch.aliases[n - 1].npages)
+                       break;
+       kvm->arch.naliases = n;
 
-       emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+       kvm_mmu_zap_all(kvm);
 
-       printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-              context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
-       reported = 1;
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+       up_write(&current->mm->mmap_sem);
 
-struct x86_emulate_ops emulate_ops = {
-       .read_std            = emulator_read_std,
-       .write_std           = emulator_write_std,
-       .read_emulated       = emulator_read_emulated,
-       .write_emulated      = emulator_write_emulated,
-       .cmpxchg_emulated    = emulator_cmpxchg_emulated,
-};
+       return 0;
 
-int emulate_instruction(struct kvm_vcpu *vcpu,
-                       struct kvm_run *run,
-                       unsigned long cr2,
-                       u16 error_code)
+out:
+       return r;
+}
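/*
 * Illustrative sketch (not part of this patch): the comment above names the
 * PC VGA window as the motivating case for aliases.  The numbers here are
 * hypothetical; vm_fd is assumed to be an open VM fd and framebuffer_gpa a
 * guest-physical region already backed by a memory slot.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int alias_vga_window(int vm_fd, __u64 framebuffer_gpa)
{
	struct kvm_memory_alias alias = {
		.slot             = 0,
		.guest_phys_addr  = 0xa0000,	/* legacy VGA window */
		.memory_size      = 0x20000,	/* 128 KiB */
		.target_phys_addr = framebuffer_gpa,
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias);
}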
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
-       struct x86_emulate_ctxt emulate_ctxt;
        int r;
-       int cs_db, cs_l;
 
-       vcpu->mmio_fault_cr2 = cr2;
-       kvm_x86_ops->cache_regs(vcpu);
-
-       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-
-       emulate_ctxt.vcpu = vcpu;
-       emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
-       emulate_ctxt.cr2 = cr2;
-       emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
-               ? X86EMUL_MODE_REAL : cs_l
-               ? X86EMUL_MODE_PROT64 : cs_db
-               ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-
-       if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
-               emulate_ctxt.cs_base = 0;
-               emulate_ctxt.ds_base = 0;
-               emulate_ctxt.es_base = 0;
-               emulate_ctxt.ss_base = 0;
-       } else {
-               emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
-               emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
-               emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
-               emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
+       r = 0;
+       switch (chip->chip_id) {
+       case KVM_IRQCHIP_PIC_MASTER:
+               memcpy(&chip->chip.pic,
+                       &pic_irqchip(kvm)->pics[0],
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_PIC_SLAVE:
+               memcpy(&chip->chip.pic,
+                       &pic_irqchip(kvm)->pics[1],
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_IOAPIC:
+               memcpy(&chip->chip.ioapic,
+                       ioapic_irqchip(kvm),
+                       sizeof(struct kvm_ioapic_state));
+               break;
+       default:
+               r = -EINVAL;
+               break;
        }
+       return r;
+}
 
-       emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
-       emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
-
-       vcpu->mmio_is_write = 0;
-       vcpu->pio.string = 0;
-       r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
-       if (vcpu->pio.string)
-               return EMULATE_DO_MMIO;
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+       int r;
 
-       if ((r || vcpu->mmio_is_write) && run) {
-               run->exit_reason = KVM_EXIT_MMIO;
-               run->mmio.phys_addr = vcpu->mmio_phys_addr;
-               memcpy(run->mmio.data, vcpu->mmio_data, 8);
-               run->mmio.len = vcpu->mmio_size;
-               run->mmio.is_write = vcpu->mmio_is_write;
+       r = 0;
+       switch (chip->chip_id) {
+       case KVM_IRQCHIP_PIC_MASTER:
+               memcpy(&pic_irqchip(kvm)->pics[0],
+                       &chip->chip.pic,
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_PIC_SLAVE:
+               memcpy(&pic_irqchip(kvm)->pics[1],
+                       &chip->chip.pic,
+                       sizeof(struct kvm_pic_state));
+               break;
+       case KVM_IRQCHIP_IOAPIC:
+               memcpy(ioapic_irqchip(kvm),
+                       &chip->chip.ioapic,
+                       sizeof(struct kvm_ioapic_state));
+               break;
+       default:
+               r = -EINVAL;
+               break;
        }
+       kvm_pic_update_irq(pic_irqchip(kvm));
+       return r;
+}
 
-       if (r) {
-               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-                       return EMULATE_DONE;
-               if (!vcpu->mmio_needed) {
-                       kvm_report_emulation_failure(vcpu, "mmio");
-                       return EMULATE_FAIL;
-               }
-               return EMULATE_DO_MMIO;
-       }
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+{
+       int r;
+       int n;
+       struct kvm_memory_slot *memslot;
+       int is_dirty = 0;
 
-       kvm_x86_ops->decache_regs(vcpu);
-       kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
+       down_write(&current->mm->mmap_sem);
 
-       if (vcpu->mmio_is_write) {
-               vcpu->mmio_needed = 0;
-               return EMULATE_DO_MMIO;
-       }
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
 
-       return EMULATE_DONE;
+       /* If nothing is dirty, don't bother messing with page tables. */
+       if (is_dirty) {
+               kvm_mmu_slot_remove_write_access(kvm, log->slot);
+               kvm_flush_remote_tlbs(kvm);
+               memslot = &kvm->memslots[log->slot];
+               n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+       r = 0;
+out:
+       up_write(&current->mm->mmap_sem);
+       return r;
 }
-EXPORT_SYMBOL_GPL(emulate_instruction);
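/*
 * Illustrative sketch (not part of this patch): userspace retrieves (and
 * thereby clears) a slot's dirty bitmap with KVM_GET_DIRTY_LOG.  The buffer
 * must hold one bit per page, rounded up as in the kernel code above; vm_fd,
 * slot and npages are the caller's responsibility.
 */
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void *fetch_dirty_bitmap(int vm_fd, __u32 slot, __u64 npages)
{
	size_t len = ((npages + 63) / 64) * 8;	/* one bit per page, 64-bit aligned */
	struct kvm_dirty_log log;
	void *bitmap = calloc(1, len);

	if (!bitmap)
		return NULL;
	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return NULL;
	}
	return bitmap;
}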
 
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+long kvm_arch_vm_ioctl(struct file *filp,
+                      unsigned int ioctl, unsigned long arg)
 {
-       DECLARE_WAITQUEUE(wait, current);
+       struct kvm *kvm = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r = -EINVAL;
 
-       add_wait_queue(&vcpu->wq, &wait);
+       switch (ioctl) {
+       case KVM_SET_TSS_ADDR:
+               r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+               if (r < 0)
+                       goto out;
+               break;
+       case KVM_SET_MEMORY_REGION: {
+               struct kvm_memory_region kvm_mem;
+               struct kvm_userspace_memory_region kvm_userspace_mem;
 
-       /*
-        * We will block until either an interrupt or a signal wakes us up
-        */
-       while (!kvm_cpu_has_interrupt(vcpu)
-              && !signal_pending(current)
-              && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
-              && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               vcpu_put(vcpu);
-               schedule();
-               vcpu_load(vcpu);
+               r = -EFAULT;
+               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
+                       goto out;
+               kvm_userspace_mem.slot = kvm_mem.slot;
+               kvm_userspace_mem.flags = kvm_mem.flags;
+               kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
+               kvm_userspace_mem.memory_size = kvm_mem.memory_size;
+               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
+               if (r)
+                       goto out;
+               break;
        }
+       case KVM_SET_NR_MMU_PAGES:
+               r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+               if (r)
+                       goto out;
+               break;
+       case KVM_GET_NR_MMU_PAGES:
+               r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+               break;
+       case KVM_SET_MEMORY_ALIAS: {
+               struct kvm_memory_alias alias;
 
-       __set_current_state(TASK_RUNNING);
-       remove_wait_queue(&vcpu->wq, &wait);
-}
+               r = -EFAULT;
+               if (copy_from_user(&alias, argp, sizeof alias))
+                       goto out;
+               r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_CREATE_IRQCHIP:
+               r = -ENOMEM;
+               kvm->arch.vpic = kvm_create_pic(kvm);
+               if (kvm->arch.vpic) {
+                       r = kvm_ioapic_init(kvm);
+                       if (r) {
+                               kfree(kvm->arch.vpic);
+                               kvm->arch.vpic = NULL;
+                               goto out;
+                       }
+               } else
+                       goto out;
+               break;
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level irq_event;
 
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
-       ++vcpu->stat.halt_exits;
-       if (irqchip_in_kernel(vcpu->kvm)) {
-               vcpu->mp_state = VCPU_MP_STATE_HALTED;
-               kvm_vcpu_block(vcpu);
-               if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
-                       return -EINTR;
-               return 1;
-       } else {
-               vcpu->run->exit_reason = KVM_EXIT_HLT;
-               return 0;
+               r = -EFAULT;
+               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                       goto out;
+               if (irqchip_in_kernel(kvm)) {
+                       mutex_lock(&kvm->lock);
+                       if (irq_event.irq < 16)
+                               kvm_pic_set_irq(pic_irqchip(kvm),
+                                       irq_event.irq,
+                                       irq_event.level);
+                       kvm_ioapic_set_irq(kvm->arch.vioapic,
+                                       irq_event.irq,
+                                       irq_event.level);
+                       mutex_unlock(&kvm->lock);
+                       r = 0;
+               }
+               break;
        }
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+       case KVM_GET_IRQCHIP: {
+               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+               struct kvm_irqchip chip;
 
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-       unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
+               r = -EFAULT;
+               if (copy_from_user(&chip, argp, sizeof chip))
+                       goto out;
+               r = -ENXIO;
+               if (!irqchip_in_kernel(kvm))
+                       goto out;
+               r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &chip, sizeof chip))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_IRQCHIP: {
+               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+               struct kvm_irqchip chip;
 
-       kvm_x86_ops->cache_regs(vcpu);
-       ret = -KVM_EINVAL;
-#ifdef CONFIG_X86_64
-       if (is_long_mode(vcpu)) {
-               nr = vcpu->regs[VCPU_REGS_RAX];
-               a0 = vcpu->regs[VCPU_REGS_RDI];
-               a1 = vcpu->regs[VCPU_REGS_RSI];
-               a2 = vcpu->regs[VCPU_REGS_RDX];
-               a3 = vcpu->regs[VCPU_REGS_RCX];
-               a4 = vcpu->regs[VCPU_REGS_R8];
-               a5 = vcpu->regs[VCPU_REGS_R9];
-       } else
-#endif
-       {
-               nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
-               a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
-               a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
-               a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
-               a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
-               a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
-               a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
+               r = -EFAULT;
+               if (copy_from_user(&chip, argp, sizeof chip))
+                       goto out;
+               r = -ENXIO;
+               if (!irqchip_in_kernel(kvm))
+                       goto out;
+               r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_GET_SUPPORTED_CPUID: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+                       cpuid_arg->entries);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
        }
-       switch (nr) {
        default:
-               run->hypercall.nr = nr;
-               run->hypercall.args[0] = a0;
-               run->hypercall.args[1] = a1;
-               run->hypercall.args[2] = a2;
-               run->hypercall.args[3] = a3;
-               run->hypercall.args[4] = a4;
-               run->hypercall.args[5] = a5;
-               run->hypercall.ret = ret;
-               run->hypercall.longmode = is_long_mode(vcpu);
-               kvm_x86_ops->decache_regs(vcpu);
-               return 0;
+               ;
        }
-       vcpu->regs[VCPU_REGS_RAX] = ret;
-       kvm_x86_ops->decache_regs(vcpu);
-       return 1;
+out:
+       return r;
 }
-EXPORT_SYMBOL_GPL(kvm_hypercall);
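/*
 * Illustrative sketch (not part of this patch): once KVM_CREATE_IRQCHIP has
 * succeeded on the VM fd, interrupt lines are raised and lowered with
 * KVM_IRQ_LINE; the handler above routes GSIs below 16 to the PIC and all
 * of them to the IOAPIC.  vm_fd is assumed to be an open VM fd.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pulse_irq_line(int vm_fd, __u32 gsi)
{
	struct kvm_irq_level event = { .irq = gsi, .level = 1 };

	if (ioctl(vm_fd, KVM_IRQ_LINE, &event) < 0)
		return -1;
	event.level = 0;			/* lower the line again */
	return ioctl(vm_fd, KVM_IRQ_LINE, &event);
}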
 
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+static void kvm_init_msr_list(void)
 {
-       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+       u32 dummy[2];
+       unsigned i, j;
+
+       for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+               if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+                       continue;
+               if (j < i)
+                       msrs_to_save[j] = msrs_to_save[i];
+               j++;
+       }
+       num_msrs_to_save = j;
 }
 
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+/*
+ * Only the APIC needs an MMIO device hook, so take a shortcut here.
+ */
+static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
+                                               gpa_t addr)
 {
-       struct descriptor_table dt = { limit, base };
+       struct kvm_io_device *dev;
 
-       kvm_x86_ops->set_gdt(vcpu, &dt);
+       if (vcpu->arch.apic) {
+               dev = &vcpu->arch.apic->dev;
+               if (dev->in_range(dev, addr))
+                       return dev;
+       }
+       return NULL;
 }
 
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
+                                               gpa_t addr)
 {
-       struct descriptor_table dt = { limit, base };
+       struct kvm_io_device *dev;
 
-       kvm_x86_ops->set_idt(vcpu, &dt);
+       dev = vcpu_find_pervcpu_dev(vcpu, addr);
+       if (dev == NULL)
+               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
+       return dev;
 }
 
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-                  unsigned long *rflags)
+int emulator_read_std(unsigned long addr,
+                            void *val,
+                            unsigned int bytes,
+                            struct kvm_vcpu *vcpu)
 {
-       lmsw(vcpu, msw);
-       *rflags = kvm_x86_ops->get_rflags(vcpu);
-}
+       void *data = val;
+       int r = X86EMUL_CONTINUE;
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-       switch (cr) {
-       case 0:
-               return vcpu->cr0;
-       case 2:
-               return vcpu->cr2;
-       case 3:
-               return vcpu->cr3;
-       case 4:
-               return vcpu->cr4;
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
-               return 0;
-       }
-}
+       down_read(&current->mm->mmap_sem);
+       while (bytes) {
+               gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+               unsigned offset = addr & (PAGE_SIZE-1);
+               unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+               int ret;
 
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-                    unsigned long *rflags)
-{
-       switch (cr) {
-       case 0:
-               set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
-               *rflags = kvm_x86_ops->get_rflags(vcpu);
-               break;
-       case 2:
-               vcpu->cr2 = val;
-               break;
-       case 3:
-               set_cr3(vcpu, val);
-               break;
-       case 4:
-               set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
-               break;
-       default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+               if (gpa == UNMAPPED_GVA) {
+                       r = X86EMUL_PROPAGATE_FAULT;
+                       goto out;
+               }
+               ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+               if (ret < 0) {
+                       r = X86EMUL_UNHANDLEABLE;
+                       goto out;
+               }
+
+               bytes -= tocopy;
+               data += tocopy;
+               addr += tocopy;
        }
+out:
+       up_read(&current->mm->mmap_sem);
+       return r;
 }
+EXPORT_SYMBOL_GPL(emulator_read_std);
 
-/*
- * Register the para guest with the host:
- */
-static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
+static int emulator_read_emulated(unsigned long addr,
+                                 void *val,
+                                 unsigned int bytes,
+                                 struct kvm_vcpu *vcpu)
 {
-       struct kvm_vcpu_para_state *para_state;
-       hpa_t para_state_hpa, hypercall_hpa;
-       struct page *para_state_page;
-       unsigned char *hypercall;
-       gpa_t hypercall_gpa;
-
-       printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
-       printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa;
 
-       /*
-        * Needs to be page aligned:
-        */
-       if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
-               goto err_gp;
+       if (vcpu->mmio_read_completed) {
+               memcpy(val, vcpu->mmio_data, bytes);
+               vcpu->mmio_read_completed = 0;
+               return X86EMUL_CONTINUE;
+       }
 
-       para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
-       printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
-       if (is_error_hpa(para_state_hpa))
-               goto err_gp;
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       up_read(&current->mm->mmap_sem);
 
-       mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
-       para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
-       para_state = kmap(para_state_page);
+       /* For APIC access vmexit */
+       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               goto mmio;
 
-       printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
-       printk(KERN_DEBUG "....           size: %d\n", para_state->size);
+       if (emulator_read_std(addr, val, bytes, vcpu)
+                       == X86EMUL_CONTINUE)
+               return X86EMUL_CONTINUE;
+       if (gpa == UNMAPPED_GVA)
+               return X86EMUL_PROPAGATE_FAULT;
 
-       para_state->host_version = KVM_PARA_API_VERSION;
+mmio:
        /*
-        * We cannot support guests that try to register themselves
-        * with a newer API version than the host supports:
+        * Is this MMIO handled locally?
         */
-       if (para_state->guest_version > KVM_PARA_API_VERSION) {
-               para_state->ret = -KVM_EINVAL;
-               goto err_kunmap_skip;
+       mutex_lock(&vcpu->kvm->lock);
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+               mutex_unlock(&vcpu->kvm->lock);
+               return X86EMUL_CONTINUE;
        }
+       mutex_unlock(&vcpu->kvm->lock);
 
-       hypercall_gpa = para_state->hypercall_gpa;
-       hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
-       printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
-       if (is_error_hpa(hypercall_hpa)) {
-               para_state->ret = -KVM_EINVAL;
-               goto err_kunmap_skip;
-       }
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 0;
 
-       printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
-       vcpu->para_state_page = para_state_page;
-       vcpu->para_state_gpa = para_state_gpa;
-       vcpu->hypercall_gpa = hypercall_gpa;
+       return X86EMUL_UNHANDLEABLE;
+}
 
-       mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
-       hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
-                               KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
-       kvm_x86_ops->patch_hypercall(vcpu, hypercall);
-       kunmap_atomic(hypercall, KM_USER1);
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+                              const void *val, int bytes)
+{
+       int ret;
 
-       para_state->ret = 0;
-err_kunmap_skip:
-       kunmap(para_state_page);
-       return 0;
-err_gp:
+       down_read(&current->mm->mmap_sem);
+       ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+       if (ret < 0) {
+               up_read(&current->mm->mmap_sem);
+               return 0;
+       }
+       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       up_read(&current->mm->mmap_sem);
        return 1;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int emulator_write_emulated_onepage(unsigned long addr,
+                                          const void *val,
+                                          unsigned int bytes,
+                                          struct kvm_vcpu *vcpu)
 {
-       u64 data;
+       struct kvm_io_device *mmio_dev;
+       gpa_t                 gpa;
 
-       switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
-       case MSR_IA32_P5_MC_ADDR:
-       case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
-       case MSR_IA32_MCG_CAP:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_UCODE_REV:
-       case MSR_IA32_PERF_STATUS:
-       case MSR_IA32_EBL_CR_POWERON:
-               /* MTRR registers */
-       case 0xfe:
-       case 0x200 ... 0x2ff:
-               data = 0;
-               break;
-       case 0xcd: /* fsb frequency */
-               data = 3;
-               break;
-       case MSR_IA32_APICBASE:
-               data = kvm_get_apic_base(vcpu);
-               break;
-       case MSR_IA32_MISC_ENABLE:
-               data = vcpu->ia32_misc_enable_msr;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_EFER:
-               data = vcpu->shadow_efer;
-               break;
-#endif
-       default:
-               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-               return 1;
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+       up_read(&current->mm->mmap_sem);
+
+       if (gpa == UNMAPPED_GVA) {
+               kvm_inject_page_fault(vcpu, addr, 2);
+               return X86EMUL_PROPAGATE_FAULT;
        }
-       *pdata = data;
-       return 0;
+
+       /* For APIC access vmexit */
+       if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+               goto mmio;
+
+       if (emulator_write_phys(vcpu, gpa, val, bytes))
+               return X86EMUL_CONTINUE;
+
+mmio:
+       /*
+        * Is this MMIO handled locally?
+        */
+       mutex_lock(&vcpu->kvm->lock);
+       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
+       if (mmio_dev) {
+               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+               mutex_unlock(&vcpu->kvm->lock);
+               return X86EMUL_CONTINUE;
+       }
+       mutex_unlock(&vcpu->kvm->lock);
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_phys_addr = gpa;
+       vcpu->mmio_size = bytes;
+       vcpu->mmio_is_write = 1;
+       memcpy(vcpu->mmio_data, val, bytes);
+
+       return X86EMUL_CONTINUE;
 }
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+int emulator_write_emulated(unsigned long addr,
+                                  const void *val,
+                                  unsigned int bytes,
+                                  struct kvm_vcpu *vcpu)
 {
-       return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
-}
+       /* Crossing a page boundary? */
+       if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+               int rc, now;
 
-#ifdef CONFIG_X86_64
+               now = -addr & ~PAGE_MASK;
+               rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+               addr += now;
+               val += now;
+               bytes -= now;
+       }
+       return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+}
+EXPORT_SYMBOL_GPL(emulator_write_emulated);
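/*
 * Illustrative sketch (not part of this patch): in emulator_write_emulated()
 * above, "now = -addr & ~PAGE_MASK" is the number of bytes from addr up to
 * the next page boundary, so a write that straddles a page is split into two
 * single-page writes.  A standalone check of that identity, assuming 4 KiB
 * pages:
 */
#include <assert.h>

#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

static unsigned long bytes_to_page_end(unsigned long addr)
{
	return -addr & ~SKETCH_PAGE_MASK;
}

static void check_page_split(void)
{
	assert(bytes_to_page_end(0x1ffe) == 2);		/* 2 bytes left before 0x2000 */
	assert(bytes_to_page_end(0x1001) == 0xfff);	/* almost a whole page left */
	assert(bytes_to_page_end(0x2000) == 0);		/* already on a boundary */
}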
 
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int emulator_cmpxchg_emulated(unsigned long addr,
+                                    const void *old,
+                                    const void *new,
+                                    unsigned int bytes,
+                                    struct kvm_vcpu *vcpu)
 {
-       if (efer & EFER_RESERVED_BITS) {
-               printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
-                      efer);
-               inject_gp(vcpu);
-               return;
-       }
+       static int reported;
 
-       if (is_paging(vcpu)
-           && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
-               printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
-               inject_gp(vcpu);
-               return;
+       if (!reported) {
+               reported = 1;
+               printk(KERN_WARNING "kvm: emulating exchange as write\n");
        }
+#ifndef CONFIG_X86_64
+       /* a guest's cmpxchg8b has to be emulated atomically */
+       if (bytes == 8) {
+               gpa_t gpa;
+               struct page *page;
+               char *kaddr;	/* distinct name: 'addr' is the gva parameter used below */
+               u64 val;
 
-       kvm_x86_ops->set_efer(vcpu, efer);
+               down_read(&current->mm->mmap_sem);
+               gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 
-       efer &= ~EFER_LMA;
-       efer |= vcpu->shadow_efer & EFER_LMA;
+               if (gpa == UNMAPPED_GVA ||
+                  (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+                       goto emul_write;
 
-       vcpu->shadow_efer = efer;
-}
+               if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+                       goto emul_write;
 
+               val = *(u64 *)new;
+               page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+               kaddr = kmap_atomic(page, KM_USER0);
+               set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
+               kunmap_atomic(kaddr, KM_USER0);
+               kvm_release_page_dirty(page);
+       emul_write:
+               up_read(&current->mm->mmap_sem);
+       }
 #endif
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
-       switch (msr) {
-#ifdef CONFIG_X86_64
-       case MSR_EFER:
-               set_efer(vcpu, data);
-               break;
-#endif
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __FUNCTION__, data);
-               break;
-       case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __FUNCTION__, data);
-               break;
-       case MSR_IA32_UCODE_REV:
-       case MSR_IA32_UCODE_WRITE:
-       case 0x200 ... 0x2ff: /* MTRRs */
-               break;
-       case MSR_IA32_APICBASE:
-               kvm_set_apic_base(vcpu, data);
-               break;
-       case MSR_IA32_MISC_ENABLE:
-               vcpu->ia32_misc_enable_msr = data;
-               break;
-       /*
-        * This is the 'probe whether the host is KVM' logic:
-        */
-       case MSR_KVM_API_MAGIC:
-               return vcpu_register_para(vcpu, data);
+       return emulator_write_emulated(addr, new, bytes, vcpu);
+}
 
-       default:
-               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
-               return 1;
-       }
-       return 0;
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+       return kvm_x86_ops->get_segment_base(vcpu, seg);
 }
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 
-/*
- * Writes msr value into into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
-       return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+       return X86EMUL_CONTINUE;
 }
 
-void kvm_resched(struct kvm_vcpu *vcpu)
+int emulate_clts(struct kvm_vcpu *vcpu)
 {
-       if (!need_resched())
-               return;
-       cond_resched();
+       kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
+       return X86EMUL_CONTINUE;
 }
-EXPORT_SYMBOL_GPL(kvm_resched);
 
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
 {
-       int i;
-       u32 function;
-       struct kvm_cpuid_entry *e, *best;
+       struct kvm_vcpu *vcpu = ctxt->vcpu;
 
-       kvm_x86_ops->cache_regs(vcpu);
-       function = vcpu->regs[VCPU_REGS_RAX];
-       vcpu->regs[VCPU_REGS_RAX] = 0;
-       vcpu->regs[VCPU_REGS_RBX] = 0;
-       vcpu->regs[VCPU_REGS_RCX] = 0;
-       vcpu->regs[VCPU_REGS_RDX] = 0;
-       best = NULL;
-       for (i = 0; i < vcpu->cpuid_nent; ++i) {
-               e = &vcpu->cpuid_entries[i];
-               if (e->function == function) {
-                       best = e;
-                       break;
+       switch (dr) {
+       case 0 ... 3:
+               *dest = kvm_x86_ops->get_dr(vcpu, dr);
+               return X86EMUL_CONTINUE;
+       default:
+               pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+               return X86EMUL_UNHANDLEABLE;
+       }
+}
+
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+{
+       unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
+       int exception;
+
+       kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
+       if (exception) {
+               /* FIXME: better handling */
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return X86EMUL_CONTINUE;
+}
+
+void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+{
+       static int reported;
+       u8 opcodes[4];
+       unsigned long rip = vcpu->arch.rip;
+       unsigned long rip_linear;
+
+       rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
+
+       if (reported)
+               return;
+
+       emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+
+       printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
+              context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+       reported = 1;
+}
+EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
+
+struct x86_emulate_ops emulate_ops = {
+       .read_std            = emulator_read_std,
+       .read_emulated       = emulator_read_emulated,
+       .write_emulated      = emulator_write_emulated,
+       .cmpxchg_emulated    = emulator_cmpxchg_emulated,
+};
+
+int emulate_instruction(struct kvm_vcpu *vcpu,
+                       struct kvm_run *run,
+                       unsigned long cr2,
+                       u16 error_code,
+                       int emulation_type)
+{
+       int r;
+       struct decode_cache *c;
+
+       vcpu->arch.mmio_fault_cr2 = cr2;
+       kvm_x86_ops->cache_regs(vcpu);
+
+       vcpu->mmio_is_write = 0;
+       vcpu->arch.pio.string = 0;
+
+       if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+               int cs_db, cs_l;
+               kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+               vcpu->arch.emulate_ctxt.vcpu = vcpu;
+               vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+               vcpu->arch.emulate_ctxt.mode =
+                       (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+                       ? X86EMUL_MODE_REAL : cs_l
+                       ? X86EMUL_MODE_PROT64 : cs_db
+                       ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+
+               if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
+                       vcpu->arch.emulate_ctxt.cs_base = 0;
+                       vcpu->arch.emulate_ctxt.ds_base = 0;
+                       vcpu->arch.emulate_ctxt.es_base = 0;
+                       vcpu->arch.emulate_ctxt.ss_base = 0;
+               } else {
+                       vcpu->arch.emulate_ctxt.cs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_CS);
+                       vcpu->arch.emulate_ctxt.ds_base =
+                                       get_segment_base(vcpu, VCPU_SREG_DS);
+                       vcpu->arch.emulate_ctxt.es_base =
+                                       get_segment_base(vcpu, VCPU_SREG_ES);
+                       vcpu->arch.emulate_ctxt.ss_base =
+                                       get_segment_base(vcpu, VCPU_SREG_SS);
+               }
+
+               vcpu->arch.emulate_ctxt.gs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_GS);
+               vcpu->arch.emulate_ctxt.fs_base =
+                                       get_segment_base(vcpu, VCPU_SREG_FS);
+
+               r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+               /* When emulating an invalid opcode (#UD), reject any
+                * instruction other than VMCALL/VMMCALL */
+               c = &vcpu->arch.emulate_ctxt.decode;
+               if ((emulation_type & EMULTYPE_TRAP_UD) &&
+                   (!(c->twobyte && c->b == 0x01 &&
+                     (c->modrm_reg == 0 || c->modrm_reg == 3) &&
+                      c->modrm_mod == 3 && c->modrm_rm == 1)))
+                       return EMULATE_FAIL;
+
+               ++vcpu->stat.insn_emulation;
+               if (r)  {
+                       ++vcpu->stat.insn_emulation_fail;
+                       if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                               return EMULATE_DONE;
+                       return EMULATE_FAIL;
                }
-               /*
-                * Both basic or both extended?
-                */
-               if (((e->function ^ function) & 0x80000000) == 0)
-                       if (!best || e->function > best->function)
-                               best = e;
        }
-       if (best) {
-               vcpu->regs[VCPU_REGS_RAX] = best->eax;
-               vcpu->regs[VCPU_REGS_RBX] = best->ebx;
-               vcpu->regs[VCPU_REGS_RCX] = best->ecx;
-               vcpu->regs[VCPU_REGS_RDX] = best->edx;
+
+       r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+
+       if (vcpu->arch.pio.string)
+               return EMULATE_DO_MMIO;
+
+       if ((r || vcpu->mmio_is_write) && run) {
+               run->exit_reason = KVM_EXIT_MMIO;
+               run->mmio.phys_addr = vcpu->mmio_phys_addr;
+               memcpy(run->mmio.data, vcpu->mmio_data, 8);
+               run->mmio.len = vcpu->mmio_size;
+               run->mmio.is_write = vcpu->mmio_is_write;
+       }
+
+       if (r) {
+               if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+                       return EMULATE_DONE;
+               if (!vcpu->mmio_needed) {
+                       kvm_report_emulation_failure(vcpu, "mmio");
+                       return EMULATE_FAIL;
+               }
+               return EMULATE_DO_MMIO;
        }
+
        kvm_x86_ops->decache_regs(vcpu);
-       kvm_x86_ops->skip_emulated_instruction(vcpu);
+       kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+
+       if (vcpu->mmio_is_write) {
+               vcpu->mmio_needed = 0;
+               return EMULATE_DO_MMIO;
+       }
+
+       return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(emulate_instruction);
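As a side note on the hunk above: the nested conditional that selects the emulator mode is equivalent to the standalone helper below. This is purely an illustrative sketch (pick_emulator_mode is not part of the patch); it assumes the X86EMUL_MODE_* constants and the cs_l/cs_db bits obtained via get_cs_db_l_bits(), exactly as used in emulate_instruction().

/* Illustrative only: how emulate_instruction() chooses the emulator mode.
 * EFLAGS.VM selects virtual-8086 (handled as real mode by the emulator);
 * otherwise CS.L selects 64-bit, CS.D/B selects 32-bit, else 16-bit. */
static int pick_emulator_mode(unsigned long eflags, int cs_l, int cs_db)
{
        if (eflags & X86_EFLAGS_VM)
                return X86EMUL_MODE_REAL;
        if (cs_l)
                return X86EMUL_MODE_PROT64;
        if (cs_db)
                return X86EMUL_MODE_PROT32;
        return X86EMUL_MODE_PROT16;
}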
+
+static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
+               if (vcpu->arch.pio.guest_pages[i]) {
+                       kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
+                       vcpu->arch.pio.guest_pages[i] = NULL;
+               }
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
 static int pio_copy_data(struct kvm_vcpu *vcpu)
 {
-       void *p = vcpu->pio_data;
+       void *p = vcpu->arch.pio_data;
        void *q;
        unsigned bytes;
-       int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
+       int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
 
-       q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
+       q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
                 PAGE_KERNEL);
        if (!q) {
                free_pio_guest_pages(vcpu);
                return -ENOMEM;
        }
-       q += vcpu->pio.guest_page_offset;
-       bytes = vcpu->pio.size * vcpu->pio.cur_count;
-       if (vcpu->pio.in)
+       q += vcpu->arch.pio.guest_page_offset;
+       bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
+       if (vcpu->arch.pio.in)
                memcpy(q, p, bytes);
        else
                memcpy(p, q, bytes);
-       q -= vcpu->pio.guest_page_offset;
+       q -= vcpu->arch.pio.guest_page_offset;
        vunmap(q);
        free_pio_guest_pages(vcpu);
        return 0;
 }
 
-static int complete_pio(struct kvm_vcpu *vcpu)
+int complete_pio(struct kvm_vcpu *vcpu)
 {
-       struct kvm_pio_request *io = &vcpu->pio;
+       struct kvm_pio_request *io = &vcpu->arch.pio;
        long delta;
        int r;
 
@@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
 
        if (!io->string) {
                if (io->in)
-                       memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
+                       memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
                               io->size);
        } else {
                if (io->in) {
@@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu)
                         * The size of the register should really depend on
                         * current address size.
                         */
-                       vcpu->regs[VCPU_REGS_RCX] -= delta;
+                       vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
                }
                if (io->down)
                        delta = -delta;
                delta *= io->size;
                if (io->in)
-                       vcpu->regs[VCPU_REGS_RDI] += delta;
+                       vcpu->arch.regs[VCPU_REGS_RDI] += delta;
                else
-                       vcpu->regs[VCPU_REGS_RSI] += delta;
+                       vcpu->arch.regs[VCPU_REGS_RSI] += delta;
        }
 
        kvm_x86_ops->decache_regs(vcpu);
@@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
        /* TODO: String I/O for in kernel device */
 
        mutex_lock(&vcpu->kvm->lock);
-       if (vcpu->pio.in)
-               kvm_iodevice_read(pio_dev, vcpu->pio.port,
-                                 vcpu->pio.size,
+       if (vcpu->arch.pio.in)
+               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
+                                 vcpu->arch.pio.size,
                                  pd);
        else
-               kvm_iodevice_write(pio_dev, vcpu->pio.port,
-                                  vcpu->pio.size,
+               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
+                                  vcpu->arch.pio.size,
                                   pd);
        mutex_unlock(&vcpu->kvm->lock);
 }
@@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
 static void pio_string_write(struct kvm_io_device *pio_dev,
                             struct kvm_vcpu *vcpu)
 {
-       struct kvm_pio_request *io = &vcpu->pio;
-       void *pd = vcpu->pio_data;
+       struct kvm_pio_request *io = &vcpu->arch.pio;
+       void *pd = vcpu->arch.pio_data;
        int i;
 
        mutex_lock(&vcpu->kvm->lock);
@@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
        mutex_unlock(&vcpu->kvm->lock);
 }
 
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
+                                              gpa_t addr)
+{
+       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
+}
+
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned port)
 {
        struct kvm_io_device *pio_dev;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->pio.size = size;
+       vcpu->run->io.size = vcpu->arch.pio.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
-       vcpu->run->io.port = vcpu->pio.port = port;
-       vcpu->pio.in = in;
-       vcpu->pio.string = 0;
-       vcpu->pio.down = 0;
-       vcpu->pio.guest_page_offset = 0;
-       vcpu->pio.rep = 0;
+       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
+       vcpu->run->io.port = vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = in;
+       vcpu->arch.pio.string = 0;
+       vcpu->arch.pio.down = 0;
+       vcpu->arch.pio.guest_page_offset = 0;
+       vcpu->arch.pio.rep = 0;
 
        kvm_x86_ops->cache_regs(vcpu);
-       memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
+       memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
        kvm_x86_ops->decache_regs(vcpu);
 
        kvm_x86_ops->skip_emulated_instruction(vcpu);
 
        pio_dev = vcpu_find_pio_dev(vcpu, port);
        if (pio_dev) {
-               kernel_pio(pio_dev, vcpu, vcpu->pio_data);
+               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
                complete_pio(vcpu);
                return 1;
        }
@@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-       vcpu->run->io.size = vcpu->pio.size = size;
+       vcpu->run->io.size = vcpu->arch.pio.size = size;
        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-       vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
-       vcpu->run->io.port = vcpu->pio.port = port;
-       vcpu->pio.in = in;
-       vcpu->pio.string = 1;
-       vcpu->pio.down = down;
-       vcpu->pio.guest_page_offset = offset_in_page(address);
-       vcpu->pio.rep = rep;
+       vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+       vcpu->run->io.port = vcpu->arch.pio.port = port;
+       vcpu->arch.pio.in = in;
+       vcpu->arch.pio.string = 1;
+       vcpu->arch.pio.down = down;
+       vcpu->arch.pio.guest_page_offset = offset_in_page(address);
+       vcpu->arch.pio.rep = rep;
 
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
                 */
                pr_unimpl(vcpu, "guest string pio down\n");
-               inject_gp(vcpu);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
        vcpu->run->io.count = now;
-       vcpu->pio.cur_count = now;
+       vcpu->arch.pio.cur_count = now;
 
-       if (vcpu->pio.cur_count == vcpu->pio.count)
+       if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
                kvm_x86_ops->skip_emulated_instruction(vcpu);
 
        for (i = 0; i < nr_pages; ++i) {
-               mutex_lock(&vcpu->kvm->lock);
+               down_read(&current->mm->mmap_sem);
                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
-               if (page)
-                       get_page(page);
-               vcpu->pio.guest_pages[i] = page;
-               mutex_unlock(&vcpu->kvm->lock);
+               vcpu->arch.pio.guest_pages[i] = page;
+               up_read(&current->mm->mmap_sem);
                if (!page) {
-                       inject_gp(vcpu);
+                       kvm_inject_gp(vcpu, 0);
                        free_pio_guest_pages(vcpu);
                        return 1;
                }
        }
 
        pio_dev = vcpu_find_pio_dev(vcpu, port);
-       if (!vcpu->pio.in) {
+       if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
                if (ret >= 0 && pio_dev) {
                        pio_string_write(pio_dev, vcpu);
                        complete_pio(vcpu);
-                       if (vcpu->pio.count == 0)
+                       if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
        } else if (pio_dev)
@@ -1953,124 +2182,437 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
-static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
-                                         struct kvm_run *kvm_run)
-{
-       return (!vcpu->irq_summary &&
-               kvm_run->request_interrupt_window &&
-               vcpu->interrupt_window_open &&
-               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
-}
-
-static void post_kvm_run_save(struct kvm_vcpu *vcpu,
-                             struct kvm_run *kvm_run)
-{
-       kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-       kvm_run->cr8 = get_cr8(vcpu);
-       kvm_run->apic_base = kvm_get_apic_base(vcpu);
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_run->ready_for_interrupt_injection = 1;
-       else
-               kvm_run->ready_for_interrupt_injection =
-                                       (vcpu->interrupt_window_open &&
-                                        vcpu->irq_summary == 0);
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_init(void *opaque)
 {
        int r;
+       struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
 
-       if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
-               printk("vcpu %d received sipi with vector # %x\n",
-                      vcpu->vcpu_id, vcpu->sipi_vector);
-               kvm_lapic_reset(vcpu);
-               kvm_x86_ops->vcpu_reset(vcpu);
-               vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+       if (kvm_x86_ops) {
+               printk(KERN_ERR "kvm: already loaded the other module\n");
+               r = -EEXIST;
+               goto out;
        }
 
-preempted:
-       if (vcpu->guest_debug.enabled)
-               kvm_x86_ops->guest_debug_pre(vcpu);
+       if (!ops->cpu_has_kvm_support()) {
+               printk(KERN_ERR "kvm: no hardware support\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       if (ops->disabled_by_bios()) {
+               printk(KERN_ERR "kvm: disabled by bios\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
 
-again:
-       r = kvm_mmu_reload(vcpu);
-       if (unlikely(r))
+       r = kvm_mmu_module_init();
+       if (r)
                goto out;
 
-       preempt_disable();
+       kvm_init_msr_list();
 
-       kvm_x86_ops->prepare_guest_switch(vcpu);
-       kvm_load_guest_fpu(vcpu);
+       kvm_x86_ops = ops;
+       kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+       return 0;
 
-       local_irq_disable();
+out:
+       return r;
+}
 
-       if (signal_pending(current)) {
-               local_irq_enable();
-               preempt_enable();
-               r = -EINTR;
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               ++vcpu->stat.signal_exits;
-               goto out;
+void kvm_arch_exit(void)
+{
+       kvm_x86_ops = NULL;
+       kvm_mmu_module_exit();
+}
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.halt_exits;
+       if (irqchip_in_kernel(vcpu->kvm)) {
+               vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+               kvm_vcpu_block(vcpu);
+               if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+                       return -EINTR;
+               return 1;
+       } else {
+               vcpu->run->exit_reason = KVM_EXIT_HLT;
+               return 0;
        }
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_x86_ops->inject_pending_irq(vcpu);
-       else if (!vcpu->mmio_read_completed)
-               kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+       unsigned long nr, a0, a1, a2, a3, ret;
 
-       vcpu->guest_mode = 1;
-       kvm_guest_enter();
+       kvm_x86_ops->cache_regs(vcpu);
 
-       if (vcpu->requests)
-               if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
-                       kvm_x86_ops->tlb_flush(vcpu);
+       nr = vcpu->arch.regs[VCPU_REGS_RAX];
+       a0 = vcpu->arch.regs[VCPU_REGS_RBX];
+       a1 = vcpu->arch.regs[VCPU_REGS_RCX];
+       a2 = vcpu->arch.regs[VCPU_REGS_RDX];
+       a3 = vcpu->arch.regs[VCPU_REGS_RSI];
 
-       kvm_x86_ops->run(vcpu, kvm_run);
+       if (!is_long_mode(vcpu)) {
+               nr &= 0xFFFFFFFF;
+               a0 &= 0xFFFFFFFF;
+               a1 &= 0xFFFFFFFF;
+               a2 &= 0xFFFFFFFF;
+               a3 &= 0xFFFFFFFF;
+       }
 
-       vcpu->guest_mode = 0;
-       local_irq_enable();
+       switch (nr) {
+       case KVM_HC_VAPIC_POLL_IRQ:
+               ret = 0;
+               break;
+       default:
+               ret = -KVM_ENOSYS;
+               break;
+       }
+       vcpu->arch.regs[VCPU_REGS_RAX] = ret;
+       kvm_x86_ops->decache_regs(vcpu);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
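For context, the guest-facing side of the ABI handled here places the hypercall number in RAX and up to four arguments in RBX, RCX, RDX and RSI, with the result returned in RAX (truncated to 32 bits when the guest is not in long mode, as the masking above shows). A minimal guest-side sketch, assuming a 64-bit guest and issuing VMCALL directly rather than going through the kvm_hypercall*() wrappers:

/* Illustrative guest-side stub for kvm_emulate_hypercall()'s ABI:
 * nr in RAX, first argument in RBX, return value in RAX.
 * KVM_HC_VAPIC_POLL_IRQ returns 0; unknown numbers return -KVM_ENOSYS. */
static inline long guest_hypercall1(unsigned long nr, unsigned long a0)
{
        long ret;

        asm volatile("vmcall"
                     : "=a" (ret)
                     : "a" (nr), "b" (a0)
                     : "memory");
        return ret;
}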
+
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+{
+       char instruction[3];
+       int ret = 0;
 
-       ++vcpu->stat.exits;
 
        /*
-        * We must have an instruction between local_irq_enable() and
-        * kvm_guest_exit(), so the timer interrupt isn't delayed by
-        * the interrupt shadow.  The stat.exits increment will do nicely.
-        * But we need to prevent reordering, hence this barrier():
+        * Blow out the MMU so that no other VCPU has an active mapping,
+        * ensuring that the updated hypercall appears atomically across
+        * all VCPUs.
         */
-       barrier();
+       kvm_mmu_zap_all(vcpu->kvm);
 
-       kvm_guest_exit();
+       kvm_x86_ops->cache_regs(vcpu);
+       kvm_x86_ops->patch_hypercall(vcpu, instruction);
+       if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+           != X86EMUL_CONTINUE)
+               ret = -EFAULT;
 
-       preempt_enable();
+       return ret;
+}
 
-       /*
-        * Profile KVM exit RIPs:
-        */
-       if (unlikely(prof_on == KVM_PROFILING)) {
-               kvm_x86_ops->cache_regs(vcpu);
-               profile_hit(KVM_PROFILING, (void *)vcpu->rip);
-       }
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+       return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
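mk_cr_64() exists because the emulated/real-mode control-register writes below only carry a 32-bit value: the helper splices that value into the low half of the current 64-bit register while preserving the upper half. A worked example (the values are purely illustrative):

/*   curr_cr = 0x123456780000003b, new_val = 0x8005003b
 *   mk_cr_64(curr_cr, new_val) == 0x123456788005003b
 * i.e. bits 63:32 of curr_cr are kept, bits 31:0 are replaced. */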
 
-       r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+       struct descriptor_table dt = { limit, base };
 
-       if (r > 0) {
-               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-                       r = -EINTR;
-                       kvm_run->exit_reason = KVM_EXIT_INTR;
-                       ++vcpu->stat.request_irq_exits;
-                       goto out;
-               }
-               if (!need_resched()) {
-                       ++vcpu->stat.light_exits;
-                       goto again;
+       kvm_x86_ops->set_gdt(vcpu, &dt);
+}
+
+void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
+{
+       struct descriptor_table dt = { limit, base };
+
+       kvm_x86_ops->set_idt(vcpu, &dt);
+}
+
+void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
+                  unsigned long *rflags)
+{
+       lmsw(vcpu, msw);
+       *rflags = kvm_x86_ops->get_rflags(vcpu);
+}
+
+unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
+{
+       kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+       switch (cr) {
+       case 0:
+               return vcpu->arch.cr0;
+       case 2:
+               return vcpu->arch.cr2;
+       case 3:
+               return vcpu->arch.cr3;
+       case 4:
+               return vcpu->arch.cr4;
+       case 8:
+               return get_cr8(vcpu);
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+               return 0;
+       }
+}
+
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
+                    unsigned long *rflags)
+{
+       switch (cr) {
+       case 0:
+               set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+               *rflags = kvm_x86_ops->get_rflags(vcpu);
+               break;
+       case 2:
+               vcpu->arch.cr2 = val;
+               break;
+       case 3:
+               set_cr3(vcpu, val);
+               break;
+       case 4:
+               set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+               break;
+       case 8:
+               set_cr8(vcpu, val & 0xfUL);
+               break;
+       default:
+               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+       }
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+       struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+       int j, nent = vcpu->arch.cpuid_nent;
+
+       e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+       /* when no next entry is found, the current entry[i] is reselected */
+       for (j = i + 1; ; j = (j + 1) % nent) {
+               struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+               if (ej->function == e->function) {
+                       ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+                       return j;
+               }
+       }
+       return 0; /* silence gcc, even though control never reaches here */
+}
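The loop above implements a round robin over stateful CPUID leaves (such as leaf 2 on older Intel CPUs, where successive CPUID executions return successive portions of the cache-descriptor data): all entries sharing a function number form a ring, KVM_CPUID_FLAG_STATE_READ_NEXT marks the one to return next, and if entry i is the only member the search wraps back to i itself. A sketch of how userspace might lay out such a ring (the field values are illustrative, not taken from real hardware):

/* Three entries for one stateful function; the READ_NEXT flag starts
 * on the first and is advanced by move_to_next_stateful_cpuid_entry(). */
struct kvm_cpuid_entry2 stateful_leaf2[3] = {
        { .function = 2, .index = 0,
          .flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                   KVM_CPUID_FLAG_STATE_READ_NEXT },
        { .function = 2, .index = 1,
          .flags = KVM_CPUID_FLAG_STATEFUL_FUNC },
        { .function = 2, .index = 2,
          .flags = KVM_CPUID_FLAG_STATEFUL_FUNC },
};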
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+       u32 function, u32 index)
+{
+       if (e->function != function)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+               !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+               return 0;
+       return 1;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+       int i;
+       u32 function, index;
+       struct kvm_cpuid_entry2 *e, *best;
+
+       kvm_x86_ops->cache_regs(vcpu);
+       function = vcpu->arch.regs[VCPU_REGS_RAX];
+       index = vcpu->arch.regs[VCPU_REGS_RCX];
+       vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RBX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+       vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+       best = NULL;
+       for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+               e = &vcpu->arch.cpuid_entries[i];
+               if (is_matching_cpuid_entry(e, function, index)) {
+                       if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+                               move_to_next_stateful_cpuid_entry(vcpu, i);
+                       best = e;
+                       break;
+               }
+               /*
+                * Both basic or both extended?
+                */
+               if (((e->function ^ function) & 0x80000000) == 0)
+                       if (!best || e->function > best->function)
+                               best = e;
+       }
+       if (best) {
+               vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
+               vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
+               vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
+               vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+       }
+       kvm_x86_ops->decache_regs(vcpu);
+       kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
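The fallback comparison ((e->function ^ function) & 0x80000000) == 0 in kvm_emulate_cpuid() keeps the highest-numbered entry from the same range, basic leaves (below 0x80000000) or extended leaves (0x80000000 and above), when no exact match exists. A small, purely illustrative restatement of that bit test:

/* Two leaf numbers are in the same range iff bit 31 of their XOR is clear. */
static inline int same_cpuid_range(u32 a, u32 b)
{
        return ((a ^ b) & 0x80000000) == 0;
}
/* same_cpuid_range(0x00000002, 0x00000007) -> 1 (both basic)
 * same_cpuid_range(0x80000001, 0x80000008) -> 1 (both extended)
 * same_cpuid_range(0x00000001, 0x80000001) -> 0 (different ranges) */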
+
+/*
+ * Check if userspace requested an interrupt window, and that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
+                                         struct kvm_run *kvm_run)
+{
+       return (!vcpu->arch.irq_summary &&
+               kvm_run->request_interrupt_window &&
+               vcpu->arch.interrupt_window_open &&
+               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu,
+                             struct kvm_run *kvm_run)
+{
+       kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+       kvm_run->cr8 = get_cr8(vcpu);
+       kvm_run->apic_base = kvm_get_apic_base(vcpu);
+       if (irqchip_in_kernel(vcpu->kvm))
+               kvm_run->ready_for_interrupt_injection = 1;
+       else
+               kvm_run->ready_for_interrupt_injection =
+                                       (vcpu->arch.interrupt_window_open &&
+                                        vcpu->arch.irq_summary == 0);
+}
+
+static void vapic_enter(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       struct page *page;
+
+       if (!apic || !apic->vapic_addr)
+               return;
+
+       down_read(&current->mm->mmap_sem);
+       page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+       vcpu->arch.apic->vapic_page = page;
+       up_read(&current->mm->mmap_sem);
+}
+
+static void vapic_exit(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       if (!apic || !apic->vapic_addr)
+               return;
+
+       kvm_release_page_dirty(apic->vapic_page);
+       mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       int r;
+
+       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+               pr_debug("vcpu %d received sipi with vector # %x\n",
+                      vcpu->vcpu_id, vcpu->arch.sipi_vector);
+               kvm_lapic_reset(vcpu);
+               r = kvm_x86_ops->vcpu_reset(vcpu);
+               if (r)
+                       return r;
+               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+       }
+
+       vapic_enter(vcpu);
+
+preempted:
+       if (vcpu->guest_debug.enabled)
+               kvm_x86_ops->guest_debug_pre(vcpu);
+
+again:
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r))
+               goto out;
+
+       if (vcpu->requests) {
+               if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+                       __kvm_migrate_apic_timer(vcpu);
+               if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
+                                      &vcpu->requests)) {
+                       kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
+                       r = 0;
+                       goto out;
+               }
+       }
+
+       kvm_inject_pending_timer_irqs(vcpu);
+
+       preempt_disable();
+
+       kvm_x86_ops->prepare_guest_switch(vcpu);
+       kvm_load_guest_fpu(vcpu);
+
+       local_irq_disable();
+
+       if (need_resched()) {
+               local_irq_enable();
+               preempt_enable();
+               r = 1;
+               goto out;
+       }
+
+       if (signal_pending(current)) {
+               local_irq_enable();
+               preempt_enable();
+               r = -EINTR;
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               ++vcpu->stat.signal_exits;
+               goto out;
+       }
+
+       if (vcpu->arch.exception.pending)
+               __queue_exception(vcpu);
+       else if (irqchip_in_kernel(vcpu->kvm))
+               kvm_x86_ops->inject_pending_irq(vcpu);
+       else
+               kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+
+       kvm_lapic_sync_to_vapic(vcpu);
+
+       vcpu->guest_mode = 1;
+       kvm_guest_enter();
+
+       if (vcpu->requests)
+               if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                       kvm_x86_ops->tlb_flush(vcpu);
+
+       kvm_x86_ops->run(vcpu, kvm_run);
+
+       vcpu->guest_mode = 0;
+       local_irq_enable();
+
+       ++vcpu->stat.exits;
+
+       /*
+        * We must have an instruction between local_irq_enable() and
+        * kvm_guest_exit(), so the timer interrupt isn't delayed by
+        * the interrupt shadow.  The stat.exits increment will do nicely.
+        * But we need to prevent reordering, hence this barrier():
+        */
+       barrier();
+
+       kvm_guest_exit();
+
+       preempt_enable();
+
+       /*
+        * Profile KVM exit RIPs:
+        */
+       if (unlikely(prof_on == KVM_PROFILING)) {
+               kvm_x86_ops->cache_regs(vcpu);
+               profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+       }
+
+       if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
+               vcpu->arch.exception.pending = false;
+
+       kvm_lapic_sync_from_vapic(vcpu);
+
+       r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+
+       if (r > 0) {
+               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                       r = -EINTR;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       ++vcpu->stat.request_irq_exits;
+                       goto out;
                }
+               if (!need_resched())
+                       goto again;
        }
 
 out:
@@ -2081,18 +2623,19 @@ out:
 
        post_kvm_run_save(vcpu, kvm_run);
 
+       vapic_exit(vcpu);
+
        return r;
 }
 
-
-static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
        sigset_t sigsaved;
 
        vcpu_load(vcpu);
 
-       if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+       if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
                vcpu_put(vcpu);
                return -EAGAIN;
@@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (!irqchip_in_kernel(vcpu->kvm))
                set_cr8(vcpu, kvm_run->cr8);
 
-       if (vcpu->pio.cur_count) {
+       if (vcpu->arch.pio.cur_count) {
                r = complete_pio(vcpu);
                if (r)
                        goto out;
        }
-
+#ifdef CONFIG_HAS_IOMEM
        if (vcpu->mmio_needed) {
                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
                vcpu->mmio_read_completed = 1;
                vcpu->mmio_needed = 0;
                r = emulate_instruction(vcpu, kvm_run,
-                                       vcpu->mmio_fault_cr2, 0);
+                                       vcpu->arch.mmio_fault_cr2, 0,
+                                       EMULTYPE_NO_DECODE);
                if (r == EMULATE_DO_MMIO) {
                        /*
                         * Read-modify-write.  Back to userspace.
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                        goto out;
                }
        }
-
+#endif
        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
                kvm_x86_ops->cache_regs(vcpu);
-               vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
+               vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
                kvm_x86_ops->decache_regs(vcpu);
        }
 
@@ -2142,33 +2686,32 @@ out:
        return r;
 }
 
-static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
-                                  struct kvm_regs *regs)
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        vcpu_load(vcpu);
 
        kvm_x86_ops->cache_regs(vcpu);
 
-       regs->rax = vcpu->regs[VCPU_REGS_RAX];
-       regs->rbx = vcpu->regs[VCPU_REGS_RBX];
-       regs->rcx = vcpu->regs[VCPU_REGS_RCX];
-       regs->rdx = vcpu->regs[VCPU_REGS_RDX];
-       regs->rsi = vcpu->regs[VCPU_REGS_RSI];
-       regs->rdi = vcpu->regs[VCPU_REGS_RDI];
-       regs->rsp = vcpu->regs[VCPU_REGS_RSP];
-       regs->rbp = vcpu->regs[VCPU_REGS_RBP];
+       regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
+       regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
+       regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
+       regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
+       regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
+       regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
 #ifdef CONFIG_X86_64
-       regs->r8 = vcpu->regs[VCPU_REGS_R8];
-       regs->r9 = vcpu->regs[VCPU_REGS_R9];
-       regs->r10 = vcpu->regs[VCPU_REGS_R10];
-       regs->r11 = vcpu->regs[VCPU_REGS_R11];
-       regs->r12 = vcpu->regs[VCPU_REGS_R12];
-       regs->r13 = vcpu->regs[VCPU_REGS_R13];
-       regs->r14 = vcpu->regs[VCPU_REGS_R14];
-       regs->r15 = vcpu->regs[VCPU_REGS_R15];
+       regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
+       regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
+       regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
+       regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
+       regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
+       regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
+       regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
+       regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
 #endif
 
-       regs->rip = vcpu->rip;
+       regs->rip = vcpu->arch.rip;
        regs->rflags = kvm_x86_ops->get_rflags(vcpu);
 
        /*
@@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
-                                  struct kvm_regs *regs)
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        vcpu_load(vcpu);
 
-       vcpu->regs[VCPU_REGS_RAX] = regs->rax;
-       vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
-       vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
-       vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
-       vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
-       vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
-       vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
-       vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
+       vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
+       vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
+       vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
+       vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
+       vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
+       vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
+       vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
+       vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
 #ifdef CONFIG_X86_64
-       vcpu->regs[VCPU_REGS_R8] = regs->r8;
-       vcpu->regs[VCPU_REGS_R9] = regs->r9;
-       vcpu->regs[VCPU_REGS_R10] = regs->r10;
-       vcpu->regs[VCPU_REGS_R11] = regs->r11;
-       vcpu->regs[VCPU_REGS_R12] = regs->r12;
-       vcpu->regs[VCPU_REGS_R13] = regs->r13;
-       vcpu->regs[VCPU_REGS_R14] = regs->r14;
-       vcpu->regs[VCPU_REGS_R15] = regs->r15;
+       vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
+       vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
+       vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
+       vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
+       vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
+       vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
+       vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
+       vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
 #endif
 
-       vcpu->rip = regs->rip;
+       vcpu->arch.rip = regs->rip;
        kvm_x86_ops->set_rflags(vcpu, regs->rflags);
 
        kvm_x86_ops->decache_regs(vcpu);
@@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu,
        return kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
-static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                   struct kvm_sregs *sregs)
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+       struct kvm_segment cs;
+
+       get_segment(vcpu, &cs, VCPU_SREG_CS);
+       *db = cs.db;
+       *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
 {
        struct descriptor_table dt;
        int pending_vec;
@@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
        sregs->gdt.base = dt.base;
 
        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
-       sregs->cr0 = vcpu->cr0;
-       sregs->cr2 = vcpu->cr2;
-       sregs->cr3 = vcpu->cr3;
-       sregs->cr4 = vcpu->cr4;
+       sregs->cr0 = vcpu->arch.cr0;
+       sregs->cr2 = vcpu->arch.cr2;
+       sregs->cr3 = vcpu->arch.cr3;
+       sregs->cr4 = vcpu->arch.cr4;
        sregs->cr8 = get_cr8(vcpu);
-       sregs->efer = vcpu->shadow_efer;
+       sregs->efer = vcpu->arch.shadow_efer;
        sregs->apic_base = kvm_get_apic_base(vcpu);
 
        if (irqchip_in_kernel(vcpu->kvm)) {
@@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                       sizeof sregs->interrupt_bitmap);
                pending_vec = kvm_x86_ops->get_irq(vcpu);
                if (pending_vec >= 0)
-                       set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+                       set_bit(pending_vec,
+                               (unsigned long *)sregs->interrupt_bitmap);
        } else
-               memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
+               memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
                       sizeof sregs->interrupt_bitmap);
 
        vcpu_put(vcpu);
@@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu,
        return kvm_x86_ops->set_segment(vcpu, var, seg);
 }
 
-static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                   struct kvm_sregs *sregs)
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs)
 {
        int mmu_reset_needed = 0;
        int i, pending_vec, max_bits;
@@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        dt.base = sregs->gdt.base;
        kvm_x86_ops->set_gdt(vcpu, &dt);
 
-       vcpu->cr2 = sregs->cr2;
-       mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
-       vcpu->cr3 = sregs->cr3;
+       vcpu->arch.cr2 = sregs->cr2;
+       mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+       vcpu->arch.cr3 = sregs->cr3;
 
        set_cr8(vcpu, sregs->cr8);
 
-       mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
+       mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
 #ifdef CONFIG_X86_64
        kvm_x86_ops->set_efer(vcpu, sregs->efer);
 #endif
@@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
 
-       mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
-       vcpu->cr0 = sregs->cr0;
+       mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
+       vcpu->arch.cr0 = sregs->cr0;
        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
 
-       mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
+       mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
        if (!is_long_mode(vcpu) && is_pae(vcpu))
-               load_pdptrs(vcpu, vcpu->cr3);
+               load_pdptrs(vcpu, vcpu->arch.cr3);
 
        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);
 
        if (!irqchip_in_kernel(vcpu->kvm)) {
-               memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
-                      sizeof vcpu->irq_pending);
-               vcpu->irq_summary = 0;
-               for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
-                       if (vcpu->irq_pending[i])
-                               __set_bit(i, &vcpu->irq_summary);
+               memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
+                      sizeof vcpu->arch.irq_pending);
+               vcpu->arch.irq_summary = 0;
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
+                       if (vcpu->arch.irq_pending[i])
+                               __set_bit(i, &vcpu->arch.irq_summary);
        } else {
                max_bits = (sizeof sregs->interrupt_bitmap) << 3;
                pending_vec = find_first_bit(
@@ -2334,1295 +2887,401 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                /* Only pending external irq is handled here */
                if (pending_vec < max_bits) {
                        kvm_x86_ops->set_irq(vcpu, pending_vec);
-                       printk("Set back pending irq %d\n", pending_vec);
+                       pr_debug("Set back pending irq %d\n",
+                                pending_vec);
                }
        }
 
        set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
        set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
-       set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
-       set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
-       set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
-       set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
-
-       set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
-       set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-       struct kvm_segment cs;
-
-       get_segment(vcpu, &cs, VCPU_SREG_CS);
-       *db = cs.db;
-       *l = cs.l;
-}
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-
-/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * This list is modified at module load time to reflect the
- * capabilities of the host cpu.
- */
-static u32 msrs_to_save[] = {
-       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-       MSR_K6_STAR,
-#ifdef CONFIG_X86_64
-       MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
-       MSR_IA32_TIME_STAMP_COUNTER,
-};
-
-static unsigned num_msrs_to_save;
-
-static u32 emulated_msrs[] = {
-       MSR_IA32_MISC_ENABLE,
-};
-
-static __init void kvm_init_msr_list(void)
-{
-       u32 dummy[2];
-       unsigned i, j;
-
-       for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
-               if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
-                       continue;
-               if (j < i)
-                       msrs_to_save[j] = msrs_to_save[i];
-               j++;
-       }
-       num_msrs_to_save = j;
-}
-
-/*
- * Adapt set_msr() to msr_io()'s calling convention
- */
-static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
-{
-       return kvm_set_msr(vcpu, index, *data);
-}
-
-/*
- * Read or write a bunch of msrs. All parameters are kernel addresses.
- *
- * @return number of msrs set successfully.
- */
-static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
-                   struct kvm_msr_entry *entries,
-                   int (*do_msr)(struct kvm_vcpu *vcpu,
-                                 unsigned index, u64 *data))
-{
-       int i;
-
-       vcpu_load(vcpu);
-
-       for (i = 0; i < msrs->nmsrs; ++i)
-               if (do_msr(vcpu, entries[i].index, &entries[i].data))
-                       break;
-
-       vcpu_put(vcpu);
-
-       return i;
-}
-
-/*
- * Read or write a bunch of msrs. Parameters are user addresses.
- *
- * @return number of msrs set successfully.
- */
-static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
-                 int (*do_msr)(struct kvm_vcpu *vcpu,
-                               unsigned index, u64 *data),
-                 int writeback)
-{
-       struct kvm_msrs msrs;
-       struct kvm_msr_entry *entries;
-       int r, n;
-       unsigned size;
-
-       r = -EFAULT;
-       if (copy_from_user(&msrs, user_msrs, sizeof msrs))
-               goto out;
-
-       r = -E2BIG;
-       if (msrs.nmsrs >= MAX_IO_MSRS)
-               goto out;
-
-       r = -ENOMEM;
-       size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-       entries = vmalloc(size);
-       if (!entries)
-               goto out;
-
-       r = -EFAULT;
-       if (copy_from_user(entries, user_msrs->entries, size))
-               goto out_free;
-
-       r = n = __msr_io(vcpu, &msrs, entries, do_msr);
-       if (r < 0)
-               goto out_free;
-
-       r = -EFAULT;
-       if (writeback && copy_to_user(user_msrs->entries, entries, size))
-               goto out_free;
-
-       r = n;
-
-out_free:
-       vfree(entries);
-out:
-       return r;
-}
-
-/*
- * Translate a guest virtual address to a guest physical address.
- */
-static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
-                                   struct kvm_translation *tr)
-{
-       unsigned long vaddr = tr->linear_address;
-       gpa_t gpa;
-
-       vcpu_load(vcpu);
-       mutex_lock(&vcpu->kvm->lock);
-       gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
-       tr->physical_address = gpa;
-       tr->valid = gpa != UNMAPPED_GVA;
-       tr->writeable = 1;
-       tr->usermode = 0;
-       mutex_unlock(&vcpu->kvm->lock);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
-                                   struct kvm_interrupt *irq)
-{
-       if (irq->irq < 0 || irq->irq >= 256)
-               return -EINVAL;
-       if (irqchip_in_kernel(vcpu->kvm))
-               return -ENXIO;
-       vcpu_load(vcpu);
-
-       set_bit(irq->irq, vcpu->irq_pending);
-       set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-                                     struct kvm_debug_guest *dbg)
-{
-       int r;
-
-       vcpu_load(vcpu);
-
-       r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
-
-       vcpu_put(vcpu);
-
-       return r;
-}
-
-static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
-                                   unsigned long address,
-                                   int *type)
-{
-       struct kvm_vcpu *vcpu = vma->vm_file->private_data;
-       unsigned long pgoff;
-       struct page *page;
-
-       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-       if (pgoff == 0)
-               page = virt_to_page(vcpu->run);
-       else if (pgoff == KVM_PIO_PAGE_OFFSET)
-               page = virt_to_page(vcpu->pio_data);
-       else
-               return NOPAGE_SIGBUS;
-       get_page(page);
-       if (type != NULL)
-               *type = VM_FAULT_MINOR;
-
-       return page;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
-       .nopage = kvm_vcpu_nopage,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       vma->vm_ops = &kvm_vcpu_vm_ops;
-       return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
-       struct kvm_vcpu *vcpu = filp->private_data;
-
-       fput(vcpu->kvm->filp);
-       return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
-       .release        = kvm_vcpu_release,
-       .unlocked_ioctl = kvm_vcpu_ioctl,
-       .compat_ioctl   = kvm_vcpu_ioctl,
-       .mmap           = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
-       int fd, r;
-       struct inode *inode;
-       struct file *file;
-
-       r = anon_inode_getfd(&fd, &inode, &file,
-                            "kvm-vcpu", &kvm_vcpu_fops, vcpu);
-       if (r)
-               return r;
-       atomic_inc(&vcpu->kvm->filp->f_count);
-       return fd;
-}
-
-/*
- * Creates some virtual cpus.  Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
-       int r;
-       struct kvm_vcpu *vcpu;
-
-       if (!valid_vcpu(n))
-               return -EINVAL;
-
-       vcpu = kvm_x86_ops->vcpu_create(kvm, n);
-       if (IS_ERR(vcpu))
-               return PTR_ERR(vcpu);
-
-       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
-       /* We do fxsave: this must be aligned. */
-       BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
-
-       vcpu_load(vcpu);
-       r = kvm_mmu_setup(vcpu);
-       vcpu_put(vcpu);
-       if (r < 0)
-               goto free_vcpu;
-
-       mutex_lock(&kvm->lock);
-       if (kvm->vcpus[n]) {
-               r = -EEXIST;
-               mutex_unlock(&kvm->lock);
-               goto mmu_unload;
-       }
-       kvm->vcpus[n] = vcpu;
-       mutex_unlock(&kvm->lock);
-
-       /* Now it's all set up, let userspace reach it */
-       r = create_vcpu_fd(vcpu);
-       if (r < 0)
-               goto unlink;
-       return r;
-
-unlink:
-       mutex_lock(&kvm->lock);
-       kvm->vcpus[n] = NULL;
-       mutex_unlock(&kvm->lock);
-
-mmu_unload:
-       vcpu_load(vcpu);
-       kvm_mmu_unload(vcpu);
-       vcpu_put(vcpu);
-
-free_vcpu:
-       kvm_x86_ops->vcpu_free(vcpu);
-       return r;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-       u64 efer;
-       int i;
-       struct kvm_cpuid_entry *e, *entry;
-
-       rdmsrl(MSR_EFER, efer);
-       entry = NULL;
-       for (i = 0; i < vcpu->cpuid_nent; ++i) {
-               e = &vcpu->cpuid_entries[i];
-               if (e->function == 0x80000001) {
-                       entry = e;
-                       break;
-               }
-       }
-       if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
-               entry->edx &= ~(1 << 20);
-               printk(KERN_INFO "kvm: guest NX capability removed\n");
-       }
-}
-
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-                                   struct kvm_cpuid *cpuid,
-                                   struct kvm_cpuid_entry __user *entries)
-{
-       int r;
-
-       r = -E2BIG;
-       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-               goto out;
-       r = -EFAULT;
-       if (copy_from_user(&vcpu->cpuid_entries, entries,
-                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-               goto out;
-       vcpu->cpuid_nent = cpuid->nent;
-       cpuid_fix_nx_cap(vcpu);
-       return 0;
-
-out:
-       return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
-       if (sigset) {
-               sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-               vcpu->sigset_active = 1;
-               vcpu->sigset = *sigset;
-       } else
-               vcpu->sigset_active = 0;
-       return 0;
-}
-
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-       u16     cwd;
-       u16     swd;
-       u16     twd;
-       u16     fop;
-       u64     rip;
-       u64     rdp;
-       u32     mxcsr;
-       u32     mxcsr_mask;
-       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-       u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
-       vcpu_load(vcpu);
-
-       memcpy(fpu->fpr, fxsave->st_space, 128);
-       fpu->fcw = fxsave->cwd;
-       fpu->fsw = fxsave->swd;
-       fpu->ftwx = fxsave->twd;
-       fpu->last_opcode = fxsave->fop;
-       fpu->last_ip = fxsave->rip;
-       fpu->last_dp = fxsave->rdp;
-       memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
-{
-       struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
-
-       vcpu_load(vcpu);
-
-       memcpy(fxsave->st_space, fpu->fpr, 128);
-       fxsave->cwd = fpu->fcw;
-       fxsave->swd = fpu->fsw;
-       fxsave->twd = fpu->ftwx;
-       fxsave->fop = fpu->last_opcode;
-       fxsave->rip = fpu->last_ip;
-       fxsave->rdp = fpu->last_dp;
-       memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
-
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
-                                   struct kvm_lapic_state *s)
-{
-       vcpu_load(vcpu);
-       memcpy(s->regs, vcpu->apic->regs, sizeof *s);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
-                                   struct kvm_lapic_state *s)
-{
-       vcpu_load(vcpu);
-       memcpy(vcpu->apic->regs, s->regs, sizeof *s);
-       kvm_apic_post_state_restore(vcpu);
-       vcpu_put(vcpu);
-
-       return 0;
-}
-
-static long kvm_vcpu_ioctl(struct file *filp,
-                          unsigned int ioctl, unsigned long arg)
-{
-       struct kvm_vcpu *vcpu = filp->private_data;
-       void __user *argp = (void __user *)arg;
-       int r = -EINVAL;
-
-       switch (ioctl) {
-       case KVM_RUN:
-               r = -EINVAL;
-               if (arg)
-                       goto out;
-               r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
-               break;
-       case KVM_GET_REGS: {
-               struct kvm_regs kvm_regs;
-
-               memset(&kvm_regs, 0, sizeof kvm_regs);
-               r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_REGS: {
-               struct kvm_regs kvm_regs;
-
-               r = -EFAULT;
-               if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_GET_SREGS: {
-               struct kvm_sregs kvm_sregs;
-
-               memset(&kvm_sregs, 0, sizeof kvm_sregs);
-               r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_SREGS: {
-               struct kvm_sregs kvm_sregs;
-
-               r = -EFAULT;
-               if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_TRANSLATE: {
-               struct kvm_translation tr;
-
-               r = -EFAULT;
-               if (copy_from_user(&tr, argp, sizeof tr))
-                       goto out;
-               r = kvm_vcpu_ioctl_translate(vcpu, &tr);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &tr, sizeof tr))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_INTERRUPT: {
-               struct kvm_interrupt irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq, argp, sizeof irq))
-                       goto out;
-               r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_DEBUG_GUEST: {
-               struct kvm_debug_guest dbg;
-
-               r = -EFAULT;
-               if (copy_from_user(&dbg, argp, sizeof dbg))
-                       goto out;
-               r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_GET_MSRS:
-               r = msr_io(vcpu, argp, kvm_get_msr, 1);
-               break;
-       case KVM_SET_MSRS:
-               r = msr_io(vcpu, argp, do_set_msr, 0);
-               break;
-       case KVM_SET_CPUID: {
-               struct kvm_cpuid __user *cpuid_arg = argp;
-               struct kvm_cpuid cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_SET_SIGNAL_MASK: {
-               struct kvm_signal_mask __user *sigmask_arg = argp;
-               struct kvm_signal_mask kvm_sigmask;
-               sigset_t sigset, *p;
-
-               p = NULL;
-               if (argp) {
-                       r = -EFAULT;
-                       if (copy_from_user(&kvm_sigmask, argp,
-                                          sizeof kvm_sigmask))
-                               goto out;
-                       r = -EINVAL;
-                       if (kvm_sigmask.len != sizeof sigset)
-                               goto out;
-                       r = -EFAULT;
-                       if (copy_from_user(&sigset, sigmask_arg->sigset,
-                                          sizeof sigset))
-                               goto out;
-                       p = &sigset;
-               }
-               r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
-               break;
-       }
-       case KVM_GET_FPU: {
-               struct kvm_fpu fpu;
-
-               memset(&fpu, 0, sizeof fpu);
-               r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &fpu, sizeof fpu))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_FPU: {
-               struct kvm_fpu fpu;
-
-               r = -EFAULT;
-               if (copy_from_user(&fpu, argp, sizeof fpu))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_GET_LAPIC: {
-               struct kvm_lapic_state lapic;
-
-               memset(&lapic, 0, sizeof lapic);
-               r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &lapic, sizeof lapic))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_LAPIC: {
-               struct kvm_lapic_state lapic;
-
-               r = -EFAULT;
-               if (copy_from_user(&lapic, argp, sizeof lapic))
-                       goto out;
-               r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       default:
-               ;
-       }
-out:
-       return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
-                          unsigned int ioctl, unsigned long arg)
-{
-       struct kvm *kvm = filp->private_data;
-       void __user *argp = (void __user *)arg;
-       int r = -EINVAL;
-
-       switch (ioctl) {
-       case KVM_CREATE_VCPU:
-               r = kvm_vm_ioctl_create_vcpu(kvm, arg);
-               if (r < 0)
-                       goto out;
-               break;
-       case KVM_SET_MEMORY_REGION: {
-               struct kvm_memory_region kvm_mem;
-
-               r = -EFAULT;
-               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-                       goto out;
-               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_GET_DIRTY_LOG: {
-               struct kvm_dirty_log log;
-
-               r = -EFAULT;
-               if (copy_from_user(&log, argp, sizeof log))
-                       goto out;
-               r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_SET_MEMORY_ALIAS: {
-               struct kvm_memory_alias alias;
-
-               r = -EFAULT;
-               if (copy_from_user(&alias, argp, sizeof alias))
-                       goto out;
-               r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_CREATE_IRQCHIP:
-               r = -ENOMEM;
-               kvm->vpic = kvm_create_pic(kvm);
-               if (kvm->vpic) {
-                       r = kvm_ioapic_init(kvm);
-                       if (r) {
-                               kfree(kvm->vpic);
-                               kvm->vpic = NULL;
-                               goto out;
-                       }
-               }
-               else
-                       goto out;
-               break;
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               if (irqchip_in_kernel(kvm)) {
-                       mutex_lock(&kvm->lock);
-                       if (irq_event.irq < 16)
-                               kvm_pic_set_irq(pic_irqchip(kvm),
-                                       irq_event.irq,
-                                       irq_event.level);
-                       kvm_ioapic_set_irq(kvm->vioapic,
-                                       irq_event.irq,
-                                       irq_event.level);
-                       mutex_unlock(&kvm->lock);
-                       r = 0;
-               }
-               break;
-       }
-       case KVM_GET_IRQCHIP: {
-               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
-
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
-                       goto out;
-               r = -ENXIO;
-               if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
-               if (r)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(argp, &chip, sizeof chip))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_SET_IRQCHIP: {
-               /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
-
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
-                       goto out;
-               r = -ENXIO;
-               if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
-               if (r)
-                       goto out;
-               r = 0;
-               break;
-       }
-       default:
-               ;
-       }
-out:
-       return r;
-}
-
-static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
-                                 unsigned long address,
-                                 int *type)
-{
-       struct kvm *kvm = vma->vm_file->private_data;
-       unsigned long pgoff;
-       struct page *page;
-
-       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-       page = gfn_to_page(kvm, pgoff);
-       if (!page)
-               return NOPAGE_SIGBUS;
-       get_page(page);
-       if (type != NULL)
-               *type = VM_FAULT_MINOR;
-
-       return page;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
-       .nopage = kvm_vm_nopage,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       vma->vm_ops = &kvm_vm_vm_ops;
-       return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
-       .release        = kvm_vm_release,
-       .unlocked_ioctl = kvm_vm_ioctl,
-       .compat_ioctl   = kvm_vm_ioctl,
-       .mmap           = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
-       int fd, r;
-       struct inode *inode;
-       struct file *file;
-       struct kvm *kvm;
-
-       kvm = kvm_create_vm();
-       if (IS_ERR(kvm))
-               return PTR_ERR(kvm);
-       r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
-       if (r) {
-               kvm_destroy_vm(kvm);
-               return r;
-       }
-
-       kvm->filp = file;
-
-       return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
-                         unsigned int ioctl, unsigned long arg)
-{
-       void __user *argp = (void __user *)arg;
-       long r = -EINVAL;
-
-       switch (ioctl) {
-       case KVM_GET_API_VERSION:
-               r = -EINVAL;
-               if (arg)
-                       goto out;
-               r = KVM_API_VERSION;
-               break;
-       case KVM_CREATE_VM:
-               r = -EINVAL;
-               if (arg)
-                       goto out;
-               r = kvm_dev_ioctl_create_vm();
-               break;
-       case KVM_GET_MSR_INDEX_LIST: {
-               struct kvm_msr_list __user *user_msr_list = argp;
-               struct kvm_msr_list msr_list;
-               unsigned n;
-
-               r = -EFAULT;
-               if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
-                       goto out;
-               n = msr_list.nmsrs;
-               msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
-               if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
-                       goto out;
-               r = -E2BIG;
-               if (n < num_msrs_to_save)
-                       goto out;
-               r = -EFAULT;
-               if (copy_to_user(user_msr_list->indices, &msrs_to_save,
-                                num_msrs_to_save * sizeof(u32)))
-                       goto out;
-               if (copy_to_user(user_msr_list->indices
-                                + num_msrs_to_save * sizeof(u32),
-                                &emulated_msrs,
-                                ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
-                       goto out;
-               r = 0;
-               break;
-       }
-       case KVM_CHECK_EXTENSION: {
-               int ext = (long)argp;
-
-               switch (ext) {
-               case KVM_CAP_IRQCHIP:
-               case KVM_CAP_HLT:
-                       r = 1;
-                       break;
-               default:
-                       r = 0;
-                       break;
-               }
-               break;
-       }
-       case KVM_GET_VCPU_MMAP_SIZE:
-               r = -EINVAL;
-               if (arg)
-                       goto out;
-               r = 2 * PAGE_SIZE;
-               break;
-       default:
-               ;
-       }
-out:
-       return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
-       .unlocked_ioctl = kvm_dev_ioctl,
-       .compat_ioctl   = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
-       KVM_MINOR,
-       "kvm",
-       &kvm_chardev_ops,
-};
-
-/*
- * Make sure that a cpu that is being hot-unplugged does not have any vcpus
- * cached on it.
- */
-static void decache_vcpus_on_cpu(int cpu)
-{
-       struct kvm *vm;
-       struct kvm_vcpu *vcpu;
-       int i;
-
-       spin_lock(&kvm_lock);
-       list_for_each_entry(vm, &vm_list, vm_list)
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = vm->vcpus[i];
-                       if (!vcpu)
-                               continue;
-                       /*
-                        * If the vcpu is locked, then it is running on some
-                        * other cpu and therefore it is not cached on the
-                        * cpu in question.
-                        *
-                        * If it's not locked, check the last cpu it executed
-                        * on.
-                        */
-                       if (mutex_trylock(&vcpu->mutex)) {
-                               if (vcpu->cpu == cpu) {
-                                       kvm_x86_ops->vcpu_decache(vcpu);
-                                       vcpu->cpu = -1;
-                               }
-                               mutex_unlock(&vcpu->mutex);
-                       }
-               }
-       spin_unlock(&kvm_lock);
-}
+       set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+       set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+       set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+       set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
 
-static void hardware_enable(void *junk)
-{
-       int cpu = raw_smp_processor_id();
+       set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+       set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
-       if (cpu_isset(cpu, cpus_hardware_enabled))
-               return;
-       cpu_set(cpu, cpus_hardware_enabled);
-       kvm_x86_ops->hardware_enable(NULL);
+       vcpu_put(vcpu);
+
+       return 0;
 }
 
-static void hardware_disable(void *junk)
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                   struct kvm_debug_guest *dbg)
 {
-       int cpu = raw_smp_processor_id();
+       int r;
 
-       if (!cpu_isset(cpu, cpus_hardware_enabled))
-               return;
-       cpu_clear(cpu, cpus_hardware_enabled);
-       decache_vcpus_on_cpu(cpu);
-       kvm_x86_ops->hardware_disable(NULL);
-}
+       vcpu_load(vcpu);
 
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
-                          void *v)
-{
-       int cpu = (long)v;
+       r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
 
-       switch (val) {
-       case CPU_DYING:
-       case CPU_DYING_FROZEN:
-               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
-                      cpu);
-               hardware_disable(NULL);
-               break;
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
-                      cpu);
-               smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
-                      cpu);
-               smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
-               break;
-       }
-       return NOTIFY_OK;
-}
+       vcpu_put(vcpu);
 
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
-                       void *v)
-{
-       if (val == SYS_RESTART) {
-               /*
-                * Some (well, at least mine) BIOSes hang on reboot if
-                * in vmx root mode.
-                */
-               printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-               on_each_cpu(hardware_disable, NULL, 0, 1);
-       }
-       return NOTIFY_OK;
+       return r;
 }
 
-static struct notifier_block kvm_reboot_notifier = {
-       .notifier_call = kvm_reboot,
-       .priority = 0,
+/*
+ * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
+ * we have asm/x86/processor.h
+ */
+struct fxsave {
+       u16     cwd;
+       u16     swd;
+       u16     twd;
+       u16     fop;
+       u64     rip;
+       u64     rdp;
+       u32     mxcsr;
+       u32     mxcsr_mask;
+       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
+#ifdef CONFIG_X86_64
+       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
+#else
+       u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
+#endif
 };
 
-void kvm_io_bus_init(struct kvm_io_bus *bus)
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                   struct kvm_translation *tr)
 {
-       memset(bus, 0, sizeof(*bus));
+       unsigned long vaddr = tr->linear_address;
+       gpa_t gpa;
+
+       vcpu_load(vcpu);
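+       /*
+        * The gva->gpa walk reads guest page tables, presumably through the
+        * guest's userspace mapping, hence the mmap_sem around it.
+        */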
+       down_read(&current->mm->mmap_sem);
+       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+       up_read(&current->mm->mmap_sem);
+       tr->physical_address = gpa;
+       tr->valid = gpa != UNMAPPED_GVA;
+       tr->writeable = 1;
+       tr->usermode = 0;
+       vcpu_put(vcpu);
+
+       return 0;
 }
 
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
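+/*
+ * Copy FPU/SSE state between the guest fxsave image and the flat
+ * struct kvm_fpu layout used by the KVM_GET_FPU / KVM_SET_FPU ABI.
+ */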
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       int i;
+       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+
+       vcpu_load(vcpu);
+
+       memcpy(fpu->fpr, fxsave->st_space, 128);
+       fpu->fcw = fxsave->cwd;
+       fpu->fsw = fxsave->swd;
+       fpu->ftwx = fxsave->twd;
+       fpu->last_opcode = fxsave->fop;
+       fpu->last_ip = fxsave->rip;
+       fpu->last_dp = fxsave->rdp;
+       memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
-       for (i = 0; i < bus->dev_count; i++) {
-               struct kvm_io_device *pos = bus->devs[i];
+       vcpu_put(vcpu);
 
-               kvm_iodevice_destructor(pos);
-       }
+       return 0;
 }
 
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-       int i;
+       struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
 
-       for (i = 0; i < bus->dev_count; i++) {
-               struct kvm_io_device *pos = bus->devs[i];
+       vcpu_load(vcpu);
 
-               if (pos->in_range(pos, addr))
-                       return pos;
-       }
+       memcpy(fxsave->st_space, fpu->fpr, 128);
+       fxsave->cwd = fpu->fcw;
+       fxsave->swd = fpu->fsw;
+       fxsave->twd = fpu->ftwx;
+       fxsave->fop = fpu->last_opcode;
+       fxsave->rip = fpu->last_ip;
+       fxsave->rdp = fpu->last_dp;
+       memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
-       return NULL;
+       vcpu_put(vcpu);
+
+       return 0;
 }
 
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
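+/*
+ * Build the guest's initial FPU image: save the host state, reset the FPU
+ * with fpu_init(), capture that pristine state as the guest image, then
+ * restore the host state.  MXCSR is set to its reset value (0x1f80) and the
+ * register area past the MXCSR mask field is cleared.
+ */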
+void fx_init(struct kvm_vcpu *vcpu)
 {
-       BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+       unsigned after_mxcsr_mask;
 
-       bus->devs[bus->dev_count++] = dev;
-}
+       /* Initialize guest FPU by resetting ours and saving into guest's */
+       preempt_disable();
+       fx_save(&vcpu->arch.host_fx_image);
+       fpu_init();
+       fx_save(&vcpu->arch.guest_fx_image);
+       fx_restore(&vcpu->arch.host_fx_image);
+       preempt_enable();
 
-static struct notifier_block kvm_cpu_notifier = {
-       .notifier_call = kvm_cpu_hotplug,
-       .priority = 20, /* must be > scheduler priority */
-};
+       vcpu->arch.cr0 |= X86_CR0_ET;
+       after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
+       vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
+       memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
+              0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+}
+EXPORT_SYMBOL_GPL(fx_init);
 
-static u64 stat_get(void *_offset)
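+/*
+ * Lazy FPU switching: load the guest image into the hardware FPU before
+ * entering the guest (a no-op if it is already loaded or the guest is not
+ * using the FPU); kvm_put_guest_fpu() below does the reverse and counts
+ * the reload.
+ */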
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       unsigned offset = (long)_offset;
-       u64 total = 0;
-       struct kvm *kvm;
-       struct kvm_vcpu *vcpu;
-       int i;
+       if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
+               return;
 
-       spin_lock(&kvm_lock);
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (vcpu)
-                               total += *(u32 *)((void *)vcpu + offset);
-               }
-       spin_unlock(&kvm_lock);
-       return total;
+       vcpu->guest_fpu_loaded = 1;
+       fx_save(&vcpu->arch.host_fx_image);
+       fx_restore(&vcpu->arch.guest_fx_image);
 }
+EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
 
-DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
-
-static __init void kvm_init_debug(void)
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       struct kvm_stats_debugfs_item *p;
+       if (!vcpu->guest_fpu_loaded)
+               return;
 
-       debugfs_dir = debugfs_create_dir("kvm", NULL);
-       for (p = debugfs_entries; p->name; ++p)
-               p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
-                                               (void *)(long)p->offset,
-                                               &stat_fops);
+       vcpu->guest_fpu_loaded = 0;
+       fx_save(&vcpu->arch.guest_fx_image);
+       fx_restore(&vcpu->arch.host_fx_image);
+       ++vcpu->stat.fpu_reload;
 }
+EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 
-static void kvm_exit_debug(void)
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
-       struct kvm_stats_debugfs_item *p;
-
-       for (p = debugfs_entries; p->name; ++p)
-               debugfs_remove(p->dentry);
-       debugfs_remove(debugfs_dir);
+       kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+                                               unsigned int id)
 {
-       hardware_disable(NULL);
-       return 0;
+       return kvm_x86_ops->vcpu_create(kvm, id);
 }
 
-static int kvm_resume(struct sys_device *dev)
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-       hardware_enable(NULL);
+       int r;
+
+       /* We do fxsave: this must be aligned. */
+       BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+
+       vcpu_load(vcpu);
+       r = kvm_arch_vcpu_reset(vcpu);
+       if (r == 0)
+               r = kvm_mmu_setup(vcpu);
+       vcpu_put(vcpu);
+       if (r < 0)
+               goto free_vcpu;
+
        return 0;
+free_vcpu:
+       kvm_x86_ops->vcpu_free(vcpu);
+       return r;
 }
 
-static struct sysdev_class kvm_sysdev_class = {
-       .name = "kvm",
-       .suspend = kvm_suspend,
-       .resume = kvm_resume,
-};
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
 
-static struct sys_device kvm_sysdev = {
-       .id = 0,
-       .cls = &kvm_sysdev_class,
-};
+       kvm_x86_ops->vcpu_free(vcpu);
+}
 
-hpa_t bad_page_address;
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops->vcpu_reset(vcpu);
+}
 
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
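+/*
+ * Thin wrappers into the vendor module (VMX or SVM): generic code reaches
+ * the hardware enable/disable, setup and compatibility hooks through
+ * kvm_x86_ops.
+ */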
+void kvm_arch_hardware_enable(void *garbage)
 {
-       return container_of(pn, struct kvm_vcpu, preempt_notifier);
+       kvm_x86_ops->hardware_enable(garbage);
 }
 
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+void kvm_arch_hardware_disable(void *garbage)
 {
-       struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+       kvm_x86_ops->hardware_disable(garbage);
+}
 
-       kvm_x86_ops->vcpu_load(vcpu, cpu);
+int kvm_arch_hardware_setup(void)
+{
+       return kvm_x86_ops->hardware_setup();
 }
 
-static void kvm_sched_out(struct preempt_notifier *pn,
-                         struct task_struct *next)
+void kvm_arch_hardware_unsetup(void)
 {
-       struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+       kvm_x86_ops->hardware_unsetup();
+}
 
-       kvm_x86_ops->vcpu_put(vcpu);
+void kvm_arch_check_processor_compat(void *rtn)
+{
+       kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
-                 struct module *module)
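+/*
+ * First-time construction of a vcpu: pick the initial mp_state, allocate
+ * the pio bounce page, create the shadow-MMU context and, when the
+ * interrupt controllers are emulated in the kernel, the local APIC.
+ * Failures unwind in reverse order.
+ */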
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
+       struct page *page;
+       struct kvm *kvm;
        int r;
-       int cpu;
 
-       if (kvm_x86_ops) {
-               printk(KERN_ERR "kvm: already loaded the other module\n");
-               return -EEXIST;
-       }
+       BUG_ON(vcpu->kvm == NULL);
+       kvm = vcpu->kvm;
 
-       if (!ops->cpu_has_kvm_support()) {
-               printk(KERN_ERR "kvm: no hardware support\n");
-               return -EOPNOTSUPP;
-       }
-       if (ops->disabled_by_bios()) {
-               printk(KERN_ERR "kvm: disabled by bios\n");
-               return -EOPNOTSUPP;
-       }
+       vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+               vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+       else
+               vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
 
-       kvm_x86_ops = ops;
+       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page) {
+               r = -ENOMEM;
+               goto fail;
+       }
+       vcpu->arch.pio_data = page_address(page);
 
-       r = kvm_x86_ops->hardware_setup();
+       r = kvm_mmu_create(vcpu);
        if (r < 0)
-               goto out;
+               goto fail_free_pio_data;
 
-       for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu,
-                               kvm_x86_ops->check_processor_compatibility,
-                               &r, 0, 1);
+       if (irqchip_in_kernel(kvm)) {
+               r = kvm_create_lapic(vcpu);
                if (r < 0)
-                       goto out_free_0;
+                       goto fail_mmu_destroy;
        }
 
-       on_each_cpu(hardware_enable, NULL, 0, 1);
-       r = register_cpu_notifier(&kvm_cpu_notifier);
-       if (r)
-               goto out_free_1;
-       register_reboot_notifier(&kvm_reboot_notifier);
-
-       r = sysdev_class_register(&kvm_sysdev_class);
-       if (r)
-               goto out_free_2;
-
-       r = sysdev_register(&kvm_sysdev);
-       if (r)
-               goto out_free_3;
+       return 0;
 
-       /* A kmem cache lets us meet the alignment requirements of fx_save. */
-       kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
-                                          __alignof__(struct kvm_vcpu), 0, 0);
-       if (!kvm_vcpu_cache) {
-               r = -ENOMEM;
-               goto out_free_4;
-       }
+fail_mmu_destroy:
+       kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+       free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+       return r;
+}
 
-       kvm_chardev_ops.owner = module;
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+       kvm_free_lapic(vcpu);
+       kvm_mmu_destroy(vcpu);
+       free_page((unsigned long)vcpu->arch.pio_data);
+}
 
-       r = misc_register(&kvm_dev);
-       if (r) {
-               printk (KERN_ERR "kvm: misc device register failed\n");
-               goto out_free;
-       }
+struct kvm *kvm_arch_create_vm(void)
+{
+       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
 
-       kvm_preempt_ops.sched_in = kvm_sched_in;
-       kvm_preempt_ops.sched_out = kvm_sched_out;
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
 
-       return r;
+       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
-out_free:
-       kmem_cache_destroy(kvm_vcpu_cache);
-out_free_4:
-       sysdev_unregister(&kvm_sysdev);
-out_free_3:
-       sysdev_class_unregister(&kvm_sysdev_class);
-out_free_2:
-       unregister_reboot_notifier(&kvm_reboot_notifier);
-       unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_1:
-       on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_0:
-       kvm_x86_ops->hardware_unsetup();
-out:
-       kvm_x86_ops = NULL;
-       return r;
+       return kvm;
 }
 
-void kvm_exit_x86(void)
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
-       misc_deregister(&kvm_dev);
-       kmem_cache_destroy(kvm_vcpu_cache);
-       sysdev_unregister(&kvm_sysdev);
-       sysdev_class_unregister(&kvm_sysdev_class);
-       unregister_reboot_notifier(&kvm_reboot_notifier);
-       unregister_cpu_notifier(&kvm_cpu_notifier);
-       on_each_cpu(hardware_disable, NULL, 0, 1);
-       kvm_x86_ops->hardware_unsetup();
-       kvm_x86_ops = NULL;
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
 }
 
-static __init int kvm_init(void)
+static void kvm_free_vcpus(struct kvm *kvm)
 {
-       static struct page *bad_page;
-       int r;
+       unsigned int i;
 
-       r = kvm_mmu_module_init();
-       if (r)
-               goto out4;
+       /*
+        * Unpin any mmu pages first.
+        */
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               if (kvm->vcpus[i]) {
+                       kvm_arch_vcpu_free(kvm->vcpus[i]);
+                       kvm->vcpus[i] = NULL;
+               }
+       }
 
-       kvm_init_debug();
+}
 
-       kvm_init_msr_list();
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+       kfree(kvm->arch.vpic);
+       kfree(kvm->arch.vioapic);
+       kvm_free_vcpus(kvm);
+       kvm_free_physmem(kvm);
+       kfree(kvm);
+}
 
-       if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
-               r = -ENOMEM;
-               goto out;
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc)
+{
+       int npages = mem->memory_size >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+       /* To keep backward compatibility with older userspace,
+        * x86 needs to handle the !user_alloc case.
+        */
+       if (!user_alloc) {
+               if (npages && !old.rmap) {
+                       memslot->userspace_addr = do_mmap(NULL, 0,
+                                                    npages * PAGE_SIZE,
+                                                    PROT_READ | PROT_WRITE,
+                                                    MAP_SHARED | MAP_ANONYMOUS,
+                                                    0);
+
+                       if (IS_ERR((void *)memslot->userspace_addr))
+                               return PTR_ERR((void *)memslot->userspace_addr);
+               } else {
+                       if (!old.user_alloc && old.rmap) {
+                               int ret;
+
+                               ret = do_munmap(current->mm, old.userspace_addr,
+                                               old.npages * PAGE_SIZE);
+                               if (ret < 0)
+                                       printk(KERN_WARNING
+                                      "kvm_vm_ioctl_set_memory_region: "
+                                      "failed to munmap memory\n");
+                       }
+               }
+       }
+
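+       /*
+        * Unless userspace has pinned the shadow-page count, resize the MMU
+        * page pool to suit the new memory slot layout.
+        */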
+       if (!kvm->arch.n_requested_mmu_pages) {
+               unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+               kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
        }
 
-       bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
-       memset(__va(bad_page_address), 0, PAGE_SIZE);
+       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+       kvm_flush_remote_tlbs(kvm);
 
        return 0;
+}
 
-out:
-       kvm_exit_debug();
-       kvm_mmu_module_exit();
-out4:
-       return r;
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
+              || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
 }
 
-static __exit void kvm_exit(void)
+static void vcpu_kick_intr(void *info)
 {
-       kvm_exit_debug();
-       __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
-       kvm_mmu_module_exit();
+#ifdef DEBUG
+       struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
+       printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu);
+#endif
 }
 
-module_init(kvm_init)
-module_exit(kvm_exit)
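+/*
+ * Wake a vcpu sleeping on its wait queue (halted) and, if it is currently
+ * executing guest code on another cpu, send an IPI: the handler itself does
+ * nothing, the interrupt alone forces a VM exit so the new event is noticed.
+ */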
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       int ipi_pcpu = vcpu->cpu;
 
-EXPORT_SYMBOL_GPL(kvm_init_x86);
-EXPORT_SYMBOL_GPL(kvm_exit_x86);
+       if (waitqueue_active(&vcpu->wq)) {
+               wake_up_interruptible(&vcpu->wq);
+               ++vcpu->stat.halt_wakeup;
+       }
+       if (vcpu->guest_mode)
+               smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644 (file)
index 0000000..7958600
--- /dev/null
@@ -0,0 +1,1912 @@
+/******************************************************************************
+ * x86_emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ *
+ *   Avi Kivity <avi@qumranet.com>
+ *   Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <stdint.h>
+#include <public/xen.h>
+#define DPRINTF(_f, _a ...) printf(_f , ## _a)
+#else
+#include <linux/kvm_host.h>
+#define DPRINTF(x...) do {} while (0)
+#endif
+#include <linux/module.h>
+#include <asm/kvm_x86_emulate.h>
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp      (1<<0)     /* 8-bit operands. */
+/* Destination operand type. */
+#define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
+#define DstReg      (2<<1)     /* Register operand. */
+#define DstMem      (3<<1)     /* Memory operand. */
+#define DstMask     (3<<1)
+/* Source operand type. */
+#define SrcNone     (0<<3)     /* No source operand. */
+#define SrcImplicit (0<<3)     /* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<3)     /* Register operand. */
+#define SrcMem      (2<<3)     /* Memory operand. */
+#define SrcMem16    (3<<3)     /* Memory operand (16-bit). */
+#define SrcMem32    (4<<3)     /* Memory operand (32-bit). */
+#define SrcImm      (5<<3)     /* Immediate operand. */
+#define SrcImmByte  (6<<3)     /* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<3)
+/* Generic ModRM decode. */
+#define ModRM       (1<<6)
+/* Destination is only written; never read. */
+#define Mov         (1<<7)
+#define BitOp       (1<<8)
+#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
+#define String      (1<<10)     /* String instruction (rep capable) */
+#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+
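+/*
+ * Example: entry 0x00 below, ByteOp | DstMem | SrcReg | ModRM, describes
+ * ADD r/m8, r8 -- an 8-bit operation whose destination is the r/m operand
+ * and whose source is the reg field of the ModRM byte.
+ */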
+static u16 opcode_table[256] = {
+       /* 0x00 - 0x07 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x08 - 0x0F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x10 - 0x17 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x18 - 0x1F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x20 - 0x27 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       SrcImmByte, SrcImm, 0, 0,
+       /* 0x28 - 0x2F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x30 - 0x37 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x38 - 0x3F */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
+       0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x48 - 0x4F */
+       DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
+       /* 0x50 - 0x57 */
+       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+       SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
+       /* 0x58 - 0x5F */
+       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+       DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
+       /* 0x60 - 0x67 */
+       0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
+       0, 0, 0, 0,
+       /* 0x68 - 0x6F */
+       0, 0, ImplicitOps | Mov | Stack, 0,
+       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
+       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+       /* 0x70 - 0x77 */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x78 - 0x7F */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x80 - 0x87 */
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
+       /* 0x88 - 0x8F */
+       ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
+       ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+       /* 0x90 - 0x9F */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+       /* 0xA0 - 0xA7 */
+       ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
+       ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | String, ImplicitOps | String,
+       /* 0xA8 - 0xAF */
+       0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
+       ByteOp | ImplicitOps | String, ImplicitOps | String,
+       /* 0xB0 - 0xBF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xC0 - 0xC7 */
+       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+       0, ImplicitOps | Stack, 0, 0,
+       ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
+       /* 0xC8 - 0xCF */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xD0 - 0xD7 */
+       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
+       0, 0, 0, 0,
+       /* 0xD8 - 0xDF */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE0 - 0xE7 */
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE8 - 0xEF */
+       ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
+       0, 0, 0, 0,
+       /* 0xF0 - 0xF7 */
+       0, 0, 0, 0,
+       ImplicitOps, ImplicitOps,
+       ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+       /* 0xF8 - 0xFF */
+       ImplicitOps, 0, ImplicitOps, ImplicitOps,
+       0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+};
+
+static u16 twobyte_table[256] = {
+       /* 0x00 - 0x0F */
+       0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+       ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
+       /* 0x10 - 0x1F */
+       0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x20 - 0x2F */
+       ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x30 - 0x3F */
+       ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x40 - 0x47 */
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       /* 0x48 - 0x4F */
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
+       /* 0x50 - 0x5F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x60 - 0x6F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x70 - 0x7F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0x80 - 0x8F */
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+       /* 0x90 - 0x9F */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xA0 - 0xA7 */
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+       /* 0xA8 - 0xAF */
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+       /* 0xB0 - 0xB7 */
+       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
+           DstMem | SrcReg | ModRM | BitOp,
+       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+           DstReg | SrcMem16 | ModRM | Mov,
+       /* 0xB8 - 0xBF */
+       0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
+       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
+           DstReg | SrcMem16 | ModRM | Mov,
+       /* 0xC0 - 0xCF */
+       0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xD0 - 0xDF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xE0 - 0xEF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xF0 - 0xFF */
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* EFLAGS bit definitions. */
+#define EFLG_OF (1<<11)
+#define EFLG_DF (1<<10)
+#define EFLG_SF (1<<7)
+#define EFLG_ZF (1<<6)
+#define EFLG_AF (1<<4)
+#define EFLG_PF (1<<2)
+#define EFLG_CF (1<<0)
+
+/*
+ * Instruction emulation:
+ * Most instructions are emulated directly via a fragment of inline assembly
+ * code. This allows us to save/restore EFLAGS and thus very easily pick up
+ * any modified flags.
+ */
+
+#if defined(CONFIG_X86_64)
+#define _LO32 "k"              /* force 32-bit operand */
+#define _STK  "%%rsp"          /* stack pointer */
+#elif defined(__i386__)
+#define _LO32 ""               /* force 32-bit operand */
+#define _STK  "%%esp"          /* stack pointer */
+#endif
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+
+/* Before executing instruction: restore necessary bits in EFLAGS. */
+#define _PRE_EFLAGS(_sav, _msk, _tmp)                                  \
+       /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
+       "movl %"_sav",%"_LO32 _tmp"; "                                  \
+       "push %"_tmp"; "                                                \
+       "push %"_tmp"; "                                                \
+       "movl %"_msk",%"_LO32 _tmp"; "                                  \
+       "andl %"_LO32 _tmp",("_STK"); "                                 \
+       "pushf; "                                                       \
+       "notl %"_LO32 _tmp"; "                                          \
+       "andl %"_LO32 _tmp",("_STK"); "                                 \
+       "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "   \
+       "pop  %"_tmp"; "                                                \
+       "orl  %"_LO32 _tmp",("_STK"); "                                 \
+       "popf; "                                                        \
+       "pop  %"_sav"; "
+
+/* After executing instruction: write-back necessary bits in EFLAGS. */
+#define _POST_EFLAGS(_sav, _msk, _tmp) \
+       /* _sav |= EFLAGS & _msk; */            \
+       "pushf; "                               \
+       "pop  %"_tmp"; "                        \
+       "andl %"_msk",%"_LO32 _tmp"; "          \
+       "orl  %"_LO32 _tmp",%"_sav"; "
+
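+/*
+ * Together these bracket the asm fragments below: _PRE_EFLAGS seeds the
+ * host arithmetic flags from the saved guest EFLAGS before the instruction
+ * runs, and _POST_EFLAGS copies the flags it produced back into the saved
+ * value.
+ */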
+/* Raw emulation: instruction has two explicit operands. */
+#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+       do {                                                                \
+               unsigned long _tmp;                                         \
+                                                                           \
+               switch ((_dst).bytes) {                                     \
+               case 2:                                                     \
+                       __asm__ __volatile__ (                              \
+                               _PRE_EFLAGS("0", "4", "2")                  \
+                               _op"w %"_wx"3,%1; "                         \
+                               _POST_EFLAGS("0", "4", "2")                 \
+                               : "=m" (_eflags), "=m" ((_dst).val),        \
+                                 "=&r" (_tmp)                              \
+                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
+                       break;                                              \
+               case 4:                                                     \
+                       __asm__ __volatile__ (                              \
+                               _PRE_EFLAGS("0", "4", "2")                  \
+                               _op"l %"_lx"3,%1; "                         \
+                               _POST_EFLAGS("0", "4", "2")                 \
+                               : "=m" (_eflags), "=m" ((_dst).val),        \
+                                 "=&r" (_tmp)                              \
+                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
+                       break;                                              \
+               case 8:                                                     \
+                       __emulate_2op_8byte(_op, _src, _dst,                \
+                                           _eflags, _qx, _qy);             \
+                       break;                                              \
+               }                                                           \
+       } while (0)
+
+#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+       do {                                                                 \
+               unsigned long _tmp;                                          \
+               switch ((_dst).bytes) {                                      \
+               case 1:                                                      \
+                       __asm__ __volatile__ (                               \
+                               _PRE_EFLAGS("0", "4", "2")                   \
+                               _op"b %"_bx"3,%1; "                          \
+                               _POST_EFLAGS("0", "4", "2")                  \
+                               : "=m" (_eflags), "=m" ((_dst).val),         \
+                                 "=&r" (_tmp)                               \
+                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
+                       break;                                               \
+               default:                                                     \
+                       __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
+                                            _wx, _wy, _lx, _ly, _qx, _qy);  \
+                       break;                                               \
+               }                                                            \
+       } while (0)
+
+/* Source operand is byte-sized and may be restricted to just %cl. */
+#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
+       __emulate_2op(_op, _src, _dst, _eflags,                         \
+                     "b", "c", "b", "c", "b", "c", "b", "c")
+
+/* Source operand is byte, word, long or quad sized. */
+#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
+       __emulate_2op(_op, _src, _dst, _eflags,                         \
+                     "b", "q", "w", "r", _LO32, "r", "", "r")
+
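+/*
+ * Typical use (illustrative): emulate_2op_SrcV("add", c->src, c->dst,
+ * ctxt->eflags) performs the add on the host with the decoded operands and
+ * captures the resulting flags into the saved guest EFLAGS.
+ */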
+/* Source operand is word, long or quad sized. */
+#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
+       __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
+                            "w", "r", _LO32, "r", "", "r")
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+       do {                                                            \
+               unsigned long _tmp;                                     \
+                                                                       \
+               switch ((_dst).bytes) {                                 \
+               case 1:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"b %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 2:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"w %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 4:                                                 \
+                       __asm__ __volatile__ (                          \
+                               _PRE_EFLAGS("0", "3", "2")              \
+                               _op"l %1; "                             \
+                               _POST_EFLAGS("0", "3", "2")             \
+                               : "=m" (_eflags), "=m" ((_dst).val),    \
+                                 "=&r" (_tmp)                          \
+                               : "i" (EFLAGS_MASK));                   \
+                       break;                                          \
+               case 8:                                                 \
+                       __emulate_1op_8byte(_op, _dst, _eflags);        \
+                       break;                                          \
+               }                                                       \
+       } while (0)
+
+/* Emulate an instruction with quadword operands (x86/64 only). */
+#if defined(CONFIG_X86_64)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
+       do {                                                              \
+               __asm__ __volatile__ (                                    \
+                       _PRE_EFLAGS("0", "4", "2")                        \
+                       _op"q %"_qx"3,%1; "                               \
+                       _POST_EFLAGS("0", "4", "2")                       \
+                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
+       } while (0)
+
+#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
+       do {                                                              \
+               __asm__ __volatile__ (                                    \
+                       _PRE_EFLAGS("0", "3", "2")                        \
+                       _op"q %1; "                                       \
+                       _POST_EFLAGS("0", "3", "2")                       \
+                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
+                       : "i" (EFLAGS_MASK));                             \
+       } while (0)
+
+#elif defined(__i386__)
+#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
+#define __emulate_1op_8byte(_op, _dst, _eflags)
+#endif                         /* __i386__ */
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip)                                  \
+({     unsigned long _x;                                               \
+       rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));            \
+       if (rc != 0)                                                    \
+               goto done;                                              \
+       (_eip) += (_size);                                              \
+       (_type)_x;                                                      \
+})
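/*
 * Illustrative aside, not part of the patch: insn_fetch() is a GNU C
 * statement expression and relies on "rc", "ctxt", "ops" and a "done:"
 * label being in scope at the call site.  A call such as
 *
 *	c->modrm = insn_fetch(u8, 1, c->eip);
 *
 * therefore expands (roughly) to
 *
 *	c->modrm = ({ unsigned long _x;
 *		      rc = do_insn_fetch(ctxt, ops, c->eip, &_x, 1);
 *		      if (rc != 0)
 *			      goto done;
 *		      c->eip += 1;
 *		      (u8)_x; });
 */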
+
+/* Access/update address held in a register, based on addressing mode. */
+#define address_mask(reg)                                              \
+       ((c->ad_bytes == sizeof(unsigned long)) ?                       \
+               (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
+#define register_address(base, reg)                                     \
+       ((base) + address_mask(reg))
+#define register_address_increment(reg, inc)                            \
+       do {                                                            \
+               /* signed type ensures sign extension to long */        \
+               int _inc = (inc);                                       \
+               if (c->ad_bytes == sizeof(unsigned long))               \
+                       (reg) += _inc;                                  \
+               else                                                    \
+                       (reg) = ((reg) &                                \
+                                ~((1UL << (c->ad_bytes << 3)) - 1)) |  \
+                               (((reg) + _inc) &                       \
+                                ((1UL << (c->ad_bytes << 3)) - 1));    \
+       } while (0)
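/*
 * Illustrative aside, not part of the patch: with ad_bytes == 2 only the
 * low 16 bits of a register take part in address generation, so e.g. SI
 * wraps at 0xffff while the upper bits are preserved.  A self-contained
 * sketch of the same arithmetic (hypothetical helper name):
 *
 *	static unsigned long masked_inc(unsigned long reg, int inc,
 *					unsigned int ad_bytes)
 *	{
 *		unsigned long mask;
 *
 *		if (ad_bytes == sizeof(unsigned long))
 *			return reg + inc;
 *		mask = (1UL << (ad_bytes << 3)) - 1;
 *		return (reg & ~mask) | ((reg + inc) & mask);
 *	}
 *
 * masked_inc(0x1234ffffUL, 1, 2) == 0x12340000UL.
 */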
+
+#define JMP_REL(rel)                                                   \
+       do {                                                            \
+               register_address_increment(c->eip, rel);                \
+       } while (0)
+
+static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
+                             struct x86_emulate_ops *ops,
+                             unsigned long linear, u8 *dest)
+{
+       struct fetch_cache *fc = &ctxt->decode.fetch;
+       int rc;
+       int size;
+
+       if (linear < fc->start || linear >= fc->end) {
+               size = min(15UL, PAGE_SIZE - offset_in_page(linear));
+               rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
+               if (rc)
+                       return rc;
+               fc->start = linear;
+               fc->end = linear + size;
+       }
+       *dest = fc->data[linear - fc->start];
+       return 0;
+}
+
+static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
+                        struct x86_emulate_ops *ops,
+                        unsigned long eip, void *dest, unsigned size)
+{
+       int rc = 0;
+
+       eip += ctxt->cs_base;
+       while (size--) {
+               rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(u8 modrm_reg, unsigned long *regs,
+                            int highbyte_regs)
+{
+       void *p;
+
+       p = &regs[modrm_reg];
+       if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+               p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+       return p;
+}
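/*
 * Illustrative aside, not part of the patch: without a REX prefix, register
 * encodings 4-7 of a byte-sized operand name AH/CH/DH/BH, i.e. byte 1 of
 * RAX/RCX/RDX/RBX on a little-endian host.  That is why the high-byte case
 * masks the index with 3 and offsets the pointer by one byte; for example,
 * modrm_reg == 7 (BH) with highbyte_regs set yields
 *
 *	p = (unsigned char *)&regs[VCPU_REGS_RBX] + 1;
 */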
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+                          struct x86_emulate_ops *ops,
+                          void *ptr,
+                          u16 *size, unsigned long *address, int op_bytes)
+{
+       int rc;
+
+       if (op_bytes == 2)
+               op_bytes = 3;
+       *address = 0;
+       rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
+                          ctxt->vcpu);
+       if (rc)
+               return rc;
+       rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
+                          ctxt->vcpu);
+       return rc;
+}
+
+static int test_cc(unsigned int condition, unsigned int flags)
+{
+       int rc = 0;
+
+       switch ((condition & 15) >> 1) {
+       case 0: /* o */
+               rc |= (flags & EFLG_OF);
+               break;
+       case 1: /* b/c/nae */
+               rc |= (flags & EFLG_CF);
+               break;
+       case 2: /* z/e */
+               rc |= (flags & EFLG_ZF);
+               break;
+       case 3: /* be/na */
+               rc |= (flags & (EFLG_CF|EFLG_ZF));
+               break;
+       case 4: /* s */
+               rc |= (flags & EFLG_SF);
+               break;
+       case 5: /* p/pe */
+               rc |= (flags & EFLG_PF);
+               break;
+       case 7: /* le/ng */
+               rc |= (flags & EFLG_ZF);
+               /* fall through */
+       case 6: /* l/nge */
+               rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+               break;
+       }
+
+       /* Odd condition identifiers (lsb == 1) have inverted sense. */
+       return (!!rc ^ (condition & 1));
+}
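/*
 * Illustrative aside, not part of the patch: the argument is the full
 * opcode byte, whose low nibble is the x86 condition code.  Bits 3-1 pick
 * the flag expression tested by the switch above and bit 0 inverts the
 * result, so for the short Jcc opcodes:
 *
 *	test_cc(0x74, flags)   (je)   == !!(flags & EFLG_ZF)
 *	test_cc(0x75, flags)   (jne)  ==  !(flags & EFLG_ZF)
 */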
+
+static void decode_register_operand(struct operand *op,
+                                   struct decode_cache *c,
+                                   int inhibit_bytereg)
+{
+       unsigned reg = c->modrm_reg;
+       int highbyte_regs = c->rex_prefix == 0;
+
+       if (!(c->d & ModRM))
+               reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+       op->type = OP_REG;
+       if ((c->d & ByteOp) && !inhibit_bytereg) {
+               op->ptr = decode_register(reg, c->regs, highbyte_regs);
+               op->val = *(u8 *)op->ptr;
+               op->bytes = 1;
+       } else {
+               op->ptr = decode_register(reg, c->regs, 0);
+               op->bytes = c->op_bytes;
+               switch (op->bytes) {
+               case 2:
+                       op->val = *(u16 *)op->ptr;
+                       break;
+               case 4:
+                       op->val = *(u32 *)op->ptr;
+                       break;
+               case 8:
+                       op->val = *(u64 *) op->ptr;
+                       break;
+               }
+       }
+       op->orig_val = op->val;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+                       struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u8 sib;
+       int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
+       int rc = 0;
+
+       if (c->rex_prefix) {
+               c->modrm_reg = (c->rex_prefix & 4) << 1;        /* REX.R */
+               index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
+               c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
+       }
+
+       c->modrm = insn_fetch(u8, 1, c->eip);
+       c->modrm_mod |= (c->modrm & 0xc0) >> 6;
+       c->modrm_reg |= (c->modrm & 0x38) >> 3;
+       c->modrm_rm |= (c->modrm & 0x07);
+       c->modrm_ea = 0;
+       c->use_modrm_ea = 1;
+
+       if (c->modrm_mod == 3) {
+               c->modrm_val = *(unsigned long *)
+                       decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
+               return rc;
+       }
+
+       if (c->ad_bytes == 2) {
+               unsigned bx = c->regs[VCPU_REGS_RBX];
+               unsigned bp = c->regs[VCPU_REGS_RBP];
+               unsigned si = c->regs[VCPU_REGS_RSI];
+               unsigned di = c->regs[VCPU_REGS_RDI];
+
+               /* 16-bit ModR/M decode. */
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 6)
+                               c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(u16, 2, c->eip);
+                       break;
+               }
+               switch (c->modrm_rm) {
+               case 0:
+                       c->modrm_ea += bx + si;
+                       break;
+               case 1:
+                       c->modrm_ea += bx + di;
+                       break;
+               case 2:
+                       c->modrm_ea += bp + si;
+                       break;
+               case 3:
+                       c->modrm_ea += bp + di;
+                       break;
+               case 4:
+                       c->modrm_ea += si;
+                       break;
+               case 5:
+                       c->modrm_ea += di;
+                       break;
+               case 6:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += bp;
+                       break;
+               case 7:
+                       c->modrm_ea += bx;
+                       break;
+               }
+               if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
+                   (c->modrm_rm == 6 && c->modrm_mod != 0))
+                       if (!c->override_base)
+                               c->override_base = &ctxt->ss_base;
+               c->modrm_ea = (u16)c->modrm_ea;
+       } else {
+               /* 32/64-bit ModR/M decode. */
+               switch (c->modrm_rm) {
+               case 4:
+               case 12:
+                       sib = insn_fetch(u8, 1, c->eip);
+                       index_reg |= (sib >> 3) & 7;
+                       base_reg |= sib & 7;
+                       scale = sib >> 6;
+
+                       switch (base_reg) {
+                       case 5:
+                               if (c->modrm_mod != 0)
+                                       c->modrm_ea += c->regs[base_reg];
+                               else
+                                       c->modrm_ea +=
+                                               insn_fetch(s32, 4, c->eip);
+                               break;
+                       default:
+                               c->modrm_ea += c->regs[base_reg];
+                       }
+                       switch (index_reg) {
+                       case 4:
+                               break;
+                       default:
+                               c->modrm_ea += c->regs[index_reg] << scale;
+                       }
+                       break;
+               case 5:
+                       if (c->modrm_mod != 0)
+                               c->modrm_ea += c->regs[c->modrm_rm];
+                       else if (ctxt->mode == X86EMUL_MODE_PROT64)
+                               rip_relative = 1;
+                       break;
+               default:
+                       c->modrm_ea += c->regs[c->modrm_rm];
+                       break;
+               }
+               switch (c->modrm_mod) {
+               case 0:
+                       if (c->modrm_rm == 5)
+                               c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               case 1:
+                       c->modrm_ea += insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->modrm_ea += insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+       }
+       if (rip_relative) {
+               c->modrm_ea += c->eip;
+               switch (c->d & SrcMask) {
+               case SrcImmByte:
+                       c->modrm_ea += 1;
+                       break;
+               case SrcImm:
+                       if (c->d & ByteOp)
+                               c->modrm_ea += 1;
+                       else
+                               if (c->op_bytes == 8)
+                                       c->modrm_ea += 4;
+                               else
+                                       c->modrm_ea += c->op_bytes;
+               }
+       }
+done:
+       return rc;
+}
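/*
 * Illustrative aside, not part of the patch: a 16-bit ModR/M example.  For
 * mod=01 rm=010 ("[bp+si+disp8]") the decode above produces
 *
 *	c->modrm_ea = (u16)(BP + SI + (s8)disp8);
 *
 * and, because rm selects a BP-based form, the default segment is switched
 * to SS unless an explicit segment-override prefix was already seen.
 */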
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+                     struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       switch (c->ad_bytes) {
+       case 2:
+               c->modrm_ea = insn_fetch(u16, 2, c->eip);
+               break;
+       case 4:
+               c->modrm_ea = insn_fetch(u32, 4, c->eip);
+               break;
+       case 8:
+               c->modrm_ea = insn_fetch(u64, 8, c->eip);
+               break;
+       }
+done:
+       return rc;
+}
+
+int
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+       int mode = ctxt->mode;
+       int def_op_bytes, def_ad_bytes;
+
+       /* Shadow copy of register state. Committed on successful emulation. */
+
+       memset(c, 0, sizeof(struct decode_cache));
+       c->eip = ctxt->vcpu->arch.rip;
+       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+       switch (mode) {
+       case X86EMUL_MODE_REAL:
+       case X86EMUL_MODE_PROT16:
+               def_op_bytes = def_ad_bytes = 2;
+               break;
+       case X86EMUL_MODE_PROT32:
+               def_op_bytes = def_ad_bytes = 4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86EMUL_MODE_PROT64:
+               def_op_bytes = 4;
+               def_ad_bytes = 8;
+               break;
+#endif
+       default:
+               return -1;
+       }
+
+       c->op_bytes = def_op_bytes;
+       c->ad_bytes = def_ad_bytes;
+
+       /* Legacy prefixes. */
+       for (;;) {
+               switch (c->b = insn_fetch(u8, 1, c->eip)) {
+               case 0x66:      /* operand-size override */
+                       /* switch between 2/4 bytes */
+                       c->op_bytes = def_op_bytes ^ 6;
+                       break;
+               case 0x67:      /* address-size override */
+                       if (mode == X86EMUL_MODE_PROT64)
+                               /* switch between 4/8 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 12;
+                       else
+                               /* switch between 2/4 bytes */
+                               c->ad_bytes = def_ad_bytes ^ 6;
+                       break;
+               case 0x2e:      /* CS override */
+                       c->override_base = &ctxt->cs_base;
+                       break;
+               case 0x3e:      /* DS override */
+                       c->override_base = &ctxt->ds_base;
+                       break;
+               case 0x26:      /* ES override */
+                       c->override_base = &ctxt->es_base;
+                       break;
+               case 0x64:      /* FS override */
+                       c->override_base = &ctxt->fs_base;
+                       break;
+               case 0x65:      /* GS override */
+                       c->override_base = &ctxt->gs_base;
+                       break;
+               case 0x36:      /* SS override */
+                       c->override_base = &ctxt->ss_base;
+                       break;
+               case 0x40 ... 0x4f: /* REX */
+                       if (mode != X86EMUL_MODE_PROT64)
+                               goto done_prefixes;
+                       c->rex_prefix = c->b;
+                       continue;
+               case 0xf0:      /* LOCK */
+                       c->lock_prefix = 1;
+                       break;
+               case 0xf2:      /* REPNE/REPNZ */
+                       c->rep_prefix = REPNE_PREFIX;
+                       break;
+               case 0xf3:      /* REP/REPE/REPZ */
+                       c->rep_prefix = REPE_PREFIX;
+                       break;
+               default:
+                       goto done_prefixes;
+               }
+
+               /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+               c->rex_prefix = 0;
+       }
+
+done_prefixes:
+
+       /* REX prefix. */
+       if (c->rex_prefix)
+               if (c->rex_prefix & 8)
+                       c->op_bytes = 8;        /* REX.W */
+
+       /* Opcode byte(s). */
+       c->d = opcode_table[c->b];
+       if (c->d == 0) {
+               /* Two-byte opcode? */
+               if (c->b == 0x0f) {
+                       c->twobyte = 1;
+                       c->b = insn_fetch(u8, 1, c->eip);
+                       c->d = twobyte_table[c->b];
+               }
+
+               /* Unrecognised? */
+               if (c->d == 0) {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return -1;
+               }
+       }
+
+       if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
+               c->op_bytes = 8;
+
+       /* ModRM and SIB bytes. */
+       if (c->d & ModRM)
+               rc = decode_modrm(ctxt, ops);
+       else if (c->d & MemAbs)
+               rc = decode_abs(ctxt, ops);
+       if (rc)
+               goto done;
+
+       if (!c->override_base)
+               c->override_base = &ctxt->ds_base;
+       if (mode == X86EMUL_MODE_PROT64 &&
+           c->override_base != &ctxt->fs_base &&
+           c->override_base != &ctxt->gs_base)
+               c->override_base = NULL;
+
+       if (c->override_base)
+               c->modrm_ea += *c->override_base;
+
+       if (c->ad_bytes != 8)
+               c->modrm_ea = (u32)c->modrm_ea;
+       /*
+        * Decode and fetch the source operand: register, memory
+        * or immediate.
+        */
+       switch (c->d & SrcMask) {
+       case SrcNone:
+               break;
+       case SrcReg:
+               decode_register_operand(&c->src, c, 0);
+               break;
+       case SrcMem16:
+               c->src.bytes = 2;
+               goto srcmem_common;
+       case SrcMem32:
+               c->src.bytes = 4;
+               goto srcmem_common;
+       case SrcMem:
+               c->src.bytes = (c->d & ByteOp) ? 1 :
+                                                          c->op_bytes;
+               /* Don't fetch the address for invlpg: it could be unmapped. */
+               if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
+                       break;
+       srcmem_common:
+               /*
+                * For instructions with a ModR/M byte, switch to register
+                * access if Mod = 3.
+                */
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->src.type = OP_REG;
+                       break;
+               }
+               c->src.type = OP_MEM;
+               break;
+       case SrcImm:
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;
+               /* NB. Immediates are sign-extended as necessary. */
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+               break;
+       case SrcImmByte:
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = 1;
+               c->src.val = insn_fetch(s8, 1, c->eip);
+               break;
+       }
+
+       /* Decode and fetch the destination operand: register or memory. */
+       switch (c->d & DstMask) {
+       case ImplicitOps:
+               /* Special instructions do their own operand decoding. */
+               return 0;
+       case DstReg:
+               decode_register_operand(&c->dst, c,
+                        c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+               break;
+       case DstMem:
+               if ((c->d & ModRM) && c->modrm_mod == 3) {
+                       c->dst.type = OP_REG;
+                       break;
+               }
+               c->dst.type = OP_MEM;
+               break;
+       }
+
+done:
+       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+
+       c->dst.type  = OP_MEM;
+       c->dst.bytes = c->op_bytes;
+       c->dst.val = c->src.val;
+       register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
+       c->dst.ptr = (void *) register_address(ctxt->ss_base,
+                                              c->regs[VCPU_REGS_RSP]);
+}
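/*
 * Illustrative aside, not part of the patch: emulate_push() only queues the
 * store.  It pre-decrements the shadow RSP (with the usual address-size
 * masking), points dst.ptr at ss_base + RSP and leaves the actual memory
 * write to the common writeback() path.  For op_bytes == 4 and RSP == 0x1000:
 *
 *	RSP becomes 0x0ffc;
 *	dst = { .type = OP_MEM, .bytes = 4,
 *		.ptr = ss_base + 0x0ffc, .val = src.val };
 */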
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       rc = ops->read_std(register_address(ctxt->ss_base,
+                                           c->regs[VCPU_REGS_RSP]),
+                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+
+       return 0;
+}
+
+static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       switch (c->modrm_reg) {
+       case 0: /* rol */
+               emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+               break;
+       case 1: /* ror */
+               emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* rcl */
+               emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+               break;
+       case 3: /* rcr */
+               emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 4: /* sal/shl */
+       case 6: /* sal/shl */
+               emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+               break;
+       case 5: /* shr */
+               emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 7: /* sar */
+               emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+               break;
+       }
+}
+
+static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       switch (c->modrm_reg) {
+       case 0 ... 1:   /* test */
+               /*
+                * Special case in Grp3: test has an immediate
+                * source operand.
+                */
+               c->src.type = OP_IMM;
+               c->src.ptr = (unsigned long *)c->eip;
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               if (c->src.bytes == 8)
+                       c->src.bytes = 4;
+               switch (c->src.bytes) {
+               case 1:
+                       c->src.val = insn_fetch(s8, 1, c->eip);
+                       break;
+               case 2:
+                       c->src.val = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       c->src.val = insn_fetch(s32, 4, c->eip);
+                       break;
+               }
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 2: /* not */
+               c->dst.val = ~c->dst.val;
+               break;
+       case 3: /* neg */
+               emulate_1op("neg", c->dst, ctxt->eflags);
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               rc = X86EMUL_UNHANDLEABLE;
+               break;
+       }
+done:
+       return rc;
+}
+
+static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
+
+       switch (c->modrm_reg) {
+       case 0: /* inc */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 1: /* dec */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 4: /* jmp abs */
+               if (c->b == 0xff)
+                       c->eip = c->dst.val;
+               else {
+                       DPRINTF("Cannot emulate %02x\n", c->b);
+                       return X86EMUL_UNHANDLEABLE;
+               }
+               break;
+       case 6: /* push */
+
+               /* 64-bit mode: PUSH always pushes a 64-bit operand. */
+
+               if (ctxt->mode == X86EMUL_MODE_PROT64) {
+                       c->dst.bytes = 8;
+                       rc = ops->read_std((unsigned long)c->dst.ptr,
+                                          &c->dst.val, 8, ctxt->vcpu);
+                       if (rc != 0)
+                               return rc;
+               }
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->dst.bytes);
+               rc = ops->write_emulated(register_address(ctxt->ss_base,
+                                   c->regs[VCPU_REGS_RSP]), &c->dst.val,
+                                   c->dst.bytes, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               c->dst.type = OP_NONE;
+               break;
+       default:
+               DPRINTF("Cannot emulate %02x\n", c->b);
+               return X86EMUL_UNHANDLEABLE;
+       }
+       return 0;
+}
+
+static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
+                              struct x86_emulate_ops *ops,
+                              unsigned long memop)
+{
+       struct decode_cache *c = &ctxt->decode;
+       u64 old, new;
+       int rc;
+
+       rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+       if (rc != 0)
+               return rc;
+
+       if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
+           ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+
+               c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+               c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+               ctxt->eflags &= ~EFLG_ZF;
+
+       } else {
+               new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+                      (u32) c->regs[VCPU_REGS_RBX];
+
+               rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               ctxt->eflags |= EFLG_ZF;
+       }
+       return 0;
+}
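/*
 * Illustrative aside, not part of the patch: Grp9 here is cmpxchg8b m64.
 * If the quadword at memop equals EDX:EAX, ECX:EBX is written back through
 * ops->cmpxchg_emulated() and ZF is set; otherwise the old value is loaded
 * into EDX:EAX and ZF is cleared.  Routing the store through the cmpxchg
 * callback means it only lands if memory still holds the value read above.
 */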
+
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+                           struct x86_emulate_ops *ops)
+{
+       int rc;
+       struct decode_cache *c = &ctxt->decode;
+
+       switch (c->dst.type) {
+       case OP_REG:
+               /* The 4-byte case *is* correct:
+                * in 64-bit mode we zero-extend.
+                */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *)c->dst.ptr = (u8)c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *)c->dst.ptr = (u16)c->dst.val;
+                       break;
+               case 4:
+                       *c->dst.ptr = (u32)c->dst.val;
+                       break;  /* 64b: zero-ext */
+               case 8:
+                       *c->dst.ptr = c->dst.val;
+                       break;
+               }
+               break;
+       case OP_MEM:
+               if (c->lock_prefix)
+                       rc = ops->cmpxchg_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.orig_val,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               else
+                       rc = ops->write_emulated(
+                                       (unsigned long)c->dst.ptr,
+                                       &c->dst.val,
+                                       c->dst.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       return rc;
+               break;
+       case OP_NONE:
+               /* no writeback */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
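/*
 * Illustrative aside, not part of the patch: writeback() is the single
 * commit point for the decoded destination.  A LOCK-prefixed memory store
 * goes through ops->cmpxchg_emulated() against dst.orig_val, so the emulated
 * read-modify-write only completes if the location still holds the value
 * read earlier; plain stores use ops->write_emulated().
 */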
+
+int
+x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+{
+       unsigned long memop = 0;
+       u64 msr_data;
+       unsigned long saved_eip = 0;
+       struct decode_cache *c = &ctxt->decode;
+       int rc = 0;
+
+       /* Shadow copy of register state. Committed on successful emulation.
+        * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
+        * modify them.
+        */
+
+       memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+       saved_eip = c->eip;
+
+       if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
+               memop = c->modrm_ea;
+
+       if (c->rep_prefix && (c->d & String)) {
+               /* All REP prefixes have the same first termination condition */
+               if (c->regs[VCPU_REGS_RCX] == 0) {
+                       ctxt->vcpu->arch.rip = c->eip;
+                       goto done;
+               }
+               /* The second termination condition only applies for REPE
+                * and REPNE. Test if the repeat string operation prefix is
+                * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+                * corresponding termination condition according to:
+                *      - if REPE/REPZ and ZF = 0 then done
+                *      - if REPNE/REPNZ and ZF = 1 then done
+                */
+               if ((c->b == 0xa6) || (c->b == 0xa7) ||
+                               (c->b == 0xae) || (c->b == 0xaf)) {
+                       if ((c->rep_prefix == REPE_PREFIX) &&
+                               ((ctxt->eflags & EFLG_ZF) == 0)) {
+                                       ctxt->vcpu->arch.rip = c->eip;
+                                       goto done;
+                       }
+                       if ((c->rep_prefix == REPNE_PREFIX) &&
+                               ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
+                               ctxt->vcpu->arch.rip = c->eip;
+                               goto done;
+                       }
+               }
+               c->regs[VCPU_REGS_RCX]--;
+               c->eip = ctxt->vcpu->arch.rip;
+       }
+
+       if (c->src.type == OP_MEM) {
+               c->src.ptr = (unsigned long *)memop;
+               c->src.val = 0;
+               rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                       &c->src.val,
+                                       c->src.bytes,
+                                       ctxt->vcpu);
+               if (rc != 0)
+                       goto done;
+               c->src.orig_val = c->src.val;
+       }
+
+       if ((c->d & DstMask) == ImplicitOps)
+               goto special_insn;
+
+
+       if (c->dst.type == OP_MEM) {
+               c->dst.ptr = (unsigned long *)memop;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.val = 0;
+               if (c->d & BitOp) {
+                       unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+                       c->dst.ptr = (void *)c->dst.ptr +
+                                                  (c->src.val & mask) / 8;
+               }
+               if (!(c->d & Mov) &&
+                                  /* optimisation - avoid slow emulated read */
+                   ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+                                          &c->dst.val,
+                                         c->dst.bytes, ctxt->vcpu)) != 0))
+                       goto done;
+       }
+       c->dst.orig_val = c->dst.val;
+
+special_insn:
+
+       if (c->twobyte)
+               goto twobyte_insn;
+
+       switch (c->b) {
+       case 0x00 ... 0x05:
+             add:              /* add */
+               emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x08 ... 0x0d:
+             or:               /* or */
+               emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x10 ... 0x15:
+             adc:              /* adc */
+               emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x18 ... 0x1d:
+             sbb:              /* sbb */
+               emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x20 ... 0x23:
+             and:              /* and */
+               emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x24:              /* and al imm8 */
+               c->dst.type = OP_REG;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               c->dst.val = *(u8 *)c->dst.ptr;
+               c->dst.bytes = 1;
+               c->dst.orig_val = c->dst.val;
+               goto and;
+       case 0x25:              /* and ax imm16, or eax imm32 */
+               c->dst.type = OP_REG;
+               c->dst.bytes = c->op_bytes;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               if (c->op_bytes == 2)
+                       c->dst.val = *(u16 *)c->dst.ptr;
+               else
+                       c->dst.val = *(u32 *)c->dst.ptr;
+               c->dst.orig_val = c->dst.val;
+               goto and;
+       case 0x28 ... 0x2d:
+             sub:              /* sub */
+               emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x30 ... 0x35:
+             xor:              /* xor */
+               emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x38 ... 0x3d:
+             cmp:              /* cmp */
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x40 ... 0x47: /* inc r16/r32 */
+               emulate_1op("inc", c->dst, ctxt->eflags);
+               break;
+       case 0x48 ... 0x4f: /* dec r16/r32 */
+               emulate_1op("dec", c->dst, ctxt->eflags);
+               break;
+       case 0x50 ... 0x57:  /* push reg */
+               c->dst.type  = OP_MEM;
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = c->src.val;
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          -c->op_bytes);
+               c->dst.ptr = (void *) register_address(
+                       ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+               break;
+       case 0x58 ... 0x5f: /* pop reg */
+       pop_instruction:
+               if ((rc = ops->read_std(register_address(ctxt->ss_base,
+                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
+                       c->op_bytes, ctxt->vcpu)) != 0)
+                       goto done;
+
+               register_address_increment(c->regs[VCPU_REGS_RSP],
+                                          c->op_bytes);
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0x63:              /* movsxd */
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
+                       goto cannot_emulate;
+               c->dst.val = (s32) c->src.val;
+               break;
+       case 0x6a: /* push imm8 */
+               c->src.val = 0L;
+               c->src.val = insn_fetch(s8, 1, c->eip);
+               emulate_push(ctxt);
+               break;
+       case 0x6c:              /* insb */
+       case 0x6d:              /* insw/insd */
+                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+                               1,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
+                               register_address(ctxt->es_base,
+                                                c->regs[VCPU_REGS_RDI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
+                       return -1;
+               }
+               return 0;
+       case 0x6e:              /* outsb */
+       case 0x6f:              /* outsw/outsd */
+               if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
+                               0,
+                               (c->d & ByteOp) ? 1 : c->op_bytes,
+                               c->rep_prefix ?
+                               address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+                               (ctxt->eflags & EFLG_DF),
+                               register_address(c->override_base ?
+                                                       *c->override_base :
+                                                       ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                               c->rep_prefix,
+                               c->regs[VCPU_REGS_RDX]) == 0) {
+                       c->eip = saved_eip;
+                       return -1;
+               }
+               return 0;
+       case 0x70 ... 0x7f: /* jcc (short) */ {
+               int rel = insn_fetch(s8, 1, c->eip);
+
+               if (test_cc(c->b, ctxt->eflags))
+                       JMP_REL(rel);
+               break;
+       }
+       case 0x80 ... 0x83:     /* Grp1 */
+               switch (c->modrm_reg) {
+               case 0:
+                       goto add;
+               case 1:
+                       goto or;
+               case 2:
+                       goto adc;
+               case 3:
+                       goto sbb;
+               case 4:
+                       goto and;
+               case 5:
+                       goto sub;
+               case 6:
+                       goto xor;
+               case 7:
+                       goto cmp;
+               }
+               break;
+       case 0x84 ... 0x85:
+               emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0x86 ... 0x87:     /* xchg */
+               /* Write back the register source. */
+               switch (c->dst.bytes) {
+               case 1:
+                       *(u8 *) c->src.ptr = (u8) c->dst.val;
+                       break;
+               case 2:
+                       *(u16 *) c->src.ptr = (u16) c->dst.val;
+                       break;
+               case 4:
+                       *c->src.ptr = (u32) c->dst.val;
+                       break;  /* 64b reg: zero-extend */
+               case 8:
+                       *c->src.ptr = c->dst.val;
+                       break;
+               }
+               /*
+                * Write back the memory destination with implicit LOCK
+                * prefix.
+                */
+               c->dst.val = c->src.val;
+               c->lock_prefix = 1;
+               break;
+       case 0x88 ... 0x8b:     /* mov */
+               goto mov;
+       case 0x8d: /* lea r16/r32, m */
+               c->dst.val = c->modrm_val;
+               break;
+       case 0x8f:              /* pop (sole member of Grp1a) */
+               rc = emulate_grp1a(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       case 0x9c: /* pushf */
+               c->src.val =  (unsigned long) ctxt->eflags;
+               emulate_push(ctxt);
+               break;
+       case 0x9d: /* popf */
+               c->dst.ptr = (unsigned long *) &ctxt->eflags;
+               goto pop_instruction;
+       case 0xa0 ... 0xa1:     /* mov */
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               c->dst.val = c->src.val;
+               break;
+       case 0xa2 ... 0xa3:     /* mov */
+               c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
+               break;
+       case 0xa4 ... 0xa5:     /* movs */
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               if ((rc = ops->read_emulated(register_address(
+                     c->override_base ? *c->override_base :
+                                       ctxt->ds_base,
+                                       c->regs[VCPU_REGS_RSI]),
+                                       &c->dst.val,
+                                       c->dst.bytes, ctxt->vcpu)) != 0)
+                       goto done;
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xa6 ... 0xa7:     /* cmps */
+               c->src.type = OP_NONE; /* Disable writeback. */
+               c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->src.ptr = (unsigned long *)register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                  c->regs[VCPU_REGS_RSI]);
+               if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
+                                               &c->src.val,
+                                               c->src.bytes,
+                                               ctxt->vcpu)) != 0)
+                       goto done;
+
+               c->dst.type = OP_NONE; /* Disable writeback. */
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
+                                               &c->dst.val,
+                                               c->dst.bytes,
+                                               ctxt->vcpu)) != 0)
+                       goto done;
+
+               DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
+
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->src.bytes
+                                                                 : c->src.bytes);
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                                 : c->dst.bytes);
+
+               break;
+       case 0xaa ... 0xab:     /* stos */
+               c->dst.type = OP_MEM;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)register_address(
+                                                  ctxt->es_base,
+                                                  c->regs[VCPU_REGS_RDI]);
+               c->dst.val = c->regs[VCPU_REGS_RAX];
+               register_address_increment(c->regs[VCPU_REGS_RDI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xac ... 0xad:     /* lods */
+               c->dst.type = OP_REG;
+               c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+               c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               if ((rc = ops->read_emulated(register_address(
+                               c->override_base ? *c->override_base :
+                                                  ctxt->ds_base,
+                                                c->regs[VCPU_REGS_RSI]),
+                                                &c->dst.val,
+                                                c->dst.bytes,
+                                                ctxt->vcpu)) != 0)
+                       goto done;
+               register_address_increment(c->regs[VCPU_REGS_RSI],
+                                      (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
+                                                          : c->dst.bytes);
+               break;
+       case 0xae ... 0xaf:     /* scas */
+               DPRINTF("Urk! I don't handle SCAS.\n");
+               goto cannot_emulate;
+       case 0xc0 ... 0xc1:
+               emulate_grp2(ctxt);
+               break;
+       case 0xc3: /* ret */
+               c->dst.ptr = &c->eip;
+               goto pop_instruction;
+       case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
+       mov:
+               c->dst.val = c->src.val;
+               break;
+       case 0xd0 ... 0xd1:     /* Grp2 */
+               c->src.val = 1;
+               emulate_grp2(ctxt);
+               break;
+       case 0xd2 ... 0xd3:     /* Grp2 */
+               c->src.val = c->regs[VCPU_REGS_RCX];
+               emulate_grp2(ctxt);
+               break;
+       case 0xe8: /* call (near) */ {
+               long int rel;
+               switch (c->op_bytes) {
+               case 2:
+                       rel = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       rel = insn_fetch(s32, 4, c->eip);
+                       break;
+               default:
+                       DPRINTF("Call: Invalid op_bytes\n");
+                       goto cannot_emulate;
+               }
+               c->src.val = (unsigned long) c->eip;
+               JMP_REL(rel);
+               c->op_bytes = c->ad_bytes;
+               emulate_push(ctxt);
+               break;
+       }
+       case 0xe9: /* jmp rel */
+       case 0xeb: /* jmp rel short */
+               JMP_REL(c->src.val);
+               c->dst.type = OP_NONE; /* Disable writeback. */
+               break;
+       case 0xf4:              /* hlt */
+               ctxt->vcpu->arch.halt_request = 1;
+               goto done;
+       case 0xf5:      /* cmc */
+               /* complement carry flag from eflags reg */
+               ctxt->eflags ^= EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xf6 ... 0xf7:     /* Grp3 */
+               rc = emulate_grp3(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       case 0xf8: /* clc */
+               ctxt->eflags &= ~EFLG_CF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfa: /* cli */
+               ctxt->eflags &= ~X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfb: /* sti */
+               ctxt->eflags |= X86_EFLAGS_IF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfe ... 0xff:     /* Grp4/Grp5 */
+               rc = emulate_grp45(ctxt, ops);
+               if (rc != 0)
+                       goto done;
+               break;
+       }
+
+writeback:
+       rc = writeback(ctxt, ops);
+       if (rc != 0)
+               goto done;
+
+       /* Commit shadow register state. */
+       memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+       ctxt->vcpu->arch.rip = c->eip;
+
+done:
+       if (rc == X86EMUL_UNHANDLEABLE) {
+               c->eip = saved_eip;
+               return -1;
+       }
+       return 0;
+
+twobyte_insn:
+       switch (c->b) {
+       case 0x01: /* lgdt, lidt, lmsw */
+               switch (c->modrm_reg) {
+                       u16 size;
+                       unsigned long address;
+
+               case 0: /* vmcall */
+                       if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                               goto cannot_emulate;
+
+                       rc = kvm_fix_hypercall(ctxt->vcpu);
+                       if (rc)
+                               goto done;
+
+                       kvm_emulate_hypercall(ctxt->vcpu);
+                       break;
+               case 2: /* lgdt */
+                       rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                            &size, &address, c->op_bytes);
+                       if (rc)
+                               goto done;
+                       realmode_lgdt(ctxt->vcpu, size, address);
+                       break;
+               case 3: /* lidt/vmmcall */
+                       if (c->modrm_mod == 3 && c->modrm_rm == 1) {
+                               rc = kvm_fix_hypercall(ctxt->vcpu);
+                               if (rc)
+                                       goto done;
+                               kvm_emulate_hypercall(ctxt->vcpu);
+                       } else {
+                               rc = read_descriptor(ctxt, ops, c->src.ptr,
+                                                    &size, &address,
+                                                    c->op_bytes);
+                               if (rc)
+                                       goto done;
+                               realmode_lidt(ctxt->vcpu, size, address);
+                       }
+                       break;
+               case 4: /* smsw */
+                       if (c->modrm_mod != 3)
+                               goto cannot_emulate;
+                       *(u16 *)&c->regs[c->modrm_rm]
+                               = realmode_get_cr(ctxt->vcpu, 0);
+                       break;
+               case 6: /* lmsw */
+                       if (c->modrm_mod != 3)
+                               goto cannot_emulate;
+                       realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
+                                                 &ctxt->eflags);
+                       break;
+               case 7: /* invlpg*/
+                       emulate_invlpg(ctxt->vcpu, memop);
+                       break;
+               default:
+                       goto cannot_emulate;
+               }
+               /* Disable writeback. */
+               c->dst.type = OP_NONE;
+               break;
+       case 0x06:
+               emulate_clts(ctxt->vcpu);
+               c->dst.type = OP_NONE;
+               break;
+       case 0x08:              /* invd */
+       case 0x09:              /* wbinvd */
+       case 0x0d:              /* GrpP (prefetch) */
+       case 0x18:              /* Grp16 (prefetch/nop) */
+               c->dst.type = OP_NONE;
+               break;
+       case 0x20: /* mov cr, reg */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               c->regs[c->modrm_rm] =
+                               realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x21: /* mov from dr to reg */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x22: /* mov reg, cr */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               realmode_set_cr(ctxt->vcpu,
+                               c->modrm_reg, c->modrm_val, &ctxt->eflags);
+               c->dst.type = OP_NONE;
+               break;
+       case 0x23: /* mov from reg to dr */
+               if (c->modrm_mod != 3)
+                       goto cannot_emulate;
+               rc = emulator_set_dr(ctxt, c->modrm_reg,
+                                    c->regs[c->modrm_rm]);
+               if (rc)
+                       goto cannot_emulate;
+               c->dst.type = OP_NONE;  /* no writeback */
+               break;
+       case 0x30:
+               /* wrmsr */
+               msr_data = (u32)c->regs[VCPU_REGS_RAX]
+                       | ((u64)c->regs[VCPU_REGS_RDX] << 32);
+               rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
+               if (rc) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       c->eip = ctxt->vcpu->arch.rip;
+               }
+               rc = X86EMUL_CONTINUE;
+               c->dst.type = OP_NONE;
+               break;
+       case 0x32:
+               /* rdmsr */
+               rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
+               if (rc) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       c->eip = ctxt->vcpu->arch.rip;
+               } else {
+                       c->regs[VCPU_REGS_RAX] = (u32)msr_data;
+                       c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+               }
+               rc = X86EMUL_CONTINUE;
+               c->dst.type = OP_NONE;
+               break;
+       case 0x40 ... 0x4f:     /* cmov */
+               c->dst.val = c->dst.orig_val = c->src.val;
+               if (!test_cc(c->b, ctxt->eflags))
+                       c->dst.type = OP_NONE; /* no writeback */
+               break;
+       case 0x80 ... 0x8f: /* jnz rel, etc*/ {
+               long int rel;
+
+               switch (c->op_bytes) {
+               case 2:
+                       rel = insn_fetch(s16, 2, c->eip);
+                       break;
+               case 4:
+                       rel = insn_fetch(s32, 4, c->eip);
+                       break;
+               case 8:
+                       rel = insn_fetch(s64, 8, c->eip);
+                       break;
+               default:
+                       DPRINTF("jnz: Invalid op_bytes\n");
+                       goto cannot_emulate;
+               }
+               if (test_cc(c->b, ctxt->eflags))
+                       JMP_REL(rel);
+               c->dst.type = OP_NONE;
+               break;
+       }
+       case 0xa3:
+             bt:               /* bt */
+               c->dst.type = OP_NONE;
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xab:
+             bts:              /* bts */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xb0 ... 0xb1:     /* cmpxchg */
+               /*
+                * Save real source value, then compare EAX against
+                * destination.
+                */
+               c->src.orig_val = c->src.val;
+               c->src.val = c->regs[VCPU_REGS_RAX];
+               emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+               if (ctxt->eflags & EFLG_ZF) {
+                       /* Success: write back to memory. */
+                       c->dst.val = c->src.orig_val;
+               } else {
+                       /* Failure: write the value we saw to EAX. */
+                       c->dst.type = OP_REG;
+                       c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+               }
+               break;
+       case 0xb3:
+             btr:              /* btr */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xb6 ... 0xb7:     /* movzx */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
+                                                      : (u16) c->src.val;
+               break;
+       case 0xba:              /* Grp8 */
+               switch (c->modrm_reg & 3) {
+               case 0:
+                       goto bt;
+               case 1:
+                       goto bts;
+               case 2:
+                       goto btr;
+               case 3:
+                       goto btc;
+               }
+               break;
+       case 0xbb:
+             btc:              /* btc */
+               /* only subword offset */
+               c->src.val &= (c->dst.bytes << 3) - 1;
+               emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+               break;
+       case 0xbe ... 0xbf:     /* movsx */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
+                                                       (s16) c->src.val;
+               break;
+       case 0xc3:              /* movnti */
+               c->dst.bytes = c->op_bytes;
+               c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
+                                                       (u64) c->src.val;
+               break;
+       case 0xc7:              /* Grp9 (cmpxchg8b) */
+               rc = emulate_grp9(ctxt, ops, memop);
+               if (rc != 0)
+                       goto done;
+               c->dst.type = OP_NONE;
+               break;
+       }
+       goto writeback;
+
+cannot_emulate:
+       DPRINTF("Cannot emulate %02x\n", c->b);
+       c->eip = saved_eip;
+       return -1;
+}
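
For illustration, the bt/bts/btr/btc cases above all reduce the bit offset with "c->src.val &= (c->dst.bytes << 3) - 1" before running the emulated instruction, mirroring the hardware behaviour for register operands, where the bit offset is taken modulo the operand size. A minimal standalone sketch of that masking (the function and values below are illustrative, not taken from the emulator):

#include <stdint.h>
#include <stdio.h>

/* Reduce a bit-test offset modulo the operand width, mirroring
 * "c->src.val &= (c->dst.bytes << 3) - 1" above: for a 2/4/8-byte
 * destination the mask is 15/31/63. */
static uint64_t subword_bit_offset(uint64_t bit, unsigned int dst_bytes)
{
        return bit & ((uint64_t)(dst_bytes << 3) - 1);
}

int main(void)
{
        /* bt with a 32-bit destination: offset 37 really tests bit 5 */
        printf("%llu\n", (unsigned long long)subword_bit_offset(37, 4));
        return 0;
}
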
index 19626ace0f50732cc5f02eaed636478017cf2b8b..964dfa36d3679155bcbf4de5068c08ad264c867d 100644 (file)
@@ -1,6 +1,7 @@
 config LGUEST_GUEST
        bool "Lguest guest support"
        select PARAVIRT
+       depends on X86_32
        depends on !X86_PAE
        depends on !(X86_VISWS || X86_VOYAGER)
        select VIRTIO
index 92c56117eae5ed2af3cbdc37c6086d7a7dd25120..5afdde4895dcefe823e0e500df57d977a7ffa350 100644 (file)
@@ -67,6 +67,7 @@
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/i387.h>
+#include <asm/reboot.h>                /* for struct machine_ops */
 
 /*G:010 Welcome to the Guest!
  *
@@ -175,8 +176,8 @@ static void lguest_leave_lazy_mode(void)
  * check there when it wants to deliver an interrupt.
  */
 
-/* save_flags() is expected to return the processor state (ie. "eflags").  The
- * eflags word contains all kind of stuff, but in practice Linux only cares
+/* save_flags() is expected to return the processor state (ie. "flags").  The
+ * flags word contains all kind of stuff, but in practice Linux only cares
  * about the interrupt flag.  Our "save_flags()" just returns that. */
 static unsigned long save_fl(void)
 {
@@ -217,19 +218,20 @@ static void irq_enable(void)
  * address of the handler, and... well, who cares?  The Guest just asks the
  * Host to make the change anyway, because the Host controls the real IDT.
  */
-static void lguest_write_idt_entry(struct desc_struct *dt,
-                                  int entrynum, u32 low, u32 high)
+static void lguest_write_idt_entry(gate_desc *dt,
+                                  int entrynum, const gate_desc *g)
 {
+       u32 *desc = (u32 *)g;
        /* Keep the local copy up to date. */
-       write_dt_entry(dt, entrynum, low, high);
+       native_write_idt_entry(dt, entrynum, g);
        /* Tell Host about this new entry. */
-       hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+       hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]);
 }
 
 /* Changing to a different IDT is very rare: we keep the IDT up-to-date every
  * time it is written, so we can simply loop through all entries and tell the
  * Host about them. */
-static void lguest_load_idt(const struct Xgt_desc_struct *desc)
+static void lguest_load_idt(const struct desc_ptr *desc)
 {
        unsigned int i;
        struct desc_struct *idt = (void *)desc->address;
@@ -252,7 +254,7 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc)
  * hypercall and use that repeatedly to load a new IDT.  I don't think it
  * really matters, but wouldn't it be nice if they were the same?
  */
-static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
+static void lguest_load_gdt(const struct desc_ptr *desc)
 {
        BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
        hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
@@ -261,10 +263,10 @@ static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
 /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
  * then tell the Host to reload the entire thing.  This operation is so rare
  * that this naive implementation is reasonable. */
-static void lguest_write_gdt_entry(struct desc_struct *dt,
-                                  int entrynum, u32 low, u32 high)
+static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
+                                  const void *desc, int type)
 {
-       write_dt_entry(dt, entrynum, low, high);
+       native_write_gdt_entry(dt, entrynum, desc, type);
        hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
 }
 
@@ -323,30 +325,30 @@ static void lguest_load_tr_desc(void)
  * anyone (including userspace) can just use the raw "cpuid" instruction and
  * the Host won't even notice since it isn't privileged.  So we try not to get
  * too worked up about it. */
-static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
-                        unsigned int *ecx, unsigned int *edx)
+static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
+                        unsigned int *cx, unsigned int *dx)
 {
-       int function = *eax;
+       int function = *ax;
 
-       native_cpuid(eax, ebx, ecx, edx);
+       native_cpuid(ax, bx, cx, dx);
        switch (function) {
        case 1: /* Basic feature request. */
                /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
-               *ecx &= 0x00002201;
+               *cx &= 0x00002201;
                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
-               *edx &= 0x07808101;
+               *dx &= 0x07808101;
                /* The Host can do a nice optimization if it knows that the
                 * kernel mappings (addresses above 0xC0000000 or whatever
                 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
                 * flush_tlb_user() for both user and kernel mappings unless
                 * the Page Global Enable (PGE) feature bit is set. */
-               *edx |= 0x00002000;
+               *dx |= 0x00002000;
                break;
        case 0x80000000:
                /* Futureproof this a little: if they ask how much extended
                 * processor information there is, limit it to known fields. */
-               if (*eax > 0x80000008)
-                       *eax = 0x80000008;
+               if (*ax > 0x80000008)
+                       *ax = 0x80000008;
                break;
        }
 }
@@ -755,10 +757,10 @@ static void lguest_time_init(void)
  * segment), the privilege level (we're privilege level 1, the Host is 0 and
  * will not tolerate us trying to use that), the stack pointer, and the number
  * of pages in the stack. */
-static void lguest_load_esp0(struct tss_struct *tss,
+static void lguest_load_sp0(struct tss_struct *tss,
                                     struct thread_struct *thread)
 {
-       lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+       lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
                   THREAD_SIZE/PAGE_SIZE);
 }
 
@@ -788,11 +790,11 @@ static void lguest_wbinvd(void)
  * code qualifies for Advanced.  It will also never interrupt anything.  It
  * does, however, allow us to get through the Linux boot code. */
 #ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(unsigned long reg, unsigned long v)
+static void lguest_apic_write(unsigned long reg, u32 v)
 {
 }
 
-static unsigned long lguest_apic_read(unsigned long reg)
+static u32 lguest_apic_read(unsigned long reg)
 {
        return 0;
 }
@@ -812,7 +814,7 @@ static void lguest_safe_halt(void)
  * rather than virtual addresses, so we use __pa() here. */
 static void lguest_power_off(void)
 {
-       hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+       hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
 }
 
 /*
@@ -822,7 +824,7 @@ static void lguest_power_off(void)
  */
 static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
 {
-       hcall(LHCALL_CRASH, __pa(p), 0, 0);
+       hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0);
        /* The hcall won't return, but to keep gcc happy, we're "done". */
        return NOTIFY_DONE;
 }
@@ -926,6 +928,11 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
        return insn_len;
 }
 
+static void lguest_restart(char *reason)
+{
+       hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+}
+
 /*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
  * structures in the kernel provide points for (almost) every routine we have
  * to override to avoid privileged instructions. */
@@ -957,7 +964,7 @@ __init void lguest_init(void)
        pv_cpu_ops.cpuid = lguest_cpuid;
        pv_cpu_ops.load_idt = lguest_load_idt;
        pv_cpu_ops.iret = lguest_iret;
-       pv_cpu_ops.load_esp0 = lguest_load_esp0;
+       pv_cpu_ops.load_sp0 = lguest_load_sp0;
        pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
        pv_cpu_ops.set_ldt = lguest_set_ldt;
        pv_cpu_ops.load_tls = lguest_load_tls;
@@ -1059,6 +1066,7 @@ __init void lguest_init(void)
         * the Guest routine to power off. */
        pm_power_off = lguest_power_off;
 
+       machine_ops.restart = lguest_restart;
        /* Now we're set up, call start_kernel() in init/main.c and we proceed
         * to boot as normal.  It never returns. */
        start_kernel();
index 329da276c6f1839d8bdb75ab4448c0d61a6af2f0..4876182daf8a5d1af3bf11516fa6ab2ec0d15283 100644 (file)
@@ -1,5 +1,27 @@
+#
+# Makefile for x86 specific library files.
+#
+
+obj-$(CONFIG_SMP) := msr-on-cpu.o
+
+lib-y := delay_$(BITS).o
+lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
+lib-y += memcpy_$(BITS).o
+
 ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/lib/Makefile_32
+        lib-y += checksum_32.o
+        lib-y += strstr_32.o
+        lib-y += bitops_32.o semaphore_32.o string_32.o
+
+        lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
-include ${srctree}/arch/x86/lib/Makefile_64
+        obj-y += io_64.o iomap_copy_64.o
+
+        CFLAGS_csum-partial_64.o := -funroll-loops
+
+        lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+        lib-y += thunk_64.o clear_page_64.o copy_page_64.o
+        lib-y += bitstr_64.o bitops_64.o
+        lib-y += memmove_64.o memset_64.o
+        lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
deleted file mode 100644 (file)
index 98d1f1e..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for i386-specific library files..
-#
-
-
-lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
-       bitops_32.o semaphore_32.o string_32.o
-
-lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
-
-obj-$(CONFIG_SMP)      += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
deleted file mode 100644 (file)
index bbabad3..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Makefile for x86_64-specific library files.
-#
-
-CFLAGS_csum-partial_64.o := -funroll-loops
-
-obj-y := io_64.o iomap_copy_64.o
-obj-$(CONFIG_SMP)      += msr-on-cpu.o
-
-lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
-       usercopy_64.o getuser_64.o putuser_64.o  \
-       thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
index 8ac51b82a632ca61a4e112a66301816cb0daf2d5..37756b6fb32992c5d703b5ccacf9fd02661213a1 100644 (file)
@@ -34,8 +34,8 @@ void *memmove(void *dest, const void *src, size_t n)
                        "cld"
                        : "=&c" (d0), "=&S" (d1), "=&D" (d2)
                        :"0" (n),
-                        "1" (n-1+(const char *)src),
-                        "2" (n-1+(char *)dest)
+                        "1" (n-1+src),
+                        "2" (n-1+dest)
                        :"memory");
        }
        return dest;
index 751ebae8ec4251bf74211bbfd937bd8c5a2b09df..80175e47b1902e4a3b5015599a5cc102e3922008 100644 (file)
@@ -11,8 +11,8 @@ void *memmove(void * dest,const void *src,size_t count)
        if (dest < src) { 
                return memcpy(dest,src,count);
        } else {
-               char *p = (char *) dest + count;
-               char *s = (char *) src + count;
+               char *p = dest + count;
+               const char *s = src + count;
                while (count--)
                        *--p = *--s;
        }
index 444fba4009837d45971d6038e6c50851202856b0..3899bd37fdf0b445cfca6b4f747950b371cebe82 100644 (file)
@@ -29,7 +29,7 @@
  * registers (%eax, %edx and %ecx) except %eax which is either a return
  * value or just clobbered.
  */
-       .section .sched.text
+       .section .sched.text, "ax"
 ENTRY(__down_failed)
        CFI_STARTPROC
        FRAME
@@ -49,7 +49,7 @@ ENTRY(__down_failed)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__down_failed)
+       ENDPROC(__down_failed)
 
 ENTRY(__down_failed_interruptible)
        CFI_STARTPROC
@@ -70,7 +70,7 @@ ENTRY(__down_failed_interruptible)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__down_failed_interruptible)
+       ENDPROC(__down_failed_interruptible)
 
 ENTRY(__down_failed_trylock)
        CFI_STARTPROC
@@ -91,7 +91,7 @@ ENTRY(__down_failed_trylock)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__down_failed_trylock)
+       ENDPROC(__down_failed_trylock)
 
 ENTRY(__up_wakeup)
        CFI_STARTPROC
@@ -112,7 +112,7 @@ ENTRY(__up_wakeup)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__up_wakeup)
+       ENDPROC(__up_wakeup)
 
 /*
  * rw spinlock fallbacks
@@ -132,7 +132,7 @@ ENTRY(__write_lock_failed)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__write_lock_failed)
+       ENDPROC(__write_lock_failed)
 
 ENTRY(__read_lock_failed)
        CFI_STARTPROC
@@ -148,7 +148,7 @@ ENTRY(__read_lock_failed)
        ENDFRAME
        ret
        CFI_ENDPROC
-       END(__read_lock_failed)
+       ENDPROC(__read_lock_failed)
 
 #endif
 
@@ -170,7 +170,7 @@ ENTRY(call_rwsem_down_read_failed)
        CFI_ADJUST_CFA_OFFSET -4
        ret
        CFI_ENDPROC
-       END(call_rwsem_down_read_failed)
+       ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
        CFI_STARTPROC
@@ -182,7 +182,7 @@ ENTRY(call_rwsem_down_write_failed)
        CFI_ADJUST_CFA_OFFSET -4
        ret
        CFI_ENDPROC
-       END(call_rwsem_down_write_failed)
+       ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
        CFI_STARTPROC
@@ -196,7 +196,7 @@ ENTRY(call_rwsem_wake)
        CFI_ADJUST_CFA_OFFSET -4
 1:     ret
        CFI_ENDPROC
-       END(call_rwsem_wake)
+       ENDPROC(call_rwsem_wake)
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
@@ -214,6 +214,6 @@ ENTRY(call_rwsem_downgrade_wake)
        CFI_ADJUST_CFA_OFFSET -4
        ret
        CFI_ENDPROC
-       END(call_rwsem_downgrade_wake)
+       ENDPROC(call_rwsem_downgrade_wake)
 
 #endif
index 6ea73f3de5677320c835234b70613413e746db35..8b92d428ab028833e7f21725302e9303347df73a 100644 (file)
@@ -33,7 +33,7 @@
        .endm
        
 
-       .section .sched.text
+       .section .sched.text, "ax"
 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
        thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
        thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
new file mode 100644 (file)
index 0000000..1faac81
--- /dev/null
@@ -0,0 +1,5 @@
+#
+# Makefile for the RDC321x specific parts of the kernel
+#
+obj-$(CONFIG_X86_RDC321X)        := gpio.o platform.o wdt.o
+
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
new file mode 100644 (file)
index 0000000..0312691
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org>
+ *     RDC321x architecture specific GPIO support
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ */
+
+#include <linux/autoconf.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+
+#include <asm/mach-rdc321x/rdc321x_defs.h>
+
+static inline int rdc_gpio_is_valid(unsigned gpio)
+{
+       return (gpio <= RDC_MAX_GPIO);
+}
+
+static unsigned int rdc_gpio_read(unsigned gpio)
+{
+       unsigned int val;
+
+       val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48));
+       outl(val, RDC3210_CFGREG_ADDR);
+       udelay(10);
+       val = inl(RDC3210_CFGREG_DATA);
+       val |= (0x1 << (gpio & 0x1F));
+       outl(val, RDC3210_CFGREG_DATA);
+       udelay(10);
+       val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C));
+       outl(val, RDC3210_CFGREG_ADDR);
+       udelay(10);
+       val = inl(RDC3210_CFGREG_DATA);
+
+       return val;
+}
+
+static void rdc_gpio_write(unsigned int val)
+{
+       if (val) {
+               outl(val, RDC3210_CFGREG_DATA);
+               udelay(10);
+       }
+}
+
+int rdc_gpio_get_value(unsigned gpio)
+{
+       if (rdc_gpio_is_valid(gpio))
+               return (int)rdc_gpio_read(gpio);
+       else
+               return -EINVAL;
+}
+EXPORT_SYMBOL(rdc_gpio_get_value);
+
+void rdc_gpio_set_value(unsigned gpio, int value)
+{
+       unsigned int val;
+
+       if (!rdc_gpio_is_valid(gpio))
+               return;
+
+       val = rdc_gpio_read(gpio);
+
+       if (value)
+               val &= ~(0x1 << (gpio & 0x1F));
+       else
+               val |= (0x1 << (gpio & 0x1F));
+
+       rdc_gpio_write(val);
+}
+EXPORT_SYMBOL(rdc_gpio_set_value);
+
+int rdc_gpio_direction_input(unsigned gpio)
+{
+       return 0;
+}
+EXPORT_SYMBOL(rdc_gpio_direction_input);
+
+int rdc_gpio_direction_output(unsigned gpio, int value)
+{
+       return 0;
+}
+EXPORT_SYMBOL(rdc_gpio_direction_output);
+
+
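The exported rdc_gpio_* helpers above are the board's ad-hoc GPIO accessors (rdc_gpio_direction_output(), rdc_gpio_set_value(), and so on). A minimal sketch of a caller, assuming GPIO 1 as used by the "rdc:dmz" LED in platform.c below; the function name here is hypothetical, not part of this file:

#include <asm/gpio.h>

/* Hypothetical caller, for illustration only. */
static void rdc321x_pulse_dmz_led(void)
{
        unsigned int led_gpio = 1;      /* the "rdc:dmz" LED line below */

        if (rdc_gpio_direction_output(led_gpio, 0))
                return;
        rdc_gpio_set_value(led_gpio, 1);
        rdc_gpio_set_value(led_gpio, 0);
}
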
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
new file mode 100644 (file)
index 0000000..dda6024
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ *  Generic RDC321x platform devices
+ *
+ *  Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the
+ *  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ *  Boston, MA  02110-1301, USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/version.h>
+#include <linux/leds.h>
+
+#include <asm/gpio.h>
+
+/* LEDS */
+static struct gpio_led default_leds[] = {
+       { .name = "rdc:dmz", .gpio = 1, },
+};
+
+static struct gpio_led_platform_data rdc321x_led_data = {
+       .num_leds = ARRAY_SIZE(default_leds),
+       .leds = default_leds,
+};
+
+static struct platform_device rdc321x_leds = {
+       .name = "leds-gpio",
+       .id = -1,
+       .dev = {
+               .platform_data = &rdc321x_led_data,
+       }
+};
+
+/* Watchdog */
+static struct platform_device rdc321x_wdt = {
+       .name = "rdc321x-wdt",
+       .id = -1,
+       .num_resources = 0,
+};
+
+static struct platform_device *rdc321x_devs[] = {
+       &rdc321x_leds,
+       &rdc321x_wdt
+};
+
+static int __init rdc_board_setup(void)
+{
+       return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
+}
+
+arch_initcall(rdc_board_setup);
diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c
new file mode 100644 (file)
index 0000000..ec5625a
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * RDC321x watchdog driver
+ *
+ * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
+ *
+ * This driver is highly inspired by the cpu5_wdt driver
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/timer.h>
+#include <linux/completion.h>
+#include <linux/jiffies.h>
+#include <linux/platform_device.h>
+#include <linux/watchdog.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+
+#include <asm/mach-rdc321x/rdc321x_defs.h>
+
+#define RDC_WDT_MASK   0x80000000 /* Mask */
+#define RDC_WDT_EN     0x00800000 /* Enable bit */
+#define RDC_WDT_WTI    0x00200000 /* Generate CPU reset/NMI/WDT on timeout */
+#define RDC_WDT_RST    0x00100000 /* Reset bit */
+#define RDC_WDT_WIF    0x00040000 /* WDT IRQ Flag */
+#define RDC_WDT_IRT    0x00000100 /* IRQ Routing table */
+#define RDC_WDT_CNT    0x00000001 /* WDT count */
+
+#define RDC_CLS_TMR    0x80003844 /* Clear timer */
+
+#define RDC_WDT_INTERVAL       (HZ/10+1)
+
+int nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, int, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+static int ticks = 1000;
+
+/* some device data */
+
+static struct {
+       struct completion stop;
+       volatile int running;
+       struct timer_list timer;
+       volatile int queue;
+       int default_ticks;
+       unsigned long inuse;
+} rdc321x_wdt_device;
+
+/* generic helper functions */
+
+static void rdc321x_wdt_trigger(unsigned long unused)
+{
+       if (rdc321x_wdt_device.running)
+               ticks--;
+
+       /* keep watchdog alive */
+       outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA);
+
+       /* requeue?? */
+       if (rdc321x_wdt_device.queue && ticks)
+               mod_timer(&rdc321x_wdt_device.timer,
+                               jiffies + RDC_WDT_INTERVAL);
+       else {
+               /* ticks doesn't matter anyway */
+               complete(&rdc321x_wdt_device.stop);
+       }
+
+}
+
+static void rdc321x_wdt_reset(void)
+{
+       ticks = rdc321x_wdt_device.default_ticks;
+}
+
+static void rdc321x_wdt_start(void)
+{
+       if (!rdc321x_wdt_device.queue) {
+               rdc321x_wdt_device.queue = 1;
+
+               /* Clear the timer */
+               outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR);
+
+               /* Enable watchdog and set the timeout to 81.92 us */
+               outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA);
+
+               mod_timer(&rdc321x_wdt_device.timer,
+                               jiffies + RDC_WDT_INTERVAL);
+       }
+
+       /* if process dies, counter is not decremented */
+       rdc321x_wdt_device.running++;
+}
+
+static int rdc321x_wdt_stop(void)
+{
+       if (rdc321x_wdt_device.running)
+               rdc321x_wdt_device.running = 0;
+
+       ticks = rdc321x_wdt_device.default_ticks;
+
+       return -EIO;
+}
+
+/* filesystem operations */
+
+static int rdc321x_wdt_open(struct inode *inode, struct file *file)
+{
+       if (test_and_set_bit(0, &rdc321x_wdt_device.inuse))
+               return -EBUSY;
+
+       return nonseekable_open(inode, file);
+}
+
+static int rdc321x_wdt_release(struct inode *inode, struct file *file)
+{
+       clear_bit(0, &rdc321x_wdt_device.inuse);
+       return 0;
+}
+
+static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file,
+                               unsigned int cmd, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       unsigned int value;
+       static struct watchdog_info ident = {
+               .options = WDIOF_CARDRESET,
+               .identity = "RDC321x WDT",
+       };
+
+       switch (cmd) {
+       case WDIOC_KEEPALIVE:
+               rdc321x_wdt_reset();
+               break;
+       case WDIOC_GETSTATUS:
+               /* Read the value from the DATA register */
+               value = inl(RDC3210_CFGREG_DATA);
+               if (copy_to_user(argp, &value, sizeof(int)))
+                       return -EFAULT;
+               break;
+       case WDIOC_GETSUPPORT:
+               if (copy_to_user(argp, &ident, sizeof(ident)))
+                       return -EFAULT;
+               break;
+       case WDIOC_SETOPTIONS:
+               if (copy_from_user(&value, argp, sizeof(int)))
+                       return -EFAULT;
+               switch (value) {
+               case WDIOS_ENABLECARD:
+                       rdc321x_wdt_start();
+                       break;
+               case WDIOS_DISABLECARD:
+                       return rdc321x_wdt_stop();
+               default:
+                       return -EINVAL;
+               }
+               break;
+       default:
+               return -ENOTTY;
+       }
+       return 0;
+}
+
+static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       if (!count)
+               return -EIO;
+
+       rdc321x_wdt_reset();
+
+       return count;
+}
+
+static const struct file_operations rdc321x_wdt_fops = {
+       .owner          = THIS_MODULE,
+       .llseek         = no_llseek,
+       .ioctl          = rdc321x_wdt_ioctl,
+       .open           = rdc321x_wdt_open,
+       .write          = rdc321x_wdt_write,
+       .release        = rdc321x_wdt_release,
+};
+
+static struct miscdevice rdc321x_wdt_misc = {
+       .minor  = WATCHDOG_MINOR,
+       .name   = "watchdog",
+       .fops   = &rdc321x_wdt_fops,
+};
+
+static int __devinit rdc321x_wdt_probe(struct platform_device *pdev)
+{
+       int err;
+
+       err = misc_register(&rdc321x_wdt_misc);
+       if (err < 0) {
+               printk(KERN_ERR PFX "watchdog misc_register failed\n");
+               return err;
+       }
+
+       /* Reset the watchdog */
+       outl(RDC_WDT_RST, RDC3210_CFGREG_DATA);
+
+       init_completion(&rdc321x_wdt_device.stop);
+       rdc321x_wdt_device.queue = 0;
+
+       clear_bit(0, &rdc321x_wdt_device.inuse);
+
+       setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
+
+       rdc321x_wdt_device.default_ticks = ticks;
+
+       printk(KERN_INFO PFX "watchdog init success\n");
+
+       return 0;
+}
+
+static int rdc321x_wdt_remove(struct platform_device *pdev)
+{
+       if (rdc321x_wdt_device.queue) {
+               rdc321x_wdt_device.queue = 0;
+               wait_for_completion(&rdc321x_wdt_device.stop);
+       }
+
+       misc_deregister(&rdc321x_wdt_misc);
+
+       return 0;
+}
+
+static struct platform_driver rdc321x_wdt_driver = {
+       .probe = rdc321x_wdt_probe,
+       .remove = rdc321x_wdt_remove,
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "rdc321x-wdt",
+       },
+};
+
+static int __init rdc321x_wdt_init(void)
+{
+       return platform_driver_register(&rdc321x_wdt_driver);
+}
+
+static void __exit rdc321x_wdt_exit(void)
+{
+       platform_driver_unregister(&rdc321x_wdt_driver);
+}
+
+module_init(rdc321x_wdt_init);
+module_exit(rdc321x_wdt_exit);
+
+MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
+MODULE_DESCRIPTION("RDC321x watchdog driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
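
The driver above exposes the conventional misc-device watchdog interface: any non-empty write() is a keepalive, and WDIOC_SETOPTIONS / WDIOC_KEEPALIVE are handled in rdc321x_wdt_ioctl(). A minimal userspace sketch of feeding it, assuming the usual /dev/watchdog node for WATCHDOG_MINOR (this program is an illustration, not part of the driver):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
        int options = WDIOS_ENABLECARD;
        int fd = open("/dev/watchdog", O_WRONLY);

        if (fd < 0)
                return 1;
        /* Start the watchdog, then ping it once via ioctl and once via write(). */
        ioctl(fd, WDIOC_SETOPTIONS, &options);
        ioctl(fd, WDIOC_KEEPALIVE, 0);
        write(fd, "k", 1);
        close(fd);
        return 0;
}
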
index f3c74fab8b95d18b32878fe966dbbaef20e77481..2a8456a1f44fc5896fe7de8ed25aa106d8bead68 100644 (file)
@@ -36,19 +36,19 @@ unsigned int __initdata maxcpus = NR_CPUS;
 
 static void __init MP_processor_info (struct mpc_config_processor *m)
 {
-       int ver, logical_apicid;
+       int ver, logical_apicid;
        physid_mask_t apic_cpus;
-       
+
        if (!(m->mpc_cpuflag & CPU_ENABLED))
                return;
 
        logical_apicid = m->mpc_apicid;
-       printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n",
-               m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
-               m->mpc_apicid,
-               (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-               (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-               m->mpc_apicver);
+       printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
+              m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver);
 
        if (m->mpc_cpuflag & CPU_BOOTPROCESSOR)
                boot_cpu_physical_apicid = m->mpc_apicid;
index 3bef977cb29b25d028db4caf061ea020d4ecd080..5ae5466b9eb9c3b9850fca6617e36dab2c653da7 100644 (file)
@@ -37,14 +37,14 @@ void __init pre_setup_arch_hook(void)
 {
        /* Voyagers run their CPUs from independent clocks, so disable
         * the TSC code because we can't sync them */
-       tsc_disable = 1;
+       setup_clear_cpu_cap(X86_FEATURE_TSC);
 }
 
 void __init trap_init_hook(void)
 {
 }
 
-static struct irqaction irq0  = {
+static struct irqaction irq0 = {
        .handler = timer_interrupt,
        .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
        .mask = CPU_MASK_NONE,
@@ -59,44 +59,47 @@ void __init time_init_hook(void)
 
 /* Hook for machine specific memory setup. */
 
-char * __init machine_specific_memory_setup(void)
+char *__init machine_specific_memory_setup(void)
 {
        char *who;
 
        who = "NOT VOYAGER";
 
-       if(voyager_level == 5) {
+       if (voyager_level == 5) {
                __u32 addr, length;
                int i;
 
                who = "Voyager-SUS";
 
                e820.nr_map = 0;
-               for(i=0; voyager_memory_detect(i, &addr, &length); i++) {
+               for (i = 0; voyager_memory_detect(i, &addr, &length); i++) {
                        add_memory_region(addr, length, E820_RAM);
                }
                return who;
-       } else if(voyager_level == 4) {
+       } else if (voyager_level == 4) {
                __u32 tom;
-               __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
+               __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
                /* select the DINO config space */
                outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT);
                /* Read DINO top of memory register */
                tom = ((inb(catbase + 0x4) & 0xf0) << 16)
-                       + ((inb(catbase + 0x5) & 0x7f) << 24);
+                   + ((inb(catbase + 0x5) & 0x7f) << 24);
 
-               if(inb(catbase) != VOYAGER_DINO) {
-                       printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
-                       tom = (boot_params.screen_info.ext_mem_k)<<10;
+               if (inb(catbase) != VOYAGER_DINO) {
+                       printk(KERN_ERR
+                              "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n");
+                       tom = (boot_params.screen_info.ext_mem_k) << 10;
                }
                who = "Voyager-TOM";
                add_memory_region(0, 0x9f000, E820_RAM);
                /* map from 1M to top of memory */
-               add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM);
+               add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024,
+                                 E820_RAM);
                /* FIXME: Should check the ASICs to see if I need to
                 * take out the 8M window.  Just do it at the moment
                 * */
-               add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED);
+               add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024,
+                                 E820_RESERVED);
                return who;
        }
 
@@ -114,8 +117,7 @@ char * __init machine_specific_memory_setup(void)
                unsigned long mem_size;
 
                /* compare results from other methods and take the greater */
-               if (boot_params.alt_mem_k
-                   < boot_params.screen_info.ext_mem_k) {
+               if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
                        mem_size = boot_params.screen_info.ext_mem_k;
                        who = "BIOS-88";
                } else {
@@ -126,6 +128,6 @@ char * __init machine_specific_memory_setup(void)
                e820.nr_map = 0;
                add_memory_region(0, LOWMEMSIZE(), E820_RAM);
                add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
-       }
+       }
        return who;
 }
index 9b77b39b71a6dde7a2f4e17aa39af680a13e27ff..6a949e4edde8f17758fbb5fc9ff004eff0b4d286 100644 (file)
@@ -35,7 +35,7 @@
 /*
  * Power off function, if any
  */
-void (*pm_power_off)(void);
+void (*pm_power_off) (void);
 EXPORT_SYMBOL(pm_power_off);
 
 int voyager_level = 0;
@@ -43,39 +43,38 @@ int voyager_level = 0;
 struct voyager_SUS *voyager_SUS = NULL;
 
 #ifdef CONFIG_SMP
-static void
-voyager_dump(int dummy1, struct tty_struct *dummy3)
+static void voyager_dump(int dummy1, struct tty_struct *dummy3)
 {
        /* get here via a sysrq */
        voyager_smp_dump();
 }
 
 static struct sysrq_key_op sysrq_voyager_dump_op = {
-       .handler        = voyager_dump,
-       .help_msg       = "Voyager",
-       .action_msg     = "Dump Voyager Status",
+       .handler = voyager_dump,
+       .help_msg = "Voyager",
+       .action_msg = "Dump Voyager Status",
 };
 #endif
 
-void
-voyager_detect(struct voyager_bios_info *bios)
+void voyager_detect(struct voyager_bios_info *bios)
 {
-       if(bios->len != 0xff) {
-               int class = (bios->class_1 << 8) 
-                       | (bios->class_2 & 0xff);
+       if (bios->len != 0xff) {
+               int class = (bios->class_1 << 8)
+                   | (bios->class_2 & 0xff);
 
                printk("Voyager System detected.\n"
                       "        Class %x, Revision %d.%d\n",
                       class, bios->major, bios->minor);
-               if(class == VOYAGER_LEVEL4) 
+               if (class == VOYAGER_LEVEL4)
                        voyager_level = 4;
-               else if(class < VOYAGER_LEVEL5_AND_ABOVE)
+               else if (class < VOYAGER_LEVEL5_AND_ABOVE)
                        voyager_level = 3;
                else
                        voyager_level = 5;
                printk("        Architecture Level %d\n", voyager_level);
-               if(voyager_level < 4)
-                       printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
+               if (voyager_level < 4)
+                       printk
+                           ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n");
                /* install the power off handler */
                pm_power_off = voyager_power_off;
 #ifdef CONFIG_SMP
@@ -86,15 +85,13 @@ voyager_detect(struct voyager_bios_info *bios)
        }
 }
 
-void
-voyager_system_interrupt(int cpl, void *dev_id)
+void voyager_system_interrupt(int cpl, void *dev_id)
 {
        printk("Voyager: detected system interrupt\n");
 }
 
 /* Routine to read information from the extended CMOS area */
-__u8
-voyager_extended_cmos_read(__u16 addr)
+__u8 voyager_extended_cmos_read(__u16 addr)
 {
        outb(addr & 0xff, 0x74);
        outb((addr >> 8) & 0xff, 0x75);
@@ -108,12 +105,11 @@ voyager_extended_cmos_read(__u16 addr)
 
 typedef struct ClickMap {
        struct Entry {
-               __u32   Address;
-               __u32   Length;
+               __u32 Address;
+               __u32 Length;
        } Entry[CLICK_ENTRIES];
 } ClickMap_t;
 
-
 /* This routine is pretty much an awful hack to read the bios clickmap by
  * mapping it into page 0.  There are usually three regions in the map:
  *     Base Memory
@@ -122,8 +118,7 @@ typedef struct ClickMap {
  *
  * Returns are 0 for failure and 1 for success on extracting region.
  */
-int __init
-voyager_memory_detect(int region, __u32 *start, __u32 *length)
+int __init voyager_memory_detect(int region, __u32 * start, __u32 * length)
 {
        int i;
        int retval = 0;
@@ -132,13 +127,14 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
        unsigned long map_addr;
        unsigned long old;
 
-       if(region >= CLICK_ENTRIES) {
+       if (region >= CLICK_ENTRIES) {
                printk("Voyager: Illegal ClickMap region %d\n", region);
                return 0;
        }
 
-       for(i = 0; i < sizeof(cmos); i++)
-               cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
+       for (i = 0; i < sizeof(cmos); i++)
+               cmos[i] =
+                   voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i);
 
        map_addr = *(unsigned long *)cmos;
 
@@ -147,10 +143,10 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
        pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
        local_flush_tlb();
        /* now clear everything out but page 0 */
-       map = (ClickMap_t *)(map_addr & (~PAGE_MASK));
+       map = (ClickMap_t *) (map_addr & (~PAGE_MASK));
 
        /* zero length is the end of the clickmap */
-       if(map->Entry[region].Length != 0) {
+       if (map->Entry[region].Length != 0) {
                *length = map->Entry[region].Length * CLICK_SIZE;
                *start = map->Entry[region].Address;
                retval = 1;
@@ -165,10 +161,9 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length)
 /* voyager specific handling code for timer interrupts.  Used to hand
  * off the timer tick to the SMP code, since the VIC doesn't have an
  * internal timer (The QIC does, but that's another story). */
-void
-voyager_timer_interrupt(void)
+void voyager_timer_interrupt(void)
 {
-       if((jiffies & 0x3ff) == 0) {
+       if ((jiffies & 0x3ff) == 0) {
 
                /* There seems to be something flaky in either
                 * hardware or software that is resetting the timer 0
@@ -186,18 +181,20 @@ voyager_timer_interrupt(void)
                __u16 val;
 
                spin_lock(&i8253_lock);
-               
+
                outb_p(0x00, 0x43);
                val = inb_p(0x40);
                val |= inb(0x40) << 8;
                spin_unlock(&i8253_lock);
 
-               if(val > LATCH) {
-                       printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val);
+               if (val > LATCH) {
+                       printk
+                           ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n",
+                            val);
                        spin_lock(&i8253_lock);
-                       outb(0x34,0x43);
-                       outb_p(LATCH & 0xff , 0x40);    /* LSB */
-                       outb(LATCH >> 8 , 0x40);        /* MSB */
+                       outb(0x34, 0x43);
+                       outb_p(LATCH & 0xff, 0x40);     /* LSB */
+                       outb(LATCH >> 8, 0x40); /* MSB */
                        spin_unlock(&i8253_lock);
                }
        }
@@ -206,14 +203,13 @@ voyager_timer_interrupt(void)
 #endif
 }
 
-void
-voyager_power_off(void)
+void voyager_power_off(void)
 {
        printk("VOYAGER Power Off\n");
 
-       if(voyager_level == 5) {
+       if (voyager_level == 5) {
                voyager_cat_power_off();
-       } else if(voyager_level == 4) {
+       } else if (voyager_level == 4) {
                /* This doesn't apparently work on most L4 machines,
                 * but the specs say to do this to get automatic power
                 * off.  Unfortunately, if it doesn't power off the
@@ -222,10 +218,8 @@ voyager_power_off(void)
 #if 0
                int port;
 
-         
                /* enable the voyager Configuration Space */
-               outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, 
-                    VOYAGER_MC_SETUP);
+               outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP);
                /* the port for the power off flag is an offset from the
                   floating base */
                port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21;
@@ -235,62 +229,57 @@ voyager_power_off(void)
        }
        /* and wait for it to happen */
        local_irq_disable();
-       for(;;)
+       for (;;)
                halt();
 }
 
 /* copied from process.c */
-static inline void
-kb_wait(void)
+static inline void kb_wait(void)
 {
        int i;
 
-       for (i=0; i<0x10000; i++)
+       for (i = 0; i < 0x10000; i++)
                if ((inb_p(0x64) & 0x02) == 0)
                        break;
 }
 
-void
-machine_shutdown(void)
+void machine_shutdown(void)
 {
        /* Architecture specific shutdown needed before a kexec */
 }
 
-void
-machine_restart(char *cmd)
+void machine_restart(char *cmd)
 {
        printk("Voyager Warm Restart\n");
        kb_wait();
 
-       if(voyager_level == 5) {
+       if (voyager_level == 5) {
                /* write magic values to the RTC to inform system that
                 * shutdown is beginning */
                outb(0x8f, 0x70);
-               outb(0x5 , 0x71);
-               
+               outb(0x5, 0x71);
+
                udelay(50);
-               outb(0xfe,0x64);         /* pull reset low */
-       } else if(voyager_level == 4) {
-               __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8;
+               outb(0xfe, 0x64);       /* pull reset low */
+       } else if (voyager_level == 4) {
+               __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8;
                __u8 basebd = inb(VOYAGER_MC_SETUP);
-               
+
                outb(basebd | 0x08, VOYAGER_MC_SETUP);
                outb(0x02, catbase + 0x21);
        }
        local_irq_disable();
-       for(;;)
+       for (;;)
                halt();
 }
 
-void
-machine_emergency_restart(void)
+void machine_emergency_restart(void)
 {
        /*for now, just hook this to a warm restart */
        machine_restart(NULL);
 }
 
-void
-mca_nmi_hook(void)
+void mca_nmi_hook(void)
 {
        __u8 dumpval __maybe_unused = inb(0xf823);
        __u8 swnmi __maybe_unused = inb(0xf813);
@@ -301,8 +290,8 @@ mca_nmi_hook(void)
        /* clear swnmi */
        outb(0xff, 0xf813);
        /* tell SUS to ignore dump */
-       if(voyager_level == 5 && voyager_SUS != NULL) {
-               if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
+       if (voyager_level == 5 && voyager_SUS != NULL) {
+               if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) {
                        voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND;
                        voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS;
                        udelay(1000);
@@ -310,15 +299,14 @@ mca_nmi_hook(void)
                        voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS;
                }
        }
-       printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id());
+       printk(KERN_ERR
+              "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n",
+              smp_processor_id());
        show_stack(NULL, NULL);
        show_state();
 }
 
-
-
-void
-machine_halt(void)
+void machine_halt(void)
 {
        /* treat a halt like a power off */
        machine_power_off();
index 2132ca652df1d65de50fefab33bb81bde854c3db..17a7904f75b19ce55b63a562158bc82c172cf677 100644 (file)
 #define CAT_DATA       (sspb + 0xd)
 
 /* the internal cat functions */
-static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, 
-                    __u16 num_bits);
-static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data,
+static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits);
+static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data,
                       __u16 num_bits);
-static void cat_build_header(__u8 *header, const __u16 len, 
+static void cat_build_header(__u8 * header, const __u16 len,
                             const __u16 smallest_reg_bits,
                             const __u16 longest_reg_bits);
-static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp,
+static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp,
                        __u8 reg, __u8 op);
-static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp,
-                      __u8 reg, __u8 *value);
-static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes,
+static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp,
+                      __u8 reg, __u8 * value);
+static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes,
                        __u8 pad_bits);
-static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
+static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
                     __u8 value);
-static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
-                   __u8 *value);
-static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp,
+static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
+                   __u8 * value);
+static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp,
                       __u16 offset, __u16 len, void *buf);
-static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
+static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
                        __u8 reg, __u8 value);
-static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp);
-static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp);
+static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp);
+static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp);
 
-static inline const char *
-cat_module_name(int module_id)
+static inline const char *cat_module_name(int module_id)
 {
-       switch(module_id) {
+       switch (module_id) {
        case 0x10:
                return "Processor Slot 0";
        case 0x11:
@@ -105,14 +103,14 @@ voyager_module_t *voyager_cat_list;
 
 /* the I/O port assignments for the VIC and QIC */
 static struct resource vic_res = {
-       .name   = "Voyager Interrupt Controller",
-       .start  = 0xFC00,
-       .end    = 0xFC6F
+       .name = "Voyager Interrupt Controller",
+       .start = 0xFC00,
+       .end = 0xFC6F
 };
 static struct resource qic_res = {
-       .name   = "Quad Interrupt Controller",
-       .start  = 0xFC70,
-       .end    = 0xFCFF
+       .name = "Quad Interrupt Controller",
+       .start = 0xFC70,
+       .end = 0xFCFF
 };
 
 /* This function is used to pack a data bit stream inside a message.
@@ -120,7 +118,7 @@ static struct resource qic_res = {
  * Note: This function assumes that any unused bit in the data stream
  * is set to zero so that the ors will work correctly */
 static void
-cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
+cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
 {
        /* compute initial shift needed */
        const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -130,7 +128,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
        int i;
 
        /* adjust if we have more than a byte of residue */
-       if(residue >= BITS_PER_BYTE) {
+       if (residue >= BITS_PER_BYTE) {
                residue -= BITS_PER_BYTE;
                len++;
        }
@@ -138,24 +136,25 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
        /* clear out the bits.  We assume here that if len==0 then
         * residue >= offset.  This is always true for the catbus
         * operations */
-       msg[byte] &= 0xff << (BITS_PER_BYTE - offset); 
+       msg[byte] &= 0xff << (BITS_PER_BYTE - offset);
        msg[byte++] |= data[0] >> offset;
-       if(len == 0)
+       if (len == 0)
                return;
-       for(i = 1; i < len; i++)
-               msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset))
-                       | (data[i] >> offset);
-       if(residue != 0) {
+       for (i = 1; i < len; i++)
+               msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset))
+                   | (data[i] >> offset);
+       if (residue != 0) {
                __u8 mask = 0xff >> residue;
-               __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset)
-                       | (data[i] >> offset);
-               
+               __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset)
+                   | (data[i] >> offset);
+
                last_byte &= ~mask;
                msg[byte] &= mask;
                msg[byte] |= last_byte;
        }
        return;
 }
+
 /* unpack the data again (same arguments as cat_pack()). data buffer
  * must be zero populated.
  *
@@ -163,7 +162,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
  * data (starting at bit 0 in data).
  */
 static void
-cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
+cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits)
 {
        /* compute initial shift needed */
        const __u16 offset = start_bit % BITS_PER_BYTE;
@@ -172,97 +171,97 @@ cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits)
        __u16 byte = start_bit / BITS_PER_BYTE;
        int i;
 
-       if(last_bits != 0)
+       if (last_bits != 0)
                len++;
 
        /* special case: want < 8 bits from msg and we can get it from
         * a single byte of the msg */
-       if(len == 0 && BITS_PER_BYTE - offset >= num_bits) {
+       if (len == 0 && BITS_PER_BYTE - offset >= num_bits) {
                data[0] = msg[byte] << offset;
                data[0] &= 0xff >> (BITS_PER_BYTE - num_bits);
                return;
        }
-       for(i = 0; i < len; i++) {
+       for (i = 0; i < len; i++) {
                /* this annoying if has to be done just in case a read of
                 * msg one beyond the array causes a panic */
-               if(offset != 0) {
+               if (offset != 0) {
                        data[i] = msg[byte++] << offset;
                        data[i] |= msg[byte] >> (BITS_PER_BYTE - offset);
-               }
-               else {
+               } else {
                        data[i] = msg[byte++];
                }
        }
        /* do we need to truncate the final byte */
-       if(last_bits != 0) {
-               data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits);
+       if (last_bits != 0) {
+               data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits);
        }
        return;
 }
 
 static void
-cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits,
+cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits,
                 const __u16 longest_reg_bits)
 {
        int i;
        __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE;
        __u8 *last_byte = &header[len - 1];
 
-       if(start_bit == 0)
+       if (start_bit == 0)
                start_bit = 1;  /* must have at least one bit in the hdr */
-       
-       for(i=0; i < len; i++)
+
+       for (i = 0; i < len; i++)
                header[i] = 0;
 
-       for(i = start_bit; i > 0; i--)
+       for (i = start_bit; i > 0; i--)
                *last_byte = ((*last_byte) << 1) + 1;
 
 }
 
 static int
-cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
+cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op)
 {
        __u8 parity, inst, inst_buf[4] = { 0 };
        __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE];
        __u16 ibytes, hbytes, padbits;
        int i;
-       
+
        /* 
         * Parity is the parity of the register number + 1 (READ_REGISTER
         * and WRITE_REGISTER always add '1' to the number of bits == 1)
         */
-       parity = (__u8)(1 + (reg & 0x01) +
-                ((__u8)(reg & 0x02) >> 1) +
-                ((__u8)(reg & 0x04) >> 2) +
-                ((__u8)(reg & 0x08) >> 3)) % 2;
+       parity = (__u8) (1 + (reg & 0x01) +
+                        ((__u8) (reg & 0x02) >> 1) +
+                        ((__u8) (reg & 0x04) >> 2) +
+                        ((__u8) (reg & 0x08) >> 3)) % 2;
 
        inst = ((parity << 7) | (reg << 2) | op);
 
        outb(VOYAGER_CAT_IRCYC, CAT_CMD);
-       if(!modp->scan_path_connected) {
-               if(asicp->asic_id != VOYAGER_CAT_ID) {
-                       printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
+       if (!modp->scan_path_connected) {
+               if (asicp->asic_id != VOYAGER_CAT_ID) {
+                       printk
+                           ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n");
                        return 1;
                }
                outb(VOYAGER_CAT_HEADER, CAT_DATA);
                outb(inst, CAT_DATA);
-               if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
+               if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
                        CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n"));
                        return 1;
                }
                return 0;
        }
        ibytes = modp->inst_bits / BITS_PER_BYTE;
-       if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
+       if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) {
                padbits = BITS_PER_BYTE - padbits;
                ibytes++;
        }
        hbytes = modp->largest_reg / BITS_PER_BYTE;
-       if(modp->largest_reg % BITS_PER_BYTE)
+       if (modp->largest_reg % BITS_PER_BYTE)
                hbytes++;
        CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes));
        /* initialise the instruction sequence to 0xff */
-       for(i=0; i < ibytes + hbytes; i++)
+       for (i = 0; i < ibytes + hbytes; i++)
                iseq[i] = 0xff;
        cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg);
        cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE);
@@ -271,11 +270,11 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
        cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length);
 #ifdef VOYAGER_CAT_DEBUG
        printk("ins = 0x%x, iseq: ", inst);
-       for(i=0; i< ibytes + hbytes; i++)
+       for (i = 0; i < ibytes + hbytes; i++)
                printk("0x%x ", iseq[i]);
        printk("\n");
 #endif
-       if(cat_shiftout(iseq, ibytes, hbytes, padbits)) {
+       if (cat_shiftout(iseq, ibytes, hbytes, padbits)) {
                CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n"));
                return 1;
        }
@@ -284,72 +283,74 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op)
 }
 
 static int
-cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, 
-           __u8 *value)
+cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
+           __u8 * value)
 {
-       if(!modp->scan_path_connected) {
-               if(asicp->asic_id != VOYAGER_CAT_ID) {
+       if (!modp->scan_path_connected) {
+               if (asicp->asic_id != VOYAGER_CAT_ID) {
                        CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n"));
                        return 1;
                }
-               if(reg > VOYAGER_SUBADDRHI) 
+               if (reg > VOYAGER_SUBADDRHI)
                        outb(VOYAGER_CAT_RUN, CAT_CMD);
                outb(VOYAGER_CAT_DRCYC, CAT_CMD);
                outb(VOYAGER_CAT_HEADER, CAT_DATA);
                *value = inb(CAT_DATA);
                outb(0xAA, CAT_DATA);
-               if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
+               if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
                        CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n"));
                        return 1;
                }
                return 0;
-       }
-       else {
-               __u16 sbits = modp->num_asics -1 + asicp->ireg_length;
+       } else {
+               __u16 sbits = modp->num_asics - 1 + asicp->ireg_length;
                __u16 sbytes = sbits / BITS_PER_BYTE;
                __u16 tbytes;
-               __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE];
+               __u8 string[VOYAGER_MAX_SCAN_PATH],
+                   trailer[VOYAGER_MAX_REG_SIZE];
                __u8 padbits;
                int i;
-               
+
                outb(VOYAGER_CAT_DRCYC, CAT_CMD);
 
-               if((padbits = sbits % BITS_PER_BYTE) != 0) {
+               if ((padbits = sbits % BITS_PER_BYTE) != 0) {
                        padbits = BITS_PER_BYTE - padbits;
                        sbytes++;
                }
                tbytes = asicp->ireg_length / BITS_PER_BYTE;
-               if(asicp->ireg_length % BITS_PER_BYTE)
+               if (asicp->ireg_length % BITS_PER_BYTE)
                        tbytes++;
                CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n",
-                       tbytes, sbytes, padbits));
+                       tbytes, sbytes, padbits));
                cat_build_header(trailer, tbytes, 1, asicp->ireg_length);
 
-               
-               for(i = tbytes - 1; i >= 0; i--) {
+               for (i = tbytes - 1; i >= 0; i--) {
                        outb(trailer[i], CAT_DATA);
                        string[sbytes + i] = inb(CAT_DATA);
                }
 
-               for(i = sbytes - 1; i >= 0; i--) {
+               for (i = sbytes - 1; i >= 0; i--) {
                        outb(0xaa, CAT_DATA);
                        string[i] = inb(CAT_DATA);
                }
                *value = 0;
-               cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length);
+               cat_unpack(string,
+                          padbits + (tbytes * BITS_PER_BYTE) +
+                          asicp->asic_location, value, asicp->ireg_length);
 #ifdef VOYAGER_CAT_DEBUG
                printk("value=0x%x, string: ", *value);
-               for(i=0; i< tbytes+sbytes; i++)
+               for (i = 0; i < tbytes + sbytes; i++)
                        printk("0x%x ", string[i]);
                printk("\n");
 #endif
-               
+
                /* sanity check the rest of the return */
-               for(i=0; i < tbytes; i++) {
+               for (i = 0; i < tbytes; i++) {
                        __u8 input = 0;
 
-                       cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE);
-                       if(trailer[i] != input) {
+                       cat_unpack(string, padbits + (i * BITS_PER_BYTE),
+                                  &input, BITS_PER_BYTE);
+                       if (trailer[i] != input) {
                                CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i]));
                                return 1;
                        }
@@ -360,14 +361,14 @@ cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
 }
 
 static int
-cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
+cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
 {
        int i;
-       
-       for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
+
+       for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--)
                outb(data[i], CAT_DATA);
 
-       for(i = header_bytes - 1; i >= 0; i--) {
+       for (i = header_bytes - 1; i >= 0; i--) {
                __u8 header = 0;
                __u8 input;
 
@@ -376,7 +377,7 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
                CDEBUG(("cat_shiftout: returned 0x%x\n", input));
                cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits,
                           &header, BITS_PER_BYTE);
-               if(input != header) {
+               if (input != header) {
                        CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header));
                        return 1;
                }
@@ -385,57 +386,57 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits)
 }
 
 static int
-cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, 
+cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp,
             __u8 reg, __u8 value)
 {
        outb(VOYAGER_CAT_DRCYC, CAT_CMD);
-       if(!modp->scan_path_connected) {
-               if(asicp->asic_id != VOYAGER_CAT_ID) {
+       if (!modp->scan_path_connected) {
+               if (asicp->asic_id != VOYAGER_CAT_ID) {
                        CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n"));
                        return 1;
                }
                outb(VOYAGER_CAT_HEADER, CAT_DATA);
                outb(value, CAT_DATA);
-               if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
+               if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) {
                        CDEBUG(("cat_senddata: failed to get correct header response to sent data\n"));
                        return 1;
                }
-               if(reg > VOYAGER_SUBADDRHI) {
+               if (reg > VOYAGER_SUBADDRHI) {
                        outb(VOYAGER_CAT_RUN, CAT_CMD);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        outb(VOYAGER_CAT_RUN, CAT_CMD);
                }
-               
+
                return 0;
-       }
-       else {
+       } else {
                __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE;
-               __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE;
-               __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], 
-                       hseq[VOYAGER_MAX_REG_SIZE];
+               __u16 dbytes =
+                   (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE;
+               __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH],
+                   hseq[VOYAGER_MAX_REG_SIZE];
                int i;
 
-               if((padbits = (modp->num_asics - 1 
-                              + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
+               if ((padbits = (modp->num_asics - 1
+                               + asicp->ireg_length) % BITS_PER_BYTE) != 0) {
                        padbits = BITS_PER_BYTE - padbits;
                        dbytes++;
                }
-               if(asicp->ireg_length % BITS_PER_BYTE)
+               if (asicp->ireg_length % BITS_PER_BYTE)
                        hbytes++;
-               
+
                cat_build_header(hseq, hbytes, 1, asicp->ireg_length);
-               
-               for(i = 0; i < dbytes + hbytes; i++)
+
+               for (i = 0; i < dbytes + hbytes; i++)
                        dseq[i] = 0xff;
                CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n",
                        dbytes, hbytes, padbits));
                cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length,
                         hseq, hbytes * BITS_PER_BYTE);
-               cat_pack(dseq, asicp->asic_location, &value, 
+               cat_pack(dseq, asicp->asic_location, &value,
                         asicp->ireg_length);
 #ifdef VOYAGER_CAT_DEBUG
                printk("dseq ");
-               for(i=0; i<hbytes+dbytes; i++) {
+               for (i = 0; i < hbytes + dbytes; i++) {
                        printk("0x%x ", dseq[i]);
                }
                printk("\n");
@@ -445,121 +446,125 @@ cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp,
 }
 
 static int
-cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
-        __u8 value)
+cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value)
 {
-       if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
+       if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG))
                return 1;
        return cat_senddata(modp, asicp, reg, value);
 }
 
 static int
-cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg,
-        __u8 *value)
+cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg,
+        __u8 * value)
 {
-       if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
+       if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG))
                return 1;
        return cat_getdata(modp, asicp, reg, value);
 }
 
 static int
-cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
+cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
                 __u16 len)
 {
        __u8 val;
 
-       if(len > 1) {
+       if (len > 1) {
                /* set auto increment */
                __u8 newval;
-               
-               if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
+
+               if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) {
                        CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n"));
                        return 1;
                }
-               CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val));
+               CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n",
+                       val));
                newval = val | VOYAGER_AUTO_INC;
-               if(newval != val) {
-                       if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) {
+               if (newval != val) {
+                       if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) {
                                CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n"));
                                return 1;
                        }
                }
        }
-       if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) {
+       if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) {
                CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n"));
                return 1;
        }
-       if(asicp->subaddr > VOYAGER_SUBADDR_LO) {
-               if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) {
+       if (asicp->subaddr > VOYAGER_SUBADDR_LO) {
+               if (cat_write
+                   (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) {
                        CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n"));
                        return 1;
                }
                cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val);
-               CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val));
+               CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset,
+                       val));
        }
        cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val);
        CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val));
        return 0;
 }
-               
+
 static int
-cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
-           __u16 len, void *buf)
+cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
+            __u16 len, void *buf)
 {
        int i, retval;
 
        /* FIXME: need special actions for VOYAGER_CAT_ID here */
-       if(asicp->asic_id == VOYAGER_CAT_ID) {
+       if (asicp->asic_id == VOYAGER_CAT_ID) {
                CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n"));
                /* FIXME -- This is supposed to be handled better
                 * There is a problem writing to the cat asic in the
                 * PSI.  The 30us delay seems to work, though */
                udelay(30);
        }
-               
-       if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
+
+       if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
                printk("cat_subwrite: cat_subaddrsetup FAILED\n");
                return retval;
        }
-       
-       if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
+
+       if (cat_sendinst
+           (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) {
                printk("cat_subwrite: cat_sendinst FAILED\n");
                return 1;
        }
-       for(i = 0; i < len; i++) {
-               if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) {
-                       printk("cat_subwrite: cat_sendata element at %d FAILED\n", i);
+       for (i = 0; i < len; i++) {
+               if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) {
+                       printk
+                           ("cat_subwrite: cat_sendata element at %d FAILED\n",
+                            i);
                        return 1;
                }
        }
        return 0;
 }
 static int
-cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset,
+cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset,
            __u16 len, void *buf)
 {
        int i, retval;
 
-       if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
+       if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) {
                CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n"));
                return retval;
        }
 
-       if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
+       if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) {
                CDEBUG(("cat_subread: cat_sendinst failed\n"));
                return 1;
        }
-       for(i = 0; i < len; i++) {
-               if(cat_getdata(modp, asicp, 0xFF,
-                              &((__u8 *)buf)[i])) {
-                       CDEBUG(("cat_subread: cat_getdata element %d failed\n", i));
+       for (i = 0; i < len; i++) {
+               if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) {
+                       CDEBUG(("cat_subread: cat_getdata element %d failed\n",
+                               i));
                        return 1;
                }
        }
        return 0;
 }
 
-
 /* buffer for storing EPROM data read in during initialisation */
 static __initdata __u8 eprom_buf[0xFFFF];
 static voyager_module_t *voyager_initial_module;
@@ -568,8 +573,7 @@ static voyager_module_t *voyager_initial_module;
  * boot cpu *after* all memory initialisation has been done (so we can
  * use kmalloc) but before smp initialisation, so we can probe the SMP
  * configuration and pick up necessary information.  */
-void __init
-voyager_cat_init(void)
+void __init voyager_cat_init(void)
 {
        voyager_module_t **modpp = &voyager_initial_module;
        voyager_asic_t **asicpp;
@@ -578,27 +582,29 @@ voyager_cat_init(void)
        unsigned long qic_addr = 0;
        __u8 qabc_data[0x20];
        __u8 num_submodules, val;
-       voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0];
-       
+       voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0];
+
        __u8 cmos[4];
        unsigned long addr;
-       
+
        /* initiallise the SUS mailbox */
-       for(i=0; i<sizeof(cmos); i++)
+       for (i = 0; i < sizeof(cmos); i++)
                cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i);
        addr = *(unsigned long *)cmos;
-       if((addr & 0xff000000) != 0xff000000) {
-               printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr);
+       if ((addr & 0xff000000) != 0xff000000) {
+               printk(KERN_ERR
+                      "Voyager failed to get SUS mailbox (addr = 0x%lx\n",
+                      addr);
        } else {
                static struct resource res;
-               
+
                res.name = "voyager SUS";
                res.start = addr;
-               res.end = addr+0x3ff;
-               
+               res.end = addr + 0x3ff;
+
                request_resource(&iomem_resource, &res);
                voyager_SUS = (struct voyager_SUS *)
-                       ioremap(addr, 0x400);
+                   ioremap(addr, 0x400);
                printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n",
                       voyager_SUS->SUS_version);
                voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION;
@@ -609,8 +615,6 @@ voyager_cat_init(void)
        voyager_extended_vic_processors = 0;
        voyager_quad_processors = 0;
 
-
-
        printk("VOYAGER: beginning CAT bus probe\n");
        /* set up the SuperSet Port Block which tells us where the
         * CAT communication port is */
@@ -618,14 +622,14 @@ voyager_cat_init(void)
        VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb));
 
        /* now find out if were 8 slot or normal */
-       if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
-          == EIGHT_SLOT_IDENTIFIER) {
+       if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER)
+           == EIGHT_SLOT_IDENTIFIER) {
                voyager_8slot = 1;
-               printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n");
+               printk(KERN_NOTICE
+                      "Voyager: Eight slot 51xx configuration detected\n");
        }
 
-       for(i = VOYAGER_MIN_MODULE;
-           i <= VOYAGER_MAX_MODULE; i++) {
+       for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) {
                __u8 input;
                int asic;
                __u16 eprom_size;
@@ -643,21 +647,21 @@ voyager_cat_init(void)
                outb(0xAA, CAT_DATA);
                input = inb(CAT_DATA);
                outb(VOYAGER_CAT_END, CAT_CMD);
-               if(input != VOYAGER_CAT_HEADER) {
+               if (input != VOYAGER_CAT_HEADER) {
                        continue;
                }
                CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i,
                        cat_module_name(i)));
-               *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/
-               if(*modpp == NULL) {
+               *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */
+               if (*modpp == NULL) {
                        printk("**WARNING** kmalloc failure in cat_init\n");
                        continue;
                }
                memset(*modpp, 0, sizeof(voyager_module_t));
                /* need temporary asic for cat_subread.  It will be
                 * filled in correctly later */
-               (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/
-               if((*modpp)->asic == NULL) {
+               (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL);   /*&voyager_asic_storage[asic_count]; */
+               if ((*modpp)->asic == NULL) {
                        printk("**WARNING** kmalloc failure in cat_init\n");
                        continue;
                }
@@ -666,47 +670,52 @@ voyager_cat_init(void)
                (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI;
                (*modpp)->module_addr = i;
                (*modpp)->scan_path_connected = 0;
-               if(i == VOYAGER_PSI) {
+               if (i == VOYAGER_PSI) {
                        /* Exception leg for modules with no EEPROM */
                        printk("Module \"%s\"\n", cat_module_name(i));
                        continue;
                }
-                              
+
                CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
                outb(VOYAGER_CAT_RUN, CAT_CMD);
                cat_disconnect(*modpp, (*modpp)->asic);
-               if(cat_subread(*modpp, (*modpp)->asic,
-                              VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
-                              &eprom_size)) {
-                       printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
+               if (cat_subread(*modpp, (*modpp)->asic,
+                               VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
+                               &eprom_size)) {
+                       printk
+                           ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
+                            i);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
-               if(eprom_size > sizeof(eprom_buf)) {
-                       printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x.  Need %d\n", i, eprom_size);
+               if (eprom_size > sizeof(eprom_buf)) {
+                       printk
+                           ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x.  Need %d\n",
+                            i, eprom_size);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
                outb(VOYAGER_CAT_END, CAT_CMD);
                outb(VOYAGER_CAT_RUN, CAT_CMD);
-               CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
-               if(cat_subread(*modpp, (*modpp)->asic, 0, 
-                              eprom_size, eprom_buf)) {
+               CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
+                       eprom_size));
+               if (cat_subread
+                   (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
                outb(VOYAGER_CAT_END, CAT_CMD);
                printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n",
                       cat_module_name(i), eprom_hdr->version_id,
-                      *((__u32 *)eprom_hdr->tracer),  eprom_hdr->num_asics);
+                      *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics);
                (*modpp)->ee_size = eprom_hdr->ee_size;
                (*modpp)->num_asics = eprom_hdr->num_asics;
                asicpp = &((*modpp)->asic);
                sp_offset = eprom_hdr->scan_path_offset;
                /* All we really care about are the Quad cards.  We
-                 * identify them because they are in a processor slot
-                 * and have only four asics */
-               if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) {
+                * identify them because they are in a processor slot
+                * and have only four asics */
+               if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) {
                        modpp = &((*modpp)->next);
                        continue;
                }
@@ -717,16 +726,17 @@ voyager_cat_init(void)
                         &num_submodules);
                /* lowest two bits, active low */
                num_submodules = ~(0xfc | num_submodules);
-               CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules));
-               if(num_submodules == 0) {
+               CDEBUG(("VOYAGER CAT: %d submodules present\n",
+                       num_submodules));
+               if (num_submodules == 0) {
                        /* fill in the dyadic extended processors */
                        __u8 cpu = i & 0x07;
 
                        printk("Module \"%s\": Dyadic Processor Card\n",
                               cat_module_name(i));
-                       voyager_extended_vic_processors |= (1<<cpu);
+                       voyager_extended_vic_processors |= (1 << cpu);
                        cpu += 4;
-                       voyager_extended_vic_processors |= (1<<cpu);
+                       voyager_extended_vic_processors |= (1 << cpu);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
@@ -740,28 +750,32 @@ voyager_cat_init(void)
                cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val);
 
                outb(VOYAGER_CAT_END, CAT_CMD);
-                        
 
                CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET));
                outb(VOYAGER_CAT_RUN, CAT_CMD);
                cat_disconnect(*modpp, (*modpp)->asic);
-               if(cat_subread(*modpp, (*modpp)->asic,
-                              VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
-                              &eprom_size)) {
-                       printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i);
+               if (cat_subread(*modpp, (*modpp)->asic,
+                               VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size),
+                               &eprom_size)) {
+                       printk
+                           ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n",
+                            i);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
-               if(eprom_size > sizeof(eprom_buf)) {
-                       printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x.  Need %d\n", i, eprom_size);
+               if (eprom_size > sizeof(eprom_buf)) {
+                       printk
+                           ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x.  Need %d\n",
+                            i, eprom_size);
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
                outb(VOYAGER_CAT_END, CAT_CMD);
                outb(VOYAGER_CAT_RUN, CAT_CMD);
-               CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size));
-               if(cat_subread(*modpp, (*modpp)->asic, 0, 
-                              eprom_size, eprom_buf)) {
+               CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i,
+                       eprom_size));
+               if (cat_subread
+                   (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) {
                        outb(VOYAGER_CAT_END, CAT_CMD);
                        continue;
                }
@@ -773,30 +787,35 @@ voyager_cat_init(void)
                sp_offset = eprom_hdr->scan_path_offset;
                /* get rid of the dummy CAT asic and read the real one */
                kfree((*modpp)->asic);
-               for(asic=0; asic < (*modpp)->num_asics; asic++) {
+               for (asic = 0; asic < (*modpp)->num_asics; asic++) {
                        int j;
-                       voyager_asic_t *asicp = *asicpp 
-                               = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/
+                       voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL);  /*&voyager_asic_storage[asic_count++]; */
                        voyager_sp_table_t *sp_table;
                        voyager_at_t *asic_table;
                        voyager_jtt_t *jtag_table;
 
-                       if(asicp == NULL) {
-                               printk("**WARNING** kmalloc failure in cat_init\n");
+                       if (asicp == NULL) {
+                               printk
+                                   ("**WARNING** kmalloc failure in cat_init\n");
                                continue;
                        }
                        asicpp = &(asicp->next);
                        asicp->asic_location = asic;
-                       sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset);
+                       sp_table =
+                           (voyager_sp_table_t *) (eprom_buf + sp_offset);
                        asicp->asic_id = sp_table->asic_id;
-                       asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset);
-                       for(j=0; j<4; j++)
+                       asic_table =
+                           (voyager_at_t *) (eprom_buf +
+                                             sp_table->asic_data_offset);
+                       for (j = 0; j < 4; j++)
                                asicp->jtag_id[j] = asic_table->jtag_id[j];
-                       jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset);
+                       jtag_table =
+                           (voyager_jtt_t *) (eprom_buf +
+                                              asic_table->jtag_offset);
                        asicp->ireg_length = jtag_table->ireg_len;
                        asicp->bit_location = (*modpp)->inst_bits;
                        (*modpp)->inst_bits += asicp->ireg_length;
-                       if(asicp->ireg_length > (*modpp)->largest_reg)
+                       if (asicp->ireg_length > (*modpp)->largest_reg)
                                (*modpp)->largest_reg = asicp->ireg_length;
                        if (asicp->ireg_length < (*modpp)->smallest_reg ||
                            (*modpp)->smallest_reg == 0)
@@ -804,15 +823,13 @@ voyager_cat_init(void)
                        CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n",
                                asicp->asic_id, asicp->ireg_length,
                                asicp->bit_location));
-                       if(asicp->asic_id == VOYAGER_QUAD_QABC) {
+                       if (asicp->asic_id == VOYAGER_QUAD_QABC) {
                                CDEBUG(("VOYAGER CAT: QABC ASIC found\n"));
                                qabc_asic = asicp;
                        }
                        sp_offset += sizeof(voyager_sp_table_t);
                }
-               CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n",
-                       (*modpp)->inst_bits, (*modpp)->largest_reg,
-                       (*modpp)->smallest_reg));
+               CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg));
                /* OK, now we have the QUAD ASICs set up, use them.
                 * we need to:
                 *
@@ -828,10 +845,11 @@ voyager_cat_init(void)
                qic_addr = qabc_data[5] << 8;
                qic_addr = (qic_addr | qabc_data[6]) << 8;
                qic_addr = (qic_addr | qabc_data[7]) << 8;
-               printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
-                      cat_module_name(i), qic_addr, qabc_data[8]);
+               printk
+                   ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n",
+                    cat_module_name(i), qic_addr, qabc_data[8]);
 #if 0                          /* plumbing fails---FIXME */
-               if((qabc_data[8] & 0xf0) == 0) {
+               if ((qabc_data[8] & 0xf0) == 0) {
                        /* FIXME: 32 way 8 CPU slot monster cannot be
                         * plumbed this way---need to check for it */
 
@@ -842,94 +860,97 @@ voyager_cat_init(void)
 #ifdef VOYAGER_CAT_DEBUG
                        /* verify plumbing */
                        cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]);
-                       if((qabc_data[8] & 0xf0) == 0) {
-                               CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8]));
+                       if ((qabc_data[8] & 0xf0) == 0) {
+                               CDEBUG(("PLUMBING FAILED: 0x%x\n",
+                                       qabc_data[8]));
                        }
 #endif
                }
 #endif
 
                {
-                       struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL);
+                       struct resource *res =
+                           kzalloc(sizeof(struct resource), GFP_KERNEL);
                        res->name = kmalloc(128, GFP_KERNEL);
-                       sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i));
+                       sprintf((char *)res->name, "Voyager %s Quad CPI",
+                               cat_module_name(i));
                        res->start = qic_addr;
                        res->end = qic_addr + 0x3ff;
                        request_resource(&iomem_resource, res);
                }
 
                qic_addr = (unsigned long)ioremap(qic_addr, 0x400);
-                               
-               for(j = 0; j < 4; j++) {
+
+               for (j = 0; j < 4; j++) {
                        __u8 cpu;
 
-                       if(voyager_8slot) {
+                       if (voyager_8slot) {
                                /* 8 slot has a different mapping,
                                 * each slot has only one vic line, so
                                 * 1 cpu in each slot must be < 8 */
-                               cpu = (i & 0x07) + j*8;
+                               cpu = (i & 0x07) + j * 8;
                        } else {
-                               cpu = (i & 0x03) + j*4;
+                               cpu = (i & 0x03) + j * 4;
                        }
-                       if( (qabc_data[8] & (1<<j))) {
-                               voyager_extended_vic_processors |= (1<<cpu);
+                       if ((qabc_data[8] & (1 << j))) {
+                               voyager_extended_vic_processors |= (1 << cpu);
                        }
-                       if(qabc_data[8] & (1<<(j+4)) ) {
+                       if (qabc_data[8] & (1 << (j + 4))) {
                                /* Second SET register plumbed: Quad
                                 * card has two VIC connected CPUs.
                                 * Secondary cannot be booted as a VIC
                                 * CPU */
-                               voyager_extended_vic_processors |= (1<<cpu);
-                               voyager_allowed_boot_processors &= (~(1<<cpu));
+                               voyager_extended_vic_processors |= (1 << cpu);
+                               voyager_allowed_boot_processors &=
+                                   (~(1 << cpu));
                        }
 
-                       voyager_quad_processors |= (1<<cpu);
+                       voyager_quad_processors |= (1 << cpu);
                        voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *)
-                               (qic_addr+(j<<8));
+                           (qic_addr + (j << 8));
                        CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu,
                                (unsigned long)voyager_quad_cpi_addr[cpu]));
                }
                outb(VOYAGER_CAT_END, CAT_CMD);
 
-               
-               
                *asicpp = NULL;
                modpp = &((*modpp)->next);
        }
        *modpp = NULL;
-       printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors);
+       printk
+           ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n",
+            voyager_extended_vic_processors, voyager_quad_processors,
+            voyager_allowed_boot_processors);
        request_resource(&ioport_resource, &vic_res);
-       if(voyager_quad_processors)
+       if (voyager_quad_processors)
                request_resource(&ioport_resource, &qic_res);
        /* set up the front power switch */
 }
 
-int
-voyager_cat_readb(__u8 module, __u8 asic, int reg)
+int voyager_cat_readb(__u8 module, __u8 asic, int reg)
 {
        return 0;
 }
 
-static int
-cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) 
+static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp)
 {
        __u8 val;
        int err = 0;
 
-       if(!modp->scan_path_connected)
+       if (!modp->scan_path_connected)
                return 0;
-       if(asicp->asic_id != VOYAGER_CAT_ID) {
+       if (asicp->asic_id != VOYAGER_CAT_ID) {
                CDEBUG(("cat_disconnect: ASIC is not CAT\n"));
                return 1;
        }
        err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
-       if(err) {
+       if (err) {
                CDEBUG(("cat_disconnect: failed to read SCANPATH\n"));
                return err;
        }
        val &= VOYAGER_DISCONNECT_ASIC;
        err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
-       if(err) {
+       if (err) {
                CDEBUG(("cat_disconnect: failed to write SCANPATH\n"));
                return err;
        }
@@ -940,27 +961,26 @@ cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp)
        return 0;
 }
 
-static int
-cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) 
+static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp)
 {
        __u8 val;
        int err = 0;
 
-       if(modp->scan_path_connected)
+       if (modp->scan_path_connected)
                return 0;
-       if(asicp->asic_id != VOYAGER_CAT_ID) {
+       if (asicp->asic_id != VOYAGER_CAT_ID) {
                CDEBUG(("cat_connect: ASIC is not CAT\n"));
                return 1;
        }
 
        err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val);
-       if(err) {
+       if (err) {
                CDEBUG(("cat_connect: failed to read SCANPATH\n"));
                return err;
        }
        val |= VOYAGER_CONNECT_ASIC;
        err = cat_write(modp, asicp, VOYAGER_SCANPATH, val);
-       if(err) {
+       if (err) {
                CDEBUG(("cat_connect: failed to write SCANPATH\n"));
                return err;
        }
@@ -971,11 +991,10 @@ cat_connect(voyager_module_t *modp, voyager_asic_t *asicp)
        return 0;
 }
 
-void
-voyager_cat_power_off(void)
+void voyager_cat_power_off(void)
 {
        /* Power the machine off by writing to the PSI over the CAT
-         * bus */
+        * bus */
        __u8 data;
        voyager_module_t psi = { 0 };
        voyager_asic_t psi_asic = { 0 };
@@ -1009,8 +1028,7 @@ voyager_cat_power_off(void)
 
 struct voyager_status voyager_status = { 0 };
 
-void
-voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
+void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data)
 {
        voyager_module_t psi = { 0 };
        voyager_asic_t psi_asic = { 0 };
@@ -1027,7 +1045,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
        outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT);
        outb(VOYAGER_CAT_RUN, CAT_CMD);
        cat_disconnect(&psi, &psi_asic);
-       switch(cmd) {
+       switch (cmd) {
        case VOYAGER_PSI_READ:
                cat_read(&psi, &psi_asic, reg, data);
                break;
@@ -1047,8 +1065,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data)
        outb(VOYAGER_CAT_END, CAT_CMD);
 }
 
-void
-voyager_cat_do_common_interrupt(void)
+void voyager_cat_do_common_interrupt(void)
 {
        /* This is caused either by a memory parity error or something
         * in the PSI */
@@ -1057,7 +1074,7 @@ voyager_cat_do_common_interrupt(void)
        voyager_asic_t psi_asic = { 0 };
        struct voyager_psi psi_reg;
        int i;
- re_read:
+      re_read:
        psi.asic = &psi_asic;
        psi.asic->asic_id = VOYAGER_CAT_ID;
        psi.asic->subaddr = VOYAGER_SUBADDR_HI;
@@ -1072,43 +1089,45 @@ voyager_cat_do_common_interrupt(void)
        cat_disconnect(&psi, &psi_asic);
        /* Read the status.  NOTE: Need to read *all* the PSI regs here
         * otherwise the cmn int will be reasserted */
-       for(i = 0; i < sizeof(psi_reg.regs); i++) {
-               cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]);
+       for (i = 0; i < sizeof(psi_reg.regs); i++) {
+               cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]);
        }
        outb(VOYAGER_CAT_END, CAT_CMD);
-       if((psi_reg.regs.checkbit & 0x02) == 0) {
+       if ((psi_reg.regs.checkbit & 0x02) == 0) {
                psi_reg.regs.checkbit |= 0x02;
                cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit);
                printk("VOYAGER RE-READ PSI\n");
                goto re_read;
        }
        outb(VOYAGER_CAT_RUN, CAT_CMD);
-       for(i = 0; i < sizeof(psi_reg.subregs); i++) {
+       for (i = 0; i < sizeof(psi_reg.subregs); i++) {
                /* This looks strange, but the PSI doesn't do auto increment
                 * correctly */
-               cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, 
-                           1, &((__u8 *)&psi_reg.subregs)[i]); 
+               cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i,
+                           1, &((__u8 *) & psi_reg.subregs)[i]);
        }
        outb(VOYAGER_CAT_END, CAT_CMD);
 #ifdef VOYAGER_CAT_DEBUG
        printk("VOYAGER PSI: ");
-       for(i=0; i<sizeof(psi_reg.regs); i++)
-               printk("%02x ", ((__u8 *)&psi_reg.regs)[i]);
+       for (i = 0; i < sizeof(psi_reg.regs); i++)
+               printk("%02x ", ((__u8 *) & psi_reg.regs)[i]);
        printk("\n           ");
-       for(i=0; i<sizeof(psi_reg.subregs); i++)
-               printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]);
+       for (i = 0; i < sizeof(psi_reg.subregs); i++)
+               printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]);
        printk("\n");
 #endif
-       if(psi_reg.regs.intstatus & PSI_MON) {
+       if (psi_reg.regs.intstatus & PSI_MON) {
                /* switch off or power fail */
 
-               if(psi_reg.subregs.supply & PSI_SWITCH_OFF) {
-                       if(voyager_status.switch_off) {
-                               printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n");
+               if (psi_reg.subregs.supply & PSI_SWITCH_OFF) {
+                       if (voyager_status.switch_off) {
+                               printk(KERN_ERR
+                                      "Voyager front panel switch turned off again---Immediate power off!\n");
                                voyager_cat_power_off();
                                /* not reached */
                        } else {
-                               printk(KERN_ERR "Voyager front panel switch turned off\n");
+                               printk(KERN_ERR
+                                      "Voyager front panel switch turned off\n");
                                voyager_status.switch_off = 1;
                                voyager_status.request_from_kernel = 1;
                                wake_up_process(voyager_thread);
@@ -1127,7 +1146,7 @@ voyager_cat_do_common_interrupt(void)
 
                        VDEBUG(("Voyager ac fail reg 0x%x\n",
                                psi_reg.subregs.ACfail));
-                       if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
+                       if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) {
                                /* No further update */
                                return;
                        }
@@ -1135,20 +1154,20 @@ voyager_cat_do_common_interrupt(void)
                        /* Don't bother trying to find out who failed.
                         * FIXME: This probably makes the code incorrect on
                         * anything other than a 345x */
-                       for(i=0; i< 5; i++) {
-                               if( psi_reg.subregs.ACfail &(1<<i)) {
+                       for (i = 0; i < 5; i++) {
+                               if (psi_reg.subregs.ACfail & (1 << i)) {
                                        break;
                                }
                        }
                        printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i);
 #endif
                        /* DON'T do this: it shuts down the AC PSI 
-                       outb(VOYAGER_CAT_RUN, CAT_CMD);
-                       data = PSI_MASK_MASK | i;
-                       cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
-                                    1, &data);
-                       outb(VOYAGER_CAT_END, CAT_CMD);
-                       */
+                          outb(VOYAGER_CAT_RUN, CAT_CMD);
+                          data = PSI_MASK_MASK | i;
+                          cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK,
+                          1, &data);
+                          outb(VOYAGER_CAT_END, CAT_CMD);
+                        */
                        printk(KERN_ERR "Voyager AC power failure\n");
                        outb(VOYAGER_CAT_RUN, CAT_CMD);
                        data = PSI_COLD_START;
@@ -1159,16 +1178,16 @@ voyager_cat_do_common_interrupt(void)
                        voyager_status.request_from_kernel = 1;
                        wake_up_process(voyager_thread);
                }
-               
-               
-       } else if(psi_reg.regs.intstatus & PSI_FAULT) {
+
+       } else if (psi_reg.regs.intstatus & PSI_FAULT) {
                /* Major fault! */
-               printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n");
+               printk(KERN_ERR
+                      "Voyager PSI Detected major fault, immediate power off!\n");
                voyager_cat_power_off();
                /* not reached */
-       } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
-                                           | PSI_CURRENT | PSI_DVM
-                                           | PSI_PSCFAULT | PSI_STAT_CHG)) {
+       } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM
+                                            | PSI_CURRENT | PSI_DVM
+                                            | PSI_PSCFAULT | PSI_STAT_CHG)) {
                /* other psi fault */
 
                printk(KERN_WARNING "Voyager PSI status 0x%x\n", data);
index 88124dd35406d2484f999c837833d4a9c270ba03..dffa786f61fe1b17c2a3a4b2e00f07d7e3a8bc2f 100644 (file)
@@ -32,7 +32,8 @@
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 };
 
 /* CPU IRQ affinity -- set to all ones initially */
-static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1]  = ~0UL };
+static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned =
+       {[0 ... NR_CPUS-1]  = ~0UL };
 
 /* per CPU data structure (for /proc/cpuinfo et al), visible externally
  * indexed physically */
@@ -76,7 +77,6 @@ EXPORT_SYMBOL(cpu_online_map);
  * by scheduler but indexed physically */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
 
-
 /* The internal functions */
 static void send_CPI(__u32 cpuset, __u8 cpi);
 static void ack_CPI(__u8 cpi);
@@ -101,94 +101,86 @@ int hard_smp_processor_id(void);
 int safe_smp_processor_id(void);
 
 /* Inline functions */
-static inline void
-send_one_QIC_CPI(__u8 cpu, __u8 cpi)
+static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi)
 {
        voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi =
-               (smp_processor_id() << 16) + cpi;
+           (smp_processor_id() << 16) + cpi;
 }
 
-static inline void
-send_QIC_CPI(__u32 cpuset, __u8 cpi)
+static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi)
 {
        int cpu;
 
        for_each_online_cpu(cpu) {
-               if(cpuset & (1<<cpu)) {
+               if (cpuset & (1 << cpu)) {
 #ifdef VOYAGER_DEBUG
-                       if(!cpu_isset(cpu, cpu_online_map))
-                               VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu));
+                       if (!cpu_isset(cpu, cpu_online_map))
+                               VDEBUG(("CPU%d sending cpi %d to CPU%d not in "
+                                       "cpu_online_map\n",
+                                       hard_smp_processor_id(), cpi, cpu));
 #endif
                        send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
                }
        }
 }
 
-static inline void
-wrapper_smp_local_timer_interrupt(void)
+static inline void wrapper_smp_local_timer_interrupt(void)
 {
        irq_enter();
        smp_local_timer_interrupt();
        irq_exit();
 }
 
-static inline void
-send_one_CPI(__u8 cpu, __u8 cpi)
+static inline void send_one_CPI(__u8 cpu, __u8 cpi)
 {
-       if(voyager_quad_processors & (1<<cpu))
+       if (voyager_quad_processors & (1 << cpu))
                send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET);
        else
-               send_CPI(1<<cpu, cpi);
+               send_CPI(1 << cpu, cpi);
 }
 
-static inline void
-send_CPI_allbutself(__u8 cpi)
+static inline void send_CPI_allbutself(__u8 cpi)
 {
        __u8 cpu = smp_processor_id();
        __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu);
        send_CPI(mask, cpi);
 }
 
-static inline int
-is_cpu_quad(void)
+static inline int is_cpu_quad(void)
 {
        __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
        return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER);
 }
 
-static inline int
-is_cpu_extended(void)
+static inline int is_cpu_extended(void)
 {
        __u8 cpu = hard_smp_processor_id();
 
-       return(voyager_extended_vic_processors & (1<<cpu));
+       return (voyager_extended_vic_processors & (1 << cpu));
 }
 
-static inline int
-is_cpu_vic_boot(void)
+static inline int is_cpu_vic_boot(void)
 {
        __u8 cpu = hard_smp_processor_id();
 
-       return(voyager_extended_vic_processors
-              & voyager_allowed_boot_processors & (1<<cpu));
+       return (voyager_extended_vic_processors
+               & voyager_allowed_boot_processors & (1 << cpu));
 }
 
-
-static inline void
-ack_CPI(__u8 cpi)
+static inline void ack_CPI(__u8 cpi)
 {
-       switch(cpi) {
+       switch (cpi) {
        case VIC_CPU_BOOT_CPI:
-               if(is_cpu_quad() && !is_cpu_vic_boot())
+               if (is_cpu_quad() && !is_cpu_vic_boot())
                        ack_QIC_CPI(cpi);
                else
                        ack_VIC_CPI(cpi);
                break;
        case VIC_SYS_INT:
-       case VIC_CMN_INT: 
+       case VIC_CMN_INT:
                /* These are slightly strange.  Even on the Quad card,
                 * They are vectored as VIC CPIs */
-               if(is_cpu_quad())
+               if (is_cpu_quad())
                        ack_special_QIC_CPI(cpi);
                else
                        ack_VIC_CPI(cpi);
@@ -205,11 +197,11 @@ ack_CPI(__u8 cpi)
  * 8259 IRQs except that masks and things must be kept per processor
  */
 static struct irq_chip vic_chip = {
-       .name           = "VIC",
-       .startup        = startup_vic_irq,
-       .mask           = mask_vic_irq,
-       .unmask         = unmask_vic_irq,
-       .set_affinity   = set_vic_irq_affinity,
+       .name = "VIC",
+       .startup = startup_vic_irq,
+       .mask = mask_vic_irq,
+       .unmask = unmask_vic_irq,
+       .set_affinity = set_vic_irq_affinity,
 };
 
 /* used to count up as CPUs are brought on line (starts at 0) */
@@ -223,7 +215,7 @@ static __u32 trampoline_base;
 /* The per cpu profile stuff - used in smp_local_timer_interrupt */
 static DEFINE_PER_CPU(int, prof_multiplier) = 1;
 static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_counter) =  1;
+static DEFINE_PER_CPU(int, prof_counter) = 1;
 
 /* the map used to check if a CPU has booted */
 static __u32 cpu_booted_map;
@@ -235,7 +227,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 /* This is for the new dynamic CPU boot code */
 cpumask_t cpu_callin_map = CPU_MASK_NONE;
 cpumask_t cpu_callout_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_callout_map);
 cpumask_t cpu_possible_map = CPU_MASK_NONE;
 EXPORT_SYMBOL(cpu_possible_map);
 
@@ -246,9 +237,9 @@ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
 static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 };
 
 /* Lock for enable/disable of VIC interrupts */
-static  __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
+static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock);
 
-/* The boot processor is correctly set up in PC mode when it 
+/* The boot processor is correctly set up in PC mode when it
  * comes up, but the secondaries need their master/slave 8259
  * pairs initializing correctly */
 
@@ -262,8 +253,7 @@ static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 };
 static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned;
 
 /* debugging routine to read the isr of the cpu's pic */
-static inline __u16
-vic_read_isr(void)
+static inline __u16 vic_read_isr(void)
 {
        __u16 isr;
 
@@ -275,17 +265,16 @@ vic_read_isr(void)
        return isr;
 }
 
-static __init void
-qic_setup(void)
+static __init void qic_setup(void)
 {
-       if(!is_cpu_quad()) {
+       if (!is_cpu_quad()) {
                /* not a quad, no setup */
                return;
        }
        outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
        outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
-       
-       if(is_cpu_extended()) {
+
+       if (is_cpu_extended()) {
                /* the QIC duplicate of the VIC base register */
                outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER);
                outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER);
@@ -295,8 +284,7 @@ qic_setup(void)
        }
 }
 
-static __init void
-vic_setup_pic(void)
+static __init void vic_setup_pic(void)
 {
        outb(1, VIC_REDIRECT_REGISTER_1);
        /* clear the claim registers for dynamic routing */
@@ -333,7 +321,7 @@ vic_setup_pic(void)
 
        /* ICW2: slave vector base */
        outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1);
-       
+
        /* ICW3: slave ID */
        outb(0x02, 0xA1);
 
@@ -341,19 +329,18 @@ vic_setup_pic(void)
        outb(0x01, 0xA1);
 }
 
-static void
-do_quad_bootstrap(void)
+static void do_quad_bootstrap(void)
 {
-       if(is_cpu_quad() && is_cpu_vic_boot()) {
+       if (is_cpu_quad() && is_cpu_vic_boot()) {
                int i;
                unsigned long flags;
                __u8 cpuid = hard_smp_processor_id();
 
                local_irq_save(flags);
 
-               for(i = 0; i<4; i++) {
+               for (i = 0; i < 4; i++) {
                        /* FIXME: this would be >>3 &0x7 on the 32 way */
-                       if(((cpuid >> 2) & 0x03) == i)
+                       if (((cpuid >> 2) & 0x03) == i)
                                /* don't lower our own mask! */
                                continue;
 
@@ -368,12 +355,10 @@ do_quad_bootstrap(void)
        }
 }
 
-
 /* Set up all the basic stuff: read the SMP config and make all the
  * SMP information reflect only the boot cpu.  All others will be
  * brought on-line later. */
-void __init 
-find_smp_config(void)
+void __init find_smp_config(void)
 {
        int i;
 
@@ -382,24 +367,31 @@ find_smp_config(void)
        printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
 
        /* initialize the CPU structures (moved from smp_boot_cpus) */
-       for(i=0; i<NR_CPUS; i++) {
+       for (i = 0; i < NR_CPUS; i++) {
                cpu_irq_affinity[i] = ~0;
        }
        cpu_online_map = cpumask_of_cpu(boot_cpu_id);
 
        /* The boot CPU must be extended */
-       voyager_extended_vic_processors = 1<<boot_cpu_id;
+       voyager_extended_vic_processors = 1 << boot_cpu_id;
        /* initially, all of the first 8 CPUs can boot */
        voyager_allowed_boot_processors = 0xff;
        /* set up everything for just this CPU, we can alter
         * this as we start the other CPUs later */
        /* now get the CPU disposition from the extended CMOS */
-       cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
-       cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
-       cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16;
-       cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24;
+       cpus_addr(phys_cpu_present_map)[0] =
+           voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
+       cpus_addr(phys_cpu_present_map)[0] |=
+           voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8;
+       cpus_addr(phys_cpu_present_map)[0] |=
+           voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
+                                      2) << 16;
+       cpus_addr(phys_cpu_present_map)[0] |=
+           voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK +
+                                      3) << 24;
        cpu_possible_map = phys_cpu_present_map;
-       printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]);
+       printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n",
+              cpus_addr(phys_cpu_present_map)[0]);
        /* Here we set up the VIC to enable SMP */
        /* enable the CPIs by writing the base vector to their register */
        outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER);
@@ -427,8 +419,7 @@ find_smp_config(void)
 /*
  *     The bootstrap kernel entry code has set these up. Save them
  *     for a given CPU, id is physical */
-void __init
-smp_store_cpu_info(int id)
+void __init smp_store_cpu_info(int id)
 {
        struct cpuinfo_x86 *c = &cpu_data(id);
 
@@ -438,21 +429,19 @@ smp_store_cpu_info(int id)
 }
 
 /* set up the trampoline and return the physical address of the code */
-static __u32 __init
-setup_trampoline(void)
+static __u32 __init setup_trampoline(void)
 {
        /* these two are global symbols in trampoline.S */
        extern const __u8 trampoline_end[];
        extern const __u8 trampoline_data[];
 
-       memcpy((__u8 *)trampoline_base, trampoline_data,
+       memcpy((__u8 *) trampoline_base, trampoline_data,
               trampoline_end - trampoline_data);
-       return virt_to_phys((__u8 *)trampoline_base);
+       return virt_to_phys((__u8 *) trampoline_base);
 }
 
 /* Routine initially called when a non-boot CPU is brought online */
-static void __init
-start_secondary(void *unused)
+static void __init start_secondary(void *unused)
 {
        __u8 cpuid = hard_smp_processor_id();
        /* external functions not defined in the headers */
@@ -464,17 +453,18 @@ start_secondary(void *unused)
        ack_CPI(VIC_CPU_BOOT_CPI);
 
        /* setup the 8259 master slave pair belonging to this CPU ---
-         * we won't actually receive any until the boot CPU
-         * relinquishes it's static routing mask */
+        * we won't actually receive any until the boot CPU
+        * relinquishes its static routing mask */
        vic_setup_pic();
 
        qic_setup();
 
-       if(is_cpu_quad() && !is_cpu_vic_boot()) {
+       if (is_cpu_quad() && !is_cpu_vic_boot()) {
                /* clear the boot CPI */
                __u8 dummy;
 
-               dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
+               dummy =
+                   voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi;
                printk("read dummy %d\n", dummy);
        }
 
@@ -516,7 +506,6 @@ start_secondary(void *unused)
        cpu_idle();
 }
 
-
 /* Routine to kick start the given CPU and wait for it to report ready
  * (or timeout in startup).  When this routine returns, the requested
  * CPU is either fully running and configured or known to be dead.
@@ -524,29 +513,28 @@ start_secondary(void *unused)
  * We call this routine sequentially 1 CPU at a time, so no need for
  * locking */
 
-static void __init
-do_boot_cpu(__u8 cpu)
+static void __init do_boot_cpu(__u8 cpu)
 {
        struct task_struct *idle;
        int timeout;
        unsigned long flags;
-       int quad_boot = (1<<cpu) & voyager_quad_processors 
-               & ~( voyager_extended_vic_processors
-                    & voyager_allowed_boot_processors);
+       int quad_boot = (1 << cpu) & voyager_quad_processors
+           & ~(voyager_extended_vic_processors
+               & voyager_allowed_boot_processors);
 
        /* This is an area in head.S which was used to set up the
         * initial kernel stack.  We need to alter this to give the
         * booting CPU a new stack (taken from its idle process) */
        extern struct {
-               __u8 *esp;
+               __u8 *sp;
                unsigned short ss;
        } stack_start;
        /* This is the format of the CPI IDT gate (in real mode) which
         * we're hijacking to boot the CPU */
-       union   IDTFormat {
+       union IDTFormat {
                struct seg {
-                       __u16   Offset;
-                       __u16   Segment;
+                       __u16 Offset;
+                       __u16 Segment;
                } idt;
                __u32 val;
        } hijack_source;
@@ -565,37 +553,44 @@ do_boot_cpu(__u8 cpu)
        alternatives_smp_switch(1);
 
        idle = fork_idle(cpu);
-       if(IS_ERR(idle))
+       if (IS_ERR(idle))
                panic("failed fork for CPU%d", cpu);
-       idle->thread.eip = (unsigned long) start_secondary;
+       idle->thread.ip = (unsigned long)start_secondary;
        /* init_tasks (in sched.c) is indexed logically */
-       stack_start.esp = (void *) idle->thread.esp;
+       stack_start.sp = (void *)idle->thread.sp;
 
        init_gdt(cpu);
-       per_cpu(current_task, cpu) = idle;
+       per_cpu(current_task, cpu) = idle;
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        irq_ctx_init(cpu);
 
        /* Note: Don't modify initial ss override */
-       VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, 
+       VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu,
                (unsigned long)hijack_source.val, hijack_source.idt.Segment,
-               hijack_source.idt.Offset, stack_start.esp));
+               hijack_source.idt.Offset, stack_start.sp));
 
        /* init lowmem identity mapping */
        clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
                        min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
        flush_tlb_all();
 
-       if(quad_boot) {
+       if (quad_boot) {
                printk("CPU %d: non extended Quad boot\n", cpu);
-               hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4);
+               hijack_vector =
+                   (__u32 *)
+                   phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4);
                *hijack_vector = hijack_source.val;
        } else {
                printk("CPU%d: extended VIC boot\n", cpu);
-               hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4);
+               hijack_vector =
+                   (__u32 *)
+                   phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4);
                *hijack_vector = hijack_source.val;
                /* VIC errata, may also receive interrupt at this address */
-               hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4);
+               hijack_vector =
+                   (__u32 *)
+                   phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI +
+                                 VIC_DEFAULT_CPI_BASE) * 4);
                *hijack_vector = hijack_source.val;
        }
        /* All non-boot CPUs start with interrupts fully masked.  Need
@@ -603,73 +598,76 @@ do_boot_cpu(__u8 cpu)
         * this in the VIC by masquerading as the processor we're
         * about to boot and lowering its interrupt mask */
        local_irq_save(flags);
-       if(quad_boot) {
+       if (quad_boot) {
                send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI);
        } else {
                outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID);
                /* here we're altering registers belonging to `cpu' */
-               
+
                outb(VIC_BOOT_INTERRUPT_MASK, 0x21);
                /* now go back to our original identity */
                outb(boot_cpu_id, VIC_PROCESSOR_ID);
 
                /* and boot the CPU */
 
-               send_CPI((1<<cpu), VIC_CPU_BOOT_CPI);
+               send_CPI((1 << cpu), VIC_CPU_BOOT_CPI);
        }
        cpu_booted_map = 0;
        local_irq_restore(flags);
 
        /* now wait for it to become ready (or timeout) */
-       for(timeout = 0; timeout < 50000; timeout++) {
-               if(cpu_booted_map)
+       for (timeout = 0; timeout < 50000; timeout++) {
+               if (cpu_booted_map)
                        break;
                udelay(100);
        }
        /* reset the page table */
        zap_low_mappings();
-         
+
        if (cpu_booted_map) {
                VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n",
                        cpu, smp_processor_id()));
-       
+
                printk("CPU%d: ", cpu);
                print_cpu_info(&cpu_data(cpu));
                wmb();
                cpu_set(cpu, cpu_callout_map);
                cpu_set(cpu, cpu_present_map);
-       }
-       else {
+       } else {
                printk("CPU%d FAILED TO BOOT: ", cpu);
-               if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5)
+               if (*
+                   ((volatile unsigned char *)phys_to_virt(start_phys_address))
+                   == 0xA5)
                        printk("Stuck.\n");
                else
                        printk("Not responding.\n");
-               
+
                cpucount--;
        }
 }
 
-void __init
-smp_boot_cpus(void)
+void __init smp_boot_cpus(void)
 {
        int i;
 
        /* CAT BUS initialisation must be done after the memory */
        /* FIXME: The L4 has a catbus too, it just needs to be
         * accessed in a totally different way */
-       if(voyager_level == 5) {
+       if (voyager_level == 5) {
                voyager_cat_init();
 
                /* now that the cat has probed the Voyager System Bus, sanity
                 * check the cpu map */
-               if( ((voyager_quad_processors | voyager_extended_vic_processors)
-                    & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) {
+               if (((voyager_quad_processors | voyager_extended_vic_processors)
+                    & cpus_addr(phys_cpu_present_map)[0]) !=
+                   cpus_addr(phys_cpu_present_map)[0]) {
                        /* should panic */
-                       printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n");
+                       printk("\n\n***WARNING*** "
+                              "Sanity check of CPU present map FAILED\n");
                }
-       } else if(voyager_level == 4)
-               voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0];
+       } else if (voyager_level == 4)
+               voyager_extended_vic_processors =
+                   cpus_addr(phys_cpu_present_map)[0];
 
        /* this sets up the idle task to run on the current cpu */
        voyager_extended_cpus = 1;
@@ -678,14 +676,14 @@ smp_boot_cpus(void)
        //global_irq_holder = boot_cpu_id;
 
        /* FIXME: Need to do something about this but currently only works
-        * on CPUs with a tsc which none of mine have. 
-       smp_tune_scheduling();
+        * on CPUs with a tsc which none of mine have.
+        smp_tune_scheduling();
         */
        smp_store_cpu_info(boot_cpu_id);
        printk("CPU%d: ", boot_cpu_id);
        print_cpu_info(&cpu_data(boot_cpu_id));
 
-       if(is_cpu_quad()) {
+       if (is_cpu_quad()) {
                /* booting on a Quad CPU */
                printk("VOYAGER SMP: Boot CPU is Quad\n");
                qic_setup();
@@ -697,11 +695,11 @@ smp_boot_cpus(void)
 
        cpu_set(boot_cpu_id, cpu_online_map);
        cpu_set(boot_cpu_id, cpu_callout_map);
-       
-       /* loop over all the extended VIC CPUs and boot them.  The 
+
+       /* loop over all the extended VIC CPUs and boot them.  The
         * Quad CPUs must be bootstrapped by their extended VIC cpu */
-       for(i = 0; i < NR_CPUS; i++) {
-               if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
+       for (i = 0; i < NR_CPUS; i++) {
+               if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
                        continue;
                do_boot_cpu(i);
                /* This udelay seems to be needed for the Quad boots
@@ -715,25 +713,26 @@ smp_boot_cpus(void)
                for (i = 0; i < NR_CPUS; i++)
                        if (cpu_isset(i, cpu_online_map))
                                bogosum += cpu_data(i).loops_per_jiffy;
-               printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-                       cpucount+1,
-                       bogosum/(500000/HZ),
-                       (bogosum/(5000/HZ))%100);
+               printk(KERN_INFO "Total of %d processors activated "
+                      "(%lu.%02lu BogoMIPS).\n",
+                      cpucount + 1, bogosum / (500000 / HZ),
+                      (bogosum / (5000 / HZ)) % 100);
        }
        voyager_extended_cpus = hweight32(voyager_extended_vic_processors);
-       printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus);
+       printk("VOYAGER: Extended (interrupt handling CPUs): "
+              "%d, non-extended: %d\n", voyager_extended_cpus,
+              num_booting_cpus() - voyager_extended_cpus);
        /* that's it, switch to symmetric mode */
        outb(0, VIC_PRIORITY_REGISTER);
        outb(0, VIC_CLAIM_REGISTER_0);
        outb(0, VIC_CLAIM_REGISTER_1);
-       
+
        VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus()));
 }
 
 /* Reload the secondary CPUs task structure (this function does not
  * return ) */
-void __init 
-initialize_secondary(void)
+void __init initialize_secondary(void)
 {
 #if 0
        // AC kernels only
@@ -745,11 +744,9 @@ initialize_secondary(void)
         * basically just the stack pointer and the eip.
         */
 
-       asm volatile(
-               "movl %0,%%esp\n\t"
-               "jmp *%1"
-               :
-               :"r" (current->thread.esp),"r" (current->thread.eip));
+       asm volatile ("movl %0,%%esp\n\t"
+                     "jmp *%1"::"r" (current->thread.sp),
+                     "r"(current->thread.ip));
 }
 
 /* handle a Voyager SYS_INT -- If we don't, the base board will
@@ -758,25 +755,23 @@ initialize_secondary(void)
  * System interrupts occur because some problem was detected on the
  * various busses.  To find out what you have to probe all the
  * hardware via the CAT bus.  FIXME: At the moment we do nothing. */
-fastcall void
-smp_vic_sys_interrupt(struct pt_regs *regs)
+void smp_vic_sys_interrupt(struct pt_regs *regs)
 {
        ack_CPI(VIC_SYS_INT);
-       printk("Voyager SYSTEM INTERRUPT\n");   
+       printk("Voyager SYSTEM INTERRUPT\n");
 }
 
 /* Handle a voyager CMN_INT; These interrupts occur either because of
  * a system status change or because a single bit memory error
  * occurred.  FIXME: At the moment, ignore all this. */
-fastcall void
-smp_vic_cmn_interrupt(struct pt_regs *regs)
+void smp_vic_cmn_interrupt(struct pt_regs *regs)
 {
        static __u8 in_cmn_int = 0;
        static DEFINE_SPINLOCK(cmn_int_lock);
 
        /* common ints are broadcast, so make sure we only do this once */
        _raw_spin_lock(&cmn_int_lock);
-       if(in_cmn_int)
+       if (in_cmn_int)
                goto unlock_end;
 
        in_cmn_int++;
@@ -784,12 +779,12 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
 
        VDEBUG(("Voyager COMMON INTERRUPT\n"));
 
-       if(voyager_level == 5)
+       if (voyager_level == 5)
                voyager_cat_do_common_interrupt();
 
        _raw_spin_lock(&cmn_int_lock);
        in_cmn_int = 0;
- unlock_end:
+      unlock_end:
        _raw_spin_unlock(&cmn_int_lock);
        ack_CPI(VIC_CMN_INT);
 }
@@ -797,26 +792,23 @@ smp_vic_cmn_interrupt(struct pt_regs *regs)
 /*
  * Reschedule call back. Nothing to do, all the work is done
  * automatically when we return from the interrupt.  */
-static void
-smp_reschedule_interrupt(void)
+static void smp_reschedule_interrupt(void)
 {
        /* do nothing */
 }
 
-static struct mm_struct * flush_mm;
+static struct mm_struct *flush_mm;
 static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
-#define FLUSH_ALL      0xffffffff
 
 /*
- * We cannot call mmdrop() because we are in interrupt context, 
+ * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  *
  * We need to reload %cr3 since the page tables may be going
  * away from under us..
  */
-static inline void
-leave_mm (unsigned long cpu)
+static inline void voyager_leave_mm(unsigned long cpu)
 {
        if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
                BUG();
@@ -824,12 +816,10 @@ leave_mm (unsigned long cpu)
        load_cr3(swapper_pg_dir);
 }
 
-
 /*
  * Invalidate call-back
  */
-static void 
-smp_invalidate_interrupt(void)
+static void smp_invalidate_interrupt(void)
 {
        __u8 cpu = smp_processor_id();
 
@@ -837,18 +827,18 @@ smp_invalidate_interrupt(void)
                return;
        /* This will flood messages.  Don't uncomment unless you see
         * Problems with cross cpu invalidation
-       VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
-               smp_processor_id()));
-       */
+        VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n",
+        smp_processor_id()));
+        */
 
        if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
                if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
-                       if (flush_va == FLUSH_ALL)
+                       if (flush_va == TLB_FLUSH_ALL)
                                local_flush_tlb();
                        else
                                __flush_tlb_one(flush_va);
                } else
-                       leave_mm(cpu);
+                       voyager_leave_mm(cpu);
        }
        smp_mb__before_clear_bit();
        clear_bit(cpu, &smp_invalidate_needed);
@@ -857,11 +847,10 @@ smp_invalidate_interrupt(void)
 
 /* All the new flush operations for 2.4 */
 
-
 /* This routine is called with a physical cpu mask */
 static void
-voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
-                         unsigned long va)
+voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm,
+                        unsigned long va)
 {
        int stuck = 50000;
 
@@ -875,7 +864,7 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
                BUG();
 
        spin_lock(&tlbstate_lock);
-       
+
        flush_mm = mm;
        flush_va = va;
        atomic_set_mask(cpumask, &smp_invalidate_needed);
@@ -887,23 +876,23 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm,
 
        while (smp_invalidate_needed) {
                mb();
-               if(--stuck == 0) {
-                       printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id());
+               if (--stuck == 0) {
+                       printk("***WARNING*** Stuck doing invalidate CPI "
+                              "(CPU%d)\n", smp_processor_id());
                        break;
                }
        }
 
        /* Uncomment only to debug invalidation problems
-       VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
-       */
+          VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu));
+        */
 
        flush_mm = NULL;
        flush_va = 0;
        spin_unlock(&tlbstate_lock);
 }
 
-void
-flush_tlb_current_task(void)
+void flush_tlb_current_task(void)
 {
        struct mm_struct *mm = current->mm;
        unsigned long cpu_mask;
@@ -913,14 +902,12 @@ flush_tlb_current_task(void)
        cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
        local_flush_tlb();
        if (cpu_mask)
-               voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+               voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 
        preempt_enable();
 }
 
-
-void
-flush_tlb_mm (struct mm_struct * mm)
+void flush_tlb_mm(struct mm_struct *mm)
 {
        unsigned long cpu_mask;
 
@@ -932,15 +919,15 @@ flush_tlb_mm (struct mm_struct * mm)
                if (current->mm)
                        local_flush_tlb();
                else
-                       leave_mm(smp_processor_id());
+                       voyager_leave_mm(smp_processor_id());
        }
        if (cpu_mask)
-               voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+               voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 
        preempt_enable();
 }
 
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long cpu_mask;
@@ -949,10 +936,10 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 
        cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id());
        if (current->active_mm == mm) {
-               if(current->mm)
+               if (current->mm)
                        __flush_tlb_one(va);
-                else
-                       leave_mm(smp_processor_id());
+               else
+                       voyager_leave_mm(smp_processor_id());
        }
 
        if (cpu_mask)
@@ -960,21 +947,21 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
 
        preempt_enable();
 }
+
 EXPORT_SYMBOL(flush_tlb_page);
 
 /* enable the requested IRQs */
-static void
-smp_enable_irq_interrupt(void)
+static void smp_enable_irq_interrupt(void)
 {
        __u8 irq;
        __u8 cpu = get_cpu();
 
        VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu,
-              vic_irq_enable_mask[cpu]));
+               vic_irq_enable_mask[cpu]));
 
        spin_lock(&vic_irq_lock);
-       for(irq = 0; irq < 16; irq++) {
-               if(vic_irq_enable_mask[cpu] & (1<<irq))
+       for (irq = 0; irq < 16; irq++) {
+               if (vic_irq_enable_mask[cpu] & (1 << irq))
                        enable_local_vic_irq(irq);
        }
        vic_irq_enable_mask[cpu] = 0;
@@ -982,17 +969,16 @@ smp_enable_irq_interrupt(void)
 
        put_cpu_no_resched();
 }
-       
+
 /*
  *     CPU halt call-back
  */
-static void
-smp_stop_cpu_function(void *dummy)
+static void smp_stop_cpu_function(void *dummy)
 {
        VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id()));
        cpu_clear(smp_processor_id(), cpu_online_map);
        local_irq_disable();
-       for(;;)
+       for (;;)
                halt();
 }
 
@@ -1006,14 +992,13 @@ struct call_data_struct {
        int wait;
 };
 
-static struct call_data_struct * call_data;
+static struct call_data_struct *call_data;
 
 /* execute a thread on a new CPU.  The function to be called must be
  * previously set up.  This is used to schedule a function for
  * execution on all CPUs - set up the function then broadcast a
  * function_interrupt CPI to come here on each CPU */
-static void
-smp_call_function_interrupt(void)
+static void smp_call_function_interrupt(void)
 {
        void (*func) (void *info) = call_data->func;
        void *info = call_data->info;
@@ -1027,16 +1012,17 @@ smp_call_function_interrupt(void)
         * about to execute the function
         */
        mb();
-       if(!test_and_clear_bit(cpu, &call_data->started)) {
+       if (!test_and_clear_bit(cpu, &call_data->started)) {
                /* If the bit wasn't set, this could be a replay */
-               printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion with no call pending\n", cpu);
+               printk(KERN_WARNING "VOYAGER SMP: CPU %d received call function"
+                      " with no call pending\n", cpu);
                return;
        }
        /*
         * At this point the info structure may be out of scope unless wait==1
         */
        irq_enter();
-       (*func)(info);
+       (*func) (info);
        __get_cpu_var(irq_stat).irq_call_count++;
        irq_exit();
        if (wait) {
@@ -1046,14 +1032,13 @@ smp_call_function_interrupt(void)
 }
 
 static int
-voyager_smp_call_function_mask (cpumask_t cpumask,
-                               void (*func) (void *info), void *info,
-                               int wait)
+voyager_smp_call_function_mask(cpumask_t cpumask,
+                              void (*func) (void *info), void *info, int wait)
 {
        struct call_data_struct data;
        u32 mask = cpus_addr(cpumask)[0];
 
-       mask &= ~(1<<smp_processor_id());
+       mask &= ~(1 << smp_processor_id());
 
        if (!mask)
                return 0;
@@ -1093,7 +1078,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
  * so we use the system clock to interrupt one processor, which in
  * turn, broadcasts a timer CPI to all the others --- we receive that
  * CPI here.  We don't use this actually for counting so losing
- * ticks doesn't matter 
+ * ticks doesn't matter
  *
  * FIXME: For those CPUs which actually have a local APIC, we could
  * try to use it to trigger this interrupt instead of having to
@@ -1101,8 +1086,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask,
  * no local APIC, so I can't do this
  *
  * This function is currently a placeholder and is unused in the code */
-fastcall void 
-smp_apic_timer_interrupt(struct pt_regs *regs)
+void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
        wrapper_smp_local_timer_interrupt();
@@ -1110,8 +1094,7 @@ smp_apic_timer_interrupt(struct pt_regs *regs)
 }
 
 /* All of the QUAD interrupt GATES */
-fastcall void
-smp_qic_timer_interrupt(struct pt_regs *regs)
+void smp_qic_timer_interrupt(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
        ack_QIC_CPI(QIC_TIMER_CPI);
@@ -1119,127 +1102,112 @@ smp_qic_timer_interrupt(struct pt_regs *regs)
        set_irq_regs(old_regs);
 }
 
-fastcall void
-smp_qic_invalidate_interrupt(struct pt_regs *regs)
+void smp_qic_invalidate_interrupt(struct pt_regs *regs)
 {
        ack_QIC_CPI(QIC_INVALIDATE_CPI);
        smp_invalidate_interrupt();
 }
 
-fastcall void
-smp_qic_reschedule_interrupt(struct pt_regs *regs)
+void smp_qic_reschedule_interrupt(struct pt_regs *regs)
 {
        ack_QIC_CPI(QIC_RESCHEDULE_CPI);
        smp_reschedule_interrupt();
 }
 
-fastcall void
-smp_qic_enable_irq_interrupt(struct pt_regs *regs)
+void smp_qic_enable_irq_interrupt(struct pt_regs *regs)
 {
        ack_QIC_CPI(QIC_ENABLE_IRQ_CPI);
        smp_enable_irq_interrupt();
 }
 
-fastcall void
-smp_qic_call_function_interrupt(struct pt_regs *regs)
+void smp_qic_call_function_interrupt(struct pt_regs *regs)
 {
        ack_QIC_CPI(QIC_CALL_FUNCTION_CPI);
        smp_call_function_interrupt();
 }
 
-fastcall void
-smp_vic_cpi_interrupt(struct pt_regs *regs)
+void smp_vic_cpi_interrupt(struct pt_regs *regs)
 {
        struct pt_regs *old_regs = set_irq_regs(regs);
        __u8 cpu = smp_processor_id();
 
-       if(is_cpu_quad())
+       if (is_cpu_quad())
                ack_QIC_CPI(VIC_CPI_LEVEL0);
        else
                ack_VIC_CPI(VIC_CPI_LEVEL0);
 
-       if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
+       if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu]))
                wrapper_smp_local_timer_interrupt();
-       if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
+       if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu]))
                smp_invalidate_interrupt();
-       if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
+       if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu]))
                smp_reschedule_interrupt();
-       if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
+       if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu]))
                smp_enable_irq_interrupt();
-       if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
+       if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu]))
                smp_call_function_interrupt();
        set_irq_regs(old_regs);
 }
 
-static void
-do_flush_tlb_all(void* info)
+static void do_flush_tlb_all(void *info)
 {
        unsigned long cpu = smp_processor_id();
 
        __flush_tlb_all();
        if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
-               leave_mm(cpu);
+               voyager_leave_mm(cpu);
 }
 
-
 /* flush the TLB of every active CPU in the system */
-void
-flush_tlb_all(void)
+void flush_tlb_all(void)
 {
        on_each_cpu(do_flush_tlb_all, 0, 1, 1);
 }
 
 /* used to set up the trampoline for other CPUs when the memory manager
  * is sorted out */
-void __init
-smp_alloc_memory(void)
+void __init smp_alloc_memory(void)
 {
-       trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE);
-       if(__pa(trampoline_base) >= 0x93000)
+       trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE);
+       if (__pa(trampoline_base) >= 0x93000)
                BUG();
 }
 
 /* send a reschedule CPI to one CPU by physical CPU number*/
-static void
-voyager_smp_send_reschedule(int cpu)
+static void voyager_smp_send_reschedule(int cpu)
 {
        send_one_CPI(cpu, VIC_RESCHEDULE_CPI);
 }
 
-
-int
-hard_smp_processor_id(void)
+int hard_smp_processor_id(void)
 {
        __u8 i;
        __u8 cpumask = inb(VIC_PROC_WHO_AM_I);
-       if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
+       if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER)
                return cpumask & 0x1F;
 
-       for(i = 0; i < 8; i++) {
-               if(cpumask & (1<<i))
+       for (i = 0; i < 8; i++) {
+               if (cpumask & (1 << i))
                        return i;
        }
        printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask);
        return 0;
 }
 
-int
-safe_smp_processor_id(void)
+int safe_smp_processor_id(void)
 {
        return hard_smp_processor_id();
 }
 
 /* broadcast a halt to all other CPUs */
-static void
-voyager_smp_send_stop(void)
+static void voyager_smp_send_stop(void)
 {
        smp_call_function(smp_stop_cpu_function, NULL, 1, 1);
 }
 
 /* this function is triggered in time.c when a clock tick fires
  * we need to re-broadcast the tick to all CPUs */
-void
-smp_vic_timer_interrupt(void)
+void smp_vic_timer_interrupt(void)
 {
        send_CPI_allbutself(VIC_TIMER_CPI);
        smp_local_timer_interrupt();
@@ -1253,8 +1221,7 @@ smp_vic_timer_interrupt(void)
  * multiplier is 1 and it can be changed by writing the new multiplier
  * value into /proc/profile.
  */
-void
-smp_local_timer_interrupt(void)
+void smp_local_timer_interrupt(void)
 {
        int cpu = smp_processor_id();
        long weight;
@@ -1269,18 +1236,18 @@ smp_local_timer_interrupt(void)
                 *
                 * Interrupts are already masked off at this point.
                 */
-               per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu);
+               per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
                if (per_cpu(prof_counter, cpu) !=
-                                       per_cpu(prof_old_multiplier, cpu)) {
+                   per_cpu(prof_old_multiplier, cpu)) {
                        /* FIXME: need to update the vic timer tick here */
                        per_cpu(prof_old_multiplier, cpu) =
-                                               per_cpu(prof_counter, cpu);
+                           per_cpu(prof_counter, cpu);
                }
 
                update_process_times(user_mode_vm(get_irq_regs()));
        }
 
-       if( ((1<<cpu) & voyager_extended_vic_processors) == 0)
+       if (((1 << cpu) & voyager_extended_vic_processors) == 0)
                /* only extended VIC processors participate in
                 * interrupt distribution */
                return;
@@ -1296,12 +1263,12 @@ smp_local_timer_interrupt(void)
         * we can take more than 100K local irqs per second on a 100 MHz P5.
         */
 
-       if((++vic_tick[cpu] & 0x7) != 0)
+       if ((++vic_tick[cpu] & 0x7) != 0)
                return;
        /* get here every 16 ticks (about every 1/6 of a second) */
 
        /* Change our priority to give someone else a chance at getting
-         * the IRQ. The algorithm goes like this:
+        * the IRQ. The algorithm goes like this:
         *
         * In the VIC, the dynamically routed interrupt is always
         * handled by the lowest priority eligible (i.e. receiving
@@ -1325,18 +1292,18 @@ smp_local_timer_interrupt(void)
         * affinity code since we now try to even up the interrupt
         * counts when an affinity binding is keeping them on a
         * particular CPU*/
-       weight = (vic_intr_count[cpu]*voyager_extended_cpus
+       weight = (vic_intr_count[cpu] * voyager_extended_cpus
                  - vic_intr_total) >> 4;
        weight += 4;
-       if(weight > 7)
+       if (weight > 7)
                weight = 7;
-       if(weight < 0)
+       if (weight < 0)
                weight = 0;
-       
-       outb((__u8)weight, VIC_PRIORITY_REGISTER);
+
+       outb((__u8) weight, VIC_PRIORITY_REGISTER);
 
 #ifdef VOYAGER_DEBUG
-       if((vic_tick[cpu] & 0xFFF) == 0) {
+       if ((vic_tick[cpu] & 0xFFF) == 0) {
                /* print this message roughly every 25 secs */
                printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n",
                       cpu, vic_tick[cpu], weight);
@@ -1345,15 +1312,14 @@ smp_local_timer_interrupt(void)
 }
 
 /* setup the profiling timer */
-int 
-setup_profiling_timer(unsigned int multiplier)
+int setup_profiling_timer(unsigned int multiplier)
 {
        int i;
 
-       if ( (!multiplier))
+       if ((!multiplier))
                return -EINVAL;
 
-       /* 
+       /*
         * Set the new multiplier for each CPU. CPUs don't start using the
         * new values until the next timer interrupt in which they do process
         * accounting.
@@ -1367,15 +1333,13 @@ setup_profiling_timer(unsigned int multiplier)
 /* This is a bit of a mess, but forced on us by the genirq changes
  * there's no genirq handler that really does what voyager wants
  * so hack it up with the simple IRQ handler */
-static void fastcall
-handle_vic_irq(unsigned int irq, struct irq_desc *desc)
+static void handle_vic_irq(unsigned int irq, struct irq_desc *desc)
 {
        before_handle_vic_irq(irq);
        handle_simple_irq(irq, desc);
        after_handle_vic_irq(irq);
 }
 
-
 /*  The CPIs are handled in the per cpu 8259s, so they must be
  *  enabled to be received: FIX: enabling the CPIs in the early
  *  boot sequence interferes with bug checking; enable them later
@@ -1385,13 +1349,12 @@ handle_vic_irq(unsigned int irq, struct irq_desc *desc)
 #define QIC_SET_GATE(cpi, vector) \
        set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
 
-void __init
-smp_intr_init(void)
+void __init smp_intr_init(void)
 {
        int i;
 
        /* initialize the per cpu irq mask to all disabled */
-       for(i = 0; i < NR_CPUS; i++)
+       for (i = 0; i < NR_CPUS; i++)
                vic_irq_mask[i] = 0xFFFF;
 
        VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
@@ -1404,42 +1367,40 @@ smp_intr_init(void)
        QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt);
        QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt);
        QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt);
-       
 
-       /* now put the VIC descriptor into the first 48 IRQs 
+       /* now put the VIC descriptor into the first 48 IRQs
         *
         * This is for later: first 16 correspond to PC IRQs; next 16
         * are Primary MC IRQs and final 16 are Secondary MC IRQs */
-       for(i = 0; i < 48; i++)
+       for (i = 0; i < 48; i++)
                set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq);
 }
 
 /* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per
  * processor to receive CPI */
-static void
-send_CPI(__u32 cpuset, __u8 cpi)
+static void send_CPI(__u32 cpuset, __u8 cpi)
 {
        int cpu;
        __u32 quad_cpuset = (cpuset & voyager_quad_processors);
 
-       if(cpi < VIC_START_FAKE_CPI) {
-               /* fake CPI are only used for booting, so send to the 
+       if (cpi < VIC_START_FAKE_CPI) {
+               /* fake CPI are only used for booting, so send to the
                 * extended quads as well---Quads must be VIC booted */
-               outb((__u8)(cpuset), VIC_CPI_Registers[cpi]);
+               outb((__u8) (cpuset), VIC_CPI_Registers[cpi]);
                return;
        }
-       if(quad_cpuset)
+       if (quad_cpuset)
                send_QIC_CPI(quad_cpuset, cpi);
        cpuset &= ~quad_cpuset;
        cpuset &= 0xff;         /* only first 8 CPUs valid for VIC CPI */
-       if(cpuset == 0)
+       if (cpuset == 0)
                return;
        for_each_online_cpu(cpu) {
-               if(cpuset & (1<<cpu))
+               if (cpuset & (1 << cpu))
                        set_bit(cpi, &vic_cpi_mailbox[cpu]);
        }
-       if(cpuset)
-               outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
+       if (cpuset)
+               outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]);
 }
 
 /* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and
@@ -1448,20 +1409,19 @@ send_CPI(__u32 cpuset, __u8 cpi)
  * DON'T make this inline otherwise the cache line read will be
  * optimised away
  * */
-static int
-ack_QIC_CPI(__u8 cpi) {
+static int ack_QIC_CPI(__u8 cpi)
+{
        __u8 cpu = hard_smp_processor_id();
 
        cpi &= 7;
 
-       outb(1<<cpi, QIC_INTERRUPT_CLEAR1);
+       outb(1 << cpi, QIC_INTERRUPT_CLEAR1);
        return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi;
 }
 
-static void
-ack_special_QIC_CPI(__u8 cpi)
+static void ack_special_QIC_CPI(__u8 cpi)
 {
-       switch(cpi) {
+       switch (cpi) {
        case VIC_CMN_INT:
                outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0);
                break;
@@ -1474,8 +1434,7 @@ ack_special_QIC_CPI(__u8 cpi)
 }
 
 /* Acknowledge receipt of CPI in the VIC (essentially an EOI) */
-static void
-ack_VIC_CPI(__u8 cpi)
+static void ack_VIC_CPI(__u8 cpi)
 {
 #ifdef VOYAGER_DEBUG
        unsigned long flags;
@@ -1484,17 +1443,17 @@ ack_VIC_CPI(__u8 cpi)
 
        local_irq_save(flags);
        isr = vic_read_isr();
-       if((isr & (1<<(cpi &7))) == 0) {
+       if ((isr & (1 << (cpi & 7))) == 0) {
                printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi);
        }
 #endif
        /* send specific EOI; the two system interrupts have
         * bit 4 set for a separate vector but behave as the
         * corresponding 3 bit intr */
-       outb_p(0x60|(cpi & 7),0x20);
+       outb_p(0x60 | (cpi & 7), 0x20);
 
 #ifdef VOYAGER_DEBUG
-       if((vic_read_isr() & (1<<(cpi &7))) != 0) {
+       if ((vic_read_isr() & (1 << (cpi & 7))) != 0) {
                printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi);
        }
        local_irq_restore(flags);
@@ -1502,12 +1461,11 @@ ack_VIC_CPI(__u8 cpi)
 }
 
 /* cribbed with thanks from irq.c */
-#define __byte(x,y)    (((unsigned char *)&(y))[x])
+#define __byte(x,y)    (((unsigned char *)&(y))[x])
 #define cached_21(cpu) (__byte(0,vic_irq_mask[cpu]))
 #define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu]))
 
-static unsigned int
-startup_vic_irq(unsigned int irq)
+static unsigned int startup_vic_irq(unsigned int irq)
 {
        unmask_vic_irq(irq);
 
@@ -1535,13 +1493,12 @@ startup_vic_irq(unsigned int irq)
  *    broadcast an Interrupt enable CPI which causes all other CPUs to
  *    adjust their masks accordingly.  */
 
-static void
-unmask_vic_irq(unsigned int irq)
+static void unmask_vic_irq(unsigned int irq)
 {
        /* linux doesn't do processor-irq affinity, so enable on
         * all CPUs we know about */
        int cpu = smp_processor_id(), real_cpu;
-       __u16 mask = (1<<irq);
+       __u16 mask = (1 << irq);
        __u32 processorList = 0;
        unsigned long flags;
 
@@ -1549,78 +1506,72 @@ unmask_vic_irq(unsigned int irq)
                irq, cpu, cpu_irq_affinity[cpu]));
        spin_lock_irqsave(&vic_irq_lock, flags);
        for_each_online_cpu(real_cpu) {
-               if(!(voyager_extended_vic_processors & (1<<real_cpu)))
+               if (!(voyager_extended_vic_processors & (1 << real_cpu)))
                        continue;
-               if(!(cpu_irq_affinity[real_cpu] & mask)) {
+               if (!(cpu_irq_affinity[real_cpu] & mask)) {
                        /* irq has no affinity for this CPU, ignore */
                        continue;
                }
-               if(real_cpu == cpu) {
+               if (real_cpu == cpu) {
                        enable_local_vic_irq(irq);
-               }
-               else if(vic_irq_mask[real_cpu] & mask) {
+               } else if (vic_irq_mask[real_cpu] & mask) {
                        vic_irq_enable_mask[real_cpu] |= mask;
-                       processorList |= (1<<real_cpu);
+                       processorList |= (1 << real_cpu);
                }
        }
        spin_unlock_irqrestore(&vic_irq_lock, flags);
-       if(processorList)
+       if (processorList)
                send_CPI(processorList, VIC_ENABLE_IRQ_CPI);
 }
 
-static void
-mask_vic_irq(unsigned int irq)
+static void mask_vic_irq(unsigned int irq)
 {
        /* lazy disable, do nothing */
 }
 
-static void
-enable_local_vic_irq(unsigned int irq)
+static void enable_local_vic_irq(unsigned int irq)
 {
        __u8 cpu = smp_processor_id();
        __u16 mask = ~(1 << irq);
        __u16 old_mask = vic_irq_mask[cpu];
 
        vic_irq_mask[cpu] &= mask;
-       if(vic_irq_mask[cpu] == old_mask)
+       if (vic_irq_mask[cpu] == old_mask)
                return;
 
        VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n",
                irq, cpu));
 
        if (irq & 8) {
-               outb_p(cached_A1(cpu),0xA1);
+               outb_p(cached_A1(cpu), 0xA1);
                (void)inb_p(0xA1);
-       }
-       else {
-               outb_p(cached_21(cpu),0x21);
+       } else {
+               outb_p(cached_21(cpu), 0x21);
                (void)inb_p(0x21);
        }
 }
 
-static void
-disable_local_vic_irq(unsigned int irq)
+static void disable_local_vic_irq(unsigned int irq)
 {
        __u8 cpu = smp_processor_id();
        __u16 mask = (1 << irq);
        __u16 old_mask = vic_irq_mask[cpu];
 
-       if(irq == 7)
+       if (irq == 7)
                return;
 
        vic_irq_mask[cpu] |= mask;
-       if(old_mask == vic_irq_mask[cpu])
+       if (old_mask == vic_irq_mask[cpu])
                return;
 
        VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n",
                irq, cpu));
 
        if (irq & 8) {
-               outb_p(cached_A1(cpu),0xA1);
+               outb_p(cached_A1(cpu), 0xA1);
                (void)inb_p(0xA1);
-       }
-       else {
-               outb_p(cached_21(cpu),0x21);
+       } else {
+               outb_p(cached_21(cpu), 0x21);
                (void)inb_p(0x21);
        }
 }
@@ -1631,8 +1582,7 @@ disable_local_vic_irq(unsigned int irq)
  * interrupt in the vic, so we merely set a flag (IRQ_DISABLED).  If
  * this interrupt actually comes in, then we mask and ack here to push
  * the interrupt off to another CPU */
-static void
-before_handle_vic_irq(unsigned int irq)
+static void before_handle_vic_irq(unsigned int irq)
 {
        irq_desc_t *desc = irq_desc + irq;
        __u8 cpu = smp_processor_id();
@@ -1641,16 +1591,16 @@ before_handle_vic_irq(unsigned int irq)
        vic_intr_total++;
        vic_intr_count[cpu]++;
 
-       if(!(cpu_irq_affinity[cpu] & (1<<irq))) {
+       if (!(cpu_irq_affinity[cpu] & (1 << irq))) {
                /* The irq is not in our affinity mask, push it off
                 * onto another CPU */
-               VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n",
-                       irq, cpu));
+               VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d "
+                       "on cpu %d\n", irq, cpu));
                disable_local_vic_irq(irq);
                /* set IRQ_INPROGRESS to prevent the handler in irq.c from
                 * actually calling the interrupt routine */
                desc->status |= IRQ_REPLAY | IRQ_INPROGRESS;
-       } else if(desc->status & IRQ_DISABLED) {
+       } else if (desc->status & IRQ_DISABLED) {
                /* Damn, the interrupt actually arrived, do the lazy
                 * disable thing. The interrupt routine in irq.c will
                 * not handle a IRQ_DISABLED interrupt, so nothing more
@@ -1667,8 +1617,7 @@ before_handle_vic_irq(unsigned int irq)
 }
 
 /* Finish the VIC interrupt: basically mask */
-static void
-after_handle_vic_irq(unsigned int irq)
+static void after_handle_vic_irq(unsigned int irq)
 {
        irq_desc_t *desc = irq_desc + irq;
 
@@ -1685,11 +1634,11 @@ after_handle_vic_irq(unsigned int irq)
 #ifdef VOYAGER_DEBUG
                /* DEBUG: before we ack, check what's in progress */
                isr = vic_read_isr();
-               if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) {
+               if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) {
                        int i;
                        __u8 cpu = smp_processor_id();
                        __u8 real_cpu;
-                       int mask; /* Um... initialize me??? --RR */
+                       int mask;       /* Um... initialize me??? --RR */
 
                        printk("VOYAGER SMP: CPU%d lost interrupt %d\n",
                               cpu, irq);
@@ -1698,9 +1647,10 @@ after_handle_vic_irq(unsigned int irq)
                                outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu,
                                     VIC_PROCESSOR_ID);
                                isr = vic_read_isr();
-                               if(isr & (1<<irq)) {
-                                       printk("VOYAGER SMP: CPU%d ack irq %d\n",
-                                              real_cpu, irq);
+                               if (isr & (1 << irq)) {
+                                       printk
+                                           ("VOYAGER SMP: CPU%d ack irq %d\n",
+                                            real_cpu, irq);
                                        ack_vic_irq(irq);
                                }
                                outb(cpu, VIC_PROCESSOR_ID);
@@ -1711,7 +1661,7 @@ after_handle_vic_irq(unsigned int irq)
                 * receipt by another CPU so everything must be in
                 * order here  */
                ack_vic_irq(irq);
-               if(status & IRQ_REPLAY) {
+               if (status & IRQ_REPLAY) {
                        /* replay is set if we disable the interrupt
                         * in the before_handle_vic_irq() routine, so
                         * clear the in progress bit here to allow the
@@ -1720,9 +1670,9 @@ after_handle_vic_irq(unsigned int irq)
                }
 #ifdef VOYAGER_DEBUG
                isr = vic_read_isr();
-               if((isr & (1<<irq)) != 0)
-                       printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n",
-                              irq, isr);
+               if ((isr & (1 << irq)) != 0)
+                       printk("VOYAGER SMP: after_handle_vic_irq() after "
+                              "ack irq=%d, isr=0x%x\n", irq, isr);
 #endif /* VOYAGER_DEBUG */
        }
        _raw_spin_unlock(&vic_irq_lock);
@@ -1731,7 +1681,6 @@ after_handle_vic_irq(unsigned int irq)
         * may be intercepted by another CPU if reasserted */
 }
 
-
 /* Linux processor - interrupt affinity manipulations.
  *
  * For each processor, we maintain a 32 bit irq affinity mask.
@@ -1748,8 +1697,7 @@ after_handle_vic_irq(unsigned int irq)
  * change the mask and then do an interrupt enable CPI to re-enable on
  * the selected processors */
 
-void
-set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
+void set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
 {
        /* Only extended processors handle interrupts */
        unsigned long real_mask;
@@ -1757,13 +1705,13 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
        int cpu;
 
        real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors;
-       
-       if(cpus_addr(mask)[0] == 0)
+
+       if (cpus_addr(mask)[0] == 0)
                /* can't have no CPUs to accept the interrupt -- extremely
                 * bad things will happen */
                return;
 
-       if(irq == 0)
+       if (irq == 0)
                /* can't change the affinity of the timer IRQ.  This
                 * is due to the constraint in the voyager
                 * architecture that the CPI also comes in on an IRQ
@@ -1772,7 +1720,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
                 * will no-longer be able to accept VIC CPIs */
                return;
 
-       if(irq >= 32) 
+       if (irq >= 32)
                /* You can only have 32 interrupts in a voyager system
                 * (and 32 only if you have a secondary microchannel
                 * bus) */
@@ -1780,8 +1728,8 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
 
        for_each_online_cpu(cpu) {
                unsigned long cpu_mask = 1 << cpu;
-               
-               if(cpu_mask & real_mask) {
+
+               if (cpu_mask & real_mask) {
                        /* enable the interrupt for this cpu */
                        cpu_irq_affinity[cpu] |= irq_mask;
                } else {
@@ -1800,25 +1748,23 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask)
        unmask_vic_irq(irq);
 }
 
-static void
-ack_vic_irq(unsigned int irq)
+static void ack_vic_irq(unsigned int irq)
 {
        if (irq & 8) {
-               outb(0x62,0x20);        /* Specific EOI to cascade */
-               outb(0x60|(irq & 7),0xA0);
+               outb(0x62, 0x20);       /* Specific EOI to cascade */
+               outb(0x60 | (irq & 7), 0xA0);
        } else {
-               outb(0x60 | (irq & 7),0x20);
+               outb(0x60 | (irq & 7), 0x20);
        }
 }
 
 /* enable the CPIs.  In the VIC, the CPIs are delivered by the 8259
  * but are not vectored by it.  This means that the 8259 mask must be
  * lowered to receive them */
-static __init void
-vic_enable_cpi(void)
+static __init void vic_enable_cpi(void)
 {
        __u8 cpu = smp_processor_id();
-       
+
        /* just take a copy of the current mask (nop for boot cpu) */
        vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id];
 
@@ -1827,7 +1773,7 @@ vic_enable_cpi(void)
        /* for sys int and cmn int */
        enable_local_vic_irq(7);
 
-       if(is_cpu_quad()) {
+       if (is_cpu_quad()) {
                outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0);
                outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1);
                VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n",
@@ -1838,8 +1784,7 @@ vic_enable_cpi(void)
                cpu, vic_irq_mask[cpu]));
 }
 
-void
-voyager_smp_dump()
+void voyager_smp_dump()
 {
        int old_cpu = smp_processor_id(), cpu;
 
@@ -1865,10 +1810,10 @@ voyager_smp_dump()
                       cpu, vic_irq_mask[cpu], imr, irr, isr);
 #if 0
                /* These lines are put in to try to unstick an un ack'd irq */
-               if(isr != 0) {
+               if (isr != 0) {
                        int irq;
-                       for(irq=0; irq<16; irq++) {
-                               if(isr & (1<<irq)) {
+                       for (irq = 0; irq < 16; irq++) {
+                               if (isr & (1 << irq)) {
                                        printk("\tCPU%d: ack irq %d\n",
                                               cpu, irq);
                                        local_irq_save(flags);
@@ -1884,17 +1829,15 @@ voyager_smp_dump()
        }
 }
 
-void
-smp_voyager_power_off(void *dummy)
+void smp_voyager_power_off(void *dummy)
 {
-       if(smp_processor_id() == boot_cpu_id) 
+       if (smp_processor_id() == boot_cpu_id)
                voyager_power_off();
        else
                smp_stop_cpu_function(NULL);
 }
 
-static void __init
-voyager_smp_prepare_cpus(unsigned int max_cpus)
+static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
 {
        /* FIXME: ignore max_cpus for now */
        smp_boot_cpus();
@@ -1911,8 +1854,7 @@ static void __cpuinit voyager_smp_prepare_boot_cpu(void)
        cpu_set(smp_processor_id(), cpu_present_map);
 }
 
-static int __cpuinit
-voyager_cpu_up(unsigned int cpu)
+static int __cpuinit voyager_cpu_up(unsigned int cpu)
 {
        /* This only works at boot for x86.  See "rewrite" above. */
        if (cpu_isset(cpu, smp_commenced_mask))
@@ -1928,14 +1870,12 @@ voyager_cpu_up(unsigned int cpu)
        return 0;
 }
 
-static void __init
-voyager_smp_cpus_done(unsigned int max_cpus)
+static void __init voyager_smp_cpus_done(unsigned int max_cpus)
 {
        zap_low_mappings();
 }
 
-void __init
-smp_setup_processor_id(void)
+void __init smp_setup_processor_id(void)
 {
        current_thread_info()->cpu = hard_smp_processor_id();
        x86_write_percpu(cpu_number, hard_smp_processor_id());
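As an illustrative aside (not part of the patch), the dynamic priority balancing that smp_local_timer_interrupt() performs above can be read in isolation. The helper name vic_priority_weight and its parameters below are hypothetical stand-ins for vic_intr_count[cpu], vic_intr_total and voyager_extended_cpus as they appear in the diff:

	/* Sketch of the weight computation: a CPU that has taken more than
	 * its share of interrupts gets a higher weight, i.e. becomes less
	 * eligible for the next dynamically routed interrupt; an
	 * under-loaded CPU drifts back toward 0. */
	static int vic_priority_weight(long intr_count_cpu, long intr_total,
				       int extended_cpus)
	{
		long weight;

		weight = (intr_count_cpu * extended_cpus - intr_total) >> 4;
		weight += 4;		/* centre on the default priority */
		if (weight > 7)		/* priorities run 0..7 */
			weight = 7;
		if (weight < 0)
			weight = 0;
		return weight;		/* value written to VIC_PRIORITY_REGISTER */
	}

With an even interrupt spread the weight stays near the default of 4; a CPU running well above the per-CPU average is clamped to 7 (least eligible), one well below it to 0 (most eligible).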
index 50f9366c411edb94ce6daa8f6b5546ad9657e650..c69c931818ed49b8d5d9326a50aefb8aa339ba9e 100644 (file)
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 
-
 struct task_struct *voyager_thread;
 static __u8 set_timeout;
 
-static int
-execute(const char *string)
+static int execute(const char *string)
 {
        int ret;
 
@@ -52,48 +50,48 @@ execute(const char *string)
                NULL,
        };
 
-       if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
-               printk(KERN_ERR "Voyager failed to run \"%s\": %i\n",
-                      string, ret);
+       if ((ret =
+            call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) {
+               printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string,
+                      ret);
        }
        return ret;
 }
 
-static void
-check_from_kernel(void)
+static void check_from_kernel(void)
 {
-       if(voyager_status.switch_off) {
-               
+       if (voyager_status.switch_off) {
+
                /* FIXME: This should be configurable via proc */
                execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1");
-       } else if(voyager_status.power_fail) {
+       } else if (voyager_status.power_fail) {
                VDEBUG(("Voyager daemon detected AC power failure\n"));
-               
+
                /* FIXME: This should be configurable via proc */
                execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1");
                set_timeout = 1;
        }
 }
 
-static void
-check_continuing_condition(void)
+static void check_continuing_condition(void)
 {
-       if(voyager_status.power_fail) {
+       if (voyager_status.power_fail) {
                __u8 data;
-               voyager_cat_psi(VOYAGER_PSI_SUBREAD, 
+               voyager_cat_psi(VOYAGER_PSI_SUBREAD,
                                VOYAGER_PSI_AC_FAIL_REG, &data);
-               if((data & 0x1f) == 0) {
+               if ((data & 0x1f) == 0) {
                        /* all power restored */
-                       printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n");
+                       printk(KERN_NOTICE
+                              "VOYAGER AC power restored, cancelling shutdown\n");
                        /* FIXME: should be user configurable */
-                       execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
+                       execute
+                           ("umask 600; echo O > /etc/powerstatus; kill -PWR 1");
                        set_timeout = 0;
                }
        }
 }
 
-static int
-thread(void *unused)
+static int thread(void *unused)
 {
        printk(KERN_NOTICE "Voyager starting monitor thread\n");
 
@@ -102,7 +100,7 @@ thread(void *unused)
                schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT);
 
                VDEBUG(("Voyager Daemon awoken\n"));
-               if(voyager_status.request_from_kernel == 0) {
+               if (voyager_status.request_from_kernel == 0) {
                        /* probably awoken from timeout */
                        check_continuing_condition();
                } else {
@@ -112,20 +110,18 @@ thread(void *unused)
        }
 }
 
-static int __init
-voyager_thread_start(void)
+static int __init voyager_thread_start(void)
 {
        voyager_thread = kthread_run(thread, NULL, "kvoyagerd");
        if (IS_ERR(voyager_thread)) {
-               printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n");
+               printk(KERN_ERR
+                      "Voyager: Failed to create system monitor thread.\n");
                return PTR_ERR(voyager_thread);
        }
        return 0;
 }
 
-
-static void __exit
-voyager_thread_stop(void)
+static void __exit voyager_thread_stop(void)
 {
        kthread_stop(voyager_thread);
 }
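
For reference, voyager_thread_start() and voyager_thread_stop() above follow the kernel's standard monitor-thread idiom. A minimal, self-contained sketch of that idiom is shown below; every name in it (monitor_fn, kmonitord, and so on) is hypothetical and nothing here is taken from this commit.

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/err.h>

	static struct task_struct *monitor_task;

	static int monitor_fn(void *unused)
	{
		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);	/* wake up roughly once a second */
			/* ... poll device status and react here ... */
		}
		return 0;
	}

	static int start_monitor(void)
	{
		monitor_task = kthread_run(monitor_fn, NULL, "kmonitord");
		if (IS_ERR(monitor_task))
			return PTR_ERR(monitor_task);	/* thread was never created */
		return 0;
	}

	static void stop_monitor(void)
	{
		kthread_stop(monitor_task);
	}
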
index a1b0d22f697858b335165a68acd2c487eb8ad403..59d353d2c599ec26d21a9ab65ea2641e121b25a5 100644 (file)
 #undef PRINT_MESSAGES
 /* */
 
-
 #if 0
 void Un_impl(void)
 {
-  u_char byte1, FPU_modrm;
-  unsigned long address = FPU_ORIG_EIP;
-
-  RE_ENTRANT_CHECK_OFF;
-  /* No need to check access_ok(), we have previously fetched these bytes. */
-  printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address);
-  if ( FPU_CS == __USER_CS )
-    {
-      while ( 1 )
-       {
-         FPU_get_user(byte1, (u_char __user *) address);
-         if ( (byte1 & 0xf8) == 0xd8 ) break;
-         printk("[%02x]", byte1);
-         address++;
+       u_char byte1, FPU_modrm;
+       unsigned long address = FPU_ORIG_EIP;
+
+       RE_ENTRANT_CHECK_OFF;
+       /* No need to check access_ok(), we have previously fetched these bytes. */
+       printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address);
+       if (FPU_CS == __USER_CS) {
+               while (1) {
+                       FPU_get_user(byte1, (u_char __user *) address);
+                       if ((byte1 & 0xf8) == 0xd8)
+                               break;
+                       printk("[%02x]", byte1);
+                       address++;
+               }
+               printk("%02x ", byte1);
+               FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
+
+               if (FPU_modrm >= 0300)
+                       printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8,
+                              FPU_modrm & 7);
+               else
+                       printk("/%d\n", (FPU_modrm >> 3) & 7);
+       } else {
+               printk("cs selector = %04x\n", FPU_CS);
        }
-      printk("%02x ", byte1);
-      FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
-      
-      if (FPU_modrm >= 0300)
-       printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
-      else
-       printk("/%d\n", (FPU_modrm >> 3) & 7);
-    }
-  else
-    {
-      printk("cs selector = %04x\n", FPU_CS);
-    }
-
-  RE_ENTRANT_CHECK_ON;
-
-  EXCEPTION(EX_Invalid);
 
-}
-#endif  /*  0  */
+       RE_ENTRANT_CHECK_ON;
 
+       EXCEPTION(EX_Invalid);
+
+}
+#endif /*  0  */
 
 /*
    Called for opcodes which are illegal and which are known to result in a
@@ -79,139 +75,152 @@ void Un_impl(void)
    */
 void FPU_illegal(void)
 {
-  math_abort(FPU_info,SIGILL);
+       math_abort(FPU_info, SIGILL);
 }
 
-
-
 void FPU_printall(void)
 {
-  int i;
-  static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
-                              "DeNorm", "Inf", "NaN" };
-  u_char byte1, FPU_modrm;
-  unsigned long address = FPU_ORIG_EIP;
-
-  RE_ENTRANT_CHECK_OFF;
-  /* No need to check access_ok(), we have previously fetched these bytes. */
-  printk("At %p:", (void *) address);
-  if ( FPU_CS == __USER_CS )
-    {
+       int i;
+       static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
+               "DeNorm", "Inf", "NaN"
+       };
+       u_char byte1, FPU_modrm;
+       unsigned long address = FPU_ORIG_EIP;
+
+       RE_ENTRANT_CHECK_OFF;
+       /* No need to check access_ok(), we have previously fetched these bytes. */
+       printk("At %p:", (void *)address);
+       if (FPU_CS == __USER_CS) {
 #define MAX_PRINTED_BYTES 20
-      for ( i = 0; i < MAX_PRINTED_BYTES; i++ )
-       {
-         FPU_get_user(byte1, (u_char __user *) address);
-         if ( (byte1 & 0xf8) == 0xd8 )
-           {
-             printk(" %02x", byte1);
-             break;
-           }
-         printk(" [%02x]", byte1);
-         address++;
-       }
-      if ( i == MAX_PRINTED_BYTES )
-       printk(" [more..]\n");
-      else
-       {
-         FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
-         
-         if (FPU_modrm >= 0300)
-           printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
-         else
-           printk(" /%d, mod=%d rm=%d\n",
-                  (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7);
+               for (i = 0; i < MAX_PRINTED_BYTES; i++) {
+                       FPU_get_user(byte1, (u_char __user *) address);
+                       if ((byte1 & 0xf8) == 0xd8) {
+                               printk(" %02x", byte1);
+                               break;
+                       }
+                       printk(" [%02x]", byte1);
+                       address++;
+               }
+               if (i == MAX_PRINTED_BYTES)
+                       printk(" [more..]\n");
+               else {
+                       FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
+
+                       if (FPU_modrm >= 0300)
+                               printk(" %02x (%02x+%d)\n", FPU_modrm,
+                                      FPU_modrm & 0xf8, FPU_modrm & 7);
+                       else
+                               printk(" /%d, mod=%d rm=%d\n",
+                                      (FPU_modrm >> 3) & 7,
+                                      (FPU_modrm >> 6) & 3, FPU_modrm & 7);
+               }
+       } else {
+               printk("%04x\n", FPU_CS);
        }
-    }
-  else
-    {
-      printk("%04x\n", FPU_CS);
-    }
 
-  partial_status = status_word();
+       partial_status = status_word();
 
 #ifdef DEBUGGING
-if ( partial_status & SW_Backward )    printk("SW: backward compatibility\n");
-if ( partial_status & SW_C3 )          printk("SW: condition bit 3\n");
-if ( partial_status & SW_C2 )          printk("SW: condition bit 2\n");
-if ( partial_status & SW_C1 )          printk("SW: condition bit 1\n");
-if ( partial_status & SW_C0 )          printk("SW: condition bit 0\n");
-if ( partial_status & SW_Summary )     printk("SW: exception summary\n");
-if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n");
-if ( partial_status & SW_Precision )   printk("SW: loss of precision\n");
-if ( partial_status & SW_Underflow )   printk("SW: underflow\n");
-if ( partial_status & SW_Overflow )    printk("SW: overflow\n");
-if ( partial_status & SW_Zero_Div )    printk("SW: divide by zero\n");
-if ( partial_status & SW_Denorm_Op )   printk("SW: denormalized operand\n");
-if ( partial_status & SW_Invalid )     printk("SW: invalid operation\n");
+       if (partial_status & SW_Backward)
+               printk("SW: backward compatibility\n");
+       if (partial_status & SW_C3)
+               printk("SW: condition bit 3\n");
+       if (partial_status & SW_C2)
+               printk("SW: condition bit 2\n");
+       if (partial_status & SW_C1)
+               printk("SW: condition bit 1\n");
+       if (partial_status & SW_C0)
+               printk("SW: condition bit 0\n");
+       if (partial_status & SW_Summary)
+               printk("SW: exception summary\n");
+       if (partial_status & SW_Stack_Fault)
+               printk("SW: stack fault\n");
+       if (partial_status & SW_Precision)
+               printk("SW: loss of precision\n");
+       if (partial_status & SW_Underflow)
+               printk("SW: underflow\n");
+       if (partial_status & SW_Overflow)
+               printk("SW: overflow\n");
+       if (partial_status & SW_Zero_Div)
+               printk("SW: divide by zero\n");
+       if (partial_status & SW_Denorm_Op)
+               printk("SW: denormalized operand\n");
+       if (partial_status & SW_Invalid)
+               printk("SW: invalid operation\n");
 #endif /* DEBUGGING */
 
-  printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n",
-        partial_status & 0x8000 ? 1 : 0,   /* busy */
-        (partial_status & 0x3800) >> 11,   /* stack top pointer */
-        partial_status & 0x80 ? 1 : 0,     /* Error summary status */
-        partial_status & 0x40 ? 1 : 0,     /* Stack flag */
-        partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */
-        partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */
-        partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0,
-        partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0,
-        partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0);
-  
-printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d     ef=%d%d%d%d%d%d\n",
-        control_word & 0x1000 ? 1 : 0,
-        (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
-        (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
-        control_word & 0x80 ? 1 : 0,
-        control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0,
-        control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0,
-        control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0);
-
-  for ( i = 0; i < 8; i++ )
-    {
-      FPU_REG *r = &st(i);
-      u_char tagi = FPU_gettagi(i);
-      switch (tagi)
-       {
-       case TAG_Empty:
-         continue;
-         break;
-       case TAG_Zero:
-       case TAG_Special:
-         tagi = FPU_Special(r);
-       case TAG_Valid:
-         printk("st(%d)  %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
-                getsign(r) ? '-' : '+',
-                (long)(r->sigh >> 16),
-                (long)(r->sigh & 0xFFFF),
-                (long)(r->sigl >> 16),
-                (long)(r->sigl & 0xFFFF),
-                exponent(r) - EXP_BIAS + 1);
-         break;
-       default:
-         printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi);
-         continue;
-         break;
+       printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0,    /* busy */
+              (partial_status & 0x3800) >> 11, /* stack top pointer */
+              partial_status & 0x80 ? 1 : 0,   /* Error summary status */
+              partial_status & 0x40 ? 1 : 0,   /* Stack flag */
+              partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0,  /* cc */
+              partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0,  /* cc */
+              partial_status & SW_Precision ? 1 : 0,
+              partial_status & SW_Underflow ? 1 : 0,
+              partial_status & SW_Overflow ? 1 : 0,
+              partial_status & SW_Zero_Div ? 1 : 0,
+              partial_status & SW_Denorm_Op ? 1 : 0,
+              partial_status & SW_Invalid ? 1 : 0);
+
+       printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d     ef=%d%d%d%d%d%d\n",
+              control_word & 0x1000 ? 1 : 0,
+              (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
+              (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
+              control_word & 0x80 ? 1 : 0,
+              control_word & SW_Precision ? 1 : 0,
+              control_word & SW_Underflow ? 1 : 0,
+              control_word & SW_Overflow ? 1 : 0,
+              control_word & SW_Zero_Div ? 1 : 0,
+              control_word & SW_Denorm_Op ? 1 : 0,
+              control_word & SW_Invalid ? 1 : 0);
+
+       for (i = 0; i < 8; i++) {
+               FPU_REG *r = &st(i);
+               u_char tagi = FPU_gettagi(i);
+               switch (tagi) {
+               case TAG_Empty:
+                       continue;
+                       break;
+               case TAG_Zero:
+               case TAG_Special:
+                       tagi = FPU_Special(r);
+               case TAG_Valid:
+                       printk("st(%d)  %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
+                              getsign(r) ? '-' : '+',
+                              (long)(r->sigh >> 16),
+                              (long)(r->sigh & 0xFFFF),
+                              (long)(r->sigl >> 16),
+                              (long)(r->sigl & 0xFFFF),
+                              exponent(r) - EXP_BIAS + 1);
+                       break;
+               default:
+                       printk("Whoops! Error in errors.c: tag%d is %d ", i,
+                              tagi);
+                       continue;
+                       break;
+               }
+               printk("%s\n", tag_desc[(int)(unsigned)tagi]);
        }
-      printk("%s\n", tag_desc[(int) (unsigned) tagi]);
-    }
 
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_ON;
 
 }
 
 static struct {
-  int type;
-  const char *name;
+       int type;
+       const char *name;
 } exception_names[] = {
-  { EX_StackOver, "stack overflow" },
-  { EX_StackUnder, "stack underflow" },
-  { EX_Precision, "loss of precision" },
-  { EX_Underflow, "underflow" },
-  { EX_Overflow, "overflow" },
-  { EX_ZeroDiv, "divide by zero" },
-  { EX_Denormal, "denormalized operand" },
-  { EX_Invalid, "invalid operation" },
-  { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION },
-  { 0, NULL }
+       {
+       EX_StackOver, "stack overflow"}, {
+       EX_StackUnder, "stack underflow"}, {
+       EX_Precision, "loss of precision"}, {
+       EX_Underflow, "underflow"}, {
+       EX_Overflow, "overflow"}, {
+       EX_ZeroDiv, "divide by zero"}, {
+       EX_Denormal, "denormalized operand"}, {
+       EX_Invalid, "invalid operation"}, {
+       EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, {
+       0, NULL}
 };
 
 /*
@@ -295,445 +304,386 @@ static struct {
 
 asmlinkage void FPU_exception(int n)
 {
-  int i, int_type;
-
-  int_type = 0;         /* Needed only to stop compiler warnings */
-  if ( n & EX_INTERNAL )
-    {
-      int_type = n - EX_INTERNAL;
-      n = EX_INTERNAL;
-      /* Set lots of exception bits! */
-      partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
-    }
-  else
-    {
-      /* Extract only the bits which we use to set the status word */
-      n &= (SW_Exc_Mask);
-      /* Set the corresponding exception bit */
-      partial_status |= n;
-      /* Set summary bits iff exception isn't masked */
-      if ( partial_status & ~control_word & CW_Exceptions )
-       partial_status |= (SW_Summary | SW_Backward);
-      if ( n & (SW_Stack_Fault | EX_Precision) )
-       {
-         if ( !(n & SW_C1) )
-           /* This bit distinguishes over- from underflow for a stack fault,
-              and roundup from round-down for precision loss. */
-           partial_status &= ~SW_C1;
+       int i, int_type;
+
+       int_type = 0;           /* Needed only to stop compiler warnings */
+       if (n & EX_INTERNAL) {
+               int_type = n - EX_INTERNAL;
+               n = EX_INTERNAL;
+               /* Set lots of exception bits! */
+               partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
+       } else {
+               /* Extract only the bits which we use to set the status word */
+               n &= (SW_Exc_Mask);
+               /* Set the corresponding exception bit */
+               partial_status |= n;
+               /* Set summary bits iff exception isn't masked */
+               if (partial_status & ~control_word & CW_Exceptions)
+                       partial_status |= (SW_Summary | SW_Backward);
+               if (n & (SW_Stack_Fault | EX_Precision)) {
+                       if (!(n & SW_C1))
+                               /* This bit distinguishes over- from underflow for a stack fault,
+                                  and roundup from round-down for precision loss. */
+                               partial_status &= ~SW_C1;
+               }
        }
-    }
 
-  RE_ENTRANT_CHECK_OFF;
-  if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) )
-    {
+       RE_ENTRANT_CHECK_OFF;
+       if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
 #ifdef PRINT_MESSAGES
-      /* My message from the sponsor */
-      printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n");
+               /* My message from the sponsor */
+               printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n");
 #endif /* PRINT_MESSAGES */
-      
-      /* Get a name string for error reporting */
-      for (i=0; exception_names[i].type; i++)
-       if ( (exception_names[i].type & n) == exception_names[i].type )
-         break;
-      
-      if (exception_names[i].type)
-       {
+
+               /* Get a name string for error reporting */
+               for (i = 0; exception_names[i].type; i++)
+                       if ((exception_names[i].type & n) ==
+                           exception_names[i].type)
+                               break;
+
+               if (exception_names[i].type) {
 #ifdef PRINT_MESSAGES
-         printk("FP Exception: %s!\n", exception_names[i].name);
+                       printk("FP Exception: %s!\n", exception_names[i].name);
 #endif /* PRINT_MESSAGES */
-       }
-      else
-       printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
-      
-      if ( n == EX_INTERNAL )
-       {
-         printk("FPU emulator: Internal error type 0x%04x\n", int_type);
-         FPU_printall();
-       }
+               } else
+                       printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
+
+               if (n == EX_INTERNAL) {
+                       printk("FPU emulator: Internal error type 0x%04x\n",
+                              int_type);
+                       FPU_printall();
+               }
 #ifdef PRINT_MESSAGES
-      else
-       FPU_printall();
+               else
+                       FPU_printall();
 #endif /* PRINT_MESSAGES */
 
-      /*
-       * The 80486 generates an interrupt on the next non-control FPU
-       * instruction. So we need some means of flagging it.
-       * We use the ES (Error Summary) bit for this.
-       */
-    }
-  RE_ENTRANT_CHECK_ON;
+               /*
+                * The 80486 generates an interrupt on the next non-control FPU
+                * instruction. So we need some means of flagging it.
+                * We use the ES (Error Summary) bit for this.
+                */
+       }
+       RE_ENTRANT_CHECK_ON;
 
 #ifdef __DEBUG__
-  math_abort(FPU_info,SIGFPE);
+       math_abort(FPU_info, SIGFPE);
 #endif /* __DEBUG__ */
 
 }
 
-
 /* Real operation attempted on a NaN. */
 /* Returns < 0 if the exception is unmasked */
 int real_1op_NaN(FPU_REG *a)
 {
-  int signalling, isNaN;
-
-  isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
-
-  /* The default result for the case of two "equal" NaNs (signs may
-     differ) is chosen to reproduce 80486 behaviour */
-  signalling = isNaN && !(a->sigh & 0x40000000);
-
-  if ( !signalling )
-    {
-      if ( !isNaN )  /* pseudo-NaN, or other unsupported? */
-       {
-         if ( control_word & CW_Invalid )
-           {
-             /* Masked response */
-             reg_copy(&CONST_QNaN, a);
-           }
-         EXCEPTION(EX_Invalid);
-         return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+       int signalling, isNaN;
+
+       isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
+
+       /* The default result for the case of two "equal" NaNs (signs may
+          differ) is chosen to reproduce 80486 behaviour */
+       signalling = isNaN && !(a->sigh & 0x40000000);
+
+       if (!signalling) {
+               if (!isNaN) {   /* pseudo-NaN, or other unsupported? */
+                       if (control_word & CW_Invalid) {
+                               /* Masked response */
+                               reg_copy(&CONST_QNaN, a);
+                       }
+                       EXCEPTION(EX_Invalid);
+                       return (!(control_word & CW_Invalid) ? FPU_Exception :
+                               0) | TAG_Special;
+               }
+               return TAG_Special;
        }
-      return TAG_Special;
-    }
 
-  if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      if ( !(a->sigh & 0x80000000) )  /* pseudo-NaN ? */
-       {
-         reg_copy(&CONST_QNaN, a);
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               if (!(a->sigh & 0x80000000)) {  /* pseudo-NaN ? */
+                       reg_copy(&CONST_QNaN, a);
+               }
+               /* ensure a Quiet NaN */
+               a->sigh |= 0x40000000;
        }
-      /* ensure a Quiet NaN */
-      a->sigh |= 0x40000000;
-    }
 
-  EXCEPTION(EX_Invalid);
+       EXCEPTION(EX_Invalid);
 
-  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+       return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
 }
 
-
 /* Real operation attempted on two operands, one a NaN. */
 /* Returns < 0 if the exception is unmasked */
 int real_2op_NaN(FPU_REG const *b, u_char tagb,
-                int deststnr,
-                FPU_REG const *defaultNaN)
+                int deststnr, FPU_REG const *defaultNaN)
 {
-  FPU_REG *dest = &st(deststnr);
-  FPU_REG const *a = dest;
-  u_char taga = FPU_gettagi(deststnr);
-  FPU_REG const *x;
-  int signalling, unsupported;
-
-  if ( taga == TAG_Special )
-    taga = FPU_Special(a);
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
-
-  /* TW_NaN is also used for unsupported data types. */
-  unsupported = ((taga == TW_NaN)
-                && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000)))
-    || ((tagb == TW_NaN)
-       && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
-  if ( unsupported )
-    {
-      if ( control_word & CW_Invalid )
-       {
-         /* Masked response */
-         FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
-       }
-      EXCEPTION(EX_Invalid);
-      return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
-    }
-
-  if (taga == TW_NaN)
-    {
-      x = a;
-      if (tagb == TW_NaN)
-       {
-         signalling = !(a->sigh & b->sigh & 0x40000000);
-         if ( significand(b) > significand(a) )
-           x = b;
-         else if ( significand(b) == significand(a) )
-           {
-             /* The default result for the case of two "equal" NaNs (signs may
-                differ) is chosen to reproduce 80486 behaviour */
-             x = defaultNaN;
-           }
-       }
-      else
-       {
-         /* return the quiet version of the NaN in a */
-         signalling = !(a->sigh & 0x40000000);
+       FPU_REG *dest = &st(deststnr);
+       FPU_REG const *a = dest;
+       u_char taga = FPU_gettagi(deststnr);
+       FPU_REG const *x;
+       int signalling, unsupported;
+
+       if (taga == TAG_Special)
+               taga = FPU_Special(a);
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
+
+       /* TW_NaN is also used for unsupported data types. */
+       unsupported = ((taga == TW_NaN)
+                      && !((exponent(a) == EXP_OVER)
+                           && (a->sigh & 0x80000000)))
+           || ((tagb == TW_NaN)
+               && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
+       if (unsupported) {
+               if (control_word & CW_Invalid) {
+                       /* Masked response */
+                       FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+               }
+               EXCEPTION(EX_Invalid);
+               return (!(control_word & CW_Invalid) ? FPU_Exception : 0) |
+                   TAG_Special;
        }
-    }
-  else
+
+       if (taga == TW_NaN) {
+               x = a;
+               if (tagb == TW_NaN) {
+                       signalling = !(a->sigh & b->sigh & 0x40000000);
+                       if (significand(b) > significand(a))
+                               x = b;
+                       else if (significand(b) == significand(a)) {
+                               /* The default result for the case of two "equal" NaNs (signs may
+                                  differ) is chosen to reproduce 80486 behaviour */
+                               x = defaultNaN;
+                       }
+               } else {
+                       /* return the quiet version of the NaN in a */
+                       signalling = !(a->sigh & 0x40000000);
+               }
+       } else
 #ifdef PARANOID
-    if (tagb == TW_NaN)
+       if (tagb == TW_NaN)
 #endif /* PARANOID */
-    {
-      signalling = !(b->sigh & 0x40000000);
-      x = b;
-    }
+       {
+               signalling = !(b->sigh & 0x40000000);
+               x = b;
+       }
 #ifdef PARANOID
-  else
-    {
-      signalling = 0;
-      EXCEPTION(EX_INTERNAL|0x113);
-      x = &CONST_QNaN;
-    }
+       else {
+               signalling = 0;
+               EXCEPTION(EX_INTERNAL | 0x113);
+               x = &CONST_QNaN;
+       }
 #endif /* PARANOID */
 
-  if ( (!signalling) || (control_word & CW_Invalid) )
-    {
-      if ( ! x )
-       x = b;
+       if ((!signalling) || (control_word & CW_Invalid)) {
+               if (!x)
+                       x = b;
 
-      if ( !(x->sigh & 0x80000000) )  /* pseudo-NaN ? */
-       x = &CONST_QNaN;
+               if (!(x->sigh & 0x80000000))    /* pseudo-NaN ? */
+                       x = &CONST_QNaN;
 
-      FPU_copy_to_regi(x, TAG_Special, deststnr);
+               FPU_copy_to_regi(x, TAG_Special, deststnr);
 
-      if ( !signalling )
-       return TAG_Special;
+               if (!signalling)
+                       return TAG_Special;
 
-      /* ensure a Quiet NaN */
-      dest->sigh |= 0x40000000;
-    }
+               /* ensure a Quiet NaN */
+               dest->sigh |= 0x40000000;
+       }
 
-  EXCEPTION(EX_Invalid);
+       EXCEPTION(EX_Invalid);
 
-  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+       return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
 }
 
-
 /* Invalid arith operation on Valid registers */
 /* Returns < 0 if the exception is unmasked */
 asmlinkage int arith_invalid(int deststnr)
 {
 
-  EXCEPTION(EX_Invalid);
-  
-  if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
-    }
-  
-  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
+       EXCEPTION(EX_Invalid);
 
-}
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+       }
 
+       return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
+
+}
 
 /* Divide a finite number by zero */
 asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
 {
-  FPU_REG *dest = &st(deststnr);
-  int tag = TAG_Valid;
+       FPU_REG *dest = &st(deststnr);
+       int tag = TAG_Valid;
+
+       if (control_word & CW_ZeroDiv) {
+               /* The masked response */
+               FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
+               setsign(dest, sign);
+               tag = TAG_Special;
+       }
 
-  if ( control_word & CW_ZeroDiv )
-    {
-      /* The masked response */
-      FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
-      setsign(dest, sign);
-      tag = TAG_Special;
-    }
-  EXCEPTION(EX_ZeroDiv);
+       EXCEPTION(EX_ZeroDiv);
 
-  return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
+       return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
 
 }
 
-
 /* This may be called often, so keep it lean */
 int set_precision_flag(int flags)
 {
-  if ( control_word & CW_Precision )
-    {
-      partial_status &= ~(SW_C1 & flags);
-      partial_status |= flags;   /* The masked response */
-      return 0;
-    }
-  else
-    {
-      EXCEPTION(flags);
-      return 1;
-    }
+       if (control_word & CW_Precision) {
+               partial_status &= ~(SW_C1 & flags);
+               partial_status |= flags;        /* The masked response */
+               return 0;
+       } else {
+               EXCEPTION(flags);
+               return 1;
+       }
 }
 
-
 /* This may be called often, so keep it lean */
 asmlinkage void set_precision_flag_up(void)
 {
-  if ( control_word & CW_Precision )
-    partial_status |= (SW_Precision | SW_C1);   /* The masked response */
-  else
-    EXCEPTION(EX_Precision | SW_C1);
+       if (control_word & CW_Precision)
+               partial_status |= (SW_Precision | SW_C1);       /* The masked response */
+       else
+               EXCEPTION(EX_Precision | SW_C1);
 }
 
-
 /* This may be called often, so keep it lean */
 asmlinkage void set_precision_flag_down(void)
 {
-  if ( control_word & CW_Precision )
-    {   /* The masked response */
-      partial_status &= ~SW_C1;
-      partial_status |= SW_Precision;
-    }
-  else
-    EXCEPTION(EX_Precision);
+       if (control_word & CW_Precision) {      /* The masked response */
+               partial_status &= ~SW_C1;
+               partial_status |= SW_Precision;
+       } else
+               EXCEPTION(EX_Precision);
 }
 
-
 asmlinkage int denormal_operand(void)
 {
-  if ( control_word & CW_Denormal )
-    {   /* The masked response */
-      partial_status |= SW_Denorm_Op;
-      return TAG_Special;
-    }
-  else
-    {
-      EXCEPTION(EX_Denormal);
-      return TAG_Special | FPU_Exception;
-    }
+       if (control_word & CW_Denormal) {       /* The masked response */
+               partial_status |= SW_Denorm_Op;
+               return TAG_Special;
+       } else {
+               EXCEPTION(EX_Denormal);
+               return TAG_Special | FPU_Exception;
+       }
 }
 
-
 asmlinkage int arith_overflow(FPU_REG *dest)
 {
-  int tag = TAG_Valid;
+       int tag = TAG_Valid;
 
-  if ( control_word & CW_Overflow )
-    {
-      /* The masked response */
+       if (control_word & CW_Overflow) {
+               /* The masked response */
 /* ###### The response here depends upon the rounding mode */
-      reg_copy(&CONST_INF, dest);
-      tag = TAG_Special;
-    }
-  else
-    {
-      /* Subtract the magic number from the exponent */
-      addexponent(dest, (-3 * (1 << 13)));
-    }
-
-  EXCEPTION(EX_Overflow);
-  if ( control_word & CW_Overflow )
-    {
-      /* The overflow exception is masked. */
-      /* By definition, precision is lost.
-        The roundup bit (C1) is also set because we have
-        "rounded" upwards to Infinity. */
-      EXCEPTION(EX_Precision | SW_C1);
-      return tag;
-    }
-
-  return tag;
+               reg_copy(&CONST_INF, dest);
+               tag = TAG_Special;
+       } else {
+               /* Subtract the magic number from the exponent */
+               addexponent(dest, (-3 * (1 << 13)));
+       }
 
-}
+       EXCEPTION(EX_Overflow);
+       if (control_word & CW_Overflow) {
+               /* The overflow exception is masked. */
+               /* By definition, precision is lost.
+                  The roundup bit (C1) is also set because we have
+                  "rounded" upwards to Infinity. */
+               EXCEPTION(EX_Precision | SW_C1);
+               return tag;
+       }
+
+       return tag;
 
+}
 
 asmlinkage int arith_underflow(FPU_REG *dest)
 {
-  int tag = TAG_Valid;
-
-  if ( control_word & CW_Underflow )
-    {
-      /* The masked response */
-      if ( exponent16(dest) <= EXP_UNDER - 63 )
-       {
-         reg_copy(&CONST_Z, dest);
-         partial_status &= ~SW_C1;       /* Round down. */
-         tag = TAG_Zero;
+       int tag = TAG_Valid;
+
+       if (control_word & CW_Underflow) {
+               /* The masked response */
+               if (exponent16(dest) <= EXP_UNDER - 63) {
+                       reg_copy(&CONST_Z, dest);
+                       partial_status &= ~SW_C1;       /* Round down. */
+                       tag = TAG_Zero;
+               } else {
+                       stdexp(dest);
+               }
+       } else {
+               /* Add the magic number to the exponent. */
+               addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
        }
-      else
-       {
-         stdexp(dest);
+
+       EXCEPTION(EX_Underflow);
+       if (control_word & CW_Underflow) {
+               /* The underflow exception is masked. */
+               EXCEPTION(EX_Precision);
+               return tag;
        }
-    }
-  else
-    {
-      /* Add the magic number to the exponent. */
-      addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
-    }
-
-  EXCEPTION(EX_Underflow);
-  if ( control_word & CW_Underflow )
-    {
-      /* The underflow exception is masked. */
-      EXCEPTION(EX_Precision);
-      return tag;
-    }
-
-  return tag;
 
-}
+       return tag;
 
+}
 
 void FPU_stack_overflow(void)
 {
 
- if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      top--;
-      FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
-    }
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               top--;
+               FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+       }
 
-  EXCEPTION(EX_StackOver);
+       EXCEPTION(EX_StackOver);
 
-  return;
+       return;
 
 }
 
-
 void FPU_stack_underflow(void)
 {
 
- if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
-    }
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+       }
 
-  EXCEPTION(EX_StackUnder);
+       EXCEPTION(EX_StackUnder);
 
-  return;
+       return;
 
 }
 
-
 void FPU_stack_underflow_i(int i)
 {
 
- if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
-    }
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+       }
 
-  EXCEPTION(EX_StackUnder);
+       EXCEPTION(EX_StackUnder);
 
-  return;
+       return;
 
 }
 
-
 void FPU_stack_underflow_pop(int i)
 {
 
- if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
-      FPU_pop();
-    }
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+               FPU_pop();
+       }
 
-  EXCEPTION(EX_StackUnder);
+       EXCEPTION(EX_StackUnder);
 
-  return;
+       return;
 
 }
-
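
The helpers above all follow the x87 masked-response convention: each exception has a mask bit in the control word, and when that bit is set the emulator stores a default result and merely records the fault in the status word, while an unmasked exception is signalled back to the program. A simplified stand-alone mirror of FPU_divide_by_zero()'s flow, with illustrative constant values and hypothetical names (not taken from this commit), would be:

	#include <math.h>

	#define CW_ZERODIV	0x04	/* divide-by-zero mask bit (illustrative) */
	#define SW_ZERODIV	0x04	/* divide-by-zero status bit (illustrative) */

	unsigned int control_word, partial_status;

	int divide_by_zero_response(double *dest, int negative)
	{
		partial_status |= SW_ZERODIV;		/* always record the fault */
		if (control_word & CW_ZERODIV) {
			/* masked: deliver a correctly signed infinity */
			*dest = negative ? -HUGE_VAL : HUGE_VAL;
			return 0;
		}
		return -1;	/* unmasked: the caller must raise the exception */
	}
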
index b463f21a811e6545f5cdad53a4a283237d3f9f73..67f43a4683d51a909af6e1b89dd8478b510b3e14 100644 (file)
@@ -9,7 +9,6 @@
 #ifndef _EXCEPTION_H_
 #define _EXCEPTION_H_
 
-
 #ifdef __ASSEMBLY__
 #define        Const_(x)       $##x
 #else
@@ -20,8 +19,8 @@
 #include "fpu_emu.h"
 #endif /* SW_C1 */
 
-#define FPU_BUSY        Const_(0x8000)   /* FPU busy bit (8087 compatibility) */
-#define EX_ErrorSummary Const_(0x0080)   /* Error summary status */
+#define FPU_BUSY        Const_(0x8000) /* FPU busy bit (8087 compatibility) */
+#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
 /* Special exceptions: */
 #define        EX_INTERNAL     Const_(0x8000)  /* Internal error in wm-FPU-emu */
 #define EX_StackOver   Const_(0x0041|SW_C1)    /* stack overflow */
 #define EX_Denormal    Const_(0x0002)  /* denormalized operand */
 #define EX_Invalid     Const_(0x0001)  /* invalid operation */
 
-
 #define PRECISION_LOST_UP    Const_((EX_Precision | SW_C1))
 #define PRECISION_LOST_DOWN  Const_(EX_Precision)
 
-
 #ifndef __ASSEMBLY__
 
 #ifdef DEBUG
@@ -48,6 +45,6 @@
 #define        EXCEPTION(x)    FPU_exception(x)
 #endif
 
-#endif /* __ASSEMBLY__ */ 
+#endif /* __ASSEMBLY__ */
 
 #endif /* _EXCEPTION_H_ */
index 6972dec01af6a109df63a88e76e3de2ee99c5d06..aeab24e083c40a5f4aa231b2346f5af3377b660f 100644 (file)
 #include "control_w.h"
 #include "status_w.h"
 
-
 void fadd__(void)
 {
-  /* fadd st,st(i) */
-  int i = FPU_rm;
-  clear_C1();
-  FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
+       /* fadd st,st(i) */
+       int i = FPU_rm;
+       clear_C1();
+       FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
 }
 
-
 void fmul__(void)
 {
-  /* fmul st,st(i) */
-  int i = FPU_rm;
-  clear_C1();
-  FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
+       /* fmul st,st(i) */
+       int i = FPU_rm;
+       clear_C1();
+       FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
 }
 
-
-
 void fsub__(void)
 {
-  /* fsub st,st(i) */
-  clear_C1();
-  FPU_sub(0, FPU_rm, control_word);
+       /* fsub st,st(i) */
+       clear_C1();
+       FPU_sub(0, FPU_rm, control_word);
 }
 
-
 void fsubr_(void)
 {
-  /* fsubr st,st(i) */
-  clear_C1();
-  FPU_sub(REV, FPU_rm, control_word);
+       /* fsubr st,st(i) */
+       clear_C1();
+       FPU_sub(REV, FPU_rm, control_word);
 }
 
-
 void fdiv__(void)
 {
-  /* fdiv st,st(i) */
-  clear_C1();
-  FPU_div(0, FPU_rm, control_word);
+       /* fdiv st,st(i) */
+       clear_C1();
+       FPU_div(0, FPU_rm, control_word);
 }
 
-
 void fdivr_(void)
 {
-  /* fdivr st,st(i) */
-  clear_C1();
-  FPU_div(REV, FPU_rm, control_word);
+       /* fdivr st,st(i) */
+       clear_C1();
+       FPU_div(REV, FPU_rm, control_word);
 }
 
-
-
 void fadd_i(void)
 {
-  /* fadd st(i),st */
-  int i = FPU_rm;
-  clear_C1();
-  FPU_add(&st(i), FPU_gettagi(i), i, control_word);
+       /* fadd st(i),st */
+       int i = FPU_rm;
+       clear_C1();
+       FPU_add(&st(i), FPU_gettagi(i), i, control_word);
 }
 
-
 void fmul_i(void)
 {
-  /* fmul st(i),st */
-  clear_C1();
-  FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
+       /* fmul st(i),st */
+       clear_C1();
+       FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
 }
 
-
 void fsubri(void)
 {
-  /* fsubr st(i),st */
-  clear_C1();
-  FPU_sub(DEST_RM, FPU_rm, control_word);
+       /* fsubr st(i),st */
+       clear_C1();
+       FPU_sub(DEST_RM, FPU_rm, control_word);
 }
 
-
 void fsub_i(void)
 {
-  /* fsub st(i),st */
-  clear_C1();
-  FPU_sub(REV|DEST_RM, FPU_rm, control_word);
+       /* fsub st(i),st */
+       clear_C1();
+       FPU_sub(REV | DEST_RM, FPU_rm, control_word);
 }
 
-
 void fdivri(void)
 {
-  /* fdivr st(i),st */
-  clear_C1();
-  FPU_div(DEST_RM, FPU_rm, control_word);
+       /* fdivr st(i),st */
+       clear_C1();
+       FPU_div(DEST_RM, FPU_rm, control_word);
 }
 
-
 void fdiv_i(void)
 {
-  /* fdiv st(i),st */
-  clear_C1();
-  FPU_div(REV|DEST_RM, FPU_rm, control_word);
+       /* fdiv st(i),st */
+       clear_C1();
+       FPU_div(REV | DEST_RM, FPU_rm, control_word);
 }
 
-
-
 void faddp_(void)
 {
-  /* faddp st(i),st */
-  int i = FPU_rm;
-  clear_C1();
-  if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 )
-    FPU_pop();
+       /* faddp st(i),st */
+       int i = FPU_rm;
+       clear_C1();
+       if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0)
+               FPU_pop();
 }
 
-
 void fmulp_(void)
 {
-  /* fmulp st(i),st */
-  clear_C1();
-  if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 )
-    FPU_pop();
+       /* fmulp st(i),st */
+       clear_C1();
+       if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0)
+               FPU_pop();
 }
 
-
-
 void fsubrp(void)
 {
-  /* fsubrp st(i),st */
-  clear_C1();
-  if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 )
-    FPU_pop();
+       /* fsubrp st(i),st */
+       clear_C1();
+       if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0)
+               FPU_pop();
 }
 
-
 void fsubp_(void)
 {
-  /* fsubp st(i),st */
-  clear_C1();
-  if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 )
-    FPU_pop();
+       /* fsubp st(i),st */
+       clear_C1();
+       if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0)
+               FPU_pop();
 }
 
-
 void fdivrp(void)
 {
-  /* fdivrp st(i),st */
-  clear_C1();
-  if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 )
-    FPU_pop();
+       /* fdivrp st(i),st */
+       clear_C1();
+       if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0)
+               FPU_pop();
 }
 
-
 void fdivp_(void)
 {
-  /* fdivp st(i),st */
-  clear_C1();
-  if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 )
-    FPU_pop();
+       /* fdivp st(i),st */
+       clear_C1();
+       if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0)
+               FPU_pop();
 }
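
The *p variants above (faddp_(), fmulp_(), fsubp_(), fdivp_() and friends) share one idiom: perform the ST(i),ST(0) operation and pop the register stack only when the helper reports success, since a negative return indicates an unmasked exception and the stack must stay as it was. A compilable toy version of that pattern, with hypothetical stand-in names, is:

	static int emu_stack_top;

	static int emu_add(int i)	/* stand-in for FPU_add(); <0 on unmasked fault */
	{
		(void)i;
		return 0;
	}

	static void emu_pop(void)	/* stand-in for FPU_pop() */
	{
		emu_stack_top++;
	}

	static void emu_faddp(int i)
	{
		if (emu_add(i) >= 0)	/* pop only if no unmasked exception */
			emu_pop();
	}
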
index 9ba12416df123fdb31c0c42af2570741cc78a79a..955b932735a415a3147b96c9c79c29d6db023554 100644 (file)
@@ -14,7 +14,6 @@
 
 #define        EXCEPTION       FPU_exception
 
-
 #define PARAM1 8(%ebp)
 #define        PARAM2  12(%ebp)
 #define        PARAM3  16(%ebp)
index 20886cfb9f76f1bc4780ad4511e0ff21ec902d1e..491e737ce547c236830f8ecb4ef45c5415c53c1e 100644 (file)
 #include "status_w.h"
 #include "control_w.h"
 
-
 static void fnop(void)
 {
 }
 
 static void fclex(void)
 {
-  partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision|
-                  SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op|
-                  SW_Invalid);
-  no_ip_update = 1;
+       partial_status &=
+           ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision |
+             SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op |
+             SW_Invalid);
+       no_ip_update = 1;
 }
 
 /* Needs to be externally visible */
 void finit(void)
 {
-  control_word = 0x037f;
-  partial_status = 0;
-  top = 0;            /* We don't keep top in the status word internally. */
-  fpu_tag_word = 0xffff;
-  /* The behaviour is different from that detailed in
-     Section 15.1.6 of the Intel manual */
-  operand_address.offset = 0;
-  operand_address.selector = 0;
-  instruction_address.offset = 0;
-  instruction_address.selector = 0;
-  instruction_address.opcode = 0;
-  no_ip_update = 1;
+       control_word = 0x037f;
+       partial_status = 0;
+       top = 0;                /* We don't keep top in the status word internally. */
+       fpu_tag_word = 0xffff;
+       /* The behaviour is different from that detailed in
+          Section 15.1.6 of the Intel manual */
+       operand_address.offset = 0;
+       operand_address.selector = 0;
+       instruction_address.offset = 0;
+       instruction_address.selector = 0;
+       instruction_address.opcode = 0;
+       no_ip_update = 1;
 }
 
 /*
@@ -54,151 +54,134 @@ void finit(void)
 #define fsetpm fnop
 
 static FUNC const finit_table[] = {
-  feni, fdisi, fclex, finit,
-  fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
+       feni, fdisi, fclex, finit,
+       fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
 };
 
 void finit_(void)
 {
-  (finit_table[FPU_rm])();
+       (finit_table[FPU_rm]) ();
 }
 
-
 static void fstsw_ax(void)
 {
-  *(short *) &FPU_EAX = status_word();
-  no_ip_update = 1;
+       *(short *)&FPU_EAX = status_word();
+       no_ip_update = 1;
 }
 
 static FUNC const fstsw_table[] = {
-  fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
-  FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+       fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
+       FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
 };
 
 void fstsw_(void)
 {
-  (fstsw_table[FPU_rm])();
+       (fstsw_table[FPU_rm]) ();
 }
 
-
 static FUNC const fp_nop_table[] = {
-  fnop, FPU_illegal, FPU_illegal, FPU_illegal,
-  FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+       fnop, FPU_illegal, FPU_illegal, FPU_illegal,
+       FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
 };
 
 void fp_nop(void)
 {
-  (fp_nop_table[FPU_rm])();
+       (fp_nop_table[FPU_rm]) ();
 }
 
-
 void fld_i_(void)
 {
-  FPU_REG *st_new_ptr;
-  int i;
-  u_char tag;
-
-  if ( STACK_OVERFLOW )
-    { FPU_stack_overflow(); return; }
-
-  /* fld st(i) */
-  i = FPU_rm;
-  if ( NOT_EMPTY(i) )
-    {
-      reg_copy(&st(i), st_new_ptr);
-      tag = FPU_gettagi(i);
-      push();
-      FPU_settag0(tag);
-    }
-  else
-    {
-      if ( control_word & CW_Invalid )
-       {
-         /* The masked response */
-         FPU_stack_underflow();
+       FPU_REG *st_new_ptr;
+       int i;
+       u_char tag;
+
+       if (STACK_OVERFLOW) {
+               FPU_stack_overflow();
+               return;
        }
-      else
-       EXCEPTION(EX_StackUnder);
-    }
 
-}
+       /* fld st(i) */
+       i = FPU_rm;
+       if (NOT_EMPTY(i)) {
+               reg_copy(&st(i), st_new_ptr);
+               tag = FPU_gettagi(i);
+               push();
+               FPU_settag0(tag);
+       } else {
+               if (control_word & CW_Invalid) {
+                       /* The masked response */
+                       FPU_stack_underflow();
+               } else
+                       EXCEPTION(EX_StackUnder);
+       }
 
+}
 
 void fxch_i(void)
 {
-  /* fxch st(i) */
-  FPU_REG t;
-  int i = FPU_rm;
-  FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
-  long tag_word = fpu_tag_word;
-  int regnr = top & 7, regnri = ((regnr + i) & 7);
-  u_char st0_tag = (tag_word >> (regnr*2)) & 3;
-  u_char sti_tag = (tag_word >> (regnri*2)) & 3;
-
-  if ( st0_tag == TAG_Empty )
-    {
-      if ( sti_tag == TAG_Empty )
-       {
-         FPU_stack_underflow();
-         FPU_stack_underflow_i(i);
-         return;
+       /* fxch st(i) */
+       FPU_REG t;
+       int i = FPU_rm;
+       FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
+       long tag_word = fpu_tag_word;
+       int regnr = top & 7, regnri = ((regnr + i) & 7);
+       u_char st0_tag = (tag_word >> (regnr * 2)) & 3;
+       u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+       if (st0_tag == TAG_Empty) {
+               if (sti_tag == TAG_Empty) {
+                       FPU_stack_underflow();
+                       FPU_stack_underflow_i(i);
+                       return;
+               }
+               if (control_word & CW_Invalid) {
+                       /* Masked response */
+                       FPU_copy_to_reg0(sti_ptr, sti_tag);
+               }
+               FPU_stack_underflow_i(i);
+               return;
        }
-      if ( control_word & CW_Invalid )
-       {
-         /* Masked response */
-         FPU_copy_to_reg0(sti_ptr, sti_tag);
+       if (sti_tag == TAG_Empty) {
+               if (control_word & CW_Invalid) {
+                       /* Masked response */
+                       FPU_copy_to_regi(st0_ptr, st0_tag, i);
+               }
+               FPU_stack_underflow();
+               return;
        }
-      FPU_stack_underflow_i(i);
-      return;
-    }
-  if ( sti_tag == TAG_Empty )
-    {
-      if ( control_word & CW_Invalid )
-       {
-         /* Masked response */
-         FPU_copy_to_regi(st0_ptr, st0_tag, i);
-       }
-      FPU_stack_underflow();
-      return;
-    }
-  clear_C1();
-
-  reg_copy(st0_ptr, &t);
-  reg_copy(sti_ptr, st0_ptr);
-  reg_copy(&t, sti_ptr);
-
-  tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2));
-  tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2));
-  fpu_tag_word = tag_word;
-}
+       clear_C1();
 
+       reg_copy(st0_ptr, &t);
+       reg_copy(sti_ptr, st0_ptr);
+       reg_copy(&t, sti_ptr);
+
+       tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2));
+       tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2));
+       fpu_tag_word = tag_word;
+}
 
 void ffree_(void)
 {
-  /* ffree st(i) */
-  FPU_settagi(FPU_rm, TAG_Empty);
+       /* ffree st(i) */
+       FPU_settagi(FPU_rm, TAG_Empty);
 }
 
-
 void ffreep(void)
 {
-  /* ffree st(i) + pop - unofficial code */
-  FPU_settagi(FPU_rm, TAG_Empty);
-  FPU_pop();
+       /* ffree st(i) + pop - unofficial code */
+       FPU_settagi(FPU_rm, TAG_Empty);
+       FPU_pop();
 }
 
-
 void fst_i_(void)
 {
-  /* fst st(i) */
-  FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+       /* fst st(i) */
+       FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
 }
 
-
 void fstp_i(void)
 {
-  /* fstp st(i) */
-  FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
-  FPU_pop();
+       /* fstp st(i) */
+       FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+       FPU_pop();
 }
-
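
finit_(), fstsw_() and fp_nop() above all dispatch through small const tables of function pointers indexed by the rm field of the ModR/M byte, with FPU_illegal filling the unassigned slots. A self-contained sketch of that table-dispatch idiom (all names hypothetical, not from this commit) is:

	typedef void (*handler_t)(void);

	static void op_zero(void)	/* real handler for encoding 0 */
	{
	}

	static void op_illegal(void)	/* raises SIGILL in the real emulator */
	{
	}

	static const handler_t dispatch[8] = {
		op_zero, op_illegal, op_illegal, op_illegal,
		op_illegal, op_illegal, op_illegal, op_illegal,
	};

	static void dispatch_rm(unsigned int modrm)
	{
		(dispatch[modrm & 7])();	/* rm = low three bits of ModR/M */
	}
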
index 65120f52385332c67aa1b5275f407bc377a79c17..4dae511c85ad0dead11962f635cb405d13961551 100644 (file)
@@ -7,7 +7,6 @@
  |                                                                           |
  +---------------------------------------------------------------------------*/
 
-
 #ifndef _FPU_EMU_H_
 #define _FPU_EMU_H_
 
 #endif
 
 #define EXP_BIAS       Const(0)
-#define EXP_OVER       Const(0x4000)    /* smallest invalid large exponent */
-#define        EXP_UNDER       Const(-0x3fff)   /* largest invalid small exponent */
-#define EXP_WAY_UNDER   Const(-0x6000)   /* Below the smallest denormal, but
-                                           still a 16 bit nr. */
+#define EXP_OVER       Const(0x4000)   /* smallest invalid large exponent */
+#define        EXP_UNDER       Const(-0x3fff)  /* largest invalid small exponent */
+#define EXP_WAY_UNDER   Const(-0x6000) /* Below the smallest denormal, but
+                                          still a 16 bit nr. */
 #define EXP_Infinity    EXP_OVER
 #define EXP_NaN         EXP_OVER
 
 #define EXTENDED_Ebias Const(0x3fff)
-#define EXTENDED_Emin (-0x3ffe)  /* smallest valid exponent */
+#define EXTENDED_Emin (-0x3ffe)        /* smallest valid exponent */
 
 #define SIGN_POS       Const(0)
 #define SIGN_NEG       Const(0x80)
 #define SIGN_Positive  Const(0)
 #define SIGN_Negative  Const(0x8000)
 
-
 /* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
 /* The following fold to 2 (Special) in the Tag Word */
-#define TW_Denormal     Const(4)        /* De-normal */
+#define TW_Denormal     Const(4)       /* De-normal */
 #define TW_Infinity    Const(5)        /* + or - infinity */
 #define        TW_NaN          Const(6)        /* Not a Number */
 #define        TW_Unsupported  Const(7)        /* Not supported by an 80486 */
 #define DEST_RM         0x20
 #define LOADED          0x40
 
-#define FPU_Exception   Const(0x80000000)   /* Added to tag returns. */
-
+#define FPU_Exception   Const(0x80000000)      /* Added to tag returns. */
 
 #ifndef __ASSEMBLY__
 
 #include "fpu_system.h"
 
-#include <asm/sigcontext.h>   /* for struct _fpstate */
+#include <asm/sigcontext.h>    /* for struct _fpstate */
 #include <asm/math_emu.h>
 #include <linux/linkage.h>
 
@@ -112,30 +109,33 @@ extern u_char emulating;
 #define PREFIX_DEFAULT 7
 
 struct address {
-  unsigned int offset;
-  unsigned int selector:16;
-  unsigned int opcode:11;
-  unsigned int empty:5;
+       unsigned int offset;
+       unsigned int selector:16;
+       unsigned int opcode:11;
+       unsigned int empty:5;
 };
 struct fpu__reg {
-  unsigned sigl;
-  unsigned sigh;
-  short exp;
+       unsigned sigl;
+       unsigned sigh;
+       short exp;
 };
 
-typedef void (*FUNC)(void);
+typedef void (*FUNC) (void);
 typedef struct fpu__reg FPU_REG;
-typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag);
-typedef struct { u_char address_size, operand_size, segment; }
-        overrides;
+typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag);
+typedef struct {
+       u_char address_size, operand_size, segment;
+} overrides;
 /* This structure is 32 bits: */
-typedef struct { overrides override;
-                u_char default_mode; } fpu_addr_modes;
+typedef struct {
+       overrides override;
+       u_char default_mode;
+} fpu_addr_modes;
 /* PROTECTED has a restricted meaning in the emulator; it is used
    to signal that the emulator needs to do special things to ensure
    that protection is respected in a segmented model. */
 #define PROTECTED 4
-#define SIXTEEN   1         /* We rely upon this being 1 (true) */
+#define SIXTEEN   1            /* We rely upon this being 1 (true) */
 #define VM86      SIXTEEN
 #define PM16      (SIXTEEN | PROTECTED)
 #define SEG32     PROTECTED
@@ -168,8 +168,8 @@ extern u_char const data_sizes_16[32];
 
 static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
 {
-  *(short *)&(y->exp) = *(const short *)&(x->exp); 
-  *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
+       *(short *)&(y->exp) = *(const short *)&(x->exp);
+       *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
 }
 
 #define exponent(x)  (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
@@ -184,27 +184,26 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
 
 #define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
 
-
 /*----- Prototypes for functions written in assembler -----*/
 /* extern void reg_move(FPU_REG *a, FPU_REG *b); */
 
 asmlinkage int FPU_normalize(FPU_REG *x);
 asmlinkage int FPU_normalize_nuo(FPU_REG *x);
 asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
-                        FPU_REG *answ, unsigned int control_w, u_char sign,
+                        FPU_REG * answ, unsigned int control_w, u_char sign,
                         int expa, int expb);
 asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
-                        FPU_REG *answ, unsigned int control_w, u_char sign,
+                        FPU_REG * answ, unsigned int control_w, u_char sign,
                         int expon);
 asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
-                        FPU_REG *answ, unsigned int control_w, u_char sign);
+                        FPU_REG * answ, unsigned int control_w, u_char sign);
 asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
-                        FPU_REG *answ, unsigned int control_w, u_char sign,
+                        FPU_REG * answ, unsigned int control_w, u_char sign,
                         int expa, int expb);
 asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
                       unsigned int control_w, u_char sign);
-asmlinkage unsigned    FPU_shrx(void *l, unsigned x);
-asmlinkage unsigned    FPU_shrxs(void *v, unsigned x);
+asmlinkage unsigned FPU_shrx(void *l, unsigned x);
+asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
 asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
 asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
                         unsigned int control_w, u_char sign);
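
For orientation, struct fpu__reg above encodes one 80-bit x87 extended-precision register: a 64-bit significand split across sigl/sigh plus a 16-bit field holding the sign bit and the biased exponent, which is what the exponent() and significand() macros unpack. The same accessors written as plain functions (a sketch, not part of this commit) would be:

	#include <stdint.h>

	#define EXTENDED_EBIAS	0x3fff

	struct fpu_reg {
		uint32_t sigl;		/* low 32 bits of the significand */
		uint32_t sigh;		/* high 32 bits of the significand */
		int16_t exp;		/* sign bit plus 15-bit biased exponent */
	};

	static int reg_exponent(const struct fpu_reg *r)
	{
		return (r->exp & 0x7fff) - EXTENDED_EBIAS;	/* drop sign, remove bias */
	}

	static uint64_t reg_significand(const struct fpu_reg *r)
	{
		return ((uint64_t)r->sigh << 32) | r->sigl;	/* reassemble 64 bits */
	}
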
index 1853524c8b576f5420de5cbd746aecb30f594d50..760baeea5f07c9ad13858d19faa1060a7de3d849 100644 (file)
  +---------------------------------------------------------------------------*/
 
 #include <linux/signal.h>
-#include <linux/ptrace.h>
+#include <linux/regset.h>
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/user.h>
 
 #include "fpu_system.h"
 #include "fpu_emu.h"
 #include "control_w.h"
 #include "status_w.h"
 
-#define __BAD__ FPU_illegal   /* Illegal on an 80486, causes SIGILL */
+#define __BAD__ FPU_illegal    /* Illegal on an 80486, causes SIGILL */
 
-#ifndef NO_UNDOC_CODE    /* Un-documented FPU op-codes supported by default. */
+#ifndef NO_UNDOC_CODE          /* Un-documented FPU op-codes supported by default. */
 
 /* WARNING: These codes are not documented by Intel in their 80486 manual
    and may not work on FPU clones or later Intel FPUs. */
 
 /* Changes to support the un-doc codes provided by Linus Torvalds. */
 
-#define _d9_d8_ fstp_i    /* unofficial code (19) */
-#define _dc_d0_ fcom_st   /* unofficial code (14) */
-#define _dc_d8_ fcompst   /* unofficial code (1c) */
-#define _dd_c8_ fxch_i    /* unofficial code (0d) */
-#define _de_d0_ fcompst   /* unofficial code (16) */
-#define _df_c0_ ffreep    /* unofficial code (07) ffree + pop */
-#define _df_c8_ fxch_i    /* unofficial code (0f) */
-#define _df_d0_ fstp_i    /* unofficial code (17) */
-#define _df_d8_ fstp_i    /* unofficial code (1f) */
+#define _d9_d8_ fstp_i         /* unofficial code (19) */
+#define _dc_d0_ fcom_st                /* unofficial code (14) */
+#define _dc_d8_ fcompst                /* unofficial code (1c) */
+#define _dd_c8_ fxch_i         /* unofficial code (0d) */
+#define _de_d0_ fcompst                /* unofficial code (16) */
+#define _df_c0_ ffreep         /* unofficial code (07) ffree + pop */
+#define _df_c8_ fxch_i         /* unofficial code (0f) */
+#define _df_d0_ fstp_i         /* unofficial code (17) */
+#define _df_d8_ fstp_i         /* unofficial code (1f) */
 
 static FUNC const st_instr_table[64] = {
-  fadd__,   fld_i_,     __BAD__, __BAD__, fadd_i,  ffree_,  faddp_,  _df_c0_,
-  fmul__,   fxch_i,     __BAD__, __BAD__, fmul_i,  _dd_c8_, fmulp_,  _df_c8_,
-  fcom_st,  fp_nop,     __BAD__, __BAD__, _dc_d0_, fst_i_,  _de_d0_, _df_d0_,
-  fcompst,  _d9_d8_,    __BAD__, __BAD__, _dc_d8_, fstp_i,  fcompp,  _df_d8_,
-  fsub__,   FPU_etc,    __BAD__, finit_,  fsubri,  fucom_,  fsubrp,  fstsw_,
-  fsubr_,   fconst,     fucompp, __BAD__, fsub_i,  fucomp,  fsubp_,  __BAD__,
-  fdiv__,   FPU_triga,  __BAD__, __BAD__, fdivri,  __BAD__, fdivrp,  __BAD__,
-  fdivr_,   FPU_trigb,  __BAD__, __BAD__, fdiv_i,  __BAD__, fdivp_,  __BAD__,
+       fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_,
+       fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_,
+       fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_,
+       fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_,
+       fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
+       fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
+       fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
+       fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
 };
 
-#else     /* Support only documented FPU op-codes */
+#else /* Support only documented FPU op-codes */
 
 static FUNC const st_instr_table[64] = {
-  fadd__,   fld_i_,     __BAD__, __BAD__, fadd_i,  ffree_,  faddp_,  __BAD__,
-  fmul__,   fxch_i,     __BAD__, __BAD__, fmul_i,  __BAD__, fmulp_,  __BAD__,
-  fcom_st,  fp_nop,     __BAD__, __BAD__, __BAD__, fst_i_,  __BAD__, __BAD__,
-  fcompst,  __BAD__,    __BAD__, __BAD__, __BAD__, fstp_i,  fcompp,  __BAD__,
-  fsub__,   FPU_etc,    __BAD__, finit_,  fsubri,  fucom_,  fsubrp,  fstsw_,
-  fsubr_,   fconst,     fucompp, __BAD__, fsub_i,  fucomp,  fsubp_,  __BAD__,
-  fdiv__,   FPU_triga,  __BAD__, __BAD__, fdivri,  __BAD__, fdivrp,  __BAD__,
-  fdivr_,   FPU_trigb,  __BAD__, __BAD__, fdiv_i,  __BAD__, fdivp_,  __BAD__,
+       fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__,
+       fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__,
+       fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__,
+       fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__,
+       fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_,
+       fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__,
+       fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__,
+       fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__,
 };
 
 #endif /* NO_UNDOC_CODE */
 
-
-#define _NONE_ 0   /* Take no special action */
-#define _REG0_ 1   /* Need to check for not empty st(0) */
-#define _REGI_ 2   /* Need to check for not empty st(0) and st(rm) */
-#define _REGi_ 0   /* Uses st(rm) */
-#define _PUSH_ 3   /* Need to check for space to push onto stack */
-#define _null_ 4   /* Function illegal or not implemented */
-#define _REGIi 5   /* Uses st(0) and st(rm), result to st(rm) */
-#define _REGIp 6   /* Uses st(0) and st(rm), result to st(rm) then pop */
-#define _REGIc 0   /* Compare st(0) and st(rm) */
-#define _REGIn 0   /* Uses st(0) and st(rm), but handle checks later */
+#define _NONE_ 0               /* Take no special action */
+#define _REG0_ 1               /* Need to check for not empty st(0) */
+#define _REGI_ 2               /* Need to check for not empty st(0) and st(rm) */
+#define _REGi_ 0               /* Uses st(rm) */
+#define _PUSH_ 3               /* Need to check for space to push onto stack */
+#define _null_ 4               /* Function illegal or not implemented */
+#define _REGIi 5               /* Uses st(0) and st(rm), result to st(rm) */
+#define _REGIp 6               /* Uses st(0) and st(rm), result to st(rm) then pop */
+#define _REGIc 0               /* Compare st(0) and st(rm) */
+#define _REGIn 0               /* Uses st(0) and st(rm), but handle checks later */
 
 #ifndef NO_UNDOC_CODE
 
 /* Un-documented FPU op-codes supported by default. (see above) */
 
 static u_char const type_table[64] = {
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
-  _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
-  _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-  _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
-  _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-  _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_,
+       _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_,
+       _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
+       _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_,
+       _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+       _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
 };
 
-#else     /* Support only documented FPU op-codes */
+#else /* Support only documented FPU op-codes */
 
 static u_char const type_table[64] = {
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
-  _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-  _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
-  _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
-  _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
-  _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
-  _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_,
+       _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
+       _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_,
+       _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_,
+       _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+       _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_,
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
+       _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_
 };
 
 #endif /* NO_UNDOC_CODE */
 
-
 #ifdef RE_ENTRANT_CHECKING
-u_char emulating=0;
+u_char emulating = 0;
 #endif /* RE_ENTRANT_CHECKING */
 
-static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
-                       overrides *override);
+static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip,
+                       overrides * override);
 
 asmlinkage void math_emulate(long arg)
 {
-  u_char  FPU_modrm, byte1;
-  unsigned short code;
-  fpu_addr_modes addr_modes;
-  int unmasked;
-  FPU_REG loaded_data;
-  FPU_REG *st0_ptr;
-  u_char         loaded_tag, st0_tag;
-  void __user *data_address;
-  struct address data_sel_off;
-  struct address entry_sel_off;
-  unsigned long code_base = 0;
-  unsigned long code_limit = 0;  /* Initialized to stop compiler warnings */
-  struct desc_struct code_descriptor;
+       u_char FPU_modrm, byte1;
+       unsigned short code;
+       fpu_addr_modes addr_modes;
+       int unmasked;
+       FPU_REG loaded_data;
+       FPU_REG *st0_ptr;
+       u_char loaded_tag, st0_tag;
+       void __user *data_address;
+       struct address data_sel_off;
+       struct address entry_sel_off;
+       unsigned long code_base = 0;
+       unsigned long code_limit = 0;   /* Initialized to stop compiler warnings */
+       struct desc_struct code_descriptor;
 
 #ifdef RE_ENTRANT_CHECKING
-  if ( emulating )
-    {
-      printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
-    }
-  RE_ENTRANT_CHECK_ON;
+       if (emulating) {
+               printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
+       }
+       RE_ENTRANT_CHECK_ON;
 #endif /* RE_ENTRANT_CHECKING */
 
-  if (!used_math())
-    {
-      finit();
-      set_used_math();
-    }
-
-  SETUP_DATA_AREA(arg);
-
-  FPU_ORIG_EIP = FPU_EIP;
-
-  if ( (FPU_EFLAGS & 0x00020000) != 0 )
-    {
-      /* Virtual 8086 mode */
-      addr_modes.default_mode = VM86;
-      FPU_EIP += code_base = FPU_CS << 4;
-      code_limit = code_base + 0xffff;  /* Assumes code_base <= 0xffff0000 */
-    }
-  else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS )
-    {
-      addr_modes.default_mode = 0;
-    }
-  else if ( FPU_CS == __KERNEL_CS )
-    {
-      printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP);
-      panic("Math emulation needed in kernel");
-    }
-  else
-    {
-
-      if ( (FPU_CS & 4) != 4 )   /* Must be in the LDT */
-       {
-         /* Can only handle segmented addressing via the LDT
-            for now, and it must be 16 bit */
-         printk("FPU emulator: Unsupported addressing mode\n");
-         math_abort(FPU_info, SIGILL);
+       if (!used_math()) {
+               finit();
+               set_used_math();
        }
 
-      code_descriptor = LDT_DESCRIPTOR(FPU_CS);
-      if ( SEG_D_SIZE(code_descriptor) )
-       {
-         /* The above test may be wrong, the book is not clear */
-         /* Segmented 32 bit protected mode */
-         addr_modes.default_mode = SEG32;
+       SETUP_DATA_AREA(arg);
+
+       FPU_ORIG_EIP = FPU_EIP;
+
+       if ((FPU_EFLAGS & 0x00020000) != 0) {
+               /* Virtual 8086 mode */
+               addr_modes.default_mode = VM86;
+               FPU_EIP += code_base = FPU_CS << 4;
+               code_limit = code_base + 0xffff;        /* Assumes code_base <= 0xffff0000 */
+       } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) {
+               addr_modes.default_mode = 0;
+       } else if (FPU_CS == __KERNEL_CS) {
+               printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP);
+               panic("Math emulation needed in kernel");
+       } else {
+
+               if ((FPU_CS & 4) != 4) {        /* Must be in the LDT */
+                       /* Can only handle segmented addressing via the LDT
+                          for now, and it must be 16 bit */
+                       printk("FPU emulator: Unsupported addressing mode\n");
+                       math_abort(FPU_info, SIGILL);
+               }
+
+               code_descriptor = LDT_DESCRIPTOR(FPU_CS);
+               if (SEG_D_SIZE(code_descriptor)) {
+                       /* The above test may be wrong, the book is not clear */
+                       /* Segmented 32 bit protected mode */
+                       addr_modes.default_mode = SEG32;
+               } else {
+                       /* 16 bit protected mode */
+                       addr_modes.default_mode = PM16;
+               }
+               FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
+               code_limit = code_base
+                   + (SEG_LIMIT(code_descriptor) +
+                      1) * SEG_GRANULARITY(code_descriptor)
+                   - 1;
+               if (code_limit < code_base)
+                       code_limit = 0xffffffff;
        }
-      else
-       {
-         /* 16 bit protected mode */
-         addr_modes.default_mode = PM16;
+
+       FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF);
+
+       if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
+                         &addr_modes.override)) {
+               RE_ENTRANT_CHECK_OFF;
+               printk
+                   ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
+                    "FPU emulator: self-modifying code! (emulation impossible)\n",
+                    byte1);
+               RE_ENTRANT_CHECK_ON;
+               EXCEPTION(EX_INTERNAL | 0x126);
+               math_abort(FPU_info, SIGILL);
        }
-      FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
-      code_limit = code_base
-       + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor)
-         - 1;
-      if ( code_limit < code_base ) code_limit = 0xffffffff;
-    }
-
-  FPU_lookahead = 1;
-  if (current->ptrace & PT_PTRACED)
-    FPU_lookahead = 0;
-
-  if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
-                    &addr_modes.override) )
-    {
-      RE_ENTRANT_CHECK_OFF;
-      printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
-            "FPU emulator: self-modifying code! (emulation impossible)\n",
-            byte1);
-      RE_ENTRANT_CHECK_ON;
-      EXCEPTION(EX_INTERNAL|0x126);
-      math_abort(FPU_info,SIGILL);
-    }
-
-do_another_FPU_instruction:
-
-  no_ip_update = 0;
-
-  FPU_EIP++;  /* We have fetched the prefix and first code bytes. */
-
-  if ( addr_modes.default_mode )
-    {
-      /* This checks for the minimum instruction bytes.
-        We also need to check any extra (address mode) code access. */
-      if ( FPU_EIP > code_limit )
-       math_abort(FPU_info,SIGSEGV);
-    }
-
-  if ( (byte1 & 0xf8) != 0xd8 )
-    {
-      if ( byte1 == FWAIT_OPCODE )
-       {
-         if (partial_status & SW_Summary)
-           goto do_the_FPU_interrupt;
-         else
-           goto FPU_fwait_done;
+
+      do_another_FPU_instruction:
+
+       no_ip_update = 0;
+
+       FPU_EIP++;              /* We have fetched the prefix and first code bytes. */
+
+       if (addr_modes.default_mode) {
+               /* This checks for the minimum instruction bytes.
+                  We also need to check any extra (address mode) code access. */
+               if (FPU_EIP > code_limit)
+                       math_abort(FPU_info, SIGSEGV);
        }
+
+       if ((byte1 & 0xf8) != 0xd8) {
+               if (byte1 == FWAIT_OPCODE) {
+                       if (partial_status & SW_Summary)
+                               goto do_the_FPU_interrupt;
+                       else
+                               goto FPU_fwait_done;
+               }
 #ifdef PARANOID
-      EXCEPTION(EX_INTERNAL|0x128);
-      math_abort(FPU_info,SIGILL);
+               EXCEPTION(EX_INTERNAL | 0x128);
+               math_abort(FPU_info, SIGILL);
 #endif /* PARANOID */
-    }
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_code_access_ok(1);
-  FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
-  RE_ENTRANT_CHECK_ON;
-  FPU_EIP++;
-
-  if (partial_status & SW_Summary)
-    {
-      /* Ignore the error for now if the current instruction is a no-wait
-        control instruction */
-      /* The 80486 manual contradicts itself on this topic,
-        but a real 80486 uses the following instructions:
-        fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
-       */
-      code = (FPU_modrm << 8) | byte1;
-      if ( ! ( (((code & 0xf803) == 0xe003) ||    /* fnclex, fninit, fnstsw */
-               (((code & 0x3003) == 0x3001) &&   /* fnsave, fnstcw, fnstenv,
-                                                    fnstsw */
-                ((code & 0xc000) != 0xc000))) ) )
-       {
-         /*
-          *  We need to simulate the action of the kernel to FPU
-          *  interrupts here.
-          */
-       do_the_FPU_interrupt:
-
-         FPU_EIP = FPU_ORIG_EIP;       /* Point to current FPU instruction. */
-
-         RE_ENTRANT_CHECK_OFF;
-         current->thread.trap_no = 16;
-         current->thread.error_code = 0;
-         send_sig(SIGFPE, current, 1);
-         return;
-       }
-    }
-
-  entry_sel_off.offset = FPU_ORIG_EIP;
-  entry_sel_off.selector = FPU_CS;
-  entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
-
-  FPU_rm = FPU_modrm & 7;
-
-  if ( FPU_modrm < 0300 )
-    {
-      /* All of these instructions use the mod/rm byte to get a data address */
-
-      if ( (addr_modes.default_mode & SIXTEEN)
-         ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) )
-       data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off,
-                                         addr_modes);
-      else
-       data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
-                                      addr_modes);
-
-      if ( addr_modes.default_mode )
-       {
-         if ( FPU_EIP-1 > code_limit )
-           math_abort(FPU_info,SIGSEGV);
        }
 
-      if ( !(byte1 & 1) )
-       {
-         unsigned short status1 = partial_status;
-
-         st0_ptr = &st(0);
-         st0_tag = FPU_gettag0();
-
-         /* Stack underflow has priority */
-         if ( NOT_EMPTY_ST0 )
-           {
-             if ( addr_modes.default_mode & PROTECTED )
-               {
-                 /* This table works for 16 and 32 bit protected mode */
-                 if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] )
-                   math_abort(FPU_info,SIGSEGV);
+       RE_ENTRANT_CHECK_OFF;
+       FPU_code_access_ok(1);
+       FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
+       RE_ENTRANT_CHECK_ON;
+       FPU_EIP++;
+
+       if (partial_status & SW_Summary) {
+               /* Ignore the error for now if the current instruction is a no-wait
+                  control instruction */
+               /* The 80486 manual contradicts itself on this topic,
+                  but a real 80486 uses the following instructions:
+                  fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
+                */
+               code = (FPU_modrm << 8) | byte1;
+               if (!((((code & 0xf803) == 0xe003) ||   /* fnclex, fninit, fnstsw */
+                      (((code & 0x3003) == 0x3001) &&  /* fnsave, fnstcw, fnstenv,
+                                                          fnstsw */
+                       ((code & 0xc000) != 0xc000))))) {
+                       /*
+                        *  We need to simulate the action of the kernel to FPU
+                        *  interrupts here.
+                        */
+                     do_the_FPU_interrupt:
+
+                       FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
+
+                       RE_ENTRANT_CHECK_OFF;
+                       current->thread.trap_no = 16;
+                       current->thread.error_code = 0;
+                       send_sig(SIGFPE, current, 1);
+                       return;
                }
+       }
 
-             unmasked = 0;  /* Do this here to stop compiler warnings. */
-             switch ( (byte1 >> 1) & 3 )
-               {
-               case 0:
-                 unmasked = FPU_load_single((float __user *)data_address,
-                                            &loaded_data);
-                 loaded_tag = unmasked & 0xff;
-                 unmasked &= ~0xff;
-                 break;
-               case 1:
-                 loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
-                 break;
-               case 2:
-                 unmasked = FPU_load_double((double __user *)data_address,
-                                            &loaded_data);
-                 loaded_tag = unmasked & 0xff;
-                 unmasked &= ~0xff;
-                 break;
-               case 3:
-               default:  /* Used here to suppress gcc warnings. */
-                 loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
-                 break;
-               }
+       entry_sel_off.offset = FPU_ORIG_EIP;
+       entry_sel_off.selector = FPU_CS;
+       entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
 
-             /* No more access to user memory, it is safe
-                to use static data now */
-
-             /* NaN operands have the next priority. */
-             /* We have to delay looking at st(0) until after
-                loading the data, because that data might contain an SNaN */
-             if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) ||
-                 ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) )
-               {
-                 /* Restore the status word; we might have loaded a
-                    denormal. */
-                 partial_status = status1;
-                 if ( (FPU_modrm & 0x30) == 0x10 )
-                   {
-                     /* fcom or fcomp */
-                     EXCEPTION(EX_Invalid);
-                     setcc(SW_C3 | SW_C2 | SW_C0);
-                     if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
-                       FPU_pop();             /* fcomp, masked, so we pop. */
-                   }
-                 else
-                   {
-                     if ( loaded_tag == TAG_Special )
-                       loaded_tag = FPU_Special(&loaded_data);
-#ifdef PECULIAR_486
-                     /* This is not really needed, but gives behaviour
-                        identical to an 80486 */
-                     if ( (FPU_modrm & 0x28) == 0x20 )
-                       /* fdiv or fsub */
-                       real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data);
-                     else
-#endif /* PECULIAR_486 */ 
-                       /* fadd, fdivr, fmul, or fsubr */
-                       real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr);
-                   }
-                 goto reg_mem_instr_done;
-               }
+       FPU_rm = FPU_modrm & 7;
 
-             if ( unmasked && !((FPU_modrm & 0x30) == 0x10) )
-               {
-                 /* Is not a comparison instruction. */
-                 if ( (FPU_modrm & 0x38) == 0x38 )
-                   {
-                     /* fdivr */
-                     if ( (st0_tag == TAG_Zero) &&
-                          ((loaded_tag == TAG_Valid)
-                           || (loaded_tag == TAG_Special
-                               && isdenormal(&loaded_data))) )
-                       {
-                         if ( FPU_divide_by_zero(0, getsign(&loaded_data))
-                              < 0 )
-                           {
-                             /* We use the fact here that the unmasked
-                                exception in the loaded data was for a
-                                denormal operand */
-                             /* Restore the state of the denormal op bit */
-                             partial_status &= ~SW_Denorm_Op;
-                             partial_status |= status1 & SW_Denorm_Op;
-                           }
-                         else
-                           setsign(st0_ptr, getsign(&loaded_data));
-                       }
-                   }
-                 goto reg_mem_instr_done;
-               }
+       if (FPU_modrm < 0300) {
+               /* All of these instructions use the mod/rm byte to get a data address */
 
-             switch ( (FPU_modrm >> 3) & 7 )
-               {
-               case 0:         /* fadd */
-                 clear_C1();
-                 FPU_add(&loaded_data, loaded_tag, 0, control_word);
-                 break;
-               case 1:         /* fmul */
-                 clear_C1();
-                 FPU_mul(&loaded_data, loaded_tag, 0, control_word);
-                 break;
-               case 2:         /* fcom */
-                 FPU_compare_st_data(&loaded_data, loaded_tag);
-                 break;
-               case 3:         /* fcomp */
-                 if ( !FPU_compare_st_data(&loaded_data, loaded_tag)
-                      && !unmasked )
-                   FPU_pop();
-                 break;
-               case 4:         /* fsub */
-                 clear_C1();
-                 FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word);
-                 break;
-               case 5:         /* fsubr */
-                 clear_C1();
-                 FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
-                 break;
-               case 6:         /* fdiv */
-                 clear_C1();
-                 FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word);
-                 break;
-               case 7:         /* fdivr */
-                 clear_C1();
-                 if ( st0_tag == TAG_Zero )
-                   partial_status = status1;  /* Undo any denorm tag,
-                                                 zero-divide has priority. */
-                 FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word);
-                 break;
+               if ((addr_modes.default_mode & SIXTEEN)
+                   ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX))
+                       data_address =
+                           FPU_get_address_16(FPU_modrm, &FPU_EIP,
+                                              &data_sel_off, addr_modes);
+               else
+                       data_address =
+                           FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
+                                           addr_modes);
+
+               if (addr_modes.default_mode) {
+                       if (FPU_EIP - 1 > code_limit)
+                               math_abort(FPU_info, SIGSEGV);
                }
-           }
-         else
-           {
-             if ( (FPU_modrm & 0x30) == 0x10 )
-               {
-                 /* The instruction is fcom or fcomp */
-                 EXCEPTION(EX_StackUnder);
-                 setcc(SW_C3 | SW_C2 | SW_C0);
-                 if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) )
-                   FPU_pop();             /* fcomp */
+
+               if (!(byte1 & 1)) {
+                       unsigned short status1 = partial_status;
+
+                       st0_ptr = &st(0);
+                       st0_tag = FPU_gettag0();
+
+                       /* Stack underflow has priority */
+                       if (NOT_EMPTY_ST0) {
+                               if (addr_modes.default_mode & PROTECTED) {
+                                       /* This table works for 16 and 32 bit protected mode */
+                                       if (access_limit <
+                                           data_sizes_16[(byte1 >> 1) & 3])
+                                               math_abort(FPU_info, SIGSEGV);
+                               }
+
+                               unmasked = 0;   /* Do this here to stop compiler warnings. */
+                               switch ((byte1 >> 1) & 3) {
+                               case 0:
+                                       unmasked =
+                                           FPU_load_single((float __user *)
+                                                           data_address,
+                                                           &loaded_data);
+                                       loaded_tag = unmasked & 0xff;
+                                       unmasked &= ~0xff;
+                                       break;
+                               case 1:
+                                       loaded_tag =
+                                           FPU_load_int32((long __user *)
+                                                          data_address,
+                                                          &loaded_data);
+                                       break;
+                               case 2:
+                                       unmasked =
+                                           FPU_load_double((double __user *)
+                                                           data_address,
+                                                           &loaded_data);
+                                       loaded_tag = unmasked & 0xff;
+                                       unmasked &= ~0xff;
+                                       break;
+                               case 3:
+                               default:        /* Used here to suppress gcc warnings. */
+                                       loaded_tag =
+                                           FPU_load_int16((short __user *)
+                                                          data_address,
+                                                          &loaded_data);
+                                       break;
+                               }
+
+                               /* No more access to user memory, it is safe
+                                  to use static data now */
+
+                               /* NaN operands have the next priority. */
+                               /* We have to delay looking at st(0) until after
+                                  loading the data, because that data might contain an SNaN */
+                               if (((st0_tag == TAG_Special) && isNaN(st0_ptr))
+                                   || ((loaded_tag == TAG_Special)
+                                       && isNaN(&loaded_data))) {
+                                       /* Restore the status word; we might have loaded a
+                                          denormal. */
+                                       partial_status = status1;
+                                       if ((FPU_modrm & 0x30) == 0x10) {
+                                               /* fcom or fcomp */
+                                               EXCEPTION(EX_Invalid);
+                                               setcc(SW_C3 | SW_C2 | SW_C0);
+                                               if ((FPU_modrm & 0x08)
+                                                   && (control_word &
+                                                       CW_Invalid))
+                                                       FPU_pop();      /* fcomp, masked, so we pop. */
+                                       } else {
+                                               if (loaded_tag == TAG_Special)
+                                                       loaded_tag =
+                                                           FPU_Special
+                                                           (&loaded_data);
+#ifdef PECULIAR_486
+                                               /* This is not really needed, but gives behaviour
+                                                  identical to an 80486 */
+                                               if ((FPU_modrm & 0x28) == 0x20)
+                                                       /* fdiv or fsub */
+                                                       real_2op_NaN
+                                                           (&loaded_data,
+                                                            loaded_tag, 0,
+                                                            &loaded_data);
+                                               else
+#endif /* PECULIAR_486 */
+                                                       /* fadd, fdivr, fmul, or fsubr */
+                                                       real_2op_NaN
+                                                           (&loaded_data,
+                                                            loaded_tag, 0,
+                                                            st0_ptr);
+                                       }
+                                       goto reg_mem_instr_done;
+                               }
+
+                               if (unmasked && !((FPU_modrm & 0x30) == 0x10)) {
+                                       /* Is not a comparison instruction. */
+                                       if ((FPU_modrm & 0x38) == 0x38) {
+                                               /* fdivr */
+                                               if ((st0_tag == TAG_Zero) &&
+                                                   ((loaded_tag == TAG_Valid)
+                                                    || (loaded_tag ==
+                                                        TAG_Special
+                                                        &&
+                                                        isdenormal
+                                                        (&loaded_data)))) {
+                                                       if (FPU_divide_by_zero
+                                                           (0,
+                                                            getsign
+                                                            (&loaded_data))
+                                                           < 0) {
+                                                               /* We use the fact here that the unmasked
+                                                                  exception in the loaded data was for a
+                                                                  denormal operand */
+                                                               /* Restore the state of the denormal op bit */
+                                                               partial_status
+                                                                   &=
+                                                                   ~SW_Denorm_Op;
+                                                               partial_status
+                                                                   |=
+                                                                   status1 &
+                                                                   SW_Denorm_Op;
+                                                       } else
+                                                               setsign(st0_ptr,
+                                                                       getsign
+                                                                       (&loaded_data));
+                                               }
+                                       }
+                                       goto reg_mem_instr_done;
+                               }
+
+                               switch ((FPU_modrm >> 3) & 7) {
+                               case 0: /* fadd */
+                                       clear_C1();
+                                       FPU_add(&loaded_data, loaded_tag, 0,
+                                               control_word);
+                                       break;
+                               case 1: /* fmul */
+                                       clear_C1();
+                                       FPU_mul(&loaded_data, loaded_tag, 0,
+                                               control_word);
+                                       break;
+                               case 2: /* fcom */
+                                       FPU_compare_st_data(&loaded_data,
+                                                           loaded_tag);
+                                       break;
+                               case 3: /* fcomp */
+                                       if (!FPU_compare_st_data
+                                           (&loaded_data, loaded_tag)
+                                           && !unmasked)
+                                               FPU_pop();
+                                       break;
+                               case 4: /* fsub */
+                                       clear_C1();
+                                       FPU_sub(LOADED | loaded_tag,
+                                               (int)&loaded_data,
+                                               control_word);
+                                       break;
+                               case 5: /* fsubr */
+                                       clear_C1();
+                                       FPU_sub(REV | LOADED | loaded_tag,
+                                               (int)&loaded_data,
+                                               control_word);
+                                       break;
+                               case 6: /* fdiv */
+                                       clear_C1();
+                                       FPU_div(LOADED | loaded_tag,
+                                               (int)&loaded_data,
+                                               control_word);
+                                       break;
+                               case 7: /* fdivr */
+                                       clear_C1();
+                                       if (st0_tag == TAG_Zero)
+                                               partial_status = status1;       /* Undo any denorm tag,
+                                                                                  zero-divide has priority. */
+                                       FPU_div(REV | LOADED | loaded_tag,
+                                               (int)&loaded_data,
+                                               control_word);
+                                       break;
+                               }
+                       } else {
+                               if ((FPU_modrm & 0x30) == 0x10) {
+                                       /* The instruction is fcom or fcomp */
+                                       EXCEPTION(EX_StackUnder);
+                                       setcc(SW_C3 | SW_C2 | SW_C0);
+                                       if ((FPU_modrm & 0x08)
+                                           && (control_word & CW_Invalid))
+                                               FPU_pop();      /* fcomp */
+                               } else
+                                       FPU_stack_underflow();
+                       }
+                     reg_mem_instr_done:
+                       operand_address = data_sel_off;
+               } else {
+                       if (!(no_ip_update =
+                             FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6))
+                                            >> 1, addr_modes, data_address))) {
+                               operand_address = data_sel_off;
+                       }
                }
-             else
-               FPU_stack_underflow();
-           }
-       reg_mem_instr_done:
-         operand_address = data_sel_off;
-       }
-      else
-       {
-         if ( !(no_ip_update =
-                FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1,
-                               addr_modes, data_address)) )
-           {
-             operand_address = data_sel_off;
-           }
-       }
 
-    }
-  else
-    {
-      /* None of these instructions access user memory */
-      u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
+       } else {
+               /* None of these instructions access user memory */
+               u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
 
 #ifdef PECULIAR_486
-      /* This is supposed to be undefined, but a real 80486 seems
-        to do this: */
-      operand_address.offset = 0;
-      operand_address.selector = FPU_DS;
+               /* This is supposed to be undefined, but a real 80486 seems
+                  to do this: */
+               operand_address.offset = 0;
+               operand_address.selector = FPU_DS;
 #endif /* PECULIAR_486 */
 
-      st0_ptr = &st(0);
-      st0_tag = FPU_gettag0();
-      switch ( type_table[(int) instr_index] )
-       {
-       case _NONE_:   /* also _REGIc: _REGIn */
-         break;
-       case _REG0_:
-         if ( !NOT_EMPTY_ST0 )
-           {
-             FPU_stack_underflow();
-             goto FPU_instruction_done;
-           }
-         break;
-       case _REGIi:
-         if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
-           {
-             FPU_stack_underflow_i(FPU_rm);
-             goto FPU_instruction_done;
-           }
-         break;
-       case _REGIp:
-         if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
-           {
-             FPU_stack_underflow_pop(FPU_rm);
-             goto FPU_instruction_done;
-           }
-         break;
-       case _REGI_:
-         if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) )
-           {
-             FPU_stack_underflow();
-             goto FPU_instruction_done;
-           }
-         break;
-       case _PUSH_:     /* Only used by the fld st(i) instruction */
-         break;
-       case _null_:
-         FPU_illegal();
-         goto FPU_instruction_done;
-       default:
-         EXCEPTION(EX_INTERNAL|0x111);
-         goto FPU_instruction_done;
-       }
-      (*st_instr_table[(int) instr_index])();
+               st0_ptr = &st(0);
+               st0_tag = FPU_gettag0();
+               switch (type_table[(int)instr_index]) {
+               case _NONE_:    /* also _REGIc: _REGIn */
+                       break;
+               case _REG0_:
+                       if (!NOT_EMPTY_ST0) {
+                               FPU_stack_underflow();
+                               goto FPU_instruction_done;
+                       }
+                       break;
+               case _REGIi:
+                       if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+                               FPU_stack_underflow_i(FPU_rm);
+                               goto FPU_instruction_done;
+                       }
+                       break;
+               case _REGIp:
+                       if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+                               FPU_stack_underflow_pop(FPU_rm);
+                               goto FPU_instruction_done;
+                       }
+                       break;
+               case _REGI_:
+                       if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+                               FPU_stack_underflow();
+                               goto FPU_instruction_done;
+                       }
+                       break;
+               case _PUSH_:    /* Only used by the fld st(i) instruction */
+                       break;
+               case _null_:
+                       FPU_illegal();
+                       goto FPU_instruction_done;
+               default:
+                       EXCEPTION(EX_INTERNAL | 0x111);
+                       goto FPU_instruction_done;
+               }
+               (*st_instr_table[(int)instr_index]) ();
 
-FPU_instruction_done:
-      ;
-    }
+             FPU_instruction_done:
+               ;
+       }
 
-  if ( ! no_ip_update )
-    instruction_address = entry_sel_off;
+       if (!no_ip_update)
+               instruction_address = entry_sel_off;
 
-FPU_fwait_done:
+      FPU_fwait_done:
 
 #ifdef DEBUG
-  RE_ENTRANT_CHECK_OFF;
-  FPU_printall();
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_printall();
+       RE_ENTRANT_CHECK_ON;
 #endif /* DEBUG */
 
-  if (FPU_lookahead && !need_resched())
-    {
-      FPU_ORIG_EIP = FPU_EIP - code_base;
-      if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP,
-                       &addr_modes.override) )
-       goto do_another_FPU_instruction;
-    }
+       if (FPU_lookahead && !need_resched()) {
+               FPU_ORIG_EIP = FPU_EIP - code_base;
+               if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
+                                &addr_modes.override))
+                       goto do_another_FPU_instruction;
+       }
 
-  if ( addr_modes.default_mode )
-    FPU_EIP -= code_base;
+       if (addr_modes.default_mode)
+               FPU_EIP -= code_base;
 
-  RE_ENTRANT_CHECK_OFF;
+       RE_ENTRANT_CHECK_OFF;
 }
 
-
 /* Support for prefix bytes is not yet complete. To properly handle
    all prefix bytes, further changes are needed in the emulator code
    which accesses user address space. Access to separate segments is
    important for msdos emulation. */
 static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
-                       overrides *override)
+                       overrides * override)
 {
-  u_char byte;
-  u_char __user *ip = *fpu_eip;
-
-  *override = (overrides) { 0, 0, PREFIX_DEFAULT };       /* defaults */
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_code_access_ok(1);
-  FPU_get_user(byte, ip);
-  RE_ENTRANT_CHECK_ON;
-
-  while ( 1 )
-    {
-      switch ( byte )
-       {
-       case ADDR_SIZE_PREFIX:
-         override->address_size = ADDR_SIZE_PREFIX;
-         goto do_next_byte;
-
-       case OP_SIZE_PREFIX:
-         override->operand_size = OP_SIZE_PREFIX;
-         goto do_next_byte;
-
-       case PREFIX_CS:
-         override->segment = PREFIX_CS_;
-         goto do_next_byte;
-       case PREFIX_ES:
-         override->segment = PREFIX_ES_;
-         goto do_next_byte;
-       case PREFIX_SS:
-         override->segment = PREFIX_SS_;
-         goto do_next_byte;
-       case PREFIX_FS:
-         override->segment = PREFIX_FS_;
-         goto do_next_byte;
-       case PREFIX_GS:
-         override->segment = PREFIX_GS_;
-         goto do_next_byte;
-       case PREFIX_DS:
-         override->segment = PREFIX_DS_;
-         goto do_next_byte;
+       u_char byte;
+       u_char __user *ip = *fpu_eip;
+
+       *override = (overrides) {
+       0, 0, PREFIX_DEFAULT};  /* defaults */
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_code_access_ok(1);
+       FPU_get_user(byte, ip);
+       RE_ENTRANT_CHECK_ON;
+
+       while (1) {
+               switch (byte) {
+               case ADDR_SIZE_PREFIX:
+                       override->address_size = ADDR_SIZE_PREFIX;
+                       goto do_next_byte;
+
+               case OP_SIZE_PREFIX:
+                       override->operand_size = OP_SIZE_PREFIX;
+                       goto do_next_byte;
+
+               case PREFIX_CS:
+                       override->segment = PREFIX_CS_;
+                       goto do_next_byte;
+               case PREFIX_ES:
+                       override->segment = PREFIX_ES_;
+                       goto do_next_byte;
+               case PREFIX_SS:
+                       override->segment = PREFIX_SS_;
+                       goto do_next_byte;
+               case PREFIX_FS:
+                       override->segment = PREFIX_FS_;
+                       goto do_next_byte;
+               case PREFIX_GS:
+                       override->segment = PREFIX_GS_;
+                       goto do_next_byte;
+               case PREFIX_DS:
+                       override->segment = PREFIX_DS_;
+                       goto do_next_byte;
 
 /* lock is not a valid prefix for FPU instructions,
    let the cpu handle it to generate a SIGILL. */
 /*     case PREFIX_LOCK: */
 
-         /* rep.. prefixes have no meaning for FPU instructions */
-       case PREFIX_REPE:
-       case PREFIX_REPNE:
-
-       do_next_byte:
-         ip++;
-         RE_ENTRANT_CHECK_OFF;
-         FPU_code_access_ok(1);
-         FPU_get_user(byte, ip);
-         RE_ENTRANT_CHECK_ON;
-         break;
-       case FWAIT_OPCODE:
-         *Byte = byte;
-         return 1;
-       default:
-         if ( (byte & 0xf8) == 0xd8 )
-           {
-             *Byte = byte;
-             *fpu_eip = ip;
-             return 1;
-           }
-         else
-           {
-             /* Not a valid sequence of prefix bytes followed by
-                an FPU instruction. */
-             *Byte = byte;  /* Needed for error message. */
-             return 0;
-           }
+                       /* rep.. prefixes have no meaning for FPU instructions */
+               case PREFIX_REPE:
+               case PREFIX_REPNE:
+
+                     do_next_byte:
+                       ip++;
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_code_access_ok(1);
+                       FPU_get_user(byte, ip);
+                       RE_ENTRANT_CHECK_ON;
+                       break;
+               case FWAIT_OPCODE:
+                       *Byte = byte;
+                       return 1;
+               default:
+                       if ((byte & 0xf8) == 0xd8) {
+                               *Byte = byte;
+                               *fpu_eip = ip;
+                               return 1;
+                       } else {
+                               /* Not a valid sequence of prefix bytes followed by
+                                  an FPU instruction. */
+                               *Byte = byte;   /* Needed for error message. */
+                               return 0;
+                       }
+               }
        }
-    }
 }
 
-
-void math_abort(struct info * info, unsigned int signal)
+void math_abort(struct info *info, unsigned int signal)
 {
        FPU_EIP = FPU_ORIG_EIP;
        current->thread.trap_no = 16;
        current->thread.error_code = 0;
-       send_sig(signal,current,1);
+       send_sig(signal, current, 1);
        RE_ENTRANT_CHECK_OFF;
-       __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4));
+      __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4));
 #ifdef PARANOID
-      printk("ERROR: wm-FPU-emu math_abort failed!\n");
+       printk("ERROR: wm-FPU-emu math_abort failed!\n");
 #endif /* PARANOID */
 }
 
-
-
 #define S387 ((struct i387_soft_struct *)s387)
 #define sstatus_word() \
   ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
 
-int restore_i387_soft(void *s387, struct _fpstate __user *buf)
+int fpregs_soft_set(struct task_struct *target,
+                   const struct user_regset *regset,
+                   unsigned int pos, unsigned int count,
+                   const void *kbuf, const void __user *ubuf)
 {
-  u_char __user *d = (u_char __user *)buf;
-  int offset, other, i, tags, regnr, tag, newtop;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10);
-  if (__copy_from_user(&S387->cwd, d, 7*4))
-    return -1;
-  RE_ENTRANT_CHECK_ON;
-
-  d += 7*4;
-
-  S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
-  offset = (S387->ftop & 7) * 10;
-  other = 80 - offset;
-
-  RE_ENTRANT_CHECK_OFF;
-  /* Copy all registers in stack order. */
-  if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other))
-    return -1;
-  if ( offset )
-    if (__copy_from_user((u_char *)&S387->st_space, d+other, offset))
-      return -1;
-  RE_ENTRANT_CHECK_ON;
-
-  /* The tags may need to be corrected now. */
-  tags = S387->twd;
-  newtop = S387->ftop;
-  for ( i = 0; i < 8; i++ )
-    {
-      regnr = (i+newtop) & 7;
-      if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty )
-       {
-         /* The loaded data over-rides all other cases. */
-         tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr));
-         tags &= ~(3 << (regnr*2));
-         tags |= (tag & 3) << (regnr*2);
+       struct i387_soft_struct *s387 = &target->thread.i387.soft;
+       void *space = s387->st_space;
+       int ret;
+       int offset, other, i, tags, regnr, tag, newtop;
+
+       RE_ENTRANT_CHECK_OFF;
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
+                                offsetof(struct i387_soft_struct, st_space));
+       RE_ENTRANT_CHECK_ON;
+
+       if (ret)
+               return ret;
+
+       S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
+       offset = (S387->ftop & 7) * 10;
+       other = 80 - offset;
+
+       RE_ENTRANT_CHECK_OFF;
+
+       /* Copy all registers in stack order. */
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                space + offset, 0, other);
+       if (!ret && offset)
+               ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                        space, 0, offset);
+
+       RE_ENTRANT_CHECK_ON;
+
+       /* The tags may need to be corrected now. */
+       tags = S387->twd;
+       newtop = S387->ftop;
+       for (i = 0; i < 8; i++) {
+               regnr = (i + newtop) & 7;
+               if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) {
+                       /* The loaded data over-rides all other cases. */
+                       tag =
+                           FPU_tagof((FPU_REG *) ((u_char *) S387->st_space +
+                                                  10 * regnr));
+                       tags &= ~(3 << (regnr * 2));
+                       tags |= (tag & 3) << (regnr * 2);
+               }
        }
-    }
-  S387->twd = tags;
+       S387->twd = tags;
 
-  return 0;
+       return ret;
 }
 
-
-int save_i387_soft(void *s387, struct _fpstate __user * buf)
+int fpregs_soft_get(struct task_struct *target,
+                   const struct user_regset *regset,
+                   unsigned int pos, unsigned int count,
+                   void *kbuf, void __user *ubuf)
 {
-  u_char __user *d = (u_char __user *)buf;
-  int offset = (S387->ftop & 7) * 10, other = 80 - offset;
+       struct i387_soft_struct *s387 = &target->thread.i387.soft;
+       const void *space = s387->st_space;
+       int ret;
+       int offset = (S387->ftop & 7) * 10, other = 80 - offset;
+
+       RE_ENTRANT_CHECK_OFF;
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
 #ifdef PECULIAR_486
-  S387->cwd &= ~0xe080;
-  /* An 80486 sets nearly all of the reserved bits to 1. */
-  S387->cwd |= 0xffff0040;
-  S387->swd = sstatus_word() | 0xffff0000;
-  S387->twd |= 0xffff0000;
-  S387->fcs &= ~0xf8000000;
-  S387->fos |= 0xffff0000;
+       S387->cwd &= ~0xe080;
+       /* An 80486 sets nearly all of the reserved bits to 1. */
+       S387->cwd |= 0xffff0040;
+       S387->swd = sstatus_word() | 0xffff0000;
+       S387->twd |= 0xffff0000;
+       S387->fcs &= ~0xf8000000;
+       S387->fos |= 0xffff0000;
 #endif /* PECULIAR_486 */
-  if (__copy_to_user(d, &S387->cwd, 7*4))
-    return -1;
-  RE_ENTRANT_CHECK_ON;
-
-  d += 7*4;
-
-  RE_ENTRANT_CHECK_OFF;
-  /* Copy all registers in stack order. */
-  if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other))
-    return -1;
-  if ( offset )
-    if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset))
-      return -1;
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
+
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
+                                 offsetof(struct i387_soft_struct, st_space));
+
+       /* Copy all registers in stack order. */
+       if (!ret)
+               ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                         space + offset, 0, other);
+       if (!ret)
+               ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                         space, 0, offset);
+
+       RE_ENTRANT_CHECK_ON;
+
+       return ret;
 }
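
The hunk above replaces the old __copy_from_user()/__copy_to_user() based restore_i387_soft()/save_i387_soft() helpers with fpregs_soft_set()/fpregs_soft_get(), which stream the soft-FPU state through user_regset_copyin()/user_regset_copyout() so the same accessors can be reused by the regset-based ptrace and core-dump code. The copy helpers advance pos/count as they move bytes between the caller's buffer and &target->thread.i387.soft, which is why the new functions copy the seven control/status words and the 80 bytes of register space as separate chunks. As a rough sketch of how such a get/set pair is normally exposed through a regset table entry (the table name, note type and element sizes below are illustrative assumptions, not taken from this commit):

#include <linux/regset.h>
#include <linux/elf.h>
#include <asm/user.h>

/*
 * Illustrative sketch only: the usual shape of a user_regset slot that
 * points at fpregs_soft_get()/fpregs_soft_set().  The note type and
 * sizes are assumptions for the example; the real table for this
 * architecture lives outside this file.
 */
static const struct user_regset example_fp_regset = {
	.core_note_type	= NT_PRFPREG,
	.n		= sizeof(struct user_i387_struct) / sizeof(long),
	.size		= sizeof(long),
	.align		= sizeof(long),
	.get		= fpregs_soft_get,	/* defined in the hunk above */
	.set		= fpregs_soft_set,
};
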
index e3b5d465587f2a848a69890dd0d952dd9bed87c4..233e5af566f5123d53e0f8794c75c6af3f23fb89 100644 (file)
 #include "status_w.h"
 #include "reg_constant.h"
 
-
 static void fchs(FPU_REG *st0_ptr, u_char st0tag)
 {
-  if ( st0tag ^ TAG_Empty )
-    {
-      signbyte(st0_ptr) ^= SIGN_NEG;
-      clear_C1();
-    }
-  else
-    FPU_stack_underflow();
+       if (st0tag ^ TAG_Empty) {
+               signbyte(st0_ptr) ^= SIGN_NEG;
+               clear_C1();
+       } else
+               FPU_stack_underflow();
 }
 
-
 static void fabs(FPU_REG *st0_ptr, u_char st0tag)
 {
-  if ( st0tag ^ TAG_Empty )
-    {
-      setpositive(st0_ptr);
-      clear_C1();
-    }
-  else
-    FPU_stack_underflow();
+       if (st0tag ^ TAG_Empty) {
+               setpositive(st0_ptr);
+               clear_C1();
+       } else
+               FPU_stack_underflow();
 }
 
-
 static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
 {
-  switch (st0tag)
-    {
-    case TAG_Zero:
-      setcc(SW_C3);
-      break;
-    case TAG_Valid:
-      if (getsign(st0_ptr) == SIGN_POS)
-        setcc(0);
-      else
-        setcc(SW_C0);
-      break;
-    case TAG_Special:
-      switch ( FPU_Special(st0_ptr) )
-       {
-       case TW_Denormal:
-         if (getsign(st0_ptr) == SIGN_POS)
-           setcc(0);
-         else
-           setcc(SW_C0);
-         if ( denormal_operand() < 0 )
-           {
-#ifdef PECULIAR_486
-             /* This is weird! */
-             if (getsign(st0_ptr) == SIGN_POS)
+       switch (st0tag) {
+       case TAG_Zero:
                setcc(SW_C3);
+               break;
+       case TAG_Valid:
+               if (getsign(st0_ptr) == SIGN_POS)
+                       setcc(0);
+               else
+                       setcc(SW_C0);
+               break;
+       case TAG_Special:
+               switch (FPU_Special(st0_ptr)) {
+               case TW_Denormal:
+                       if (getsign(st0_ptr) == SIGN_POS)
+                               setcc(0);
+                       else
+                               setcc(SW_C0);
+                       if (denormal_operand() < 0) {
+#ifdef PECULIAR_486
+                               /* This is weird! */
+                               if (getsign(st0_ptr) == SIGN_POS)
+                                       setcc(SW_C3);
 #endif /* PECULIAR_486 */
-             return;
-           }
-         break;
-       case TW_NaN:
-         setcc(SW_C0|SW_C2|SW_C3);   /* Operand is not comparable */ 
-         EXCEPTION(EX_Invalid);
-         break;
-       case TW_Infinity:
-         if (getsign(st0_ptr) == SIGN_POS)
-           setcc(0);
-         else
-           setcc(SW_C0);
-         break;
-       default:
-         setcc(SW_C0|SW_C2|SW_C3);   /* Operand is not comparable */ 
-         EXCEPTION(EX_INTERNAL|0x14);
-         break;
+                               return;
+                       }
+                       break;
+               case TW_NaN:
+                       setcc(SW_C0 | SW_C2 | SW_C3);   /* Operand is not comparable */
+                       EXCEPTION(EX_Invalid);
+                       break;
+               case TW_Infinity:
+                       if (getsign(st0_ptr) == SIGN_POS)
+                               setcc(0);
+                       else
+                               setcc(SW_C0);
+                       break;
+               default:
+                       setcc(SW_C0 | SW_C2 | SW_C3);   /* Operand is not comparable */
+                       EXCEPTION(EX_INTERNAL | 0x14);
+                       break;
+               }
+               break;
+       case TAG_Empty:
+               setcc(SW_C0 | SW_C2 | SW_C3);
+               EXCEPTION(EX_StackUnder);
+               break;
        }
-      break;
-    case TAG_Empty:
-      setcc(SW_C0|SW_C2|SW_C3);
-      EXCEPTION(EX_StackUnder);
-      break;
-    }
 }
 
-
 static void fxam(FPU_REG *st0_ptr, u_char st0tag)
 {
-  int c = 0;
-  switch (st0tag)
-    {
-    case TAG_Empty:
-      c = SW_C3|SW_C0;
-      break;
-    case TAG_Zero:
-      c = SW_C3;
-      break;
-    case TAG_Valid:
-      c = SW_C2;
-      break;
-    case TAG_Special:
-      switch ( FPU_Special(st0_ptr) )
-       {
-       case TW_Denormal:
-         c = SW_C2|SW_C3;  /* Denormal */
-         break;
-       case TW_NaN:
-         /* We also use NaN for unsupported types. */
-         if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) )
-           c = SW_C0;
-         break;
-       case TW_Infinity:
-         c = SW_C2|SW_C0;
-         break;
+       int c = 0;
+       switch (st0tag) {
+       case TAG_Empty:
+               c = SW_C3 | SW_C0;
+               break;
+       case TAG_Zero:
+               c = SW_C3;
+               break;
+       case TAG_Valid:
+               c = SW_C2;
+               break;
+       case TAG_Special:
+               switch (FPU_Special(st0_ptr)) {
+               case TW_Denormal:
+                       c = SW_C2 | SW_C3;      /* Denormal */
+                       break;
+               case TW_NaN:
+                       /* We also use NaN for unsupported types. */
+                       if ((st0_ptr->sigh & 0x80000000)
+                           && (exponent(st0_ptr) == EXP_OVER))
+                               c = SW_C0;
+                       break;
+               case TW_Infinity:
+                       c = SW_C2 | SW_C0;
+                       break;
+               }
        }
-    }
-  if ( getsign(st0_ptr) == SIGN_NEG )
-    c |= SW_C1;
-  setcc(c);
+       if (getsign(st0_ptr) == SIGN_NEG)
+               c |= SW_C1;
+       setcc(c);
 }
 
-
 static FUNC_ST0 const fp_etc_table[] = {
-  fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal,
-  ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal
+       fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
+       ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
 };
 
 void FPU_etc(void)
 {
-  (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0());
+       (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0());
 }
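
The fxam() handler above encodes the operand class in the C3, C2 and C0 condition-code bits and the sign in C1, with fp_etc_table dispatching on FPU_rm. A minimal self-contained sketch of that condition-code encoding follows; the bit positions are the conventional x87 status-word ones (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14) and are an assumption here, not something this hunk defines.

#include <stdio.h>

/* Assumed x87 status-word bit positions; not defined in this hunk. */
#define XAM_C0 0x0100
#define XAM_C1 0x0200
#define XAM_C2 0x0400
#define XAM_C3 0x4000

static const char *xam_class(unsigned int sw)
{
        /* Same (C3, C2, C0) patterns that fxam() above produces. */
        switch (sw & (XAM_C3 | XAM_C2 | XAM_C0)) {
        case XAM_C3 | XAM_C0:
                return "empty";
        case XAM_C3:
                return "zero";
        case XAM_C2:
                return "normal";
        case XAM_C2 | XAM_C3:
                return "denormal";
        case XAM_C0:
                return "NaN";
        case XAM_C2 | XAM_C0:
                return "infinity";
        default:
                return "unsupported";
        }
}

int main(void)
{
        /* C1 carries the sign, so this is how a negative zero reads back. */
        printf("%s\n", xam_class(XAM_C3 | XAM_C1));     /* prints "zero" */
        return 0;
}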
index 37a8a7fe7e2b1926d9dc4ceae05e72e9b602363a..aa49b6a0d850c616d1f257c563d8ad92ab255d95 100644 (file)
@@ -66,7 +66,7 @@ extern int FPU_Special(FPU_REG const *ptr);
 extern int isNaN(FPU_REG const *ptr);
 extern void FPU_pop(void);
 extern int FPU_empty_i(int stnr);
-extern int FPU_stackoverflow(FPU_REG **st_new_ptr);
+extern int FPU_stackoverflow(FPU_REG ** st_new_ptr);
 extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
 extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
 extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
@@ -75,21 +75,23 @@ extern void FPU_triga(void);
 extern void FPU_trigb(void);
 /* get_address.c */
 extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
-                        struct address *addr, fpu_addr_modes addr_modes);
+                                   struct address *addr,
+                                   fpu_addr_modes addr_modes);
 extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
-                           struct address *addr, fpu_addr_modes addr_modes);
+                                      struct address *addr,
+                                      fpu_addr_modes addr_modes);
 /* load_store.c */
 extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
-                           void __user *data_address);
+                         void __user * data_address);
 /* poly_2xm1.c */
-extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result);
+extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result);
 /* poly_atan.c */
-extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
+extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
                      u_char st1_tag);
 /* poly_l2.c */
 extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
 extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
-                    FPU_REG *d);
+                    FPU_REG * d);
 /* poly_sin.c */
 extern void poly_sine(FPU_REG *st0_ptr);
 extern void poly_cos(FPU_REG *st0_ptr);
@@ -117,10 +119,13 @@ extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
 extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
 extern int FPU_load_bcd(u_char __user *s);
 extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
-                             long double __user *d);
-extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat);
-extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single);
-extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d);
+                             long double __user * d);
+extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag,
+                           double __user * dfloat);
+extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag,
+                           float __user * single);
+extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag,
+                          long long __user * d);
 extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
 extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
 extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
@@ -137,4 +142,3 @@ extern int FPU_div(int flags, int regrm, int control_w);
 /* reg_convert.c */
 extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
 #endif /* _FPU_PROTO_H */
-
index cb436fe20e4cf774b4edeac8784b5ae4c1e9b40d..d9c657cd7746ece9578766cd114f4dc9abf5a855 100644 (file)
 #include "fpu_system.h"
 #include "exception.h"
 
-
 void FPU_pop(void)
 {
-  fpu_tag_word |= 3 << ((top & 7)*2);
-  top++;
+       fpu_tag_word |= 3 << ((top & 7) * 2);
+       top++;
 }
 
-
 int FPU_gettag0(void)
 {
-  return (fpu_tag_word >> ((top & 7)*2)) & 3;
+       return (fpu_tag_word >> ((top & 7) * 2)) & 3;
 }
 
-
 int FPU_gettagi(int stnr)
 {
-  return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3;
+       return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3;
 }
 
-
 int FPU_gettag(int regnr)
 {
-  return (fpu_tag_word >> ((regnr & 7)*2)) & 3;
+       return (fpu_tag_word >> ((regnr & 7) * 2)) & 3;
 }
 
-
 void FPU_settag0(int tag)
 {
-  int regnr = top;
-  regnr &= 7;
-  fpu_tag_word &= ~(3 << (regnr*2));
-  fpu_tag_word |= (tag & 3) << (regnr*2);
+       int regnr = top;
+       regnr &= 7;
+       fpu_tag_word &= ~(3 << (regnr * 2));
+       fpu_tag_word |= (tag & 3) << (regnr * 2);
 }
 
-
 void FPU_settagi(int stnr, int tag)
 {
-  int regnr = stnr+top;
-  regnr &= 7;
-  fpu_tag_word &= ~(3 << (regnr*2));
-  fpu_tag_word |= (tag & 3) << (regnr*2);
+       int regnr = stnr + top;
+       regnr &= 7;
+       fpu_tag_word &= ~(3 << (regnr * 2));
+       fpu_tag_word |= (tag & 3) << (regnr * 2);
 }
 
-
 void FPU_settag(int regnr, int tag)
 {
-  regnr &= 7;
-  fpu_tag_word &= ~(3 << (regnr*2));
-  fpu_tag_word |= (tag & 3) << (regnr*2);
+       regnr &= 7;
+       fpu_tag_word &= ~(3 << (regnr * 2));
+       fpu_tag_word |= (tag & 3) << (regnr * 2);
 }
 
-
 int FPU_Special(FPU_REG const *ptr)
 {
-  int exp = exponent(ptr);
-
-  if ( exp == EXP_BIAS+EXP_UNDER )
-    return TW_Denormal;
-  else if ( exp != EXP_BIAS+EXP_OVER )
-    return TW_NaN;
-  else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) )
-    return TW_Infinity;
-  return TW_NaN;
+       int exp = exponent(ptr);
+
+       if (exp == EXP_BIAS + EXP_UNDER)
+               return TW_Denormal;
+       else if (exp != EXP_BIAS + EXP_OVER)
+               return TW_NaN;
+       else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0))
+               return TW_Infinity;
+       return TW_NaN;
 }
 
-
 int isNaN(FPU_REG const *ptr)
 {
-  return ( (exponent(ptr) == EXP_BIAS+EXP_OVER)
-          && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) );
+       return ((exponent(ptr) == EXP_BIAS + EXP_OVER)
+               && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)));
 }
 
-
 int FPU_empty_i(int stnr)
 {
-  int regnr = (top+stnr) & 7;
+       int regnr = (top + stnr) & 7;
 
-  return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty;
+       return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty;
 }
 
-
-int FPU_stackoverflow(FPU_REG **st_new_ptr)
+int FPU_stackoverflow(FPU_REG ** st_new_ptr)
 {
-  *st_new_ptr = &st(-1);
+       *st_new_ptr = &st(-1);
 
-  return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty;
+       return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty;
 }
 
-
 void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
 {
-  reg_copy(r, &st(stnr));
-  FPU_settagi(stnr, tag);
+       reg_copy(r, &st(stnr));
+       FPU_settagi(stnr, tag);
 }
 
 void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
 {
-  reg_copy(r, &st(1));
-  FPU_settagi(1, tag);
+       reg_copy(r, &st(1));
+       FPU_settagi(1, tag);
 }
 
 void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
 {
-  int regnr = top;
-  regnr &= 7;
+       int regnr = top;
+       regnr &= 7;
 
-  reg_copy(r, &st(0));
+       reg_copy(r, &st(0));
 
-  fpu_tag_word &= ~(3 << (regnr*2));
-  fpu_tag_word |= (tag & 3) << (regnr*2);
+       fpu_tag_word &= ~(3 << (regnr * 2));
+       fpu_tag_word |= (tag & 3) << (regnr * 2);
 }
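
For reference, the tag word manipulated throughout this file keeps one two-bit field per register at bit offset (regnr & 7) * 2. The sketch below mirrors FPU_gettag()/FPU_settag() in standalone form; the concrete tag values (valid = 0, zero = 1, special = 2, empty = 3) are an assumption consistent with FPU_pop() marking a slot empty by OR-ing in 3, not something stated in this hunk.

#include <stdio.h>

static unsigned short tag_word;         /* stand-in for fpu_tag_word */

static int get_tag(int regnr)
{
        return (tag_word >> ((regnr & 7) * 2)) & 3;
}

static void set_tag(int regnr, int tag)
{
        regnr &= 7;
        tag_word &= ~(3 << (regnr * 2));
        tag_word |= (tag & 3) << (regnr * 2);
}

int main(void)
{
        tag_word = 0xffff;              /* all eight registers marked empty */
        set_tag(3, 0);                  /* mark register 3 valid */
        printf("tag(3)=%d tag(4)=%d\n", get_tag(3), get_tag(4));  /* 0 3 */
        return 0;
}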
index 403cbde1d4251a1ae2c5b4e5444fb81addd60dcb..ecd06680581c51cbb7631cd0240708e92d9ef919 100644 (file)
 #include "fpu_emu.h"
 #include "status_w.h"
 #include "control_w.h"
-#include "reg_constant.h"      
+#include "reg_constant.h"
 
 static void rem_kernel(unsigned long long st0, unsigned long long *y,
-                      unsigned long long st1,
-                      unsigned long long q, int n);
+                      unsigned long long st1, unsigned long long q, int n);
 
 #define BETTER_THAN_486
 
@@ -33,788 +32,706 @@ static void rem_kernel(unsigned long long st0, unsigned long long *y,
    precision of the result sometimes degrades to about 63.9 bits */
 static int trig_arg(FPU_REG *st0_ptr, int even)
 {
-  FPU_REG tmp;
-  u_char tmptag;
-  unsigned long long q;
-  int old_cw = control_word, saved_status = partial_status;
-  int tag, st0_tag = TAG_Valid;
-
-  if ( exponent(st0_ptr) >= 63 )
-    {
-      partial_status |= SW_C2;     /* Reduction incomplete. */
-      return -1;
-    }
-
-  control_word &= ~CW_RC;
-  control_word |= RC_CHOP;
-
-  setpositive(st0_ptr);
-  tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
-                 SIGN_POS);
-
-  FPU_round_to_int(&tmp, tag);  /* Fortunately, this can't overflow
-                                  to 2^64 */
-  q = significand(&tmp);
-  if ( q )
-    {
-      rem_kernel(significand(st0_ptr),
-                &significand(&tmp),
-                significand(&CONST_PI2),
-                q, exponent(st0_ptr) - exponent(&CONST_PI2));
-      setexponent16(&tmp, exponent(&CONST_PI2));
-      st0_tag = FPU_normalize(&tmp);
-      FPU_copy_to_reg0(&tmp, st0_tag);
-    }
-
-  if ( (even && !(q & 1)) || (!even && (q & 1)) )
-    {
-      st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION);
+       FPU_REG tmp;
+       u_char tmptag;
+       unsigned long long q;
+       int old_cw = control_word, saved_status = partial_status;
+       int tag, st0_tag = TAG_Valid;
+
+       if (exponent(st0_ptr) >= 63) {
+               partial_status |= SW_C2;        /* Reduction incomplete. */
+               return -1;
+       }
 
-#ifdef BETTER_THAN_486
-      /* So far, the results are exact but based upon a 64 bit
-        precision approximation to pi/2. The technique used
-        now is equivalent to using an approximation to pi/2 which
-        is accurate to about 128 bits. */
-      if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) )
-       {
-         /* This code gives the effect of having pi/2 to better than
-            128 bits precision. */
-
-         significand(&tmp) = q + 1;
-         setexponent16(&tmp, 63);
-         FPU_normalize(&tmp);
-         tmptag =
-           FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS,
-                     exponent(&CONST_PI2extra) + exponent(&tmp));
-         setsign(&tmp, getsign(&CONST_PI2extra));
-         st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
-         if ( signnegative(st0_ptr) )
-           {
-             /* CONST_PI2extra is negative, so the result of the addition
-                can be negative. This means that the argument is actually
-                in a different quadrant. The correction is always < pi/2,
-                so it can't overflow into yet another quadrant. */
-             setpositive(st0_ptr);
-             q++;
-           }
+       control_word &= ~CW_RC;
+       control_word |= RC_CHOP;
+
+       setpositive(st0_ptr);
+       tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
+                       SIGN_POS);
+
+       FPU_round_to_int(&tmp, tag);    /* Fortunately, this can't overflow
+                                          to 2^64 */
+       q = significand(&tmp);
+       if (q) {
+               rem_kernel(significand(st0_ptr),
+                          &significand(&tmp),
+                          significand(&CONST_PI2),
+                          q, exponent(st0_ptr) - exponent(&CONST_PI2));
+               setexponent16(&tmp, exponent(&CONST_PI2));
+               st0_tag = FPU_normalize(&tmp);
+               FPU_copy_to_reg0(&tmp, st0_tag);
        }
+
+       if ((even && !(q & 1)) || (!even && (q & 1))) {
+               st0_tag =
+                   FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2,
+                           FULL_PRECISION);
+
+#ifdef BETTER_THAN_486
+               /* So far, the results are exact but based upon a 64 bit
+                  precision approximation to pi/2. The technique used
+                  now is equivalent to using an approximation to pi/2 which
+                  is accurate to about 128 bits. */
+               if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)
+                   || (q > 1)) {
+                       /* This code gives the effect of having pi/2 to better than
+                          128 bits precision. */
+
+                       significand(&tmp) = q + 1;
+                       setexponent16(&tmp, 63);
+                       FPU_normalize(&tmp);
+                       tmptag =
+                           FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
+                                     FULL_PRECISION, SIGN_POS,
+                                     exponent(&CONST_PI2extra) +
+                                     exponent(&tmp));
+                       setsign(&tmp, getsign(&CONST_PI2extra));
+                       st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
+                       if (signnegative(st0_ptr)) {
+                               /* CONST_PI2extra is negative, so the result of the addition
+                                  can be negative. This means that the argument is actually
+                                  in a different quadrant. The correction is always < pi/2,
+                                  so it can't overflow into yet another quadrant. */
+                               setpositive(st0_ptr);
+                               q++;
+                       }
+               }
 #endif /* BETTER_THAN_486 */
-    }
+       }
 #ifdef BETTER_THAN_486
-  else
-    {
-      /* So far, the results are exact but based upon a 64 bit
-        precision approximation to pi/2. The technique used
-        now is equivalent to using an approximation to pi/2 which
-        is accurate to about 128 bits. */
-      if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
-          || (q > 1) )
-       {
-         /* This code gives the effect of having p/2 to better than
-            128 bits precision. */
-
-         significand(&tmp) = q;
-         setexponent16(&tmp, 63);
-         FPU_normalize(&tmp);         /* This must return TAG_Valid */
-         tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION,
-                            SIGN_POS,
-                            exponent(&CONST_PI2extra) + exponent(&tmp));
-         setsign(&tmp, getsign(&CONST_PI2extra));
-         st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp,
-                           FULL_PRECISION);
-         if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) &&
-             ((st0_ptr->sigh > CONST_PI2.sigh)
-              || ((st0_ptr->sigh == CONST_PI2.sigh)
-                  && (st0_ptr->sigl > CONST_PI2.sigl))) )
-           {
-             /* CONST_PI2extra is negative, so the result of the
-                subtraction can be larger than pi/2. This means
-                that the argument is actually in a different quadrant.
-                The correction is always < pi/2, so it can't overflow
-                into yet another quadrant. */
-             st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2,
-                               FULL_PRECISION);
-             q++;
-           }
+       else {
+               /* So far, the results are exact but based upon a 64 bit
+                  precision approximation to pi/2. The technique used
+                  now is equivalent to using an approximation to pi/2 which
+                  is accurate to about 128 bits. */
+               if (((q > 0)
+                    && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
+                   || (q > 1)) {
+                       /* This code gives the effect of having p/2 to better than
+                          128 bits precision. */
+
+                       significand(&tmp) = q;
+                       setexponent16(&tmp, 63);
+                       FPU_normalize(&tmp);    /* This must return TAG_Valid */
+                       tmptag =
+                           FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
+                                     FULL_PRECISION, SIGN_POS,
+                                     exponent(&CONST_PI2extra) +
+                                     exponent(&tmp));
+                       setsign(&tmp, getsign(&CONST_PI2extra));
+                       st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp,
+                                         FULL_PRECISION);
+                       if ((exponent(st0_ptr) == exponent(&CONST_PI2)) &&
+                           ((st0_ptr->sigh > CONST_PI2.sigh)
+                            || ((st0_ptr->sigh == CONST_PI2.sigh)
+                                && (st0_ptr->sigl > CONST_PI2.sigl)))) {
+                               /* CONST_PI2extra is negative, so the result of the
+                                  subtraction can be larger than pi/2. This means
+                                  that the argument is actually in a different quadrant.
+                                  The correction is always < pi/2, so it can't overflow
+                                  into yet another quadrant. */
+                               st0_tag =
+                                   FPU_sub(REV | LOADED | TAG_Valid,
+                                           (int)&CONST_PI2, FULL_PRECISION);
+                               q++;
+                       }
+               }
        }
-    }
 #endif /* BETTER_THAN_486 */
 
-  FPU_settag0(st0_tag);
-  control_word = old_cw;
-  partial_status = saved_status & ~SW_C2;     /* Reduction complete. */
+       FPU_settag0(st0_tag);
+       control_word = old_cw;
+       partial_status = saved_status & ~SW_C2; /* Reduction complete. */
 
-  return (q & 3) | even;
+       return (q & 3) | even;
 }
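
trig_arg() above reduces |st(0)| modulo pi/2 and returns the quadrant in its low bits, using the CONST_PI2/CONST_PI2extra pair to get the effect of a roughly 128-bit pi/2. A plain double-precision sketch of the same shape of computation, accuracy aside:

#include <math.h>
#include <stdio.h>

/* Illustrative only: reduce a positive x modulo pi/2 and report the
 * quadrant, the way trig_arg() does in extended precision. */
static int reduce_pi2(double *x)
{
        double q = floor(*x / (M_PI / 2.0));    /* chopped quotient */

        *x -= q * (M_PI / 2.0);                 /* remainder in [0, pi/2) */
        return (int)q & 3;                      /* quadrant, like (q & 3) */
}

int main(void)
{
        double x = 5.0;
        int quadrant = reduce_pi2(&x);

        printf("quadrant=%d remainder=%f\n", quadrant, x);
        return 0;
}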
 
-
 /* Convert a long to register */
 static void convert_l2reg(long const *arg, int deststnr)
 {
-  int tag;
-  long num = *arg;
-  u_char sign;
-  FPU_REG *dest = &st(deststnr);
-
-  if (num == 0)
-    {
-      FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-      return;
-    }
-
-  if (num > 0)
-    { sign = SIGN_POS; }
-  else
-    { num = -num; sign = SIGN_NEG; }
-
-  dest->sigh = num;
-  dest->sigl = 0;
-  setexponent16(dest, 31);
-  tag = FPU_normalize(dest);
-  FPU_settagi(deststnr, tag);
-  setsign(dest, sign);
-  return;
-}
+       int tag;
+       long num = *arg;
+       u_char sign;
+       FPU_REG *dest = &st(deststnr);
 
+       if (num == 0) {
+               FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+               return;
+       }
+
+       if (num > 0) {
+               sign = SIGN_POS;
+       } else {
+               num = -num;
+               sign = SIGN_NEG;
+       }
+
+       dest->sigh = num;
+       dest->sigl = 0;
+       setexponent16(dest, 31);
+       tag = FPU_normalize(dest);
+       FPU_settagi(deststnr, tag);
+       setsign(dest, sign);
+       return;
+}
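
convert_l2reg() above loads |num| into the high word of the significand, sets the internal exponent to 31 via setexponent16(), and lets FPU_normalize() move the leading one up to bit 63. A standalone sketch of that normalization, assuming the internal value is significand * 2^(exponent - 63):

#include <stdio.h>

struct ext_reg {
        unsigned long long sig;         /* bit 63 is the integer bit */
        int exp;                        /* internal (unbiased) exponent */
        int neg;
};

/* Illustrative only; zero is handled separately in the real code. */
static struct ext_reg long_to_reg(long num)
{
        struct ext_reg r = { 0, 31, 0 };

        if (num < 0) {
                r.neg = 1;
                num = -num;
        }
        r.sig = (unsigned long long)num << 32;  /* like dest->sigh = num */
        while (r.sig && !(r.sig & 0x8000000000000000ULL)) {
                r.sig <<= 1;            /* normalize, as FPU_normalize() does */
                r.exp--;
        }
        return r;
}

int main(void)
{
        struct ext_reg r = long_to_reg(6);      /* 1.5 * 2^2 */

        printf("sig=%#llx exp=%d neg=%d\n", r.sig, r.exp, r.neg);
        return 0;
}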
 
 static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  if ( st0_tag == TAG_Empty )
-    FPU_stack_underflow();  /* Puts a QNaN in st(0) */
-  else if ( st0_tag == TW_NaN )
-    real_1op_NaN(st0_ptr);       /* return with a NaN in st(0) */
+       if (st0_tag == TAG_Empty)
+               FPU_stack_underflow();  /* Puts a QNaN in st(0) */
+       else if (st0_tag == TW_NaN)
+               real_1op_NaN(st0_ptr);  /* return with a NaN in st(0) */
 #ifdef PARANOID
-  else
-    EXCEPTION(EX_INTERNAL|0x0112);
+       else
+               EXCEPTION(EX_INTERNAL | 0x0112);
 #endif /* PARANOID */
 }
 
-
 static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  int isNaN;
-
-  switch ( st0_tag )
-    {
-    case TW_NaN:
-      isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000);
-      if ( isNaN && !(st0_ptr->sigh & 0x40000000) )   /* Signaling ? */
-       {
-         EXCEPTION(EX_Invalid);
-         if ( control_word & CW_Invalid )
-           {
-             /* The masked response */
-             /* Convert to a QNaN */
-             st0_ptr->sigh |= 0x40000000;
-             push();
-             FPU_copy_to_reg0(st0_ptr, TAG_Special);
-           }
-       }
-      else if ( isNaN )
-       {
-         /* A QNaN */
-         push();
-         FPU_copy_to_reg0(st0_ptr, TAG_Special);
-       }
-      else
-       {
-         /* pseudoNaN or other unsupported */
-         EXCEPTION(EX_Invalid);
-         if ( control_word & CW_Invalid )
-           {
-             /* The masked response */
-             FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
-             push();
-             FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
-           }
-       }
-      break;              /* return with a NaN in st(0) */
+       int isNaN;
+
+       switch (st0_tag) {
+       case TW_NaN:
+               isNaN = (exponent(st0_ptr) == EXP_OVER)
+                   && (st0_ptr->sigh & 0x80000000);
+               if (isNaN && !(st0_ptr->sigh & 0x40000000)) {   /* Signaling ? */
+                       EXCEPTION(EX_Invalid);
+                       if (control_word & CW_Invalid) {
+                               /* The masked response */
+                               /* Convert to a QNaN */
+                               st0_ptr->sigh |= 0x40000000;
+                               push();
+                               FPU_copy_to_reg0(st0_ptr, TAG_Special);
+                       }
+               } else if (isNaN) {
+                       /* A QNaN */
+                       push();
+                       FPU_copy_to_reg0(st0_ptr, TAG_Special);
+               } else {
+                       /* pseudoNaN or other unsupported */
+                       EXCEPTION(EX_Invalid);
+                       if (control_word & CW_Invalid) {
+                               /* The masked response */
+                               FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+                               push();
+                               FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+                       }
+               }
+               break;          /* return with a NaN in st(0) */
 #ifdef PARANOID
-    default:
-      EXCEPTION(EX_INTERNAL|0x0112);
+       default:
+               EXCEPTION(EX_INTERNAL | 0x0112);
 #endif /* PARANOID */
-    }
+       }
 }
 
-
 /*---------------------------------------------------------------------------*/
 
 static void f2xm1(FPU_REG *st0_ptr, u_char tag)
 {
-  FPU_REG a;
+       FPU_REG a;
 
-  clear_C1();
+       clear_C1();
 
-  if ( tag == TAG_Valid )
-    {
-      /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
-      if ( exponent(st0_ptr) < 0 )
-       {
-       denormal_arg:
+       if (tag == TAG_Valid) {
+               /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
+               if (exponent(st0_ptr) < 0) {
+                     denormal_arg:
 
-         FPU_to_exp16(st0_ptr, &a);
+                       FPU_to_exp16(st0_ptr, &a);
 
-         /* poly_2xm1(x) requires 0 < st(0) < 1. */
-         poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
+                       /* poly_2xm1(x) requires 0 < st(0) < 1. */
+                       poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
+               }
+               set_precision_flag_up();        /* 80486 appears to always do this */
+               return;
        }
-      set_precision_flag_up();   /* 80486 appears to always do this */
-      return;
-    }
 
-  if ( tag == TAG_Zero )
-    return;
+       if (tag == TAG_Zero)
+               return;
 
-  if ( tag == TAG_Special )
-    tag = FPU_Special(st0_ptr);
+       if (tag == TAG_Special)
+               tag = FPU_Special(st0_ptr);
 
-  switch ( tag )
-    {
-    case TW_Denormal:
-      if ( denormal_operand() < 0 )
-       return;
-      goto denormal_arg;
-    case TW_Infinity:
-      if ( signnegative(st0_ptr) )
-       {
-         /* -infinity gives -1 (p16-10) */
-         FPU_copy_to_reg0(&CONST_1, TAG_Valid);
-         setnegative(st0_ptr);
+       switch (tag) {
+       case TW_Denormal:
+               if (denormal_operand() < 0)
+                       return;
+               goto denormal_arg;
+       case TW_Infinity:
+               if (signnegative(st0_ptr)) {
+                       /* -infinity gives -1 (p16-10) */
+                       FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+                       setnegative(st0_ptr);
+               }
+               return;
+       default:
+               single_arg_error(st0_ptr, tag);
        }
-      return;
-    default:
-      single_arg_error(st0_ptr, tag);
-    }
 }
 
-
 static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st_new_ptr;
-  int q;
-  u_char arg_sign = getsign(st0_ptr);
-
-  /* Stack underflow has higher priority */
-  if ( st0_tag == TAG_Empty )
-    {
-      FPU_stack_underflow();  /* Puts a QNaN in st(0) */
-      if ( control_word & CW_Invalid )
-       {
-         st_new_ptr = &st(-1);
-         push();
-         FPU_stack_underflow();  /* Puts a QNaN in the new st(0) */
+       FPU_REG *st_new_ptr;
+       int q;
+       u_char arg_sign = getsign(st0_ptr);
+
+       /* Stack underflow has higher priority */
+       if (st0_tag == TAG_Empty) {
+               FPU_stack_underflow();  /* Puts a QNaN in st(0) */
+               if (control_word & CW_Invalid) {
+                       st_new_ptr = &st(-1);
+                       push();
+                       FPU_stack_underflow();  /* Puts a QNaN in the new st(0) */
+               }
+               return;
        }
-      return;
-    }
-
-  if ( STACK_OVERFLOW )
-    { FPU_stack_overflow(); return; }
-
-  if ( st0_tag == TAG_Valid )
-    {
-      if ( exponent(st0_ptr) > -40 )
-       {
-         if ( (q = trig_arg(st0_ptr, 0)) == -1 )
-           {
-             /* Operand is out of range */
-             return;
-           }
-
-         poly_tan(st0_ptr);
-         setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
-         set_precision_flag_up();  /* We do not really know if up or down */
+
+       if (STACK_OVERFLOW) {
+               FPU_stack_overflow();
+               return;
        }
-      else
-       {
-         /* For a small arg, the result == the argument */
-         /* Underflow may happen */
 
-       denormal_arg:
+       if (st0_tag == TAG_Valid) {
+               if (exponent(st0_ptr) > -40) {
+                       if ((q = trig_arg(st0_ptr, 0)) == -1) {
+                               /* Operand is out of range */
+                               return;
+                       }
+
+                       poly_tan(st0_ptr);
+                       setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
+                       set_precision_flag_up();        /* We do not really know if up or down */
+               } else {
+                       /* For a small arg, the result == the argument */
+                       /* Underflow may happen */
+
+                     denormal_arg:
+
+                       FPU_to_exp16(st0_ptr, st0_ptr);
 
-         FPU_to_exp16(st0_ptr, st0_ptr);
-      
-         st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
-         FPU_settag0(st0_tag);
+                       st0_tag =
+                           FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
+                       FPU_settag0(st0_tag);
+               }
+               push();
+               FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+               return;
        }
-      push();
-      FPU_copy_to_reg0(&CONST_1, TAG_Valid);
-      return;
-    }
-
-  if ( st0_tag == TAG_Zero )
-    {
-      push();
-      FPU_copy_to_reg0(&CONST_1, TAG_Valid);
-      setcc(0);
-      return;
-    }
-
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-
-  if ( st0_tag == TW_Denormal )
-    {
-      if ( denormal_operand() < 0 )
-       return;
 
-      goto denormal_arg;
-    }
-
-  if ( st0_tag == TW_Infinity )
-    {
-      /* The 80486 treats infinity as an invalid operand */
-      if ( arith_invalid(0) >= 0 )
-       {
-         st_new_ptr = &st(-1);
-         push();
-         arith_invalid(0);
+       if (st0_tag == TAG_Zero) {
+               push();
+               FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+               setcc(0);
+               return;
+       }
+
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+
+       if (st0_tag == TW_Denormal) {
+               if (denormal_operand() < 0)
+                       return;
+
+               goto denormal_arg;
        }
-      return;
-    }
 
-  single_arg_2_error(st0_ptr, st0_tag);
-}
+       if (st0_tag == TW_Infinity) {
+               /* The 80486 treats infinity as an invalid operand */
+               if (arith_invalid(0) >= 0) {
+                       st_new_ptr = &st(-1);
+                       push();
+                       arith_invalid(0);
+               }
+               return;
+       }
 
+       single_arg_2_error(st0_ptr, st0_tag);
+}
 
 static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st_new_ptr;
-  u_char sign;
-  register FPU_REG *st1_ptr = st0_ptr;  /* anticipate */
-
-  if ( STACK_OVERFLOW )
-    {  FPU_stack_overflow(); return; }
-
-  clear_C1();
-
-  if ( st0_tag == TAG_Valid )
-    {
-      long e;
-
-      push();
-      sign = getsign(st1_ptr);
-      reg_copy(st1_ptr, st_new_ptr);
-      setexponent16(st_new_ptr, exponent(st_new_ptr));
-
-    denormal_arg:
-
-      e = exponent16(st_new_ptr);
-      convert_l2reg(&e, 1);
-      setexponentpos(st_new_ptr, 0);
-      setsign(st_new_ptr, sign);
-      FPU_settag0(TAG_Valid);       /* Needed if arg was a denormal */
-      return;
-    }
-  else if ( st0_tag == TAG_Zero )
-    {
-      sign = getsign(st0_ptr);
-
-      if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 )
-       return;
+       FPU_REG *st_new_ptr;
+       u_char sign;
+       register FPU_REG *st1_ptr = st0_ptr;    /* anticipate */
 
-      push();
-      FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
-      setsign(st_new_ptr, sign);
-      return;
-    }
+       if (STACK_OVERFLOW) {
+               FPU_stack_overflow();
+               return;
+       }
 
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
+       clear_C1();
 
-  if ( st0_tag == TW_Denormal )
-    {
-      if (denormal_operand() < 0 )
-       return;
+       if (st0_tag == TAG_Valid) {
+               long e;
 
-      push();
-      sign = getsign(st1_ptr);
-      FPU_to_exp16(st1_ptr, st_new_ptr);
-      goto denormal_arg;
-    }
-  else if ( st0_tag == TW_Infinity )
-    {
-      sign = getsign(st0_ptr);
-      setpositive(st0_ptr);
-      push();
-      FPU_copy_to_reg0(&CONST_INF, TAG_Special);
-      setsign(st_new_ptr, sign);
-      return;
-    }
-  else if ( st0_tag == TW_NaN )
-    {
-      if ( real_1op_NaN(st0_ptr) < 0 )
-       return;
+               push();
+               sign = getsign(st1_ptr);
+               reg_copy(st1_ptr, st_new_ptr);
+               setexponent16(st_new_ptr, exponent(st_new_ptr));
+
+             denormal_arg:
+
+               e = exponent16(st_new_ptr);
+               convert_l2reg(&e, 1);
+               setexponentpos(st_new_ptr, 0);
+               setsign(st_new_ptr, sign);
+               FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
+               return;
+       } else if (st0_tag == TAG_Zero) {
+               sign = getsign(st0_ptr);
+
+               if (FPU_divide_by_zero(0, SIGN_NEG) < 0)
+                       return;
 
-      push();
-      FPU_copy_to_reg0(st0_ptr, TAG_Special);
-      return;
-    }
-  else if ( st0_tag == TAG_Empty )
-    {
-      /* Is this the correct behaviour? */
-      if ( control_word & EX_Invalid )
-       {
-         FPU_stack_underflow();
-         push();
-         FPU_stack_underflow();
+               push();
+               FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+               setsign(st_new_ptr, sign);
+               return;
+       }
+
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+
+       if (st0_tag == TW_Denormal) {
+               if (denormal_operand() < 0)
+                       return;
+
+               push();
+               sign = getsign(st1_ptr);
+               FPU_to_exp16(st1_ptr, st_new_ptr);
+               goto denormal_arg;
+       } else if (st0_tag == TW_Infinity) {
+               sign = getsign(st0_ptr);
+               setpositive(st0_ptr);
+               push();
+               FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+               setsign(st_new_ptr, sign);
+               return;
+       } else if (st0_tag == TW_NaN) {
+               if (real_1op_NaN(st0_ptr) < 0)
+                       return;
+
+               push();
+               FPU_copy_to_reg0(st0_ptr, TAG_Special);
+               return;
+       } else if (st0_tag == TAG_Empty) {
+               /* Is this the correct behaviour? */
+               if (control_word & EX_Invalid) {
+                       FPU_stack_underflow();
+                       push();
+                       FPU_stack_underflow();
+               } else
+                       EXCEPTION(EX_StackUnder);
        }
-      else
-       EXCEPTION(EX_StackUnder);
-    }
 #ifdef PARANOID
-  else
-    EXCEPTION(EX_INTERNAL | 0x119);
+       else
+               EXCEPTION(EX_INTERNAL | 0x119);
 #endif /* PARANOID */
 }
 
-
 static void fdecstp(void)
 {
-  clear_C1();
-  top--;
+       clear_C1();
+       top--;
 }
 
 static void fincstp(void)
 {
-  clear_C1();
-  top++;
+       clear_C1();
+       top++;
 }
 
-
 static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  int expon;
-
-  clear_C1();
-
-  if ( st0_tag == TAG_Valid )
-    {
-      u_char tag;
-      
-      if (signnegative(st0_ptr))
-       {
-         arith_invalid(0);  /* sqrt(negative) is invalid */
-         return;
-       }
+       int expon;
+
+       clear_C1();
 
-      /* make st(0) in  [1.0 .. 4.0) */
-      expon = exponent(st0_ptr);
-
-    denormal_arg:
-
-      setexponent16(st0_ptr, (expon & 1));
-
-      /* Do the computation, the sign of the result will be positive. */
-      tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
-      addexponent(st0_ptr, expon >> 1);
-      FPU_settag0(tag);
-      return;
-    }
-
-  if ( st0_tag == TAG_Zero )
-    return;
-
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-
-  if ( st0_tag == TW_Infinity )
-    {
-      if ( signnegative(st0_ptr) )
-       arith_invalid(0);  /* sqrt(-Infinity) is invalid */
-      return;
-    }
-  else if ( st0_tag == TW_Denormal )
-    {
-      if (signnegative(st0_ptr))
-       {
-         arith_invalid(0);  /* sqrt(negative) is invalid */
-         return;
+       if (st0_tag == TAG_Valid) {
+               u_char tag;
+
+               if (signnegative(st0_ptr)) {
+                       arith_invalid(0);       /* sqrt(negative) is invalid */
+                       return;
+               }
+
+               /* make st(0) in  [1.0 .. 4.0) */
+               expon = exponent(st0_ptr);
+
+             denormal_arg:
+
+               setexponent16(st0_ptr, (expon & 1));
+
+               /* Do the computation, the sign of the result will be positive. */
+               tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
+               addexponent(st0_ptr, expon >> 1);
+               FPU_settag0(tag);
+               return;
        }
 
-      if ( denormal_operand() < 0 )
-       return;
+       if (st0_tag == TAG_Zero)
+               return;
 
-      FPU_to_exp16(st0_ptr, st0_ptr);
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
 
-      expon = exponent16(st0_ptr);
+       if (st0_tag == TW_Infinity) {
+               if (signnegative(st0_ptr))
+                       arith_invalid(0);       /* sqrt(-Infinity) is invalid */
+               return;
+       } else if (st0_tag == TW_Denormal) {
+               if (signnegative(st0_ptr)) {
+                       arith_invalid(0);       /* sqrt(negative) is invalid */
+                       return;
+               }
 
-      goto denormal_arg;
-    }
+               if (denormal_operand() < 0)
+                       return;
 
-  single_arg_error(st0_ptr, st0_tag);
+               FPU_to_exp16(st0_ptr, st0_ptr);
 
-}
+               expon = exponent16(st0_ptr);
+
+               goto denormal_arg;
+       }
 
+       single_arg_error(st0_ptr, st0_tag);
+
+}
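
fsqrt_() above keeps only the exponent parity in st(0) before calling wm_sqrt(), then adds expon >> 1 back, relying on sqrt(m * 2^e) == sqrt(m * 2^(e & 1)) * 2^(e >> 1). A quick double-precision check of that identity (illustrative only, not the emulator's wm_sqrt() path):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double m = 1.7;                 /* significand in [1.0, 2.0) */
        int e = 13;                     /* exponent */
        double x = ldexp(m, e);         /* m * 2^e */
        double a = sqrt(x);
        double b = ldexp(sqrt(ldexp(m, e & 1)), e >> 1);

        printf("%.17g %.17g\n", a, b);  /* the two results agree */
        return 0;
}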
 
 static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  int flags, tag;
+       int flags, tag;
 
-  if ( st0_tag == TAG_Valid )
-    {
-      u_char sign;
+       if (st0_tag == TAG_Valid) {
+               u_char sign;
 
-    denormal_arg:
+             denormal_arg:
 
-      sign = getsign(st0_ptr);
+               sign = getsign(st0_ptr);
 
-      if (exponent(st0_ptr) > 63)
-       return;
+               if (exponent(st0_ptr) > 63)
+                       return;
+
+               if (st0_tag == TW_Denormal) {
+                       if (denormal_operand() < 0)
+                               return;
+               }
+
+               /* Fortunately, this can't overflow to 2^64 */
+               if ((flags = FPU_round_to_int(st0_ptr, st0_tag)))
+                       set_precision_flag(flags);
 
-      if ( st0_tag == TW_Denormal )
-       {
-         if (denormal_operand() < 0 )
-           return;
+               setexponent16(st0_ptr, 63);
+               tag = FPU_normalize(st0_ptr);
+               setsign(st0_ptr, sign);
+               FPU_settag0(tag);
+               return;
        }
 
-      /* Fortunately, this can't overflow to 2^64 */
-      if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) )
-       set_precision_flag(flags);
-
-      setexponent16(st0_ptr, 63);
-      tag = FPU_normalize(st0_ptr);
-      setsign(st0_ptr, sign);
-      FPU_settag0(tag);
-      return;
-    }
-
-  if ( st0_tag == TAG_Zero )
-    return;
-
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-
-  if ( st0_tag == TW_Denormal )
-    goto denormal_arg;
-  else if ( st0_tag == TW_Infinity )
-    return;
-  else
-    single_arg_error(st0_ptr, st0_tag);
-}
+       if (st0_tag == TAG_Zero)
+               return;
 
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+
+       if (st0_tag == TW_Denormal)
+               goto denormal_arg;
+       else if (st0_tag == TW_Infinity)
+               return;
+       else
+               single_arg_error(st0_ptr, st0_tag);
+}
 
 static int fsin(FPU_REG *st0_ptr, u_char tag)
 {
-  u_char arg_sign = getsign(st0_ptr);
-
-  if ( tag == TAG_Valid )
-    {
-      int q;
-
-      if ( exponent(st0_ptr) > -40 )
-       {
-         if ( (q = trig_arg(st0_ptr, 0)) == -1 )
-           {
-             /* Operand is out of range */
-             return 1;
-           }
-
-         poly_sine(st0_ptr);
-         
-         if (q & 2)
-           changesign(st0_ptr);
-
-         setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
-
-         /* We do not really know if up or down */
-         set_precision_flag_up();
-         return 0;
+       u_char arg_sign = getsign(st0_ptr);
+
+       if (tag == TAG_Valid) {
+               int q;
+
+               if (exponent(st0_ptr) > -40) {
+                       if ((q = trig_arg(st0_ptr, 0)) == -1) {
+                               /* Operand is out of range */
+                               return 1;
+                       }
+
+                       poly_sine(st0_ptr);
+
+                       if (q & 2)
+                               changesign(st0_ptr);
+
+                       setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
+
+                       /* We do not really know if up or down */
+                       set_precision_flag_up();
+                       return 0;
+               } else {
+                       /* For a small arg, the result == the argument */
+                       set_precision_flag_up();        /* Must be up. */
+                       return 0;
+               }
        }
-      else
-       {
-         /* For a small arg, the result == the argument */
-         set_precision_flag_up();  /* Must be up. */
-         return 0;
+
+       if (tag == TAG_Zero) {
+               setcc(0);
+               return 0;
        }
-    }
-
-  if ( tag == TAG_Zero )
-    {
-      setcc(0);
-      return 0;
-    }
-
-  if ( tag == TAG_Special )
-    tag = FPU_Special(st0_ptr);
-
-  if ( tag == TW_Denormal )
-    {
-      if ( denormal_operand() < 0 )
-       return 1;
-
-      /* For a small arg, the result == the argument */
-      /* Underflow may happen */
-      FPU_to_exp16(st0_ptr, st0_ptr);
-      
-      tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
-
-      FPU_settag0(tag);
-
-      return 0;
-    }
-  else if ( tag == TW_Infinity )
-    {
-      /* The 80486 treats infinity as an invalid operand */
-      arith_invalid(0);
-      return 1;
-    }
-  else
-    {
-      single_arg_error(st0_ptr, tag);
-      return 1;
-    }
-}
 
+       if (tag == TAG_Special)
+               tag = FPU_Special(st0_ptr);
+
+       if (tag == TW_Denormal) {
+               if (denormal_operand() < 0)
+                       return 1;
+
+               /* For a small arg, the result == the argument */
+               /* Underflow may happen */
+               FPU_to_exp16(st0_ptr, st0_ptr);
+
+               tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
+
+               FPU_settag0(tag);
+
+               return 0;
+       } else if (tag == TW_Infinity) {
+               /* The 80486 treats infinity as an invalid operand */
+               arith_invalid(0);
+               return 1;
+       } else {
+               single_arg_error(st0_ptr, tag);
+               return 1;
+       }
+}
 
 static int f_cos(FPU_REG *st0_ptr, u_char tag)
 {
-  u_char st0_sign;
-
-  st0_sign = getsign(st0_ptr);
-
-  if ( tag == TAG_Valid )
-    {
-      int q;
-
-      if ( exponent(st0_ptr) > -40 )
-       {
-         if ( (exponent(st0_ptr) < 0)
-             || ((exponent(st0_ptr) == 0)
-                 && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) )
-           {
-             poly_cos(st0_ptr);
-
-             /* We do not really know if up or down */
-             set_precision_flag_down();
-         
-             return 0;
-           }
-         else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 )
-           {
-             poly_sine(st0_ptr);
-
-             if ((q+1) & 2)
-               changesign(st0_ptr);
-
-             /* We do not really know if up or down */
-             set_precision_flag_down();
-         
-             return 0;
-           }
-         else
-           {
-             /* Operand is out of range */
-             return 1;
-           }
-       }
-      else
-       {
-       denormal_arg:
+       u_char st0_sign;
+
+       st0_sign = getsign(st0_ptr);
 
-         setcc(0);
-         FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+       if (tag == TAG_Valid) {
+               int q;
+
+               if (exponent(st0_ptr) > -40) {
+                       if ((exponent(st0_ptr) < 0)
+                           || ((exponent(st0_ptr) == 0)
+                               && (significand(st0_ptr) <=
+                                   0xc90fdaa22168c234LL))) {
+                               poly_cos(st0_ptr);
+
+                               /* We do not really know if up or down */
+                               set_precision_flag_down();
+
+                               return 0;
+                       } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) {
+                               poly_sine(st0_ptr);
+
+                               if ((q + 1) & 2)
+                                       changesign(st0_ptr);
+
+                               /* We do not really know if up or down */
+                               set_precision_flag_down();
+
+                               return 0;
+                       } else {
+                               /* Operand is out of range */
+                               return 1;
+                       }
+               } else {
+                     denormal_arg:
+
+                       setcc(0);
+                       FPU_copy_to_reg0(&CONST_1, TAG_Valid);
 #ifdef PECULIAR_486
-         set_precision_flag_down();  /* 80486 appears to do this. */
+                       set_precision_flag_down();      /* 80486 appears to do this. */
 #else
-         set_precision_flag_up();  /* Must be up. */
+                       set_precision_flag_up();        /* Must be up. */
 #endif /* PECULIAR_486 */
-         return 0;
+                       return 0;
+               }
+       } else if (tag == TAG_Zero) {
+               FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+               setcc(0);
+               return 0;
        }
-    }
-  else if ( tag == TAG_Zero )
-    {
-      FPU_copy_to_reg0(&CONST_1, TAG_Valid);
-      setcc(0);
-      return 0;
-    }
-
-  if ( tag == TAG_Special )
-    tag = FPU_Special(st0_ptr);
-
-  if ( tag == TW_Denormal )
-    {
-      if ( denormal_operand() < 0 )
-       return 1;
-
-      goto denormal_arg;
-    }
-  else if ( tag == TW_Infinity )
-    {
-      /* The 80486 treats infinity as an invalid operand */
-      arith_invalid(0);
-      return 1;
-    }
-  else
-    {
-      single_arg_error(st0_ptr, tag);  /* requires st0_ptr == &st(0) */
-      return 1;
-    }
-}
 
+       if (tag == TAG_Special)
+               tag = FPU_Special(st0_ptr);
+
+       if (tag == TW_Denormal) {
+               if (denormal_operand() < 0)
+                       return 1;
+
+               goto denormal_arg;
+       } else if (tag == TW_Infinity) {
+               /* The 80486 treats infinity as an invalid operand */
+               arith_invalid(0);
+               return 1;
+       } else {
+               single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
+               return 1;
+       }
+}
 
 static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  f_cos(st0_ptr, st0_tag);
+       f_cos(st0_ptr, st0_tag);
 }
 
-
 static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st_new_ptr;
-  FPU_REG arg;
-  u_char tag;
-
-  /* Stack underflow has higher priority */
-  if ( st0_tag == TAG_Empty )
-    {
-      FPU_stack_underflow();  /* Puts a QNaN in st(0) */
-      if ( control_word & CW_Invalid )
-       {
-         st_new_ptr = &st(-1);
-         push();
-         FPU_stack_underflow();  /* Puts a QNaN in the new st(0) */
+       FPU_REG *st_new_ptr;
+       FPU_REG arg;
+       u_char tag;
+
+       /* Stack underflow has higher priority */
+       if (st0_tag == TAG_Empty) {
+               FPU_stack_underflow();  /* Puts a QNaN in st(0) */
+               if (control_word & CW_Invalid) {
+                       st_new_ptr = &st(-1);
+                       push();
+                       FPU_stack_underflow();  /* Puts a QNaN in the new st(0) */
+               }
+               return;
        }
-      return;
-    }
-
-  if ( STACK_OVERFLOW )
-    { FPU_stack_overflow(); return; }
-
-  if ( st0_tag == TAG_Special )
-    tag = FPU_Special(st0_ptr);
-  else
-    tag = st0_tag;
-
-  if ( tag == TW_NaN )
-    {
-      single_arg_2_error(st0_ptr, TW_NaN);
-      return;
-    }
-  else if ( tag == TW_Infinity )
-    {
-      /* The 80486 treats infinity as an invalid operand */
-      if ( arith_invalid(0) >= 0 )
-       {
-         /* Masked response */
-         push();
-         arith_invalid(0);
+
+       if (STACK_OVERFLOW) {
+               FPU_stack_overflow();
+               return;
        }
-      return;
-    }
-
-  reg_copy(st0_ptr, &arg);
-  if ( !fsin(st0_ptr, st0_tag) )
-    {
-      push();
-      FPU_copy_to_reg0(&arg, st0_tag);
-      f_cos(&st(0), st0_tag);
-    }
-  else
-    {
-      /* An error, so restore st(0) */
-      FPU_copy_to_reg0(&arg, st0_tag);
-    }
-}
 
+       if (st0_tag == TAG_Special)
+               tag = FPU_Special(st0_ptr);
+       else
+               tag = st0_tag;
+
+       if (tag == TW_NaN) {
+               single_arg_2_error(st0_ptr, TW_NaN);
+               return;
+       } else if (tag == TW_Infinity) {
+               /* The 80486 treats infinity as an invalid operand */
+               if (arith_invalid(0) >= 0) {
+                       /* Masked response */
+                       push();
+                       arith_invalid(0);
+               }
+               return;
+       }
+
+       reg_copy(st0_ptr, &arg);
+       if (!fsin(st0_ptr, st0_tag)) {
+               push();
+               FPU_copy_to_reg0(&arg, st0_tag);
+               f_cos(&st(0), st0_tag);
+       } else {
+               /* An error, so restore st(0) */
+               FPU_copy_to_reg0(&arg, st0_tag);
+       }
+}
 
 /*---------------------------------------------------------------------------*/
 /* The following all require two arguments: st(0) and st(1) */
@@ -826,1020 +743,901 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
    result must be zero.
  */
 static void rem_kernel(unsigned long long st0, unsigned long long *y,
-                      unsigned long long st1,
-                      unsigned long long q, int n)
+                      unsigned long long st1, unsigned long long q, int n)
 {
-  int dummy;
-  unsigned long long x;
-
-  x = st0 << n;
-
-  /* Do the required multiplication and subtraction in the one operation */
-
-  /* lsw x -= lsw st1 * lsw q */
-  asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1"
-               :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]),
-               "=a" (dummy)
-               :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0])
-               :"%dx");
-  /* msw x -= msw st1 * lsw q */
-  asm volatile ("mull %3; subl %%eax,%0"
-               :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
-               :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0])
-               :"%dx");
-  /* msw x -= lsw st1 * msw q */
-  asm volatile ("mull %3; subl %%eax,%0"
-               :"=m" (((unsigned *)&x)[1]), "=a" (dummy)
-               :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1])
-               :"%dx");
-
-  *y = x;
+       int dummy;
+       unsigned long long x;
+
+       x = st0 << n;
+
+       /* Do the required multiplication and subtraction in the one operation */
+
+       /* lsw x -= lsw st1 * lsw q */
+       asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m"
+                     (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]),
+                     "=a"(dummy)
+                     :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0])
+                     :"%dx");
+       /* msw x -= msw st1 * lsw q */
+       asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
+                     "=a"(dummy)
+                     :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0])
+                     :"%dx");
+       /* msw x -= lsw st1 * msw q */
+       asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
+                     "=a"(dummy)
+                     :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1])
+                     :"%dx");
+
+       *y = x;
 }
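
The three mull/subl statements in rem_kernel() above compute the low 64 bits of (st0 << n) - st1 * q without needing a 64x64 multiply. A portable sketch of the same arithmetic, relying on unsigned wraparound:

#include <stdio.h>

/* Illustrative only: unsigned overflow wraps modulo 2^64, which is exactly
 * what the split 32x32-bit multiply-and-subtract sequence relies on. */
static void rem_kernel_sketch(unsigned long long st0, unsigned long long *y,
                              unsigned long long st1,
                              unsigned long long q, int n)
{
        *y = (st0 << n) - st1 * q;      /* low 64 bits of the result */
}

int main(void)
{
        unsigned long long y;

        rem_kernel_sketch(7, &y, 3, 2, 4);      /* (7 << 4) - 3 * 2 = 106 */
        printf("%llu\n", y);
        return 0;
}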
 
-
 /* Remainder of st(0) / st(1) */
 /* This routine produces exact results, i.e. there is never any
    rounding or truncation, etc of the result. */
 static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
 {
-  FPU_REG *st1_ptr = &st(1);
-  u_char st1_tag = FPU_gettagi(1);
-
-  if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
-    {
-      FPU_REG tmp, st0, st1;
-      u_char st0_sign, st1_sign;
-      u_char tmptag;
-      int tag;
-      int old_cw;
-      int expdif;
-      long long q;
-      unsigned short saved_status;
-      int cc;
-
-    fprem_valid:
-      /* Convert registers for internal use. */
-      st0_sign = FPU_to_exp16(st0_ptr, &st0);
-      st1_sign = FPU_to_exp16(st1_ptr, &st1);
-      expdif = exponent16(&st0) - exponent16(&st1);
-
-      old_cw = control_word;
-      cc = 0;
-
-      /* We want the status following the denorm tests, but don't want
-        the status changed by the arithmetic operations. */
-      saved_status = partial_status;
-      control_word &= ~CW_RC;
-      control_word |= RC_CHOP;
-
-      if ( expdif < 64 )
-       {
-         /* This should be the most common case */
-
-         if ( expdif > -2 )
-           {
-             u_char sign = st0_sign ^ st1_sign;
-             tag = FPU_u_div(&st0, &st1, &tmp,
-                             PR_64_BITS | RC_CHOP | 0x3f,
-                             sign);
-             setsign(&tmp, sign);
-
-             if ( exponent(&tmp) >= 0 )
-               {
-                 FPU_round_to_int(&tmp, tag);  /* Fortunately, this can't
-                                                  overflow to 2^64 */
-                 q = significand(&tmp);
-
-                 rem_kernel(significand(&st0),
-                            &significand(&tmp),
-                            significand(&st1),
-                            q, expdif);
-
-                 setexponent16(&tmp, exponent16(&st1));
-               }
-             else
-               {
-                 reg_copy(&st0, &tmp);
-                 q = 0;
-               }
-
-             if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) )
-               {
-                 /* We may need to subtract st(1) once more,
-                    to get a result <= 1/2 of st(1). */
-                 unsigned long long x;
-                 expdif = exponent16(&st1) - exponent16(&tmp);
-                 if ( expdif <= 1 )
-                   {
-                     if ( expdif == 0 )
-                       x = significand(&st1) - significand(&tmp);
-                     else /* expdif is 1 */
-                       x = (significand(&st1) << 1) - significand(&tmp);
-                     if ( (x < significand(&tmp)) ||
-                         /* or equi-distant (from 0 & st(1)) and q is odd */
-                         ((x == significand(&tmp)) && (q & 1) ) )
-                       {
-                         st0_sign = ! st0_sign;
-                         significand(&tmp) = x;
-                         q++;
+       FPU_REG *st1_ptr = &st(1);
+       u_char st1_tag = FPU_gettagi(1);
+
+       if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+               FPU_REG tmp, st0, st1;
+               u_char st0_sign, st1_sign;
+               u_char tmptag;
+               int tag;
+               int old_cw;
+               int expdif;
+               long long q;
+               unsigned short saved_status;
+               int cc;
+
+             fprem_valid:
+               /* Convert registers for internal use. */
+               st0_sign = FPU_to_exp16(st0_ptr, &st0);
+               st1_sign = FPU_to_exp16(st1_ptr, &st1);
+               expdif = exponent16(&st0) - exponent16(&st1);
+
+               old_cw = control_word;
+               cc = 0;
+
+               /* We want the status following the denorm tests, but don't want
+                  the status changed by the arithmetic operations. */
+               saved_status = partial_status;
+               control_word &= ~CW_RC;
+               control_word |= RC_CHOP;
+
+               if (expdif < 64) {
+                       /* This should be the most common case */
+
+                       if (expdif > -2) {
+                               u_char sign = st0_sign ^ st1_sign;
+                               tag = FPU_u_div(&st0, &st1, &tmp,
+                                               PR_64_BITS | RC_CHOP | 0x3f,
+                                               sign);
+                               setsign(&tmp, sign);
+
+                               if (exponent(&tmp) >= 0) {
+                                       FPU_round_to_int(&tmp, tag);    /* Fortunately, this can't
+                                                                          overflow to 2^64 */
+                                       q = significand(&tmp);
+
+                                       rem_kernel(significand(&st0),
+                                                  &significand(&tmp),
+                                                  significand(&st1),
+                                                  q, expdif);
+
+                                       setexponent16(&tmp, exponent16(&st1));
+                               } else {
+                                       reg_copy(&st0, &tmp);
+                                       q = 0;
+                               }
+
+                               if ((round == RC_RND)
+                                   && (tmp.sigh & 0xc0000000)) {
+                                       /* We may need to subtract st(1) once more,
+                                          to get a result <= 1/2 of st(1). */
+                                       unsigned long long x;
+                                       expdif =
+                                           exponent16(&st1) - exponent16(&tmp);
+                                       if (expdif <= 1) {
+                                               if (expdif == 0)
+                                                       x = significand(&st1) -
+                                                           significand(&tmp);
+                                               else    /* expdif is 1 */
+                                                       x = (significand(&st1)
+                                                            << 1) -
+                                                           significand(&tmp);
+                                               if ((x < significand(&tmp)) ||
+                                                   /* or equi-distant (from 0 & st(1)) and q is odd */
+                                                   ((x == significand(&tmp))
+                                                    && (q & 1))) {
+                                                       st0_sign = !st0_sign;
+                                                       significand(&tmp) = x;
+                                                       q++;
+                                               }
+                                       }
+                               }
+
+                               if (q & 4)
+                                       cc |= SW_C0;
+                               if (q & 2)
+                                       cc |= SW_C3;
+                               if (q & 1)
+                                       cc |= SW_C1;
+                       } else {
+                               control_word = old_cw;
+                               setcc(0);
+                               return;
                        }
-                   }
-               }
-
-             if (q & 4) cc |= SW_C0;
-             if (q & 2) cc |= SW_C3;
-             if (q & 1) cc |= SW_C1;
-           }
-         else
-           {
-             control_word = old_cw;
-             setcc(0);
-             return;
-           }
-       }
-      else
-       {
-         /* There is a large exponent difference ( >= 64 ) */
-         /* To make much sense, the code in this section should
-            be done at high precision. */
-         int exp_1, N;
-         u_char sign;
-
-         /* prevent overflow here */
-         /* N is 'a number between 32 and 63' (p26-113) */
-         reg_copy(&st0, &tmp);
-         tmptag = st0_tag;
-         N = (expdif & 0x0000001f) + 32;  /* This choice gives results
-                                             identical to an AMD 486 */
-         setexponent16(&tmp, N);
-         exp_1 = exponent16(&st1);
-         setexponent16(&st1, 0);
-         expdif -= N;
-
-         sign = getsign(&tmp) ^ st1_sign;
-         tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
-                         sign);
-         setsign(&tmp, sign);
-
-         FPU_round_to_int(&tmp, tag);  /* Fortunately, this can't
-                                          overflow to 2^64 */
-
-         rem_kernel(significand(&st0),
-                    &significand(&tmp),
-                    significand(&st1),
-                    significand(&tmp),
-                    exponent(&tmp)
-                    ); 
-         setexponent16(&tmp, exp_1 + expdif);
-
-         /* It is possible for the operation to be complete here.
-            What does the IEEE standard say? The Intel 80486 manual
-            implies that the operation will never be completed at this
-            point, and the behaviour of a real 80486 confirms this.
-          */
-         if ( !(tmp.sigh | tmp.sigl) )
-           {
-             /* The result is zero */
-             control_word = old_cw;
-             partial_status = saved_status;
-             FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
-             setsign(&st0, st0_sign);
+               } else {
+                       /* There is a large exponent difference ( >= 64 ) */
+                       /* To make much sense, the code in this section should
+                          be done at high precision. */
+                       int exp_1, N;
+                       u_char sign;
+
+                       /* prevent overflow here */
+                       /* N is 'a number between 32 and 63' (p26-113) */
+                       reg_copy(&st0, &tmp);
+                       tmptag = st0_tag;
+                       N = (expdif & 0x0000001f) + 32; /* This choice gives results
+                                                          identical to an AMD 486 */
+                       setexponent16(&tmp, N);
+                       exp_1 = exponent16(&st1);
+                       setexponent16(&st1, 0);
+                       expdif -= N;
+
+                       sign = getsign(&tmp) ^ st1_sign;
+                       tag =
+                           FPU_u_div(&tmp, &st1, &tmp,
+                                     PR_64_BITS | RC_CHOP | 0x3f, sign);
+                       setsign(&tmp, sign);
+
+                       FPU_round_to_int(&tmp, tag);    /* Fortunately, this can't
+                                                          overflow to 2^64 */
+
+                       rem_kernel(significand(&st0),
+                                  &significand(&tmp),
+                                  significand(&st1),
+                                  significand(&tmp), exponent(&tmp)
+                           );
+                       setexponent16(&tmp, exp_1 + expdif);
+
+                       /* It is possible for the operation to be complete here.
+                          What does the IEEE standard say? The Intel 80486 manual
+                          implies that the operation will never be completed at this
+                          point, and the behaviour of a real 80486 confirms this.
+                        */
+                       if (!(tmp.sigh | tmp.sigl)) {
+                               /* The result is zero */
+                               control_word = old_cw;
+                               partial_status = saved_status;
+                               FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+                               setsign(&st0, st0_sign);
 #ifdef PECULIAR_486
-             setcc(SW_C2);
+                               setcc(SW_C2);
 #else
-             setcc(0);
+                               setcc(0);
 #endif /* PECULIAR_486 */
-             return;
-           }
-         cc = SW_C2;
-       }
+                               return;
+                       }
+                       cc = SW_C2;
+               }
 
-      control_word = old_cw;
-      partial_status = saved_status;
-      tag = FPU_normalize_nuo(&tmp);
-      reg_copy(&tmp, st0_ptr);
-
-      /* The only condition to be looked for is underflow,
-        and it can occur here only if underflow is unmasked. */
-      if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
-         && !(control_word & CW_Underflow) )
-       {
-         setcc(cc);
-         tag = arith_underflow(st0_ptr);
-         setsign(st0_ptr, st0_sign);
-         FPU_settag0(tag);
-         return;
-       }
-      else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) )
-       {
-         stdexp(st0_ptr);
-         setsign(st0_ptr, st0_sign);
-       }
-      else
-       {
-         tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
-       }
-      FPU_settag0(tag);
-      setcc(cc);
+               control_word = old_cw;
+               partial_status = saved_status;
+               tag = FPU_normalize_nuo(&tmp);
+               reg_copy(&tmp, st0_ptr);
+
+               /* The only condition to be looked for is underflow,
+                  and it can occur here only if underflow is unmasked. */
+               if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
+                   && !(control_word & CW_Underflow)) {
+                       setcc(cc);
+                       tag = arith_underflow(st0_ptr);
+                       setsign(st0_ptr, st0_sign);
+                       FPU_settag0(tag);
+                       return;
+               } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) {
+                       stdexp(st0_ptr);
+                       setsign(st0_ptr, st0_sign);
+               } else {
+                       tag =
+                           FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
+               }
+               FPU_settag0(tag);
+               setcc(cc);
 
-      return;
-    }
+               return;
+       }
 
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-  if ( st1_tag == TAG_Special )
-    st1_tag = FPU_Special(st1_ptr);
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+       if (st1_tag == TAG_Special)
+               st1_tag = FPU_Special(st1_ptr);
 
-  if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+       if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
            || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
-           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
-    {
-      if ( denormal_operand() < 0 )
-       return;
-      goto fprem_valid;
-    }
-  else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
-    {
-      FPU_stack_underflow();
-      return;
-    }
-  else if ( st0_tag == TAG_Zero )
-    {
-      if ( st1_tag == TAG_Valid )
-       {
-         setcc(0); return;
-       }
-      else if ( st1_tag == TW_Denormal )
-       {
-         if ( denormal_operand() < 0 )
-           return;
-         setcc(0); return;
-       }
-      else if ( st1_tag == TAG_Zero )
-       { arith_invalid(0); return; } /* fprem(?,0) always invalid */
-      else if ( st1_tag == TW_Infinity )
-       { setcc(0); return; }
-    }
-  else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
-    {
-      if ( st1_tag == TAG_Zero )
-       {
-         arith_invalid(0); /* fprem(Valid,Zero) is invalid */
-         return;
-       }
-      else if ( st1_tag != TW_NaN )
-       {
-         if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal))
-              && (denormal_operand() < 0) )
-           return;
-
-         if ( st1_tag == TW_Infinity )
-           {
-             /* fprem(Valid,Infinity) is o.k. */
-             setcc(0); return;
-           }
-       }
-    }
-  else if ( st0_tag == TW_Infinity )
-    {
-      if ( st1_tag != TW_NaN )
-       {
-         arith_invalid(0); /* fprem(Infinity,?) is invalid */
-         return;
+           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+               if (denormal_operand() < 0)
+                       return;
+               goto fprem_valid;
+       } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+               FPU_stack_underflow();
+               return;
+       } else if (st0_tag == TAG_Zero) {
+               if (st1_tag == TAG_Valid) {
+                       setcc(0);
+                       return;
+               } else if (st1_tag == TW_Denormal) {
+                       if (denormal_operand() < 0)
+                               return;
+                       setcc(0);
+                       return;
+               } else if (st1_tag == TAG_Zero) {
+                       arith_invalid(0);
+                       return;
+               } /* fprem(?,0) always invalid */
+               else if (st1_tag == TW_Infinity) {
+                       setcc(0);
+                       return;
+               }
+       } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+               if (st1_tag == TAG_Zero) {
+                       arith_invalid(0);       /* fprem(Valid,Zero) is invalid */
+                       return;
+               } else if (st1_tag != TW_NaN) {
+                       if (((st0_tag == TW_Denormal)
+                            || (st1_tag == TW_Denormal))
+                           && (denormal_operand() < 0))
+                               return;
+
+                       if (st1_tag == TW_Infinity) {
+                               /* fprem(Valid,Infinity) is o.k. */
+                               setcc(0);
+                               return;
+                       }
+               }
+       } else if (st0_tag == TW_Infinity) {
+               if (st1_tag != TW_NaN) {
+                       arith_invalid(0);       /* fprem(Infinity,?) is invalid */
+                       return;
+               }
        }
-    }
 
-  /* One of the registers must contain a NaN if we got here. */
+       /* One of the registers must contain a NaN if we got here. */
 
 #ifdef PARANOID
-  if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) )
-      EXCEPTION(EX_INTERNAL | 0x118);
+       if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN))
+               EXCEPTION(EX_INTERNAL | 0x118);
 #endif /* PARANOID */
 
-  real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
+       real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
 
 }
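
As a rough illustration of the quotient/remainder bookkeeping above, the user-space sketch below (all names hypothetical) computes a chop-rounded quotient and maps its low three bits the way do_fprem() maps them onto C0/C3/C1; it uses plain doubles, not the exact 64-bit significand arithmetic of the emulator.

/* Hypothetical sketch of the fprem quotient/remainder idea: quotient
 * truncated (as with RC_CHOP), remainder returned, and the low three
 * quotient bits reported in the x87 condition-code bit positions. */
#include <math.h>
#include <stdio.h>

#define SKETCH_C0 0x0100	/* x87 status-word bit positions */
#define SKETCH_C1 0x0200
#define SKETCH_C3 0x4000

static double fprem_sketch(double st0, double st1, unsigned *cc)
{
	double q = trunc(st0 / st1);		/* chop-rounded quotient */
	unsigned long long qi = (unsigned long long)fabs(q);

	*cc = 0;
	if (qi & 4)
		*cc |= SKETCH_C0;
	if (qi & 2)
		*cc |= SKETCH_C3;
	if (qi & 1)
		*cc |= SKETCH_C1;
	return st0 - q * st1;			/* partial remainder */
}

int main(void)
{
	unsigned cc;

	printf("%f (cc=%#x)\n", fprem_sketch(17.0, 5.0, &cc), cc);
	return 0;
}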
 
-
 /* ST(1) <- ST(1) * log ST;  pop ST */
 static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st1_ptr = &st(1), exponent;
-  u_char st1_tag = FPU_gettagi(1);
-  u_char sign;
-  int e, tag;
-
-  clear_C1();
-
-  if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) )
-    {
-    both_valid:
-      /* Both regs are Valid or Denormal */
-      if ( signpositive(st0_ptr) )
-       {
-         if ( st0_tag == TW_Denormal )
-           FPU_to_exp16(st0_ptr, st0_ptr);
-         else
-           /* Convert st(0) for internal use. */
-           setexponent16(st0_ptr, exponent(st0_ptr));
-
-         if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) )
-           {
-             /* Special case. The result can be precise. */
-             u_char esign;
-             e = exponent16(st0_ptr);
-             if ( e >= 0 )
-               {
-                 exponent.sigh = e;
-                 esign = SIGN_POS;
-               }
-             else
-               {
-                 exponent.sigh = -e;
-                 esign = SIGN_NEG;
+       FPU_REG *st1_ptr = &st(1), exponent;
+       u_char st1_tag = FPU_gettagi(1);
+       u_char sign;
+       int e, tag;
+
+       clear_C1();
+
+       if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) {
+             both_valid:
+               /* Both regs are Valid or Denormal */
+               if (signpositive(st0_ptr)) {
+                       if (st0_tag == TW_Denormal)
+                               FPU_to_exp16(st0_ptr, st0_ptr);
+                       else
+                               /* Convert st(0) for internal use. */
+                               setexponent16(st0_ptr, exponent(st0_ptr));
+
+                       if ((st0_ptr->sigh == 0x80000000)
+                           && (st0_ptr->sigl == 0)) {
+                               /* Special case. The result can be precise. */
+                               u_char esign;
+                               e = exponent16(st0_ptr);
+                               if (e >= 0) {
+                                       exponent.sigh = e;
+                                       esign = SIGN_POS;
+                               } else {
+                                       exponent.sigh = -e;
+                                       esign = SIGN_NEG;
+                               }
+                               exponent.sigl = 0;
+                               setexponent16(&exponent, 31);
+                               tag = FPU_normalize_nuo(&exponent);
+                               stdexp(&exponent);
+                               setsign(&exponent, esign);
+                               tag =
+                                   FPU_mul(&exponent, tag, 1, FULL_PRECISION);
+                               if (tag >= 0)
+                                       FPU_settagi(1, tag);
+                       } else {
+                               /* The usual case */
+                               sign = getsign(st1_ptr);
+                               if (st1_tag == TW_Denormal)
+                                       FPU_to_exp16(st1_ptr, st1_ptr);
+                               else
+                                       /* Convert st(1) for internal use. */
+                                       setexponent16(st1_ptr,
+                                                     exponent(st1_ptr));
+                               poly_l2(st0_ptr, st1_ptr, sign);
+                       }
+               } else {
+                       /* negative */
+                       if (arith_invalid(1) < 0)
+                               return;
                }
-             exponent.sigl = 0;
-             setexponent16(&exponent, 31);
-             tag = FPU_normalize_nuo(&exponent);
-             stdexp(&exponent);
-             setsign(&exponent, esign);
-             tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION);
-             if ( tag >= 0 )
-               FPU_settagi(1, tag);
-           }
-         else
-           {
-             /* The usual case */
-             sign = getsign(st1_ptr);
-             if ( st1_tag == TW_Denormal )
-               FPU_to_exp16(st1_ptr, st1_ptr);
-             else
-               /* Convert st(1) for internal use. */
-               setexponent16(st1_ptr, exponent(st1_ptr));
-             poly_l2(st0_ptr, st1_ptr, sign);
-           }
-       }
-      else
-       {
-         /* negative */
-         if ( arith_invalid(1) < 0 )
-           return;
-       }
 
-      FPU_pop();
-
-      return;
-    }
-
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-  if ( st1_tag == TAG_Special )
-    st1_tag = FPU_Special(st1_ptr);
-
-  if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
-    {
-      FPU_stack_underflow_pop(1);
-      return;
-    }
-  else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) )
-    {
-      if ( st0_tag == TAG_Zero )
-       {
-         if ( st1_tag == TAG_Zero )
-           {
-             /* Both args zero is invalid */
-             if ( arith_invalid(1) < 0 )
-               return;
-           }
-         else
-           {
-             u_char sign;
-             sign = getsign(st1_ptr)^SIGN_NEG;
-             if ( FPU_divide_by_zero(1, sign) < 0 )
-               return;
+               FPU_pop();
 
-             setsign(st1_ptr, sign);
-           }
-       }
-      else if ( st1_tag == TAG_Zero )
-       {
-         /* st(1) contains zero, st(0) valid <> 0 */
-         /* Zero is the valid answer */
-         sign = getsign(st1_ptr);
-         
-         if ( signnegative(st0_ptr) )
-           {
-             /* log(negative) */
-             if ( arith_invalid(1) < 0 )
                return;
-           }
-         else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
-         else
-           {
-             if ( exponent(st0_ptr) < 0 )
-               sign ^= SIGN_NEG;
-
-             FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
-             setsign(st1_ptr, sign);
-           }
        }
-      else
-       {
-         /* One or both operands are denormals. */
-         if ( denormal_operand() < 0 )
-           return;
-         goto both_valid;
-       }
-    }
-  else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
-    {
-      if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
-       return;
-    }
-  /* One or both arg must be an infinity */
-  else if ( st0_tag == TW_Infinity )
-    {
-      if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) )
-       {
-         /* log(-infinity) or 0*log(infinity) */
-         if ( arith_invalid(1) < 0 )
-           return;
-       }
-      else
-       {
-         u_char sign = getsign(st1_ptr);
 
-         if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+       if (st1_tag == TAG_Special)
+               st1_tag = FPU_Special(st1_ptr);
 
-         FPU_copy_to_reg1(&CONST_INF, TAG_Special);
-         setsign(st1_ptr, sign);
-       }
-    }
-  /* st(1) must be infinity here */
-  else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
-           && ( signpositive(st0_ptr) ) )
-    {
-      if ( exponent(st0_ptr) >= 0 )
-       {
-         if ( (exponent(st0_ptr) == 0) &&
-             (st0_ptr->sigh == 0x80000000) &&
-             (st0_ptr->sigl == 0) )
-           {
-             /* st(0) holds 1.0 */
-             /* infinity*log(1) */
-             if ( arith_invalid(1) < 0 )
+       if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+               FPU_stack_underflow_pop(1);
                return;
-           }
-         /* else st(0) is positive and > 1.0 */
+       } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) {
+               if (st0_tag == TAG_Zero) {
+                       if (st1_tag == TAG_Zero) {
+                               /* Both args zero is invalid */
+                               if (arith_invalid(1) < 0)
+                                       return;
+                       } else {
+                               u_char sign;
+                               sign = getsign(st1_ptr) ^ SIGN_NEG;
+                               if (FPU_divide_by_zero(1, sign) < 0)
+                                       return;
+
+                               setsign(st1_ptr, sign);
+                       }
+               } else if (st1_tag == TAG_Zero) {
+                       /* st(1) contains zero, st(0) valid <> 0 */
+                       /* Zero is the valid answer */
+                       sign = getsign(st1_ptr);
+
+                       if (signnegative(st0_ptr)) {
+                               /* log(negative) */
+                               if (arith_invalid(1) < 0)
+                                       return;
+                       } else if ((st0_tag == TW_Denormal)
+                                  && (denormal_operand() < 0))
+                               return;
+                       else {
+                               if (exponent(st0_ptr) < 0)
+                                       sign ^= SIGN_NEG;
+
+                               FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+                               setsign(st1_ptr, sign);
+                       }
+               } else {
+                       /* One or both operands are denormals. */
+                       if (denormal_operand() < 0)
+                               return;
+                       goto both_valid;
+               }
+       } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
+               if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+                       return;
+       }
+       /* One or both arg must be an infinity */
+       else if (st0_tag == TW_Infinity) {
+               if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) {
+                       /* log(-infinity) or 0*log(infinity) */
+                       if (arith_invalid(1) < 0)
+                               return;
+               } else {
+                       u_char sign = getsign(st1_ptr);
+
+                       if ((st1_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
+
+                       FPU_copy_to_reg1(&CONST_INF, TAG_Special);
+                       setsign(st1_ptr, sign);
+               }
        }
-      else
-       {
-         /* st(0) is positive and < 1.0 */
+       /* st(1) must be infinity here */
+       else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
+                && (signpositive(st0_ptr))) {
+               if (exponent(st0_ptr) >= 0) {
+                       if ((exponent(st0_ptr) == 0) &&
+                           (st0_ptr->sigh == 0x80000000) &&
+                           (st0_ptr->sigl == 0)) {
+                               /* st(0) holds 1.0 */
+                               /* infinity*log(1) */
+                               if (arith_invalid(1) < 0)
+                                       return;
+                       }
+                       /* else st(0) is positive and > 1.0 */
+               } else {
+                       /* st(0) is positive and < 1.0 */
 
-         if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
+                       if ((st0_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
 
-         changesign(st1_ptr);
-       }
-    }
-  else
-    {
-      /* st(0) must be zero or negative */
-      if ( st0_tag == TAG_Zero )
-       {
-         /* This should be invalid, but a real 80486 is happy with it. */
+                       changesign(st1_ptr);
+               }
+       } else {
+               /* st(0) must be zero or negative */
+               if (st0_tag == TAG_Zero) {
+                       /* This should be invalid, but a real 80486 is happy with it. */
 
 #ifndef PECULIAR_486
-         sign = getsign(st1_ptr);
-         if ( FPU_divide_by_zero(1, sign) < 0 )
-           return;
+                       sign = getsign(st1_ptr);
+                       if (FPU_divide_by_zero(1, sign) < 0)
+                               return;
 #endif /* PECULIAR_486 */
 
-         changesign(st1_ptr);
+                       changesign(st1_ptr);
+               } else if (arith_invalid(1) < 0)        /* log(negative) */
+                       return;
        }
-      else if ( arith_invalid(1) < 0 )   /* log(negative) */
-       return;
-    }
 
-  FPU_pop();
+       FPU_pop();
 }
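
What fyl2x() computes on its valid path is st(1) <- st(1) * log2(st(0)) followed by a pop; the emulator gets there via poly_l2() and exponent manipulation, but the intended result can be sketched with the C library (helper name hypothetical):

#include <math.h>
#include <stdio.h>

static double fyl2x_sketch(double st0, double st1)
{
	return st1 * log2(st0);		/* st(1) * log2(st(0)) */
}

int main(void)
{
	printf("%f\n", fyl2x_sketch(8.0, 3.0));	/* 3 * log2(8) = 9 */
	return 0;
}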
 
-
 static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st1_ptr = &st(1);
-  u_char st1_tag = FPU_gettagi(1);
-  int tag;
+       FPU_REG *st1_ptr = &st(1);
+       u_char st1_tag = FPU_gettagi(1);
+       int tag;
 
-  clear_C1();
-  if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
-    {
-    valid_atan:
+       clear_C1();
+       if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+             valid_atan:
 
-      poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
+               poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
 
-      FPU_pop();
+               FPU_pop();
 
-      return;
-    }
+               return;
+       }
 
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-  if ( st1_tag == TAG_Special )
-    st1_tag = FPU_Special(st1_ptr);
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+       if (st1_tag == TAG_Special)
+               st1_tag = FPU_Special(st1_ptr);
 
-  if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+       if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
            || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
-           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
-    {
-      if ( denormal_operand() < 0 )
-       return;
+           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+               if (denormal_operand() < 0)
+                       return;
 
-      goto valid_atan;
-    }
-  else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) )
-    {
-      FPU_stack_underflow_pop(1);
-      return;
-    }
-  else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) )
-    {
-      if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 )
-         FPU_pop();
-      return;
-    }
-  else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) )
-    {
-      u_char sign = getsign(st1_ptr);
-      if ( st0_tag == TW_Infinity )
-       {
-         if ( st1_tag == TW_Infinity )
-           {
-             if ( signpositive(st0_ptr) )
-               {
-                 FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
-               }
-             else
-               {
-                 setpositive(st1_ptr);
-                 tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr,
-                                 FULL_PRECISION, SIGN_POS,
-                                 exponent(&CONST_PI4), exponent(&CONST_PI2));
-                 if ( tag >= 0 )
-                   FPU_settagi(1, tag);
-               }
-           }
-         else
-           {
-             if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
+               goto valid_atan;
+       } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+               FPU_stack_underflow_pop(1);
+               return;
+       } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
+               if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0)
+                       FPU_pop();
                return;
+       } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) {
+               u_char sign = getsign(st1_ptr);
+               if (st0_tag == TW_Infinity) {
+                       if (st1_tag == TW_Infinity) {
+                               if (signpositive(st0_ptr)) {
+                                       FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
+                               } else {
+                                       setpositive(st1_ptr);
+                                       tag =
+                                           FPU_u_add(&CONST_PI4, &CONST_PI2,
+                                                     st1_ptr, FULL_PRECISION,
+                                                     SIGN_POS,
+                                                     exponent(&CONST_PI4),
+                                                     exponent(&CONST_PI2));
+                                       if (tag >= 0)
+                                               FPU_settagi(1, tag);
+                               }
+                       } else {
+                               if ((st1_tag == TW_Denormal)
+                                   && (denormal_operand() < 0))
+                                       return;
+
+                               if (signpositive(st0_ptr)) {
+                                       FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+                                       setsign(st1_ptr, sign); /* An 80486 preserves the sign */
+                                       FPU_pop();
+                                       return;
+                               } else {
+                                       FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
+                               }
+                       }
+               } else {
+                       /* st(1) is infinity, st(0) not infinity */
+                       if ((st0_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
 
-             if ( signpositive(st0_ptr) )
-               {
-                 FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
-                 setsign(st1_ptr, sign);   /* An 80486 preserves the sign */
-                 FPU_pop();
-                 return;
+                       FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
                }
-             else
-               {
-                 FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
+               setsign(st1_ptr, sign);
+       } else if (st1_tag == TAG_Zero) {
+               /* st(0) must be valid or zero */
+               u_char sign = getsign(st1_ptr);
+
+               if ((st0_tag == TW_Denormal) && (denormal_operand() < 0))
+                       return;
+
+               if (signpositive(st0_ptr)) {
+                       /* An 80486 preserves the sign */
+                       FPU_pop();
+                       return;
                }
-           }
-       }
-      else
-       {
-         /* st(1) is infinity, st(0) not infinity */
-         if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
 
-         FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
-       }
-      setsign(st1_ptr, sign);
-    }
-  else if ( st1_tag == TAG_Zero )
-    {
-      /* st(0) must be valid or zero */
-      u_char sign = getsign(st1_ptr);
-
-      if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-       return;
+               FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
+               setsign(st1_ptr, sign);
+       } else if (st0_tag == TAG_Zero) {
+               /* st(1) must be TAG_Valid here */
+               u_char sign = getsign(st1_ptr);
 
-      if ( signpositive(st0_ptr) )
-       {
-         /* An 80486 preserves the sign */
-         FPU_pop();
-         return;
-       }
+               if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
+                       return;
 
-      FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
-      setsign(st1_ptr, sign);
-    }
-  else if ( st0_tag == TAG_Zero )
-    {
-      /* st(1) must be TAG_Valid here */
-      u_char sign = getsign(st1_ptr);
-
-      if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
-       return;
-
-      FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
-      setsign(st1_ptr, sign);
-    }
+               FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
+               setsign(st1_ptr, sign);
+       }
 #ifdef PARANOID
-  else
-    EXCEPTION(EX_INTERNAL | 0x125);
+       else
+               EXCEPTION(EX_INTERNAL | 0x125);
 #endif /* PARANOID */
 
-  FPU_pop();
-  set_precision_flag_up();  /* We do not really know if up or down */
+       FPU_pop();
+       set_precision_flag_up();        /* We do not really know if up or down */
 }
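
The valid path of fpatan() amounts to st(1) <- arctan(st(1)/st(0)) with the quadrant fixed up from the two signs, then a pop, which is the result atan2() gives in C; the infinity cases above (CONST_PI4, CONST_PI4 + CONST_PI2, CONST_PI2, CONST_PI) line up with atan2()'s conventions. A minimal sketch (helper name hypothetical):

#include <math.h>
#include <stdio.h>

static double fpatan_sketch(double st0, double st1)
{
	return atan2(st1, st0);		/* arctan(st1/st0), quadrant-correct */
}

int main(void)
{
	printf("%f\n", fpatan_sketch(1.0, 1.0));	/* pi/4 */
	return 0;
}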
 
-
 static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  do_fprem(st0_ptr, st0_tag, RC_CHOP);
+       do_fprem(st0_ptr, st0_tag, RC_CHOP);
 }
 
-
 static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  do_fprem(st0_ptr, st0_tag, RC_RND);
+       do_fprem(st0_ptr, st0_tag, RC_RND);
 }
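
The two wrappers differ only in how the quotient is rounded: RC_CHOP (truncate) for fprem, RC_RND (round to nearest even) for the IEEE-style fprem1. In user space that is, roughly, the difference between fmod() and C99 remainder():

#include <math.h>
#include <stdio.h>

int main(void)
{
	printf("fmod:      %f\n", fmod(7.0, 4.0));	/*  3.0, quotient truncated */
	printf("remainder: %f\n", remainder(7.0, 4.0));	/* -1.0, quotient rounded   */
	return 0;
}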
 
-
 static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  u_char sign, sign1;
-  FPU_REG *st1_ptr = &st(1), a, b;
-  u_char st1_tag = FPU_gettagi(1);
+       u_char sign, sign1;
+       FPU_REG *st1_ptr = &st(1), a, b;
+       u_char st1_tag = FPU_gettagi(1);
 
-  clear_C1();
-  if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
-    {
-    valid_yl2xp1:
+       clear_C1();
+       if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+             valid_yl2xp1:
 
-      sign = getsign(st0_ptr);
-      sign1 = getsign(st1_ptr);
+               sign = getsign(st0_ptr);
+               sign1 = getsign(st1_ptr);
 
-      FPU_to_exp16(st0_ptr, &a);
-      FPU_to_exp16(st1_ptr, &b);
+               FPU_to_exp16(st0_ptr, &a);
+               FPU_to_exp16(st1_ptr, &b);
 
-      if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) )
-       return;
+               if (poly_l2p1(sign, sign1, &a, &b, st1_ptr))
+                       return;
 
-      FPU_pop();
-      return;
-    }
+               FPU_pop();
+               return;
+       }
 
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-  if ( st1_tag == TAG_Special )
-    st1_tag = FPU_Special(st1_ptr);
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+       if (st1_tag == TAG_Special)
+               st1_tag = FPU_Special(st1_ptr);
 
-  if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+       if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
            || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
-           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) )
-    {
-      if ( denormal_operand() < 0 )
-       return;
-
-      goto valid_yl2xp1;
-    }
-  else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) )
-    {
-      FPU_stack_underflow_pop(1);
-      return;
-    }
-  else if ( st0_tag == TAG_Zero )
-    {
-      switch ( st1_tag )
-       {
-       case TW_Denormal:
-         if ( denormal_operand() < 0 )
-           return;
-
-       case TAG_Zero:
-       case TAG_Valid:
-         setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
-         FPU_copy_to_reg1(st0_ptr, st0_tag);
-         break;
-
-       case TW_Infinity:
-         /* Infinity*log(1) */
-         if ( arith_invalid(1) < 0 )
-           return;
-         break;
+           || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+               if (denormal_operand() < 0)
+                       return;
 
-       case TW_NaN:
-         if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
-           return;
-         break;
-
-       default:
+               goto valid_yl2xp1;
+       } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) {
+               FPU_stack_underflow_pop(1);
+               return;
+       } else if (st0_tag == TAG_Zero) {
+               switch (st1_tag) {
+               case TW_Denormal:
+                       if (denormal_operand() < 0)
+                               return;
+
+               case TAG_Zero:
+               case TAG_Valid:
+                       setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
+                       FPU_copy_to_reg1(st0_ptr, st0_tag);
+                       break;
+
+               case TW_Infinity:
+                       /* Infinity*log(1) */
+                       if (arith_invalid(1) < 0)
+                               return;
+                       break;
+
+               case TW_NaN:
+                       if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+                               return;
+                       break;
+
+               default:
 #ifdef PARANOID
-         EXCEPTION(EX_INTERNAL | 0x116);
-         return;
+                       EXCEPTION(EX_INTERNAL | 0x116);
+                       return;
 #endif /* PARANOID */
-         break;
-       }
-    }
-  else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
-    {
-      switch ( st1_tag )
-       {
-       case TAG_Zero:
-         if ( signnegative(st0_ptr) )
-           {
-             if ( exponent(st0_ptr) >= 0 )
-               {
-                 /* st(0) holds <= -1.0 */
-#ifdef PECULIAR_486   /* Stupid 80486 doesn't worry about log(negative). */
-                 changesign(st1_ptr);
+                       break;
+               }
+       } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+               switch (st1_tag) {
+               case TAG_Zero:
+                       if (signnegative(st0_ptr)) {
+                               if (exponent(st0_ptr) >= 0) {
+                                       /* st(0) holds <= -1.0 */
+#ifdef PECULIAR_486            /* Stupid 80486 doesn't worry about log(negative). */
+                                       changesign(st1_ptr);
 #else
-                 if ( arith_invalid(1) < 0 )
-                   return;
+                                       if (arith_invalid(1) < 0)
+                                               return;
 #endif /* PECULIAR_486 */
-               }
-             else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-               return;
-             else
-               changesign(st1_ptr);
-           }
-         else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
-         break;
-
-       case TW_Infinity:
-         if ( signnegative(st0_ptr) )
-           {
-             if ( (exponent(st0_ptr) >= 0) &&
-                 !((st0_ptr->sigh == 0x80000000) &&
-                   (st0_ptr->sigl == 0)) )
-               {
-                 /* st(0) holds < -1.0 */
-#ifdef PECULIAR_486   /* Stupid 80486 doesn't worry about log(negative). */
-                 changesign(st1_ptr);
+                               } else if ((st0_tag == TW_Denormal)
+                                          && (denormal_operand() < 0))
+                                       return;
+                               else
+                                       changesign(st1_ptr);
+                       } else if ((st0_tag == TW_Denormal)
+                                  && (denormal_operand() < 0))
+                               return;
+                       break;
+
+               case TW_Infinity:
+                       if (signnegative(st0_ptr)) {
+                               if ((exponent(st0_ptr) >= 0) &&
+                                   !((st0_ptr->sigh == 0x80000000) &&
+                                     (st0_ptr->sigl == 0))) {
+                                       /* st(0) holds < -1.0 */
+#ifdef PECULIAR_486            /* Stupid 80486 doesn't worry about log(negative). */
+                                       changesign(st1_ptr);
 #else
-                 if ( arith_invalid(1) < 0 ) return;
+                                       if (arith_invalid(1) < 0)
+                                               return;
 #endif /* PECULIAR_486 */
+                               } else if ((st0_tag == TW_Denormal)
+                                          && (denormal_operand() < 0))
+                                       return;
+                               else
+                                       changesign(st1_ptr);
+                       } else if ((st0_tag == TW_Denormal)
+                                  && (denormal_operand() < 0))
+                               return;
+                       break;
+
+               case TW_NaN:
+                       if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+                               return;
                }
-             else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-               return;
-             else
-               changesign(st1_ptr);
-           }
-         else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
-         break;
-
-       case TW_NaN:
-         if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
-           return;
-       }
 
-    }
-  else if ( st0_tag == TW_NaN )
-    {
-      if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
-       return;
-    }
-  else if ( st0_tag == TW_Infinity )
-    {
-      if ( st1_tag == TW_NaN )
-       {
-         if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 )
-           return;
-       }
-      else if ( signnegative(st0_ptr) )
-       {
+       } else if (st0_tag == TW_NaN) {
+               if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+                       return;
+       } else if (st0_tag == TW_Infinity) {
+               if (st1_tag == TW_NaN) {
+                       if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+                               return;
+               } else if (signnegative(st0_ptr)) {
 #ifndef PECULIAR_486
-         /* This should have higher priority than denormals, but... */
-         if ( arith_invalid(1) < 0 )  /* log(-infinity) */
-           return;
+                       /* This should have higher priority than denormals, but... */
+                       if (arith_invalid(1) < 0)       /* log(-infinity) */
+                               return;
 #endif /* PECULIAR_486 */
-         if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
+                       if ((st1_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
 #ifdef PECULIAR_486
-         /* Denormal operands actually get higher priority */
-         if ( arith_invalid(1) < 0 )  /* log(-infinity) */
-           return;
+                       /* Denormal operands actually get higher priority */
+                       if (arith_invalid(1) < 0)       /* log(-infinity) */
+                               return;
 #endif /* PECULIAR_486 */
-       }
-      else if ( st1_tag == TAG_Zero )
-       {
-         /* log(infinity) */
-         if ( arith_invalid(1) < 0 )
-           return;
-       }
-       
-      /* st(1) must be valid here. */
+               } else if (st1_tag == TAG_Zero) {
+                       /* log(infinity) */
+                       if (arith_invalid(1) < 0)
+                               return;
+               }
 
-      else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) )
-       return;
+               /* st(1) must be valid here. */
+
+               else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
+                       return;
 
-      /* The Manual says that log(Infinity) is invalid, but a real
-        80486 sensibly says that it is o.k. */
-      else
-       {
-         u_char sign = getsign(st1_ptr);
-         FPU_copy_to_reg1(&CONST_INF, TAG_Special);
-         setsign(st1_ptr, sign);
+               /* The Manual says that log(Infinity) is invalid, but a real
+                  80486 sensibly says that it is o.k. */
+               else {
+                       u_char sign = getsign(st1_ptr);
+                       FPU_copy_to_reg1(&CONST_INF, TAG_Special);
+                       setsign(st1_ptr, sign);
+               }
        }
-    }
 #ifdef PARANOID
-  else
-    {
-      EXCEPTION(EX_INTERNAL | 0x117);
-      return;
-    }
+       else {
+               EXCEPTION(EX_INTERNAL | 0x117);
+               return;
+       }
 #endif /* PARANOID */
 
-  FPU_pop();
-  return;
+       FPU_pop();
+       return;
 
 }
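
On its valid path fyl2xp1() computes st(1) <- st(1) * log2(st(0) + 1) and pops; the instruction exists so that tiny arguments keep their precision, which log1p() also provides in C. A minimal sketch (helper name hypothetical):

#include <math.h>
#include <stdio.h>

static double fyl2xp1_sketch(double st0, double st1)
{
	return st1 * (log1p(st0) / log(2.0));	/* log2(1 + st0) via log1p */
}

int main(void)
{
	printf("%.17g\n", fyl2xp1_sketch(1e-10, 2.0));
	return 0;
}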
 
-
 static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
 {
-  FPU_REG *st1_ptr = &st(1);
-  u_char st1_tag = FPU_gettagi(1);
-  int old_cw = control_word;
-  u_char sign = getsign(st0_ptr);
-
-  clear_C1();
-  if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) )
-    {
-      long scale;
-      FPU_REG tmp;
-
-      /* Convert register for internal use. */
-      setexponent16(st0_ptr, exponent(st0_ptr));
-
-    valid_scale:
-
-      if ( exponent(st1_ptr) > 30 )
-       {
-         /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
-
-         if ( signpositive(st1_ptr) )
-           {
-             EXCEPTION(EX_Overflow);
-             FPU_copy_to_reg0(&CONST_INF, TAG_Special);
-           }
-         else
-           {
-             EXCEPTION(EX_Underflow);
-             FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
-           }
-         setsign(st0_ptr, sign);
-         return;
-       }
-
-      control_word &= ~CW_RC;
-      control_word |= RC_CHOP;
-      reg_copy(st1_ptr, &tmp);
-      FPU_round_to_int(&tmp, st1_tag);      /* This can never overflow here */
-      control_word = old_cw;
-      scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
-      scale += exponent16(st0_ptr);
-
-      setexponent16(st0_ptr, scale);
-
-      /* Use FPU_round() to properly detect under/overflow etc */
-      FPU_round(st0_ptr, 0, 0, control_word, sign);
-
-      return;
-    }
-
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-  if ( st1_tag == TAG_Special )
-    st1_tag = FPU_Special(st1_ptr);
-
-  if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) )
-    {
-      switch ( st1_tag )
-       {
-       case TAG_Valid:
-         /* st(0) must be a denormal */
-         if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
-
-         FPU_to_exp16(st0_ptr, st0_ptr);  /* Will not be left on stack */
-         goto valid_scale;
-
-       case TAG_Zero:
-         if ( st0_tag == TW_Denormal )
-           denormal_operand();
-         return;
-
-       case TW_Denormal:
-         denormal_operand();
-         return;
-
-       case TW_Infinity:
-         if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) )
-           return;
-
-         if ( signpositive(st1_ptr) )
-           FPU_copy_to_reg0(&CONST_INF, TAG_Special);
-         else
-           FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
-         setsign(st0_ptr, sign);
-         return;
+       FPU_REG *st1_ptr = &st(1);
+       u_char st1_tag = FPU_gettagi(1);
+       int old_cw = control_word;
+       u_char sign = getsign(st0_ptr);
+
+       clear_C1();
+       if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+               long scale;
+               FPU_REG tmp;
+
+               /* Convert register for internal use. */
+               setexponent16(st0_ptr, exponent(st0_ptr));
+
+             valid_scale:
+
+               if (exponent(st1_ptr) > 30) {
+                       /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
+
+                       if (signpositive(st1_ptr)) {
+                               EXCEPTION(EX_Overflow);
+                               FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+                       } else {
+                               EXCEPTION(EX_Underflow);
+                               FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+                       }
+                       setsign(st0_ptr, sign);
+                       return;
+               }
 
-       case TW_NaN:
-         real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
-         return;
-       }
-    }
-  else if ( st0_tag == TAG_Zero )
-    {
-      switch ( st1_tag )
-       {
-       case TAG_Valid:
-       case TAG_Zero:
-         return;
+               control_word &= ~CW_RC;
+               control_word |= RC_CHOP;
+               reg_copy(st1_ptr, &tmp);
+               FPU_round_to_int(&tmp, st1_tag);        /* This can never overflow here */
+               control_word = old_cw;
+               scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
+               scale += exponent16(st0_ptr);
 
-       case TW_Denormal:
-         denormal_operand();
-         return;
+               setexponent16(st0_ptr, scale);
 
-       case TW_Infinity:
-         if ( signpositive(st1_ptr) )
-           arith_invalid(0); /* Zero scaled by +Infinity */
-         return;
+               /* Use FPU_round() to properly detect under/overflow etc */
+               FPU_round(st0_ptr, 0, 0, control_word, sign);
 
-       case TW_NaN:
-         real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
-         return;
+               return;
        }
-    }
-  else if ( st0_tag == TW_Infinity )
-    {
-      switch ( st1_tag )
-       {
-       case TAG_Valid:
-       case TAG_Zero:
-         return;
-
-       case TW_Denormal:
-         denormal_operand();
-         return;
 
-       case TW_Infinity:
-         if ( signnegative(st1_ptr) )
-           arith_invalid(0); /* Infinity scaled by -Infinity */
-         return;
-
-       case TW_NaN:
-         real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
-         return;
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+       if (st1_tag == TAG_Special)
+               st1_tag = FPU_Special(st1_ptr);
+
+       if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+               switch (st1_tag) {
+               case TAG_Valid:
+                       /* st(0) must be a denormal */
+                       if ((st0_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
+
+                       FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
+                       goto valid_scale;
+
+               case TAG_Zero:
+                       if (st0_tag == TW_Denormal)
+                               denormal_operand();
+                       return;
+
+               case TW_Denormal:
+                       denormal_operand();
+                       return;
+
+               case TW_Infinity:
+                       if ((st0_tag == TW_Denormal)
+                           && (denormal_operand() < 0))
+                               return;
+
+                       if (signpositive(st1_ptr))
+                               FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+                       else
+                               FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+                       setsign(st0_ptr, sign);
+                       return;
+
+               case TW_NaN:
+                       real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+                       return;
+               }
+       } else if (st0_tag == TAG_Zero) {
+               switch (st1_tag) {
+               case TAG_Valid:
+               case TAG_Zero:
+                       return;
+
+               case TW_Denormal:
+                       denormal_operand();
+                       return;
+
+               case TW_Infinity:
+                       if (signpositive(st1_ptr))
+                               arith_invalid(0);       /* Zero scaled by +Infinity */
+                       return;
+
+               case TW_NaN:
+                       real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+                       return;
+               }
+       } else if (st0_tag == TW_Infinity) {
+               switch (st1_tag) {
+               case TAG_Valid:
+               case TAG_Zero:
+                       return;
+
+               case TW_Denormal:
+                       denormal_operand();
+                       return;
+
+               case TW_Infinity:
+                       if (signnegative(st1_ptr))
+                               arith_invalid(0);       /* Infinity scaled by -Infinity */
+                       return;
+
+               case TW_NaN:
+                       real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+                       return;
+               }
+       } else if (st0_tag == TW_NaN) {
+               if (st1_tag != TAG_Empty) {
+                       real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+                       return;
+               }
        }
-    }
-  else if ( st0_tag == TW_NaN )
-    {
-      if ( st1_tag != TAG_Empty )
-       { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; }
-    }
-
 #ifdef PARANOID
-  if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) )
-    {
-      EXCEPTION(EX_INTERNAL | 0x115);
-      return;
-    }
+       if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) {
+               EXCEPTION(EX_INTERNAL | 0x115);
+               return;
+       }
 #endif
 
-  /* At least one of st(0), st(1) must be empty */
-  FPU_stack_underflow();
+       /* At least one of st(0), st(1) must be empty */
+       FPU_stack_underflow();
 
 }
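
The valid_scale path above chops st(1) to an integer and adds it to the exponent of st(0), i.e. st(0) <- st(0) * 2^trunc(st(1)); ldexp() expresses the same operation in user space, with overflow and underflow left to the C library rather than signalled through EX_Overflow/EX_Underflow. Sketch (helper name hypothetical):

#include <math.h>
#include <stdio.h>

static double fscale_sketch(double st0, double st1)
{
	return ldexp(st0, (int)trunc(st1));	/* scale by a power of two */
}

int main(void)
{
	printf("%f\n", fscale_sketch(3.0, 2.9));	/* 3 * 2^2 = 12 */
	return 0;
}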
 
-
 /*---------------------------------------------------------------------------*/
 
 static FUNC_ST0 const trig_table_a[] = {
-  f2xm1, fyl2x, fptan, fpatan,
-  fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp
+       f2xm1, fyl2x, fptan, fpatan,
+       fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
 };
 
 void FPU_triga(void)
 {
-  (trig_table_a[FPU_rm])(&st(0), FPU_gettag0());
+       (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0());
 }
 
-
-static FUNC_ST0 const trig_table_b[] =
-  {
-    fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos
-  };
+static FUNC_ST0 const trig_table_b[] = {
+       fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
+};
 
 void FPU_trigb(void)
 {
-  (trig_table_b[FPU_rm])(&st(0), FPU_gettag0());
+       (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0());
 }
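
The FPU_triga()/FPU_trigb() helpers above dispatch through a table of FUNC_ST0 pointers indexed by FPU_rm, the low three bits of the instruction's ModR/M byte. A minimal, self-contained sketch of the same dispatch pattern (handler and table names here are made up for illustration, not the emulator's API):

#include <stdio.h>

typedef void (*op_fn)(void);

/* Stub handlers standing in for f2xm1, fyl2x, ... */
static void op0(void) { puts("handler 0"); }
static void op1(void) { puts("handler 1"); }
static void op_other(void) { puts("another handler"); }

/* One slot per value of the 3-bit R/M field, like trig_table_a/b. */
static op_fn const demo_table[8] = {
        op0, op1, op_other, op_other,
        op_other, op_other, op_other, op_other
};

int main(void)
{
        unsigned char modrm = 0xf1;     /* example second opcode byte */
        unsigned rm = modrm & 7;        /* the FPU_rm field */

        demo_table[rm]();               /* dispatch, as FPU_triga() does */
        return 0;
}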
index 2e2c51a8bd3ae6d921167607977d9f25d2f75ae0..d701e2b39e4438aacb8fc22ff000e79083f3bcec 100644
@@ -17,7 +17,6 @@
  |    other processes using the emulator while swapping is in progress.      |
  +---------------------------------------------------------------------------*/
 
-
 #include <linux/stddef.h>
 
 #include <asm/uaccess.h>
 #include "exception.h"
 #include "fpu_emu.h"
 
-
 #define FPU_WRITE_BIT 0x10
 
 static int reg_offset[] = {
-       offsetof(struct info,___eax),
-       offsetof(struct info,___ecx),
-       offsetof(struct info,___edx),
-       offsetof(struct info,___ebx),
-       offsetof(struct info,___esp),
-       offsetof(struct info,___ebp),
-       offsetof(struct info,___esi),
-       offsetof(struct info,___edi)
+       offsetof(struct info, ___eax),
+       offsetof(struct info, ___ecx),
+       offsetof(struct info, ___edx),
+       offsetof(struct info, ___ebx),
+       offsetof(struct info, ___esp),
+       offsetof(struct info, ___ebp),
+       offsetof(struct info, ___esi),
+       offsetof(struct info, ___edi)
 };
 
 #define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info))
 
 static int reg_offset_vm86[] = {
-       offsetof(struct info,___cs),
-       offsetof(struct info,___vm86_ds),
-       offsetof(struct info,___vm86_es),
-       offsetof(struct info,___vm86_fs),
-       offsetof(struct info,___vm86_gs),
-       offsetof(struct info,___ss),
-       offsetof(struct info,___vm86_ds)
-      };
+       offsetof(struct info, ___cs),
+       offsetof(struct info, ___vm86_ds),
+       offsetof(struct info, ___vm86_es),
+       offsetof(struct info, ___vm86_fs),
+       offsetof(struct info, ___vm86_gs),
+       offsetof(struct info, ___ss),
+       offsetof(struct info, ___vm86_ds)
+};
 
 #define VM86_REG_(x) (*(unsigned short *) \
                      (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info))
@@ -60,158 +58,141 @@ static int reg_offset_vm86[] = {
 #define ___GS ___ds
 
 static int reg_offset_pm[] = {
-       offsetof(struct info,___cs),
-       offsetof(struct info,___ds),
-       offsetof(struct info,___es),
-       offsetof(struct info,___fs),
-       offsetof(struct info,___GS),
-       offsetof(struct info,___ss),
-       offsetof(struct info,___ds)
-      };
+       offsetof(struct info, ___cs),
+       offsetof(struct info, ___ds),
+       offsetof(struct info, ___es),
+       offsetof(struct info, ___fs),
+       offsetof(struct info, ___GS),
+       offsetof(struct info, ___ss),
+       offsetof(struct info, ___ds)
+};
 
 #define PM_REG_(x) (*(unsigned short *) \
                      (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info))
 
-
 /* Decode the SIB byte. This function assumes mod != 0 */
 static int sib(int mod, unsigned long *fpu_eip)
 {
-  u_char ss,index,base;
-  long offset;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_code_access_ok(1);
-  FPU_get_user(base, (u_char __user *) (*fpu_eip));   /* The SIB byte */
-  RE_ENTRANT_CHECK_ON;
-  (*fpu_eip)++;
-  ss = base >> 6;
-  index = (base >> 3) & 7;
-  base &= 7;
-
-  if ((mod == 0) && (base == 5))
-    offset = 0;              /* No base register */
-  else
-    offset = REG_(base);
-
-  if (index == 4)
-    {
-      /* No index register */
-      /* A non-zero ss is illegal */
-      if ( ss )
-       EXCEPTION(EX_Invalid);
-    }
-  else
-    {
-      offset += (REG_(index)) << ss;
-    }
-
-  if (mod == 1)
-    {
-      /* 8 bit signed displacement */
-      long displacement;
-      RE_ENTRANT_CHECK_OFF;
-      FPU_code_access_ok(1);
-      FPU_get_user(displacement, (signed char __user *) (*fpu_eip));
-      offset += displacement;
-      RE_ENTRANT_CHECK_ON;
-      (*fpu_eip)++;
-    }
-  else if (mod == 2 || base == 5) /* The second condition also has mod==0 */
-    {
-      /* 32 bit displacement */
-      long displacement;
-      RE_ENTRANT_CHECK_OFF;
-      FPU_code_access_ok(4);
-      FPU_get_user(displacement, (long __user *) (*fpu_eip));
-      offset += displacement;
-      RE_ENTRANT_CHECK_ON;
-      (*fpu_eip) += 4;
-    }
-
-  return offset;
-}
+       u_char ss, index, base;
+       long offset;
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_code_access_ok(1);
+       FPU_get_user(base, (u_char __user *) (*fpu_eip));       /* The SIB byte */
+       RE_ENTRANT_CHECK_ON;
+       (*fpu_eip)++;
+       ss = base >> 6;
+       index = (base >> 3) & 7;
+       base &= 7;
+
+       if ((mod == 0) && (base == 5))
+               offset = 0;     /* No base register */
+       else
+               offset = REG_(base);
+
+       if (index == 4) {
+               /* No index register */
+               /* A non-zero ss is illegal */
+               if (ss)
+                       EXCEPTION(EX_Invalid);
+       } else {
+               offset += (REG_(index)) << ss;
+       }
+
+       if (mod == 1) {
+               /* 8 bit signed displacement */
+               long displacement;
+               RE_ENTRANT_CHECK_OFF;
+               FPU_code_access_ok(1);
+               FPU_get_user(displacement, (signed char __user *)(*fpu_eip));
+               offset += displacement;
+               RE_ENTRANT_CHECK_ON;
+               (*fpu_eip)++;
+       } else if (mod == 2 || base == 5) {     /* The second condition also has mod==0 */
+               /* 32 bit displacement */
+               long displacement;
+               RE_ENTRANT_CHECK_OFF;
+               FPU_code_access_ok(4);
+               FPU_get_user(displacement, (long __user *)(*fpu_eip));
+               offset += displacement;
+               RE_ENTRANT_CHECK_ON;
+               (*fpu_eip) += 4;
+       }
 
+       return offset;
+}
 
-static unsigned long vm86_segment(u_char segment,
-                                 struct address *addr)
+static unsigned long vm86_segment(u_char segment, struct address *addr)
 {
-  segment--;
+       segment--;
 #ifdef PARANOID
-  if ( segment > PREFIX_SS_ )
-    {
-      EXCEPTION(EX_INTERNAL|0x130);
-      math_abort(FPU_info,SIGSEGV);
-    }
+       if (segment > PREFIX_SS_) {
+               EXCEPTION(EX_INTERNAL | 0x130);
+               math_abort(FPU_info, SIGSEGV);
+       }
 #endif /* PARANOID */
-  addr->selector = VM86_REG_(segment);
-  return (unsigned long)VM86_REG_(segment) << 4;
+       addr->selector = VM86_REG_(segment);
+       return (unsigned long)VM86_REG_(segment) << 4;
 }
 
-
 /* This should work for 16 and 32 bit protected mode. */
 static long pm_address(u_char FPU_modrm, u_char segment,
                       struct address *addr, long offset)
-{ 
-  struct desc_struct descriptor;
-  unsigned long base_address, limit, address, seg_top;
+{
+       struct desc_struct descriptor;
+       unsigned long base_address, limit, address, seg_top;
 
-  segment--;
+       segment--;
 
 #ifdef PARANOID
-  /* segment is unsigned, so this also detects if segment was 0: */
-  if ( segment > PREFIX_SS_ )
-    {
-      EXCEPTION(EX_INTERNAL|0x132);
-      math_abort(FPU_info,SIGSEGV);
-    }
+       /* segment is unsigned, so this also detects if segment was 0: */
+       if (segment > PREFIX_SS_) {
+               EXCEPTION(EX_INTERNAL | 0x132);
+               math_abort(FPU_info, SIGSEGV);
+       }
 #endif /* PARANOID */
 
-  switch ( segment )
-    {
-      /* gs isn't used by the kernel, so it still has its
-        user-space value. */
-    case PREFIX_GS_-1:
-      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
-      savesegment(gs, addr->selector);
-      break;
-    default:
-      addr->selector = PM_REG_(segment);
-    }
-
-  descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
-  base_address = SEG_BASE_ADDR(descriptor);
-  address = base_address + offset;
-  limit = base_address
-       + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1;
-  if ( limit < base_address ) limit = 0xffffffff;
-
-  if ( SEG_EXPAND_DOWN(descriptor) )
-    {
-      if ( SEG_G_BIT(descriptor) )
-       seg_top = 0xffffffff;
-      else
-       {
-         seg_top = base_address + (1 << 20);
-         if ( seg_top < base_address ) seg_top = 0xffffffff;
+       switch (segment) {
+               /* gs isn't used by the kernel, so it still has its
+                  user-space value. */
+       case PREFIX_GS_ - 1:
+               /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
+               savesegment(gs, addr->selector);
+               break;
+       default:
+               addr->selector = PM_REG_(segment);
        }
-      access_limit =
-       (address <= limit) || (address >= seg_top) ? 0 :
-         ((seg_top-address) >= 255 ? 255 : seg_top-address);
-    }
-  else
-    {
-      access_limit =
-       (address > limit) || (address < base_address) ? 0 :
-         ((limit-address) >= 254 ? 255 : limit-address+1);
-    }
-  if ( SEG_EXECUTE_ONLY(descriptor) ||
-      (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) )
-    {
-      access_limit = 0;
-    }
-  return address;
-}
 
+       descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
+       base_address = SEG_BASE_ADDR(descriptor);
+       address = base_address + offset;
+       limit = base_address
+           + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1;
+       if (limit < base_address)
+               limit = 0xffffffff;
+
+       if (SEG_EXPAND_DOWN(descriptor)) {
+               if (SEG_G_BIT(descriptor))
+                       seg_top = 0xffffffff;
+               else {
+                       seg_top = base_address + (1 << 20);
+                       if (seg_top < base_address)
+                               seg_top = 0xffffffff;
+               }
+               access_limit =
+                   (address <= limit) || (address >= seg_top) ? 0 :
+                   ((seg_top - address) >= 255 ? 255 : seg_top - address);
+       } else {
+               access_limit =
+                   (address > limit) || (address < base_address) ? 0 :
+                   ((limit - address) >= 254 ? 255 : limit - address + 1);
+       }
+       if (SEG_EXECUTE_ONLY(descriptor) ||
+           (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
+               access_limit = 0;
+       }
+       return address;
+}
 
 /*
        MOD R/M byte:  MOD == 3 has a special use for the FPU
@@ -221,7 +202,6 @@ static long pm_address(u_char FPU_modrm, u_char segment,
        .....   .........   .........
         MOD    OPCODE(2)     R/M
 
-
        SIB byte
 
        7   6   5   4   3   2   1   0
@@ -231,208 +211,194 @@ static long pm_address(u_char FPU_modrm, u_char segment,
 */
 
 void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
-                 struct address *addr,
-                 fpu_addr_modes addr_modes)
+                            struct address *addr, fpu_addr_modes addr_modes)
+{
+       u_char mod;
+       unsigned rm = FPU_modrm & 7;
+       long *cpu_reg_ptr;
+       int address = 0;        /* Initialized just to stop compiler warnings. */
+
+       /* Memory accessed via the cs selector is write protected
+          in `non-segmented' 32 bit protected mode. */
+       if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
+           && (addr_modes.override.segment == PREFIX_CS_)) {
+               math_abort(FPU_info, SIGSEGV);
+       }
+
+       addr->selector = FPU_DS;        /* Default, for 32 bit non-segmented mode. */
+
+       mod = (FPU_modrm >> 6) & 3;
+
+       if (rm == 4 && mod != 3) {
+               address = sib(mod, fpu_eip);
+       } else {
+               cpu_reg_ptr = &REG_(rm);
+               switch (mod) {
+               case 0:
+                       if (rm == 5) {
+                               /* Special case: disp32 */
+                               RE_ENTRANT_CHECK_OFF;
+                               FPU_code_access_ok(4);
+                               FPU_get_user(address,
+                                            (unsigned long __user
+                                             *)(*fpu_eip));
+                               (*fpu_eip) += 4;
+                               RE_ENTRANT_CHECK_ON;
+                               addr->offset = address;
+                               return (void __user *)address;
+                       } else {
+                               address = *cpu_reg_ptr; /* Just return the contents
+                                                          of the cpu register */
+                               addr->offset = address;
+                               return (void __user *)address;
+                       }
+               case 1:
+                       /* 8 bit signed displacement */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_code_access_ok(1);
+                       FPU_get_user(address, (signed char __user *)(*fpu_eip));
+                       RE_ENTRANT_CHECK_ON;
+                       (*fpu_eip)++;
+                       break;
+               case 2:
+                       /* 32 bit displacement */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_code_access_ok(4);
+                       FPU_get_user(address, (long __user *)(*fpu_eip));
+                       (*fpu_eip) += 4;
+                       RE_ENTRANT_CHECK_ON;
+                       break;
+               case 3:
+                       /* Not legal for the FPU */
+                       EXCEPTION(EX_Invalid);
+               }
+               address += *cpu_reg_ptr;
+       }
+
+       addr->offset = address;
+
+       switch (addr_modes.default_mode) {
+       case 0:
+               break;
+       case VM86:
+               address += vm86_segment(addr_modes.override.segment, addr);
+               break;
+       case PM16:
+       case SEG32:
+               address = pm_address(FPU_modrm, addr_modes.override.segment,
+                                    addr, address);
+               break;
+       default:
+               EXCEPTION(EX_INTERNAL | 0x133);
+       }
+
+       return (void __user *)address;
+}
+
+void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
+                               struct address *addr, fpu_addr_modes addr_modes)
 {
-  u_char mod;
-  unsigned rm = FPU_modrm & 7;
-  long *cpu_reg_ptr;
-  int address = 0;     /* Initialized just to stop compiler warnings. */
-
-  /* Memory accessed via the cs selector is write protected
-     in `non-segmented' 32 bit protected mode. */
-  if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
-      && (addr_modes.override.segment == PREFIX_CS_) )
-    {
-      math_abort(FPU_info,SIGSEGV);
-    }
-
-  addr->selector = FPU_DS;   /* Default, for 32 bit non-segmented mode. */
-
-  mod = (FPU_modrm >> 6) & 3;
-
-  if (rm == 4 && mod != 3)
-    {
-      address = sib(mod, fpu_eip);
-    }
-  else
-    {
-      cpu_reg_ptr = & REG_(rm);
-      switch (mod)
-       {
+       u_char mod;
+       unsigned rm = FPU_modrm & 7;
+       int address = 0;        /* Default used for mod == 0 */
+
+       /* Memory accessed via the cs selector is write protected
+          in `non-segmented' 32 bit protected mode. */
+       if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
+           && (addr_modes.override.segment == PREFIX_CS_)) {
+               math_abort(FPU_info, SIGSEGV);
+       }
+
+       addr->selector = FPU_DS;        /* Default, for 32 bit non-segmented mode. */
+
+       mod = (FPU_modrm >> 6) & 3;
+
+       switch (mod) {
        case 0:
-         if (rm == 5)
-           {
-             /* Special case: disp32 */
-             RE_ENTRANT_CHECK_OFF;
-             FPU_code_access_ok(4);
-             FPU_get_user(address, (unsigned long __user *) (*fpu_eip));
-             (*fpu_eip) += 4;
-             RE_ENTRANT_CHECK_ON;
-             addr->offset = address;
-             return (void __user *) address;
-           }
-         else
-           {
-             address = *cpu_reg_ptr;  /* Just return the contents
-                                         of the cpu register */
-             addr->offset = address;
-             return (void __user *) address;
-           }
+               if (rm == 6) {
+                       /* Special case: disp16 */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_code_access_ok(2);
+                       FPU_get_user(address,
+                                    (unsigned short __user *)(*fpu_eip));
+                       (*fpu_eip) += 2;
+                       RE_ENTRANT_CHECK_ON;
+                       goto add_segment;
+               }
+               break;
        case 1:
-         /* 8 bit signed displacement */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_code_access_ok(1);
-         FPU_get_user(address, (signed char __user *) (*fpu_eip));
-         RE_ENTRANT_CHECK_ON;
-         (*fpu_eip)++;
-         break;
+               /* 8 bit signed displacement */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_code_access_ok(1);
+               FPU_get_user(address, (signed char __user *)(*fpu_eip));
+               RE_ENTRANT_CHECK_ON;
+               (*fpu_eip)++;
+               break;
        case 2:
-         /* 32 bit displacement */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_code_access_ok(4);
-         FPU_get_user(address, (long __user *) (*fpu_eip));
-         (*fpu_eip) += 4;
-         RE_ENTRANT_CHECK_ON;
-         break;
+               /* 16 bit displacement */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_code_access_ok(2);
+               FPU_get_user(address, (unsigned short __user *)(*fpu_eip));
+               (*fpu_eip) += 2;
+               RE_ENTRANT_CHECK_ON;
+               break;
        case 3:
-         /* Not legal for the FPU */
-         EXCEPTION(EX_Invalid);
+               /* Not legal for the FPU */
+               EXCEPTION(EX_Invalid);
+               break;
+       }
+       switch (rm) {
+       case 0:
+               address += FPU_info->___ebx + FPU_info->___esi;
+               break;
+       case 1:
+               address += FPU_info->___ebx + FPU_info->___edi;
+               break;
+       case 2:
+               address += FPU_info->___ebp + FPU_info->___esi;
+               if (addr_modes.override.segment == PREFIX_DEFAULT)
+                       addr_modes.override.segment = PREFIX_SS_;
+               break;
+       case 3:
+               address += FPU_info->___ebp + FPU_info->___edi;
+               if (addr_modes.override.segment == PREFIX_DEFAULT)
+                       addr_modes.override.segment = PREFIX_SS_;
+               break;
+       case 4:
+               address += FPU_info->___esi;
+               break;
+       case 5:
+               address += FPU_info->___edi;
+               break;
+       case 6:
+               address += FPU_info->___ebp;
+               if (addr_modes.override.segment == PREFIX_DEFAULT)
+                       addr_modes.override.segment = PREFIX_SS_;
+               break;
+       case 7:
+               address += FPU_info->___ebx;
+               break;
        }
-      address += *cpu_reg_ptr;
-    }
-
-  addr->offset = address;
-
-  switch ( addr_modes.default_mode )
-    {
-    case 0:
-      break;
-    case VM86:
-      address += vm86_segment(addr_modes.override.segment, addr);
-      break;
-    case PM16:
-    case SEG32:
-      address = pm_address(FPU_modrm, addr_modes.override.segment,
-                          addr, address);
-      break;
-    default:
-      EXCEPTION(EX_INTERNAL|0x133);
-    }
-
-  return (void __user *)address;
-}
 
+      add_segment:
+       address &= 0xffff;
 
-void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
-                    struct address *addr,
-                    fpu_addr_modes addr_modes)
-{
-  u_char mod;
-  unsigned rm = FPU_modrm & 7;
-  int address = 0;     /* Default used for mod == 0 */
-
-  /* Memory accessed via the cs selector is write protected
-     in `non-segmented' 32 bit protected mode. */
-  if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
-      && (addr_modes.override.segment == PREFIX_CS_) )
-    {
-      math_abort(FPU_info,SIGSEGV);
-    }
-
-  addr->selector = FPU_DS;   /* Default, for 32 bit non-segmented mode. */
-
-  mod = (FPU_modrm >> 6) & 3;
-
-  switch (mod)
-    {
-    case 0:
-      if (rm == 6)
-       {
-         /* Special case: disp16 */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_code_access_ok(2);
-         FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
-         (*fpu_eip) += 2;
-         RE_ENTRANT_CHECK_ON;
-         goto add_segment;
+       addr->offset = address;
+
+       switch (addr_modes.default_mode) {
+       case 0:
+               break;
+       case VM86:
+               address += vm86_segment(addr_modes.override.segment, addr);
+               break;
+       case PM16:
+       case SEG32:
+               address = pm_address(FPU_modrm, addr_modes.override.segment,
+                                    addr, address);
+               break;
+       default:
+               EXCEPTION(EX_INTERNAL | 0x131);
        }
-      break;
-    case 1:
-      /* 8 bit signed displacement */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_code_access_ok(1);
-      FPU_get_user(address, (signed char __user *) (*fpu_eip));
-      RE_ENTRANT_CHECK_ON;
-      (*fpu_eip)++;
-      break;
-    case 2:
-      /* 16 bit displacement */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_code_access_ok(2);
-      FPU_get_user(address, (unsigned short __user *) (*fpu_eip));
-      (*fpu_eip) += 2;
-      RE_ENTRANT_CHECK_ON;
-      break;
-    case 3:
-      /* Not legal for the FPU */
-      EXCEPTION(EX_Invalid);
-      break;
-    }
-  switch ( rm )
-    {
-    case 0:
-      address += FPU_info->___ebx + FPU_info->___esi;
-      break;
-    case 1:
-      address += FPU_info->___ebx + FPU_info->___edi;
-      break;
-    case 2:
-      address += FPU_info->___ebp + FPU_info->___esi;
-      if ( addr_modes.override.segment == PREFIX_DEFAULT )
-       addr_modes.override.segment = PREFIX_SS_;
-      break;
-    case 3:
-      address += FPU_info->___ebp + FPU_info->___edi;
-      if ( addr_modes.override.segment == PREFIX_DEFAULT )
-       addr_modes.override.segment = PREFIX_SS_;
-      break;
-    case 4:
-      address += FPU_info->___esi;
-      break;
-    case 5:
-      address += FPU_info->___edi;
-      break;
-    case 6:
-      address += FPU_info->___ebp;
-      if ( addr_modes.override.segment == PREFIX_DEFAULT )
-       addr_modes.override.segment = PREFIX_SS_;
-      break;
-    case 7:
-      address += FPU_info->___ebx;
-      break;
-    }
-
- add_segment:
-  address &= 0xffff;
-
-  addr->offset = address;
-
-  switch ( addr_modes.default_mode )
-    {
-    case 0:
-      break;
-    case VM86:
-      address += vm86_segment(addr_modes.override.segment, addr);
-      break;
-    case PM16:
-    case SEG32:
-      address = pm_address(FPU_modrm, addr_modes.override.segment,
-                          addr, address);
-      break;
-    default:
-      EXCEPTION(EX_INTERNAL|0x131);
-    }
-
-  return (void __user *)address ;
+
+       return (void __user *)address;
 }
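
The rm switch in FPU_get_address_16() implements the classic 16-bit ModR/M base/index combinations, with the final address wrapped to 16 bits. A standalone sketch of that effective-address computation for the mod=1/mod=2 forms (the mod=0, rm=6 disp16 special case is elided; the struct and helper names are hypothetical):

#include <stdint.h>
#include <stdio.h>

struct regs16 { uint16_t bx, bp, si, di; };

static uint16_t ea16(unsigned rm, int16_t disp, const struct regs16 *r)
{
        int32_t ea = disp;

        switch (rm & 7) {
        case 0: ea += r->bx + r->si; break;
        case 1: ea += r->bx + r->di; break;
        case 2: ea += r->bp + r->si; break;     /* SS is the default segment */
        case 3: ea += r->bp + r->di; break;     /* SS is the default segment */
        case 4: ea += r->si; break;
        case 5: ea += r->di; break;
        case 6: ea += r->bp; break;             /* SS is the default segment */
        case 7: ea += r->bx; break;
        }
        /* Offsets wrap at 64K, like the "address &= 0xffff" above. */
        return (uint16_t)ea;
}

int main(void)
{
        struct regs16 r = { .bx = 0x1000, .bp = 0x2000, .si = 0x0010, .di = 0x0020 };

        printf("%#x\n", ea16(0, 0x34, &r));     /* bx + si + disp -> 0x1044 */
        return 0;
}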
index eebd6fb1c8a8a1933b0ffd1c3563aeac0b31a731..2931ff355218041221033379c18aa7b6c4b9458c 100644
 #include "status_w.h"
 #include "control_w.h"
 
-
-#define _NONE_ 0   /* st0_ptr etc not needed */
-#define _REG0_ 1   /* Will be storing st(0) */
-#define _PUSH_ 3   /* Need to check for space to push onto stack */
-#define _null_ 4   /* Function illegal or not implemented */
+#define _NONE_ 0               /* st0_ptr etc not needed */
+#define _REG0_ 1               /* Will be storing st(0) */
+#define _PUSH_ 3               /* Need to check for space to push onto stack */
+#define _null_ 4               /* Function illegal or not implemented */
 
 #define pop_0()        { FPU_settag0(TAG_Empty); top++; }
 
-
 static u_char const type_table[32] = {
-  _PUSH_, _PUSH_, _PUSH_, _PUSH_,
-  _null_, _null_, _null_, _null_,
-  _REG0_, _REG0_, _REG0_, _REG0_,
-  _REG0_, _REG0_, _REG0_, _REG0_,
-  _NONE_, _null_, _NONE_, _PUSH_,
-  _NONE_, _PUSH_, _null_, _PUSH_,
-  _NONE_, _null_, _NONE_, _REG0_,
-  _NONE_, _REG0_, _NONE_, _REG0_
-  };
+       _PUSH_, _PUSH_, _PUSH_, _PUSH_,
+       _null_, _null_, _null_, _null_,
+       _REG0_, _REG0_, _REG0_, _REG0_,
+       _REG0_, _REG0_, _REG0_, _REG0_,
+       _NONE_, _null_, _NONE_, _PUSH_,
+       _NONE_, _PUSH_, _null_, _PUSH_,
+       _NONE_, _null_, _NONE_, _REG0_,
+       _NONE_, _REG0_, _NONE_, _REG0_
+};
 
 u_char const data_sizes_16[32] = {
-  4,  4,  8,  2,  0,  0,  0,  0,
-  4,  4,  8,  2,  4,  4,  8,  2,
-  14, 0, 94, 10,  2, 10,  0,  8,  
-  14, 0, 94, 10,  2, 10,  2,  8
+       4, 4, 8, 2, 0, 0, 0, 0,
+       4, 4, 8, 2, 4, 4, 8, 2,
+       14, 0, 94, 10, 2, 10, 0, 8,
+       14, 0, 94, 10, 2, 10, 2, 8
 };
 
 static u_char const data_sizes_32[32] = {
-  4,  4,  8,  2,  0,  0,  0,  0,
-  4,  4,  8,  2,  4,  4,  8,  2,
-  28, 0,108, 10,  2, 10,  0,  8,  
-  28, 0,108, 10,  2, 10,  2,  8
+       4, 4, 8, 2, 0, 0, 0, 0,
+       4, 4, 8, 2, 4, 4, 8, 2,
+       28, 0, 108, 10, 2, 10, 0, 8,
+       28, 0, 108, 10, 2, 10, 2, 8
 };
 
 int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
-                    void __user *data_address)
+                  void __user * data_address)
 {
-  FPU_REG loaded_data;
-  FPU_REG *st0_ptr;
-  u_char st0_tag = TAG_Empty;  /* This is just to stop a gcc warning. */
-  u_char loaded_tag;
+       FPU_REG loaded_data;
+       FPU_REG *st0_ptr;
+       u_char st0_tag = TAG_Empty;     /* This is just to stop a gcc warning. */
+       u_char loaded_tag;
 
-  st0_ptr = NULL;    /* Initialized just to stop compiler warnings. */
+       st0_ptr = NULL;         /* Initialized just to stop compiler warnings. */
 
-  if ( addr_modes.default_mode & PROTECTED )
-    {
-      if ( addr_modes.default_mode == SEG32 )
-       {
-         if ( access_limit < data_sizes_32[type] )
-           math_abort(FPU_info,SIGSEGV);
-       }
-      else if ( addr_modes.default_mode == PM16 )
-       {
-         if ( access_limit < data_sizes_16[type] )
-           math_abort(FPU_info,SIGSEGV);
-       }
+       if (addr_modes.default_mode & PROTECTED) {
+               if (addr_modes.default_mode == SEG32) {
+                       if (access_limit < data_sizes_32[type])
+                               math_abort(FPU_info, SIGSEGV);
+               } else if (addr_modes.default_mode == PM16) {
+                       if (access_limit < data_sizes_16[type])
+                               math_abort(FPU_info, SIGSEGV);
+               }
 #ifdef PARANOID
-      else
-       EXCEPTION(EX_INTERNAL|0x140);
+               else
+                       EXCEPTION(EX_INTERNAL | 0x140);
 #endif /* PARANOID */
-    }
+       }
 
-  switch ( type_table[type] )
-    {
-    case _NONE_:
-      break;
-    case _REG0_:
-      st0_ptr = &st(0);       /* Some of these instructions pop after
-                                storing */
-      st0_tag = FPU_gettag0();
-      break;
-    case _PUSH_:
-      {
-       if ( FPU_gettagi(-1) != TAG_Empty )
-         { FPU_stack_overflow(); return 0; }
-       top--;
-       st0_ptr = &st(0);
-      }
-      break;
-    case _null_:
-      FPU_illegal();
-      return 0;
+       switch (type_table[type]) {
+       case _NONE_:
+               break;
+       case _REG0_:
+               st0_ptr = &st(0);       /* Some of these instructions pop after
+                                          storing */
+               st0_tag = FPU_gettag0();
+               break;
+       case _PUSH_:
+               {
+                       if (FPU_gettagi(-1) != TAG_Empty) {
+                               FPU_stack_overflow();
+                               return 0;
+                       }
+                       top--;
+                       st0_ptr = &st(0);
+               }
+               break;
+       case _null_:
+               FPU_illegal();
+               return 0;
 #ifdef PARANOID
-    default:
-      EXCEPTION(EX_INTERNAL|0x141);
-      return 0;
+       default:
+               EXCEPTION(EX_INTERNAL | 0x141);
+               return 0;
 #endif /* PARANOID */
-    }
-
-  switch ( type )
-    {
-    case 000:       /* fld m32real */
-      clear_C1();
-      loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data);
-      if ( (loaded_tag == TAG_Special)
-          && isNaN(&loaded_data)
-          && (real_1op_NaN(&loaded_data) < 0) )
-       {
-         top++;
-         break;
-       }
-      FPU_copy_to_reg0(&loaded_data, loaded_tag);
-      break;
-    case 001:      /* fild m32int */
-      clear_C1();
-      loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data);
-      FPU_copy_to_reg0(&loaded_data, loaded_tag);
-      break;
-    case 002:      /* fld m64real */
-      clear_C1();
-      loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data);
-      if ( (loaded_tag == TAG_Special)
-          && isNaN(&loaded_data)
-          && (real_1op_NaN(&loaded_data) < 0) )
-       {
-         top++;
-         break;
        }
-      FPU_copy_to_reg0(&loaded_data, loaded_tag);
-      break;
-    case 003:      /* fild m16int */
-      clear_C1();
-      loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data);
-      FPU_copy_to_reg0(&loaded_data, loaded_tag);
-      break;
-    case 010:      /* fst m32real */
-      clear_C1();
-      FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address);
-      break;
-    case 011:      /* fist m32int */
-      clear_C1();
-      FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
-      break;
-    case 012:     /* fst m64real */
-      clear_C1();
-      FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address);
-      break;
-    case 013:     /* fist m16int */
-      clear_C1();
-      FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
-      break;
-    case 014:     /* fstp m32real */
-      clear_C1();
-      if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 015:     /* fistp m32int */
-      clear_C1();
-      if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 016:     /* fstp m64real */
-      clear_C1();
-      if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 017:     /* fistp m16int */
-      clear_C1();
-      if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 020:     /* fldenv  m14/28byte */
-      fldenv(addr_modes, (u_char __user *)data_address);
-      /* Ensure that the values just loaded are not changed by
-        fix-up operations. */
-      return 1;
-    case 022:     /* frstor m94/108byte */
-      frstor(addr_modes, (u_char __user *)data_address);
-      /* Ensure that the values just loaded are not changed by
-        fix-up operations. */
-      return 1;
-    case 023:     /* fbld m80dec */
-      clear_C1();
-      loaded_tag = FPU_load_bcd((u_char __user *)data_address);
-      FPU_settag0(loaded_tag);
-      break;
-    case 024:     /* fldcw */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_READ, data_address, 2);
-      FPU_get_user(control_word, (unsigned short __user *) data_address);
-      RE_ENTRANT_CHECK_ON;
-      if ( partial_status & ~control_word & CW_Exceptions )
-       partial_status |= (SW_Summary | SW_Backward);
-      else
-       partial_status &= ~(SW_Summary | SW_Backward);
+
+       switch (type) {
+       case 000:               /* fld m32real */
+               clear_C1();
+               loaded_tag =
+                   FPU_load_single((float __user *)data_address, &loaded_data);
+               if ((loaded_tag == TAG_Special)
+                   && isNaN(&loaded_data)
+                   && (real_1op_NaN(&loaded_data) < 0)) {
+                       top++;
+                       break;
+               }
+               FPU_copy_to_reg0(&loaded_data, loaded_tag);
+               break;
+       case 001:               /* fild m32int */
+               clear_C1();
+               loaded_tag =
+                   FPU_load_int32((long __user *)data_address, &loaded_data);
+               FPU_copy_to_reg0(&loaded_data, loaded_tag);
+               break;
+       case 002:               /* fld m64real */
+               clear_C1();
+               loaded_tag =
+                   FPU_load_double((double __user *)data_address,
+                                   &loaded_data);
+               if ((loaded_tag == TAG_Special)
+                   && isNaN(&loaded_data)
+                   && (real_1op_NaN(&loaded_data) < 0)) {
+                       top++;
+                       break;
+               }
+               FPU_copy_to_reg0(&loaded_data, loaded_tag);
+               break;
+       case 003:               /* fild m16int */
+               clear_C1();
+               loaded_tag =
+                   FPU_load_int16((short __user *)data_address, &loaded_data);
+               FPU_copy_to_reg0(&loaded_data, loaded_tag);
+               break;
+       case 010:               /* fst m32real */
+               clear_C1();
+               FPU_store_single(st0_ptr, st0_tag,
+                                (float __user *)data_address);
+               break;
+       case 011:               /* fist m32int */
+               clear_C1();
+               FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
+               break;
+       case 012:               /* fst m64real */
+               clear_C1();
+               FPU_store_double(st0_ptr, st0_tag,
+                                (double __user *)data_address);
+               break;
+       case 013:               /* fist m16int */
+               clear_C1();
+               FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
+               break;
+       case 014:               /* fstp m32real */
+               clear_C1();
+               if (FPU_store_single
+                   (st0_ptr, st0_tag, (float __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 015:               /* fistp m32int */
+               clear_C1();
+               if (FPU_store_int32
+                   (st0_ptr, st0_tag, (long __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 016:               /* fstp m64real */
+               clear_C1();
+               if (FPU_store_double
+                   (st0_ptr, st0_tag, (double __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 017:               /* fistp m16int */
+               clear_C1();
+               if (FPU_store_int16
+                   (st0_ptr, st0_tag, (short __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 020:               /* fldenv  m14/28byte */
+               fldenv(addr_modes, (u_char __user *) data_address);
+               /* Ensure that the values just loaded are not changed by
+                  fix-up operations. */
+               return 1;
+       case 022:               /* frstor m94/108byte */
+               frstor(addr_modes, (u_char __user *) data_address);
+               /* Ensure that the values just loaded are not changed by
+                  fix-up operations. */
+               return 1;
+       case 023:               /* fbld m80dec */
+               clear_C1();
+               loaded_tag = FPU_load_bcd((u_char __user *) data_address);
+               FPU_settag0(loaded_tag);
+               break;
+       case 024:               /* fldcw */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_READ, data_address, 2);
+               FPU_get_user(control_word,
+                            (unsigned short __user *)data_address);
+               RE_ENTRANT_CHECK_ON;
+               if (partial_status & ~control_word & CW_Exceptions)
+                       partial_status |= (SW_Summary | SW_Backward);
+               else
+                       partial_status &= ~(SW_Summary | SW_Backward);
 #ifdef PECULIAR_486
-      control_word |= 0x40;  /* An 80486 appears to always set this bit */
+               control_word |= 0x40;   /* An 80486 appears to always set this bit */
 #endif /* PECULIAR_486 */
-      return 1;
-    case 025:      /* fld m80real */
-      clear_C1();
-      loaded_tag = FPU_load_extended((long double __user *)data_address, 0);
-      FPU_settag0(loaded_tag);
-      break;
-    case 027:      /* fild m64int */
-      clear_C1();
-      loaded_tag = FPU_load_int64((long long __user *)data_address);
-      if (loaded_tag == TAG_Error)
+               return 1;
+       case 025:               /* fld m80real */
+               clear_C1();
+               loaded_tag =
+                   FPU_load_extended((long double __user *)data_address, 0);
+               FPU_settag0(loaded_tag);
+               break;
+       case 027:               /* fild m64int */
+               clear_C1();
+               loaded_tag = FPU_load_int64((long long __user *)data_address);
+               if (loaded_tag == TAG_Error)
+                       return 0;
+               FPU_settag0(loaded_tag);
+               break;
+       case 030:               /* fstenv  m14/28byte */
+               fstenv(addr_modes, (u_char __user *) data_address);
+               return 1;
+       case 032:               /* fsave */
+               fsave(addr_modes, (u_char __user *) data_address);
+               return 1;
+       case 033:               /* fbstp m80dec */
+               clear_C1();
+               if (FPU_store_bcd
+                   (st0_ptr, st0_tag, (u_char __user *) data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 034:               /* fstcw m16int */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, data_address, 2);
+               FPU_put_user(control_word,
+                            (unsigned short __user *)data_address);
+               RE_ENTRANT_CHECK_ON;
+               return 1;
+       case 035:               /* fstp m80real */
+               clear_C1();
+               if (FPU_store_extended
+                   (st0_ptr, st0_tag, (long double __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       case 036:               /* fstsw m2byte */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, data_address, 2);
+               FPU_put_user(status_word(),
+                            (unsigned short __user *)data_address);
+               RE_ENTRANT_CHECK_ON;
+               return 1;
+       case 037:               /* fistp m64int */
+               clear_C1();
+               if (FPU_store_int64
+                   (st0_ptr, st0_tag, (long long __user *)data_address))
+                       pop_0();        /* pop only if the number was actually stored
+                                          (see the 80486 manual p16-28) */
+               break;
+       }
        return 0;
-      FPU_settag0(loaded_tag);
-      break;
-    case 030:     /* fstenv  m14/28byte */
-      fstenv(addr_modes, (u_char __user *)data_address);
-      return 1;
-    case 032:      /* fsave */
-      fsave(addr_modes, (u_char __user *)data_address);
-      return 1;
-    case 033:      /* fbstp m80dec */
-      clear_C1();
-      if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 034:      /* fstcw m16int */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE,data_address,2);
-      FPU_put_user(control_word, (unsigned short __user *) data_address);
-      RE_ENTRANT_CHECK_ON;
-      return 1;
-    case 035:      /* fstp m80real */
-      clear_C1();
-      if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    case 036:      /* fstsw m2byte */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE,data_address,2);
-      FPU_put_user(status_word(),(unsigned short __user *) data_address);
-      RE_ENTRANT_CHECK_ON;
-      return 1;
-    case 037:      /* fistp m64int */
-      clear_C1();
-      if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) )
-       pop_0();  /* pop only if the number was actually stored
-                    (see the 80486 manual p16-28) */
-      break;
-    }
-  return 0;
 }
index 4db798114923be7f73d5fccfb0fa6fb5b870cae5..168eb44c93c8eff2a7c27b16778204971493bde1 100644
@@ -21,9 +21,9 @@
    allows. 9-byte would probably be sufficient.
    */
 typedef struct {
-  unsigned long lsw;
-  unsigned long midw;
-  unsigned long msw;
+       unsigned long lsw;
+       unsigned long midw;
+       unsigned long msw;
 } Xsig;
 
 asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
@@ -49,7 +49,6 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
 /* Macro to access the 8 ms bytes of an Xsig as a long long */
 #define XSIG_LL(x)         (*(unsigned long long *)&x.midw)
 
-
 /*
    Need to run gcc with optimizations on to get these to
    actually be in-line.
@@ -63,59 +62,53 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
 static inline unsigned long mul_32_32(const unsigned long arg1,
                                      const unsigned long arg2)
 {
-  int retval;
-  asm volatile ("mull %2; movl %%edx,%%eax" \
-               :"=a" (retval) \
-               :"0" (arg1), "g" (arg2) \
-               :"dx");
-  return retval;
+       int retval;
+       asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval)
+                     :"0"(arg1), "g"(arg2)
+                     :"dx");
+       return retval;
 }
 
-
 /* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
 static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
 {
-  asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
-                "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
-                "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
-                "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n"
-                 :"=g" (*dest):"g" (dest), "g" (x2)
-                 :"ax","si","di");
+       asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
+                     "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
+                     "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
+                     "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g"
+                     (*dest):"g"(dest), "g"(x2)
+                     :"ax", "si", "di");
 }
 
-
 /* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
 /* Note: the constraints in the asm statement didn't always work properly
    with gcc 2.5.8.  Changing from using edi to using ecx got around the
    problem, but keep fingers crossed! */
 static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
 {
-  asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
-                "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
-                "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
-                "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
-                "jnc 0f;\n"
-               "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
-                "movl %4,%%ecx; incl (%%ecx)\n"
-                "movl $1,%%eax; jmp 1f;\n"
-                "0: xorl %%eax,%%eax;\n"
-                "1:\n"
-               :"=g" (*exp), "=g" (*dest)
-               :"g" (dest), "g" (x2), "g" (exp)
-               :"cx","si","ax");
+       asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
+                     "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
+                     "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
+                     "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
+                     "jnc 0f;\n"
+                     "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
+                     "movl %4,%%ecx; incl (%%ecx)\n"
+                     "movl $1,%%eax; jmp 1f;\n"
+                     "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest)
+                     :"g"(dest), "g"(x2), "g"(exp)
+                     :"cx", "si", "ax");
 }
 
-
 /* Negate (subtract from 1.0) the 12 byte Xsig */
 /* This is faster in a loop on my 386 than using the "neg" instruction. */
 static inline void negate_Xsig(Xsig *x)
 {
-  asm volatile("movl %1,%%esi;\n"
-               "xorl %%ecx,%%ecx;\n"
-               "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
-               "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
-               "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n"
-               :"=g" (*x):"g" (x):"si","ax","cx");
+       asm volatile ("movl %1,%%esi;\n"
+                     "xorl %%ecx,%%ecx;\n"
+                     "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
+                     "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
+                     "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g"
+                     (*x):"g"(x):"si", "ax", "cx");
 }
 
 #endif /* _POLY_H */
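
The inline-asm helpers in poly.h treat the 12-byte Xsig as three 32-bit limbs, least significant first. As a readability aid, a portable C sketch of what add_Xsig_Xsig() computes (the plain struct and function names are stand-ins, not the emulator's types):

#include <stdint.h>
#include <stdio.h>

struct xsig { uint32_t lsw, midw, msw; };       /* least to most significant */

/* 96-bit add with manual carry propagation; carry out of the top limb is
   dropped, matching the "no checks for overflow" comment above. */
static void xsig_add(struct xsig *dest, const struct xsig *x2)
{
        uint64_t t;

        t = (uint64_t)dest->lsw + x2->lsw;
        dest->lsw = (uint32_t)t;
        t = (uint64_t)dest->midw + x2->midw + (t >> 32);
        dest->midw = (uint32_t)t;
        t = (uint64_t)dest->msw + x2->msw + (t >> 32);
        dest->msw = (uint32_t)t;
}

int main(void)
{
        struct xsig a = { 0xffffffff, 0xffffffff, 0x00000001 };
        struct xsig b = { 0x00000001, 0x00000000, 0x00000000 };

        xsig_add(&a, &b);
        printf("%08x %08x %08x\n", a.msw, a.midw, a.lsw);  /* 00000002 00000000 00000000 */
        return 0;
}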
index 9766ad5e97438a3d2d5e2a1862142e09a66fed07..b00e9e10cdce244ba073e84f54690ec95bb5e698 100644
 #include "control_w.h"
 #include "poly.h"
 
-
 #define        HIPOWER 11
-static const unsigned long long lterms[HIPOWER] =
-{
-  0x0000000000000000LL,  /* This term done separately as 12 bytes */
-  0xf5fdeffc162c7543LL,
-  0x1c6b08d704a0bfa6LL,
-  0x0276556df749cc21LL,
-  0x002bb0ffcf14f6b8LL,
-  0x0002861225ef751cLL,
-  0x00001ffcbfcd5422LL,
-  0x00000162c005d5f1LL,
-  0x0000000da96ccb1bLL,
-  0x0000000078d1b897LL,
-  0x000000000422b029LL
+static const unsigned long long lterms[HIPOWER] = {
+       0x0000000000000000LL,   /* This term done separately as 12 bytes */
+       0xf5fdeffc162c7543LL,
+       0x1c6b08d704a0bfa6LL,
+       0x0276556df749cc21LL,
+       0x002bb0ffcf14f6b8LL,
+       0x0002861225ef751cLL,
+       0x00001ffcbfcd5422LL,
+       0x00000162c005d5f1LL,
+       0x0000000da96ccb1bLL,
+       0x0000000078d1b897LL,
+       0x000000000422b029LL
 };
 
 static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
@@ -45,112 +43,103 @@ static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
 static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);
 
 static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
-                                    &shiftterm2, &shiftterm3 };
-
+       &shiftterm2, &shiftterm3
+};
 
 /*--- poly_2xm1() -----------------------------------------------------------+
  | Requires st(0) which is TAG_Valid and < 1.                                |
  +---------------------------------------------------------------------------*/
-int    poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
+int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
 {
-  long int              exponent, shift;
-  unsigned long long    Xll;
-  Xsig                  accumulator, Denom, argSignif;
-  u_char                tag;
+       long int exponent, shift;
+       unsigned long long Xll;
+       Xsig accumulator, Denom, argSignif;
+       u_char tag;
 
-  exponent = exponent16(arg);
+       exponent = exponent16(arg);
 
 #ifdef PARANOID
-  if ( exponent >= 0 )         /* Don't want a |number| >= 1.0 */
-    {
-      /* Number negative, too large, or not Valid. */
-      EXCEPTION(EX_INTERNAL|0x127);
-      return 1;
-    }
+       if (exponent >= 0) {    /* Don't want a |number| >= 1.0 */
+               /* Number negative, too large, or not Valid. */
+               EXCEPTION(EX_INTERNAL | 0x127);
+               return 1;
+       }
 #endif /* PARANOID */
 
-  argSignif.lsw = 0;
-  XSIG_LL(argSignif) = Xll = significand(arg);
-
-  if ( exponent == -1 )
-    {
-      shift = (argSignif.msw & 0x40000000) ? 3 : 2;
-      /* subtract 0.5 or 0.75 */
-      exponent -= 2;
-      XSIG_LL(argSignif) <<= 2;
-      Xll <<= 2;
-    }
-  else if ( exponent == -2 )
-    {
-      shift = 1;
-      /* subtract 0.25 */
-      exponent--;
-      XSIG_LL(argSignif) <<= 1;
-      Xll <<= 1;
-    }
-  else
-    shift = 0;
-
-  if ( exponent < -2 )
-    {
-      /* Shift the argument right by the required places. */
-      if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U )
-       Xll++;  /* round up */
-    }
-
-  accumulator.lsw = accumulator.midw = accumulator.msw = 0;
-  polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1);
-  mul_Xsig_Xsig(&accumulator, &argSignif);
-  shr_Xsig(&accumulator, 3);
-
-  mul_Xsig_Xsig(&argSignif, &hiterm);   /* The leading term */
-  add_two_Xsig(&accumulator, &argSignif, &exponent);
-
-  if ( shift )
-    {
-      /* The argument is large, use the identity:
-        f(x+a) = f(a) * (f(x) + 1) - 1;
-        */
-      shr_Xsig(&accumulator, - exponent);
-      accumulator.msw |= 0x80000000;      /* add 1.0 */
-      mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
-      accumulator.msw &= 0x3fffffff;      /* subtract 1.0 */
-      exponent = 1;
-    }
-
-  if ( sign != SIGN_POS )
-    {
-      /* The argument is negative, use the identity:
-            f(-x) = -f(x) / (1 + f(x))
-        */
-      Denom.lsw = accumulator.lsw;
-      XSIG_LL(Denom) = XSIG_LL(accumulator);
-      if ( exponent < 0 )
-       shr_Xsig(&Denom, - exponent);
-      else if ( exponent > 0 )
-       {
-         /* exponent must be 1 here */
-         XSIG_LL(Denom) <<= 1;
-         if ( Denom.lsw & 0x80000000 )
-           XSIG_LL(Denom) |= 1;
-         (Denom.lsw) <<= 1;
+       argSignif.lsw = 0;
+       XSIG_LL(argSignif) = Xll = significand(arg);
+
+       if (exponent == -1) {
+               shift = (argSignif.msw & 0x40000000) ? 3 : 2;
+               /* subtract 0.5 or 0.75 */
+               exponent -= 2;
+               XSIG_LL(argSignif) <<= 2;
+               Xll <<= 2;
+       } else if (exponent == -2) {
+               shift = 1;
+               /* subtract 0.25 */
+               exponent--;
+               XSIG_LL(argSignif) <<= 1;
+               Xll <<= 1;
+       } else
+               shift = 0;
+
+       if (exponent < -2) {
+               /* Shift the argument right by the required places. */
+               if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U)
+                       Xll++;  /* round up */
+       }
+
+       accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+       polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1);
+       mul_Xsig_Xsig(&accumulator, &argSignif);
+       shr_Xsig(&accumulator, 3);
+
+       mul_Xsig_Xsig(&argSignif, &hiterm);     /* The leading term */
+       add_two_Xsig(&accumulator, &argSignif, &exponent);
+
+       if (shift) {
+               /* The argument is large, use the identity:
+                  f(x+a) = f(a) * (f(x) + 1) - 1;
+                */
+               shr_Xsig(&accumulator, -exponent);
+               accumulator.msw |= 0x80000000;  /* add 1.0 */
+               mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
+               accumulator.msw &= 0x3fffffff;  /* subtract 1.0 */
+               exponent = 1;
+       }
+
+       if (sign != SIGN_POS) {
+               /* The argument is negative, use the identity:
+                  f(-x) = -f(x) / (1 + f(x))
+                */
+               Denom.lsw = accumulator.lsw;
+               XSIG_LL(Denom) = XSIG_LL(accumulator);
+               if (exponent < 0)
+                       shr_Xsig(&Denom, -exponent);
+               else if (exponent > 0) {
+                       /* exponent must be 1 here */
+                       XSIG_LL(Denom) <<= 1;
+                       if (Denom.lsw & 0x80000000)
+                               XSIG_LL(Denom) |= 1;
+                       (Denom.lsw) <<= 1;
+               }
+               Denom.msw |= 0x80000000;        /* add 1.0 */
+               div_Xsig(&accumulator, &Denom, &accumulator);
        }
-      Denom.msw |= 0x80000000;      /* add 1.0 */
-      div_Xsig(&accumulator, &Denom, &accumulator);
-    }
 
-  /* Convert to 64 bit signed-compatible */
-  exponent += round_Xsig(&accumulator);
+       /* Convert to 64 bit signed-compatible */
+       exponent += round_Xsig(&accumulator);
 
-  result = &st(0);
-  significand(result) = XSIG_LL(accumulator);
-  setexponent16(result, exponent);
+       result = &st(0);
+       significand(result) = XSIG_LL(accumulator);
+       setexponent16(result, exponent);
 
-  tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
+       tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
 
-  setsign(result, sign);
-  FPU_settag0(tag);
+       setsign(result, sign);
+       FPU_settag0(tag);
 
-  return 0;
+       return 0;
 
 }
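
On the range reduction in poly_2xm1() above: with f(t) = 2^t - 1, the identity f(a + r) = 2^a * (f(r) + 1) - 1 holds, and the shiftterm[] constants appear to supply the 2^a factors for a in {0, 1/4, 1/2, 3/4} (a reading of the code, not a claim from the original comments). A throwaway double-precision check of the identity:

#include <math.h>
#include <stdio.h>

int main(void)
{
        double x = 0.8;                 /* an f2xm1-style operand, |x| < 1 */
        double a = 0.75, r = x - a;     /* reduce to a small remainder */
        double direct  = pow(2.0, x) - 1.0;
        double reduced = pow(2.0, a) * ((pow(2.0, r) - 1.0) + 1.0) - 1.0;

        printf("%.17g\n%.17g\n", direct, reduced);      /* agree to rounding */
        return 0;
}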
index 82f702952f690982e841a655de465595e5ed429d..20c28e58e2d446c84adb3a04dbad6275270dc641 100644
 #include "control_w.h"
 #include "poly.h"
 
-
 #define        HIPOWERon       6       /* odd poly, negative terms */
-static const unsigned long long oddnegterms[HIPOWERon] =
-{
-  0x0000000000000000LL, /* Dummy (not for - 1.0) */
-  0x015328437f756467LL,
-  0x0005dda27b73dec6LL,
-  0x0000226bf2bfb91aLL,
-  0x000000ccc439c5f7LL,
-  0x0000000355438407LL
-} ;
+static const unsigned long long oddnegterms[HIPOWERon] = {
+       0x0000000000000000LL,   /* Dummy (not for - 1.0) */
+       0x015328437f756467LL,
+       0x0005dda27b73dec6LL,
+       0x0000226bf2bfb91aLL,
+       0x000000ccc439c5f7LL,
+       0x0000000355438407LL
+};
 
 #define        HIPOWERop       6       /* odd poly, positive terms */
-static const unsigned long long oddplterms[HIPOWERop] =
-{
+static const unsigned long long oddplterms[HIPOWERop] = {
 /*  0xaaaaaaaaaaaaaaabLL,  transferred to fixedpterm[] */
-  0x0db55a71875c9ac2LL,
-  0x0029fce2d67880b0LL,
-  0x0000dfd3908b4596LL,
-  0x00000550fd61dab4LL,
-  0x0000001c9422b3f9LL,
-  0x000000003e3301e1LL
+       0x0db55a71875c9ac2LL,
+       0x0029fce2d67880b0LL,
+       0x0000dfd3908b4596LL,
+       0x00000550fd61dab4LL,
+       0x0000001c9422b3f9LL,
+       0x000000003e3301e1LL
 };
 
 static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
@@ -48,182 +45,164 @@ static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
 
 static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
 
-
 /*--- poly_atan() -----------------------------------------------------------+
  |                                                                           |
  +---------------------------------------------------------------------------*/
-void   poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
-                 FPU_REG *st1_ptr, u_char st1_tag)
+void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
+              FPU_REG *st1_ptr, u_char st1_tag)
 {
-  u_char       transformed, inverted,
-                sign1, sign2;
-  int           exponent;
-  long int     dummy_exp;
-  Xsig          accumulator, Numer, Denom, accumulatore, argSignif,
-                argSq, argSqSq;
-  u_char        tag;
-  
-  sign1 = getsign(st0_ptr);
-  sign2 = getsign(st1_ptr);
-  if ( st0_tag == TAG_Valid )
-    {
-      exponent = exponent(st0_ptr);
-    }
-  else
-    {
-      /* This gives non-compatible stack contents... */
-      FPU_to_exp16(st0_ptr, st0_ptr);
-      exponent = exponent16(st0_ptr);
-    }
-  if ( st1_tag == TAG_Valid )
-    {
-      exponent -= exponent(st1_ptr);
-    }
-  else
-    {
-      /* This gives non-compatible stack contents... */
-      FPU_to_exp16(st1_ptr, st1_ptr);
-      exponent -= exponent16(st1_ptr);
-    }
-
-  if ( (exponent < 0) || ((exponent == 0) &&
-                         ((st0_ptr->sigh < st1_ptr->sigh) ||
-                          ((st0_ptr->sigh == st1_ptr->sigh) &&
-                           (st0_ptr->sigl < st1_ptr->sigl))) ) )
-    {
-      inverted = 1;
-      Numer.lsw = Denom.lsw = 0;
-      XSIG_LL(Numer) = significand(st0_ptr);
-      XSIG_LL(Denom) = significand(st1_ptr);
-    }
-  else
-    {
-      inverted = 0;
-      exponent = -exponent;
-      Numer.lsw = Denom.lsw = 0;
-      XSIG_LL(Numer) = significand(st1_ptr);
-      XSIG_LL(Denom) = significand(st0_ptr);
-     }
-  div_Xsig(&Numer, &Denom, &argSignif);
-  exponent += norm_Xsig(&argSignif);
-
-  if ( (exponent >= -1)
-      || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) )
-    {
-      /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
-      /* Convert the argument by an identity for atan */
-      transformed = 1;
-
-      if ( exponent >= 0 )
-       {
+       u_char transformed, inverted, sign1, sign2;
+       int exponent;
+       long int dummy_exp;
+       Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq;
+       u_char tag;
+
+       sign1 = getsign(st0_ptr);
+       sign2 = getsign(st1_ptr);
+       if (st0_tag == TAG_Valid) {
+               exponent = exponent(st0_ptr);
+       } else {
+               /* This gives non-compatible stack contents... */
+               FPU_to_exp16(st0_ptr, st0_ptr);
+               exponent = exponent16(st0_ptr);
+       }
+       if (st1_tag == TAG_Valid) {
+               exponent -= exponent(st1_ptr);
+       } else {
+               /* This gives non-compatible stack contents... */
+               FPU_to_exp16(st1_ptr, st1_ptr);
+               exponent -= exponent16(st1_ptr);
+       }
+
+       if ((exponent < 0) || ((exponent == 0) &&
+                              ((st0_ptr->sigh < st1_ptr->sigh) ||
+                               ((st0_ptr->sigh == st1_ptr->sigh) &&
+                                (st0_ptr->sigl < st1_ptr->sigl))))) {
+               inverted = 1;
+               Numer.lsw = Denom.lsw = 0;
+               XSIG_LL(Numer) = significand(st0_ptr);
+               XSIG_LL(Denom) = significand(st1_ptr);
+       } else {
+               inverted = 0;
+               exponent = -exponent;
+               Numer.lsw = Denom.lsw = 0;
+               XSIG_LL(Numer) = significand(st1_ptr);
+               XSIG_LL(Denom) = significand(st0_ptr);
+       }
+       div_Xsig(&Numer, &Denom, &argSignif);
+       exponent += norm_Xsig(&argSignif);
+
+       if ((exponent >= -1)
+           || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) {
+               /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
+               /* Convert the argument by an identity for atan */
+               transformed = 1;
+
+               if (exponent >= 0) {
 #ifdef PARANOID
-         if ( !( (exponent == 0) && 
-                (argSignif.lsw == 0) && (argSignif.midw == 0) &&
-                (argSignif.msw == 0x80000000) ) )
-           {
-             EXCEPTION(EX_INTERNAL|0x104);  /* There must be a logic error */
-             return;
-           }
+                       if (!((exponent == 0) &&
+                             (argSignif.lsw == 0) && (argSignif.midw == 0) &&
+                             (argSignif.msw == 0x80000000))) {
+                               EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */
+                               return;
+                       }
 #endif /* PARANOID */
-         argSignif.msw = 0;   /* Make the transformed arg -> 0.0 */
+                       argSignif.msw = 0;      /* Make the transformed arg -> 0.0 */
+               } else {
+                       Numer.lsw = Denom.lsw = argSignif.lsw;
+                       XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
+
+                       if (exponent < -1)
+                               shr_Xsig(&Numer, -1 - exponent);
+                       negate_Xsig(&Numer);
+
+                       shr_Xsig(&Denom, -exponent);
+                       Denom.msw |= 0x80000000;
+
+                       div_Xsig(&Numer, &Denom, &argSignif);
+
+                       exponent = -1 + norm_Xsig(&argSignif);
+               }
+       } else {
+               transformed = 0;
+       }
+
+       argSq.lsw = argSignif.lsw;
+       argSq.midw = argSignif.midw;
+       argSq.msw = argSignif.msw;
+       mul_Xsig_Xsig(&argSq, &argSq);
+
+       argSqSq.lsw = argSq.lsw;
+       argSqSq.midw = argSq.midw;
+       argSqSq.msw = argSq.msw;
+       mul_Xsig_Xsig(&argSqSq, &argSqSq);
+
+       accumulatore.lsw = argSq.lsw;
+       XSIG_LL(accumulatore) = XSIG_LL(argSq);
+
+       shr_Xsig(&argSq, 2 * (-1 - exponent - 1));
+       shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1));
+
+       /* Now have argSq etc with binary point at the left
+          .1xxxxxxxx */
+
+       /* Do the basic fixed point polynomial evaluation */
+       accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+       polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
+                       oddplterms, HIPOWERop - 1);
+       mul64_Xsig(&accumulator, &XSIG_LL(argSq));
+       negate_Xsig(&accumulator);
+       polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms,
+                       HIPOWERon - 1);
+       negate_Xsig(&accumulator);
+       add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
+
+       mul64_Xsig(&accumulatore, &denomterm);
+       shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent));
+       accumulatore.msw |= 0x80000000;
+
+       div_Xsig(&accumulator, &accumulatore, &accumulator);
+
+       mul_Xsig_Xsig(&accumulator, &argSignif);
+       mul_Xsig_Xsig(&accumulator, &argSq);
+
+       shr_Xsig(&accumulator, 3);
+       negate_Xsig(&accumulator);
+       add_Xsig_Xsig(&accumulator, &argSignif);
+
+       if (transformed) {
+               /* compute pi/4 - accumulator */
+               shr_Xsig(&accumulator, -1 - exponent);
+               negate_Xsig(&accumulator);
+               add_Xsig_Xsig(&accumulator, &pi_signif);
+               exponent = -1;
+       }
+
+       if (inverted) {
+               /* compute pi/2 - accumulator */
+               shr_Xsig(&accumulator, -exponent);
+               negate_Xsig(&accumulator);
+               add_Xsig_Xsig(&accumulator, &pi_signif);
+               exponent = 0;
        }
-      else
-       {
-         Numer.lsw = Denom.lsw = argSignif.lsw;
-         XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
-
-         if ( exponent < -1 )
-           shr_Xsig(&Numer, -1-exponent);
-         negate_Xsig(&Numer);
-      
-         shr_Xsig(&Denom, -exponent);
-         Denom.msw |= 0x80000000;
-      
-         div_Xsig(&Numer, &Denom, &argSignif);
-
-         exponent = -1 + norm_Xsig(&argSignif);
+
+       if (sign1) {
+               /* compute pi - accumulator */
+               shr_Xsig(&accumulator, 1 - exponent);
+               negate_Xsig(&accumulator);
+               add_Xsig_Xsig(&accumulator, &pi_signif);
+               exponent = 1;
        }
-    }
-  else
-    {
-      transformed = 0;
-    }
-
-  argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw;
-  argSq.msw = argSignif.msw;
-  mul_Xsig_Xsig(&argSq, &argSq);
-  
-  argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw;
-  mul_Xsig_Xsig(&argSqSq, &argSqSq);
-
-  accumulatore.lsw = argSq.lsw;
-  XSIG_LL(accumulatore) = XSIG_LL(argSq);
-
-  shr_Xsig(&argSq, 2*(-1-exponent-1));
-  shr_Xsig(&argSqSq, 4*(-1-exponent-1));
-
-  /* Now have argSq etc with binary point at the left
-     .1xxxxxxxx */
-
-  /* Do the basic fixed point polynomial evaluation */
-  accumulator.msw = accumulator.midw = accumulator.lsw = 0;
-  polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
-                  oddplterms, HIPOWERop-1);
-  mul64_Xsig(&accumulator, &XSIG_LL(argSq));
-  negate_Xsig(&accumulator);
-  polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1);
-  negate_Xsig(&accumulator);
-  add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
-
-  mul64_Xsig(&accumulatore, &denomterm);
-  shr_Xsig(&accumulatore, 1 + 2*(-1-exponent));
-  accumulatore.msw |= 0x80000000;
-
-  div_Xsig(&accumulator, &accumulatore, &accumulator);
-
-  mul_Xsig_Xsig(&accumulator, &argSignif);
-  mul_Xsig_Xsig(&accumulator, &argSq);
-
-  shr_Xsig(&accumulator, 3);
-  negate_Xsig(&accumulator);
-  add_Xsig_Xsig(&accumulator, &argSignif);
-
-  if ( transformed )
-    {
-      /* compute pi/4 - accumulator */
-      shr_Xsig(&accumulator, -1-exponent);
-      negate_Xsig(&accumulator);
-      add_Xsig_Xsig(&accumulator, &pi_signif);
-      exponent = -1;
-    }
-
-  if ( inverted )
-    {
-      /* compute pi/2 - accumulator */
-      shr_Xsig(&accumulator, -exponent);
-      negate_Xsig(&accumulator);
-      add_Xsig_Xsig(&accumulator, &pi_signif);
-      exponent = 0;
-    }
-
-  if ( sign1 )
-    {
-      /* compute pi - accumulator */
-      shr_Xsig(&accumulator, 1 - exponent);
-      negate_Xsig(&accumulator);
-      add_Xsig_Xsig(&accumulator, &pi_signif);
-      exponent = 1;
-    }
-
-  exponent += round_Xsig(&accumulator);
-
-  significand(st1_ptr) = XSIG_LL(accumulator);
-  setexponent16(st1_ptr, exponent);
-
-  tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
-  FPU_settagi(1, tag);
-
-  set_precision_flag_up();  /* We do not really know if up or down,
-                              use this as the default. */
+
+       exponent += round_Xsig(&accumulator);
+
+       significand(st1_ptr) = XSIG_LL(accumulator);
+       setexponent16(st1_ptr, exponent);
+
+       tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
+       FPU_settagi(1, tag);
+
+       set_precision_flag_up();        /* We do not really know if up or down,
+                                          use this as the default. */
 
 }
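
The hunk above only reindents poly_atan(); the algorithm is unchanged: reduce the ratio of ST(1) to ST(0) below sqrt(2)-1, evaluate an odd fixed-point polynomial, then undo the reductions with pi/4, pi/2 and pi as the transformed, inverted and sign1 branches show. A hedged double-precision sketch of that same chain of identities (poly_eval_atan() and emu_style_atan2() are illustrative names, not emulator functions, and plain doubles stand in for the 96-bit Xsig arithmetic):

#include <math.h>
#include <stdio.h>

static const double pi = 3.14159265358979323846;

/* Stand-in for the oddplterms/oddnegterms polynomial: a short odd
   series is enough once the argument is reduced below sqrt(2)-1. */
static double poly_eval_atan(double x)
{
        double x2 = x * x;

        return x * (1.0 - x2 / 3.0 + x2 * x2 / 5.0
                    - x2 * x2 * x2 / 7.0 + x2 * x2 * x2 * x2 / 9.0);
}

static double emu_style_atan2(double st1, double st0)
{
        double num = fabs(st1), den = fabs(st0), arg, r;
        int inverted = 0, transformed = 0;

        if (num > den) {                /* keep the ratio <= 1.0 */
                double t = num;
                num = den;
                den = t;
                inverted = 1;
        }
        arg = num / den;

        if (arg > 0.41421356237309503) {        /* > sqrt(2)-1 */
                /* atan(x) = pi/4 - atan((1-x)/(1+x)) */
                arg = (1.0 - arg) / (1.0 + arg);
                transformed = 1;
        }

        r = poly_eval_atan(arg);
        if (transformed)
                r = pi / 4 - r;
        if (inverted)
                r = pi / 2 - r;         /* atan(a/b) = pi/2 - atan(b/a) */
        if (st0 < 0)
                r = pi - r;             /* ST(0) negative: second/third quadrant */

        return st1 < 0 ? -r : r;        /* sign of ST(1) picks the half-plane */
}

int main(void)
{
        printf("%f %f\n", emu_style_atan2(1.0, -1.0), atan2(1.0, -1.0));
        return 0;
}

Compiled with -lm, emu_style_atan2(1.0, -1.0) prints the same 2.356194 (3*pi/4) as libm's atan2, showing that only the indentation of the kernel routine changed, not the quadrant handling.
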
index dd00e1d5b0743929508ad45926285eac43548ee8..8e2ff4b28a0abbe455c965282a7b3096265c10cc 100644 (file)
@@ -10,7 +10,6 @@
  |                                                                           |
  +---------------------------------------------------------------------------*/
 
-
 #include "exception.h"
 #include "reg_constant.h"
 #include "fpu_emu.h"
 #include "control_w.h"
 #include "poly.h"
 
-
 static void log2_kernel(FPU_REG const *arg, u_char argsign,
-                       Xsig *accum_result, long int *expon);
-
+                       Xsig *accum_result, long int *expon);
 
 /*--- poly_l2() -------------------------------------------------------------+
  |   Base 2 logarithm by a polynomial approximation.                         |
  +---------------------------------------------------------------------------*/
-void   poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
+void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
 {
-  long int            exponent, expon, expon_expon;
-  Xsig                 accumulator, expon_accum, yaccum;
-  u_char                      sign, argsign;
-  FPU_REG              x;
-  int                  tag;
-
-  exponent = exponent16(st0_ptr);
-
-  /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
-  if ( st0_ptr->sigh > (unsigned)0xb504f334 )
-    {
-      /* Treat as  sqrt(2)/2 < st0_ptr < 1 */
-      significand(&x) = - significand(st0_ptr);
-      setexponent16(&x, -1);
-      exponent++;
-      argsign = SIGN_NEG;
-    }
-  else
-    {
-      /* Treat as  1 <= st0_ptr < sqrt(2) */
-      x.sigh = st0_ptr->sigh - 0x80000000;
-      x.sigl = st0_ptr->sigl;
-      setexponent16(&x, 0);
-      argsign = SIGN_POS;
-    }
-  tag = FPU_normalize_nuo(&x);
-
-  if ( tag == TAG_Zero )
-    {
-      expon = 0;
-      accumulator.msw = accumulator.midw = accumulator.lsw = 0;
-    }
-  else
-    {
-      log2_kernel(&x, argsign, &accumulator, &expon);
-    }
-
-  if ( exponent < 0 )
-    {
-      sign = SIGN_NEG;
-      exponent = -exponent;
-    }
-  else
-    sign = SIGN_POS;
-  expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0;
-  if ( exponent )
-    {
-      expon_expon = 31 + norm_Xsig(&expon_accum);
-      shr_Xsig(&accumulator, expon_expon - expon);
-
-      if ( sign ^ argsign )
-       negate_Xsig(&accumulator);
-      add_Xsig_Xsig(&accumulator, &expon_accum);
-    }
-  else
-    {
-      expon_expon = expon;
-      sign = argsign;
-    }
-
-  yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr);
-  mul_Xsig_Xsig(&accumulator, &yaccum);
-
-  expon_expon += round_Xsig(&accumulator);
-
-  if ( accumulator.msw == 0 )
-    {
-      FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
-      return;
-    }
-
-  significand(st1_ptr) = XSIG_LL(accumulator);
-  setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
-
-  tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
-  FPU_settagi(1, tag);
-
-  set_precision_flag_up();  /* 80486 appears to always do this */
-
-  return;
+       long int exponent, expon, expon_expon;
+       Xsig accumulator, expon_accum, yaccum;
+       u_char sign, argsign;
+       FPU_REG x;
+       int tag;
+
+       exponent = exponent16(st0_ptr);
+
+       /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
+       if (st0_ptr->sigh > (unsigned)0xb504f334) {
+               /* Treat as  sqrt(2)/2 < st0_ptr < 1 */
+               significand(&x) = -significand(st0_ptr);
+               setexponent16(&x, -1);
+               exponent++;
+               argsign = SIGN_NEG;
+       } else {
+               /* Treat as  1 <= st0_ptr < sqrt(2) */
+               x.sigh = st0_ptr->sigh - 0x80000000;
+               x.sigl = st0_ptr->sigl;
+               setexponent16(&x, 0);
+               argsign = SIGN_POS;
+       }
+       tag = FPU_normalize_nuo(&x);
 
-}
+       if (tag == TAG_Zero) {
+               expon = 0;
+               accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+       } else {
+               log2_kernel(&x, argsign, &accumulator, &expon);
+       }
+
+       if (exponent < 0) {
+               sign = SIGN_NEG;
+               exponent = -exponent;
+       } else
+               sign = SIGN_POS;
+       expon_accum.msw = exponent;
+       expon_accum.midw = expon_accum.lsw = 0;
+       if (exponent) {
+               expon_expon = 31 + norm_Xsig(&expon_accum);
+               shr_Xsig(&accumulator, expon_expon - expon);
+
+               if (sign ^ argsign)
+                       negate_Xsig(&accumulator);
+               add_Xsig_Xsig(&accumulator, &expon_accum);
+       } else {
+               expon_expon = expon;
+               sign = argsign;
+       }
+
+       yaccum.lsw = 0;
+       XSIG_LL(yaccum) = significand(st1_ptr);
+       mul_Xsig_Xsig(&accumulator, &yaccum);
+
+       expon_expon += round_Xsig(&accumulator);
+
+       if (accumulator.msw == 0) {
+               FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+               return;
+       }
+
+       significand(st1_ptr) = XSIG_LL(accumulator);
+       setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
 
+       tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
+       FPU_settagi(1, tag);
+
+       set_precision_flag_up();        /* 80486 appears to always do this */
+
+       return;
+
+}
 
 /*--- poly_l2p1() -----------------------------------------------------------+
  |   Base 2 logarithm by a polynomial approximation.                         |
  |   log2(x+1)                                                               |
  +---------------------------------------------------------------------------*/
-int    poly_l2p1(u_char sign0, u_char sign1,
-                 FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest)
+int poly_l2p1(u_char sign0, u_char sign1,
+             FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest)
 {
-  u_char               tag;
-  long int             exponent;
-  Xsig                 accumulator, yaccum;
+       u_char tag;
+       long int exponent;
+       Xsig accumulator, yaccum;
 
-  if ( exponent16(st0_ptr) < 0 )
-    {
-      log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
+       if (exponent16(st0_ptr) < 0) {
+               log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
 
-      yaccum.lsw = 0;
-      XSIG_LL(yaccum) = significand(st1_ptr);
-      mul_Xsig_Xsig(&accumulator, &yaccum);
+               yaccum.lsw = 0;
+               XSIG_LL(yaccum) = significand(st1_ptr);
+               mul_Xsig_Xsig(&accumulator, &yaccum);
 
-      exponent += round_Xsig(&accumulator);
+               exponent += round_Xsig(&accumulator);
 
-      exponent += exponent16(st1_ptr) + 1;
-      if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER;
+               exponent += exponent16(st1_ptr) + 1;
+               if (exponent < EXP_WAY_UNDER)
+                       exponent = EXP_WAY_UNDER;
 
-      significand(dest) = XSIG_LL(accumulator);
-      setexponent16(dest, exponent);
+               significand(dest) = XSIG_LL(accumulator);
+               setexponent16(dest, exponent);
 
-      tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
-      FPU_settagi(1, tag);
+               tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
+               FPU_settagi(1, tag);
 
-      if ( tag == TAG_Valid )
-       set_precision_flag_up();   /* 80486 appears to always do this */
-    }
-  else
-    {
-      /* The magnitude of st0_ptr is far too large. */
+               if (tag == TAG_Valid)
+                       set_precision_flag_up();        /* 80486 appears to always do this */
+       } else {
+               /* The magnitude of st0_ptr is far too large. */
 
-      if ( sign0 != SIGN_POS )
-       {
-         /* Trying to get the log of a negative number. */
-#ifdef PECULIAR_486   /* Stupid 80486 doesn't worry about log(negative). */
-         changesign(st1_ptr);
+               if (sign0 != SIGN_POS) {
+                       /* Trying to get the log of a negative number. */
+#ifdef PECULIAR_486            /* Stupid 80486 doesn't worry about log(negative). */
+                       changesign(st1_ptr);
 #else
-         if ( arith_invalid(1) < 0 )
-           return 1;
+                       if (arith_invalid(1) < 0)
+                               return 1;
 #endif /* PECULIAR_486 */
-       }
+               }
 
-      /* 80486 appears to do this */
-      if ( sign0 == SIGN_NEG )
-       set_precision_flag_down();
-      else
-       set_precision_flag_up();
-    }
+               /* 80486 appears to do this */
+               if (sign0 == SIGN_NEG)
+                       set_precision_flag_down();
+               else
+                       set_precision_flag_up();
+       }
 
-  if ( exponent(dest) <= EXP_UNDER )
-    EXCEPTION(EX_Underflow);
+       if (exponent(dest) <= EXP_UNDER)
+               EXCEPTION(EX_Underflow);
 
-  return 0;
+       return 0;
 
 }
 
-
-
-
 #undef HIPOWER
 #define        HIPOWER 10
-static const unsigned long long logterms[HIPOWER] =
-{
-  0x2a8eca5705fc2ef0LL,
-  0xf6384ee1d01febceLL,
-  0x093bb62877cdf642LL,
-  0x006985d8a9ec439bLL,
-  0x0005212c4f55a9c8LL,
-  0x00004326a16927f0LL,
-  0x0000038d1d80a0e7LL,
-  0x0000003141cc80c6LL,
-  0x00000002b1668c9fLL,
-  0x000000002c7a46aaLL
+static const unsigned long long logterms[HIPOWER] = {
+       0x2a8eca5705fc2ef0LL,
+       0xf6384ee1d01febceLL,
+       0x093bb62877cdf642LL,
+       0x006985d8a9ec439bLL,
+       0x0005212c4f55a9c8LL,
+       0x00004326a16927f0LL,
+       0x0000038d1d80a0e7LL,
+       0x0000003141cc80c6LL,
+       0x00000002b1668c9fLL,
+       0x000000002c7a46aaLL
 };
 
 static const unsigned long leadterm = 0xb8000000;
 
-
 /*--- log2_kernel() ---------------------------------------------------------+
  |   Base 2 logarithm by a polynomial approximation.                         |
  |   log2(x+1)                                                               |
@@ -203,70 +181,64 @@ static const unsigned long leadterm = 0xb8000000;
 static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
                        long int *expon)
 {
-  long int             exponent, adj;
-  unsigned long long   Xsq;
-  Xsig                 accumulator, Numer, Denom, argSignif, arg_signif;
-
-  exponent = exponent16(arg);
-  Numer.lsw = Denom.lsw = 0;
-  XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
-  if ( argsign == SIGN_POS )
-    {
-      shr_Xsig(&Denom, 2 - (1 + exponent));
-      Denom.msw |= 0x80000000;
-      div_Xsig(&Numer, &Denom, &argSignif);
-    }
-  else
-    {
-      shr_Xsig(&Denom, 1 - (1 + exponent));
-      negate_Xsig(&Denom);
-      if ( Denom.msw & 0x80000000 )
-       {
-         div_Xsig(&Numer, &Denom, &argSignif);
-         exponent ++;
-       }
-      else
-       {
-         /* Denom must be 1.0 */
-         argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw;
-         argSignif.msw = Numer.msw;
+       long int exponent, adj;
+       unsigned long long Xsq;
+       Xsig accumulator, Numer, Denom, argSignif, arg_signif;
+
+       exponent = exponent16(arg);
+       Numer.lsw = Denom.lsw = 0;
+       XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
+       if (argsign == SIGN_POS) {
+               shr_Xsig(&Denom, 2 - (1 + exponent));
+               Denom.msw |= 0x80000000;
+               div_Xsig(&Numer, &Denom, &argSignif);
+       } else {
+               shr_Xsig(&Denom, 1 - (1 + exponent));
+               negate_Xsig(&Denom);
+               if (Denom.msw & 0x80000000) {
+                       div_Xsig(&Numer, &Denom, &argSignif);
+                       exponent++;
+               } else {
+                       /* Denom must be 1.0 */
+                       argSignif.lsw = Numer.lsw;
+                       argSignif.midw = Numer.midw;
+                       argSignif.msw = Numer.msw;
+               }
        }
-    }
 
 #ifndef PECULIAR_486
-  /* Should check here that  |local_arg|  is within the valid range */
-  if ( exponent >= -2 )
-    {
-      if ( (exponent > -2) ||
-         (argSignif.msw > (unsigned)0xafb0ccc0) )
-       {
-         /* The argument is too large */
+       /* Should check here that  |local_arg|  is within the valid range */
+       if (exponent >= -2) {
+               if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) {
+                       /* The argument is too large */
+               }
        }
-    }
 #endif /* PECULIAR_486 */
 
-  arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif);
-  adj = norm_Xsig(&argSignif);
-  accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif);
-  mul_Xsig_Xsig(&accumulator, &accumulator);
-  shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj)));
-  Xsq = XSIG_LL(accumulator);
-  if ( accumulator.lsw & 0x80000000 )
-    Xsq++;
-
-  accumulator.msw = accumulator.midw = accumulator.lsw = 0;
-  /* Do the basic fixed point polynomial evaluation */
-  polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1);
-
-  mul_Xsig_Xsig(&accumulator, &argSignif);
-  shr_Xsig(&accumulator, 6 - adj);
-
-  mul32_Xsig(&arg_signif, leadterm);
-  add_two_Xsig(&accumulator, &arg_signif, &exponent);
-
-  *expon = exponent + 1;
-  accum_result->lsw = accumulator.lsw;
-  accum_result->midw = accumulator.midw;
-  accum_result->msw = accumulator.msw;
+       arg_signif.lsw = argSignif.lsw;
+       XSIG_LL(arg_signif) = XSIG_LL(argSignif);
+       adj = norm_Xsig(&argSignif);
+       accumulator.lsw = argSignif.lsw;
+       XSIG_LL(accumulator) = XSIG_LL(argSignif);
+       mul_Xsig_Xsig(&accumulator, &accumulator);
+       shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj)));
+       Xsq = XSIG_LL(accumulator);
+       if (accumulator.lsw & 0x80000000)
+               Xsq++;
+
+       accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+       /* Do the basic fixed point polynomial evaluation */
+       polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1);
+
+       mul_Xsig_Xsig(&accumulator, &argSignif);
+       shr_Xsig(&accumulator, 6 - adj);
+
+       mul32_Xsig(&arg_signif, leadterm);
+       add_two_Xsig(&accumulator, &arg_signif, &exponent);
+
+       *expon = exponent + 1;
+       accum_result->lsw = accumulator.lsw;
+       accum_result->midw = accumulator.midw;
+       accum_result->msw = accumulator.msw;
 
 }
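
Again only the indentation changed. The structure of poly_l2()/log2_kernel() is the classical one for fyl2x: split off the 16-bit exponent, force the significand into (sqrt(2)/2, sqrt(2)), evaluate log2 of the reduced argument from a polynomial in its square, recombine with the exponent, and finally multiply by ST(1) (the yaccum step). A hedged double-precision sketch of that reduction; my_log2() and log2_of_1px() are illustrative names, and the u = x/(x+2) series is a standard identity for log(1+x) that only stands in for the fixed-point logterms polynomial, not a line-for-line translation:

#include <math.h>
#include <stdio.h>

/* log2(1+x) for smallish |x| via ln(1+x) = 2*atanh(x/(x+2)) */
static double log2_of_1px(double x)
{
        double u = x / (x + 2.0);
        double u2 = u * u;
        double s = u * (1.0 + u2 / 3.0 + u2 * u2 / 5.0 + u2 * u2 * u2 / 7.0);

        return 2.0 * s / 0.69314718055994531;   /* divide by ln(2) */
}

static double my_log2(double st0)       /* st0 > 0 assumed */
{
        int exponent;
        double m = frexp(st0, &exponent);       /* st0 = m * 2^exponent, m in [0.5, 1) */

        /* Re-centre m into (sqrt(2)/2, sqrt(2)), as poly_l2() does */
        if (m <= 0.70710678118654752) {
                m *= 2.0;
                exponent--;
        }

        /* poly_l2() would now multiply the sum by ST(1) for fyl2x */
        return (double)exponent + log2_of_1px(m - 1.0);
}

int main(void)
{
        printf("%f %f\n", my_log2(10.0), log2(10.0));
        return 0;
}

my_log2(10.0) agrees with libm's log2(10.0) to about seven decimal places; the emulator gets full extended precision by doing the same steps in Xsig fixed point.
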
index a36313fb06f14988f3dbed1faf1ab75ce29f47f1..b862039c728e628cd4257afecb194f3b2703699f 100644 (file)
@@ -11,7 +11,6 @@
  |                                                                           |
  +---------------------------------------------------------------------------*/
 
-
 #include "exception.h"
 #include "reg_constant.h"
 #include "fpu_emu.h"
 #include "control_w.h"
 #include "poly.h"
 
-
 #define        N_COEFF_P       4
 #define        N_COEFF_N       4
 
-static const unsigned long long pos_terms_l[N_COEFF_P] =
-{
-  0xaaaaaaaaaaaaaaabLL,
-  0x00d00d00d00cf906LL,
-  0x000006b99159a8bbLL,
-  0x000000000d7392e6LL
+static const unsigned long long pos_terms_l[N_COEFF_P] = {
+       0xaaaaaaaaaaaaaaabLL,
+       0x00d00d00d00cf906LL,
+       0x000006b99159a8bbLL,
+       0x000000000d7392e6LL
 };
 
-static const unsigned long long neg_terms_l[N_COEFF_N] =
-{
-  0x2222222222222167LL,
-  0x0002e3bc74aab624LL,
-  0x0000000b09229062LL,
-  0x00000000000c7973LL
+static const unsigned long long neg_terms_l[N_COEFF_N] = {
+       0x2222222222222167LL,
+       0x0002e3bc74aab624LL,
+       0x0000000b09229062LL,
+       0x00000000000c7973LL
 };
 
-
-
 #define        N_COEFF_PH      4
 #define        N_COEFF_NH      4
-static const unsigned long long pos_terms_h[N_COEFF_PH] =
-{
-  0x0000000000000000LL,
-  0x05b05b05b05b0406LL,
-  0x000049f93edd91a9LL,
-  0x00000000c9c9ed62LL
+static const unsigned long long pos_terms_h[N_COEFF_PH] = {
+       0x0000000000000000LL,
+       0x05b05b05b05b0406LL,
+       0x000049f93edd91a9LL,
+       0x00000000c9c9ed62LL
 };
 
-static const unsigned long long neg_terms_h[N_COEFF_NH] =
-{
-  0xaaaaaaaaaaaaaa98LL,
-  0x001a01a01a019064LL,
-  0x0000008f76c68a77LL,
-  0x0000000000d58f5eLL
+static const unsigned long long neg_terms_h[N_COEFF_NH] = {
+       0xaaaaaaaaaaaaaa98LL,
+       0x001a01a01a019064LL,
+       0x0000008f76c68a77LL,
+       0x0000000000d58f5eLL
 };
 
-
 /*--- poly_sine() -----------------------------------------------------------+
  |                                                                           |
  +---------------------------------------------------------------------------*/
-void   poly_sine(FPU_REG *st0_ptr)
+void poly_sine(FPU_REG *st0_ptr)
 {
-  int                 exponent, echange;
-  Xsig                accumulator, argSqrd, argTo4;
-  unsigned long       fix_up, adj;
-  unsigned long long  fixed_arg;
-  FPU_REG            result;
+       int exponent, echange;
+       Xsig accumulator, argSqrd, argTo4;
+       unsigned long fix_up, adj;
+       unsigned long long fixed_arg;
+       FPU_REG result;
 
-  exponent = exponent(st0_ptr);
+       exponent = exponent(st0_ptr);
 
-  accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+       accumulator.lsw = accumulator.midw = accumulator.msw = 0;
 
-  /* Split into two ranges, for arguments below and above 1.0 */
-  /* The boundary between upper and lower is approx 0.88309101259 */
-  if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) )
-    {
-      /* The argument is <= 0.88309101259 */
+       /* Split into two ranges, for arguments below and above 1.0 */
+       /* The boundary between upper and lower is approx 0.88309101259 */
+       if ((exponent < -1)
+           || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) {
+               /* The argument is <= 0.88309101259 */
+
+               argSqrd.msw = st0_ptr->sigh;
+               argSqrd.midw = st0_ptr->sigl;
+               argSqrd.lsw = 0;
+               mul64_Xsig(&argSqrd, &significand(st0_ptr));
+               shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+               argTo4.msw = argSqrd.msw;
+               argTo4.midw = argSqrd.midw;
+               argTo4.lsw = argSqrd.lsw;
+               mul_Xsig_Xsig(&argTo4, &argTo4);
 
-      argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0;
-      mul64_Xsig(&argSqrd, &significand(st0_ptr));
-      shr_Xsig(&argSqrd, 2*(-1-exponent));
-      argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
-      argTo4.lsw = argSqrd.lsw;
-      mul_Xsig_Xsig(&argTo4, &argTo4);
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
+                               N_COEFF_N - 1);
+               mul_Xsig_Xsig(&accumulator, &argSqrd);
+               negate_Xsig(&accumulator);
 
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
-                     N_COEFF_N-1);
-      mul_Xsig_Xsig(&accumulator, &argSqrd);
-      negate_Xsig(&accumulator);
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
+                               N_COEFF_P - 1);
 
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
-                     N_COEFF_P-1);
+               shr_Xsig(&accumulator, 2);      /* Divide by four */
+               accumulator.msw |= 0x80000000;  /* Add 1.0 */
 
-      shr_Xsig(&accumulator, 2);    /* Divide by four */
-      accumulator.msw |= 0x80000000;  /* Add 1.0 */
+               mul64_Xsig(&accumulator, &significand(st0_ptr));
+               mul64_Xsig(&accumulator, &significand(st0_ptr));
+               mul64_Xsig(&accumulator, &significand(st0_ptr));
 
-      mul64_Xsig(&accumulator, &significand(st0_ptr));
-      mul64_Xsig(&accumulator, &significand(st0_ptr));
-      mul64_Xsig(&accumulator, &significand(st0_ptr));
+               /* Divide by four, FPU_REG compatible, etc */
+               exponent = 3 * exponent;
 
-      /* Divide by four, FPU_REG compatible, etc */
-      exponent = 3*exponent;
+               /* The minimum exponent difference is 3 */
+               shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
 
-      /* The minimum exponent difference is 3 */
-      shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
+               negate_Xsig(&accumulator);
+               XSIG_LL(accumulator) += significand(st0_ptr);
 
-      negate_Xsig(&accumulator);
-      XSIG_LL(accumulator) += significand(st0_ptr);
+               echange = round_Xsig(&accumulator);
 
-      echange = round_Xsig(&accumulator);
+               setexponentpos(&result, exponent(st0_ptr) + echange);
+       } else {
+               /* The argument is > 0.88309101259 */
+               /* We use sin(st(0)) = cos(pi/2-st(0)) */
 
-      setexponentpos(&result, exponent(st0_ptr) + echange);
-    }
-  else
-    {
-      /* The argument is > 0.88309101259 */
-      /* We use sin(st(0)) = cos(pi/2-st(0)) */
+               fixed_arg = significand(st0_ptr);
 
-      fixed_arg = significand(st0_ptr);
+               if (exponent == 0) {
+                       /* The argument is >= 1.0 */
 
-      if ( exponent == 0 )
-       {
-         /* The argument is >= 1.0 */
+                       /* Put the binary point at the left. */
+                       fixed_arg <<= 1;
+               }
+               /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+               fixed_arg = 0x921fb54442d18469LL - fixed_arg;
+               /* There is a special case which arises due to rounding, to fix here. */
+               if (fixed_arg == 0xffffffffffffffffLL)
+                       fixed_arg = 0;
 
-         /* Put the binary point at the left. */
-         fixed_arg <<= 1;
-       }
-      /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
-      fixed_arg = 0x921fb54442d18469LL - fixed_arg;
-      /* There is a special case which arises due to rounding, to fix here. */
-      if ( fixed_arg == 0xffffffffffffffffLL )
-       fixed_arg = 0;
+               XSIG_LL(argSqrd) = fixed_arg;
+               argSqrd.lsw = 0;
+               mul64_Xsig(&argSqrd, &fixed_arg);
 
-      XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
-      mul64_Xsig(&argSqrd, &fixed_arg);
+               XSIG_LL(argTo4) = XSIG_LL(argSqrd);
+               argTo4.lsw = argSqrd.lsw;
+               mul_Xsig_Xsig(&argTo4, &argTo4);
 
-      XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw;
-      mul_Xsig_Xsig(&argTo4, &argTo4);
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
+                               N_COEFF_NH - 1);
+               mul_Xsig_Xsig(&accumulator, &argSqrd);
+               negate_Xsig(&accumulator);
 
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
-                     N_COEFF_NH-1);
-      mul_Xsig_Xsig(&accumulator, &argSqrd);
-      negate_Xsig(&accumulator);
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
+                               N_COEFF_PH - 1);
+               negate_Xsig(&accumulator);
 
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
-                     N_COEFF_PH-1);
-      negate_Xsig(&accumulator);
+               mul64_Xsig(&accumulator, &fixed_arg);
+               mul64_Xsig(&accumulator, &fixed_arg);
 
-      mul64_Xsig(&accumulator, &fixed_arg);
-      mul64_Xsig(&accumulator, &fixed_arg);
+               shr_Xsig(&accumulator, 3);
+               negate_Xsig(&accumulator);
 
-      shr_Xsig(&accumulator, 3);
-      negate_Xsig(&accumulator);
+               add_Xsig_Xsig(&accumulator, &argSqrd);
 
-      add_Xsig_Xsig(&accumulator, &argSqrd);
+               shr_Xsig(&accumulator, 1);
 
-      shr_Xsig(&accumulator, 1);
+               accumulator.lsw |= 1;   /* A zero accumulator here would cause problems */
+               negate_Xsig(&accumulator);
 
-      accumulator.lsw |= 1;  /* A zero accumulator here would cause problems */
-      negate_Xsig(&accumulator);
+               /* The basic computation is complete. Now fix the answer to
+                  compensate for the error due to the approximation used for
+                  pi/2
+                */
 
-      /* The basic computation is complete. Now fix the answer to
-        compensate for the error due to the approximation used for
-        pi/2
-        */
+               /* This has an exponent of -65 */
+               fix_up = 0x898cc517;
+               /* The fix-up needs to be improved for larger args */
+               if (argSqrd.msw & 0xffc00000) {
+                       /* Get about 32 bit precision in these: */
+                       fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
+               }
+               fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
 
-      /* This has an exponent of -65 */
-      fix_up = 0x898cc517;
-      /* The fix-up needs to be improved for larger args */
-      if ( argSqrd.msw & 0xffc00000 )
-       {
-         /* Get about 32 bit precision in these: */
-         fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
-       }
-      fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
+               adj = accumulator.lsw;  /* temp save */
+               accumulator.lsw -= fix_up;
+               if (accumulator.lsw > adj)
+                       XSIG_LL(accumulator)--;
 
-      adj = accumulator.lsw;    /* temp save */
-      accumulator.lsw -= fix_up;
-      if ( accumulator.lsw > adj )
-       XSIG_LL(accumulator) --;
+               echange = round_Xsig(&accumulator);
 
-      echange = round_Xsig(&accumulator);
-
-      setexponentpos(&result, echange - 1);
-    }
+               setexponentpos(&result, echange - 1);
+       }
 
-  significand(&result) = XSIG_LL(accumulator);
-  setsign(&result, getsign(st0_ptr));
-  FPU_copy_to_reg0(&result, TAG_Valid);
+       significand(&result) = XSIG_LL(accumulator);
+       setsign(&result, getsign(st0_ptr));
+       FPU_copy_to_reg0(&result, TAG_Valid);
 
 #ifdef PARANOID
-  if ( (exponent(&result) >= 0)
-      && (significand(&result) > 0x8000000000000000LL) )
-    {
-      EXCEPTION(EX_INTERNAL|0x150);
-    }
+       if ((exponent(&result) >= 0)
+           && (significand(&result) > 0x8000000000000000LL)) {
+               EXCEPTION(EX_INTERNAL | 0x150);
+       }
 #endif /* PARANOID */
 
 }
 
-
-
 /*--- poly_cos() ------------------------------------------------------------+
  |                                                                           |
  +---------------------------------------------------------------------------*/
-void   poly_cos(FPU_REG *st0_ptr)
+void poly_cos(FPU_REG *st0_ptr)
 {
-  FPU_REG            result;
-  long int            exponent, exp2, echange;
-  Xsig                accumulator, argSqrd, fix_up, argTo4;
-  unsigned long long  fixed_arg;
+       FPU_REG result;
+       long int exponent, exp2, echange;
+       Xsig accumulator, argSqrd, fix_up, argTo4;
+       unsigned long long fixed_arg;
 
 #ifdef PARANOID
-  if ( (exponent(st0_ptr) > 0)
-      || ((exponent(st0_ptr) == 0)
-         && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) )
-    {
-      EXCEPTION(EX_Invalid);
-      FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
-      return;
-    }
-#endif /* PARANOID */
-
-  exponent = exponent(st0_ptr);
-
-  accumulator.lsw = accumulator.midw = accumulator.msw = 0;
-
-  if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) )
-    {
-      /* arg is < 0.687705 */
-
-      argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl;
-      argSqrd.lsw = 0;
-      mul64_Xsig(&argSqrd, &significand(st0_ptr));
-
-      if ( exponent < -1 )
-       {
-         /* shift the argument right by the required places */
-         shr_Xsig(&argSqrd, 2*(-1-exponent));
-       }
-
-      argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
-      argTo4.lsw = argSqrd.lsw;
-      mul_Xsig_Xsig(&argTo4, &argTo4);
-
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
-                     N_COEFF_NH-1);
-      mul_Xsig_Xsig(&accumulator, &argSqrd);
-      negate_Xsig(&accumulator);
-
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
-                     N_COEFF_PH-1);
-      negate_Xsig(&accumulator);
-
-      mul64_Xsig(&accumulator, &significand(st0_ptr));
-      mul64_Xsig(&accumulator, &significand(st0_ptr));
-      shr_Xsig(&accumulator, -2*(1+exponent));
-
-      shr_Xsig(&accumulator, 3);
-      negate_Xsig(&accumulator);
-
-      add_Xsig_Xsig(&accumulator, &argSqrd);
-
-      shr_Xsig(&accumulator, 1);
-
-      /* It doesn't matter if accumulator is all zero here, the
-        following code will work ok */
-      negate_Xsig(&accumulator);
-
-      if ( accumulator.lsw & 0x80000000 )
-       XSIG_LL(accumulator) ++;
-      if ( accumulator.msw == 0 )
-       {
-         /* The result is 1.0 */
-         FPU_copy_to_reg0(&CONST_1, TAG_Valid);
-         return;
-       }
-      else
-       {
-         significand(&result) = XSIG_LL(accumulator);
-      
-         /* will be a valid positive nr with expon = -1 */
-         setexponentpos(&result, -1);
-       }
-    }
-  else
-    {
-      fixed_arg = significand(st0_ptr);
-
-      if ( exponent == 0 )
-       {
-         /* The argument is >= 1.0 */
-
-         /* Put the binary point at the left. */
-         fixed_arg <<= 1;
-       }
-      /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
-      fixed_arg = 0x921fb54442d18469LL - fixed_arg;
-      /* There is a special case which arises due to rounding, to fix here. */
-      if ( fixed_arg == 0xffffffffffffffffLL )
-       fixed_arg = 0;
-
-      exponent = -1;
-      exp2 = -1;
-
-      /* A shift is needed here only for a narrow range of arguments,
-        i.e. for fixed_arg approx 2^-32, but we pick up more... */
-      if ( !(LL_MSW(fixed_arg) & 0xffff0000) )
-       {
-         fixed_arg <<= 16;
-         exponent -= 16;
-         exp2 -= 16;
+       if ((exponent(st0_ptr) > 0)
+           || ((exponent(st0_ptr) == 0)
+               && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) {
+               EXCEPTION(EX_Invalid);
+               FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+               return;
        }
+#endif /* PARANOID */
 
-      XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0;
-      mul64_Xsig(&argSqrd, &fixed_arg);
-
-      if ( exponent < -1 )
-       {
-         /* shift the argument right by the required places */
-         shr_Xsig(&argSqrd, 2*(-1-exponent));
-       }
-
-      argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
-      argTo4.lsw = argSqrd.lsw;
-      mul_Xsig_Xsig(&argTo4, &argTo4);
-
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
-                     N_COEFF_N-1);
-      mul_Xsig_Xsig(&accumulator, &argSqrd);
-      negate_Xsig(&accumulator);
-
-      polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
-                     N_COEFF_P-1);
-
-      shr_Xsig(&accumulator, 2);    /* Divide by four */
-      accumulator.msw |= 0x80000000;  /* Add 1.0 */
-
-      mul64_Xsig(&accumulator, &fixed_arg);
-      mul64_Xsig(&accumulator, &fixed_arg);
-      mul64_Xsig(&accumulator, &fixed_arg);
-
-      /* Divide by four, FPU_REG compatible, etc */
-      exponent = 3*exponent;
-
-      /* The minimum exponent difference is 3 */
-      shr_Xsig(&accumulator, exp2 - exponent);
-
-      negate_Xsig(&accumulator);
-      XSIG_LL(accumulator) += fixed_arg;
-
-      /* The basic computation is complete. Now fix the answer to
-        compensate for the error due to the approximation used for
-        pi/2
-        */
-
-      /* This has an exponent of -65 */
-      XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
-      fix_up.lsw = 0;
-
-      /* The fix-up needs to be improved for larger args */
-      if ( argSqrd.msw & 0xffc00000 )
-       {
-         /* Get about 32 bit precision in these: */
-         fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
-         fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
+       exponent = exponent(st0_ptr);
+
+       accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+
+       if ((exponent < -1)
+           || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) {
+               /* arg is < 0.687705 */
+
+               argSqrd.msw = st0_ptr->sigh;
+               argSqrd.midw = st0_ptr->sigl;
+               argSqrd.lsw = 0;
+               mul64_Xsig(&argSqrd, &significand(st0_ptr));
+
+               if (exponent < -1) {
+                       /* shift the argument right by the required places */
+                       shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+               }
+
+               argTo4.msw = argSqrd.msw;
+               argTo4.midw = argSqrd.midw;
+               argTo4.lsw = argSqrd.lsw;
+               mul_Xsig_Xsig(&argTo4, &argTo4);
+
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
+                               N_COEFF_NH - 1);
+               mul_Xsig_Xsig(&accumulator, &argSqrd);
+               negate_Xsig(&accumulator);
+
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
+                               N_COEFF_PH - 1);
+               negate_Xsig(&accumulator);
+
+               mul64_Xsig(&accumulator, &significand(st0_ptr));
+               mul64_Xsig(&accumulator, &significand(st0_ptr));
+               shr_Xsig(&accumulator, -2 * (1 + exponent));
+
+               shr_Xsig(&accumulator, 3);
+               negate_Xsig(&accumulator);
+
+               add_Xsig_Xsig(&accumulator, &argSqrd);
+
+               shr_Xsig(&accumulator, 1);
+
+               /* It doesn't matter if accumulator is all zero here, the
+                  following code will work ok */
+               negate_Xsig(&accumulator);
+
+               if (accumulator.lsw & 0x80000000)
+                       XSIG_LL(accumulator)++;
+               if (accumulator.msw == 0) {
+                       /* The result is 1.0 */
+                       FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+                       return;
+               } else {
+                       significand(&result) = XSIG_LL(accumulator);
+
+                       /* will be a valid positive nr with expon = -1 */
+                       setexponentpos(&result, -1);
+               }
+       } else {
+               fixed_arg = significand(st0_ptr);
+
+               if (exponent == 0) {
+                       /* The argument is >= 1.0 */
+
+                       /* Put the binary point at the left. */
+                       fixed_arg <<= 1;
+               }
+               /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+               fixed_arg = 0x921fb54442d18469LL - fixed_arg;
+               /* There is a special case which arises due to rounding, to fix here. */
+               if (fixed_arg == 0xffffffffffffffffLL)
+                       fixed_arg = 0;
+
+               exponent = -1;
+               exp2 = -1;
+
+               /* A shift is needed here only for a narrow range of arguments,
+                  i.e. for fixed_arg approx 2^-32, but we pick up more... */
+               if (!(LL_MSW(fixed_arg) & 0xffff0000)) {
+                       fixed_arg <<= 16;
+                       exponent -= 16;
+                       exp2 -= 16;
+               }
+
+               XSIG_LL(argSqrd) = fixed_arg;
+               argSqrd.lsw = 0;
+               mul64_Xsig(&argSqrd, &fixed_arg);
+
+               if (exponent < -1) {
+                       /* shift the argument right by the required places */
+                       shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+               }
+
+               argTo4.msw = argSqrd.msw;
+               argTo4.midw = argSqrd.midw;
+               argTo4.lsw = argSqrd.lsw;
+               mul_Xsig_Xsig(&argTo4, &argTo4);
+
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
+                               N_COEFF_N - 1);
+               mul_Xsig_Xsig(&accumulator, &argSqrd);
+               negate_Xsig(&accumulator);
+
+               polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
+                               N_COEFF_P - 1);
+
+               shr_Xsig(&accumulator, 2);      /* Divide by four */
+               accumulator.msw |= 0x80000000;  /* Add 1.0 */
+
+               mul64_Xsig(&accumulator, &fixed_arg);
+               mul64_Xsig(&accumulator, &fixed_arg);
+               mul64_Xsig(&accumulator, &fixed_arg);
+
+               /* Divide by four, FPU_REG compatible, etc */
+               exponent = 3 * exponent;
+
+               /* The minimum exponent difference is 3 */
+               shr_Xsig(&accumulator, exp2 - exponent);
+
+               negate_Xsig(&accumulator);
+               XSIG_LL(accumulator) += fixed_arg;
+
+               /* The basic computation is complete. Now fix the answer to
+                  compensate for the error due to the approximation used for
+                  pi/2
+                */
+
+               /* This has an exponent of -65 */
+               XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
+               fix_up.lsw = 0;
+
+               /* The fix-up needs to be improved for larger args */
+               if (argSqrd.msw & 0xffc00000) {
+                       /* Get about 32 bit precision in these: */
+                       fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
+                       fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
+               }
+
+               exp2 += norm_Xsig(&accumulator);
+               shr_Xsig(&accumulator, 1);      /* Prevent overflow */
+               exp2++;
+               shr_Xsig(&fix_up, 65 + exp2);
+
+               add_Xsig_Xsig(&accumulator, &fix_up);
+
+               echange = round_Xsig(&accumulator);
+
+               setexponentpos(&result, exp2 + echange);
+               significand(&result) = XSIG_LL(accumulator);
        }
 
-      exp2 += norm_Xsig(&accumulator);
-      shr_Xsig(&accumulator, 1); /* Prevent overflow */
-      exp2++;
-      shr_Xsig(&fix_up, 65 + exp2);
-
-      add_Xsig_Xsig(&accumulator, &fix_up);
-
-      echange = round_Xsig(&accumulator);
-
-      setexponentpos(&result, exp2 + echange);
-      significand(&result) = XSIG_LL(accumulator);
-    }
-
-  FPU_copy_to_reg0(&result, TAG_Valid);
+       FPU_copy_to_reg0(&result, TAG_Valid);
 
 #ifdef PARANOID
-  if ( (exponent(&result) >= 0)
-      && (significand(&result) > 0x8000000000000000LL) )
-    {
-      EXCEPTION(EX_INTERNAL|0x151);
-    }
+       if ((exponent(&result) >= 0)
+           && (significand(&result) > 0x8000000000000000LL)) {
+               EXCEPTION(EX_INTERNAL | 0x151);
+       }
 #endif /* PARANOID */
 
 }
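
poly_sine() and poly_cos() above keep their two-range structure after the reindent: a direct fixed-point series for small arguments (below about 0.883 for sine, 0.687705 for cosine), and for larger ones the identity sin(x) = cos(pi/2 - x) with pi/2 kept in two parts (1.921fb54442d18469 plus a 0x898cc517... tail of exponent -65) and a fix-up for the dropped tail. A hedged double-precision sketch of that scheme; PI2_HI/PI2_LO are a standard 33-bit split of pi/2 used purely for illustration, and small_sin()/small_cos() are Taylor stand-ins for the pos_terms/neg_terms polynomials:

#include <math.h>
#include <stdio.h>

#define PI2_HI  1.5707963267341256e+00  /* leading bits of pi/2 */
#define PI2_LO  6.0771005065061922e-11  /* dropped tail: PI2_HI + PI2_LO ~= pi/2 */

static double small_sin(double x)       /* odd series, fine for x below ~0.883 */
{
        double x2 = x * x;

        return x * (1.0 - x2 / 6.0 + x2 * x2 / 120.0 - x2 * x2 * x2 / 5040.0);
}

static double small_cos(double x)       /* even series for the reduced argument */
{
        double x2 = x * x;

        return 1.0 - x2 / 2.0 + x2 * x2 / 24.0 - x2 * x2 * x2 / 720.0;
}

static double emu_style_sin(double x)   /* 0 <= x <= pi/2 assumed, as in the emulator */
{
        double r, c;

        if (x <= 0.88309101259)         /* same boundary as poly_sine() */
                return small_sin(x);

        /* sin(x) = cos(pi/2 - x): subtract the high part of pi/2 first ... */
        r = PI2_HI - x;
        c = small_cos(r);

        /* ... then compensate for the dropped tail:
           cos(r + PI2_LO) ~= cos(r) - PI2_LO * sin(r) */
        return c - PI2_LO * small_sin(r);
}

int main(void)
{
        printf("%f %f\n", emu_style_sin(1.2), sin(1.2));
        return 0;
}

The fix-up term mirrors the "compensate for the error due to the approximation used for pi/2" step in the kernel code; without it the truncated pi/2 would cost roughly 2^-33 of relative accuracy in the reduced argument.
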
index 8df3e03b6e6f658c9fdcd98e300e10235765c64e..1875763e0c02b88dbd23061e6d9797efbf82b089 100644 (file)
 #include "control_w.h"
 #include "poly.h"
 
-
 #define        HiPOWERop       3       /* odd poly, positive terms */
-static const unsigned long long oddplterm[HiPOWERop] =
-{
-  0x0000000000000000LL,
-  0x0051a1cf08fca228LL,
-  0x0000000071284ff7LL
+static const unsigned long long oddplterm[HiPOWERop] = {
+       0x0000000000000000LL,
+       0x0051a1cf08fca228LL,
+       0x0000000071284ff7LL
 };
 
 #define        HiPOWERon       2       /* odd poly, negative terms */
-static const unsigned long long oddnegterm[HiPOWERon] =
-{
-   0x1291a9a184244e80LL,
-   0x0000583245819c21LL
+static const unsigned long long oddnegterm[HiPOWERon] = {
+       0x1291a9a184244e80LL,
+       0x0000583245819c21LL
 };
 
 #define        HiPOWERep       2       /* even poly, positive terms */
-static const unsigned long long evenplterm[HiPOWERep] =
-{
-  0x0e848884b539e888LL,
-  0x00003c7f18b887daLL
+static const unsigned long long evenplterm[HiPOWERep] = {
+       0x0e848884b539e888LL,
+       0x00003c7f18b887daLL
 };
 
 #define        HiPOWERen       2       /* even poly, negative terms */
-static const unsigned long long evennegterm[HiPOWERen] =
-{
-  0xf1f0200fd51569ccLL,
-  0x003afb46105c4432LL
+static const unsigned long long evennegterm[HiPOWERen] = {
+       0xf1f0200fd51569ccLL,
+       0x003afb46105c4432LL
 };
 
 static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
 
-
 /*--- poly_tan() ------------------------------------------------------------+
  |                                                                           |
  +---------------------------------------------------------------------------*/
-void   poly_tan(FPU_REG *st0_ptr)
+void poly_tan(FPU_REG *st0_ptr)
 {
-  long int             exponent;
-  int                   invert;
-  Xsig                  argSq, argSqSq, accumulatoro, accumulatore, accum,
-                        argSignif, fix_up;
-  unsigned long         adj;
+       long int exponent;
+       int invert;
+       Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
+           argSignif, fix_up;
+       unsigned long adj;
 
-  exponent = exponent(st0_ptr);
+       exponent = exponent(st0_ptr);
 
 #ifdef PARANOID
-  if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */
-    { arith_invalid(0); return; }  /* Need a positive number */
+       if (signnegative(st0_ptr)) {    /* Can't hack a number < 0.0 */
+               arith_invalid(0);
+               return;
+       }                       /* Need a positive number */
 #endif /* PARANOID */
 
-  /* Split the problem into two domains, smaller and larger than pi/4 */
-  if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) )
-    {
-      /* The argument is greater than (approx) pi/4 */
-      invert = 1;
-      accum.lsw = 0;
-      XSIG_LL(accum) = significand(st0_ptr);
-      if ( exponent == 0 )
-       {
-         /* The argument is >= 1.0 */
-         /* Put the binary point at the left. */
-         XSIG_LL(accum) <<= 1;
-       }
-      /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
-      XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
-      /* This is a special case which arises due to rounding. */
-      if ( XSIG_LL(accum) == 0xffffffffffffffffLL )
-       {
-         FPU_settag0(TAG_Valid);
-         significand(st0_ptr) = 0x8a51e04daabda360LL;
-         setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative);
-         return;
+       /* Split the problem into two domains, smaller and larger than pi/4 */
+       if ((exponent == 0)
+           || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) {
+               /* The argument is greater than (approx) pi/4 */
+               invert = 1;
+               accum.lsw = 0;
+               XSIG_LL(accum) = significand(st0_ptr);
+
+               if (exponent == 0) {
+                       /* The argument is >= 1.0 */
+                       /* Put the binary point at the left. */
+                       XSIG_LL(accum) <<= 1;
+               }
+               /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+               XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
+               /* This is a special case which arises due to rounding. */
+               if (XSIG_LL(accum) == 0xffffffffffffffffLL) {
+                       FPU_settag0(TAG_Valid);
+                       significand(st0_ptr) = 0x8a51e04daabda360LL;
+                       setexponent16(st0_ptr,
+                                     (0x41 + EXTENDED_Ebias) | SIGN_Negative);
+                       return;
+               }
+
+               argSignif.lsw = accum.lsw;
+               XSIG_LL(argSignif) = XSIG_LL(accum);
+               exponent = -1 + norm_Xsig(&argSignif);
+       } else {
+               invert = 0;
+               argSignif.lsw = 0;
+               XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
+
+               if (exponent < -1) {
+                       /* shift the argument right by the required places */
+                       if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >=
+                           0x80000000U)
+                               XSIG_LL(accum)++;       /* round up */
+               }
        }
 
-      argSignif.lsw = accum.lsw;
-      XSIG_LL(argSignif) = XSIG_LL(accum);
-      exponent = -1 + norm_Xsig(&argSignif);
-    }
-  else
-    {
-      invert = 0;
-      argSignif.lsw = 0;
-      XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
-      if ( exponent < -1 )
-       {
-         /* shift the argument right by the required places */
-         if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U )
-           XSIG_LL(accum) ++;  /* round up */
-       }
-    }
-
-  XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw;
-  mul_Xsig_Xsig(&argSq, &argSq);
-  XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw;
-  mul_Xsig_Xsig(&argSqSq, &argSqSq);
-
-  /* Compute the negative terms for the numerator polynomial */
-  accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
-  polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1);
-  mul_Xsig_Xsig(&accumulatoro, &argSq);
-  negate_Xsig(&accumulatoro);
-  /* Add the positive terms */
-  polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1);
-
-  
-  /* Compute the positive terms for the denominator polynomial */
-  accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
-  polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1);
-  mul_Xsig_Xsig(&accumulatore, &argSq);
-  negate_Xsig(&accumulatore);
-  /* Add the negative terms */
-  polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1);
-  /* Multiply by arg^2 */
-  mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
-  mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
-  /* de-normalize and divide by 2 */
-  shr_Xsig(&accumulatore, -2*(1+exponent) + 1);
-  negate_Xsig(&accumulatore);      /* This does 1 - accumulator */
-
-  /* Now find the ratio. */
-  if ( accumulatore.msw == 0 )
-    {
-      /* accumulatoro must contain 1.0 here, (actually, 0) but it
-        really doesn't matter what value we use because it will
-        have negligible effect in later calculations
-        */
-      XSIG_LL(accum) = 0x8000000000000000LL;
-      accum.lsw = 0;
-    }
-  else
-    {
-      div_Xsig(&accumulatoro, &accumulatore, &accum);
-    }
-
-  /* Multiply by 1/3 * arg^3 */
-  mul64_Xsig(&accum, &XSIG_LL(argSignif));
-  mul64_Xsig(&accum, &XSIG_LL(argSignif));
-  mul64_Xsig(&accum, &XSIG_LL(argSignif));
-  mul64_Xsig(&accum, &twothirds);
-  shr_Xsig(&accum, -2*(exponent+1));
-
-  /* tan(arg) = arg + accum */
-  add_two_Xsig(&accum, &argSignif, &exponent);
-
-  if ( invert )
-    {
-      /* We now have the value of tan(pi_2 - arg) where pi_2 is an
-        approximation for pi/2
-        */
-      /* The next step is to fix the answer to compensate for the
-        error due to the approximation used for pi/2
-        */
-
-      /* This is (approx) delta, the error in our approx for pi/2
-        (see above). It has an exponent of -65
-        */
-      XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
-      fix_up.lsw = 0;
-
-      if ( exponent == 0 )
-       adj = 0xffffffff;   /* We want approx 1.0 here, but
-                              this is close enough. */
-      else if ( exponent > -30 )
-       {
-         adj = accum.msw >> -(exponent+1);      /* tan */
-         adj = mul_32_32(adj, adj);             /* tan^2 */
+       XSIG_LL(argSq) = XSIG_LL(accum);
+       argSq.lsw = accum.lsw;
+       mul_Xsig_Xsig(&argSq, &argSq);
+       XSIG_LL(argSqSq) = XSIG_LL(argSq);
+       argSqSq.lsw = argSq.lsw;
+       mul_Xsig_Xsig(&argSqSq, &argSqSq);
+
+       /* Compute the negative terms for the numerator polynomial */
+       accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
+       polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm,
+                       HiPOWERon - 1);
+       mul_Xsig_Xsig(&accumulatoro, &argSq);
+       negate_Xsig(&accumulatoro);
+       /* Add the positive terms */
+       polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm,
+                       HiPOWERop - 1);
+
+       /* Compute the positive terms for the denominator polynomial */
+       accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
+       polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm,
+                       HiPOWERep - 1);
+       mul_Xsig_Xsig(&accumulatore, &argSq);
+       negate_Xsig(&accumulatore);
+       /* Add the negative terms */
+       polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm,
+                       HiPOWERen - 1);
+       /* Multiply by arg^2 */
+       mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
+       mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
+       /* de-normalize and divide by 2 */
+       shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1);
+       negate_Xsig(&accumulatore);     /* This does 1 - accumulator */
+
+       /* Now find the ratio. */
+       if (accumulatore.msw == 0) {
+               /* accumulatoro must contain 1.0 here, (actually, 0) but it
+                  really doesn't matter what value we use because it will
+                  have negligible effect in later calculations
+                */
+               XSIG_LL(accum) = 0x8000000000000000LL;
+               accum.lsw = 0;
+       } else {
+               div_Xsig(&accumulatoro, &accumulatore, &accum);
        }
-      else
-       adj = 0;
-      adj = mul_32_32(0x898cc517, adj);          /* delta * tan^2 */
-
-      fix_up.msw += adj;
-      if ( !(fix_up.msw & 0x80000000) )   /* did fix_up overflow ? */
-       {
-         /* Yes, we need to add an msb */
-         shr_Xsig(&fix_up, 1);
-         fix_up.msw |= 0x80000000;
-         shr_Xsig(&fix_up, 64 + exponent);
+
+       /* Multiply by 1/3 * arg^3 */
+       mul64_Xsig(&accum, &XSIG_LL(argSignif));
+       mul64_Xsig(&accum, &XSIG_LL(argSignif));
+       mul64_Xsig(&accum, &XSIG_LL(argSignif));
+       mul64_Xsig(&accum, &twothirds);
+       shr_Xsig(&accum, -2 * (exponent + 1));
+
+       /* tan(arg) = arg + accum */
+       add_two_Xsig(&accum, &argSignif, &exponent);
+
+       if (invert) {
+               /* We now have the value of tan(pi_2 - arg) where pi_2 is an
+                  approximation for pi/2
+                */
+               /* The next step is to fix the answer to compensate for the
+                  error due to the approximation used for pi/2
+                */
+
+               /* This is (approx) delta, the error in our approx for pi/2
+                  (see above). It has an exponent of -65
+                */
+               XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
+               fix_up.lsw = 0;
+
+               if (exponent == 0)
+                       adj = 0xffffffff;       /* We want approx 1.0 here, but
+                                                  this is close enough. */
+               else if (exponent > -30) {
+                       adj = accum.msw >> -(exponent + 1);     /* tan */
+                       adj = mul_32_32(adj, adj);      /* tan^2 */
+               } else
+                       adj = 0;
+               adj = mul_32_32(0x898cc517, adj);       /* delta * tan^2 */
+
+               fix_up.msw += adj;
+               if (!(fix_up.msw & 0x80000000)) {       /* did fix_up overflow ? */
+                       /* Yes, we need to add an msb */
+                       shr_Xsig(&fix_up, 1);
+                       fix_up.msw |= 0x80000000;
+                       shr_Xsig(&fix_up, 64 + exponent);
+               } else
+                       shr_Xsig(&fix_up, 65 + exponent);
+
+               add_two_Xsig(&accum, &fix_up, &exponent);
+
+               /* accum now contains tan(pi/2 - arg).
+                  Use tan(arg) = 1.0 / tan(pi/2 - arg)
+                */
+               accumulatoro.lsw = accumulatoro.midw = 0;
+               accumulatoro.msw = 0x80000000;
+               div_Xsig(&accumulatoro, &accum, &accum);
+               exponent = -exponent - 1;
        }
-      else
-       shr_Xsig(&fix_up, 65 + exponent);
-
-      add_two_Xsig(&accum, &fix_up, &exponent);
-
-      /* accum now contains tan(pi/2 - arg).
-        Use tan(arg) = 1.0 / tan(pi/2 - arg)
-        */
-      accumulatoro.lsw = accumulatoro.midw = 0;
-      accumulatoro.msw = 0x80000000;
-      div_Xsig(&accumulatoro, &accum, &accum);
-      exponent = - exponent - 1;
-    }
-
-  /* Transfer the result */
-  round_Xsig(&accum);
-  FPU_settag0(TAG_Valid);
-  significand(st0_ptr) = XSIG_LL(accum);
-  setexponent16(st0_ptr, exponent + EXTENDED_Ebias);  /* Result is positive. */
+
+       /* Transfer the result */
+       round_Xsig(&accum);
+       FPU_settag0(TAG_Valid);
+       significand(st0_ptr) = XSIG_LL(accum);
+       setexponent16(st0_ptr, exponent + EXTENDED_Ebias);      /* Result is positive. */
 
 }
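The poly_tan() hunk above reduces arguments near pi/2 by computing tan(pi_2 - arg) and inverting, then compensating for the finite-precision pi/2. A minimal double-precision sketch of that reduction identity, separate from the 96-bit Xsig arithmetic the emulator actually uses (the helper name is illustrative only):

#include <math.h>

/* For 0 <= arg < pi/2: above pi/4, evaluate the co-function and invert,
 * which is the same identity the invert path above relies on. */
static double tan_reduced(double arg)
{
        const double pi_2 = 1.57079632679489661923;

        if (arg > pi_2 / 2.0)
                return 1.0 / tan(pi_2 - arg);   /* tan(x) = 1/tan(pi/2 - x) */
        return tan(arg);
}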
index 7cd3b37ac084e558cd963ae32be90e82e4764e0c..deea48b9f13a41c6807ecde0ca0f7f9d4d39bd60 100644 (file)
@@ -27,7 +27,7 @@
 static
 int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
                     FPU_REG const *b, u_char tagb, u_char signb,
-                    FPU_REG *dest, int deststnr, int control_w);
+                    FPU_REG * dest, int deststnr, int control_w);
 
 /*
   Operates on st(0) and st(n), or on st(0) and temporary data.
@@ -35,340 +35,299 @@ int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
   */
 int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
 {
-  FPU_REG *a = &st(0);
-  FPU_REG *dest = &st(deststnr);
-  u_char signb = getsign(b);
-  u_char taga = FPU_gettag0();
-  u_char signa = getsign(a);
-  u_char saved_sign = getsign(dest);
-  int diff, tag, expa, expb;
-  
-  if ( !(taga | tagb) )
-    {
-      expa = exponent(a);
-      expb = exponent(b);
-
-    valid_add:
-      /* Both registers are valid */
-      if (!(signa ^ signb))
-       {
-         /* signs are the same */
-         tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb);
-       }
-      else
-       {
-         /* The signs are different, so do a subtraction */
-         diff = expa - expb;
-         if (!diff)
-           {
-             diff = a->sigh - b->sigh;  /* This works only if the ms bits
-                                           are identical. */
-             if (!diff)
-               {
-                 diff = a->sigl > b->sigl;
-                 if (!diff)
-                   diff = -(a->sigl < b->sigl);
+       FPU_REG *a = &st(0);
+       FPU_REG *dest = &st(deststnr);
+       u_char signb = getsign(b);
+       u_char taga = FPU_gettag0();
+       u_char signa = getsign(a);
+       u_char saved_sign = getsign(dest);
+       int diff, tag, expa, expb;
+
+       if (!(taga | tagb)) {
+               expa = exponent(a);
+               expb = exponent(b);
+
+             valid_add:
+               /* Both registers are valid */
+               if (!(signa ^ signb)) {
+                       /* signs are the same */
+                       tag =
+                           FPU_u_add(a, b, dest, control_w, signa, expa, expb);
+               } else {
+                       /* The signs are different, so do a subtraction */
+                       diff = expa - expb;
+                       if (!diff) {
+                               diff = a->sigh - b->sigh;       /* This works only if the ms bits
+                                                                  are identical. */
+                               if (!diff) {
+                                       diff = a->sigl > b->sigl;
+                                       if (!diff)
+                                               diff = -(a->sigl < b->sigl);
+                               }
+                       }
+
+                       if (diff > 0) {
+                               tag =
+                                   FPU_u_sub(a, b, dest, control_w, signa,
+                                             expa, expb);
+                       } else if (diff < 0) {
+                               tag =
+                                   FPU_u_sub(b, a, dest, control_w, signb,
+                                             expb, expa);
+                       } else {
+                               FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+                               /* sign depends upon rounding mode */
+                               setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+                                       ? SIGN_POS : SIGN_NEG);
+                               return TAG_Zero;
+                       }
                }
-           }
-      
-         if (diff > 0)
-           {
-             tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
-           }
-         else if ( diff < 0 )
-           {
-             tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa);
-           }
-         else
-           {
-             FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-             /* sign depends upon rounding mode */
-             setsign(dest, ((control_w & CW_RC) != RC_DOWN)
-                     ? SIGN_POS : SIGN_NEG);
-             return TAG_Zero;
-           }
-       }
 
-      if ( tag < 0 )
-       {
-         setsign(dest, saved_sign);
-         return tag;
+               if (tag < 0) {
+                       setsign(dest, saved_sign);
+                       return tag;
+               }
+               FPU_settagi(deststnr, tag);
+               return tag;
        }
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
 
-  if ( taga == TAG_Special )
-    taga = FPU_Special(a);
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
+       if (taga == TAG_Special)
+               taga = FPU_Special(a);
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
 
-  if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
+       if (((taga == TAG_Valid) && (tagb == TW_Denormal))
            || ((taga == TW_Denormal) && (tagb == TAG_Valid))
-           || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
-    {
-      FPU_REG x, y;
+           || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+               FPU_REG x, y;
+
+               if (denormal_operand() < 0)
+                       return FPU_Exception;
+
+               FPU_to_exp16(a, &x);
+               FPU_to_exp16(b, &y);
+               a = &x;
+               b = &y;
+               expa = exponent16(a);
+               expb = exponent16(b);
+               goto valid_add;
+       }
 
-      if ( denormal_operand() < 0 )
-       return FPU_Exception;
+       if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+               if (deststnr == 0)
+                       return real_2op_NaN(b, tagb, deststnr, a);
+               else
+                       return real_2op_NaN(a, taga, deststnr, a);
+       }
 
-      FPU_to_exp16(a, &x);
-      FPU_to_exp16(b, &y);
-      a = &x;
-      b = &y;
-      expa = exponent16(a);
-      expb = exponent16(b);
-      goto valid_add;
-    }
-
-  if ( (taga == TW_NaN) || (tagb == TW_NaN) )
-    {
-      if ( deststnr == 0 )
-       return real_2op_NaN(b, tagb, deststnr, a);
-      else
-       return real_2op_NaN(a, taga, deststnr, a);
-    }
-
-  return add_sub_specials(a, taga, signa, b, tagb, signb,
-                         dest, deststnr, control_w);
+       return add_sub_specials(a, taga, signa, b, tagb, signb,
+                               dest, deststnr, control_w);
 }
 
-
 /* Subtract b from a.  (a-b) -> dest */
 int FPU_sub(int flags, int rm, int control_w)
 {
-  FPU_REG const *a, *b;
-  FPU_REG *dest;
-  u_char taga, tagb, signa, signb, saved_sign, sign;
-  int diff, tag = 0, expa, expb, deststnr;
-
-  a = &st(0);
-  taga = FPU_gettag0();
-
-  deststnr = 0;
-  if ( flags & LOADED )
-    {
-      b = (FPU_REG *)rm;
-      tagb = flags & 0x0f;
-    }
-  else
-    {
-      b = &st(rm);
-      tagb = FPU_gettagi(rm);
-
-      if ( flags & DEST_RM )
-       deststnr = rm;
-    }
-
-  signa = getsign(a);
-  signb = getsign(b);
-
-  if ( flags & REV )
-    {
-      signa ^= SIGN_NEG;
-      signb ^= SIGN_NEG;
-    }
-
-  dest = &st(deststnr);
-  saved_sign = getsign(dest);
-
-  if ( !(taga | tagb) )
-    {
-      expa = exponent(a);
-      expb = exponent(b);
-
-    valid_subtract:
-      /* Both registers are valid */
-
-      diff = expa - expb;
-
-      if (!diff)
-       {
-         diff = a->sigh - b->sigh;  /* Works only if ms bits are identical */
-         if (!diff)
-           {
-             diff = a->sigl > b->sigl;
-             if (!diff)
-               diff = -(a->sigl < b->sigl);
-           }
+       FPU_REG const *a, *b;
+       FPU_REG *dest;
+       u_char taga, tagb, signa, signb, saved_sign, sign;
+       int diff, tag = 0, expa, expb, deststnr;
+
+       a = &st(0);
+       taga = FPU_gettag0();
+
+       deststnr = 0;
+       if (flags & LOADED) {
+               b = (FPU_REG *) rm;
+               tagb = flags & 0x0f;
+       } else {
+               b = &st(rm);
+               tagb = FPU_gettagi(rm);
+
+               if (flags & DEST_RM)
+                       deststnr = rm;
        }
 
-      switch ( (((int)signa)*2 + signb) / SIGN_NEG )
-       {
-       case 0: /* P - P */
-       case 3: /* N - N */
-         if (diff > 0)
-           {
-             /* |a| > |b| */
-             tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb);
-           }
-         else if ( diff == 0 )
-           {
-             FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-
-             /* sign depends upon rounding mode */
-             setsign(dest, ((control_w & CW_RC) != RC_DOWN)
-               ? SIGN_POS : SIGN_NEG);
-             return TAG_Zero;
-           }
-         else
-           {
-             sign = signa ^ SIGN_NEG;
-             tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa);
-           }
-         break;
-       case 1: /* P - N */
-         tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb);
-         break;
-       case 2: /* N - P */
-         tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb);
-         break;
+       signa = getsign(a);
+       signb = getsign(b);
+
+       if (flags & REV) {
+               signa ^= SIGN_NEG;
+               signb ^= SIGN_NEG;
+       }
+
+       dest = &st(deststnr);
+       saved_sign = getsign(dest);
+
+       if (!(taga | tagb)) {
+               expa = exponent(a);
+               expb = exponent(b);
+
+             valid_subtract:
+               /* Both registers are valid */
+
+               diff = expa - expb;
+
+               if (!diff) {
+                       diff = a->sigh - b->sigh;       /* Works only if ms bits are identical */
+                       if (!diff) {
+                               diff = a->sigl > b->sigl;
+                               if (!diff)
+                                       diff = -(a->sigl < b->sigl);
+                       }
+               }
+
+               switch ((((int)signa) * 2 + signb) / SIGN_NEG) {
+               case 0: /* P - P */
+               case 3: /* N - N */
+                       if (diff > 0) {
+                               /* |a| > |b| */
+                               tag =
+                                   FPU_u_sub(a, b, dest, control_w, signa,
+                                             expa, expb);
+                       } else if (diff == 0) {
+                               FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+
+                               /* sign depends upon rounding mode */
+                               setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+                                       ? SIGN_POS : SIGN_NEG);
+                               return TAG_Zero;
+                       } else {
+                               sign = signa ^ SIGN_NEG;
+                               tag =
+                                   FPU_u_sub(b, a, dest, control_w, sign, expb,
+                                             expa);
+                       }
+                       break;
+               case 1: /* P - N */
+                       tag =
+                           FPU_u_add(a, b, dest, control_w, SIGN_POS, expa,
+                                     expb);
+                       break;
+               case 2: /* N - P */
+                       tag =
+                           FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa,
+                                     expb);
+                       break;
 #ifdef PARANOID
-       default:
-         EXCEPTION(EX_INTERNAL|0x111);
-         return -1;
+               default:
+                       EXCEPTION(EX_INTERNAL | 0x111);
+                       return -1;
 #endif
+               }
+               if (tag < 0) {
+                       setsign(dest, saved_sign);
+                       return tag;
+               }
+               FPU_settagi(deststnr, tag);
+               return tag;
        }
-      if ( tag < 0 )
-       {
-         setsign(dest, saved_sign);
-         return tag;
-       }
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
 
-  if ( taga == TAG_Special )
-    taga = FPU_Special(a);
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
+       if (taga == TAG_Special)
+               taga = FPU_Special(a);
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
 
-  if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
+       if (((taga == TAG_Valid) && (tagb == TW_Denormal))
            || ((taga == TW_Denormal) && (tagb == TAG_Valid))
-           || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
-    {
-      FPU_REG x, y;
+           || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+               FPU_REG x, y;
 
-      if ( denormal_operand() < 0 )
-       return FPU_Exception;
+               if (denormal_operand() < 0)
+                       return FPU_Exception;
+
+               FPU_to_exp16(a, &x);
+               FPU_to_exp16(b, &y);
+               a = &x;
+               b = &y;
+               expa = exponent16(a);
+               expb = exponent16(b);
 
-      FPU_to_exp16(a, &x);
-      FPU_to_exp16(b, &y);
-      a = &x;
-      b = &y;
-      expa = exponent16(a);
-      expb = exponent16(b);
-
-      goto valid_subtract;
-    }
-
-  if ( (taga == TW_NaN) || (tagb == TW_NaN) )
-    {
-      FPU_REG const *d1, *d2;
-      if ( flags & REV )
-       {
-         d1 = b;
-         d2 = a;
+               goto valid_subtract;
        }
-      else
-       {
-         d1 = a;
-         d2 = b;
+
+       if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+               FPU_REG const *d1, *d2;
+               if (flags & REV) {
+                       d1 = b;
+                       d2 = a;
+               } else {
+                       d1 = a;
+                       d2 = b;
+               }
+               if (flags & LOADED)
+                       return real_2op_NaN(b, tagb, deststnr, d1);
+               if (flags & DEST_RM)
+                       return real_2op_NaN(a, taga, deststnr, d2);
+               else
+                       return real_2op_NaN(b, tagb, deststnr, d2);
        }
-      if ( flags & LOADED )
-       return real_2op_NaN(b, tagb, deststnr, d1);
-      if ( flags & DEST_RM )
-       return real_2op_NaN(a, taga, deststnr, d2);
-      else
-       return real_2op_NaN(b, tagb, deststnr, d2);
-    }
-
-    return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
-                           dest, deststnr, control_w);
-}
 
+       return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
+                               dest, deststnr, control_w);
+}
 
 static
 int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
                     FPU_REG const *b, u_char tagb, u_char signb,
-                    FPU_REG *dest, int deststnr, int control_w)
+                    FPU_REG * dest, int deststnr, int control_w)
 {
-  if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
-       && (denormal_operand() < 0) )
-    return FPU_Exception;
-
-  if (taga == TAG_Zero)
-    {
-      if (tagb == TAG_Zero)
-       {
-         /* Both are zero, result will be zero. */
-         u_char different_signs = signa ^ signb;
-
-         FPU_copy_to_regi(a, TAG_Zero, deststnr);
-         if ( different_signs )
-           {
-             /* Signs are different. */
-             /* Sign of answer depends upon rounding mode. */
-             setsign(dest, ((control_w & CW_RC) != RC_DOWN)
-                     ? SIGN_POS : SIGN_NEG);
-           }
-         else
-           setsign(dest, signa);  /* signa may differ from the sign of a. */
-         return TAG_Zero;
-       }
-      else
-       {
-         reg_copy(b, dest);
-         if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) )
-           {
-             /* A pseudoDenormal, convert it. */
-             addexponent(dest, 1);
-             tagb = TAG_Valid;
-           }
-         else if ( tagb > TAG_Empty )
-           tagb = TAG_Special;
-         setsign(dest, signb);  /* signb may differ from the sign of b. */
-         FPU_settagi(deststnr, tagb);
-         return tagb;
-       }
-    }
-  else if (tagb == TAG_Zero)
-    {
-      reg_copy(a, dest);
-      if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) )
-       {
-         /* A pseudoDenormal */
-         addexponent(dest, 1);
-         taga = TAG_Valid;
-       }
-      else if ( taga > TAG_Empty )
-       taga = TAG_Special;
-      setsign(dest, signa);  /* signa may differ from the sign of a. */
-      FPU_settagi(deststnr, taga);
-      return taga;
-    }
-  else if (taga == TW_Infinity)
-    {
-      if ( (tagb != TW_Infinity) || (signa == signb) )
-       {
-         FPU_copy_to_regi(a, TAG_Special, deststnr);
-         setsign(dest, signa);  /* signa may differ from the sign of a. */
-         return taga;
+       if (((taga == TW_Denormal) || (tagb == TW_Denormal))
+           && (denormal_operand() < 0))
+               return FPU_Exception;
+
+       if (taga == TAG_Zero) {
+               if (tagb == TAG_Zero) {
+                       /* Both are zero, result will be zero. */
+                       u_char different_signs = signa ^ signb;
+
+                       FPU_copy_to_regi(a, TAG_Zero, deststnr);
+                       if (different_signs) {
+                               /* Signs are different. */
+                               /* Sign of answer depends upon rounding mode. */
+                               setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+                                       ? SIGN_POS : SIGN_NEG);
+                       } else
+                               setsign(dest, signa);   /* signa may differ from the sign of a. */
+                       return TAG_Zero;
+               } else {
+                       reg_copy(b, dest);
+                       if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) {
+                               /* A pseudoDenormal, convert it. */
+                               addexponent(dest, 1);
+                               tagb = TAG_Valid;
+                       } else if (tagb > TAG_Empty)
+                               tagb = TAG_Special;
+                       setsign(dest, signb);   /* signb may differ from the sign of b. */
+                       FPU_settagi(deststnr, tagb);
+                       return tagb;
+               }
+       } else if (tagb == TAG_Zero) {
+               reg_copy(a, dest);
+               if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) {
+                       /* A pseudoDenormal */
+                       addexponent(dest, 1);
+                       taga = TAG_Valid;
+               } else if (taga > TAG_Empty)
+                       taga = TAG_Special;
+               setsign(dest, signa);   /* signa may differ from the sign of a. */
+               FPU_settagi(deststnr, taga);
+               return taga;
+       } else if (taga == TW_Infinity) {
+               if ((tagb != TW_Infinity) || (signa == signb)) {
+                       FPU_copy_to_regi(a, TAG_Special, deststnr);
+                       setsign(dest, signa);   /* signa may differ from the sign of a. */
+                       return taga;
+               }
+               /* Infinity-Infinity is undefined. */
+               return arith_invalid(deststnr);
+       } else if (tagb == TW_Infinity) {
+               FPU_copy_to_regi(b, TAG_Special, deststnr);
+               setsign(dest, signb);   /* signb may differ from the sign of b. */
+               return tagb;
        }
-      /* Infinity-Infinity is undefined. */
-      return arith_invalid(deststnr);
-    }
-  else if (tagb == TW_Infinity)
-    {
-      FPU_copy_to_regi(b, TAG_Special, deststnr);
-      setsign(dest, signb);  /* signb may differ from the sign of b. */
-      return tagb;
-    }
-
 #ifdef PARANOID
-  EXCEPTION(EX_INTERNAL|0x101);
+       EXCEPTION(EX_INTERNAL | 0x101);
 #endif
 
-  return FPU_Exception;
+       return FPU_Exception;
 }
-
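When the operand signs differ, FPU_add() and FPU_sub() above first order the operands by magnitude (exponent, then high and low significand words) so the unsigned subtraction never underflows. A standalone sketch of that ordering with a simplified register type; struct sig_reg and magnitude_cmp are illustrative stand-ins, not names from the file:

#include <stdint.h>

struct sig_reg {                        /* simplified stand-in for FPU_REG */
        uint32_t sigl, sigh;            /* low/high words of the significand */
};

/* Returns >0 if |a| > |b|, <0 if |a| < |b|, 0 if equal, assuming both
 * significands are normalised as in the valid_add/valid_subtract paths. */
static int magnitude_cmp(const struct sig_reg *a, int expa,
                         const struct sig_reg *b, int expb)
{
        if (expa != expb)
                return expa - expb;
        if (a->sigh != b->sigh)
                return a->sigh > b->sigh ? 1 : -1;
        if (a->sigl != b->sigl)
                return a->sigl > b->sigl ? 1 : -1;
        return 0;
}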
index f37c5b5a35ad54301c36a5dd21f1fe1c394a7cf2..ecce55fc2e2e1b6d59cc6f94891b47d80925c423 100644 (file)
 #include "control_w.h"
 #include "status_w.h"
 
-
 static int compare(FPU_REG const *b, int tagb)
 {
-  int diff, exp0, expb;
-  u_char               st0_tag;
-  FPU_REG      *st0_ptr;
-  FPU_REG      x, y;
-  u_char               st0_sign, signb = getsign(b);
-
-  st0_ptr = &st(0);
-  st0_tag = FPU_gettag0();
-  st0_sign = getsign(st0_ptr);
-
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
-  if ( st0_tag == TAG_Special )
-    st0_tag = FPU_Special(st0_ptr);
-
-  if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
-       || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) )
-    {
-      if ( st0_tag == TAG_Zero )
-       {
-         if ( tagb == TAG_Zero ) return COMP_A_eq_B;
-         if ( tagb == TAG_Valid )
-           return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
-         if ( tagb == TW_Denormal )
-           return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
-           | COMP_Denormal;
-       }
-      else if ( tagb == TAG_Zero )
-       {
-         if ( st0_tag == TAG_Valid )
-           return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
-         if ( st0_tag == TW_Denormal )
-           return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
-           | COMP_Denormal;
+       int diff, exp0, expb;
+       u_char st0_tag;
+       FPU_REG *st0_ptr;
+       FPU_REG x, y;
+       u_char st0_sign, signb = getsign(b);
+
+       st0_ptr = &st(0);
+       st0_tag = FPU_gettag0();
+       st0_sign = getsign(st0_ptr);
+
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
+       if (st0_tag == TAG_Special)
+               st0_tag = FPU_Special(st0_ptr);
+
+       if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
+           || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) {
+               if (st0_tag == TAG_Zero) {
+                       if (tagb == TAG_Zero)
+                               return COMP_A_eq_B;
+                       if (tagb == TAG_Valid)
+                               return ((signb ==
+                                        SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
+                       if (tagb == TW_Denormal)
+                               return ((signb ==
+                                        SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+                                   | COMP_Denormal;
+               } else if (tagb == TAG_Zero) {
+                       if (st0_tag == TAG_Valid)
+                               return ((st0_sign ==
+                                        SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+                       if (st0_tag == TW_Denormal)
+                               return ((st0_sign ==
+                                        SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+                                   | COMP_Denormal;
+               }
+
+               if (st0_tag == TW_Infinity) {
+                       if ((tagb == TAG_Valid) || (tagb == TAG_Zero))
+                               return ((st0_sign ==
+                                        SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+                       else if (tagb == TW_Denormal)
+                               return ((st0_sign ==
+                                        SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+                                   | COMP_Denormal;
+                       else if (tagb == TW_Infinity) {
+                               /* The 80486 book says that infinities can be equal! */
+                               return (st0_sign == signb) ? COMP_A_eq_B :
+                                   ((st0_sign ==
+                                     SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+                       }
+                       /* Fall through to the NaN code */
+               } else if (tagb == TW_Infinity) {
+                       if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero))
+                               return ((signb ==
+                                        SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
+                       if (st0_tag == TW_Denormal)
+                               return ((signb ==
+                                        SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+                                   | COMP_Denormal;
+                       /* Fall through to the NaN code */
+               }
+
+               /* The only possibility now should be that one of the arguments
+                  is a NaN */
+               if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) {
+                       int signalling = 0, unsupported = 0;
+                       if (st0_tag == TW_NaN) {
+                               signalling =
+                                   (st0_ptr->sigh & 0xc0000000) == 0x80000000;
+                               unsupported = !((exponent(st0_ptr) == EXP_OVER)
+                                               && (st0_ptr->
+                                                   sigh & 0x80000000));
+                       }
+                       if (tagb == TW_NaN) {
+                               signalling |=
+                                   (b->sigh & 0xc0000000) == 0x80000000;
+                               unsupported |= !((exponent(b) == EXP_OVER)
+                                                && (b->sigh & 0x80000000));
+                       }
+                       if (signalling || unsupported)
+                               return COMP_No_Comp | COMP_SNaN | COMP_NaN;
+                       else
+                               /* Neither is a signaling NaN */
+                               return COMP_No_Comp | COMP_NaN;
+               }
+
+               EXCEPTION(EX_Invalid);
        }
 
-      if ( st0_tag == TW_Infinity )
-       {
-         if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) )
-           return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
-         else if ( tagb == TW_Denormal )
-           return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
-             | COMP_Denormal;
-         else if ( tagb == TW_Infinity )
-           {
-             /* The 80486 book says that infinities can be equal! */
-             return (st0_sign == signb) ? COMP_A_eq_B :
-               ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
-           }
-         /* Fall through to the NaN code */
-       }
-      else if ( tagb == TW_Infinity )
-       {
-         if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) )
-           return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
-         if ( st0_tag == TW_Denormal )
-           return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
-               | COMP_Denormal;
-         /* Fall through to the NaN code */
+       if (st0_sign != signb) {
+               return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+                   | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+                      COMP_Denormal : 0);
        }
 
-      /* The only possibility now should be that one of the arguments
-        is a NaN */
-      if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) )
-       {
-         int signalling = 0, unsupported = 0;
-         if ( st0_tag == TW_NaN )
-           {
-             signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000;
-             unsupported = !((exponent(st0_ptr) == EXP_OVER)
-                             && (st0_ptr->sigh & 0x80000000));
-           }
-         if ( tagb == TW_NaN )
-           {
-             signalling |= (b->sigh & 0xc0000000) == 0x80000000;
-             unsupported |= !((exponent(b) == EXP_OVER)
-                              && (b->sigh & 0x80000000));
-           }
-         if ( signalling || unsupported )
-           return COMP_No_Comp | COMP_SNaN | COMP_NaN;
-         else
-           /* Neither is a signaling NaN */
-           return COMP_No_Comp | COMP_NaN;
+       if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) {
+               FPU_to_exp16(st0_ptr, &x);
+               FPU_to_exp16(b, &y);
+               st0_ptr = &x;
+               b = &y;
+               exp0 = exponent16(st0_ptr);
+               expb = exponent16(b);
+       } else {
+               exp0 = exponent(st0_ptr);
+               expb = exponent(b);
        }
-      
-      EXCEPTION(EX_Invalid);
-    }
-  
-  if (st0_sign != signb)
-    {
-      return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
-       | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
-           COMP_Denormal : 0);
-    }
-
-  if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) )
-    {
-      FPU_to_exp16(st0_ptr, &x);
-      FPU_to_exp16(b, &y);
-      st0_ptr = &x;
-      b = &y;
-      exp0 = exponent16(st0_ptr);
-      expb = exponent16(b);
-    }
-  else
-    {
-      exp0 = exponent(st0_ptr);
-      expb = exponent(b);
-    }
 
 #ifdef PARANOID
-  if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
-  if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid);
+       if (!(st0_ptr->sigh & 0x80000000))
+               EXCEPTION(EX_Invalid);
+       if (!(b->sigh & 0x80000000))
+               EXCEPTION(EX_Invalid);
 #endif /* PARANOID */
 
-  diff = exp0 - expb;
-  if ( diff == 0 )
-    {
-      diff = st0_ptr->sigh - b->sigh;  /* Works only if ms bits are
-                                             identical */
-      if ( diff == 0 )
-       {
-       diff = st0_ptr->sigl > b->sigl;
-       if ( diff == 0 )
-         diff = -(st0_ptr->sigl < b->sigl);
+       diff = exp0 - expb;
+       if (diff == 0) {
+               diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
+                                                  identical */
+               if (diff == 0) {
+                       diff = st0_ptr->sigl > b->sigl;
+                       if (diff == 0)
+                               diff = -(st0_ptr->sigl < b->sigl);
+               }
        }
-    }
-
-  if ( diff > 0 )
-    {
-      return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
-       | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
-           COMP_Denormal : 0);
-    }
-  if ( diff < 0 )
-    {
-      return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
-       | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
-           COMP_Denormal : 0);
-    }
-
-  return COMP_A_eq_B
-    | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
-       COMP_Denormal : 0);
 
-}
+       if (diff > 0) {
+               return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+                   | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+                      COMP_Denormal : 0);
+       }
+       if (diff < 0) {
+               return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+                   | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+                      COMP_Denormal : 0);
+       }
 
+       return COMP_A_eq_B
+           | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+              COMP_Denormal : 0);
+
+}
 
 /* This function requires that st(0) is not empty */
 int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
 {
-  int f = 0, c;
-
-  c = compare(loaded_data, loaded_tag);
-
-  if (c & COMP_NaN)
-    {
-      EXCEPTION(EX_Invalid);
-      f = SW_C3 | SW_C2 | SW_C0;
-    }
-  else
-    switch (c & 7)
-      {
-      case COMP_A_lt_B:
-       f = SW_C0;
-       break;
-      case COMP_A_eq_B:
-       f = SW_C3;
-       break;
-      case COMP_A_gt_B:
-       f = 0;
-       break;
-      case COMP_No_Comp:
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
+       int f = 0, c;
+
+       c = compare(loaded_data, loaded_tag);
+
+       if (c & COMP_NaN) {
+               EXCEPTION(EX_Invalid);
+               f = SW_C3 | SW_C2 | SW_C0;
+       } else
+               switch (c & 7) {
+               case COMP_A_lt_B:
+                       f = SW_C0;
+                       break;
+               case COMP_A_eq_B:
+                       f = SW_C3;
+                       break;
+               case COMP_A_gt_B:
+                       f = 0;
+                       break;
+               case COMP_No_Comp:
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
 #ifdef PARANOID
-      default:
-       EXCEPTION(EX_INTERNAL|0x121);
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
+               default:
+                       EXCEPTION(EX_INTERNAL | 0x121);
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
 #endif /* PARANOID */
-      }
-  setcc(f);
-  if (c & COMP_Denormal)
-    {
-      return denormal_operand() < 0;
-    }
-  return 0;
+               }
+       setcc(f);
+       if (c & COMP_Denormal) {
+               return denormal_operand() < 0;
+       }
+       return 0;
 }
 
-
 static int compare_st_st(int nr)
 {
-  int f = 0, c;
-  FPU_REG *st_ptr;
-
-  if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
-    {
-      setcc(SW_C3 | SW_C2 | SW_C0);
-      /* Stack fault */
-      EXCEPTION(EX_StackUnder);
-      return !(control_word & CW_Invalid);
-    }
-
-  st_ptr = &st(nr);
-  c = compare(st_ptr, FPU_gettagi(nr));
-  if (c & COMP_NaN)
-    {
-      setcc(SW_C3 | SW_C2 | SW_C0);
-      EXCEPTION(EX_Invalid);
-      return !(control_word & CW_Invalid);
-    }
-  else
-    switch (c & 7)
-      {
-      case COMP_A_lt_B:
-       f = SW_C0;
-       break;
-      case COMP_A_eq_B:
-       f = SW_C3;
-       break;
-      case COMP_A_gt_B:
-       f = 0;
-       break;
-      case COMP_No_Comp:
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
+       int f = 0, c;
+       FPU_REG *st_ptr;
+
+       if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+               setcc(SW_C3 | SW_C2 | SW_C0);
+               /* Stack fault */
+               EXCEPTION(EX_StackUnder);
+               return !(control_word & CW_Invalid);
+       }
+
+       st_ptr = &st(nr);
+       c = compare(st_ptr, FPU_gettagi(nr));
+       if (c & COMP_NaN) {
+               setcc(SW_C3 | SW_C2 | SW_C0);
+               EXCEPTION(EX_Invalid);
+               return !(control_word & CW_Invalid);
+       } else
+               switch (c & 7) {
+               case COMP_A_lt_B:
+                       f = SW_C0;
+                       break;
+               case COMP_A_eq_B:
+                       f = SW_C3;
+                       break;
+               case COMP_A_gt_B:
+                       f = 0;
+                       break;
+               case COMP_No_Comp:
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
 #ifdef PARANOID
-      default:
-       EXCEPTION(EX_INTERNAL|0x122);
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
+               default:
+                       EXCEPTION(EX_INTERNAL | 0x122);
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
 #endif /* PARANOID */
-      }
-  setcc(f);
-  if (c & COMP_Denormal)
-    {
-      return denormal_operand() < 0;
-    }
-  return 0;
+               }
+       setcc(f);
+       if (c & COMP_Denormal) {
+               return denormal_operand() < 0;
+       }
+       return 0;
 }
 
-
 static int compare_u_st_st(int nr)
 {
-  int f = 0, c;
-  FPU_REG *st_ptr;
-
-  if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) )
-    {
-      setcc(SW_C3 | SW_C2 | SW_C0);
-      /* Stack fault */
-      EXCEPTION(EX_StackUnder);
-      return !(control_word & CW_Invalid);
-    }
-
-  st_ptr = &st(nr);
-  c = compare(st_ptr, FPU_gettagi(nr));
-  if (c & COMP_NaN)
-    {
-      setcc(SW_C3 | SW_C2 | SW_C0);
-      if (c & COMP_SNaN)       /* This is the only difference between
-                                 un-ordered and ordinary comparisons */
-       {
-         EXCEPTION(EX_Invalid);
-         return !(control_word & CW_Invalid);
+       int f = 0, c;
+       FPU_REG *st_ptr;
+
+       if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+               setcc(SW_C3 | SW_C2 | SW_C0);
+               /* Stack fault */
+               EXCEPTION(EX_StackUnder);
+               return !(control_word & CW_Invalid);
        }
-      return 0;
-    }
-  else
-    switch (c & 7)
-      {
-      case COMP_A_lt_B:
-       f = SW_C0;
-       break;
-      case COMP_A_eq_B:
-       f = SW_C3;
-       break;
-      case COMP_A_gt_B:
-       f = 0;
-       break;
-      case COMP_No_Comp:
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
+
+       st_ptr = &st(nr);
+       c = compare(st_ptr, FPU_gettagi(nr));
+       if (c & COMP_NaN) {
+               setcc(SW_C3 | SW_C2 | SW_C0);
+               if (c & COMP_SNaN) {    /* This is the only difference between
+                                          un-ordered and ordinary comparisons */
+                       EXCEPTION(EX_Invalid);
+                       return !(control_word & CW_Invalid);
+               }
+               return 0;
+       } else
+               switch (c & 7) {
+               case COMP_A_lt_B:
+                       f = SW_C0;
+                       break;
+               case COMP_A_eq_B:
+                       f = SW_C3;
+                       break;
+               case COMP_A_gt_B:
+                       f = 0;
+                       break;
+               case COMP_No_Comp:
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
 #ifdef PARANOID
-      default:
-       EXCEPTION(EX_INTERNAL|0x123);
-       f = SW_C3 | SW_C2 | SW_C0;
-       break;
-#endif /* PARANOID */ 
-      }
-  setcc(f);
-  if (c & COMP_Denormal)
-    {
-      return denormal_operand() < 0;
-    }
-  return 0;
+               default:
+                       EXCEPTION(EX_INTERNAL | 0x123);
+                       f = SW_C3 | SW_C2 | SW_C0;
+                       break;
+#endif /* PARANOID */
+               }
+       setcc(f);
+       if (c & COMP_Denormal) {
+               return denormal_operand() < 0;
+       }
+       return 0;
 }
 
 /*---------------------------------------------------------------------------*/
 
 void fcom_st(void)
 {
-  /* fcom st(i) */
-  compare_st_st(FPU_rm);
+       /* fcom st(i) */
+       compare_st_st(FPU_rm);
 }
 
-
 void fcompst(void)
 {
-  /* fcomp st(i) */
-  if ( !compare_st_st(FPU_rm) )
-    FPU_pop();
+       /* fcomp st(i) */
+       if (!compare_st_st(FPU_rm))
+               FPU_pop();
 }
 
-
 void fcompp(void)
 {
-  /* fcompp */
-  if (FPU_rm != 1)
-    {
-      FPU_illegal();
-      return;
-    }
-  if ( !compare_st_st(1) )
-      poppop();
+       /* fcompp */
+       if (FPU_rm != 1) {
+               FPU_illegal();
+               return;
+       }
+       if (!compare_st_st(1))
+               poppop();
 }
 
-
 void fucom_(void)
 {
-  /* fucom st(i) */
-  compare_u_st_st(FPU_rm);
+       /* fucom st(i) */
+       compare_u_st_st(FPU_rm);
 
 }
 
-
 void fucomp(void)
 {
-  /* fucomp st(i) */
-  if ( !compare_u_st_st(FPU_rm) )
-    FPU_pop();
+       /* fucomp st(i) */
+       if (!compare_u_st_st(FPU_rm))
+               FPU_pop();
 }
 
-
 void fucompp(void)
 {
-  /* fucompp */
-  if (FPU_rm == 1)
-    {
-      if ( !compare_u_st_st(1) )
-       poppop();
-    }
-  else
-    FPU_illegal();
+       /* fucompp */
+       if (FPU_rm == 1) {
+               if (!compare_u_st_st(1))
+                       poppop();
+       } else
+               FPU_illegal();
 }
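The compare helpers above all funnel into the same mapping from a comparison result to the x87 condition-code bits C0/C2/C3. A self-contained sketch of that mapping, assuming the standard status-word bit positions; the enum and helper name are illustrative:

#define SW_C0   0x0100                  /* status word bit 8 */
#define SW_C2   0x0400                  /* status word bit 10 */
#define SW_C3   0x4000                  /* status word bit 14 */

enum cmp_result { CMP_LT, CMP_EQ, CMP_GT, CMP_UNORDERED };

static int cc_from_compare(enum cmp_result c)
{
        switch (c) {
        case CMP_LT:
                return SW_C0;                   /* st(0) < operand */
        case CMP_EQ:
                return SW_C3;                   /* st(0) == operand */
        case CMP_GT:
                return 0;                       /* st(0) > operand */
        default:
                return SW_C0 | SW_C2 | SW_C3;   /* unordered (NaN involved) */
        }
}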
index a85015801969ce8f92eaab01b33a105691c27db6..04869e64b18efdd8e87a7f7b2de7995cfe2a5f9b 100644 (file)
 #include "reg_constant.h"
 #include "control_w.h"
 
-
 #define MAKE_REG(s,e,l,h) { l, h, \
                             ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
 
-FPU_REG const CONST_1    = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
+FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
 #if 0
-FPU_REG const CONST_2    = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
+FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
 FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
-#endif  /*  0  */
-static FPU_REG const CONST_L2T  = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
-static FPU_REG const CONST_L2E  = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
-FPU_REG const CONST_PI   = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
-FPU_REG const CONST_PI2  = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
-FPU_REG const CONST_PI4  = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
-static FPU_REG const CONST_LG2  = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
-static FPU_REG const CONST_LN2  = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
+#endif /*  0  */
+static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
+static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
+FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
+FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
+FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
+static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
+static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
 
 /* Extra bits to take pi/2 to more than 128 bits precision. */
 FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
-                                        0xfc8f8cbb, 0xece675d1);
+                                       0xfc8f8cbb, 0xece675d1);
 
 /* Only the sign (and tag) is used in internal zeroes */
-FPU_REG const CONST_Z    = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
+FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
 
 /* Only the sign and significand (and tag) are used in internal NaNs */
 /* The 80486 never generates one of these 
@@ -48,24 +47,22 @@ FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
 FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
 
 /* Only the sign (and tag) is used in internal infinities */
-FPU_REG const CONST_INF  = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
-
+FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
 
 static void fld_const(FPU_REG const *c, int adj, u_char tag)
 {
-  FPU_REG *st_new_ptr;
-
-  if ( STACK_OVERFLOW )
-    {
-      FPU_stack_overflow();
-      return;
-    }
-  push();
-  reg_copy(c, st_new_ptr);
-  st_new_ptr->sigl += adj;  /* For all our fldxxx constants, we don't need to
-                              borrow or carry. */
-  FPU_settag0(tag);
-  clear_C1();
+       FPU_REG *st_new_ptr;
+
+       if (STACK_OVERFLOW) {
+               FPU_stack_overflow();
+               return;
+       }
+       push();
+       reg_copy(c, st_new_ptr);
+       st_new_ptr->sigl += adj;        /* For all our fldxxx constants, we don't need to
+                                          borrow or carry. */
+       FPU_settag0(tag);
+       clear_C1();
 }
 
 /* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
@@ -75,46 +72,46 @@ static void fld_const(FPU_REG const *c, int adj, u_char tag)
 
 static void fld1(int rc)
 {
-  fld_const(&CONST_1, 0, TAG_Valid);
+       fld_const(&CONST_1, 0, TAG_Valid);
 }
 
 static void fldl2t(int rc)
 {
-  fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
+       fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
 }
 
 static void fldl2e(int rc)
 {
-  fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+       fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
 }
 
 static void fldpi(int rc)
 {
-  fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+       fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
 }
 
 static void fldlg2(int rc)
 {
-  fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+       fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
 }
 
 static void fldln2(int rc)
 {
-  fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+       fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
 }
 
 static void fldz(int rc)
 {
-  fld_const(&CONST_Z, 0, TAG_Zero);
+       fld_const(&CONST_Z, 0, TAG_Zero);
 }
 
-typedef void (*FUNC_RC)(int);
+typedef void (*FUNC_RC) (int);
 
 static FUNC_RC constants_table[] = {
-  fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal
+       fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
 };
 
 void fconst(void)
 {
-  (constants_table[FPU_rm])(control_word & CW_RC);
+       (constants_table[FPU_rm]) (control_word & CW_RC);
 }
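fld_const() above nudges the stored round-to-nearest constants by one unit in the last place when a directed rounding mode is selected; DOWN_OR_CHOP() is the mask test hinted at in the comment before fld1(). A sketch of that test, assuming the usual x87 rounding-control encodings; const_ulp_adjust is an illustrative helper, not part of the file:

#define RC_RND          0x0000          /* round to nearest even */
#define RC_DOWN         0x0400          /* round toward -infinity */
#define RC_UP           0x0800          /* round toward +infinity */
#define RC_CHOP         0x0C00          /* truncate toward zero */

/* RC_DOWN and RC_CHOP are the only two modes with bit 0x0400 set, so a
 * single mask test covers both "round toward smaller" modes. */
#define DOWN_OR_CHOP(rc)        ((rc) & RC_DOWN)

/* One-ulp adjustment passed as 'adj' to fld_const() by fldl2e/fldpi/fldlg2/
 * fldln2 above, so the loaded constant is correctly rounded for the mode. */
static int const_ulp_adjust(int rc)
{
        return DOWN_OR_CHOP(rc) ? -1 : 0;
}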
index 45a258752703fd1ce42e18d54bf840afcaa44cd4..108060779977f3dbc417ee219fce7576aa47b62f 100644 (file)
 #include "exception.h"
 #include "fpu_emu.h"
 
-
 int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
 {
-  int sign = getsign(a);
-
-  *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
-
-  /* Set up the exponent as a 16 bit quantity. */
-  setexponent16(x, exponent(a));
-
-  if ( exponent16(x) == EXP_UNDER )
-    {
-      /* The number is a de-normal or pseudodenormal. */
-      /* We only deal with the significand and exponent. */
-
-      if (x->sigh & 0x80000000)
-       {
-         /* Is a pseudodenormal. */
-         /* This is non-80486 behaviour because the number
-            loses its 'denormal' identity. */
-         addexponent(x, 1);
-       }
-      else
-       {
-         /* Is a denormal. */
-         addexponent(x, 1);
-         FPU_normalize_nuo(x);
+       int sign = getsign(a);
+
+       *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
+
+       /* Set up the exponent as a 16 bit quantity. */
+       setexponent16(x, exponent(a));
+
+       if (exponent16(x) == EXP_UNDER) {
+               /* The number is a de-normal or pseudodenormal. */
+               /* We only deal with the significand and exponent. */
+
+               if (x->sigh & 0x80000000) {
+                       /* Is a pseudodenormal. */
+                       /* This is non-80486 behaviour because the number
+                          loses its 'denormal' identity. */
+                       addexponent(x, 1);
+               } else {
+                       /* Is a denormal. */
+                       addexponent(x, 1);
+                       FPU_normalize_nuo(x);
+               }
        }
-    }
 
-  if ( !(x->sigh & 0x80000000) )
-    {
-      EXCEPTION(EX_INTERNAL | 0x180);
-    }
+       if (!(x->sigh & 0x80000000)) {
+               EXCEPTION(EX_INTERNAL | 0x180);
+       }
 
-  return sign;
+       return sign;
 }
-
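FPU_to_exp16() above widens the exponent to a 16-bit internal range so denormals can be renormalised (top significand bit set) before further arithmetic. A minimal sketch of that renormalisation step using a flat 64-bit significand; struct exp16_reg and normalize_exp16 are illustrative, not the emulator's types:

#include <stdint.h>

struct exp16_reg {
        int16_t  exp;                   /* 16-bit internal exponent */
        uint64_t sig;                   /* significand; bit 63 set once normalised */
};

static void normalize_exp16(struct exp16_reg *x)
{
        if (!x->sig)
                return;                         /* nothing to normalise for zero */
        while (!(x->sig & (1ULL << 63))) {
                x->sig <<= 1;                   /* shift the significand up... */
                x->exp--;                       /* ...and compensate in the exponent */
        }
}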
index 5cee7ff920d93b8d8630134b6c5904c8e9cc1718..6827012db341324c5388e70c49fc3e51ca8d2d81 100644 (file)
   */
 int FPU_div(int flags, int rm, int control_w)
 {
-  FPU_REG x, y;
-  FPU_REG const *a, *b, *st0_ptr, *st_ptr;
-  FPU_REG *dest;
-  u_char taga, tagb, signa, signb, sign, saved_sign;
-  int tag, deststnr;
-
-  if ( flags & DEST_RM )
-    deststnr = rm;
-  else
-    deststnr = 0;
-
-  if ( flags & REV )
-    {
-      b = &st(0);
-      st0_ptr = b;
-      tagb = FPU_gettag0();
-      if ( flags & LOADED )
-       {
-         a = (FPU_REG *)rm;
-         taga = flags & 0x0f;
+       FPU_REG x, y;
+       FPU_REG const *a, *b, *st0_ptr, *st_ptr;
+       FPU_REG *dest;
+       u_char taga, tagb, signa, signb, sign, saved_sign;
+       int tag, deststnr;
+
+       if (flags & DEST_RM)
+               deststnr = rm;
+       else
+               deststnr = 0;
+
+       if (flags & REV) {
+               b = &st(0);
+               st0_ptr = b;
+               tagb = FPU_gettag0();
+               if (flags & LOADED) {
+                       a = (FPU_REG *) rm;
+                       taga = flags & 0x0f;
+               } else {
+                       a = &st(rm);
+                       st_ptr = a;
+                       taga = FPU_gettagi(rm);
+               }
+       } else {
+               a = &st(0);
+               st0_ptr = a;
+               taga = FPU_gettag0();
+               if (flags & LOADED) {
+                       b = (FPU_REG *) rm;
+                       tagb = flags & 0x0f;
+               } else {
+                       b = &st(rm);
+                       st_ptr = b;
+                       tagb = FPU_gettagi(rm);
+               }
        }
-      else
-       {
-         a = &st(rm);
-         st_ptr = a;
-         taga = FPU_gettagi(rm);
-       }
-    }
-  else
-    {
-      a = &st(0);
-      st0_ptr = a;
-      taga = FPU_gettag0();
-      if ( flags & LOADED )
-       {
-         b = (FPU_REG *)rm;
-         tagb = flags & 0x0f;
-       }
-      else
-       {
-         b = &st(rm);
-         st_ptr = b;
-         tagb = FPU_gettagi(rm);
-       }
-    }
 
-  signa = getsign(a);
-  signb = getsign(b);
+       signa = getsign(a);
+       signb = getsign(b);
 
-  sign = signa ^ signb;
+       sign = signa ^ signb;
 
-  dest = &st(deststnr);
-  saved_sign = getsign(dest);
+       dest = &st(deststnr);
+       saved_sign = getsign(dest);
 
-  if ( !(taga | tagb) )
-    {
-      /* Both regs Valid, this should be the most common case. */
-      reg_copy(a, &x);
-      reg_copy(b, &y);
-      setpositive(&x);
-      setpositive(&y);
-      tag = FPU_u_div(&x, &y, dest, control_w, sign);
+       if (!(taga | tagb)) {
+               /* Both regs Valid, this should be the most common case. */
+               reg_copy(a, &x);
+               reg_copy(b, &y);
+               setpositive(&x);
+               setpositive(&y);
+               tag = FPU_u_div(&x, &y, dest, control_w, sign);
 
-      if ( tag < 0 )
-       return tag;
+               if (tag < 0)
+                       return tag;
 
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
+               FPU_settagi(deststnr, tag);
+               return tag;
+       }
 
-  if ( taga == TAG_Special )
-    taga = FPU_Special(a);
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
+       if (taga == TAG_Special)
+               taga = FPU_Special(a);
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
 
-  if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
+       if (((taga == TAG_Valid) && (tagb == TW_Denormal))
            || ((taga == TW_Denormal) && (tagb == TAG_Valid))
-           || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
-    {
-      if ( denormal_operand() < 0 )
-       return FPU_Exception;
-
-      FPU_to_exp16(a, &x);
-      FPU_to_exp16(b, &y);
-      tag = FPU_u_div(&x, &y, dest, control_w, sign);
-      if ( tag < 0 )
-       return tag;
-
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
-  else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
-    {
-      if ( tagb != TAG_Zero )
-       {
-         /* Want to find Zero/Valid */
-         if ( tagb == TW_Denormal )
-           {
-             if ( denormal_operand() < 0 )
-               return FPU_Exception;
-           }
-
-         /* The result is zero. */
-         FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-         setsign(dest, sign);
-         return TAG_Zero;
+           || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+               if (denormal_operand() < 0)
+                       return FPU_Exception;
+
+               FPU_to_exp16(a, &x);
+               FPU_to_exp16(b, &y);
+               tag = FPU_u_div(&x, &y, dest, control_w, sign);
+               if (tag < 0)
+                       return tag;
+
+               FPU_settagi(deststnr, tag);
+               return tag;
+       } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
+               if (tagb != TAG_Zero) {
+                       /* Want to find Zero/Valid */
+                       if (tagb == TW_Denormal) {
+                               if (denormal_operand() < 0)
+                                       return FPU_Exception;
+                       }
+
+                       /* The result is zero. */
+                       FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+                       setsign(dest, sign);
+                       return TAG_Zero;
+               }
+               /* We have an exception condition, either 0/0 or Valid/Zero. */
+               if (taga == TAG_Zero) {
+                       /* 0/0 */
+                       return arith_invalid(deststnr);
+               }
+               /* Valid/Zero */
+               return FPU_divide_by_zero(deststnr, sign);
        }
-      /* We have an exception condition, either 0/0 or Valid/Zero. */
-      if ( taga == TAG_Zero )
-       {
-         /* 0/0 */
-         return arith_invalid(deststnr);
+       /* Must have infinities, NaNs, etc */
+       else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+               if (flags & LOADED)
+                       return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0,
+                                           st0_ptr);
+
+               if (flags & DEST_RM) {
+                       int tag;
+                       tag = FPU_gettag0();
+                       if (tag == TAG_Special)
+                               tag = FPU_Special(st0_ptr);
+                       return real_2op_NaN(st0_ptr, tag, rm,
+                                           (flags & REV) ? st0_ptr : &st(rm));
+               } else {
+                       int tag;
+                       tag = FPU_gettagi(rm);
+                       if (tag == TAG_Special)
+                               tag = FPU_Special(&st(rm));
+                       return real_2op_NaN(&st(rm), tag, 0,
+                                           (flags & REV) ? st0_ptr : &st(rm));
+               }
+       } else if (taga == TW_Infinity) {
+               if (tagb == TW_Infinity) {
+                       /* infinity/infinity */
+                       return arith_invalid(deststnr);
+               } else {
+                       /* tagb must be Valid or Zero */
+                       if ((tagb == TW_Denormal) && (denormal_operand() < 0))
+                               return FPU_Exception;
+
+                       /* Infinity divided by Zero or Valid does
+                          not raise an exception, but returns Infinity */
+                       FPU_copy_to_regi(a, TAG_Special, deststnr);
+                       setsign(dest, sign);
+                       return taga;
+               }
+       } else if (tagb == TW_Infinity) {
+               if ((taga == TW_Denormal) && (denormal_operand() < 0))
+                       return FPU_Exception;
+
+               /* The result is zero. */
+               FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+               setsign(dest, sign);
+               return TAG_Zero;
        }
-      /* Valid/Zero */
-      return FPU_divide_by_zero(deststnr, sign);
-    }
-  /* Must have infinities, NaNs, etc */
-  else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
-    {
-      if ( flags & LOADED )
-       return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr);
-
-      if ( flags & DEST_RM )
-       {
-         int tag;
-         tag = FPU_gettag0();
-         if ( tag == TAG_Special )
-           tag = FPU_Special(st0_ptr);
-         return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm));
-       }
-      else
-       {
-         int tag;
-         tag = FPU_gettagi(rm);
-         if ( tag == TAG_Special )
-           tag = FPU_Special(&st(rm));
-         return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm));
-       }
-    }
-  else if (taga == TW_Infinity)
-    {
-      if (tagb == TW_Infinity)
-       {
-         /* infinity/infinity */
-         return arith_invalid(deststnr);
-       }
-      else
-       {
-         /* tagb must be Valid or Zero */
-         if ( (tagb == TW_Denormal) && (denormal_operand() < 0) )
-           return FPU_Exception;
-         
-         /* Infinity divided by Zero or Valid does
-            not raise and exception, but returns Infinity */
-         FPU_copy_to_regi(a, TAG_Special, deststnr);
-         setsign(dest, sign);
-         return taga;
-       }
-    }
-  else if (tagb == TW_Infinity)
-    {
-      if ( (taga == TW_Denormal) && (denormal_operand() < 0) )
-       return FPU_Exception;
-
-      /* The result is zero. */
-      FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-      setsign(dest, sign);
-      return TAG_Zero;
-    }
 #ifdef PARANOID
-  else
-    {
-      EXCEPTION(EX_INTERNAL|0x102);
-      return FPU_Exception;
-    }
-#endif /* PARANOID */ 
+       else {
+               EXCEPTION(EX_INTERNAL | 0x102);
+               return FPU_Exception;
+       }
+#endif /* PARANOID */
 
        return 0;
 }
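
For reference (illustrative sketch, not part of the patch): the tag dispatch in the division routine above reproduces the IEEE-754 special cases for a quotient. The same rules can be shown in a few lines of standalone C using native doubles; div_case is an illustrative name, not an emulator function.

	#include <stdio.h>
	#include <math.h>

	/* Classify a/b the way the emulator's special-case branches do. */
	static const char *div_case(double a, double b)
	{
		if (isnan(a) || isnan(b))
			return "NaN operand -> NaN result";
		if (isinf(a) && isinf(b))
			return "inf/inf -> invalid operation (QNaN)";
		if (isinf(a))
			return "inf/finite or inf/0 -> infinity, no exception";
		if (isinf(b))
			return "finite/inf -> zero";
		if (a == 0.0 && b == 0.0)
			return "0/0 -> invalid operation (QNaN)";
		if (b == 0.0)
			return "finite/0 -> divide-by-zero (infinity)";
		return "ordinary quotient";
	}

	int main(void)
	{
		printf("%s\n", div_case(0.0, 0.0));
		printf("%s\n", div_case(1.0, 0.0));
		printf("%s\n", div_case(INFINITY, INFINITY));
		printf("%s\n", div_case(1.0, INFINITY));
		return 0;
	}
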
index e976caef64982849f3caa918ec5181bb3958b237..799d4af5be66381da440d545d94e0d71ebf4bac5 100644 (file)
 #include "control_w.h"
 #include "status_w.h"
 
-
-#define DOUBLE_Emax 1023         /* largest valid exponent */
+#define DOUBLE_Emax 1023       /* largest valid exponent */
 #define DOUBLE_Ebias 1023
-#define DOUBLE_Emin (-1022)      /* smallest valid exponent */
+#define DOUBLE_Emin (-1022)    /* smallest valid exponent */
 
-#define SINGLE_Emax 127          /* largest valid exponent */
+#define SINGLE_Emax 127                /* largest valid exponent */
 #define SINGLE_Ebias 127
-#define SINGLE_Emin (-126)       /* smallest valid exponent */
-
+#define SINGLE_Emin (-126)     /* smallest valid exponent */
 
 static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
 {
-  u_char tag;
+       u_char tag;
 
-  setexponent16(r, exp);
+       setexponent16(r, exp);
 
-  tag = FPU_normalize_nuo(r);
-  stdexp(r);
-  if ( sign )
-    setnegative(r);
+       tag = FPU_normalize_nuo(r);
+       stdexp(r);
+       if (sign)
+               setnegative(r);
 
-  return tag;
+       return tag;
 }
 
-
 int FPU_tagof(FPU_REG *ptr)
 {
-  int exp;
-
-  exp = exponent16(ptr) & 0x7fff;
-  if ( exp == 0 )
-    {
-      if ( !(ptr->sigh | ptr->sigl) )
-       {
-         return TAG_Zero;
+       int exp;
+
+       exp = exponent16(ptr) & 0x7fff;
+       if (exp == 0) {
+               if (!(ptr->sigh | ptr->sigl)) {
+                       return TAG_Zero;
+               }
+               /* The number is a de-normal or pseudodenormal. */
+               return TAG_Special;
+       }
+
+       if (exp == 0x7fff) {
+               /* Is an Infinity, a NaN, or an unsupported data type. */
+               return TAG_Special;
        }
-      /* The number is a de-normal or pseudodenormal. */
-      return TAG_Special;
-    }
-
-  if ( exp == 0x7fff )
-    {
-      /* Is an Infinity, a NaN, or an unsupported data type. */
-      return TAG_Special;
-    }
-
-  if ( !(ptr->sigh & 0x80000000) )
-    {
-      /* Unsupported data type. */
-      /* Valid numbers have the ms bit set to 1. */
-      /* Unnormal. */
-      return TAG_Special;
-    }
-
-  return TAG_Valid;
-}
 
+       if (!(ptr->sigh & 0x80000000)) {
+               /* Unsupported data type. */
+               /* Valid numbers have the ms bit set to 1. */
+               /* Unnormal. */
+               return TAG_Special;
+       }
+
+       return TAG_Valid;
+}
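
For reference (illustrative sketch, not part of the patch): FPU_tagof classifies an 80-bit extended value from its 16-bit sign/exponent word and 64-bit significand. The same decision tree, written against plain integers with locally defined TAG_* stand-ins rather than the emulator's headers:

	#include <stdio.h>
	#include <stdint.h>

	enum { TAG_Valid, TAG_Zero, TAG_Special };	/* local stand-ins */

	static int tag_of(uint16_t exp_and_sign, uint64_t significand)
	{
		unsigned exp = exp_and_sign & 0x7fff;	/* strip the sign bit */

		if (exp == 0)
			return significand ? TAG_Special	/* denormal/pseudo-denormal */
					   : TAG_Zero;
		if (exp == 0x7fff)
			return TAG_Special;		/* infinity, NaN, unsupported */
		if (!(significand >> 63))
			return TAG_Special;		/* unnormal: integer bit clear */
		return TAG_Valid;
	}

	int main(void)
	{
		printf("%d\n", tag_of(0x3fff, 0x8000000000000000ULL));	/* 0: 1.0 is Valid */
		printf("%d\n", tag_of(0x7fff, 0x8000000000000000ULL));	/* 2: infinity is Special */
		return 0;
	}
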
 
 /* Get a long double from user memory */
 int FPU_load_extended(long double __user *s, int stnr)
 {
-  FPU_REG *sti_ptr = &st(stnr);
+       FPU_REG *sti_ptr = &st(stnr);
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, s, 10);
-  __copy_from_user(sti_ptr, s, 10);
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, s, 10);
+       __copy_from_user(sti_ptr, s, 10);
+       RE_ENTRANT_CHECK_ON;
 
-  return FPU_tagof(sti_ptr);
+       return FPU_tagof(sti_ptr);
 }
 
-
 /* Get a double from user memory */
 int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
 {
-  int exp, tag, negative;
-  unsigned m64, l64;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, dfloat, 8);
-  FPU_get_user(m64, 1 + (unsigned long __user *) dfloat);
-  FPU_get_user(l64, (unsigned long __user *) dfloat);
-  RE_ENTRANT_CHECK_ON;
-
-  negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
-  exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
-  m64 &= 0xfffff;
-  if ( exp > DOUBLE_Emax + EXTENDED_Ebias )
-    {
-      /* Infinity or NaN */
-      if ((m64 == 0) && (l64 == 0))
-       {
-         /* +- infinity */
-         loaded_data->sigh = 0x80000000;
-         loaded_data->sigl = 0x00000000;
-         exp = EXP_Infinity + EXTENDED_Ebias;
-         tag = TAG_Special;
-       }
-      else
-       {
-         /* Must be a signaling or quiet NaN */
-         exp = EXP_NaN + EXTENDED_Ebias;
-         loaded_data->sigh = (m64 << 11) | 0x80000000;
-         loaded_data->sigh |= l64 >> 21;
-         loaded_data->sigl = l64 << 11;
-         tag = TAG_Special;    /* The calling function must look for NaNs */
-       }
-    }
-  else if ( exp < DOUBLE_Emin + EXTENDED_Ebias )
-    {
-      /* Zero or de-normal */
-      if ((m64 == 0) && (l64 == 0))
-       {
-         /* Zero */
-         reg_copy(&CONST_Z, loaded_data);
-         exp = 0;
-         tag = TAG_Zero;
-       }
-      else
-       {
-         /* De-normal */
-         loaded_data->sigh = m64 << 11;
-         loaded_data->sigh |= l64 >> 21;
-         loaded_data->sigl = l64 << 11;
-
-         return normalize_no_excep(loaded_data, DOUBLE_Emin, negative)
-           | (denormal_operand() < 0 ? FPU_Exception : 0);
-       }
-    }
-  else
-    {
-      loaded_data->sigh = (m64 << 11) | 0x80000000;
-      loaded_data->sigh |= l64 >> 21;
-      loaded_data->sigl = l64 << 11;
+       int exp, tag, negative;
+       unsigned m64, l64;
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, dfloat, 8);
+       FPU_get_user(m64, 1 + (unsigned long __user *)dfloat);
+       FPU_get_user(l64, (unsigned long __user *)dfloat);
+       RE_ENTRANT_CHECK_ON;
+
+       negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
+       exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
+       m64 &= 0xfffff;
+       if (exp > DOUBLE_Emax + EXTENDED_Ebias) {
+               /* Infinity or NaN */
+               if ((m64 == 0) && (l64 == 0)) {
+                       /* +- infinity */
+                       loaded_data->sigh = 0x80000000;
+                       loaded_data->sigl = 0x00000000;
+                       exp = EXP_Infinity + EXTENDED_Ebias;
+                       tag = TAG_Special;
+               } else {
+                       /* Must be a signaling or quiet NaN */
+                       exp = EXP_NaN + EXTENDED_Ebias;
+                       loaded_data->sigh = (m64 << 11) | 0x80000000;
+                       loaded_data->sigh |= l64 >> 21;
+                       loaded_data->sigl = l64 << 11;
+                       tag = TAG_Special;      /* The calling function must look for NaNs */
+               }
+       } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) {
+               /* Zero or de-normal */
+               if ((m64 == 0) && (l64 == 0)) {
+                       /* Zero */
+                       reg_copy(&CONST_Z, loaded_data);
+                       exp = 0;
+                       tag = TAG_Zero;
+               } else {
+                       /* De-normal */
+                       loaded_data->sigh = m64 << 11;
+                       loaded_data->sigh |= l64 >> 21;
+                       loaded_data->sigl = l64 << 11;
+
+                       return normalize_no_excep(loaded_data, DOUBLE_Emin,
+                                                 negative)
+                           | (denormal_operand() < 0 ? FPU_Exception : 0);
+               }
+       } else {
+               loaded_data->sigh = (m64 << 11) | 0x80000000;
+               loaded_data->sigh |= l64 >> 21;
+               loaded_data->sigl = l64 << 11;
 
-      tag = TAG_Valid;
-    }
+               tag = TAG_Valid;
+       }
 
-  setexponent16(loaded_data, exp | negative);
+       setexponent16(loaded_data, exp | negative);
 
-  return tag;
+       return tag;
 }
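
For reference (illustrative sketch, not part of the patch): FPU_load_double above splits an IEEE-754 double into sign, 11-bit biased exponent and 52-bit fraction, read here from the two 32-bit halves m64/l64, before widening it to the 64-bit extended significand. A minimal standalone extraction of the same fields:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		double d = -1.5;
		uint64_t bits;

		memcpy(&bits, &d, sizeof(bits));	/* reinterpret the 8 bytes */

		unsigned sign = bits >> 63;			/* 1 bit */
		unsigned exp  = (bits >> 52) & 0x7ff;		/* 11 bits, bias 1023 */
		uint64_t frac = bits & 0xfffffffffffffULL;	/* 52-bit fraction */

		/* exp == 0x7ff is infinity/NaN, exp == 0 is zero/denormal,
		   matching the branches in FPU_load_double. */
		printf("sign=%u exp=%u (unbiased %d) frac=%#llx\n",
		       sign, exp, (int)exp - 1023, (unsigned long long)frac);
		return 0;
	}
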
 
-
 /* Get a float from user memory */
 int FPU_load_single(float __user *single, FPU_REG *loaded_data)
 {
-  unsigned m32;
-  int exp, tag, negative;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, single, 4);
-  FPU_get_user(m32, (unsigned long __user *) single);
-  RE_ENTRANT_CHECK_ON;
-
-  negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
-
-  if (!(m32 & 0x7fffffff))
-    {
-      /* Zero */
-      reg_copy(&CONST_Z, loaded_data);
-      addexponent(loaded_data, negative);
-      return TAG_Zero;
-    }
-  exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
-  m32 = (m32 & 0x7fffff) << 8;
-  if ( exp < SINGLE_Emin + EXTENDED_Ebias )
-    {
-      /* De-normals */
-      loaded_data->sigh = m32;
-      loaded_data->sigl = 0;
-
-      return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
-       | (denormal_operand() < 0 ? FPU_Exception : 0);
-    }
-  else if ( exp > SINGLE_Emax + EXTENDED_Ebias )
-    {
-    /* Infinity or NaN */
-      if ( m32 == 0 )
-       {
-         /* +- infinity */
-         loaded_data->sigh = 0x80000000;
-         loaded_data->sigl = 0x00000000;
-         exp = EXP_Infinity + EXTENDED_Ebias;
-         tag = TAG_Special;
+       unsigned m32;
+       int exp, tag, negative;
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, single, 4);
+       FPU_get_user(m32, (unsigned long __user *)single);
+       RE_ENTRANT_CHECK_ON;
+
+       negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
+
+       if (!(m32 & 0x7fffffff)) {
+               /* Zero */
+               reg_copy(&CONST_Z, loaded_data);
+               addexponent(loaded_data, negative);
+               return TAG_Zero;
        }
-      else
-       {
-         /* Must be a signaling or quiet NaN */
-         exp = EXP_NaN + EXTENDED_Ebias;
-         loaded_data->sigh = m32 | 0x80000000;
-         loaded_data->sigl = 0;
-         tag = TAG_Special;  /* The calling function must look for NaNs */
+       exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
+       m32 = (m32 & 0x7fffff) << 8;
+       if (exp < SINGLE_Emin + EXTENDED_Ebias) {
+               /* De-normals */
+               loaded_data->sigh = m32;
+               loaded_data->sigl = 0;
+
+               return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
+                   | (denormal_operand() < 0 ? FPU_Exception : 0);
+       } else if (exp > SINGLE_Emax + EXTENDED_Ebias) {
+               /* Infinity or NaN */
+               if (m32 == 0) {
+                       /* +- infinity */
+                       loaded_data->sigh = 0x80000000;
+                       loaded_data->sigl = 0x00000000;
+                       exp = EXP_Infinity + EXTENDED_Ebias;
+                       tag = TAG_Special;
+               } else {
+                       /* Must be a signaling or quiet NaN */
+                       exp = EXP_NaN + EXTENDED_Ebias;
+                       loaded_data->sigh = m32 | 0x80000000;
+                       loaded_data->sigl = 0;
+                       tag = TAG_Special;      /* The calling function must look for NaNs */
+               }
+       } else {
+               loaded_data->sigh = m32 | 0x80000000;
+               loaded_data->sigl = 0;
+               tag = TAG_Valid;
        }
-    }
-  else
-    {
-      loaded_data->sigh = m32 | 0x80000000;
-      loaded_data->sigl = 0;
-      tag = TAG_Valid;
-    }
 
-  setexponent16(loaded_data, exp | negative);  /* Set the sign. */
+       setexponent16(loaded_data, exp | negative);     /* Set the sign. */
 
-  return tag;
+       return tag;
 }
 
-
 /* Get a long long from user memory */
 int FPU_load_int64(long long __user *_s)
 {
-  long long s;
-  int sign;
-  FPU_REG *st0_ptr = &st(0);
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, _s, 8);
-  if (copy_from_user(&s,_s,8))
-    FPU_abort;
-  RE_ENTRANT_CHECK_ON;
-
-  if (s == 0)
-    {
-      reg_copy(&CONST_Z, st0_ptr);
-      return TAG_Zero;
-    }
-
-  if (s > 0)
-    sign = SIGN_Positive;
-  else
-  {
-    s = -s;
-    sign = SIGN_Negative;
-  }
-
-  significand(st0_ptr) = s;
-
-  return normalize_no_excep(st0_ptr, 63, sign);
-}
+       long long s;
+       int sign;
+       FPU_REG *st0_ptr = &st(0);
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, _s, 8);
+       if (copy_from_user(&s, _s, 8))
+               FPU_abort;
+       RE_ENTRANT_CHECK_ON;
+
+       if (s == 0) {
+               reg_copy(&CONST_Z, st0_ptr);
+               return TAG_Zero;
+       }
+
+       if (s > 0)
+               sign = SIGN_Positive;
+       else {
+               s = -s;
+               sign = SIGN_Negative;
+       }
 
+       significand(st0_ptr) = s;
+
+       return normalize_no_excep(st0_ptr, 63, sign);
+}
 
 /* Get a long from user memory */
 int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
 {
-  long s;
-  int negative;
+       long s;
+       int negative;
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, _s, 4);
-  FPU_get_user(s, _s);
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, _s, 4);
+       FPU_get_user(s, _s);
+       RE_ENTRANT_CHECK_ON;
 
-  if (s == 0)
-    { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
+       if (s == 0) {
+               reg_copy(&CONST_Z, loaded_data);
+               return TAG_Zero;
+       }
 
-  if (s > 0)
-    negative = SIGN_Positive;
-  else
-    {
-      s = -s;
-      negative = SIGN_Negative;
-    }
+       if (s > 0)
+               negative = SIGN_Positive;
+       else {
+               s = -s;
+               negative = SIGN_Negative;
+       }
 
-  loaded_data->sigh = s;
-  loaded_data->sigl = 0;
+       loaded_data->sigh = s;
+       loaded_data->sigl = 0;
 
-  return normalize_no_excep(loaded_data, 31, negative);
+       return normalize_no_excep(loaded_data, 31, negative);
 }
 
-
 /* Get a short from user memory */
 int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
 {
-  int s, negative;
+       int s, negative;
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, _s, 2);
-  /* Cast as short to get the sign extended. */
-  FPU_get_user(s, _s);
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, _s, 2);
+       /* Cast as short to get the sign extended. */
+       FPU_get_user(s, _s);
+       RE_ENTRANT_CHECK_ON;
 
-  if (s == 0)
-    { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; }
+       if (s == 0) {
+               reg_copy(&CONST_Z, loaded_data);
+               return TAG_Zero;
+       }
 
-  if (s > 0)
-    negative = SIGN_Positive;
-  else
-    {
-      s = -s;
-      negative = SIGN_Negative;
-    }
+       if (s > 0)
+               negative = SIGN_Positive;
+       else {
+               s = -s;
+               negative = SIGN_Negative;
+       }
 
-  loaded_data->sigh = s << 16;
-  loaded_data->sigl = 0;
+       loaded_data->sigh = s << 16;
+       loaded_data->sigl = 0;
 
-  return normalize_no_excep(loaded_data, 15, negative);
+       return normalize_no_excep(loaded_data, 15, negative);
 }
 
-
 /* Get a packed bcd array from user memory */
 int FPU_load_bcd(u_char __user *s)
 {
-  FPU_REG *st0_ptr = &st(0);
-  int pos;
-  u_char bcd;
-  long long l=0;
-  int sign;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ, s, 10);
-  RE_ENTRANT_CHECK_ON;
-  for ( pos = 8; pos >= 0; pos--)
-    {
-      l *= 10;
-      RE_ENTRANT_CHECK_OFF;
-      FPU_get_user(bcd, s+pos);
-      RE_ENTRANT_CHECK_ON;
-      l += bcd >> 4;
-      l *= 10;
-      l += bcd & 0x0f;
-    }
-  RE_ENTRANT_CHECK_OFF;
-  FPU_get_user(sign, s+9);
-  sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
-  RE_ENTRANT_CHECK_ON;
-
-  if ( l == 0 )
-    {
-      reg_copy(&CONST_Z, st0_ptr);
-      addexponent(st0_ptr, sign);   /* Set the sign. */
-      return TAG_Zero;
-    }
-  else
-    {
-      significand(st0_ptr) = l;
-      return normalize_no_excep(st0_ptr, 63, sign);
-    }
+       FPU_REG *st0_ptr = &st(0);
+       int pos;
+       u_char bcd;
+       long long l = 0;
+       int sign;
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, s, 10);
+       RE_ENTRANT_CHECK_ON;
+       for (pos = 8; pos >= 0; pos--) {
+               l *= 10;
+               RE_ENTRANT_CHECK_OFF;
+               FPU_get_user(bcd, s + pos);
+               RE_ENTRANT_CHECK_ON;
+               l += bcd >> 4;
+               l *= 10;
+               l += bcd & 0x0f;
+       }
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_get_user(sign, s + 9);
+       sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
+       RE_ENTRANT_CHECK_ON;
+
+       if (l == 0) {
+               reg_copy(&CONST_Z, st0_ptr);
+               addexponent(st0_ptr, sign);     /* Set the sign. */
+               return TAG_Zero;
+       } else {
+               significand(st0_ptr) = l;
+               return normalize_no_excep(st0_ptr, 63, sign);
+       }
 }
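
For reference (illustrative sketch, not part of the patch): the loop in FPU_load_bcd walks the ten-byte packed-BCD operand from byte 8 down to byte 0, two decimal digits per byte, with the sign in bit 7 of byte 9. A standalone decoder of the same format; bcd_to_ll is an illustrative name:

	#include <stdio.h>

	static long long bcd_to_ll(const unsigned char b[10], int *negative)
	{
		long long l = 0;
		int pos;

		/* Byte 8 holds the two most significant digits, byte 0 the two least. */
		for (pos = 8; pos >= 0; pos--) {
			l = l * 10 + (b[pos] >> 4);	/* high nibble */
			l = l * 10 + (b[pos] & 0x0f);	/* low nibble */
		}
		*negative = (b[9] & 0x80) != 0;		/* sign is bit 7 of byte 9 */
		return l;
	}

	int main(void)
	{
		unsigned char buf[10] = { 0x23, 0x01 };	/* packed digits of 123 */
		int neg;

		buf[9] = 0x80;				/* set the sign bit */
		printf("%s%lld\n", neg = 0, bcd_to_ll(buf, &neg), neg ? "" : "");
		return 0;
	}
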
 
 /*===========================================================================*/
 
 /* Put a long double into user memory */
-int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d)
+int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
+                      long double __user * d)
 {
-  /*
-    The only exception raised by an attempt to store to an
-    extended format is the Invalid Stack exception, i.e.
-    attempting to store from an empty register.
-   */
-
-  if ( st0_tag != TAG_Empty )
-    {
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE, d, 10);
-
-      FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d);
-      FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4));
-      FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8));
-      RE_ENTRANT_CHECK_ON;
-
-      return 1;
-    }
-
-  /* Empty register (stack underflow) */
-  EXCEPTION(EX_StackUnder);
-  if ( control_word & CW_Invalid )
-    {
-      /* The masked response */
-      /* Put out the QNaN indefinite */
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE,d,10);
-      FPU_put_user(0, (unsigned long __user *) d);
-      FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d);
-      FPU_put_user(0xffff, 4 + (short __user *) d);
-      RE_ENTRANT_CHECK_ON;
-      return 1;
-    }
-  else
-    return 0;
+       /*
+          The only exception raised by an attempt to store to an
+          extended format is the Invalid Stack exception, i.e.
+          attempting to store from an empty register.
+        */
+
+       if (st0_tag != TAG_Empty) {
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, d, 10);
+
+               FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d);
+               FPU_put_user(st0_ptr->sigh,
+                            (unsigned long __user *)((u_char __user *) d + 4));
+               FPU_put_user(exponent16(st0_ptr),
+                            (unsigned short __user *)((u_char __user *) d +
+                                                      8));
+               RE_ENTRANT_CHECK_ON;
+
+               return 1;
+       }
 
-}
+       /* Empty register (stack underflow) */
+       EXCEPTION(EX_StackUnder);
+       if (control_word & CW_Invalid) {
+               /* The masked response */
+               /* Put out the QNaN indefinite */
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, d, 10);
+               FPU_put_user(0, (unsigned long __user *)d);
+               FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d);
+               FPU_put_user(0xffff, 4 + (short __user *)d);
+               RE_ENTRANT_CHECK_ON;
+               return 1;
+       } else
+               return 0;
 
+}
 
 /* Put a double into user memory */
 int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
 {
-  unsigned long l[2];
-  unsigned long increment = 0; /* avoid gcc warnings */
-  int precision_loss;
-  int exp;
-  FPU_REG tmp;
+       unsigned long l[2];
+       unsigned long increment = 0;    /* avoid gcc warnings */
+       int precision_loss;
+       int exp;
+       FPU_REG tmp;
 
-  if ( st0_tag == TAG_Valid )
-    {
-      reg_copy(st0_ptr, &tmp);
-      exp = exponent(&tmp);
+       if (st0_tag == TAG_Valid) {
+               reg_copy(st0_ptr, &tmp);
+               exp = exponent(&tmp);
 
-      if ( exp < DOUBLE_Emin )     /* It may be a denormal */
-       {
-         addexponent(&tmp, -DOUBLE_Emin + 52);  /* largest exp to be 51 */
+               if (exp < DOUBLE_Emin) {        /* It may be a denormal */
+                       addexponent(&tmp, -DOUBLE_Emin + 52);   /* largest exp to be 51 */
 
-       denormal_arg:
+                     denormal_arg:
 
-         if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
-           {
+                       if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
 #ifdef PECULIAR_486
-             /* Did it round to a non-denormal ? */
-             /* This behaviour might be regarded as peculiar, it appears
-                that the 80486 rounds to the dest precision, then
-                converts to decide underflow. */
-             if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) &&
-                 (st0_ptr->sigl & 0x000007ff)) )
+                               /* Did it round to a non-denormal ? */
+                               /* This behaviour might be regarded as peculiar;
+                                  it appears that the 80486 rounds to the dest
+                                  precision, then converts to decide underflow. */
+                               if (!((tmp.sigh == 0x00100000) && (tmp.sigl == 0)
+                                     && (st0_ptr->sigl & 0x000007ff)))
 #endif /* PECULIAR_486 */
-               {
-                 EXCEPTION(EX_Underflow);
-                 /* This is a special case: see sec 16.2.5.1 of
-                    the 80486 book */
-                 if ( !(control_word & CW_Underflow) )
-                   return 0;
-               }
-             EXCEPTION(precision_loss);
-             if ( !(control_word & CW_Precision) )
-               return 0;
-           }
-         l[0] = tmp.sigl;
-         l[1] = tmp.sigh;
-       }
-      else
-       {
-         if ( tmp.sigl & 0x000007ff )
-           {
-             precision_loss = 1;
-             switch (control_word & CW_RC)
-               {
-               case RC_RND:
-                 /* Rounding can get a little messy.. */
-                 increment = ((tmp.sigl & 0x7ff) > 0x400) |  /* nearest */
-                   ((tmp.sigl & 0xc00) == 0xc00);            /* odd -> even */
-                 break;
-               case RC_DOWN:   /* towards -infinity */
-                 increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff;
-                 break;
-               case RC_UP:     /* towards +infinity */
-                 increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0;
-                 break;
-               case RC_CHOP:
-                 increment = 0;
-                 break;
-               }
-         
-             /* Truncate the mantissa */
-             tmp.sigl &= 0xfffff800;
-         
-             if ( increment )
-               {
-                 if ( tmp.sigl >= 0xfffff800 )
-                   {
-                     /* the sigl part overflows */
-                     if ( tmp.sigh == 0xffffffff )
-                       {
-                         /* The sigh part overflows */
-                         tmp.sigh = 0x80000000;
-                         exp++;
-                         if (exp >= EXP_OVER)
-                           goto overflow;
+                               {
+                                       EXCEPTION(EX_Underflow);
+                                       /* This is a special case: see sec 16.2.5.1 of
+                                          the 80486 book */
+                                       if (!(control_word & CW_Underflow))
+                                               return 0;
+                               }
+                               EXCEPTION(precision_loss);
+                               if (!(control_word & CW_Precision))
+                                       return 0;
                        }
-                     else
-                       {
-                         tmp.sigh ++;
+                       l[0] = tmp.sigl;
+                       l[1] = tmp.sigh;
+               } else {
+                       if (tmp.sigl & 0x000007ff) {
+                               precision_loss = 1;
+                               switch (control_word & CW_RC) {
+                               case RC_RND:
+                                       /* Rounding can get a little messy.. */
+                                       increment = ((tmp.sigl & 0x7ff) > 0x400) |      /* nearest */
+                                           ((tmp.sigl & 0xc00) == 0xc00);      /* odd -> even */
+                                       break;
+                               case RC_DOWN:   /* towards -infinity */
+                                       increment = signpositive(&tmp)
+                                           ? 0 : tmp.sigl & 0x7ff;
+                                       break;
+                               case RC_UP:     /* towards +infinity */
+                                       increment = signpositive(&tmp)
+                                           ? tmp.sigl & 0x7ff : 0;
+                                       break;
+                               case RC_CHOP:
+                                       increment = 0;
+                                       break;
+                               }
+
+                               /* Truncate the mantissa */
+                               tmp.sigl &= 0xfffff800;
+
+                               if (increment) {
+                                       if (tmp.sigl >= 0xfffff800) {
+                                               /* the sigl part overflows */
+                                               if (tmp.sigh == 0xffffffff) {
+                                                       /* The sigh part overflows */
+                                                       tmp.sigh = 0x80000000;
+                                                       exp++;
+                                                       if (exp >= EXP_OVER)
+                                                               goto overflow;
+                                               } else {
+                                                       tmp.sigh++;
+                                               }
+                                               tmp.sigl = 0x00000000;
+                                       } else {
+                                               /* We only need to increment sigl */
+                                               tmp.sigl += 0x00000800;
+                                       }
+                               }
+                       } else
+                               precision_loss = 0;
+
+                       l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
+                       l[1] = ((tmp.sigh >> 11) & 0xfffff);
+
+                       if (exp > DOUBLE_Emax) {
+                             overflow:
+                               EXCEPTION(EX_Overflow);
+                               if (!(control_word & CW_Overflow))
+                                       return 0;
+                               set_precision_flag_up();
+                               if (!(control_word & CW_Precision))
+                                       return 0;
+
+                               /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+                               /* Overflow to infinity */
+                               l[0] = 0x00000000;      /* Set to */
+                               l[1] = 0x7ff00000;      /* + INF */
+                       } else {
+                               if (precision_loss) {
+                                       if (increment)
+                                               set_precision_flag_up();
+                                       else
+                                               set_precision_flag_down();
+                               }
+                               /* Add the exponent */
+                               l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20);
                        }
-                     tmp.sigl = 0x00000000;
-                   }
-                 else
-                   {
-                     /* We only need to increment sigl */
-                     tmp.sigl += 0x00000800;
-                   }
-               }
-           }
-         else
-           precision_loss = 0;
-         
-         l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
-         l[1] = ((tmp.sigh >> 11) & 0xfffff);
-
-         if ( exp > DOUBLE_Emax )
-           {
-           overflow:
-             EXCEPTION(EX_Overflow);
-             if ( !(control_word & CW_Overflow) )
-               return 0;
-             set_precision_flag_up();
-             if ( !(control_word & CW_Precision) )
-               return 0;
-
-             /* This is a special case: see sec 16.2.5.1 of the 80486 book */
-             /* Overflow to infinity */
-             l[0] = 0x00000000;        /* Set to */
-             l[1] = 0x7ff00000;        /* + INF */
-           }
-         else
-           {
-             if ( precision_loss )
-               {
-                 if ( increment )
-                   set_precision_flag_up();
-                 else
-                   set_precision_flag_down();
                }
-             /* Add the exponent */
-             l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20);
-           }
-       }
-    }
-  else if (st0_tag == TAG_Zero)
-    {
-      /* Number is zero */
-      l[0] = 0;
-      l[1] = 0;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if ( st0_tag == TW_Denormal )
-       {
-         /* A denormal will always underflow. */
+       } else if (st0_tag == TAG_Zero) {
+               /* Number is zero */
+               l[0] = 0;
+               l[1] = 0;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if (st0_tag == TW_Denormal) {
+                       /* A denormal will always underflow. */
 #ifndef PECULIAR_486
-         /* An 80486 is supposed to be able to generate
-            a denormal exception here, but... */
-         /* Underflow has priority. */
-         if ( control_word & CW_Underflow )
-           denormal_operand();
+                       /* An 80486 is supposed to be able to generate
+                          a denormal exception here, but... */
+                       /* Underflow has priority. */
+                       if (control_word & CW_Underflow)
+                               denormal_operand();
 #endif /* PECULIAR_486 */
-         reg_copy(st0_ptr, &tmp);
-         goto denormal_arg;
-       }
-      else if (st0_tag == TW_Infinity)
-       {
-         l[0] = 0;
-         l[1] = 0x7ff00000;
-       }
-      else if (st0_tag == TW_NaN)
-       {
-         /* Is it really a NaN ? */
-         if ( (exponent(st0_ptr) == EXP_OVER)
-              && (st0_ptr->sigh & 0x80000000) )
-           {
-             /* See if we can get a valid NaN from the FPU_REG */
-             l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21);
-             l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
-             if ( !(st0_ptr->sigh & 0x40000000) )
-               {
-                 /* It is a signalling NaN */
-                 EXCEPTION(EX_Invalid);
-                 if ( !(control_word & CW_Invalid) )
-                   return 0;
-                 l[1] |= (0x40000000 >> 11);
+                       reg_copy(st0_ptr, &tmp);
+                       goto denormal_arg;
+               } else if (st0_tag == TW_Infinity) {
+                       l[0] = 0;
+                       l[1] = 0x7ff00000;
+               } else if (st0_tag == TW_NaN) {
+                       /* Is it really a NaN ? */
+                       if ((exponent(st0_ptr) == EXP_OVER)
+                           && (st0_ptr->sigh & 0x80000000)) {
+                               /* See if we can get a valid NaN from the FPU_REG */
+                               l[0] = (st0_ptr->sigl >> 11) |
+                                      (st0_ptr->sigh << 21);
+                               l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
+                               if (!(st0_ptr->sigh & 0x40000000)) {
+                                       /* It is a signalling NaN */
+                                       EXCEPTION(EX_Invalid);
+                                       if (!(control_word & CW_Invalid))
+                                               return 0;
+                                       l[1] |= (0x40000000 >> 11);
+                               }
+                               l[1] |= 0x7ff00000;
+                       } else {
+                               /* It is an unsupported data type */
+                               EXCEPTION(EX_Invalid);
+                               if (!(control_word & CW_Invalid))
+                                       return 0;
+                               l[0] = 0;
+                               l[1] = 0xfff80000;
+                       }
                }
-             l[1] |= 0x7ff00000;
-           }
-         else
-           {
-             /* It is an unsupported data type */
-             EXCEPTION(EX_Invalid);
-             if ( !(control_word & CW_Invalid) )
-               return 0;
-             l[0] = 0;
-             l[1] = 0xfff80000;
-           }
+       } else if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               if (control_word & CW_Invalid) {
+                       /* The masked response */
+                       /* Put out the QNaN indefinite */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+                       FPU_put_user(0, (unsigned long __user *)dfloat);
+                       FPU_put_user(0xfff80000,
+                                    1 + (unsigned long __user *)dfloat);
+                       RE_ENTRANT_CHECK_ON;
+                       return 1;
+               } else
+                       return 0;
        }
-    }
-  else if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      if ( control_word & CW_Invalid )
-       {
-         /* The masked response */
-         /* Put out the QNaN indefinite */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_access_ok(VERIFY_WRITE,dfloat,8);
-         FPU_put_user(0, (unsigned long __user *) dfloat);
-         FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat);
-         RE_ENTRANT_CHECK_ON;
-         return 1;
-       }
-      else
-       return 0;
-    }
-  if ( getsign(st0_ptr) )
-    l[1] |= 0x80000000;
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,dfloat,8);
-  FPU_put_user(l[0], (unsigned long __user *)dfloat);
-  FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
-}
+       if (getsign(st0_ptr))
+               l[1] |= 0x80000000;
 
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+       FPU_put_user(l[0], (unsigned long __user *)dfloat);
+       FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
+       RE_ENTRANT_CHECK_ON;
+
+       return 1;
+}
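
For reference (illustrative sketch, not part of the patch): the RC_RND/RC_DOWN/RC_UP/RC_CHOP switch in FPU_store_double decides whether the eleven significand bits that do not fit into a double cause the kept part to be incremented. The same decision in isolation; the RC_* values follow the architectural encoding of control-word bits 10-11, and round_increment is an illustrative helper, not an emulator function:

	#include <stdio.h>

	#define CW_RC   0x0C00
	#define RC_RND  0x0000		/* round to nearest-or-even */
	#define RC_DOWN 0x0400		/* toward -infinity */
	#define RC_UP   0x0800		/* toward +infinity */
	#define RC_CHOP 0x0C00		/* truncate toward zero */

	/* low11 = the discarded low bits, keep_lsb = lowest kept bit,
	   positive = sign of the value being stored. */
	static int round_increment(unsigned cw, unsigned low11,
				   unsigned keep_lsb, int positive)
	{
		switch (cw & CW_RC) {
		case RC_RND:
			/* more than half, or exactly half with an odd kept lsb */
			return (low11 > 0x400) || (low11 == 0x400 && keep_lsb);
		case RC_DOWN:
			return positive ? 0 : (low11 != 0);
		case RC_UP:
			return positive ? (low11 != 0) : 0;
		case RC_CHOP:
		default:
			return 0;
		}
	}

	int main(void)
	{
		printf("%d\n", round_increment(RC_RND, 0x401, 0, 1));	/* 1: above halfway */
		printf("%d\n", round_increment(RC_CHOP, 0x7ff, 1, 1));	/* 0: truncated */
		return 0;
	}
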
 
 /* Put a float into user memory */
 int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
 {
-  long templ = 0;
-  unsigned long increment = 0;         /* avoid gcc warnings */
-  int precision_loss;
-  int exp;
-  FPU_REG tmp;
+       long templ = 0;
+       unsigned long increment = 0;    /* avoid gcc warnings */
+       int precision_loss;
+       int exp;
+       FPU_REG tmp;
 
-  if ( st0_tag == TAG_Valid )
-    {
+       if (st0_tag == TAG_Valid) {
 
-      reg_copy(st0_ptr, &tmp);
-      exp = exponent(&tmp);
+               reg_copy(st0_ptr, &tmp);
+               exp = exponent(&tmp);
 
-      if ( exp < SINGLE_Emin )
-       {
-         addexponent(&tmp, -SINGLE_Emin + 23);  /* largest exp to be 22 */
+               if (exp < SINGLE_Emin) {
+                       addexponent(&tmp, -SINGLE_Emin + 23);   /* largest exp to be 22 */
 
-       denormal_arg:
+                     denormal_arg:
 
-         if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) )
-           {
+                       if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
 #ifdef PECULIAR_486
-             /* Did it round to a non-denormal ? */
-             /* This behaviour might be regarded as peculiar, it appears
-                that the 80486 rounds to the dest precision, then
-                converts to decide underflow. */
-             if ( !((tmp.sigl == 0x00800000) &&
-                 ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) )
+                               /* Did it round to a non-denormal ? */
+                               /* This behaviour might be regarded as peculiar;
+                                  it appears that the 80486 rounds to the dest
+                                  precision, then converts to decide underflow. */
+                               if (!((tmp.sigl == 0x00800000) &&
+                                     ((st0_ptr->sigh & 0x000000ff)
+                                      || st0_ptr->sigl)))
 #endif /* PECULIAR_486 */
-               {
-                 EXCEPTION(EX_Underflow);
-                 /* This is a special case: see sec 16.2.5.1 of
-                    the 80486 book */
-                 if ( !(control_word & CW_Underflow) )
-                   return 0;
-               }
-             EXCEPTION(precision_loss);
-             if ( !(control_word & CW_Precision) )
-               return 0;
-           }
-         templ = tmp.sigl;
-      }
-      else
-       {
-         if ( tmp.sigl | (tmp.sigh & 0x000000ff) )
-           {
-             unsigned long sigh = tmp.sigh;
-             unsigned long sigl = tmp.sigl;
-             
-             precision_loss = 1;
-             switch (control_word & CW_RC)
-               {
-               case RC_RND:
-                 increment = ((sigh & 0xff) > 0x80)       /* more than half */
-                   || (((sigh & 0xff) == 0x80) && sigl)   /* more than half */
-                   || ((sigh & 0x180) == 0x180);        /* round to even */
-                 break;
-               case RC_DOWN:   /* towards -infinity */
-                 increment = signpositive(&tmp)
-                   ? 0 : (sigl | (sigh & 0xff));
-                 break;
-               case RC_UP:     /* towards +infinity */
-                 increment = signpositive(&tmp)
-                   ? (sigl | (sigh & 0xff)) : 0;
-                 break;
-               case RC_CHOP:
-                 increment = 0;
-                 break;
-               }
-         
-             /* Truncate part of the mantissa */
-             tmp.sigl = 0;
-         
-             if (increment)
-               {
-                 if ( sigh >= 0xffffff00 )
-                   {
-                     /* The sigh part overflows */
-                     tmp.sigh = 0x80000000;
-                     exp++;
-                     if ( exp >= EXP_OVER )
-                       goto overflow;
-                   }
-                 else
-                   {
-                     tmp.sigh &= 0xffffff00;
-                     tmp.sigh += 0x100;
-                   }
-               }
-             else
-               {
-                 tmp.sigh &= 0xffffff00;  /* Finish the truncation */
-               }
-           }
-         else
-           precision_loss = 0;
-      
-         templ = (tmp.sigh >> 8) & 0x007fffff;
-
-         if ( exp > SINGLE_Emax )
-           {
-           overflow:
-             EXCEPTION(EX_Overflow);
-             if ( !(control_word & CW_Overflow) )
-               return 0;
-             set_precision_flag_up();
-             if ( !(control_word & CW_Precision) )
-               return 0;
-
-             /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
-             /* Masked response is overflow to infinity. */
-             templ = 0x7f800000;
-           }
-         else
-           {
-             if ( precision_loss )
-               {
-                 if ( increment )
-                   set_precision_flag_up();
-                 else
-                   set_precision_flag_down();
+                               {
+                                       EXCEPTION(EX_Underflow);
+                                       /* This is a special case: see sec 16.2.5.1 of
+                                          the 80486 book */
+                                       if (!(control_word & CW_Underflow))
+                                               return 0;
+                               }
+                               EXCEPTION(precision_loss);
+                               if (!(control_word & CW_Precision))
+                                       return 0;
+                       }
+                       templ = tmp.sigl;
+               } else {
+                       if (tmp.sigl | (tmp.sigh & 0x000000ff)) {
+                               unsigned long sigh = tmp.sigh;
+                               unsigned long sigl = tmp.sigl;
+
+                               precision_loss = 1;
+                               switch (control_word & CW_RC) {
+                               case RC_RND:
+                                       increment = ((sigh & 0xff) > 0x80)      /* more than half */
+                                           || (((sigh & 0xff) == 0x80) && sigl) /* more than half */
+                                           || ((sigh & 0x180) == 0x180);       /* round to even */
+                                       break;
+                               case RC_DOWN:   /* towards -infinity */
+                                       increment = signpositive(&tmp)
+                                           ? 0 : (sigl | (sigh & 0xff));
+                                       break;
+                               case RC_UP:     /* towards +infinity */
+                                       increment = signpositive(&tmp)
+                                           ? (sigl | (sigh & 0xff)) : 0;
+                                       break;
+                               case RC_CHOP:
+                                       increment = 0;
+                                       break;
+                               }
+
+                               /* Truncate part of the mantissa */
+                               tmp.sigl = 0;
+
+                               if (increment) {
+                                       if (sigh >= 0xffffff00) {
+                                               /* The sigh part overflows */
+                                               tmp.sigh = 0x80000000;
+                                               exp++;
+                                               if (exp >= EXP_OVER)
+                                                       goto overflow;
+                                       } else {
+                                               tmp.sigh &= 0xffffff00;
+                                               tmp.sigh += 0x100;
+                                       }
+                               } else {
+                                       tmp.sigh &= 0xffffff00; /* Finish the truncation */
+                               }
+                       } else
+                               precision_loss = 0;
+
+                       templ = (tmp.sigh >> 8) & 0x007fffff;
+
+                       if (exp > SINGLE_Emax) {
+                             overflow:
+                               EXCEPTION(EX_Overflow);
+                               if (!(control_word & CW_Overflow))
+                                       return 0;
+                               set_precision_flag_up();
+                               if (!(control_word & CW_Precision))
+                                       return 0;
+
+                               /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
+                               /* Masked response is overflow to infinity. */
+                               templ = 0x7f800000;
+                       } else {
+                               if (precision_loss) {
+                                       if (increment)
+                                               set_precision_flag_up();
+                                       else
+                                               set_precision_flag_down();
+                               }
+                               /* Add the exponent */
+                               templ |= ((exp + SINGLE_Ebias) & 0xff) << 23;
+                       }
                }
-             /* Add the exponent */
-             templ |= ((exp+SINGLE_Ebias) & 0xff) << 23;
-           }
-       }
-    }
-  else if (st0_tag == TAG_Zero)
-    {
-      templ = 0;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if (st0_tag == TW_Denormal)
-       {
-         reg_copy(st0_ptr, &tmp);
-
-         /* A denormal will always underflow. */
+       } else if (st0_tag == TAG_Zero) {
+               templ = 0;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if (st0_tag == TW_Denormal) {
+                       reg_copy(st0_ptr, &tmp);
+
+                       /* A denormal will always underflow. */
 #ifndef PECULIAR_486
-         /* An 80486 is supposed to be able to generate
-            a denormal exception here, but... */
-         /* Underflow has priority. */
-         if ( control_word & CW_Underflow )
-           denormal_operand();
-#endif /* PECULIAR_486 */ 
-         goto denormal_arg;
-       }
-      else if (st0_tag == TW_Infinity)
-       {
-         templ = 0x7f800000;
-       }
-      else if (st0_tag == TW_NaN)
-       {
-         /* Is it really a NaN ? */
-         if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) )
-           {
-             /* See if we can get a valid NaN from the FPU_REG */
-             templ = st0_ptr->sigh >> 8;
-             if ( !(st0_ptr->sigh & 0x40000000) )
-               {
-                 /* It is a signalling NaN */
-                 EXCEPTION(EX_Invalid);
-                 if ( !(control_word & CW_Invalid) )
-                   return 0;
-                 templ |= (0x40000000 >> 8);
+                       /* An 80486 is supposed to be able to generate
+                          a denormal exception here, but... */
+                       /* Underflow has priority. */
+                       if (control_word & CW_Underflow)
+                               denormal_operand();
+#endif /* PECULIAR_486 */
+                       goto denormal_arg;
+               } else if (st0_tag == TW_Infinity) {
+                       templ = 0x7f800000;
+               } else if (st0_tag == TW_NaN) {
+                       /* Is it really a NaN ? */
+                       if ((exponent(st0_ptr) == EXP_OVER)
+                           && (st0_ptr->sigh & 0x80000000)) {
+                               /* See if we can get a valid NaN from the FPU_REG */
+                               templ = st0_ptr->sigh >> 8;
+                               if (!(st0_ptr->sigh & 0x40000000)) {
+                                       /* It is a signalling NaN */
+                                       EXCEPTION(EX_Invalid);
+                                       if (!(control_word & CW_Invalid))
+                                               return 0;
+                                       templ |= (0x40000000 >> 8);
+                               }
+                               templ |= 0x7f800000;
+                       } else {
+                               /* It is an unsupported data type */
+                               EXCEPTION(EX_Invalid);
+                               if (!(control_word & CW_Invalid))
+                                       return 0;
+                               templ = 0xffc00000;
+                       }
                }
-             templ |= 0x7f800000;
-           }
-         else
-           {
-             /* It is an unsupported data type */
-             EXCEPTION(EX_Invalid);
-             if ( !(control_word & CW_Invalid) )
-               return 0;
-             templ = 0xffc00000;
-           }
-       }
 #ifdef PARANOID
-      else
-       {
-         EXCEPTION(EX_INTERNAL|0x164);
-         return 0;
-       }
+               else {
+                       EXCEPTION(EX_INTERNAL | 0x164);
+                       return 0;
+               }
 #endif
-    }
-  else if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      if ( control_word & EX_Invalid )
-       {
-         /* The masked response */
-         /* Put out the QNaN indefinite */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_access_ok(VERIFY_WRITE,single,4);
-         FPU_put_user(0xffc00000, (unsigned long __user *) single);
-         RE_ENTRANT_CHECK_ON;
-         return 1;
+       } else if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               if (control_word & EX_Invalid) {
+                       /* The masked response */
+                       /* Put out the QNaN indefinite */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_access_ok(VERIFY_WRITE, single, 4);
+                       FPU_put_user(0xffc00000,
+                                    (unsigned long __user *)single);
+                       RE_ENTRANT_CHECK_ON;
+                       return 1;
+               } else
+                       return 0;
        }
-      else
-       return 0;
-    }
 #ifdef PARANOID
-  else
-    {
-      EXCEPTION(EX_INTERNAL|0x163);
-      return 0;
-    }
+       else {
+               EXCEPTION(EX_INTERNAL | 0x163);
+               return 0;
+       }
 #endif
-  if ( getsign(st0_ptr) )
-    templ |= 0x80000000;
+       if (getsign(st0_ptr))
+               templ |= 0x80000000;
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,single,4);
-  FPU_put_user(templ,(unsigned long __user *) single);
-  RE_ENTRANT_CHECK_ON;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, single, 4);
+       FPU_put_user(templ, (unsigned long __user *)single);
+       RE_ENTRANT_CHECK_ON;
 
-  return 1;
+       return 1;
 }
 
-
 /* Put a long long into user memory */
 int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
 {
-  FPU_REG t;
-  long long tll;
-  int precision_loss;
-
-  if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      goto invalid_operand;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if ( (st0_tag == TW_Infinity) ||
-          (st0_tag == TW_NaN) )
-       {
-         EXCEPTION(EX_Invalid);
-         goto invalid_operand;
+       FPU_REG t;
+       long long tll;
+       int precision_loss;
+
+       if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               goto invalid_operand;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+                       EXCEPTION(EX_Invalid);
+                       goto invalid_operand;
+               }
        }
-    }
-
-  reg_copy(st0_ptr, &t);
-  precision_loss = FPU_round_to_int(&t, st0_tag);
-  ((long *)&tll)[0] = t.sigl;
-  ((long *)&tll)[1] = t.sigh;
-  if ( (precision_loss == 1) ||
-      ((t.sigh & 0x80000000) &&
-       !((t.sigh == 0x80000000) && (t.sigl == 0) &&
-        signnegative(&t))) )
-    {
-      EXCEPTION(EX_Invalid);
-      /* This is a special case: see sec 16.2.5.1 of the 80486 book */
-    invalid_operand:
-      if ( control_word & EX_Invalid )
-       {
-         /* Produce something like QNaN "indefinite" */
-         tll = 0x8000000000000000LL;
+
+       reg_copy(st0_ptr, &t);
+       precision_loss = FPU_round_to_int(&t, st0_tag);
+       ((long *)&tll)[0] = t.sigl;
+       ((long *)&tll)[1] = t.sigh;
+       if ((precision_loss == 1) ||
+           ((t.sigh & 0x80000000) &&
+            !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) {
+               EXCEPTION(EX_Invalid);
+               /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+             invalid_operand:
+               if (control_word & EX_Invalid) {
+                       /* Produce something like QNaN "indefinite" */
+                       tll = 0x8000000000000000LL;
+               } else
+                       return 0;
+       } else {
+               if (precision_loss)
+                       set_precision_flag(precision_loss);
+               if (signnegative(&t))
+                       tll = -tll;
        }
-      else
-       return 0;
-    }
-  else
-    {
-      if ( precision_loss )
-       set_precision_flag(precision_loss);
-      if ( signnegative(&t) )
-       tll = - tll;
-    }
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,d,8);
-  if (copy_to_user(d, &tll, 8))
-    FPU_abort;
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
-}
 
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, d, 8);
+       if (copy_to_user(d, &tll, 8))
+               FPU_abort;
+       RE_ENTRANT_CHECK_ON;
+
+       return 1;
+}
 
 /* Put a long into user memory */
 int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
 {
-  FPU_REG t;
-  int precision_loss;
-
-  if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      goto invalid_operand;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if ( (st0_tag == TW_Infinity) ||
-          (st0_tag == TW_NaN) )
-       {
-         EXCEPTION(EX_Invalid);
-         goto invalid_operand;
+       FPU_REG t;
+       int precision_loss;
+
+       if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               goto invalid_operand;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+                       EXCEPTION(EX_Invalid);
+                       goto invalid_operand;
+               }
        }
-    }
-
-  reg_copy(st0_ptr, &t);
-  precision_loss = FPU_round_to_int(&t, st0_tag);
-  if (t.sigh ||
-      ((t.sigl & 0x80000000) &&
-       !((t.sigl == 0x80000000) && signnegative(&t))) )
-    {
-      EXCEPTION(EX_Invalid);
-      /* This is a special case: see sec 16.2.5.1 of the 80486 book */
-    invalid_operand:
-      if ( control_word & EX_Invalid )
-       {
-         /* Produce something like QNaN "indefinite" */
-         t.sigl = 0x80000000;
+
+       reg_copy(st0_ptr, &t);
+       precision_loss = FPU_round_to_int(&t, st0_tag);
+       if (t.sigh ||
+           ((t.sigl & 0x80000000) &&
+            !((t.sigl == 0x80000000) && signnegative(&t)))) {
+               EXCEPTION(EX_Invalid);
+               /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+             invalid_operand:
+               if (control_word & EX_Invalid) {
+                       /* Produce something like QNaN "indefinite" */
+                       t.sigl = 0x80000000;
+               } else
+                       return 0;
+       } else {
+               if (precision_loss)
+                       set_precision_flag(precision_loss);
+               if (signnegative(&t))
+                       t.sigl = -(long)t.sigl;
        }
-      else
-       return 0;
-    }
-  else
-    {
-      if ( precision_loss )
-       set_precision_flag(precision_loss);
-      if ( signnegative(&t) )
-       t.sigl = -(long)t.sigl;
-    }
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,d,4);
-  FPU_put_user(t.sigl, (unsigned long __user *) d);
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
-}
 
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, d, 4);
+       FPU_put_user(t.sigl, (unsigned long __user *)d);
+       RE_ENTRANT_CHECK_ON;
+
+       return 1;
+}
 
 /* Put a short into user memory */
 int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
 {
-  FPU_REG t;
-  int precision_loss;
-
-  if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      goto invalid_operand;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if ( (st0_tag == TW_Infinity) ||
-          (st0_tag == TW_NaN) )
-       {
-         EXCEPTION(EX_Invalid);
-         goto invalid_operand;
+       FPU_REG t;
+       int precision_loss;
+
+       if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               goto invalid_operand;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+                       EXCEPTION(EX_Invalid);
+                       goto invalid_operand;
+               }
        }
-    }
-
-  reg_copy(st0_ptr, &t);
-  precision_loss = FPU_round_to_int(&t, st0_tag);
-  if (t.sigh ||
-      ((t.sigl & 0xffff8000) &&
-       !((t.sigl == 0x8000) && signnegative(&t))) )
-    {
-      EXCEPTION(EX_Invalid);
-      /* This is a special case: see sec 16.2.5.1 of the 80486 book */
-    invalid_operand:
-      if ( control_word & EX_Invalid )
-       {
-         /* Produce something like QNaN "indefinite" */
-         t.sigl = 0x8000;
+
+       reg_copy(st0_ptr, &t);
+       precision_loss = FPU_round_to_int(&t, st0_tag);
+       if (t.sigh ||
+           ((t.sigl & 0xffff8000) &&
+            !((t.sigl == 0x8000) && signnegative(&t)))) {
+               EXCEPTION(EX_Invalid);
+               /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+             invalid_operand:
+               if (control_word & EX_Invalid) {
+                       /* Produce something like QNaN "indefinite" */
+                       t.sigl = 0x8000;
+               } else
+                       return 0;
+       } else {
+               if (precision_loss)
+                       set_precision_flag(precision_loss);
+               if (signnegative(&t))
+                       t.sigl = -t.sigl;
        }
-      else
-       return 0;
-    }
-  else
-    {
-      if ( precision_loss )
-       set_precision_flag(precision_loss);
-      if ( signnegative(&t) )
-       t.sigl = -t.sigl;
-    }
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,d,2);
-  FPU_put_user((short)t.sigl, d);
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
-}
 
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, d, 2);
+       FPU_put_user((short)t.sigl, d);
+       RE_ENTRANT_CHECK_ON;
+
+       return 1;
+}
 
 /* Put a packed bcd array into user memory */
 int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
 {
-  FPU_REG t;
-  unsigned long long ll;
-  u_char b;
-  int i, precision_loss;
-  u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
-
-  if ( st0_tag == TAG_Empty )
-    {
-      /* Empty register (stack underflow) */
-      EXCEPTION(EX_StackUnder);
-      goto invalid_operand;
-    }
-  else if ( st0_tag == TAG_Special )
-    {
-      st0_tag = FPU_Special(st0_ptr);
-      if ( (st0_tag == TW_Infinity) ||
-          (st0_tag == TW_NaN) )
-       {
-         EXCEPTION(EX_Invalid);
-         goto invalid_operand;
+       FPU_REG t;
+       unsigned long long ll;
+       u_char b;
+       int i, precision_loss;
+       u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
+
+       if (st0_tag == TAG_Empty) {
+               /* Empty register (stack underflow) */
+               EXCEPTION(EX_StackUnder);
+               goto invalid_operand;
+       } else if (st0_tag == TAG_Special) {
+               st0_tag = FPU_Special(st0_ptr);
+               if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+                       EXCEPTION(EX_Invalid);
+                       goto invalid_operand;
+               }
+       }
+
+       reg_copy(st0_ptr, &t);
+       precision_loss = FPU_round_to_int(&t, st0_tag);
+       ll = significand(&t);
+
+       /* Check for overflow, by comparing with 999999999999999999 decimal. */
+       if ((t.sigh > 0x0de0b6b3) ||
+           ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) {
+               EXCEPTION(EX_Invalid);
+               /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+             invalid_operand:
+               if (control_word & CW_Invalid) {
+                       /* Produce the QNaN "indefinite" */
+                       RE_ENTRANT_CHECK_OFF;
+                       FPU_access_ok(VERIFY_WRITE, d, 10);
+                       for (i = 0; i < 7; i++)
+                               FPU_put_user(0, d + i); /* These bytes "undefined" */
+                       FPU_put_user(0xc0, d + 7);      /* This byte "undefined" */
+                       FPU_put_user(0xff, d + 8);
+                       FPU_put_user(0xff, d + 9);
+                       RE_ENTRANT_CHECK_ON;
+                       return 1;
+               } else
+                       return 0;
+       } else if (precision_loss) {
+               /* Precision loss doesn't stop the data transfer */
+               set_precision_flag(precision_loss);
        }
-    }
-
-  reg_copy(st0_ptr, &t);
-  precision_loss = FPU_round_to_int(&t, st0_tag);
-  ll = significand(&t);
-
-  /* Check for overflow, by comparing with 999999999999999999 decimal. */
-  if ( (t.sigh > 0x0de0b6b3) ||
-      ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) )
-    {
-      EXCEPTION(EX_Invalid);
-      /* This is a special case: see sec 16.2.5.1 of the 80486 book */
-    invalid_operand:
-      if ( control_word & CW_Invalid )
-       {
-         /* Produce the QNaN "indefinite" */
-         RE_ENTRANT_CHECK_OFF;
-         FPU_access_ok(VERIFY_WRITE,d,10);
-         for ( i = 0; i < 7; i++)
-           FPU_put_user(0, d+i); /* These bytes "undefined" */
-         FPU_put_user(0xc0, d+7); /* This byte "undefined" */
-         FPU_put_user(0xff, d+8);
-         FPU_put_user(0xff, d+9);
-         RE_ENTRANT_CHECK_ON;
-         return 1;
+
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, d, 10);
+       RE_ENTRANT_CHECK_ON;
+       for (i = 0; i < 9; i++) {
+               b = FPU_div_small(&ll, 10);
+               b |= (FPU_div_small(&ll, 10)) << 4;
+               RE_ENTRANT_CHECK_OFF;
+               FPU_put_user(b, d + i);
+               RE_ENTRANT_CHECK_ON;
        }
-      else
-       return 0;
-    }
-  else if ( precision_loss )
-    {
-      /* Precision loss doesn't stop the data transfer */
-      set_precision_flag(precision_loss);
-    }
-
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,d,10);
-  RE_ENTRANT_CHECK_ON;
-  for ( i = 0; i < 9; i++)
-    {
-      b = FPU_div_small(&ll, 10);
-      b |= (FPU_div_small(&ll, 10)) << 4;
-      RE_ENTRANT_CHECK_OFF;
-      FPU_put_user(b, d+i);
-      RE_ENTRANT_CHECK_ON;
-    }
-  RE_ENTRANT_CHECK_OFF;
-  FPU_put_user(sign, d+9);
-  RE_ENTRANT_CHECK_ON;
-
-  return 1;
+       RE_ENTRANT_CHECK_OFF;
+       FPU_put_user(sign, d + 9);
+       RE_ENTRANT_CHECK_ON;
+
+       return 1;
 }
 
 /*===========================================================================*/
@@ -1119,59 +973,56 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
    largest possible value */
 int FPU_round_to_int(FPU_REG *r, u_char tag)
 {
-  u_char     very_big;
-  unsigned eax;
-
-  if (tag == TAG_Zero)
-    {
-      /* Make sure that zero is returned */
-      significand(r) = 0;
-      return 0;        /* o.k. */
-    }
-
-  if (exponent(r) > 63)
-    {
-      r->sigl = r->sigh = ~0;      /* The largest representable number */
-      return 1;        /* overflow */
-    }
-
-  eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
-  very_big = !(~(r->sigh) | ~(r->sigl));  /* test for 0xfff...fff */
+       u_char very_big;
+       unsigned eax;
+
+       if (tag == TAG_Zero) {
+               /* Make sure that zero is returned */
+               significand(r) = 0;
+               return 0;       /* o.k. */
+       }
+
+       if (exponent(r) > 63) {
+               r->sigl = r->sigh = ~0; /* The largest representable number */
+               return 1;       /* overflow */
+       }
+
+       eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
+       very_big = !(~(r->sigh) | ~(r->sigl));  /* test for 0xfff...fff */
 #define        half_or_more    (eax & 0x80000000)
 #define        frac_part       (eax)
 #define more_than_half  ((eax & 0x80000001) == 0x80000001)
-  switch (control_word & CW_RC)
-    {
-    case RC_RND:
-      if ( more_than_half                      /* nearest */
-         || (half_or_more && (r->sigl & 1)) )  /* odd -> even */
-       {
-         if ( very_big ) return 1;        /* overflow */
-         significand(r) ++;
-         return PRECISION_LOST_UP;
-       }
-      break;
-    case RC_DOWN:
-      if (frac_part && getsign(r))
-       {
-         if ( very_big ) return 1;        /* overflow */
-         significand(r) ++;
-         return PRECISION_LOST_UP;
-       }
-      break;
-    case RC_UP:
-      if (frac_part && !getsign(r))
-       {
-         if ( very_big ) return 1;        /* overflow */
-         significand(r) ++;
-         return PRECISION_LOST_UP;
+       switch (control_word & CW_RC) {
+       case RC_RND:
+               if (more_than_half      /* nearest */
+                   || (half_or_more && (r->sigl & 1))) {       /* odd -> even */
+                       if (very_big)
+                               return 1;       /* overflow */
+                       significand(r)++;
+                       return PRECISION_LOST_UP;
+               }
+               break;
+       case RC_DOWN:
+               if (frac_part && getsign(r)) {
+                       if (very_big)
+                               return 1;       /* overflow */
+                       significand(r)++;
+                       return PRECISION_LOST_UP;
+               }
+               break;
+       case RC_UP:
+               if (frac_part && !getsign(r)) {
+                       if (very_big)
+                               return 1;       /* overflow */
+                       significand(r)++;
+                       return PRECISION_LOST_UP;
+               }
+               break;
+       case RC_CHOP:
+               break;
        }
-      break;
-    case RC_CHOP:
-      break;
-    }
 
-  return eax ? PRECISION_LOST_DOWN : 0;
+       return eax ? PRECISION_LOST_DOWN : 0;
 
 }
 
@@ -1179,197 +1030,195 @@ int FPU_round_to_int(FPU_REG *r, u_char tag)
 
 u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
 {
-  unsigned short tag_word = 0;
-  u_char tag;
-  int i;
-
-  if ( (addr_modes.default_mode == VM86) ||
-      ((addr_modes.default_mode == PM16)
-      ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
-    {
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_READ, s, 0x0e);
-      FPU_get_user(control_word, (unsigned short __user *) s);
-      FPU_get_user(partial_status, (unsigned short __user *) (s+2));
-      FPU_get_user(tag_word, (unsigned short __user *) (s+4));
-      FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6));
-      FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8));
-      FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a));
-      FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c));
-      RE_ENTRANT_CHECK_ON;
-      s += 0x0e;
-      if ( addr_modes.default_mode == VM86 )
-       {
-         instruction_address.offset
-           += (instruction_address.selector & 0xf000) << 4;
-         operand_address.offset += (operand_address.selector & 0xf000) << 4;
+       unsigned short tag_word = 0;
+       u_char tag;
+       int i;
+
+       if ((addr_modes.default_mode == VM86) ||
+           ((addr_modes.default_mode == PM16)
+            ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_READ, s, 0x0e);
+               FPU_get_user(control_word, (unsigned short __user *)s);
+               FPU_get_user(partial_status, (unsigned short __user *)(s + 2));
+               FPU_get_user(tag_word, (unsigned short __user *)(s + 4));
+               FPU_get_user(instruction_address.offset,
+                            (unsigned short __user *)(s + 6));
+               FPU_get_user(instruction_address.selector,
+                            (unsigned short __user *)(s + 8));
+               FPU_get_user(operand_address.offset,
+                            (unsigned short __user *)(s + 0x0a));
+               FPU_get_user(operand_address.selector,
+                            (unsigned short __user *)(s + 0x0c));
+               RE_ENTRANT_CHECK_ON;
+               s += 0x0e;
+               if (addr_modes.default_mode == VM86) {
+                       instruction_address.offset
+                           += (instruction_address.selector & 0xf000) << 4;
+                       operand_address.offset +=
+                           (operand_address.selector & 0xf000) << 4;
+               }
+       } else {
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_READ, s, 0x1c);
+               FPU_get_user(control_word, (unsigned short __user *)s);
+               FPU_get_user(partial_status, (unsigned short __user *)(s + 4));
+               FPU_get_user(tag_word, (unsigned short __user *)(s + 8));
+               FPU_get_user(instruction_address.offset,
+                            (unsigned long __user *)(s + 0x0c));
+               FPU_get_user(instruction_address.selector,
+                            (unsigned short __user *)(s + 0x10));
+               FPU_get_user(instruction_address.opcode,
+                            (unsigned short __user *)(s + 0x12));
+               FPU_get_user(operand_address.offset,
+                            (unsigned long __user *)(s + 0x14));
+               FPU_get_user(operand_address.selector,
+                            (unsigned long __user *)(s + 0x18));
+               RE_ENTRANT_CHECK_ON;
+               s += 0x1c;
        }
-    }
-  else
-    {
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_READ, s, 0x1c);
-      FPU_get_user(control_word, (unsigned short __user *) s);
-      FPU_get_user(partial_status, (unsigned short __user *) (s+4));
-      FPU_get_user(tag_word, (unsigned short __user *) (s+8));
-      FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c));
-      FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10));
-      FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12));
-      FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14));
-      FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18));
-      RE_ENTRANT_CHECK_ON;
-      s += 0x1c;
-    }
 
 #ifdef PECULIAR_486
-  control_word &= ~0xe080;
-#endif /* PECULIAR_486 */ 
-
-  top = (partial_status >> SW_Top_Shift) & 7;
-
-  if ( partial_status & ~control_word & CW_Exceptions )
-    partial_status |= (SW_Summary | SW_Backward);
-  else
-    partial_status &= ~(SW_Summary | SW_Backward);
-
-  for ( i = 0; i < 8; i++ )
-    {
-      tag = tag_word & 3;
-      tag_word >>= 2;
-
-      if ( tag == TAG_Empty )
-       /* New tag is empty.  Accept it */
-       FPU_settag(i, TAG_Empty);
-      else if ( FPU_gettag(i) == TAG_Empty )
-       {
-         /* Old tag is empty and new tag is not empty.  New tag is determined
-            by old reg contents */
-         if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias )
-           {
-             if ( !(fpu_register(i).sigl | fpu_register(i).sigh) )
-               FPU_settag(i, TAG_Zero);
-             else
-               FPU_settag(i, TAG_Special);
-           }
-         else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias )
-           {
-             FPU_settag(i, TAG_Special);
-           }
-         else if ( fpu_register(i).sigh & 0x80000000 )
-           FPU_settag(i, TAG_Valid);
-         else
-           FPU_settag(i, TAG_Special);   /* An Un-normal */
-       }
-      /* Else old tag is not empty and new tag is not empty.  Old tag
-        remains correct */
-    }
-
-  return s;
-}
+       control_word &= ~0xe080;
+#endif /* PECULIAR_486 */
+
+       top = (partial_status >> SW_Top_Shift) & 7;
+
+       if (partial_status & ~control_word & CW_Exceptions)
+               partial_status |= (SW_Summary | SW_Backward);
+       else
+               partial_status &= ~(SW_Summary | SW_Backward);
+
+       for (i = 0; i < 8; i++) {
+               tag = tag_word & 3;
+               tag_word >>= 2;
+
+               if (tag == TAG_Empty)
+                       /* New tag is empty.  Accept it */
+                       FPU_settag(i, TAG_Empty);
+               else if (FPU_gettag(i) == TAG_Empty) {
+                       /* Old tag is empty and new tag is not empty.  New tag is determined
+                          by old reg contents */
+                       if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) {
+                               if (!
+                                   (fpu_register(i).sigl | fpu_register(i).
+                                    sigh))
+                                       FPU_settag(i, TAG_Zero);
+                               else
+                                       FPU_settag(i, TAG_Special);
+                       } else if (exponent(&fpu_register(i)) ==
+                                  0x7fff - EXTENDED_Ebias) {
+                               FPU_settag(i, TAG_Special);
+                       } else if (fpu_register(i).sigh & 0x80000000)
+                               FPU_settag(i, TAG_Valid);
+                       else
+                               FPU_settag(i, TAG_Special);     /* An Un-normal */
+               }
+               /* Else old tag is not empty and new tag is not empty.  Old tag
+                  remains correct */
+       }
 
+       return s;
+}
 
 void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
 {
-  int i, regnr;
-  u_char __user *s = fldenv(addr_modes, data_address);
-  int offset = (top & 7) * 10, other = 80 - offset;
-
-  /* Copy all registers in stack order. */
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_READ,s,80);
-  __copy_from_user(register_base+offset, s, other);
-  if ( offset )
-    __copy_from_user(register_base, s+other, offset);
-  RE_ENTRANT_CHECK_ON;
-
-  for ( i = 0; i < 8; i++ )
-    {
-      regnr = (i+top) & 7;
-      if ( FPU_gettag(regnr) != TAG_Empty )
-       /* The loaded data over-rides all other cases. */
-       FPU_settag(regnr, FPU_tagof(&st(i)));
-    }
+       int i, regnr;
+       u_char __user *s = fldenv(addr_modes, data_address);
+       int offset = (top & 7) * 10, other = 80 - offset;
+
+       /* Copy all registers in stack order. */
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_READ, s, 80);
+       __copy_from_user(register_base + offset, s, other);
+       if (offset)
+               __copy_from_user(register_base, s + other, offset);
+       RE_ENTRANT_CHECK_ON;
+
+       for (i = 0; i < 8; i++) {
+               regnr = (i + top) & 7;
+               if (FPU_gettag(regnr) != TAG_Empty)
+                       /* The loaded data over-rides all other cases. */
+                       FPU_settag(regnr, FPU_tagof(&st(i)));
+       }
 
 }
 
-
 u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
 {
-  if ( (addr_modes.default_mode == VM86) ||
-      ((addr_modes.default_mode == PM16)
-      ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) )
-    {
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE,d,14);
+       if ((addr_modes.default_mode == VM86) ||
+           ((addr_modes.default_mode == PM16)
+            ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, d, 14);
 #ifdef PECULIAR_486
-      FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d);
+               FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d);
 #else
-      FPU_put_user(control_word, (unsigned short __user *) d);
+               FPU_put_user(control_word, (unsigned short __user *)d);
 #endif /* PECULIAR_486 */
-      FPU_put_user(status_word(), (unsigned short __user *) (d+2));
-      FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4));
-      FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6));
-      FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a));
-      if ( addr_modes.default_mode == VM86 )
-       {
-         FPU_put_user((instruction_address.offset & 0xf0000) >> 4,
-                     (unsigned short __user *) (d+8));
-         FPU_put_user((operand_address.offset & 0xf0000) >> 4,
-                     (unsigned short __user *) (d+0x0c));
-       }
-      else
-       {
-         FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8));
-         FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c));
-       }
-      RE_ENTRANT_CHECK_ON;
-      d += 0x0e;
-    }
-  else
-    {
-      RE_ENTRANT_CHECK_OFF;
-      FPU_access_ok(VERIFY_WRITE, d, 7*4);
+               FPU_put_user(status_word(), (unsigned short __user *)(d + 2));
+               FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4));
+               FPU_put_user(instruction_address.offset,
+                            (unsigned short __user *)(d + 6));
+               FPU_put_user(operand_address.offset,
+                            (unsigned short __user *)(d + 0x0a));
+               if (addr_modes.default_mode == VM86) {
+                       FPU_put_user((instruction_address.
+                                     offset & 0xf0000) >> 4,
+                                    (unsigned short __user *)(d + 8));
+                       FPU_put_user((operand_address.offset & 0xf0000) >> 4,
+                                    (unsigned short __user *)(d + 0x0c));
+               } else {
+                       FPU_put_user(instruction_address.selector,
+                                    (unsigned short __user *)(d + 8));
+                       FPU_put_user(operand_address.selector,
+                                    (unsigned short __user *)(d + 0x0c));
+               }
+               RE_ENTRANT_CHECK_ON;
+               d += 0x0e;
+       } else {
+               RE_ENTRANT_CHECK_OFF;
+               FPU_access_ok(VERIFY_WRITE, d, 7 * 4);
 #ifdef PECULIAR_486
-      control_word &= ~0xe080;
-      /* An 80486 sets nearly all of the reserved bits to 1. */
-      control_word |= 0xffff0040;
-      partial_status = status_word() | 0xffff0000;
-      fpu_tag_word |= 0xffff0000;
-      I387.soft.fcs &= ~0xf8000000;
-      I387.soft.fos |= 0xffff0000;
+               control_word &= ~0xe080;
+               /* An 80486 sets nearly all of the reserved bits to 1. */
+               control_word |= 0xffff0040;
+               partial_status = status_word() | 0xffff0000;
+               fpu_tag_word |= 0xffff0000;
+               I387.soft.fcs &= ~0xf8000000;
+               I387.soft.fos |= 0xffff0000;
 #endif /* PECULIAR_486 */
-      if (__copy_to_user(d, &control_word, 7*4))
-       FPU_abort;
-      RE_ENTRANT_CHECK_ON;
-      d += 0x1c;
-    }
-  
-  control_word |= CW_Exceptions;
-  partial_status &= ~(SW_Summary | SW_Backward);
-
-  return d;
-}
+               if (__copy_to_user(d, &control_word, 7 * 4))
+                       FPU_abort;
+               RE_ENTRANT_CHECK_ON;
+               d += 0x1c;
+       }
 
+       control_word |= CW_Exceptions;
+       partial_status &= ~(SW_Summary | SW_Backward);
+
+       return d;
+}
 
 void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
 {
-  u_char __user *d;
-  int offset = (top & 7) * 10, other = 80 - offset;
+       u_char __user *d;
+       int offset = (top & 7) * 10, other = 80 - offset;
 
-  d = fstenv(addr_modes, data_address);
+       d = fstenv(addr_modes, data_address);
 
-  RE_ENTRANT_CHECK_OFF;
-  FPU_access_ok(VERIFY_WRITE,d,80);
+       RE_ENTRANT_CHECK_OFF;
+       FPU_access_ok(VERIFY_WRITE, d, 80);
 
-  /* Copy all registers in stack order. */
-  if (__copy_to_user(d, register_base+offset, other))
-    FPU_abort;
-  if ( offset )
-    if (__copy_to_user(d+other, register_base, offset))
-      FPU_abort;
-  RE_ENTRANT_CHECK_ON;
+       /* Copy all registers in stack order. */
+       if (__copy_to_user(d, register_base + offset, other))
+               FPU_abort;
+       if (offset)
+               if (__copy_to_user(d + other, register_base, offset))
+                       FPU_abort;
+       RE_ENTRANT_CHECK_ON;
 
-  finit();
+       finit();
 }
 
 /*===========================================================================*/
index 40f50b61bc674dc3d0028ea19c4694c48e1bc634..36c37f71f713b69537387c658303ca579e456730 100644
@@ -20,7 +20,6 @@
 #include "reg_constant.h"
 #include "fpu_system.h"
 
-
 /*
   Multiply two registers to give a register result.
   The sources are st(deststnr) and (b,tagb,signb).
 /* This routine must be called with non-empty source registers */
 int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
 {
-  FPU_REG *a = &st(deststnr);
-  FPU_REG *dest = a;
-  u_char taga = FPU_gettagi(deststnr);
-  u_char saved_sign = getsign(dest);
-  u_char sign = (getsign(a) ^ getsign(b));
-  int tag;
-
+       FPU_REG *a = &st(deststnr);
+       FPU_REG *dest = a;
+       u_char taga = FPU_gettagi(deststnr);
+       u_char saved_sign = getsign(dest);
+       u_char sign = (getsign(a) ^ getsign(b));
+       int tag;
 
-  if ( !(taga | tagb) )
-    {
-      /* Both regs Valid, this should be the most common case. */
+       if (!(taga | tagb)) {
+               /* Both regs Valid, this should be the most common case. */
 
-      tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b));
-      if ( tag < 0 )
-       {
-         setsign(dest, saved_sign);
-         return tag;
+               tag =
+                   FPU_u_mul(a, b, dest, control_w, sign,
+                             exponent(a) + exponent(b));
+               if (tag < 0) {
+                       setsign(dest, saved_sign);
+                       return tag;
+               }
+               FPU_settagi(deststnr, tag);
+               return tag;
        }
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
 
-  if ( taga == TAG_Special )
-    taga = FPU_Special(a);
-  if ( tagb == TAG_Special )
-    tagb = FPU_Special(b);
+       if (taga == TAG_Special)
+               taga = FPU_Special(a);
+       if (tagb == TAG_Special)
+               tagb = FPU_Special(b);
 
-  if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
+       if (((taga == TAG_Valid) && (tagb == TW_Denormal))
            || ((taga == TW_Denormal) && (tagb == TAG_Valid))
-           || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
-    {
-      FPU_REG x, y;
-      if ( denormal_operand() < 0 )
-       return FPU_Exception;
-
-      FPU_to_exp16(a, &x);
-      FPU_to_exp16(b, &y);
-      tag = FPU_u_mul(&x, &y, dest, control_w, sign,
-                     exponent16(&x) + exponent16(&y));
-      if ( tag < 0 )
-       {
-         setsign(dest, saved_sign);
-         return tag;
-       }
-      FPU_settagi(deststnr, tag);
-      return tag;
-    }
-  else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
-    {
-      if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
-          && (denormal_operand() < 0) )
-       return FPU_Exception;
+           || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+               FPU_REG x, y;
+               if (denormal_operand() < 0)
+                       return FPU_Exception;
 
-      /* Must have either both arguments == zero, or
-        one valid and the other zero.
-        The result is therefore zero. */
-      FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
-      /* The 80486 book says that the answer is +0, but a real
-        80486 behaves this way.
-        IEEE-754 apparently says it should be this way. */
-      setsign(dest, sign);
-      return TAG_Zero;
-    }
-      /* Must have infinities, NaNs, etc */
-  else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
-    {
-      return real_2op_NaN(b, tagb, deststnr, &st(0));
-    }
-  else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
-           || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
-    {
-      return arith_invalid(deststnr);  /* Zero*Infinity is invalid */
-    }
-  else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
-           && (denormal_operand() < 0) )
-    {
-      return FPU_Exception;
-    }
-  else if (taga == TW_Infinity)
-    {
-      FPU_copy_to_regi(a, TAG_Special, deststnr);
-      setsign(dest, sign);
-      return TAG_Special;
-    }
-  else if (tagb == TW_Infinity)
-    {
-      FPU_copy_to_regi(b, TAG_Special, deststnr);
-      setsign(dest, sign);
-      return TAG_Special;
-    }
+               FPU_to_exp16(a, &x);
+               FPU_to_exp16(b, &y);
+               tag = FPU_u_mul(&x, &y, dest, control_w, sign,
+                               exponent16(&x) + exponent16(&y));
+               if (tag < 0) {
+                       setsign(dest, saved_sign);
+                       return tag;
+               }
+               FPU_settagi(deststnr, tag);
+               return tag;
+       } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
+               if (((tagb == TW_Denormal) || (taga == TW_Denormal))
+                   && (denormal_operand() < 0))
+                       return FPU_Exception;
 
+               /* Must have either both arguments == zero, or
+                  one valid and the other zero.
+                  The result is therefore zero. */
+               FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+               /* The 80486 book says that the answer is +0, but a real
+                  80486 behaves this way.
+                  IEEE-754 apparently says it should be this way. */
+               setsign(dest, sign);
+               return TAG_Zero;
+       }
+       /* Must have infinities, NaNs, etc */
+       else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+               return real_2op_NaN(b, tagb, deststnr, &st(0));
+       } else if (((taga == TW_Infinity) && (tagb == TAG_Zero))
+                  || ((tagb == TW_Infinity) && (taga == TAG_Zero))) {
+               return arith_invalid(deststnr); /* Zero*Infinity is invalid */
+       } else if (((taga == TW_Denormal) || (tagb == TW_Denormal))
+                  && (denormal_operand() < 0)) {
+               return FPU_Exception;
+       } else if (taga == TW_Infinity) {
+               FPU_copy_to_regi(a, TAG_Special, deststnr);
+               setsign(dest, sign);
+               return TAG_Special;
+       } else if (tagb == TW_Infinity) {
+               FPU_copy_to_regi(b, TAG_Special, deststnr);
+               setsign(dest, sign);
+               return TAG_Special;
+       }
 #ifdef PARANOID
-  else
-    {
-      EXCEPTION(EX_INTERNAL|0x102);
-      return FPU_Exception;
-    }
-#endif /* PARANOID */ 
+       else {
+               EXCEPTION(EX_INTERNAL | 0x102);
+               return FPU_Exception;
+       }
+#endif /* PARANOID */
 
        return 0;
 }
index 59e73302aa60c599c6f5609d2e81a2bf49e3c9df..54a3f226982dcd108db4691eb7cde3c8d9061faf 100644
@@ -10,7 +10,7 @@
 #ifndef _STATUS_H_
 #define _STATUS_H_
 
-#include "fpu_emu.h"    /* for definition of PECULIAR_486 */
+#include "fpu_emu.h"           /* for definition of PECULIAR_486 */
 
 #ifdef __ASSEMBLY__
 #define        Const__(x)      $##x
@@ -34,7 +34,7 @@
 #define SW_Denorm_Op           Const__(0x0002) /* denormalized operand */
 #define SW_Invalid             Const__(0x0001) /* invalid operation */
 
-#define SW_Exc_Mask     Const__(0x27f)  /* Status word exception bit mask */
+#define SW_Exc_Mask     Const__(0x27f) /* Status word exception bit mask */
 
 #ifndef __ASSEMBLY__
 
@@ -50,8 +50,8 @@
   ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
 static inline void setcc(int cc)
 {
-       partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3);
-       partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3);
+       partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
+       partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
 }
 
 #ifdef PECULIAR_486
index 362b4ad082de26dc807e1c6078ac79a05c438882..c36ae88bb543ebdcfd21314d1d2ac47df09de3e3 100644
@@ -2,9 +2,8 @@
 # Makefile for the linux i386-specific parts of the memory manager.
 #
 
-obj-y  := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
+obj-y  := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
 
 obj-$(CONFIG_NUMA) += discontig_32.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
index 6bcb47945b87aee40c9422bd35cfe1bb8f2d0f5f..688c8c28ac8f0ebce22d1719f5743e05b6964468 100644
@@ -2,9 +2,8 @@
 # Makefile for the linux x86_64-specific parts of the memory manager.
 #
 
-obj-y   := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
+obj-y   := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NUMA) += numa_64.o
 obj-$(CONFIG_K8_NUMA) += k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA) += srat_64.o
-
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
deleted file mode 100644
index f14da2a..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * arch/i386/mm/boot_ioremap.c
- * 
- * Re-map functions for early boot-time before paging_init() when the 
- * boot-time pagetables are still in use
- *
- * Written by Dave Hansen <haveblue@us.ibm.com>
- */
-
-
-/*
- * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
- * keeps that from happening.  If anyone has a better way, I'm listening.
- *
- * boot_pte_t is defined only if this all works correctly
- */
-
-#undef CONFIG_X86_PAE
-#undef CONFIG_PARAVIRT
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/init.h>
-#include <linux/stddef.h>
-
-/* 
- * I'm cheating here.  It is known that the two boot PTE pages are 
- * allocated next to each other.  I'm pretending that they're just
- * one big array. 
- */
-
-#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
-
-static unsigned long boot_pte_index(unsigned long vaddr) 
-{
-       return __pa(vaddr) >> PAGE_SHIFT;
-}
-
-static inline boot_pte_t* boot_vaddr_to_pte(void *address)
-{
-       boot_pte_t* boot_pg = (boot_pte_t*)pg0;
-       return &boot_pg[boot_pte_index((unsigned long)address)];
-}
-
-/*
- * This is only for a caller who is clever enough to page-align
- * phys_addr and virtual_source, and who also has a preference
- * about which virtual address from which to steal ptes
- */
-static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, 
-                   void* virtual_source)
-{
-       boot_pte_t* pte;
-       int i;
-       char *vaddr = virtual_source;
-
-       pte = boot_vaddr_to_pte(virtual_source);
-       for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
-               set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
-               __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
-       }
-}
-
-/* the virtual space we're going to remap comes from this array */
-#define BOOT_IOREMAP_PAGES 4
-#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
-static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
-                      __attribute__ ((aligned (PAGE_SIZE)));
-
-/*
- * This only applies to things which need to ioremap before paging_init()
- * bt_ioremap() and plain ioremap() are both useless at this point.
- * 
- * When used, we're still using the boot-time pagetables, which only
- * have 2 PTE pages mapping the first 8MB
- *
- * There is no unmap.  The boot-time PTE pages aren't used after boot.
- * If you really want the space back, just remap it yourself.
- * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
- */
-__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
-{
-       unsigned long last_addr, offset;
-       unsigned int nrpages;
-       
-       last_addr = phys_addr + size - 1;
-
-       /* page align the requested address */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr) - phys_addr;
-       
-       nrpages = size >> PAGE_SHIFT;
-       if (nrpages > BOOT_IOREMAP_PAGES)
-               return NULL;
-       
-       __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
-
-       return &boot_ioremap_space[offset];
-}
index 13a474d3c6e9732d742fb389c7202477c2fa9fda..04b1d20e2613ca16018424a10e037f68cfa36973 100644
@@ -32,6 +32,7 @@
 #include <linux/kexec.h>
 #include <linux/pfn.h>
 #include <linux/swap.h>
+#include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/setup.h>
@@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 unsigned long node_remap_size[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
 /*
@@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid)
        }
 }
 
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * In the discontig memory model, a portion of the kernel virtual area (KVA)
+ * is reserved and portions of nodes are mapped using it. This is to allow
+ * node-local memory to be allocated for structures that would normally require
+ * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
+ * should be prepared to allocate from the bootmem allocator instead. This KVA
+ * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
+ * layout of memory that are broken if alloc_remap() succeeds for some of the
+ * map and fails for others
+ */
+static unsigned long node_remap_start_pfn[MAX_NUMNODES];
+static void *node_remap_end_vaddr[MAX_NUMNODES];
+static void *node_remap_alloc_vaddr[MAX_NUMNODES];
+static unsigned long node_remap_offset[MAX_NUMNODES];
+
 void *alloc_remap(int nid, unsigned long size)
 {
        void *allocation = node_remap_alloc_vaddr[nid];
@@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void)
        return reserve_pages;
 }
 
+static void init_remap_allocator(int nid)
+{
+       node_remap_start_vaddr[nid] = pfn_to_kaddr(
+                       kva_start_pfn + node_remap_offset[nid]);
+       node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+               (node_remap_size[nid] * PAGE_SIZE);
+       node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+               ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+       printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+               (ulong) node_remap_start_vaddr[nid],
+               (ulong) pfn_to_kaddr(highstart_pfn
+                  + node_remap_offset[nid] + node_remap_size[nid]));
+}
+#else
+void *alloc_remap(int nid, unsigned long size)
+{
+       return NULL;
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+       return 0;
+}
+
+static void init_remap_allocator(int nid)
+{
+}
+
+void __init remap_numa_kva(void)
+{
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
 extern void setup_bootmem_allocator(void);
 unsigned long __init setup_memory(void)
 {
        int nid;
        unsigned long system_start_pfn, system_max_low_pfn;
+       unsigned long wasted_pages;
 
        /*
         * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -288,11 +336,18 @@ unsigned long __init setup_memory(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* Numa kva area is below the initrd */
-       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image)
-               kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image)
+       if (initrd_start)
+               kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
                        - kva_pages;
 #endif
-       kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+       /*
+        * We waste pages past at the end of the KVA for no good reason other
+        * than how it is located. This is bad.
+        */
+       wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
+       kva_start_pfn -= wasted_pages;
+       kva_pages += wasted_pages;
 
        system_max_low_pfn = max_low_pfn = find_max_low_pfn();
        printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
@@ -318,19 +373,9 @@ unsigned long __init setup_memory(void)
        printk("Low memory ends at vaddr %08lx\n",
                        (ulong) pfn_to_kaddr(max_low_pfn));
        for_each_online_node(nid) {
-               node_remap_start_vaddr[nid] = pfn_to_kaddr(
-                               kva_start_pfn + node_remap_offset[nid]);
-               /* Init the node remap allocator */
-               node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-                       (node_remap_size[nid] * PAGE_SIZE);
-               node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-                       ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+               init_remap_allocator(nid);
 
                allocate_pgdat(nid);
-               printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
-                       (ulong) node_remap_start_vaddr[nid],
-                       (ulong) pfn_to_kaddr(highstart_pfn
-                          + node_remap_offset[nid] + node_remap_size[nid]));
        }
        printk("High memory starts at vaddr %08lx\n",
                        (ulong) pfn_to_kaddr(highstart_pfn));
@@ -345,7 +390,8 @@ unsigned long __init setup_memory(void)
 
 void __init numa_kva_reserve(void)
 {
-       reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+       if (kva_pages)
+               reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
 }
 
 void __init zone_sizes_init(void)
@@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
+#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
+/*
+ * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
+ *
+ * These stub functions are needed to compile 32-bit NUMA when SRAT is
+ * not set. There are functions in srat_64.c for parsing this table
+ * and it may be possible to make them common functions.
+ */
+void acpi_numa_slit_init (struct acpi_table_slit *slit)
+{
+       printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
+}
+
+void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
+{
+}
+
+void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644 (file)
index 0000000..7e8db53
--- /dev/null
@@ -0,0 +1,62 @@
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+
+
+int fixup_exception(struct pt_regs *regs)
+{
+       const struct exception_table_entry *fixup;
+
+#ifdef CONFIG_PNPBIOS
+       if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+               extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+               extern u32 pnp_bios_is_utter_crap;
+               pnp_bios_is_utter_crap = 1;
+               printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+               __asm__ volatile(
+                       "movl %0, %%esp\n\t"
+                       "jmp *%1\n\t"
+                       : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+               panic("do_trap: can't hit this");
+       }
+#endif
+
+       fixup = search_exception_tables(regs->ip);
+       if (fixup) {
+               regs->ip = fixup->fixup;
+               return 1;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Need to defined our own search_extable on X86_64 to work around
+ * a B stepping K8 bug.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+              const struct exception_table_entry *last,
+              unsigned long value)
+{
+       /* B stepping K8 bug */
+       if ((value >> 32) == 0)
+               value |= 0xffffffffUL << 32;
+
+       while (first <= last) {
+               const struct exception_table_entry *mid;
+               long diff;
+
+               mid = (last - first) / 2 + first;
+               diff = mid->insn - value;
+               if (diff == 0)
+                       return mid;
+               else if (diff < 0)
+                       first = mid+1;
+               else
+                       last = mid-1;
+       }
+       return NULL;
+}
+#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
deleted file mode 100644 (file)
index 0ce4f22..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/i386/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
-       const struct exception_table_entry *fixup;
-
-#ifdef CONFIG_PNPBIOS
-       if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
-       {
-               extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
-               extern u32 pnp_bios_is_utter_crap;
-               pnp_bios_is_utter_crap = 1;
-               printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
-               __asm__ volatile(
-                       "movl %0, %%esp\n\t"
-                       "jmp *%1\n\t"
-                       : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
-               panic("do_trap: can't hit this");
-       }
-#endif
-
-       fixup = search_exception_tables(regs->eip);
-       if (fixup) {
-               regs->eip = fixup->fixup;
-               return 1;
-       }
-
-       return 0;
-}
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
deleted file mode 100644 (file)
index 79ac6e7..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * linux/arch/x86_64/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-
-/* Simple binary search */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-              const struct exception_table_entry *last,
-              unsigned long value)
-{
-       /* Work around a B stepping K8 bug */
-       if ((value >> 32) == 0)
-               value |= 0xffffffffUL << 32; 
-
-        while (first <= last) {
-               const struct exception_table_entry *mid;
-               long diff;
-
-               mid = (last - first) / 2 + first;
-               diff = mid->insn - value;
-                if (diff == 0)
-                        return mid;
-                else if (diff < 0)
-                        first = mid+1;
-                else
-                        last = mid-1;
-        }
-        return NULL;
-}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644 (file)
index 0000000..e28cc52
--- /dev/null
@@ -0,0 +1,986 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>             /* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>             /* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/*
+ * Page fault error code bits
+ *     bit 0 == 0 means no page found, 1 means protection fault
+ *     bit 1 == 0 means read, 1 means write
+ *     bit 2 == 0 means kernel, 1 means user-mode
+ *     bit 3 == 1 means use of reserved bit detected
+ *     bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT                (1<<0)
+#define PF_WRITE       (1<<1)
+#define PF_USER                (1<<2)
+#define PF_RSVD                (1<<3)
+#define PF_INSTR       (1<<4)
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+#ifdef CONFIG_KPROBES
+       int ret = 0;
+
+       /* kprobe_running() needs smp_processor_id() */
+#ifdef CONFIG_X86_32
+       if (!user_mode_vm(regs)) {
+#else
+       if (!user_mode(regs)) {
+#endif
+               preempt_disable();
+               if (kprobe_running() && kprobe_fault_handler(regs, 14))
+                       ret = 1;
+               preempt_enable();
+       }
+
+       return ret;
+#else
+       return 0;
+#endif
+}
+
+/*
+ * X86_32
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * X86_64
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner
+ */
+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                      unsigned long error_code)
+{
+       unsigned char *instr;
+       int scan_more = 1;
+       int prefetch = 0;
+       unsigned char *max_instr;
+
+#ifdef CONFIG_X86_32
+       if (!(__supported_pte_mask & _PAGE_NX))
+               return 0;
+#endif
+
+       /* If it was a exec fault on NX page, ignore */
+       if (error_code & PF_INSTR)
+               return 0;
+
+       instr = (unsigned char *)convert_ip_to_linear(current, regs);
+       max_instr = instr + 15;
+
+       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+               return 0;
+
+       while (scan_more && instr < max_instr) {
+               unsigned char opcode;
+               unsigned char instr_hi;
+               unsigned char instr_lo;
+
+               if (probe_kernel_address(instr, opcode))
+                       break;
+
+               instr_hi = opcode & 0xf0;
+               instr_lo = opcode & 0x0f;
+               instr++;
+
+               switch (instr_hi) {
+               case 0x20:
+               case 0x30:
+                       /*
+                        * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+                        * In X86_64 long mode, the CPU will signal invalid
+                        * opcode if some of these prefixes are present so
+                        * X86_64 will never get here anyway
+                        */
+                       scan_more = ((instr_lo & 7) == 0x6);
+                       break;
+#ifdef CONFIG_X86_64
+               case 0x40:
+                       /*
+                        * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+                        * Need to figure out under what instruction mode the
+                        * instruction was issued. Could check the LDT for lm,
+                        * but for now it's good enough to assume that long
+                        * mode only uses well known segments or kernel.
+                        */
+                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+                       break;
+#endif
+               case 0x60:
+                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
+                       scan_more = (instr_lo & 0xC) == 0x4;
+                       break;
+               case 0xF0:
+                       /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+                       scan_more = !instr_lo || (instr_lo>>1) == 1;
+                       break;
+               case 0x00:
+                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
+                       scan_more = 0;
+
+                       if (probe_kernel_address(instr, opcode))
+                               break;
+                       prefetch = (instr_lo == 0xF) &&
+                               (opcode == 0x0D || opcode == 0x18);
+                       break;
+               default:
+                       scan_more = 0;
+                       break;
+               }
+       }
+       return prefetch;
+}
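
To make the prefix scan above concrete, the stand-alone sketch below (a simplified, user-space miniature of the same classifier; the byte sequence is just an example) walks 2E 0F 18 06, i.e. prefetchnta with a CS segment-override prefix: the 0x2E byte keeps the scan going, the 0x0F escape byte stops it, and the following 0x18 identifies a prefetch.

/* Miniature of the scan loop above, run over a literal byte sequence
 * instead of the faulting instruction pointer (REX handling omitted). */
#include <stdio.h>

static int looks_like_prefetch(const unsigned char *instr, int len)
{
        const unsigned char *max_instr = instr + len;
        int scan_more = 1, prefetch = 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode = *instr++;
                unsigned char hi = opcode & 0xf0, lo = opcode & 0x0f;

                switch (hi) {
                case 0x20: case 0x30:           /* 0x26/0x2E/0x36/0x3E overrides */
                        scan_more = ((lo & 7) == 0x6);
                        break;
                case 0x60:                      /* 0x64..0x67 prefixes */
                        scan_more = (lo & 0xC) == 0x4;
                        break;
                case 0xF0:                      /* lock/rep prefixes */
                        scan_more = !lo || (lo >> 1) == 1;
                        break;
                case 0x00:                      /* 0x0F escape byte */
                        scan_more = 0;
                        if (instr < max_instr)
                                prefetch = (lo == 0xF) &&
                                           (*instr == 0x0D || *instr == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

int main(void)
{
        /* CS override + prefetchnta [esi]: 2E 0F 18 06 */
        const unsigned char seq[] = { 0x2E, 0x0F, 0x18, 0x06 };
        printf("%d\n", looks_like_prefetch(seq, sizeof(seq)));  /* prints 1 */
        return 0;
}
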
+
+static void force_sig_info_fault(int si_signo, int si_code,
+       unsigned long address, struct task_struct *tsk)
+{
+       siginfo_t info;
+
+       info.si_signo = si_signo;
+       info.si_errno = 0;
+       info.si_code = si_code;
+       info.si_addr = (void __user *)address;
+       force_sig_info(si_signo, &info, tsk);
+}
+
+#ifdef CONFIG_X86_64
+static int bad_address(void *p)
+{
+       unsigned long dummy;
+       return probe_kernel_address((unsigned long *)p, dummy);
+}
+#endif
+
+void dump_pagetable(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+       __typeof__(pte_val(__pte(0))) page;
+
+       page = read_cr3();
+       page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+       printk("*pdpt = %016Lx ", page);
+       if ((page >> PAGE_SHIFT) < max_low_pfn
+           && page & _PAGE_PRESENT) {
+               page &= PAGE_MASK;
+               page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+                                                        & (PTRS_PER_PMD - 1)];
+               printk(KERN_CONT "*pde = %016Lx ", page);
+               page &= ~_PAGE_NX;
+       }
+#else
+       printk("*pde = %08lx ", page);
+#endif
+
+       /*
+        * We must not directly access the pte in the highpte
+        * case if the page table is located in highmem.
+        * And let's rather not kmap-atomic the pte, just in case
+        * it's allocated already.
+        */
+       if ((page >> PAGE_SHIFT) < max_low_pfn
+           && (page & _PAGE_PRESENT)
+           && !(page & _PAGE_PSE)) {
+               page &= PAGE_MASK;
+               page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+                                                        & (PTRS_PER_PTE - 1)];
+               printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+       }
+
+       printk("\n");
+#else /* CONFIG_X86_64 */
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = (pgd_t *)read_cr3();
+
+       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+       pgd += pgd_index(address);
+       if (bad_address(pgd)) goto bad;
+       printk("PGD %lx ", pgd_val(*pgd));
+       if (!pgd_present(*pgd)) goto ret;
+
+       pud = pud_offset(pgd, address);
+       if (bad_address(pud)) goto bad;
+       printk("PUD %lx ", pud_val(*pud));
+       if (!pud_present(*pud)) goto ret;
+
+       pmd = pmd_offset(pud, address);
+       if (bad_address(pmd)) goto bad;
+       printk("PMD %lx ", pmd_val(*pmd));
+       if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (bad_address(pte)) goto bad;
+       printk("PTE %lx", pte_val(*pte));
+ret:
+       printk("\n");
+       return;
+bad:
+       printk("BAD\n");
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+       unsigned index = pgd_index(address);
+       pgd_t *pgd_k;
+       pud_t *pud, *pud_k;
+       pmd_t *pmd, *pmd_k;
+
+       pgd += index;
+       pgd_k = init_mm.pgd + index;
+
+       if (!pgd_present(*pgd_k))
+               return NULL;
+
+       /*
+        * set_pgd(pgd, *pgd_k); here would be useless on PAE
+        * and redundant with the set_pmd() on non-PAE. As would
+        * set_pud.
+        */
+
+       pud = pud_offset(pgd, address);
+       pud_k = pud_offset(pgd_k, address);
+       if (!pud_present(*pud_k))
+               return NULL;
+
+       pmd = pmd_offset(pud, address);
+       pmd_k = pmd_offset(pud_k, address);
+       if (!pmd_present(*pmd_k))
+               return NULL;
+       if (!pmd_present(*pmd)) {
+               set_pmd(pmd, *pmd_k);
+               arch_flush_lazy_mmu_mode();
+       } else
+               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+       return pmd_k;
+}
+#endif
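
For orientation, on a non-PAE i386 kernel with the usual 3G/1G split each top-level entry covers 4 MB (PGDIR_SHIFT is 22), so kernel addresses from 0xC0000000 up land in pgd slots 768..1023; vmalloc_sync_one() copies one such slot's pmd from init_mm into the faulting process's pgd. A tiny sketch of that index arithmetic, with the constants hard-coded for illustration:

/* Illustrative: which top-level slot a kernel address lands in on
 * non-PAE i386 (PGDIR_SHIFT == 22, 1024 pgd entries). */
#include <stdio.h>

#define EX_PGDIR_SHIFT   22
#define EX_PTRS_PER_PGD  1024

static unsigned ex_pgd_index(unsigned long address)
{
        return (address >> EX_PGDIR_SHIFT) & (EX_PTRS_PER_PGD - 1);
}

int main(void)
{
        printf("%u\n", ex_pgd_index(0xC0000000UL));  /* 768: first kernel slot     */
        printf("%u\n", ex_pgd_index(0xE0800000UL));  /* 898: an example vmalloc VA */
        return 0;
}
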
+
+#ifdef CONFIG_X86_64
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8.
+   Many BIOSes that didn't get tested properly miss this.
+   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle kernel faults here.
+   Does nothing on X86_32.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+       static int warned;
+       if (address != regs->ip)
+               return 0;
+       if ((address >> 32) != 0)
+               return 0;
+       address |= 0xffffffffUL << 32;
+       if ((address >= (u64)_stext && address <= (u64)_etext) ||
+           (address >= MODULES_VADDR && address <= MODULES_END)) {
+               if (!warned) {
+                       printk(errata93_warning);
+                       warned = 1;
+               }
+               regs->ip = address;
+               return 1;
+       }
+#endif
+       return 0;
+}
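
The fixup above amounts to re-sign-extending a truncated kernel RIP. A quick stand-alone illustration (the example value is made up; it assumes a 64-bit build where unsigned long is 64 bits, as in the code above):

/* Illustrative: what the erratum-93 fixup does to a truncated RIP. */
#include <stdio.h>

int main(void)
{
        unsigned long rip = 0x80123456UL;       /* upper 32 bits lost by the BIOS bug */

        if ((rip >> 32) == 0) {
                rip |= 0xffffffffUL << 32;      /* same OR as in is_errata93() */
                printf("repaired RIP: %#lx\n", rip);    /* 0xffffffff80123456 */
        }
        return 0;
}
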
+
+/*
+ * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
+ * addresses >4GB.  We catch this in the page fault handler because these
+ * addresses are not reachable. Just detect this case and return.  Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+       if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+           (address >> 32))
+               return 1;
+#endif
+       return 0;
+}
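
The regs->cs & (1<<2) test above checks the selector's Table Indicator bit, i.e. whether CS came from the LDT; the other accepted case is the literal __USER32_CS GDT selector. A small decode of an x86 segment selector, for reference (the example value 0x23 is the conventional 32-bit user code selector):

/* Illustrative: the fields packed into a 16-bit x86 segment selector. */
#include <stdio.h>

int main(void)
{
        unsigned short sel = 0x0023;            /* typical 32-bit user CS */

        printf("index=%u  TI=%u (%s)  RPL=%u\n",
               sel >> 3,                        /* descriptor table index    */
               (sel >> 2) & 1,                  /* 0 = GDT, 1 = LDT          */
               (sel & (1 << 2)) ? "LDT" : "GDT",
               sel & 3);                        /* requested privilege level */
        return 0;
}
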
+
+void do_invalid_op(struct pt_regs *, unsigned long);
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+       unsigned long nr;
+       /*
+        * Pentium F0 0F C7 C8 bug workaround.
+        */
+       if (boot_cpu_data.f00f_bug) {
+               nr = (address - idt_descr.address) >> 3;
+
+               if (nr == 6) {
+                       do_invalid_op(regs, 0);
+                       return 1;
+               }
+       }
+#endif
+       return 0;
+}
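
The >> 3 works because i386 IDT entries are 8 bytes, so the entry number is the byte offset divided by 8, and vector 6 is #UD (invalid opcode): with the F00F workaround the IDT sits in a read-only mapping, so the buggy locked cmpxchg8b sequence shows up here as a page fault on the #UD gate instead of hanging the CPU. The index arithmetic, as a tiny sketch (the base address is invented for the example):

/* Illustrative: recovering the IDT vector number from a fault address. */
#include <stdio.h>

int main(void)
{
        unsigned long idt_base   = 0xffffe000UL;        /* made-up mapping address */
        unsigned long fault_addr = idt_base + 6 * 8;    /* access to the #UD gate  */

        printf("vector = %lu\n", (fault_addr - idt_base) >> 3);  /* prints 6 */
        return 0;
}
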
+
+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+                           unsigned long address)
+{
+#ifdef CONFIG_X86_32
+       if (!oops_may_print())
+               return;
+#endif
+
+#ifdef CONFIG_X86_PAE
+       if (error_code & PF_INSTR) {
+               int level;
+               pte_t *pte = lookup_address(address, &level);
+
+               if (pte && pte_present(*pte) && !pte_exec(*pte))
+                       printk(KERN_CRIT "kernel tried to execute "
+                               "NX-protected page - exploit attempt? "
+                               "(uid: %d)\n", current->uid);
+       }
+#endif
+
+       printk(KERN_ALERT "BUG: unable to handle kernel ");
+       if (address < PAGE_SIZE)
+               printk(KERN_CONT "NULL pointer dereference");
+       else
+               printk(KERN_CONT "paging request");
+#ifdef CONFIG_X86_32
+       printk(KERN_CONT " at %08lx\n", address);
+#else
+       printk(KERN_CONT " at %016lx\n", address);
+#endif
+       printk(KERN_ALERT "IP:");
+       printk_address(regs->ip, 1);
+       dump_pagetable(address);
+}
+
+#ifdef CONFIG_X86_64
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+                                unsigned long error_code)
+{
+       unsigned long flags = oops_begin();
+       struct task_struct *tsk;
+
+       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+              current->comm, address);
+       dump_pagetable(address);
+       tsk = current;
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+       if (__die("Bad pagetable", regs, error_code))
+               regs = NULL;
+       oops_end(flags, regs, SIGKILL);
+}
+#endif
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.  This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_USER | PF_RSVD))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+
+       if ((error_code & PF_WRITE) && !pte_write(*pte))
+               return 0;
+       if ((error_code & PF_INSTR) && !pte_exec(*pte))
+               return 0;
+
+       return 1;
+}
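
The net effect: for a kernel, non-reserved fault, if the current page tables already permit the access, the fault can only have come from a stale TLB entry and is safe to ignore. A stand-alone model of that decision (fake_pte and its fields are invented for the illustration; they stand in for the real pte predicates):

/* Illustrative model of the spurious-fault decision above. */
#include <stdio.h>

#define PF_PROT   (1 << 0)
#define PF_WRITE  (1 << 1)
#define PF_USER   (1 << 2)
#define PF_RSVD   (1 << 3)
#define PF_INSTR  (1 << 4)

struct fake_pte { int present, write, exec; };

static int spurious(unsigned long error_code, struct fake_pte pte)
{
        if (error_code & (PF_USER | PF_RSVD))
                return 0;                       /* only kernel, non-reserved faults */
        if (!pte.present)
                return 0;                       /* nothing mapped: a real fault */
        if ((error_code & PF_WRITE) && !pte.write)
                return 0;                       /* the write really is forbidden */
        if ((error_code & PF_INSTR) && !pte.exec)
                return 0;                       /* the fetch really is forbidden */
        return 1;                               /* stale TLB entry: just retry */
}

int main(void)
{
        struct fake_pte now_rw = { 1, 1, 0 };
        /* Kernel write that faulted only because the TLB still caches RO: */
        printf("%d\n", spurious(PF_PROT | PF_WRITE, now_rw));   /* prints 1 */
        return 0;
}
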
+
+/*
+ * X86_32
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * X86_64
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+       unsigned long pgd_paddr;
+       pmd_t *pmd_k;
+       pte_t *pte_k;
+       /*
+        * Synchronize this task's top level page-table
+        * with the 'reference' page table.
+        *
+        * Do _not_ use "current" here. We might be inside
+        * an interrupt in the middle of a task switch..
+        */
+       pgd_paddr = read_cr3();
+       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+       if (!pmd_k)
+               return -1;
+       pte_k = pte_offset_kernel(pmd_k, address);
+       if (!pte_present(*pte_k))
+               return -1;
+       return 0;
+#else
+       pgd_t *pgd, *pgd_ref;
+       pud_t *pud, *pud_ref;
+       pmd_t *pmd, *pmd_ref;
+       pte_t *pte, *pte_ref;
+
+       /* Copy kernel mappings over when needed. This can also
+          happen within a race in a page table update. In the
+          latter case just flush. */
+
+       pgd = pgd_offset(current->mm ?: &init_mm, address);
+       pgd_ref = pgd_offset_k(address);
+       if (pgd_none(*pgd_ref))
+               return -1;
+       if (pgd_none(*pgd))
+               set_pgd(pgd, *pgd_ref);
+       else
+               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+       /* Below here mismatches are bugs because these lower tables
+          are shared */
+
+       pud = pud_offset(pgd, address);
+       pud_ref = pud_offset(pgd_ref, address);
+       if (pud_none(*pud_ref))
+               return -1;
+       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+               BUG();
+       pmd = pmd_offset(pud, address);
+       pmd_ref = pmd_offset(pud_ref, address);
+       if (pmd_none(*pmd_ref))
+               return -1;
+       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+               BUG();
+       pte_ref = pte_offset_kernel(pmd_ref, address);
+       if (!pte_present(*pte_ref))
+               return -1;
+       pte = pte_offset_kernel(pmd, address);
+       /* Don't use pte_page here, because the mappings can point
+          outside mem_map, and the NUMA hash lookup cannot handle
+          that. */
+       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+               BUG();
+       return 0;
+#endif
+}
+
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       unsigned long address;
+       int write, si_code;
+       int fault;
+#ifdef CONFIG_X86_64
+       unsigned long flags;
+#endif
+
+       /*
+        * We can fault from pretty much anywhere, with unknown IRQ state.
+        */
+       trace_hardirqs_fixup();
+
+       tsk = current;
+       mm = tsk->mm;
+       prefetchw(&mm->mmap_sem);
+
+       /* get the address */
+       address = read_cr2();
+
+       si_code = SEGV_MAPERR;
+
+       if (notify_page_fault(regs))
+               return;
+
+       /*
+        * We fault-in kernel-space virtual memory on-demand. The
+        * 'reference' page table is init_mm.pgd.
+        *
+        * NOTE! We MUST NOT take any locks for this case. We may
+        * be in an interrupt or a critical region, and should
+        * only copy the information from the master page table,
+        * nothing more.
+        *
+        * This verifies that the fault happens in kernel space
+        * (error_code & 4) == 0, and that the fault was not a
+        * protection error (error_code & 9) == 0.
+        */
+#ifdef CONFIG_X86_32
+       if (unlikely(address >= TASK_SIZE)) {
+               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+                   vmalloc_fault(address) >= 0)
+                       return;
+
+               /* Can handle a stale RO->RW TLB */
+               if (spurious_fault(address, error_code))
+                       return;
+
+               /*
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock.
+                */
+               goto bad_area_nosemaphore;
+       }
+
+       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+          fault has been handled. */
+       if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+               local_irq_enable();
+
+       /*
+        * If we're in an interrupt, have no user context or are running in an
+        * atomic region then we must not take the fault.
+        */
+       if (in_atomic() || !mm)
+               goto bad_area_nosemaphore;
+#else /* CONFIG_X86_64 */
+       if (unlikely(address >= TASK_SIZE64)) {
+               /*
+                * Don't check for the module range here: its PML4
+                * is always initialized because it's shared with the main
+                * kernel text. Only vmalloc may need PML4 syncups.
+                */
+               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
+                       if (vmalloc_fault(address) >= 0)
+                               return;
+               }
+
+               /* Can handle a stale RO->RW TLB */
+               if (spurious_fault(address, error_code))
+                       return;
+
+               /*
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock.
+                */
+               goto bad_area_nosemaphore;
+       }
+       if (likely(regs->flags & X86_EFLAGS_IF))
+               local_irq_enable();
+
+       if (unlikely(error_code & PF_RSVD))
+               pgtable_bad(address, regs, error_code);
+
+       /*
+        * If we're in an interrupt, have no user context or are running in an
+        * atomic region then we must not take the fault.
+        */
+       if (unlikely(in_atomic() || !mm))
+               goto bad_area_nosemaphore;
+
+       /*
+        * User-mode registers count as a user access even for any
+        * potential system fault or CPU buglet.
+        */
+       if (user_mode_vm(regs))
+               error_code |= PF_USER;
+again:
+#endif
+       /* When running in the kernel we expect faults to occur only to
+        * addresses in user space.  All other faults represent errors in the
+        * kernel and should generate an OOPS.  Unfortunately, in the case of an
+        * erroneous fault occurring in a code path which already holds mmap_sem
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+        * exceptions table.
+        *
+        * As the vast majority of faults will be valid we will only perform
+        * the source reference check when there is a possibility of a deadlock.
+        * Attempt to lock the address space, if we cannot we then validate the
+        * source.  If this is invalid we can skip the address space check,
+        * thus avoiding the deadlock.
+        */
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               if ((error_code & PF_USER) == 0 &&
+                   !search_exception_tables(regs->ip))
+                       goto bad_area_nosemaphore;
+               down_read(&mm->mmap_sem);
+       }
+
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto bad_area;
+       if (vma->vm_start <= address)
+               goto good_area;
+       if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto bad_area;
+       if (error_code & PF_USER) {
+               /*
+                * Accessing the stack below %sp is always a bug.
+                * The large cushion allows instructions like enter
+                * and pusha to work.  ("enter $65535,$31" pushes
+                * 32 pointers and then decrements %sp by 65535.)
+                */
+               if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+                       goto bad_area;
+       }
+       if (expand_stack(vma, address))
+               goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+       si_code = SEGV_ACCERR;
+       write = 0;
+       switch (error_code & (PF_PROT|PF_WRITE)) {
+       default:        /* 3: write, present */
+               /* fall through */
+       case PF_WRITE:          /* write, not present */
+               if (!(vma->vm_flags & VM_WRITE))
+                       goto bad_area;
+               write++;
+               break;
+       case PF_PROT:           /* read, present */
+               goto bad_area;
+       case 0:                 /* read, not present */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       goto bad_area;
+       }
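
Spelled out, the switch above maps the two low error-code bits onto the VMA permission that must be present; the sketch below (illustrative only, with stand-in VM_* values) returns the flags of which at least one must be set, or marks the read-of-present case as always bad.

/* Illustrative: which vma permissions each error-code combination needs. */
#include <stdio.h>

#define PF_PROT   (1 << 0)
#define PF_WRITE  (1 << 1)
#define VM_READ   0x1
#define VM_WRITE  0x2
#define VM_EXEC   0x4

static int required_vm_flags(unsigned long error_code, int *always_bad)
{
        *always_bad = 0;
        switch (error_code & (PF_PROT | PF_WRITE)) {
        case PF_PROT | PF_WRITE:        /* write, present     */
        case PF_WRITE:                  /* write, not present */
                return VM_WRITE;
        case PF_PROT:                   /* read, present      */
                *always_bad = 1;        /* handled as bad_area */
                return 0;
        default:                        /* read, not present  */
                return VM_READ | VM_EXEC | VM_WRITE;
        }
}

int main(void)
{
        int bad;
        printf("%#x\n", required_vm_flags(PF_WRITE, &bad));     /* 0x2 */
        printf("%#x\n", required_vm_flags(0, &bad));            /* 0x7 */
        return 0;
}
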
+
+#ifdef CONFIG_X86_32
+survive:
+#endif
+       /*
+        * If for any reason at all we couldn't handle the fault,
+        * make sure we exit gracefully rather than endlessly redo
+        * the fault.
+        */
+       fault = handle_mm_fault(mm, vma, address, write);
+       if (unlikely(fault & VM_FAULT_ERROR)) {
+               if (fault & VM_FAULT_OOM)
+                       goto out_of_memory;
+               else if (fault & VM_FAULT_SIGBUS)
+                       goto do_sigbus;
+               BUG();
+       }
+       if (fault & VM_FAULT_MAJOR)
+               tsk->maj_flt++;
+       else
+               tsk->min_flt++;
+
+#ifdef CONFIG_X86_32
+       /*
+        * Did it hit the DOS screen memory VA from vm86 mode?
+        */
+       if (v8086_mode(regs)) {
+               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+               if (bit < 32)
+                       tsk->thread.screen_bitmap |= 1 << bit;
+       }
+#endif
+       up_read(&mm->mmap_sem);
+       return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+       up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & PF_USER) {
+               /*
+                * It's possible to have interrupts off here.
+                */
+               local_irq_enable();
+
+               /*
+                * Valid to do another page fault here because this one came
+                * from user space.
+                */
+               if (is_prefetch(regs, address, error_code))
+                       return;
+
+               if (is_errata100(regs, address))
+                       return;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit()) {
+                       printk(
+#ifdef CONFIG_X86_32
+                       "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+#else
+                       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+#endif
+                       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+                       tsk->comm, task_pid_nr(tsk), address, regs->ip,
+                       regs->sp, error_code);
+                       print_vma_addr(" in ", regs->ip);
+                       printk("\n");
+               }
+
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               return;
+       }
+
+       if (is_f00f_bug(regs, address))
+               return;
+
+no_context:
+       /* Are we prepared to handle this kernel fault?  */
+       if (fixup_exception(regs))
+               return;
+
+       /*
+        * X86_32
+        * Valid to do another page fault here, because if this fault
+        * had been triggered by is_prefetch fixup_exception would have
+        * handled it.
+        *
+        * X86_64
+        * Hall of shame of CPU/BIOS bugs.
+        */
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+       if (is_errata93(regs, address))
+               return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+       bust_spinlocks(1);
+#else
+       flags = oops_begin();
+#endif
+
+       show_fault_oops(regs, error_code, address);
+
+       tsk->thread.cr2 = address;
+       tsk->thread.trap_no = 14;
+       tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+       die("Oops", regs, error_code);
+       bust_spinlocks(0);
+       do_exit(SIGKILL);
+#else
+       if (__die("Oops", regs, error_code))
+               regs = NULL;
+       /* Executive summary in case the body of the oops scrolled away */
+       printk(KERN_EMERG "CR2: %016lx\n", address);
+       oops_end(flags, regs, SIGKILL);
+#endif
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+       up_read(&mm->mmap_sem);
+       if (is_global_init(tsk)) {
+               yield();
+#ifdef CONFIG_X86_32
+               down_read(&mm->mmap_sem);
+               goto survive;
+#else
+               goto again;
+#endif
+       }
+
+       printk("VM: killing process %s\n", tsk->comm);
+       if (error_code & PF_USER)
+               do_group_exit(SIGKILL);
+       goto no_context;
+
+do_sigbus:
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & PF_USER))
+               goto no_context;
+#ifdef CONFIG_X86_32
+       /* User space => ok to do another page fault */
+       if (is_prefetch(regs, address, error_code))
+               return;
+#endif
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+void vmalloc_sync_all(void)
+{
+#ifdef CONFIG_X86_32
+       /*
+        * Note that races in the updates of insync and start aren't
+        * problematic: insync can only get set bits added, and updates to
+        * start are only improving performance (without affecting correctness
+        * if undone).
+        */
+       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+       static unsigned long start = TASK_SIZE;
+       unsigned long address;
+
+       if (SHARED_KERNEL_PMD)
+               return;
+
+       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+               if (!test_bit(pgd_index(address), insync)) {
+                       unsigned long flags;
+                       struct page *page;
+
+                       spin_lock_irqsave(&pgd_lock, flags);
+                       list_for_each_entry(page, &pgd_list, lru) {
+                               if (!vmalloc_sync_one(page_address(page),
+                                                     address))
+                                       break;
+                       }
+                       spin_unlock_irqrestore(&pgd_lock, flags);
+                       if (!page)
+                               set_bit(pgd_index(address), insync);
+               }
+               if (address == start && test_bit(pgd_index(address), insync))
+                       start = address + PGDIR_SIZE;
+       }
+#else /* CONFIG_X86_64 */
+       /*
+        * Note that races in the updates of insync and start aren't
+        * problematic: insync can only get set bits added, and updates to
+        * start are only improving performance (without affecting correctness
+        * if undone).
+        */
+       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+       static unsigned long start = VMALLOC_START & PGDIR_MASK;
+       unsigned long address;
+
+       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+               if (!test_bit(pgd_index(address), insync)) {
+                       const pgd_t *pgd_ref = pgd_offset_k(address);
+                       struct page *page;
+
+                       if (pgd_none(*pgd_ref))
+                               continue;
+                       spin_lock(&pgd_lock);
+                       list_for_each_entry(page, &pgd_list, lru) {
+                               pgd_t *pgd;
+                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                               if (pgd_none(*pgd))
+                                       set_pgd(pgd, *pgd_ref);
+                               else
+                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+                       }
+                       spin_unlock(&pgd_lock);
+                       set_bit(pgd_index(address), insync);
+               }
+               if (address == start)
+                       start = address + PGDIR_SIZE;
+       }
+       /* Check that there is no need to do the same for the modules area. */
+       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+                               (__START_KERNEL & PGDIR_MASK)));
+#endif
+}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
deleted file mode 100644 (file)
index a2273d4..0000000
+++ /dev/null
@@ -1,659 +0,0 @@
-/*
- *  linux/arch/i386/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>             /* For unblank_screen() */
-#include <linux/highmem.h>
-#include <linux/bootmem.h>             /* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-
-extern void die(const char *,struct pt_regs *,long);
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-       int ret = 0;
-
-       /* kprobe_running() needs smp_processor_id() */
-       if (!user_mode_vm(regs)) {
-               preempt_disable();
-               if (kprobe_running() && kprobe_fault_handler(regs, 14))
-                       ret = 1;
-               preempt_enable();
-       }
-
-       return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-       return 0;
-}
-#endif
-
-/*
- * Return EIP plus the CS segment base.  The segment limit is also
- * adjusted, clamped to the kernel/user address space (whichever is
- * appropriate), and returned in *eip_limit.
- *
- * The segment is checked, because it might have been changed by another
- * task between the original faulting instruction and here.
- *
- * If CS is no longer a valid code segment, or if EIP is beyond the
- * limit, or if it is a kernel address when CS is not a kernel segment,
- * then the returned value will be greater than *eip_limit.
- * 
- * This is slow, but is very rarely executed.
- */
-static inline unsigned long get_segment_eip(struct pt_regs *regs,
-                                           unsigned long *eip_limit)
-{
-       unsigned long eip = regs->eip;
-       unsigned seg = regs->xcs & 0xffff;
-       u32 seg_ar, seg_limit, base, *desc;
-
-       /* Unlikely, but must come before segment checks. */
-       if (unlikely(regs->eflags & VM_MASK)) {
-               base = seg << 4;
-               *eip_limit = base + 0xffff;
-               return base + (eip & 0xffff);
-       }
-
-       /* The standard kernel/user address space limit. */
-       *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
-       
-       /* By far the most common cases. */
-       if (likely(SEGMENT_IS_FLAT_CODE(seg)))
-               return eip;
-
-       /* Check the segment exists, is within the current LDT/GDT size,
-          that kernel/user (ring 0..3) has the appropriate privilege,
-          that it's a code segment, and get the limit. */
-       __asm__ ("larl %3,%0; lsll %3,%1"
-                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
-       if ((~seg_ar & 0x9800) || eip > seg_limit) {
-               *eip_limit = 0;
-               return 1;        /* So that returned eip > *eip_limit. */
-       }
-
-       /* Get the GDT/LDT descriptor base. 
-          When you look for races in this code remember that
-          LDT and other horrors are only used in user space. */
-       if (seg & (1<<2)) {
-               /* Must lock the LDT while reading it. */
-               mutex_lock(&current->mm->context.lock);
-               desc = current->mm->context.ldt;
-               desc = (void *)desc + (seg & ~7);
-       } else {
-               /* Must disable preemption while reading the GDT. */
-               desc = (u32 *)get_cpu_gdt_table(get_cpu());
-               desc = (void *)desc + (seg & ~7);
-       }
-
-       /* Decode the code segment base from the descriptor */
-       base = get_desc_base((unsigned long *)desc);
-
-       if (seg & (1<<2)) { 
-               mutex_unlock(&current->mm->context.lock);
-       } else
-               put_cpu();
-
-       /* Adjust EIP and segment limit, and clamp at the kernel limit.
-          It's legitimate for segments to wrap at 0xffffffff. */
-       seg_limit += base;
-       if (seg_limit < *eip_limit && seg_limit >= base)
-               *eip_limit = seg_limit;
-       return eip + base;
-}
-
-/* 
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- */
-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
-{ 
-       unsigned long limit;
-       unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
-       int scan_more = 1;
-       int prefetch = 0; 
-       int i;
-
-       for (i = 0; scan_more && i < 15; i++) { 
-               unsigned char opcode;
-               unsigned char instr_hi;
-               unsigned char instr_lo;
-
-               if (instr > (unsigned char *)limit)
-                       break;
-               if (probe_kernel_address(instr, opcode))
-                       break; 
-
-               instr_hi = opcode & 0xf0; 
-               instr_lo = opcode & 0x0f; 
-               instr++;
-
-               switch (instr_hi) { 
-               case 0x20:
-               case 0x30:
-                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
-                       scan_more = ((instr_lo & 7) == 0x6);
-                       break;
-                       
-               case 0x60:
-                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
-                       scan_more = (instr_lo & 0xC) == 0x4;
-                       break;          
-               case 0xF0:
-                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
-                       scan_more = !instr_lo || (instr_lo>>1) == 1;
-                       break;                  
-               case 0x00:
-                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
-                       scan_more = 0;
-                       if (instr > (unsigned char *)limit)
-                               break;
-                       if (probe_kernel_address(instr, opcode))
-                               break;
-                       prefetch = (instr_lo == 0xF) &&
-                               (opcode == 0x0D || opcode == 0x18);
-                       break;                  
-               default:
-                       scan_more = 0;
-                       break;
-               } 
-       }
-       return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                             unsigned long error_code)
-{
-       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-                    boot_cpu_data.x86 >= 6)) {
-               /* Catch an obscure case of prefetch inside an NX page. */
-               if (nx_enabled && (error_code & 16))
-                       return 0;
-               return __is_prefetch(regs, addr);
-       }
-       return 0;
-} 
-
-static noinline void force_sig_info_fault(int si_signo, int si_code,
-       unsigned long address, struct task_struct *tsk)
-{
-       siginfo_t info;
-
-       info.si_signo = si_signo;
-       info.si_errno = 0;
-       info.si_code = si_code;
-       info.si_addr = (void __user *)address;
-       force_sig_info(si_signo, &info, tsk);
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-       unsigned index = pgd_index(address);
-       pgd_t *pgd_k;
-       pud_t *pud, *pud_k;
-       pmd_t *pmd, *pmd_k;
-
-       pgd += index;
-       pgd_k = init_mm.pgd + index;
-
-       if (!pgd_present(*pgd_k))
-               return NULL;
-
-       /*
-        * set_pgd(pgd, *pgd_k); here would be useless on PAE
-        * and redundant with the set_pmd() on non-PAE. As would
-        * set_pud.
-        */
-
-       pud = pud_offset(pgd, address);
-       pud_k = pud_offset(pgd_k, address);
-       if (!pud_present(*pud_k))
-               return NULL;
-
-       pmd = pmd_offset(pud, address);
-       pmd_k = pmd_offset(pud_k, address);
-       if (!pmd_present(*pmd_k))
-               return NULL;
-       if (!pmd_present(*pmd)) {
-               set_pmd(pmd, *pmd_k);
-               arch_flush_lazy_mmu_mode();
-       } else
-               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-       return pmd_k;
-}
-
-/*
- * Handle a fault on the vmalloc or module mapping area
- *
- * This assumes no large pages in there.
- */
-static inline int vmalloc_fault(unsigned long address)
-{
-       unsigned long pgd_paddr;
-       pmd_t *pmd_k;
-       pte_t *pte_k;
-       /*
-        * Synchronize this task's top level page-table
-        * with the 'reference' page table.
-        *
-        * Do _not_ use "current" here. We might be inside
-        * an interrupt in the middle of a task switch..
-        */
-       pgd_paddr = read_cr3();
-       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-       if (!pmd_k)
-               return -1;
-       pte_k = pte_offset_kernel(pmd_k, address);
-       if (!pte_present(*pte_k))
-               return -1;
-       return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- *     bit 0 == 0 means no page found, 1 means protection fault
- *     bit 1 == 0 means read, 1 means write
- *     bit 2 == 0 means kernel, 1 means user-mode
- *     bit 3 == 1 means use of reserved bit detected
- *     bit 4 == 1 means fault was an instruction fetch
- */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-                                     unsigned long error_code)
-{
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       struct vm_area_struct * vma;
-       unsigned long address;
-       int write, si_code;
-       int fault;
-
-       /*
-        * We can fault from pretty much anywhere, with unknown IRQ state.
-        */
-       trace_hardirqs_fixup();
-
-       /* get the address */
-        address = read_cr2();
-
-       tsk = current;
-
-       si_code = SEGV_MAPERR;
-
-       /*
-        * We fault-in kernel-space virtual memory on-demand. The
-        * 'reference' page table is init_mm.pgd.
-        *
-        * NOTE! We MUST NOT take any locks for this case. We may
-        * be in an interrupt or a critical region, and should
-        * only copy the information from the master page table,
-        * nothing more.
-        *
-        * This verifies that the fault happens in kernel space
-        * (error_code & 4) == 0, and that the fault was not a
-        * protection error (error_code & 9) == 0.
-        */
-       if (unlikely(address >= TASK_SIZE)) {
-               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
-                       return;
-               if (notify_page_fault(regs))
-                       return;
-               /*
-                * Don't take the mm semaphore here. If we fixup a prefetch
-                * fault we could otherwise deadlock.
-                */
-               goto bad_area_nosemaphore;
-       }
-
-       if (notify_page_fault(regs))
-               return;
-
-       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
-          fault has been handled. */
-       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-               local_irq_enable();
-
-       mm = tsk->mm;
-
-       /*
-        * If we're in an interrupt, have no user context or are running in an
-        * atomic region then we must not take the fault..
-        */
-       if (in_atomic() || !mm)
-               goto bad_area_nosemaphore;
-
-       /* When running in the kernel we expect faults to occur only to
-        * addresses in user space.  All other faults represent errors in the
-        * kernel and should generate an OOPS.  Unfortunately, in the case of an
-        * erroneous fault occurring in a code path which already holds mmap_sem
-        * we will deadlock attempting to validate the fault against the
-        * address space.  Luckily the kernel only validly references user
-        * space from well defined areas of code, which are listed in the
-        * exceptions table.
-        *
-        * As the vast majority of faults will be valid we will only perform
-        * the source reference check when there is a possibility of a deadlock.
-        * Attempt to lock the address space, if we cannot we then validate the
-        * source.  If this is invalid we can skip the address space check,
-        * thus avoiding the deadlock.
-        */
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               if ((error_code & 4) == 0 &&
-                   !search_exception_tables(regs->eip))
-                       goto bad_area_nosemaphore;
-               down_read(&mm->mmap_sem);
-       }
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (vma->vm_start <= address)
-               goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-       if (error_code & 4) {
-               /*
-                * Accessing the stack below %esp is always a bug.
-                * The large cushion allows instructions like enter
-                * and pusha to work.  ("enter $65535,$31" pushes
-                * 32 pointers and then decrements %esp by 65535.)
-                */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
-                       goto bad_area;
-       }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-       si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & 3) {
-               default:        /* 3: write, present */
-                               /* fall through */
-               case 2:         /* write, not present */
-                       if (!(vma->vm_flags & VM_WRITE))
-                               goto bad_area;
-                       write++;
-                       break;
-               case 1:         /* read, present */
-                       goto bad_area;
-               case 0:         /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                               goto bad_area;
-       }
-
- survive:
-       /*
-        * If for any reason at all we couldn't handle the fault,
-        * make sure we exit gracefully rather than endlessly redo
-        * the fault.
-        */
-       fault = handle_mm_fault(mm, vma, address, write);
-       if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
-       }
-       if (fault & VM_FAULT_MAJOR)
-               tsk->maj_flt++;
-       else
-               tsk->min_flt++;
-
-       /*
-        * Did it hit the DOS screen memory VA from vm86 mode?
-        */
-       if (regs->eflags & VM_MASK) {
-               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
-               if (bit < 32)
-                       tsk->thread.screen_bitmap |= 1 << bit;
-       }
-       up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & 4) {
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               /* 
-                * Valid to do another page fault here because this one came 
-                * from user space.
-                */
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk("%s%s[%d]: segfault at %08lx eip %08lx "
-                           "esp %08lx error %lx\n",
-                           task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-                           tsk->comm, task_pid_nr(tsk), address, regs->eip,
-                           regs->esp, error_code);
-               }
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-               return;
-       }
-
-#ifdef CONFIG_X86_F00F_BUG
-       /*
-        * Pentium F0 0F C7 C8 bug workaround.
-        */
-       if (boot_cpu_data.f00f_bug) {
-               unsigned long nr;
-               
-               nr = (address - idt_descr.address) >> 3;
-
-               if (nr == 6) {
-                       do_invalid_op(regs, 0);
-                       return;
-               }
-       }
-#endif
-
-no_context:
-       /* Are we prepared to handle this kernel fault?  */
-       if (fixup_exception(regs))
-               return;
-
-       /* 
-        * Valid to do another page fault here, because if this fault
-        * had been triggered by is_prefetch fixup_exception would have 
-        * handled it.
-        */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-       bust_spinlocks(1);
-
-       if (oops_may_print()) {
-               __typeof__(pte_val(__pte(0))) page;
-
-#ifdef CONFIG_X86_PAE
-               if (error_code & 16) {
-                       pte_t *pte = lookup_address(address);
-
-                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-                               printk(KERN_CRIT "kernel tried to execute "
-                                       "NX-protected page - exploit attempt? "
-                                       "(uid: %d)\n", current->uid);
-               }
-#endif
-               if (address < PAGE_SIZE)
-                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
-                                       "pointer dereference");
-               else
-                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
-                                       " request");
-               printk(" at virtual address %08lx\n",address);
-               printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
-
-               page = read_cr3();
-               page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
-#ifdef CONFIG_X86_PAE
-               printk("*pdpt = %016Lx ", page);
-               if ((page >> PAGE_SHIFT) < max_low_pfn
-                   && page & _PAGE_PRESENT) {
-                       page &= PAGE_MASK;
-                       page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
-                                                                & (PTRS_PER_PMD - 1)];
-                       printk(KERN_CONT "*pde = %016Lx ", page);
-                       page &= ~_PAGE_NX;
-               }
-#else
-               printk("*pde = %08lx ", page);
-#endif
-
-               /*
-                * We must not directly access the pte in the highpte
-                * case if the page table is located in highmem.
-                * And let's rather not kmap-atomic the pte, just in case
-                * it's allocated already.
-                */
-               if ((page >> PAGE_SHIFT) < max_low_pfn
-                   && (page & _PAGE_PRESENT)
-                   && !(page & _PAGE_PSE)) {
-                       page &= PAGE_MASK;
-                       page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
-                                                                & (PTRS_PER_PTE - 1)];
-                       printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
-               }
-
-               printk("\n");
-       }
-
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       die("Oops", regs, error_code);
-       bust_spinlocks(0);
-       do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (is_global_init(tsk)) {
-               yield();
-               down_read(&mm->mmap_sem);
-               goto survive;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & 4)
-               do_group_exit(SIGKILL);
-       goto no_context;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & 4))
-               goto no_context;
-
-       /* User space => ok to do another page fault */
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-void vmalloc_sync_all(void)
-{
-       /*
-        * Note that races in the updates of insync and start aren't
-        * problematic: insync can only get set bits added, and updates to
-        * start are only improving performance (without affecting correctness
-        * if undone).
-        */
-       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-       static unsigned long start = TASK_SIZE;
-       unsigned long address;
-
-       if (SHARED_KERNEL_PMD)
-               return;
-
-       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-               if (!test_bit(pgd_index(address), insync)) {
-                       unsigned long flags;
-                       struct page *page;
-
-                       spin_lock_irqsave(&pgd_lock, flags);
-                       for (page = pgd_list; page; page =
-                                       (struct page *)page->index)
-                               if (!vmalloc_sync_one(page_address(page),
-                                                               address)) {
-                                       BUG_ON(page != pgd_list);
-                                       break;
-                               }
-                       spin_unlock_irqrestore(&pgd_lock, flags);
-                       if (!page)
-                               set_bit(pgd_index(address), insync);
-               }
-               if (address == start && test_bit(pgd_index(address), insync))
-                       start = address + PGDIR_SIZE;
-       }
-}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
deleted file mode 100644 (file)
index 0e26230..0000000
+++ /dev/null
@@ -1,623 +0,0 @@
-/*
- *  linux/arch/x86-64/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>             /* For unblank_screen() */
-#include <linux/compiler.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm-generic/sections.h>
-
-/* Page fault error code bits */
-#define PF_PROT        (1<<0)          /* or no page found */
-#define PF_WRITE       (1<<1)
-#define PF_USER        (1<<2)
-#define PF_RSVD        (1<<3)
-#define PF_INSTR       (1<<4)
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-       int ret = 0;
-
-       /* kprobe_running() needs smp_processor_id() */
-       if (!user_mode(regs)) {
-               preempt_disable();
-               if (kprobe_running() && kprobe_fault_handler(regs, 14))
-                       ret = 1;
-               preempt_enable();
-       }
-
-       return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-       return 0;
-}
-#endif
-
-/* Sometimes the CPU reports invalid exceptions on prefetch.
-   Check that here and ignore.
-   Opcode checker based on code by Richard Brunner */
-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-                               unsigned long error_code)
-{ 
-       unsigned char *instr;
-       int scan_more = 1;
-       int prefetch = 0; 
-       unsigned char *max_instr;
-
-       /* If it was a exec fault ignore */
-       if (error_code & PF_INSTR)
-               return 0;
-       
-       instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
-       max_instr = instr + 15;
-
-       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
-               return 0;
-
-       while (scan_more && instr < max_instr) { 
-               unsigned char opcode;
-               unsigned char instr_hi;
-               unsigned char instr_lo;
-
-               if (probe_kernel_address(instr, opcode))
-                       break; 
-
-               instr_hi = opcode & 0xf0; 
-               instr_lo = opcode & 0x0f; 
-               instr++;
-
-               switch (instr_hi) { 
-               case 0x20:
-               case 0x30:
-                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
-                          prefixes.  In long mode, the CPU will signal
-                          invalid opcode if some of these prefixes are
-                          present so we will never get here anyway */
-                       scan_more = ((instr_lo & 7) == 0x6);
-                       break;
-                       
-               case 0x40:
-                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
-                          Need to figure out under what instruction mode the
-                          instruction was issued ... */
-                       /* Could check the LDT for lm, but for now it's good
-                          enough to assume that long mode only uses well known
-                          segments or kernel. */
-                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
-                       break;
-                       
-               case 0x60:
-                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
-                       scan_more = (instr_lo & 0xC) == 0x4;
-                       break;          
-               case 0xF0:
-                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
-                       scan_more = !instr_lo || (instr_lo>>1) == 1;
-                       break;                  
-               case 0x00:
-                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
-                       scan_more = 0;
-                       if (probe_kernel_address(instr, opcode))
-                               break;
-                       prefetch = (instr_lo == 0xF) &&
-                               (opcode == 0x0D || opcode == 0x18);
-                       break;                  
-               default:
-                       scan_more = 0;
-                       break;
-               } 
-       }
-       return prefetch;
-}
-
-static int bad_address(void *p) 
-{ 
-       unsigned long dummy;
-       return probe_kernel_address((unsigned long *)p, dummy);
-} 
-
-void dump_pagetable(unsigned long address)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       pgd = (pgd_t *)read_cr3();
-
-       pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
-       pgd += pgd_index(address);
-       if (bad_address(pgd)) goto bad;
-       printk("PGD %lx ", pgd_val(*pgd));
-       if (!pgd_present(*pgd)) goto ret; 
-
-       pud = pud_offset(pgd, address);
-       if (bad_address(pud)) goto bad;
-       printk("PUD %lx ", pud_val(*pud));
-       if (!pud_present(*pud)) goto ret;
-
-       pmd = pmd_offset(pud, address);
-       if (bad_address(pmd)) goto bad;
-       printk("PMD %lx ", pmd_val(*pmd));
-       if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
-
-       pte = pte_offset_kernel(pmd, address);
-       if (bad_address(pte)) goto bad;
-       printk("PTE %lx", pte_val(*pte)); 
-ret:
-       printk("\n");
-       return;
-bad:
-       printk("BAD\n");
-}
-
-static const char errata93_warning[] = 
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/* Workaround for K8 erratum #93 & buggy BIOS.
-   BIOS SMM functions are required to use a specific workaround
-   to avoid corruption of the 64bit RIP register on C stepping K8. 
-   A lot of BIOSes that didn't get tested properly miss this.
-   The OS sees this as a page fault with the upper 32bits of RIP cleared.
-   Try to work around it here.
-   Note we only handle faults in kernel here. */
-
-static int is_errata93(struct pt_regs *regs, unsigned long address) 
-{
-       static int warned;
-       if (address != regs->rip)
-               return 0;
-       if ((address >> 32) != 0) 
-               return 0;
-       address |= 0xffffffffUL << 32;
-       if ((address >= (u64)_stext && address <= (u64)_etext) || 
-           (address >= MODULES_VADDR && address <= MODULES_END)) { 
-               if (!warned) {
-                       printk(errata93_warning);               
-                       warned = 1;
-               }
-               regs->rip = address;
-               return 1;
-       }
-       return 0;
-} 
-
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-                                unsigned long error_code)
-{
-       unsigned long flags = oops_begin();
-       struct task_struct *tsk;
-
-       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-              current->comm, address);
-       dump_pagetable(address);
-       tsk = current;
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       __die("Bad pagetable", regs, error_code);
-       oops_end(flags);
-       do_exit(SIGKILL);
-}
-
-/*
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-       pgd_t *pgd, *pgd_ref;
-       pud_t *pud, *pud_ref;
-       pmd_t *pmd, *pmd_ref;
-       pte_t *pte, *pte_ref;
-
-       /* Copy kernel mappings over when needed. This can also
-          happen within a race in page table update. In the latter
-          case just flush. */
-
-       pgd = pgd_offset(current->mm ?: &init_mm, address);
-       pgd_ref = pgd_offset_k(address);
-       if (pgd_none(*pgd_ref))
-               return -1;
-       if (pgd_none(*pgd))
-               set_pgd(pgd, *pgd_ref);
-       else
-               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
-       /* Below here mismatches are bugs because these lower tables
-          are shared */
-
-       pud = pud_offset(pgd, address);
-       pud_ref = pud_offset(pgd_ref, address);
-       if (pud_none(*pud_ref))
-               return -1;
-       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-               BUG();
-       pmd = pmd_offset(pud, address);
-       pmd_ref = pmd_offset(pud_ref, address);
-       if (pmd_none(*pmd_ref))
-               return -1;
-       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-               BUG();
-       pte_ref = pte_offset_kernel(pmd_ref, address);
-       if (!pte_present(*pte_ref))
-               return -1;
-       pte = pte_offset_kernel(pmd, address);
-       /* Don't use pte_page here, because the mappings can point
-          outside mem_map, and the NUMA hash lookup cannot handle
-          that. */
-       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-               BUG();
-       return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-                                       unsigned long error_code)
-{
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       struct vm_area_struct * vma;
-       unsigned long address;
-       const struct exception_table_entry *fixup;
-       int write, fault;
-       unsigned long flags;
-       siginfo_t info;
-
-       /*
-        * We can fault from pretty much anywhere, with unknown IRQ state.
-        */
-       trace_hardirqs_fixup();
-
-       tsk = current;
-       mm = tsk->mm;
-       prefetchw(&mm->mmap_sem);
-
-       /* get the address */
-       address = read_cr2();
-
-       info.si_code = SEGV_MAPERR;
-
-
-       /*
-        * We fault-in kernel-space virtual memory on-demand. The
-        * 'reference' page table is init_mm.pgd.
-        *
-        * NOTE! We MUST NOT take any locks for this case. We may
-        * be in an interrupt or a critical region, and should
-        * only copy the information from the master page table,
-        * nothing more.
-        *
-        * This verifies that the fault happens in kernel space
-        * (error_code & 4) == 0, and that the fault was not a
-        * protection error (error_code & 9) == 0.
-        */
-       if (unlikely(address >= TASK_SIZE64)) {
-               /*
-                * Don't check for the module range here: its PML4
-                * is always initialized because it's shared with the main
-                * kernel text. Only vmalloc may need PML4 syncups.
-                */
-               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
-                       if (vmalloc_fault(address) >= 0)
-                               return;
-               }
-               if (notify_page_fault(regs))
-                       return;
-               /*
-                * Don't take the mm semaphore here. If we fixup a prefetch
-                * fault we could otherwise deadlock.
-                */
-               goto bad_area_nosemaphore;
-       }
-
-       if (notify_page_fault(regs))
-               return;
-
-       if (likely(regs->eflags & X86_EFLAGS_IF))
-               local_irq_enable();
-
-       if (unlikely(error_code & PF_RSVD))
-               pgtable_bad(address, regs, error_code);
-
-       /*
-        * If we're in an interrupt or have no user
-        * context, we must not take the fault..
-        */
-       if (unlikely(in_atomic() || !mm))
-               goto bad_area_nosemaphore;
-
-       /*
-        * User-mode registers count as a user access even for any
-        * potential system fault or CPU buglet.
-        */
-       if (user_mode_vm(regs))
-               error_code |= PF_USER;
-
- again:
-       /* When running in the kernel we expect faults to occur only to
-        * addresses in user space.  All other faults represent errors in the
-        * kernel and should generate an OOPS.  Unfortunately, in the case of an
-        * erroneous fault occurring in a code path which already holds mmap_sem
-        * we will deadlock attempting to validate the fault against the
-        * address space.  Luckily the kernel only validly references user
-        * space from well defined areas of code, which are listed in the
-        * exceptions table.
-        *
-        * As the vast majority of faults will be valid we will only perform
-        * the source reference check when there is a possibility of a deadlock.
-        * Attempt to lock the address space, if we cannot we then validate the
-        * source.  If this is invalid we can skip the address space check,
-        * thus avoiding the deadlock.
-        */
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               if ((error_code & PF_USER) == 0 &&
-                   !search_exception_tables(regs->rip))
-                       goto bad_area_nosemaphore;
-               down_read(&mm->mmap_sem);
-       }
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (likely(vma->vm_start <= address))
-               goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-       if (error_code & 4) {
-               /* Allow userspace just enough access below the stack pointer
-                * to let the 'enter' instruction work.
-                */
-               if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
-                       goto bad_area;
-       }
-       if (expand_stack(vma, address))
-               goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-       info.si_code = SEGV_ACCERR;
-       write = 0;
-       switch (error_code & (PF_PROT|PF_WRITE)) {
-               default:        /* 3: write, present */
-                       /* fall through */
-               case PF_WRITE:          /* write, not present */
-                       if (!(vma->vm_flags & VM_WRITE))
-                               goto bad_area;
-                       write++;
-                       break;
-               case PF_PROT:           /* read, present */
-                       goto bad_area;
-               case 0:                 /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-                               goto bad_area;
-       }
-
-       /*
-        * If for any reason at all we couldn't handle the fault,
-        * make sure we exit gracefully rather than endlessly redo
-        * the fault.
-        */
-       fault = handle_mm_fault(mm, vma, address, write);
-       if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
-       }
-       if (fault & VM_FAULT_MAJOR)
-               tsk->maj_flt++;
-       else
-               tsk->min_flt++;
-       up_read(&mm->mmap_sem);
-       return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-       /* User mode accesses just cause a SIGSEGV */
-       if (error_code & PF_USER) {
-
-               /*
-                * It's possible to have interrupts off here.
-                */
-               local_irq_enable();
-
-               if (is_prefetch(regs, address, error_code))
-                       return;
-
-               /* Work around K8 erratum #100: K8 in compat mode
-                  occasionally jumps to illegal addresses >4GB.  We
-                  catch this here in the page fault handler because
-                  these addresses are not reachable. Just detect this
-                  case and return.  Any code segment in LDT is
-                  compatibility mode. */
-               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
-                   (address >> 32))
-                       return;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit()) {
-                       printk(
-                      "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
-                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-                                       tsk->comm, tsk->pid, address, regs->rip,
-                                       regs->rsp, error_code);
-               }
-       
-               tsk->thread.cr2 = address;
-               /* Kernel addresses are always protection faults */
-               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-               tsk->thread.trap_no = 14;
-               info.si_signo = SIGSEGV;
-               info.si_errno = 0;
-               /* info.si_code has been set above */
-               info.si_addr = (void __user *)address;
-               force_sig_info(SIGSEGV, &info, tsk);
-               return;
-       }
-
-no_context:
-       
-       /* Are we prepared to handle this kernel fault?  */
-       fixup = search_exception_tables(regs->rip);
-       if (fixup) {
-               regs->rip = fixup->fixup;
-               return;
-       }
-
-       /* 
-        * Hall of shame of CPU/BIOS bugs.
-        */
-
-       if (is_prefetch(regs, address, error_code))
-               return;
-
-       if (is_errata93(regs, address))
-               return; 
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-       flags = oops_begin();
-
-       if (address < PAGE_SIZE)
-               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
-       else
-               printk(KERN_ALERT "Unable to handle kernel paging request");
-       printk(" at %016lx RIP: \n" KERN_ALERT,address);
-       printk_address(regs->rip);
-       dump_pagetable(address);
-       tsk->thread.cr2 = address;
-       tsk->thread.trap_no = 14;
-       tsk->thread.error_code = error_code;
-       __die("Oops", regs, error_code);
-       /* Executive summary in case the body of the oops scrolled away */
-       printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags);
-       do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (is_global_init(current)) {
-               yield();
-               goto again;
-       }
-       printk("VM: killing process %s\n", tsk->comm);
-       if (error_code & 4)
-               do_group_exit(SIGKILL);
-       goto no_context;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       if (!(error_code & PF_USER))
-               goto no_context;
-
-       tsk->thread.cr2 = address;
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 14;
-       info.si_signo = SIGBUS;
-       info.si_errno = 0;
-       info.si_code = BUS_ADRERR;
-       info.si_addr = (void __user *)address;
-       force_sig_info(SIGBUS, &info, tsk);
-       return;
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
-       /* Note that races in the updates of insync and start aren't 
-          problematic:
-          insync can only get set bits added, and updates to start are only
-          improving performance (without affecting correctness if undone). */
-       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-       static unsigned long start = VMALLOC_START & PGDIR_MASK;
-       unsigned long address;
-
-       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-               if (!test_bit(pgd_index(address), insync)) {
-                       const pgd_t *pgd_ref = pgd_offset_k(address);
-                       struct page *page;
-
-                       if (pgd_none(*pgd_ref))
-                               continue;
-                       spin_lock(&pgd_lock);
-                       list_for_each_entry(page, &pgd_list, lru) {
-                               pgd_t *pgd;
-                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                               if (pgd_none(*pgd))
-                                       set_pgd(pgd, *pgd_ref);
-                               else
-                                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-                       }
-                       spin_unlock(&pgd_lock);
-                       set_bit(pgd_index(address), insync);
-               }
-               if (address == start)
-                       start = address + PGDIR_SIZE;
-       }
-       /* Check that there is no need to do the same for the modules area. */
-       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
-                               (__START_KERNEL & PGDIR_MASK)));
-}
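The fault.c code removed above decodes the hardware error code pushed by the CPU using the PF_PROT/PF_WRITE/PF_USER/PF_RSVD/PF_INSTR masks defined near the top of the file. As a rough stand-alone illustration of that decoding (a user-space C sketch with made-up error-code values, not part of this commit), the same bit tests look like this:

    #include <stdio.h>

    /* Same bit layout as the PF_* masks in the removed fault.c */
    #define PF_PROT  (1 << 0)  /* 0: page not present, 1: protection violation */
    #define PF_WRITE (1 << 1)  /* 0: read access,      1: write access */
    #define PF_USER  (1 << 2)  /* 0: kernel mode,      1: user mode */
    #define PF_RSVD  (1 << 3)  /* reserved bit set in a page-table entry */
    #define PF_INSTR (1 << 4)  /* fault on an instruction fetch */

    static void decode_fault(unsigned long error_code)
    {
        printf("%s %s access in %s mode%s%s\n",
               (error_code & PF_PROT)  ? "protection" : "not-present",
               (error_code & PF_WRITE) ? "write" : "read",
               (error_code & PF_USER)  ? "user" : "kernel",
               (error_code & PF_RSVD)  ? ", reserved bit set" : "",
               (error_code & PF_INSTR) ? ", instruction fetch" : "");
    }

    int main(void)
    {
        decode_fault(PF_USER | PF_WRITE);           /* user write to a missing page */
        decode_fault(PF_PROT | PF_WRITE | PF_USER); /* user write to a read-only page */
        return 0;
    }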
index 1c3bf95f7356826a9243081c8269bd1475df0b1a..3d936f23270409344105faf23b37ce45b0e02535 100644 (file)
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
        kunmap_high(page);
 }
 
+static void debug_kmap_atomic_prot(enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+       static unsigned warn_count = 10;
+
+       if (unlikely(warn_count == 0))
+               return;
+
+       if (unlikely(in_interrupt())) {
+               if (in_irq()) {
+                       if (type != KM_IRQ0 && type != KM_IRQ1 &&
+                           type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+                           type != KM_BOUNCE_READ) {
+                               WARN_ON(1);
+                               warn_count--;
+                       }
+               } else if (!irqs_disabled()) {  /* softirq */
+                       if (type != KM_IRQ0 && type != KM_IRQ1 &&
+                           type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+                           type != KM_SKB_SUNRPC_DATA &&
+                           type != KM_SKB_DATA_SOFTIRQ &&
+                           type != KM_BOUNCE_READ) {
+                               WARN_ON(1);
+                               warn_count--;
+                       }
+               }
+       }
+
+       if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+                       type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+               if (!irqs_disabled()) {
+                       WARN_ON(1);
+                       warn_count--;
+               }
+       } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+               if (irq_count() == 0 && !irqs_disabled()) {
+                       WARN_ON(1);
+                       warn_count--;
+               }
+       }
+#endif
+}
+
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 {
        enum fixed_addresses idx;
        unsigned long vaddr;
-
        /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+
+       debug_kmap_atomic_prot(type);
+
        pagefault_disable();
 
        if (!PageHighMem(page))
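The debug_kmap_atomic_prot() helper added above warns when an atomic kmap slot is used from the wrong context, but limits itself to ten warnings through a static warn_count so it cannot flood the log. A stand-alone sketch of that self-limiting warning pattern (plain C, with a hypothetical stub check in place of the kernel's in_irq()/irqs_disabled() tests):

    #include <stdio.h>

    /* Stub for the real context check; in the kernel this would be
     * something like "an IRQ-only kmap slot used outside an interrupt".
     * The rule below is purely hypothetical. */
    static int slot_misused(int slot, int in_interrupt)
    {
        return slot == 0 && !in_interrupt;
    }

    static void debug_check(int slot, int in_interrupt)
    {
        static unsigned warn_count = 10;   /* same budget as the hunk above */

        if (warn_count == 0)               /* stay quiet once the budget is spent */
            return;

        if (slot_misused(slot, in_interrupt)) {
            fprintf(stderr, "warning: kmap slot %d used from wrong context\n", slot);
            warn_count--;
        }
    }

    int main(void)
    {
        for (int i = 0; i < 20; i++)
            debug_check(0, 0);             /* only the first 10 calls warn */
        return 0;
    }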
index 6c06d9c0488ec28c05d45c226d77c1050a7394a8..4fbafb4bc2f02a93bfcbb64fa732fc08fb0af406 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                struct vm_area_struct *vma,
@@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 
        spin_lock(&mm->page_table_lock);
        if (pud_none(*pud))
-               pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+               pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
        else
                put_page(virt_to_page(spte));
        spin_unlock(&mm->page_table_lock);
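The hugetlbpage.c hunk above only changes the type handed to pud_populate(): the shared page-table pointer spte is still rounded down to its page boundary with PAGE_MASK, now with an explicit pmd_t * cast. The masking itself is ordinary align-down arithmetic; a small stand-alone C sketch (4 KiB pages assumed, hypothetical pointer value):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))   /* clears the low 12 bits */

    int main(void)
    {
        uintptr_t addr = 0x12345678;       /* hypothetical pointer value */
        uintptr_t page = addr & PAGE_MASK; /* align down to the page start */

        printf("%#lx rounds down to page %#lx (offset %lu)\n",
               (unsigned long)addr, (unsigned long)page,
               (unsigned long)(addr - page));
        return 0;
    }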
index 3c76d194fd2ccef4df106a2f47fe9b02a3057264..da524fb22422eafc8b4d6514609aec749ffc2dc0 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
-#include <linux/efi.h>
 #include <linux/memory_hotplug.h>
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <asm/fixmap.h>
 #include <asm/e820.h>
 #include <asm/apic.h>
+#include <asm/bugs.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/paravirt.h>
 
@@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
-static int noinline do_test_wp_bit(void);
+static noinline int do_test_wp_bit(void);
 
 /*
  * Creates a middle page table and puts a pointer to it in the
@@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 {
        pud_t *pud;
        pmd_t *pmd_table;
-               
+
 #ifdef CONFIG_X86_PAE
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
                pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+               paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                pud = pud_offset(pgd, 0);
-               if (pmd_table != pmd_offset(pud, 0))
-                       BUG();
+               BUG_ON(pmd_table != pmd_offset(pud, 0));
        }
 #endif
        pud = pud_offset(pgd, 0);
        pmd_table = pmd_offset(pud, 0);
+
        return pmd_table;
 }
 
 /*
  * Create a page table and place a pointer to it in a middle page
- * directory entry.
+ * directory entry:
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
@@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 #ifdef CONFIG_DEBUG_PAGEALLOC
                page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-               if (!page_table)
+               if (!page_table) {
                        page_table =
                                (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+               }
 
                paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 }
 
 /*
- * This function initializes a certain range of kernel virtual memory 
+ * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
  * the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguous on the physical space 
- * so we can cache the place of the first one and move around without 
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
  * checking the pgd every time.
  */
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 {
-       pgd_t *pgd;
-       pmd_t *pmd;
        int pgd_idx, pmd_idx;
        unsigned long vaddr;
+       pgd_t *pgd;
+       pmd_t *pmd;
 
        vaddr = start;
        pgd_idx = pgd_index(vaddr);
@@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
        for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
                pmd = one_md_table_init(pgd);
                pmd = pmd + pmd_index(vaddr);
-               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+                                                       pmd++, pmd_idx++) {
                        one_page_table_init(pmd);
 
                        vaddr += PMD_SIZE;
@@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr)
 }
 
 /*
- * This maps the physical memory to kernel virtual address space, a total 
- * of max_low_pfn pages, by creating page tables starting from address 
- * PAGE_OFFSET.
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
  */
 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 {
+       int pgd_idx, pmd_idx, pte_ofs;
        unsigned long pfn;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
-       int pgd_idx, pmd_idx, pte_ofs;
 
        pgd_idx = pgd_index(PAGE_OFFSET);
        pgd = pgd_base + pgd_idx;
@@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
                pmd = one_md_table_init(pgd);
                if (pfn >= max_low_pfn)
                        continue;
-               for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
 
-                       /* Map with big pages if possible, otherwise create normal page tables. */
+               for (pmd_idx = 0;
+                    pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+                    pmd++, pmd_idx++) {
+                       unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+                       /*
+                        * Map with big pages if possible, otherwise
+                        * create normal page tables:
+                        */
                        if (cpu_has_pse) {
-                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-                               if (is_kernel_text(address) || is_kernel_text(address2))
-                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
-                               else
-                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+                               unsigned int addr2;
+                               pgprot_t prot = PAGE_KERNEL_LARGE;
+
+                               addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+                                       PAGE_OFFSET + PAGE_SIZE-1;
+
+                               if (is_kernel_text(addr) ||
+                                   is_kernel_text(addr2))
+                                       prot = PAGE_KERNEL_LARGE_EXEC;
+
+                               set_pmd(pmd, pfn_pmd(pfn, prot));
 
                                pfn += PTRS_PER_PTE;
-                       } else {
-                               pte = one_page_table_init(pmd);
-
-                               for (pte_ofs = 0;
-                                    pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
-                                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
-                                       if (is_kernel_text(address))
-                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-                                       else
-                                               set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-                               }
+                               continue;
+                       }
+                       pte = one_page_table_init(pmd);
+
+                       for (pte_ofs = 0;
+                            pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+                            pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+                               pgprot_t prot = PAGE_KERNEL;
+
+                               if (is_kernel_text(addr))
+                                       prot = PAGE_KERNEL_EXEC;
+
+                               set_pte(pte, pfn_pte(pfn, prot));
                        }
                }
        }
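The restructured loop above still computes each frame's lowmem virtual address as pfn * PAGE_SIZE + PAGE_OFFSET and picks the large-page protection (executable only if the block overlaps kernel text); only the control flow changed. A stand-alone sketch of that address arithmetic and the large/exec decision, using illustrative constants rather than the kernel's real symbols:

    #include <stdio.h>

    #define PAGE_SIZE    4096UL
    #define PAGE_OFFSET  0xC0000000UL      /* i386 default lowmem base, illustrative */
    #define PTRS_PER_PTE 1024              /* one page table = one 4 MiB block (non-PAE) */

    /* Hypothetical kernel text range, stand-ins for is_kernel_text()'s bounds */
    #define TEXT_START 0xC0100000UL
    #define TEXT_END   0xC0400000UL

    static int is_kernel_text(unsigned long addr)
    {
        return addr >= TEXT_START && addr < TEXT_END;
    }

    int main(void)
    {
        unsigned long pfn   = 0x400;                          /* frame 1024 = 4 MiB */
        unsigned long addr  = pfn * PAGE_SIZE + PAGE_OFFSET;
        unsigned long addr2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE
                              + PAGE_OFFSET + PAGE_SIZE - 1;

        /* Mirror of the hunk's decision: executable large page only if the
         * 4 MiB block touches kernel text at either end. */
        const char *prot = (is_kernel_text(addr) || is_kernel_text(addr2))
                           ? "LARGE_EXEC" : "LARGE";

        printf("pfn %#lx maps at %#lx..%#lx -> %s\n", pfn, addr, addr2, prot);
        return 0;
    }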
@@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr)
        return 0;
 }
 
-int page_is_ram(unsigned long pagenr)
-{
-       int i;
-       unsigned long addr, end;
-
-       if (efi_enabled) {
-               efi_memory_desc_t *md;
-               void *p;
-
-               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-                       md = p;
-                       if (!is_available_memory(md))
-                               continue;
-                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
-                       if ((pagenr >= addr) && (pagenr < end))
-                               return 1;
-               }
-               return 0;
-       }
-
-       for (i = 0; i < e820.nr_map; i++) {
-
-               if (e820.map[i].type != E820_RAM)       /* not usable memory */
-                       continue;
-               /*
-                *      !!!FIXME!!! Some BIOSen report areas as RAM that
-                *      are not. Notably the 640->1Mb area. We need a sanity
-                *      check here.
-                */
-               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
-               if  ((pagenr >= addr) && (pagenr < end))
-                       return 1;
-       }
-       return 0;
-}
-
 #ifdef CONFIG_HIGHMEM
 pte_t *kmap_pte;
 pgprot_t kmap_prot;
 
-#define kmap_get_fixmap_pte(vaddr)                                     \
-       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+       return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+                       vaddr), vaddr), vaddr);
+}
 
 static void __init kmap_init(void)
 {
        unsigned long kmap_vstart;
 
-       /* cache the first kmap pte */
+       /*
+        * Cache the first kmap pte:
+        */
        kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
        kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
 
@@ -259,11 +241,11 @@ static void __init kmap_init(void)
 
 static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
+       unsigned long vaddr;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
-       unsigned long vaddr;
 
        vaddr = PKMAP_BASE;
        page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
        pud = pud_offset(pgd, vaddr);
        pmd = pmd_offset(pud, vaddr);
        pte = pte_offset_kernel(pmd, vaddr);
-       pkmap_page_table = pte; 
+       pkmap_page_table = pte;
 }
 
 static void __meminit free_new_highpage(struct page *page)
@@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
                SetPageReserved(page);
 }
 
-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+static int __meminit
+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
 {
        free_new_highpage(page);
        totalram_pages++;
@@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
        max_mapnr = max(pfn, max_mapnr);
 #endif
        num_physpages++;
+
        return 0;
 }
 
@@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
  * Not currently handling the NUMA case.
  * Assuming single node and all memory that
  * has been added dynamically that would be
- * onlined here is in HIGHMEM
+ * onlined here is in HIGHMEM.
  */
 void __meminit online_page(struct page *page)
 {
@@ -314,13 +298,11 @@ void __meminit online_page(struct page *page)
        add_one_highpage_hotplug(page, page_to_pfn(page));
 }
 
-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
+#ifndef CONFIG_NUMA
 static void __init set_highmem_pages_init(int bad_ppro)
 {
        int pfn;
+
        for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
                /*
                 * Holes under sparsemem might not have a mem_map[]:
@@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro)
        }
        totalram_pages += totalhigh_pages;
 }
-#endif /* CONFIG_FLATMEM */
+#endif /* !CONFIG_NUMA */
 
 #else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define kmap_init()                           do { } while (0)
+# define permanent_kmaps_init(pgd_base)                do { } while (0)
+# define set_highmem_pages_init(bad_ppro)      do { } while (0)
 #endif /* CONFIG_HIGHMEM */
 
-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
 EXPORT_SYMBOL(__PAGE_KERNEL);
-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
 
-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
 
 void __init native_pagetable_setup_start(pgd_t *base)
 {
@@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
        memset(&base[USER_PTRS_PER_PGD], 0,
               KERNEL_PGD_PTRS * sizeof(pgd_t));
 #else
-       paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
+       paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
 #endif
 }
 
@@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init pagetable_init (void)
+static void __init pagetable_init(void)
 {
-       unsigned long vaddr, end;
        pgd_t *pgd_base = swapper_pg_dir;
+       unsigned long vaddr, end;
 
        paravirt_pagetable_setup_start(pgd_base);
 
@@ -435,9 +412,11 @@ static void __init pagetable_init (void)
         * Fixed mappings, only the page table structure has to be
         * created - mappings will be set by set_fixmap():
         */
+       early_ioremap_clear();
        vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
        end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
        page_table_range_init(vaddr, end, pgd_base);
+       early_ioremap_reset();
 
        permanent_kmaps_init(pgd_base);
 
@@ -450,7 +429,7 @@ static void __init pagetable_init (void)
  * driver might have split up a kernel 4MB mapping.
  */
 char __nosavedata swsusp_pg_dir[PAGE_SIZE]
-       __attribute__ ((aligned (PAGE_SIZE)));
+       __attribute__ ((aligned(PAGE_SIZE)));
 
 static inline void save_pg_dir(void)
 {
@@ -462,7 +441,7 @@ static inline void save_pg_dir(void)
 }
 #endif
 
-void zap_low_mappings (void)
+void zap_low_mappings(void)
 {
        int i;
 
@@ -474,22 +453,24 @@ void zap_low_mappings (void)
         * Note that "pgd_clear()" doesn't do it for
         * us, because pgd_clear() is a no-op on i386.
         */
-       for (i = 0; i < USER_PTRS_PER_PGD; i++)
+       for (i = 0; i < USER_PTRS_PER_PGD; i++) {
 #ifdef CONFIG_X86_PAE
                set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
                set_pgd(swapper_pg_dir+i, __pgd(0));
 #endif
+       }
        flush_tlb_all();
 }
 
-int nx_enabled = 0;
+int nx_enabled;
+
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 #ifdef CONFIG_X86_PAE
 
-static int disable_nx __initdata = 0;
-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
+static int disable_nx __initdata;
 
 /*
  * noexec = on|off
@@ -506,11 +487,14 @@ static int __init noexec_setup(char *str)
                        __supported_pte_mask |= _PAGE_NX;
                        disable_nx = 0;
                }
-       } else if (!strcmp(str,"off")) {
-               disable_nx = 1;
-               __supported_pte_mask &= ~_PAGE_NX;
-       } else
-               return -EINVAL;
+       } else {
+               if (!strcmp(str, "off")) {
+                       disable_nx = 1;
+                       __supported_pte_mask &= ~_PAGE_NX;
+               } else {
+                       return -EINVAL;
+               }
+       }
 
        return 0;
 }
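The hunk above only reshapes the if/else ladder in noexec_setup(); the behaviour is unchanged: "on" sets _PAGE_NX in the supported PTE mask, "off" clears it and sets disable_nx, anything else is rejected. A stand-alone sketch of that option handling (plain C, with a stand-in mask value instead of the real _PAGE_NX):

    #include <stdio.h>
    #include <string.h>

    #define MY_PAGE_NX (1ULL << 63)        /* stand-in for _PAGE_NX */

    static unsigned long long supported_pte_mask = ~MY_PAGE_NX;
    static int disable_nx;

    /* Returns 0 on success, -1 on an unrecognized argument. */
    static int noexec_setup(const char *str)
    {
        if (!strcmp(str, "on")) {
            supported_pte_mask |= MY_PAGE_NX;
            disable_nx = 0;
        } else if (!strcmp(str, "off")) {
            supported_pte_mask &= ~MY_PAGE_NX;
            disable_nx = 1;
        } else {
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        noexec_setup("on");
        printf("noexec=on : NX bit %s\n",
               (supported_pte_mask & MY_PAGE_NX) ? "allowed" : "masked off");
        noexec_setup("off");
        printf("noexec=off: NX bit %s\n",
               (supported_pte_mask & MY_PAGE_NX) ? "allowed" : "masked off");
        return 0;
    }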
@@ -522,6 +506,7 @@ static void __init set_nx(void)
 
        if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
                cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
                if ((v[3] & (1 << 20)) && !disable_nx) {
                        rdmsr(MSR_EFER, l, h);
                        l |= EFER_NX;
@@ -531,35 +516,6 @@ static void __init set_nx(void)
                }
        }
 }
-
-/*
- * Enables/disables executability of a given kernel page and
- * returns the previous setting.
- */
-int __init set_kernel_exec(unsigned long vaddr, int enable)
-{
-       pte_t *pte;
-       int ret = 1;
-
-       if (!nx_enabled)
-               goto out;
-
-       pte = lookup_address(vaddr);
-       BUG_ON(!pte);
-
-       if (!pte_exec_kernel(*pte))
-               ret = 0;
-
-       if (enable)
-               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
-       else
-               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
-       pte_update_defer(&init_mm, vaddr, pte);
-       __flush_tlb_all();
-out:
-       return ret;
-}
-
 #endif
 
 /*
@@ -574,9 +530,8 @@ void __init paging_init(void)
 #ifdef CONFIG_X86_PAE
        set_nx();
        if (nx_enabled)
-               printk("NX (Execute Disable) protection: active\n");
+               printk(KERN_INFO "NX (Execute Disable) protection: active\n");
 #endif
-
        pagetable_init();
 
        load_cr3(swapper_pg_dir);
@@ -600,10 +555,10 @@ void __init paging_init(void)
  * used to involve black magic jumps to work around some nasty CPU bugs,
  * but fortunately the switch to using exceptions got rid of all that.
  */
-
 static void __init test_wp_bit(void)
 {
-       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
+       printk(KERN_INFO
+  "Checking if this processor honours the WP bit even in supervisor mode...");
 
        /* Any page-aligned address will do, the test is non-destructive */
        __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -611,47 +566,46 @@ static void __init test_wp_bit(void)
        clear_fixmap(FIX_WP_TEST);
 
        if (!boot_cpu_data.wp_works_ok) {
-               printk("No.\n");
+               printk(KERN_CONT "No.\n");
 #ifdef CONFIG_X86_WP_WORKS_OK
-               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+               panic(
+  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
 #endif
        } else {
-               printk("Ok.\n");
+               printk(KERN_CONT "Ok.\n");
        }
 }
 
-static struct kcore_list kcore_mem, kcore_vmalloc; 
+static struct kcore_list kcore_mem, kcore_vmalloc;
 
 void __init mem_init(void)
 {
-       extern int ppro_with_ram_bug(void);
        int codesize, reservedpages, datasize, initsize;
-       int tmp;
-       int bad_ppro;
+       int tmp, bad_ppro;
 
 #ifdef CONFIG_FLATMEM
        BUG_ON(!mem_map);
 #endif
-       
        bad_ppro = ppro_with_ram_bug();
 
 #ifdef CONFIG_HIGHMEM
        /* check that fixmap and pkmap do not overlap */
-       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
-               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
+       if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+               printk(KERN_ERR
+                       "fixmap and kmap areas overlap - this will crash\n");
                printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
-                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
+                               PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
+                               FIXADDR_START);
                BUG();
        }
 #endif
        /* this will put all low memory onto the freelists */
        totalram_pages += free_all_bootmem();
 
        reservedpages = 0;
        for (tmp = 0; tmp < max_low_pfn; tmp++)
                /*
-                * Only count reserved RAM pages
+                * Only count reserved RAM pages:
                 */
                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                        reservedpages++;
@@ -662,11 +616,12 @@ void __init mem_init(void)
        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
-       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
 
-       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
+       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+                       "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                num_physpages << (PAGE_SHIFT-10),
                codesize >> 10,
@@ -677,45 +632,46 @@ void __init mem_init(void)
               );
 
 #if 1 /* double-sanity-check paranoia */
-       printk("virtual kernel memory layout:\n"
-              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+       printk(KERN_INFO "virtual kernel memory layout:\n"
+               "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
-              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
-              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
-              FIXADDR_START, FIXADDR_TOP,
-              (FIXADDR_TOP - FIXADDR_START) >> 10,
+               "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+               "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+               "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+               FIXADDR_START, FIXADDR_TOP,
+               (FIXADDR_TOP - FIXADDR_START) >> 10,
 
 #ifdef CONFIG_HIGHMEM
-              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-              (LAST_PKMAP*PAGE_SIZE) >> 10,
+               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+               (LAST_PKMAP*PAGE_SIZE) >> 10,
 #endif
 
-              VMALLOC_START, VMALLOC_END,
-              (VMALLOC_END - VMALLOC_START) >> 20,
+               VMALLOC_START, VMALLOC_END,
+               (VMALLOC_END - VMALLOC_START) >> 20,
 
-              (unsigned long)__va(0), (unsigned long)high_memory,
-              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+               (unsigned long)__va(0), (unsigned long)high_memory,
+               ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
 
-              (unsigned long)&__init_begin, (unsigned long)&__init_end,
-              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+               (unsigned long)&__init_begin, (unsigned long)&__init_end,
+               ((unsigned long)&__init_end -
+                (unsigned long)&__init_begin) >> 10,
 
-              (unsigned long)&_etext, (unsigned long)&_edata,
-              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+               (unsigned long)&_etext, (unsigned long)&_edata,
+               ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
 
-              (unsigned long)&_text, (unsigned long)&_etext,
-              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+               (unsigned long)&_text, (unsigned long)&_etext,
+               ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
 
 #ifdef CONFIG_HIGHMEM
-       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
-       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+       BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
+       BUG_ON(VMALLOC_END                              > PKMAP_BASE);
 #endif
-       BUG_ON(VMALLOC_START                   > VMALLOC_END);
-       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+       BUG_ON(VMALLOC_START                            > VMALLOC_END);
+       BUG_ON((unsigned long)high_memory               > VMALLOC_START);
 #endif /* double-sanity-check paranoia */
 
 #ifdef CONFIG_X86_PAE
@@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        return __add_pages(zone, start_pfn, nr_pages);
 }
-
 #endif
 
-struct kmem_cache *pmd_cache;
-
-void __init pgtable_cache_init(void)
-{
-       if (PTRS_PER_PMD > 1)
-               pmd_cache = kmem_cache_create("pmd",
-                                             PTRS_PER_PMD*sizeof(pmd_t),
-                                             PTRS_PER_PMD*sizeof(pmd_t),
-                                             SLAB_PANIC,
-                                             pmd_ctor);
-}
-
 /*
  * This function cannot be __init, since exceptions don't work in that
  * section.  Put this after the callers, so that it cannot be inlined.
  */
-static int noinline do_test_wp_bit(void)
+static noinline int do_test_wp_bit(void)
 {
        char tmp_reg;
        int flag;
 
        __asm__ __volatile__(
-               "       movb %0,%1      \n"
-               "1:     movb %1,%0      \n"
-               "       xorl %2,%2      \n"
+               "       movb %0, %1     \n"
+               "1:     movb %1, %0     \n"
+               "       xorl %2, %2     \n"
                "2:                     \n"
-               ".section __ex_table,\"a\"\n"
+               ".section __ex_table, \"a\"\n"
                "       .align 4        \n"
-               "       .long 1b,2b     \n"
+               "       .long 1b, 2b    \n"
                ".previous              \n"
                :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
                 "=q" (tmp_reg),
                 "=r" (flag)
                :"2" (1)
                :"memory");
-       
+
        return flag;
 }
 
 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void mark_rodata_ro(void)
 {
@@ -801,32 +746,58 @@ void mark_rodata_ro(void)
        if (num_possible_cpus() <= 1)
 #endif
        {
-               change_page_attr(virt_to_page(start),
-                                size >> PAGE_SHIFT, PAGE_KERNEL_RX);
-               printk("Write protecting the kernel text: %luk\n", size >> 10);
+               set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+               printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+                       size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+               printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+                       start, start+size);
+               set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+               printk(KERN_INFO "Testing CPA: write protecting again\n");
+               set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
        }
 #endif
        start += size;
        size = (unsigned long)__end_rodata - start;
-       change_page_attr(virt_to_page(start),
-                        size >> PAGE_SHIFT, PAGE_KERNEL_RO);
-       printk("Write protecting the kernel read-only data: %luk\n",
-              size >> 10);
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+               size >> 10);
+       rodata_test();
 
-       /*
-        * change_page_attr() requires a global_flush_tlb() call after it.
-        * We do this after the printk so that if something went wrong in the
-        * change, the printk gets out at least to give a better debug hint
-        * of who is the culprit.
-        */
-       global_flush_tlb();
+#ifdef CONFIG_CPA_DEBUG
+       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+       set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+       printk(KERN_INFO "Testing CPA: write protecting again\n");
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
 }
 #endif
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * If debugging page accesses then do not free this memory but
+        * mark them not present - any buggy init-section access will
+        * create a kernel page fault:
+        */
+       printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+               begin, PAGE_ALIGN(end));
+       set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
        unsigned long addr;
 
+       /*
+        * We just marked the kernel text read only above, now that
+        * we are going to free part of that, we need to make that
+        * writeable first.
+        */
+       set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
@@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
                totalram_pages++;
        }
        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+#endif
 }
 
 void free_initmem(void)
@@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end)
        free_init_pages("initrd memory", start, end);
 }
 #endif
-
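Several of the init_32.c hunks above report region sizes by subtracting section boundary addresses and shifting the result down by 10 (bytes to KiB) or 20 (bytes to MiB). A tiny stand-alone example of that arithmetic, with made-up addresses in place of _text, _etext, VMALLOC_START and VMALLOC_END:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical section boundaries, stand-ins for the real symbols */
        unsigned long text_start = 0xC0100000UL;
        unsigned long text_end   = 0xC02E5000UL;
        unsigned long vm_start   = 0xE0800000UL;
        unsigned long vm_end     = 0xF57FE000UL;

        printf("kernel text: %luk\n",  (text_end - text_start) >> 10);
        printf("vmalloc    : %luMB\n", (vm_end - vm_start) >> 20);
        return 0;
    }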
index 0f9c8c890658043773018ad32e8786953dae8c79..cc50a13ce8d9d855f9f29e884843ed8717c1e0b2 100644 (file)
 #include <asm/proto.h>
 #include <asm/smp.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
 
-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-const struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
@@ -65,22 +63,26 @@ void show_mem(void)
 {
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
-       pg_data_t *pgdat;
        struct page *page;
+       pg_data_t *pgdat;
 
        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
-       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+       printk(KERN_INFO "Free swap:       %6ldkB\n",
+               nr_swap_pages << (PAGE_SHIFT-10));
 
        for_each_online_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-                       /* this loop can take a while with 256 GB and 4k pages
-                          so update the NMI watchdog */
-                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       /*
+                        * This loop can take a while with 256 GB and
+                        * 4k pages so defer the NMI watchdog:
+                        */
+                       if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
                                touch_nmi_watchdog();
-                       }
+
                        if (!pfn_valid(pgdat->node_start_pfn + i))
                                continue;
+
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
@@ -89,51 +91,58 @@ void show_mem(void)
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
-               }
+               }
        }
-       printk(KERN_INFO "%lu pages of RAM\n", total);
-       printk(KERN_INFO "%lu reserved pages\n",reserved);
-       printk(KERN_INFO "%lu pages shared\n",shared);
-       printk(KERN_INFO "%lu pages swap cached\n",cached);
+       printk(KERN_INFO "%lu pages of RAM\n",          total);
+       printk(KERN_INFO "%lu reserved pages\n",        reserved);
+       printk(KERN_INFO "%lu pages shared\n",          shared);
+       printk(KERN_INFO "%lu pages swap cached\n",     cached);
 }
 
 int after_bootmem;
 
 static __init void *spp_getpage(void)
-{ 
+{
        void *ptr;
+
        if (after_bootmem)
-               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
+               ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
-       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
-               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
 
-       Dprintk("spp_getpage %p\n", ptr);
+       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+               panic("set_pte_phys: cannot allocate page data %s\n",
+                       after_bootmem ? "after bootmem" : "");
+       }
+
+       pr_debug("spp_getpage %p\n", ptr);
+
        return ptr;
-} 
+}
 
-static __init void set_pte_phys(unsigned long vaddr,
-                        unsigned long phys, pgprot_t prot)
+static __init void
+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;
 
-       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+       pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
 
        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
-               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+               printk(KERN_ERR
+                       "PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
-               pmd = (pmd_t *) spp_getpage(); 
+               pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
-                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+                       printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+                               pmd, pmd_offset(pud, 0));
                        return;
                }
        }
@@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr,
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
-                       printk("PAGETABLE BUG #02!\n");
+                       printk(KERN_ERR "PAGETABLE BUG #02!\n");
                        return;
                }
        }
@@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr,
 }
 
 /* NOTE: this is meant to be run only at boot */
-void __init 
-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init
+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 {
        unsigned long address = __fix_to_virt(idx);
 
        if (idx >= __end_of_fixed_addresses) {
-               printk("Invalid __set_fixmap\n");
+               printk(KERN_ERR "Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
 }
 
-unsigned long __meminitdata table_start, table_end;
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
 
 static __meminit void *alloc_low_page(unsigned long *phys)
-{ 
+{
        unsigned long pfn = table_end++;
        void *adr;
 
        if (after_bootmem) {
                adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
+
                return adr;
        }
 
-       if (pfn >= end_pfn) 
-               panic("alloc_low_page: ran out of memory"); 
+       if (pfn >= end_pfn)
+               panic("alloc_low_page: ran out of memory");
 
        adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
@@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys)
 }
 
 static __meminit void unmap_low_page(void *adr)
-{ 
-
+{
        if (after_bootmem)
                return;
 
        early_iounmap(adr, PAGE_SIZE);
-} 
+}
 
 /* Must run before zap_low_mappings */
 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
 {
-       unsigned long vaddr;
        pmd_t *pmd, *last_pmd;
+       unsigned long vaddr;
        int i, pmds;
 
        pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        vaddr = __START_KERNEL_map;
        pmd = level2_kernel_pgt;
        last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+
        for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
                for (i = 0; i < pmds; i++) {
                        if (pmd_present(pmd[i]))
-                               goto next;
+                               goto continue_outer_loop;
                }
                vaddr += addr & ~PMD_MASK;
                addr &= PMD_MASK;
+
                for (i = 0; i < pmds; i++, addr += PMD_SIZE)
-                       set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
-               __flush_tlb();
+                       set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               __flush_tlb_all();
+
                return (void *)vaddr;
-       next:
+continue_outer_loop:
                ;
        }
-       printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+       printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
+
        return NULL;
 }
 
-/* To avoid virtual aliases later */
+/*
+ * To avoid virtual aliases later:
+ */
 __meminit void early_iounmap(void *addr, unsigned long size)
 {
        unsigned long vaddr;
@@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size)
        vaddr = (unsigned long)addr;
        pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        pmd = level2_kernel_pgt + pmd_index(vaddr);
+
        for (i = 0; i < pmds; i++)
                pmd_clear(pmd + i);
-       __flush_tlb();
+
+       __flush_tlb_all();
 }
 
 static void __meminit
@@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
                pmd_t *pmd = pmd_page + pmd_index(address);
 
                if (address >= end) {
-                       if (!after_bootmem)
+                       if (!after_bootmem) {
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
+                       }
                        break;
                }
 
                if (pmd_val(*pmd))
                        continue;
 
-               entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+               entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
                entry &= __supported_pte_mask;
                set_pmd(pmd, __pmd(entry));
        }
@@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 static void __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
 {
-       pmd_t *pmd = pmd_offset(pud,0);
+       pmd_t *pmd = pmd_offset(pud, 0);
        spin_lock(&init_mm.page_table_lock);
        phys_pmd_init(pmd, address, end);
        spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
 }
 
-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
-{ 
+static void __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
        int i = pud_index(addr);
 
-
-       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
+       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
                unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;
@@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
                if (addr >= end)
                        break;
 
-               if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
-                       set_pud(pud, __pud(0)); 
+               if (!after_bootmem &&
+                               !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
+                       set_pud(pud, __pud(0));
                        continue;
-               } 
+               }
 
                if (pud_val(*pud)) {
                        phys_pmd_update(pud, addr, end);
@@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
                }
 
                pmd = alloc_low_page(&pmd_phys);
+
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, addr, end);
                spin_unlock(&init_mm.page_table_lock);
+
                unmap_low_page(pmd);
        }
-       __flush_tlb();
-} 
+       __flush_tlb_all();
+}
 
 static void __init find_early_table_space(unsigned long end)
 {
@@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end)
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-       /* RED-PEN putting page tables only on node 0 could
-          cause a hotspot and fill up ZONE_DMA. The page tables
-          need roughly 0.5KB per GB. */
-       start = 0x8000;
-       table_start = find_e820_area(start, end, tables);
+       /*
+        * RED-PEN putting page tables only on node 0 could
+        * cause a hotspot and fill up ZONE_DMA. The page tables
+        * need roughly 0.5KB per GB.
+        */
+       start = 0x8000;
+       table_start = find_e820_area(start, end, tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");
 
+       /*
+        * When you have a lot of RAM like 256GB, early_table will not fit
+        * into 0x8000 range, find_e820_area() will find area after kernel
+        * bss but the table_start is not page aligned, so need to round it
+        * up to avoid overlap with bss:
+        */
+       table_start = round_up(table_start, PAGE_SIZE);
        table_start >>= PAGE_SHIFT;
        table_end = table_start;
 
@@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end)
                (table_start << PAGE_SHIFT) + tables);
 }
 
-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
-   This runs before bootmem is initialized and gets pages directly from the 
-   physical memory. To access them they are temporarily mapped. */
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
-{ 
-       unsigned long next; 
+{
+       unsigned long next;
 
-       Dprintk("init_memory_mapping\n");
+       pr_debug("init_memory_mapping\n");
 
-       /* 
+       /*
         * Find space for the kernel direct mapping tables.
-        * Later we should allocate these tables in the local node of the memory
-        * mapped.  Unfortunately this is done currently before the nodes are 
-        * discovered.
+        *
+        * Later we should allocate these tables in the local node of the
+        * memory mapped. Unfortunately this is done currently before the
+        * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);
@@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
        end = (unsigned long)__va(end);
 
        for (; start < end; start = next) {
-               unsigned long pud_phys; 
                pgd_t *pgd = pgd_offset_k(start);
+               unsigned long pud_phys;
                pud_t *pud;
 
                if (after_bootmem)
@@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
                        pud = alloc_low_page(&pud_phys);
 
                next = start + PGDIR_SIZE;
-               if (next > end) 
-                       next = end; 
+               if (next > end)
+                       next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(pud);
-       } 
+       }
 
        if (!after_bootmem)
                mmu_cr4_features = read_cr4();
        __flush_tlb_all();
+
+       reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
        unsigned long max_zone_pfns[MAX_NR_ZONES];
+
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -402,39 +439,48 @@ void __init paging_init(void)
 }
 #endif
 
-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
-   from the CPU leading to inconsistent cache lines. address and size
-   must be aligned to 2MB boundaries. 
-   Does nothing when the mapping doesn't exist. */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+/*
+ * Unmap a kernel mapping if it exists. This is useful to avoid
+ * prefetches from the CPU leading to inconsistent cache lines.
+ * address and size must be aligned to 2MB boundaries.
+ * Does nothing when the mapping doesn't exist.
+ */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 {
        unsigned long end = address + size;
 
        BUG_ON(address & ~LARGE_PAGE_MASK);
-       BUG_ON(size & ~LARGE_PAGE_MASK); 
-       
-       for (; address < end; address += LARGE_PAGE_SIZE) { 
+       BUG_ON(size & ~LARGE_PAGE_MASK);
+
+       for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
+
                if (pgd_none(*pgd))
                        continue;
+
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
-                       continue; 
+                       continue;
+
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
-                       continue; 
-               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
-                       /* Could handle this, but it should not happen currently. */
-                       printk(KERN_ERR 
-              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
-                       pmd_ERROR(*pmd); 
+                       continue;
+
+               if (!(pmd_val(*pmd) & _PAGE_PSE)) {
+                       /*
+                        * Could handle this, but it should not happen
+                        * currently:
+                        */
+                       printk(KERN_ERR "clear_kernel_mapping: "
+                               "mapping has been split. will leak memory\n");
+                       pmd_ERROR(*pmd);
                }
-               set_pmd(pmd, __pmd(0));                 
+               set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
-} 
+}
 
 /*
  * Memory hotplug specific functions
@@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
-       init_memory_mapping(start, (start + size -1));
+       init_memory_mapping(start, start + size-1);
 
        ret = __add_pages(zone, start_pfn, nr_pages);
-       if (ret)
-               goto error;
+       WARN_ON(1);
 
        return ret;
-error:
-       printk("%s: Problem encountered in __add_pages!\n", __func__);
-       return ret;
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
@@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
- */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
-{
-       int err = -EIO;
-       unsigned long pfn;
-       unsigned long total = 0, mem = 0;
-       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-               if (pfn_valid(pfn)) {
-                       online_page(pfn_to_page(pfn));
-                       err = 0;
-                       mem++;
-               }
-               total++;
-       }
-       if (!err) {
-               z->spanned_pages += total;
-               z->present_pages += mem;
-               z->zone_pgdat->node_spanned_pages += total;
-               z->zone_pgdat->node_present_pages += mem;
-       }
-       return err;
-}
-#endif
-
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
-                        kcore_vsyscall;
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+                        kcore_modules, kcore_vsyscall;
 
 void __init mem_init(void)
 {
@@ -521,8 +535,15 @@ void __init mem_init(void)
 
        pci_iommu_alloc();
 
-       /* clear the zero-page */
-       memset(empty_zero_page, 0, PAGE_SIZE);
+       /* clear_bss() already cleared the empty_zero_page */
+
+       /* temporary debugging - double check it's true: */
+       {
+               int i;
+
+               for (i = 0; i < 1024; i++)
+                       WARN_ON_ONCE(empty_zero_page[i]);
+       }
 
        reservedpages = 0;
 
@@ -534,7 +555,6 @@ void __init mem_init(void)
 #endif
        reservedpages = end_pfn - totalram_pages -
                                        absent_pages_in_range(0, end_pfn);
-
        after_bootmem = 1;
 
        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@ -542,15 +562,16 @@ void __init mem_init(void)
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
        /* Register memory areas for /proc/kcore */
-       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
-       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
+       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                                 VSYSCALL_END - VSYSCALL_START);
 
-       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
+       printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+                               "%ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
@@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
        if (begin >= end)
                return;
 
+       /*
+        * If debugging page accesses then do not free this memory but
+        * mark them not present - any buggy init-section access will
+        * create a kernel page fault:
+        */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+               begin, PAGE_ALIGN(end));
+       set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                        POISON_FREE_INITMEM, PAGE_SIZE);
-               if (addr >= __START_KERNEL_map)
-                       change_page_attr_addr(addr, 1, __pgprot(0));
                free_page(addr);
                totalram_pages++;
        }
-       if (addr > __START_KERNEL_map)
-               global_flush_tlb();
+#endif
 }
 
 void free_initmem(void)
@@ -589,6 +618,8 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void mark_rodata_ro(void)
 {
@@ -603,25 +634,27 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_KPROBES
        start = (unsigned long)__start_rodata;
 #endif
-       
+
        end = (unsigned long)__end_rodata;
        start = (start + PAGE_SIZE - 1) & PAGE_MASK;
        end &= PAGE_MASK;
        if (end <= start)
                return;
 
-       change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
+       set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);
 
-       /*
-        * change_page_attr_addr() requires a global_flush_tlb() call after it.
-        * We do this after the printk so that if something went wrong in the
-        * change, the printk gets out at least to give a better debug hint
-        * of who is the culprit.
-        */
-       global_flush_tlb();
+       rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+       set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+       printk(KERN_INFO "Testing CPA: again\n");
+       set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
 }
 #endif
 
@@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
-{ 
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
 #ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
 #endif
        unsigned long pfn = phys >> PAGE_SHIFT;
+
        if (pfn >= end_pfn) {
-               /* This can happen with kdump kernels when accessing firmware
-                  tables. */
+               /*
+                * This can happen with kdump kernels when accessing
+                * firmware tables:
+                */
                if (pfn < end_pfn_map)
                        return;
+
                printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
                                phys, len);
                return;
@@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 
        /* Should check here against the e820 map to avoid double free */
 #ifdef CONFIG_NUMA
-       reserve_bootmem_node(NODE_DATA(nid), phys, len);
-#else                  
-       reserve_bootmem(phys, len);    
+       reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else
+       reserve_bootmem(phys, len);
 #endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
                dma_reserve += len / PAGE_SIZE;
@@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
        }
 }
 
-int kern_addr_valid(unsigned long addr) 
-{ 
+int kern_addr_valid(unsigned long addr)
+{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
 
        if (above != 0 && above != -1UL)
-               return 0; 
-       
+               return 0;
+
        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;
 
        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
-               return 0; 
+               return 0;
 
        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
+
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));
 
        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
+
        return pfn_valid(pte_pfn(*pte));
 }
 
-/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
-   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
-   not need special handling anymore. */
-
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
 static struct vm_area_struct gate_vma = {
-       .vm_start = VSYSCALL_START,
-       .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
-       .vm_page_prot = PAGE_READONLY_EXEC,
-       .vm_flags = VM_READ | VM_EXEC
+       .vm_start       = VSYSCALL_START,
+       .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+       .vm_page_prot   = PAGE_READONLY_EXEC,
+       .vm_flags       = VM_READ | VM_EXEC
 };
 
 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 int in_gate_area(struct task_struct *task, unsigned long addr)
 {
        struct vm_area_struct *vma = get_gate_vma(task);
+
        if (!vma)
                return 0;
+
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
 }
 
-/* Use this when you have no reliable task/vma, typically from interrupt
- * context.  It is less reliable than using the task's vma and may give
- * false positives.
+/*
+ * Use this when you have no reliable task/vma, typically from interrupt
+ * context. It is less reliable than using the task's vma and may give
+ * false positives:
  */
 int in_gate_area_no_task(unsigned long addr)
 {
@@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
  */
-int __meminit vmemmap_populate(struct page *start_page,
-                                               unsigned long size, int node)
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
 {
        unsigned long addr = (unsigned long)start_page;
        unsigned long end = (unsigned long)(start_page + size);
@@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page,
                pgd = vmemmap_pgd_populate(addr, node);
                if (!pgd)
                        return -ENOMEM;
+
                pud = vmemmap_pud_populate(pgd, addr, node);
                if (!pud)
                        return -ENOMEM;
@@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page,
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        pte_t entry;
-                       void *p = vmemmap_alloc_block(PMD_SIZE, node);
+                       void *p;
+
+                       p = vmemmap_alloc_block(PMD_SIZE, node);
                        if (!p)
                                return -ENOMEM;
 
-                       entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
-                       mk_pte_huge(entry);
+                       entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+                                                       PAGE_KERNEL_LARGE);
                        set_pmd(pmd, __pmd(pte_val(entry)));
 
                        printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
                                addr, addr + PMD_SIZE - 1, p, node);
-               } else
+               } else {
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
+               }
        }
-
        return 0;
 }
 #endif
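
A stand-alone sketch of the sizing arithmetic in find_early_table_space() above: count the PUD and PMD entries needed to direct-map a given amount of RAM with 2 MiB pages, and round each table array up to a whole page. The 256 GiB figure and the constants below are illustrative only, not values taken from the patch.

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define ENTRY_SIZE	8ULL				/* bytes per pud_t/pmd_t on x86_64 */
#define PUD_SPAN	(512ULL * 512ULL * PAGE_SIZE)	/* 1 GiB mapped per PUD entry */
#define PMD_SPAN	(512ULL * PAGE_SIZE)		/* 2 MiB mapped per PMD entry */

static unsigned long long round_up_ull(unsigned long long x, unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long long end    = 256ULL << 30;	/* pretend we direct-map 256 GiB */
	unsigned long long puds   = (end + PUD_SPAN - 1) / PUD_SPAN;
	unsigned long long pmds   = (end + PMD_SPAN - 1) / PMD_SPAN;
	unsigned long long tables = round_up_ull(puds * ENTRY_SIZE, PAGE_SIZE) +
				    round_up_ull(pmds * ENTRY_SIZE, PAGE_SIZE);

	printf("%llu GiB -> %llu PUD + %llu PMD entries, %llu KiB of early page tables\n",
	       end >> 30, puds, pmds, tables >> 10);
	return 0;
}
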
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644 (file)
index 0000000..ed79572
--- /dev/null
@@ -0,0 +1,501 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+enum ioremap_mode {
+       IOR_MODE_UNCACHED,
+       IOR_MODE_CACHED,
+};
+
+#ifdef CONFIG_X86_64
+
+unsigned long __phys_addr(unsigned long x)
+{
+       if (x >= __START_KERNEL_map)
+               return x - __START_KERNEL_map + phys_base;
+       return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+#endif
+
+int page_is_ram(unsigned long pagenr)
+{
+       unsigned long addr, end;
+       int i;
+
+       for (i = 0; i < e820.nr_map; i++) {
+               /*
+                * Not usable memory:
+                */
+               if (e820.map[i].type != E820_RAM)
+                       continue;
+               addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
+               end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
+
+               /*
+                * Sanity check: Some BIOSen report areas as RAM that
+                * are not. Notably the 640->1Mb area, which is the
+                * PCI BIOS area.
+                */
+               if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+                   end < (BIOS_END >> PAGE_SHIFT))
+                       continue;
+
+               if ((pagenr >= addr) && (pagenr < end))
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+                              enum ioremap_mode mode)
+{
+       unsigned long vaddr = (unsigned long)__va(paddr);
+       unsigned long nrpages = size >> PAGE_SHIFT;
+       int err, level;
+
+       /* No change for pages after the last mapping */
+       if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
+               return 0;
+
+       /*
+        * If there is no identity map for this address,
+        * change_page_attr_addr is unnecessary
+        */
+       if (!lookup_address(vaddr, &level))
+               return 0;
+
+       switch (mode) {
+       case IOR_MODE_UNCACHED:
+       default:
+               err = set_memory_uc(vaddr, nrpages);
+               break;
+       case IOR_MODE_CACHED:
+               err = set_memory_wb(vaddr, nrpages);
+               break;
+       }
+
+       return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
+                              enum ioremap_mode mode)
+{
+       void __iomem *addr;
+       struct vm_struct *area;
+       unsigned long offset, last_addr;
+       pgprot_t prot;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+               return (__force void __iomem *)phys_to_virt(phys_addr);
+
+       /*
+        * Don't allow anybody to remap normal RAM that we're using..
+        */
+       for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
+            (offset << PAGE_SHIFT) < last_addr; offset++) {
+               if (page_is_ram(offset))
+                       return NULL;
+       }
+
+       switch (mode) {
+       case IOR_MODE_UNCACHED:
+       default:
+               prot = PAGE_KERNEL_NOCACHE;
+               break;
+       case IOR_MODE_CACHED:
+               prot = PAGE_KERNEL;
+               break;
+       }
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+       /*
+        * Ok, go for it..
+        */
+       area = get_vm_area(size, VM_IOREMAP);
+       if (!area)
+               return NULL;
+       area->phys_addr = phys_addr;
+       addr = (void __iomem *) area->addr;
+       if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
+                              phys_addr, prot)) {
+               remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+               return NULL;
+       }
+
+       if (ioremap_change_attr(phys_addr, size, mode) < 0) {
+               vunmap(addr);
+               return NULL;
+       }
+
+       return (void __iomem *) (offset + (char __iomem *)addr);
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
+{
+       return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
+{
+       return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+       struct vm_struct *p, *o;
+
+       if ((void __force *)addr <= high_memory)
+               return;
+
+       /*
+        * __ioremap special-cases the PCI/ISA range by not instantiating a
+        * vm_area and by simply returning an address into the kernel mapping
+        * of ISA space.   So handle that here.
+        */
+       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+           addr < phys_to_virt(ISA_END_ADDRESS))
+               return;
+
+       addr = (volatile void __iomem *)
+               (PAGE_MASK & (unsigned long __force)addr);
+
+       /* Use the vm area unlocked, assuming the caller
+          ensures there isn't another iounmap for the same address
+          in parallel. Reuse of the virtual address is prevented by
+          leaving it in the global lists until we're done with it.
+          cpa takes care of the direct mappings. */
+       read_lock(&vmlist_lock);
+       for (p = vmlist; p; p = p->next) {
+               if (p->addr == addr)
+                       break;
+       }
+       read_unlock(&vmlist_lock);
+
+       if (!p) {
+               printk(KERN_ERR "iounmap: bad address %p\n", addr);
+               dump_stack();
+               return;
+       }
+
+       /* Reset the direct mapping. Can block */
+       ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
+
+       /* Finally remove it */
+       o = remove_vm_area((void *)addr);
+       BUG_ON(p != o || o == NULL);
+       kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifdef CONFIG_X86_32
+
+int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+       early_ioremap_debug = 1;
+
+       return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static __initdata unsigned long bm_pte[1024]
+                               __attribute__((aligned(PAGE_SIZE)));
+
+static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+{
+       return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+}
+
+static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+{
+       return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+}
+
+void __init early_ioremap_init(void)
+{
+       unsigned long *pgd;
+
+       if (early_ioremap_debug)
+               printk(KERN_INFO "early_ioremap_init()\n");
+
+       pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+       *pgd = __pa(bm_pte) | _PAGE_TABLE;
+       memset(bm_pte, 0, sizeof(bm_pte));
+       /*
+        * The boot-ioremap range spans multiple pgds, for which
+        * we are not prepared:
+        */
+       if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+               WARN_ON(1);
+               printk(KERN_WARNING "pgd %p != %p\n",
+                      pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+                      fix_to_virt(FIX_BTMAP_BEGIN));
+               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
+                      fix_to_virt(FIX_BTMAP_END));
+
+               printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
+               printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
+                      FIX_BTMAP_BEGIN);
+       }
+}
+
+void __init early_ioremap_clear(void)
+{
+       unsigned long *pgd;
+
+       if (early_ioremap_debug)
+               printk(KERN_INFO "early_ioremap_clear()\n");
+
+       pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+       *pgd = 0;
+       paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+       __flush_tlb_all();
+}
+
+void __init early_ioremap_reset(void)
+{
+       enum fixed_addresses idx;
+       unsigned long *pte, phys, addr;
+
+       after_paging_init = 1;
+       for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
+               addr = fix_to_virt(idx);
+               pte = early_ioremap_pte(addr);
+               if (*pte & _PAGE_PRESENT) {
+                       phys = *pte & PAGE_MASK;
+                       set_fixmap(idx, phys);
+               }
+       }
+}
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+                                  unsigned long phys, pgprot_t flags)
+{
+       unsigned long *pte, addr = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       pte = early_ioremap_pte(addr);
+       if (pgprot_val(flags))
+               *pte = (phys & PAGE_MASK) | pgprot_val(flags);
+       else
+               *pte = 0;
+       __flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+                                       unsigned long phys)
+{
+       if (after_paging_init)
+               set_fixmap(idx, phys);
+       else
+               __early_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+       if (after_paging_init)
+               clear_fixmap(idx);
+       else
+               __early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+
+int __initdata early_ioremap_nested;
+
+static int __init check_early_ioremap_leak(void)
+{
+       if (!early_ioremap_nested)
+               return 0;
+
+       printk(KERN_WARNING
+              "Debug warning: early ioremap leak of %d areas detected.\n",
+              early_ioremap_nested);
+       printk(KERN_WARNING
+              "please boot with early_ioremap_debug and report the dmesg.\n");
+       WARN_ON(1);
+
+       return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+       unsigned long offset, last_addr;
+       unsigned int nrpages, nesting;
+       enum fixed_addresses idx0, idx;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
+
+       nesting = early_ioremap_nested;
+       if (early_ioremap_debug) {
+               printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
+                      phys_addr, size, nesting);
+               dump_stack();
+       }
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr) {
+               WARN_ON(1);
+               return NULL;
+       }
+
+       if (nesting >= FIX_BTMAPS_NESTING) {
+               WARN_ON(1);
+               return NULL;
+       }
+       early_ioremap_nested++;
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr) - phys_addr;
+
+       /*
+        * Mappings have to fit in the FIX_BTMAP area.
+        */
+       nrpages = size >> PAGE_SHIFT;
+       if (nrpages > NR_FIX_BTMAPS) {
+               WARN_ON(1);
+               return NULL;
+       }
+
+       /*
+        * Ok, go for it..
+        */
+       idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+       idx = idx0;
+       while (nrpages > 0) {
+               early_set_fixmap(idx, phys_addr);
+               phys_addr += PAGE_SIZE;
+               --idx;
+               --nrpages;
+       }
+       if (early_ioremap_debug)
+               printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+
+       return (void *) (offset + fix_to_virt(idx0));
+}
+
+void __init early_iounmap(void *addr, unsigned long size)
+{
+       unsigned long virt_addr;
+       unsigned long offset;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+       unsigned int nesting;
+
+       nesting = --early_ioremap_nested;
+       WARN_ON(nesting < 0);
+
+       if (early_ioremap_debug) {
+               printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+                      size, nesting);
+               dump_stack();
+       }
+
+       virt_addr = (unsigned long)addr;
+       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+               WARN_ON(1);
+               return;
+       }
+       offset = virt_addr & ~PAGE_MASK;
+       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+
+       idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+       while (nrpages > 0) {
+               early_clear_fixmap(idx);
+               --idx;
+               --nrpages;
+       }
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+       WARN_ON(1);
+}
+
+#endif /* CONFIG_X86_32 */
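
A minimal sketch of the offset/alignment bookkeeping that __ioremap() above performs before calling get_vm_area() and ioremap_page_range(): a non-page-aligned physical request is widened to whole pages, and the sub-page offset is added back to the returned pointer. The address and size below are made up, and the vmalloc and cache-attribute handling is omitted.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long phys_addr = 0xfed00123UL;	/* unaligned MMIO address */
	unsigned long size      = 0x2000UL;	/* 8 KiB requested        */

	unsigned long last_addr = phys_addr + size - 1;
	unsigned long offset    = phys_addr & ~PAGE_MASK;

	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr + 1) - phys_addr;

	/* three whole pages get mapped; the caller sees vaddr + 0x123 */
	printf("map phys %#lx for %#lx bytes, return vaddr + %#lx\n",
	       phys_addr, size, offset);
	return 0;
}
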
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
deleted file mode 100644 (file)
index 0b27831..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * arch/i386/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <asm/fixmap.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-
-#define ISA_START_ADDRESS      0xa0000
-#define ISA_END_ADDRESS                0x100000
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-       void __iomem * addr;
-       struct vm_struct * area;
-       unsigned long offset, last_addr;
-       pgprot_t prot;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return (void __iomem *) phys_to_virt(phys_addr);
-
-       /*
-        * Don't allow anybody to remap normal RAM that we're using..
-        */
-       if (phys_addr <= virt_to_phys(high_memory - 1)) {
-               char *t_addr, *t_end;
-               struct page *page;
-
-               t_addr = __va(phys_addr);
-               t_end = t_addr + (size - 1);
-          
-               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-                       if(!PageReserved(page))
-                               return NULL;
-       }
-
-       prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
-                       | _PAGE_ACCESSED | flags);
-
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-       /*
-        * Ok, go for it..
-        */
-       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-       if (!area)
-               return NULL;
-       area->phys_addr = phys_addr;
-       addr = (void __iomem *) area->addr;
-       if (ioremap_page_range((unsigned long) addr,
-                       (unsigned long) addr + size, phys_addr, prot)) {
-               vunmap((void __force *) addr);
-               return NULL;
-       }
-       return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many 
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-       unsigned long last_addr;
-       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
-       if (!p) 
-               return p; 
-
-       /* Guaranteed to be > phys_addr, as per __ioremap() */
-       last_addr = phys_addr + size - 1;
-
-       if (last_addr < virt_to_phys(high_memory) - 1) {
-               struct page *ppage = virt_to_page(__va(phys_addr));             
-               unsigned long npages;
-
-               phys_addr &= PAGE_MASK;
-
-               /* This might overflow and become zero.. */
-               last_addr = PAGE_ALIGN(last_addr);
-
-               /* .. but that's ok, because modulo-2**n arithmetic will make
-               * the page-aligned "last - first" come out right.
-               */
-               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
-
-               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
-                       iounmap(p); 
-                       p = NULL;
-               }
-               global_flush_tlb();
-       }
-
-       return p;                                       
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-       struct vm_struct *p, *o;
-
-       if ((void __force *)addr <= high_memory)
-               return;
-
-       /*
-        * __ioremap special-cases the PCI/ISA range by not instantiating a
-        * vm_area and by simply returning an address into the kernel mapping
-        * of ISA space.   So handle that here.
-        */
-       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-                       addr < phys_to_virt(ISA_END_ADDRESS))
-               return;
-
-       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-
-       /* Use the vm area unlocked, assuming the caller
-          ensures there isn't another iounmap for the same address
-          in parallel. Reuse of the virtual address is prevented by
-          leaving it in the global lists until we're done with it.
-          cpa takes care of the direct mappings. */
-       read_lock(&vmlist_lock);
-       for (p = vmlist; p; p = p->next) {
-               if (p->addr == addr)
-                       break;
-       }
-       read_unlock(&vmlist_lock);
-
-       if (!p) {
-               printk("iounmap: bad address %p\n", addr);
-               dump_stack();
-               return;
-       }
-
-       /* Reset the direct mapping. Can block */
-       if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
-               change_page_attr(virt_to_page(__va(p->phys_addr)),
-                                get_vm_area_size(p) >> PAGE_SHIFT,
-                                PAGE_KERNEL);
-               global_flush_tlb();
-       } 
-
-       /* Finally remove it */
-       o = remove_vm_area((void *)addr);
-       BUG_ON(p != o || o == NULL);
-       kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
-{
-       unsigned long offset, last_addr;
-       unsigned int nrpages;
-       enum fixed_addresses idx;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return phys_to_virt(phys_addr);
-
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr) - phys_addr;
-
-       /*
-        * Mappings have to fit in the FIX_BTMAP area.
-        */
-       nrpages = size >> PAGE_SHIFT;
-       if (nrpages > NR_FIX_BTMAPS)
-               return NULL;
-
-       /*
-        * Ok, go for it..
-        */
-       idx = FIX_BTMAP_BEGIN;
-       while (nrpages > 0) {
-               set_fixmap(idx, phys_addr);
-               phys_addr += PAGE_SIZE;
-               --idx;
-               --nrpages;
-       }
-       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
-}
-
-void __init bt_iounmap(void *addr, unsigned long size)
-{
-       unsigned long virt_addr;
-       unsigned long offset;
-       unsigned int nrpages;
-       enum fixed_addresses idx;
-
-       virt_addr = (unsigned long)addr;
-       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
-               return;
-       offset = virt_addr & ~PAGE_MASK;
-       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
-
-       idx = FIX_BTMAP_BEGIN;
-       while (nrpages > 0) {
-               clear_fixmap(idx);
-               --idx;
-               --nrpages;
-       }
-}
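
The bt_ioremap()/bt_iounmap() pair removed above always used the single slot window starting at FIX_BTMAP_BEGIN; the replacement early_ioremap() in the unified ioremap.c additionally supports nesting. A small sketch of that slot arithmetic, with made-up constants (the real NR_FIX_BTMAPS and FIX_BTMAPS_NESTING come from the fixmap headers):

#include <stdio.h>

#define NR_FIX_BTMAPS		4
#define FIX_BTMAPS_NESTING	4
#define FIX_BTMAP_BEGIN		64	/* highest slot index, made up */

int main(void)
{
	int nesting;

	/* each nesting level gets its own window, counted downwards */
	for (nesting = 0; nesting < FIX_BTMAPS_NESTING; nesting++) {
		int first = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS * nesting;
		int last  = first - (NR_FIX_BTMAPS - 1);

		printf("nesting %d uses fixmap slots %d..%d\n",
		       nesting, first, last);
	}
	return 0;
}
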
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
deleted file mode 100644 (file)
index 6cac90a..0000000
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * arch/x86_64/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-
-#include <asm/pgalloc.h>
-#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/proto.h>
-
-unsigned long __phys_addr(unsigned long x)
-{
-       if (x >= __START_KERNEL_map)
-               return x - __START_KERNEL_map + phys_base;
-       return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-#define ISA_START_ADDRESS      0xa0000
-#define ISA_END_ADDRESS                0x100000
-
-/*
- * Fix up the linear direct mapping of the kernel to avoid cache attribute
- * conflicts.
- */
-static int
-ioremap_change_attr(unsigned long phys_addr, unsigned long size,
-                                       unsigned long flags)
-{
-       int err = 0;
-       if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
-               unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               unsigned long vaddr = (unsigned long) __va(phys_addr);
-
-               /*
-                * Must use a address here and not struct page because the phys addr
-                * can be a in hole between nodes and not have an memmap entry.
-                */
-               err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
-               if (!err)
-                       global_flush_tlb();
-       }
-       return err;
-}
-
-/*
- * Generic mapping function
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-       void * addr;
-       struct vm_struct * area;
-       unsigned long offset, last_addr;
-       pgprot_t pgprot;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * Don't remap the low PCI/ISA area, it's always mapped..
-        */
-       if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-               return (__force void __iomem *)phys_to_virt(phys_addr);
-
-#ifdef CONFIG_FLATMEM
-       /*
-        * Don't allow anybody to remap normal RAM that we're using..
-        */
-       if (last_addr < virt_to_phys(high_memory)) {
-               char *t_addr, *t_end;
-               struct page *page;
-
-               t_addr = __va(phys_addr);
-               t_end = t_addr + (size - 1);
-          
-               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-                       if(!PageReserved(page))
-                               return NULL;
-       }
-#endif
-
-       pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
-                         | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
-       /*
-        * Mappings have to be page-aligned
-        */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-       /*
-        * Ok, go for it..
-        */
-       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-       if (!area)
-               return NULL;
-       area->phys_addr = phys_addr;
-       addr = area->addr;
-       if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-                              phys_addr, pgprot)) {
-               remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
-               return NULL;
-       }
-       if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
-               area->flags &= 0xffffff;
-               vunmap(addr);
-               return NULL;
-       }
-       return (__force void __iomem *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many 
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-       return __ioremap(phys_addr, size, _PAGE_PCD);
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-       struct vm_struct *p, *o;
-
-       if (addr <= high_memory) 
-               return; 
-       if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-               addr < phys_to_virt(ISA_END_ADDRESS))
-               return;
-
-       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-       /* Use the vm area unlocked, assuming the caller
-          ensures there isn't another iounmap for the same address
-          in parallel. Reuse of the virtual address is prevented by
-          leaving it in the global lists until we're done with it.
-          cpa takes care of the direct mappings. */
-       read_lock(&vmlist_lock);
-       for (p = vmlist; p; p = p->next) {
-               if (p->addr == addr)
-                       break;
-       }
-       read_unlock(&vmlist_lock);
-
-       if (!p) {
-               printk("iounmap: bad address %p\n", addr);
-               dump_stack();
-               return;
-       }
-
-       /* Reset the direct mapping. Can block */
-       if (p->flags >> 20)
-               ioremap_change_attr(p->phys_addr, p->size, 0);
-
-       /* Finally remove it */
-       o = remove_vm_area((void *)addr);
-       BUG_ON(p != o || o == NULL);
-       kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a96006f7ae0c84a1878e5315522d89a56ff8242f..7a2ebce87df5dee511701d3f3d521d62e2d26dc8 100644 (file)
@@ -1,9 +1,9 @@
-/* 
+/*
  * AMD K8 NUMA support.
  * Discover the memory map and associated nodes.
- * 
+ *
  * This version reads it directly from the K8 northbridge.
- * 
+ *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
 #include <linux/kernel.h>
 
 static __init int find_northbridge(void)
 {
-       int num; 
+       int num;
 
-       for (num = 0; num < 32; num++) { 
+       for (num = 0; num < 32; num++) {
                u32 header;
-               
-               header = read_pci_config(0, num, 0, 0x00);  
-               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
-                       continue;       
-
-               header = read_pci_config(0, num, 1, 0x00); 
-               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
-                       continue;       
-               return num; 
-       } 
-
-       return -1;      
+
+               header = read_pci_config(0, num, 0, 0x00);
+               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+                       continue;
+
+               header = read_pci_config(0, num, 1, 0x00);
+               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+                       continue;
+               return num;
+       }
+
+       return -1;
 }
 
 int __init k8_scan_nodes(unsigned long start, unsigned long end)
-{ 
+{
        unsigned long prevbase;
        struct bootnode nodes[8];
-       int nodeid, i, j, nb;
+       int nodeid, i, nb;
        unsigned char nodeids[8];
        int found = 0;
        u32 reg;
        unsigned numnodes;
-       unsigned num_cores;
+       unsigned cores;
+       unsigned bits;
+       int j;
 
        if (!early_pci_allowed())
                return -1;
 
-       nb = find_northbridge(); 
-       if (nb < 0) 
+       nb = find_northbridge();
+       if (nb < 0)
                return nb;
 
-       printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
-
-       num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-       printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
+       printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
 
-       reg = read_pci_config(0, nb, 0, 0x60); 
+       reg = read_pci_config(0, nb, 0, 0x60);
        numnodes = ((reg >> 4) & 0xF) + 1;
        if (numnodes <= 1)
                return -1;
 
        printk(KERN_INFO "Number of nodes %d\n", numnodes);
 
-       memset(&nodes,0,sizeof(nodes)); 
+       memset(&nodes, 0, sizeof(nodes));
        prevbase = 0;
-       for (i = 0; i < 8; i++) { 
-               unsigned long base,limit; 
+       for (i = 0; i < 8; i++) {
+               unsigned long base, limit;
                u32 nodeid;
-               
+
                base = read_pci_config(0, nb, 1, 0x40 + i*8);
                limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-               nodeid = limit & 7; 
+               nodeid = limit & 7;
                nodeids[i] = nodeid;
-               if ((base & 3) == 0) { 
+               if ((base & 3) == 0) {
                        if (i < numnodes)
-                               printk("Skipping disabled node %d\n", i); 
+                               printk("Skipping disabled node %d\n", i);
                        continue;
-               } 
+               }
                if (nodeid >= numnodes) {
                        printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-                              base, limit); 
+                              base, limit);
                        continue;
-               } 
+               }
 
-               if (!limit) { 
-                       printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
-                              base);
+               if (!limit) {
+                       printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
+                              i, base);
                        continue;
                }
                if ((base >> 8) & 3 || (limit >> 8) & 3) {
-                       printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
-                              nodeid, (base>>8)&3, (limit>>8) & 3); 
-                       return -1; 
-               }       
+                       printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
+                              nodeid, (base>>8)&3, (limit>>8) & 3);
+                       return -1;
+               }
                if (node_isset(nodeid, node_possible_map)) {
-                       printk(KERN_INFO "Node %d already present. Skipping\n", 
+                       printk(KERN_INFO "Node %d already present. Skipping\n",
                               nodeid);
                        continue;
                }
 
-               limit >>= 16; 
-               limit <<= 24; 
+               limit >>= 16;
+               limit <<= 24;
                limit |= (1<<24)-1;
                limit++;
 
                if (limit > end_pfn << PAGE_SHIFT)
                        limit = end_pfn << PAGE_SHIFT;
                if (limit <= base)
-                       continue; 
-                       
+                       continue;
+
                base >>= 16;
-               base <<= 24; 
-
-               if (base < start) 
-                       base = start; 
-               if (limit > end) 
-                       limit = end; 
-               if (limit == base) { 
-                       printk(KERN_ERR "Empty node %d\n", nodeid); 
-                       continue; 
+               base <<= 24;
+
+               if (base < start)
+                       base = start;
+               if (limit > end)
+                       limit = end;
+               if (limit == base) {
+                       printk(KERN_ERR "Empty node %d\n", nodeid);
+                       continue;
                }
-               if (limit < base) { 
+               if (limit < base) {
                        printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
-                              nodeid, base, limit);                           
+                              nodeid, base, limit);
                        continue;
-               } 
-               
+               }
+
                /* Could sort here, but punt for now. Should not happen anyway. */
-               if (prevbase > base) { 
+               if (prevbase > base) {
                        printk(KERN_ERR "Node map not sorted %lx,%lx\n",
-                              prevbase,base);
+                              prevbase, base);
                        return -1;
                }
-                       
-               printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
-                      nodeid, base, limit); 
-               
+
+               printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
+                      nodeid, base, limit);
+
                found++;
-               
-               nodes[nodeid].start = base; 
+
+               nodes[nodeid].start = base;
                nodes[nodeid].end = limit;
                e820_register_active_regions(nodeid,
                                nodes[nodeid].start >> PAGE_SHIFT,
@@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
                prevbase = base;
 
                node_set(nodeid, node_possible_map);
-       } 
+       }
 
        if (!found)
-               return -1; 
+               return -1;
 
        memnode_shift = compute_hash_shift(nodes, 8);
-       if (memnode_shift < 0) { 
-               printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
-               return -1; 
-       } 
-       printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
+       if (memnode_shift < 0) {
+               printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+               return -1;
+       }
+       printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+
+       /* use the coreid bits from early_identify_cpu */
+       bits = boot_cpu_data.x86_coreid_bits;
+       cores = (1<<bits);
 
        for (i = 0; i < 8; i++) {
-               if (nodes[i].start != nodes[i].end) { 
+               if (nodes[i].start != nodes[i].end) {
                        nodeid = nodeids[i];
-                       for (j = 0; j < num_cores; j++)
-                               apicid_to_node[(nodeid * num_cores) + j] = i;
-                       setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
-               } 
+                       for (j = 0; j < cores; j++)
+                               apicid_to_node[(nodeid << bits) + j] = i;
+                       setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+               }
        }
 
        numa_init_array();
        return 0;
-} 
+}
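
The rework above stops probing the core count with cpuid and instead reuses boot_cpu_data.x86_coreid_bits, filling apicid_to_node[] by shifting the node id rather than multiplying. A small illustration of the APIC-ID layout this assumes (the node and core numbers are invented):

/*
 * With bits = x86_coreid_bits, the core number occupies the low bits of
 * the local APIC ID and the node (socket) number sits above it.
 */
static inline int example_apicid(int nodeid, int core, unsigned int bits)
{
        return (nodeid << bits) + core;   /* node 1, bits 2, core 3 -> APIC ID 7 */
}
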
similarity index 52%
rename from arch/x86/mm/mmap_32.c
rename to arch/x86/mm/mmap.c
index 552e0847375585884eb96352e1636cffd23b05e8..56fe7124fbec932c0b14f8d9dbe9b448fcdd88b7 100644 (file)
@@ -1,10 +1,13 @@
 /*
- *  linux/arch/i386/mm/mmap.c
+ * Flexible mmap layout support
  *
- *  flexible mmap layout support
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
  * All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
  */
 
 #include <linux/personality.h>
 #include <linux/mm.h>
 #include <linux/random.h>
+#include <linux/limits.h>
 #include <linux/sched.h>
 
 /*
 #define MIN_GAP (128*1024*1024)
 #define MAX_GAP (TASK_SIZE/6*5)
 
-static inline unsigned long mmap_base(struct mm_struct *mm)
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static int mmap_is_ia32(void)
 {
-       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-       unsigned long random_factor = 0;
+#ifdef CONFIG_X86_32
+       return 1;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+       if (test_thread_flag(TIF_IA32))
+               return 1;
+#endif
+       return 0;
+}
 
-       if (current->flags & PF_RANDOMIZE)
-               random_factor = get_random_int() % (1024*1024);
+static int mmap_is_legacy(void)
+{
+       if (current->personality & ADDR_COMPAT_LAYOUT)
+               return 1;
+
+       if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+               return 1;
+
+       return sysctl_legacy_va_layout;
+}
+
+static unsigned long mmap_rnd(void)
+{
+       unsigned long rnd = 0;
+
+       /*
+       *  8 bits of randomness in 32bit mmaps, 20 address space bits
+       * 28 bits of randomness in 64bit mmaps, 40 address space bits
+       */
+       if (current->flags & PF_RANDOMIZE) {
+               if (mmap_is_ia32())
+                       rnd = (long)get_random_int() % (1<<8);
+               else
+                       rnd = (long)(get_random_int() % (1<<28));
+       }
+       return rnd << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base(void)
+{
+       unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
 
        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;
 
-       return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
+       return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+}
+
+/*
+ * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
+ * does, but not when emulating X86_32
+ */
+static unsigned long mmap_legacy_base(void)
+{
+       if (mmap_is_ia32())
+               return TASK_UNMAPPED_BASE;
+       else
+               return TASK_UNMAPPED_BASE + mmap_rnd();
 }
 
 /*
@@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-       /*
-        * Fall back to the standard layout if the personality
-        * bit is set, or if the expected stack growth is unlimited:
-        */
-       if (sysctl_legacy_va_layout ||
-                       (current->personality & ADDR_COMPAT_LAYOUT) ||
-                       current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
-               mm->mmap_base = TASK_UNMAPPED_BASE;
+       if (mmap_is_legacy()) {
+               mm->mmap_base = mmap_legacy_base();
                mm->get_unmapped_area = arch_get_unmapped_area;
                mm->unmap_area = arch_unmap_area;
        } else {
-               mm->mmap_base = mmap_base(mm);
+               mm->mmap_base = mmap_base();
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
                mm->unmap_area = arch_unmap_area_topdown;
        }
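
With mmap_32.c renamed and the old mmap_64.c removed (below), the randomization policy now lives in one place: mmap_rnd() above adds 8 bits of page-granular randomness for 32-bit (and IA32-emulated) tasks and 28 bits for native 64-bit tasks. A rough stand-alone sketch of how much the mmap base can vary under those numbers, assuming 4 KiB pages (illustration only, not from the patch):

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;                             /* assumed 4 KiB pages */
        unsigned long long span32 = (1ULL << 8)  << page_shift;  /* 2^20 bytes */
        unsigned long long span64 = (1ULL << 28) << page_shift;  /* 2^40 bytes */

        printf("32-bit mmap base varies over %llu MiB\n", span32 >> 20);  /* 1 MiB */
        printf("64-bit mmap base varies over %llu GiB\n", span64 >> 30);  /* 1024 GiB */
        return 0;
}
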
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
deleted file mode 100644 (file)
index 80bba0d..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2005 Andi Kleen, SuSE Labs.
- * Licensed under GPL, v.2
- */
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/random.h>
-#include <asm/ia32.h>
-
-/* Notebook: move the mmap code from sys_x86_64.c over here. */
-
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (current_thread_info()->flags & _TIF_IA32)
-               return ia32_pick_mmap_layout(mm);
-#endif
-       mm->mmap_base = TASK_UNMAPPED_BASE;
-       if (current->flags & PF_RANDOMIZE) {
-               /* Add 28bit randomness which is about 40bits of address space
-                  because mmap base has to be page aligned.
-                  or ~1/128 of the total user VM
-                  (total user address space is 47bits) */
-               unsigned rnd = get_random_int() & 0xfffffff;
-               mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
-       }
-       mm->get_unmapped_area = arch_get_unmapped_area;
-       mm->unmap_area = arch_unmap_area;
-}
-
index 3d6926ba8995e4d54131c2660ffb1348955d6e32..dc3b1f7e1451ba46221524860aa0cf54c7e564bb 100644 (file)
@@ -1,7 +1,7 @@
-/* 
+/*
  * Generic VM initialization for x86-64 NUMA setups.
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */ 
+ */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/sched.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
 #include <asm/numa.h>
 #include <asm/acpi.h>
+#include <asm/k8.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
 #endif
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 struct memnode memnode;
 
-unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
+int x86_cpu_to_node_map_init[NR_CPUS] = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
 };
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
-       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+void *x86_cpu_to_node_map_early_ptr;
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+
+s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
-cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_to_cpumask_map);
 
 int numa_off __initdata;
 unsigned long __initdata nodemap_addr;
 unsigned long __initdata nodemap_size;
 
-
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
  * 0 if memnodemap[] too small (or shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __init
-populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
+static int __init populate_memnodemap(const struct bootnode *nodes,
+                                     int numnodes, int shift)
 {
-       int i; 
-       int res = -1;
        unsigned long addr, end;
+       int i, res = -1;
 
-       memset(memnodemap, 0xff, memnodemapsize);
+       memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
@@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
-                       if (memnodemap[addr >> shift] != 0xff)
+                       if (memnodemap[addr >> shift] != NUMA_NO_NODE)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
-       } 
+       }
        return res;
 }
 
@@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void)
        unsigned long pad, pad_addr;
 
        memnodemap = memnode.embedded_map;
-       if (memnodemapsize <= 48)
+       if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
                return 0;
 
        pad = L1_CACHE_BYTES - 1;
        pad_addr = 0x8000;
-       nodemap_size = pad + memnodemapsize;
+       nodemap_size = pad + sizeof(s16) * memnodemapsize;
        nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
                                      nodemap_size);
        if (nodemap_addr == -1UL) {
@@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void)
        }
        pad_addr = (nodemap_addr + pad) & ~pad;
        memnodemap = phys_to_virt(pad_addr);
+       reserve_early(nodemap_addr, nodemap_addr + nodemap_size);
 
        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void)
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
  */
-static int __init
-extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
+                                        int numnodes)
 {
        int i, nodes_used = 0;
        unsigned long start, end;
@@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
                shift);
 
        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
-               printk(KERN_INFO
-       "Your memory is not aligned you need to rebuild your kernel "
-       "with a bigger NODEMAPSIZE shift=%d\n",
-                       shift);
+               printk(KERN_INFO "Your memory is not aligned; you need to "
+                      "rebuild your kernel with a bigger NODEMAPSIZE "
+                      "shift=%d\n", shift);
                return -1;
        }
        return shift;
 }
 
-#ifdef CONFIG_SPARSEMEM
 int early_pfn_to_nid(unsigned long pfn)
 {
        return phys_to_nid(pfn << PAGE_SHIFT);
 }
-#endif
 
-static void * __init
-early_node_mem(int nodeid, unsigned long start, unsigned long end,
-             unsigned long size)
+static void * __init early_node_mem(int nodeid, unsigned long start,
+                                   unsigned long end, unsigned long size)
 {
        unsigned long mem = find_e820_area(start, end, size);
        void *ptr;
+
        if (mem != -1L)
                return __va(mem);
        ptr = __alloc_bootmem_nopanic(size,
                                SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
-                       size, nodeid);
+                      size, nodeid);
                return NULL;
        }
        return ptr;
 }
 
 /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{ 
-       unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
-       unsigned long nodedata_phys;
+void __init setup_node_bootmem(int nodeid, unsigned long start,
+                              unsigned long end)
+{
+       unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
+       unsigned long bootmap_start, nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 
-       start = round_up(start, ZONE_ALIGN); 
+       start = round_up(start, ZONE_ALIGN);
 
-       printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+       printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+              start, end);
 
        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;
@@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 
        /* Find a place for the bootmem map */
-       bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
+       bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                        bootmap_pages<<PAGE_SHIFT);
        if (bootmap == NULL)  {
                if (nodedata_phys < start || nodedata_phys >= end)
-                       free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+                       free_bootmem((unsigned long)node_data[nodeid],
+                                    pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);
-       Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
-       
+       Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
+
        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
-                                        bootmap_start >> PAGE_SHIFT, 
-                                        start_pfn, end_pfn); 
+                                        bootmap_start >> PAGE_SHIFT,
+                                        start_pfn, end_pfn);
 
        free_bootmem_with_active_regions(nodeid, end);
 
-       reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-       reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+       reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
+       reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+                            bootmap_pages<<PAGE_SHIFT);
 #ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
 #endif
        node_set_online(nodeid);
-} 
-
-/* Initialize final allocator for a zone */
-void __init setup_node_zones(int nodeid)
-{ 
-       unsigned long start_pfn, end_pfn, memmapsize, limit;
-
-       start_pfn = node_start_pfn(nodeid);
-       end_pfn = node_end_pfn(nodeid);
-
-       Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
-               nodeid, start_pfn, end_pfn);
-
-       /* Try to allocate mem_map at end to not fill up precious <4GB
-          memory. */
-       memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
-       limit = end_pfn << PAGE_SHIFT;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-       NODE_DATA(nodeid)->node_mem_map = 
-               __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-                               memmapsize, SMP_CACHE_BYTES, 
-                               round_down(limit - memmapsize, PAGE_SIZE), 
-                               limit);
-#endif
-} 
+}
 
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
 void __init numa_init_array(void)
 {
        int rr, i;
-       /* There are unfortunately some poorly designed mainboards around
-          that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-          mapping. To avoid this fill in the mapping for all possible
-          CPUs, as the number of CPUs is not known yet. 
-          We round robin the existing nodes. */
+
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
-               if (cpu_to_node(i) != NUMA_NO_NODE)
+               if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
-               numa_set_node(i, rr);
+               numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
-
 }
 
 #ifdef CONFIG_NUMA_EMU
@@ -276,15 +265,17 @@ void __init numa_init_array(void)
 char *cmdline __initdata;
 
 /*
- * Setups up nid to range from addr to addr + size.  If the end boundary is
- * greater than max_addr, then max_addr is used instead.  The return value is 0
- * if there is additional memory left for allocation past addr and -1 otherwise.
- * addr is adjusted to be at the end of the node.
+ * Sets up nid to range from addr to addr + size.  If the end
+ * boundary is greater than max_addr, then max_addr is used instead.
+ * The return value is 0 if there is additional memory left for
+ * allocation past addr and -1 otherwise.  addr is adjusted to be at
+ * the end of the node.
  */
 static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
                                   u64 size, u64 max_addr)
 {
        int ret = 0;
+
        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
@@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 
        for (i = node_start; i < num_nodes + node_start; i++) {
                u64 end = *addr + size;
+
                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
@@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
        struct bootnode nodes[MAX_NUMNODES];
-       u64 addr = start_pfn << PAGE_SHIFT;
+       u64 size, addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = end_pfn << PAGE_SHIFT;
-       int num_nodes = 0;
-       int coeff_flag;
-       int coeff = -1;
-       int num = 0;
-       u64 size;
-       int i;
+       int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 
        memset(&nodes, 0, sizeof(nodes));
        /*
@@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
         * system RAM into N fake nodes.
         */
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
-                                               simple_strtol(cmdline, NULL, 0));
+               long n = simple_strtol(cmdline, NULL, 0);
+
+               num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
@@ -483,46 +471,47 @@ out:
        for_each_node_mask(i, node_possible_map) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                                nodes[i].end >> PAGE_SHIFT);
-               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        acpi_fake_nodes(nodes, num_nodes);
-       numa_init_array();
-       return 0;
+       numa_init_array();
+       return 0;
 }
 #endif /* CONFIG_NUMA_EMU */
 
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{ 
+{
        int i;
 
        nodes_clear(node_possible_map);
 
 #ifdef CONFIG_NUMA_EMU
        if (cmdline && !numa_emulation(start_pfn, end_pfn))
-               return;
+               return;
        nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
-               return;
+               return;
        nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
-       if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+       if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
+                                       end_pfn<<PAGE_SHIFT))
                return;
        nodes_clear(node_possible_map);
 #endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");
 
-       printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+       printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
-              end_pfn << PAGE_SHIFT); 
-               /* setup dummy node covering all memory */ 
-       memnode_shift = 63; 
+              end_pfn << PAGE_SHIFT);
+       /* setup dummy node covering all memory */
+       memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
@@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        node_set(0, node_possible_map);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
-       node_to_cpumask[0] = cpumask_of_cpu(0);
+       /* cpumask_of_cpu() may not be available during early startup */
+       memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
+       cpu_set(0, node_to_cpumask_map[0]);
        e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
 __cpuinit void numa_add_cpu(int cpu)
 {
-       set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-} 
+       set_bit(cpu,
+               (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
 
 void __cpuinit numa_set_node(int cpu, int node)
 {
+       int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
+
        cpu_pda(cpu)->nodenumber = node;
-       cpu_to_node(cpu) = node;
+
+       if (cpu_to_node_map)
+               cpu_to_node_map[cpu] = node;
+       else if (per_cpu_offset(cpu))
+               per_cpu(x86_cpu_to_node_map, cpu) = node;
+       else
+               Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
 }
 
-unsigned long __init numa_free_all_bootmem(void) 
-{ 
-       int i;
+unsigned long __init numa_free_all_bootmem(void)
+{
        unsigned long pages = 0;
-       for_each_online_node(i) {
+       int i;
+
+       for_each_online_node(i)
                pages += free_all_bootmem_node(NODE_DATA(i));
-       }
+
        return pages;
-} 
+}
 
 void __init paging_init(void)
-{ 
-       int i;
+{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
+
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +569,27 @@ void __init paging_init(void)
        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
 
-       for_each_online_node(i) {
-               setup_node_zones(i); 
-       }
-
        free_area_init_nodes(max_zone_pfns);
-} 
+}
 
 static __init int numa_setup(char *opt)
-{ 
+{
        if (!opt)
                return -EINVAL;
-       if (!strncmp(opt,"off",3))
+       if (!strncmp(opt, "off", 3))
                numa_off = 1;
 #ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5))
                cmdline = opt + 5;
 #endif
 #ifdef CONFIG_ACPI_NUMA
-       if (!strncmp(opt,"noacpi",6))
-               acpi_numa = -1;
-       if (!strncmp(opt,"hotadd=", 7))
+       if (!strncmp(opt, "noacpi", 6))
+               acpi_numa = -1;
+       if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
        return 0;
-} 
-
+}
 early_param("numa", numa_setup);
 
 /*
@@ -611,38 +607,16 @@ early_param("numa", numa_setup);
 void __init init_cpu_to_node(void)
 {
        int i;
-       for (i = 0; i < NR_CPUS; i++) {
-               u8 apicid = x86_cpu_to_apicid_init[i];
+
+       for (i = 0; i < NR_CPUS; i++) {
+               u16 apicid = x86_cpu_to_apicid_init[i];
+
                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
-               numa_set_node(i,apicid_to_node[apicid]);
+               numa_set_node(i, apicid_to_node[apicid]);
        }
 }
 
-EXPORT_SYMBOL(cpu_to_node);
-EXPORT_SYMBOL(node_to_cpumask);
-EXPORT_SYMBOL(memnode);
-EXPORT_SYMBOL(node_data);
-
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * Functions to convert PFNs from/to per node page addresses.
- * These are out of line because they are quite big.
- * They could be all tuned by pre caching more state.
- * Should do that.
- */
 
-int pfn_valid(unsigned long pfn)
-{
-       unsigned nid;
-       if (pfn >= num_physpages)
-               return 0;
-       nid = pfn_to_nid(pfn);
-       if (nid == 0xff)
-               return 0;
-       return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
-}
-EXPORT_SYMBOL(pfn_valid);
-#endif
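
Node lookup in numa_64.c keeps the same scheme after the cleanup above: memnodemap[] stores one node id per 2^memnode_shift bytes of physical address space (now as s16 entries), and populate_memnodemap()/compute_hash_shift() choose a shift so that no slot spans two nodes. A simplified, out-of-kernel model of the resulting lookup (the table contents and shift are invented):

/* one entry per 2^shift bytes of physical memory; -1 marks a hole (NUMA_NO_NODE) */
static const short memnodemap_example[] = { 0, 0, 1, 1, -1, -1, 1, 1 };
static const int memnode_shift_example = 33;    /* assumed: 8 GiB covered per entry */

static int phys_to_nid_example(unsigned long long phys_addr)
{
        return memnodemap_example[phys_addr >> memnode_shift_example];
}
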
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644 (file)
index 0000000..06353d4
--- /dev/null
@@ -0,0 +1,224 @@
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the global bit on random pages in the direct mapping, then reverts
+ * and compares page tables before and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+enum {
+       NTEST                   = 4000,
+#ifdef CONFIG_X86_64
+       LPS                     = (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+       LPS                     = (1 << PMD_SHIFT),
+#else
+       LPS                     = (1 << 22),
+#endif
+       GPS                     = (1<<30)
+};
+
+struct split_state {
+       long lpg, gpg, spg, exec;
+       long min_exec, max_exec;
+};
+
+static __init int print_split(struct split_state *s)
+{
+       long i, expected, missed = 0;
+       int printed = 0;
+       int err = 0;
+
+       s->lpg = s->gpg = s->spg = s->exec = 0;
+       s->min_exec = ~0UL;
+       s->max_exec = 0;
+       for (i = 0; i < max_pfn_mapped; ) {
+               unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+               int level;
+               pte_t *pte;
+
+               pte = lookup_address(addr, &level);
+               if (!pte) {
+                       if (!printed) {
+                               dump_pagetable(addr);
+                               printk(KERN_INFO "CPA %lx no pte level %d\n",
+                                       addr, level);
+                               printed = 1;
+                       }
+                       missed++;
+                       i++;
+                       continue;
+               }
+
+               if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+                       s->gpg++;
+                       i += GPS/PAGE_SIZE;
+               } else if (level == PG_LEVEL_2M) {
+                       if (!(pte_val(*pte) & _PAGE_PSE)) {
+                               printk(KERN_ERR
+                                       "%lx level %d but not PSE %Lx\n",
+                                       addr, level, (u64)pte_val(*pte));
+                               err = 1;
+                       }
+                       s->lpg++;
+                       i += LPS/PAGE_SIZE;
+               } else {
+                       s->spg++;
+                       i++;
+               }
+               if (!(pte_val(*pte) & _PAGE_NX)) {
+                       s->exec++;
+                       if (addr < s->min_exec)
+                               s->min_exec = addr;
+                       if (addr > s->max_exec)
+                               s->max_exec = addr;
+               }
+       }
+       printk(KERN_INFO
+               "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+               s->spg, s->lpg, s->gpg, s->exec,
+               s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
+
+       expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+       if (expected != i) {
+               printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+                       max_pfn_mapped, expected);
+               return 1;
+       }
+       return err;
+}
+
+static unsigned long __initdata addr[NTEST];
+static unsigned int __initdata len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static __init int exercise_pageattr(void)
+{
+       struct split_state sa, sb, sc;
+       unsigned long *bm;
+       pte_t *pte, pte0;
+       int failed = 0;
+       int level;
+       int i, k;
+       int err;
+
+       printk(KERN_INFO "CPA exercising pageattr\n");
+
+       bm = vmalloc((max_pfn_mapped + 7) / 8);
+       if (!bm) {
+               printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+               return -ENOMEM;
+       }
+       memset(bm, 0, (max_pfn_mapped + 7) / 8);
+
+       failed += print_split(&sa);
+       srandom32(100);
+
+       for (i = 0; i < NTEST; i++) {
+               unsigned long pfn = random32() % max_pfn_mapped;
+
+               addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+               len[i] = random32() % 100;
+               len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+               if (len[i] == 0)
+                       len[i] = 1;
+
+               pte = NULL;
+               pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+               for (k = 0; k < len[i]; k++) {
+                       pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+                       if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+                               addr[i] = 0;
+                               break;
+                       }
+                       if (k == 0) {
+                               pte0 = *pte;
+                       } else {
+                               if (pgprot_val(pte_pgprot(*pte)) !=
+                                       pgprot_val(pte_pgprot(pte0))) {
+                                       len[i] = k;
+                                       break;
+                               }
+                       }
+                       if (test_bit(pfn + k, bm)) {
+                               len[i] = k;
+                               break;
+                       }
+                       __set_bit(pfn + k, bm);
+               }
+               if (!addr[i] || !pte || !k) {
+                       addr[i] = 0;
+                       continue;
+               }
+
+               err = change_page_attr_clear(addr[i], len[i],
+                                              __pgprot(_PAGE_GLOBAL));
+               if (err < 0) {
+                       printk(KERN_ERR "CPA %d failed %d\n", i, err);
+                       failed++;
+               }
+
+               pte = lookup_address(addr[i], &level);
+               if (!pte || pte_global(*pte) || pte_huge(*pte)) {
+                       printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+                               pte ? (u64)pte_val(*pte) : 0ULL);
+                       failed++;
+               }
+               if (level != PG_LEVEL_4K) {
+                       printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+                               addr[i], level);
+                       failed++;
+               }
+
+       }
+       vfree(bm);
+
+       failed += print_split(&sb);
+
+       printk(KERN_INFO "CPA reverting everything\n");
+       for (i = 0; i < NTEST; i++) {
+               if (!addr[i])
+                       continue;
+               pte = lookup_address(addr[i], &level);
+               if (!pte) {
+                       printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+                       failed++;
+                       continue;
+               }
+               err = change_page_attr_set(addr[i], len[i],
+                                            __pgprot(_PAGE_GLOBAL));
+               if (err < 0) {
+                       printk(KERN_ERR "CPA reverting failed: %d\n", err);
+                       failed++;
+               }
+               pte = lookup_address(addr[i], &level);
+               if (!pte || !pte_global(*pte)) {
+                       printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+                               addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+                       failed++;
+               }
+
+       }
+
+       failed += print_split(&sc);
+
+       if (failed) {
+               printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
+               WARN_ON(1);
+       } else {
+               printk(KERN_INFO "CPA selftests PASSED\n");
+       }
+
+       return 0;
+}
+module_init(exercise_pageattr);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644 (file)
index 0000000..1cc6607
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+       return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @addr:      virtual start address
+ * @size:      number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+       void *vend = vaddr + size - 1;
+
+       mb();
+
+       for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+               clflush(vaddr);
+       /*
+        * Flush any possible final partial cacheline:
+        */
+       clflush(vend);
+
+       mb();
+}
+
+static void __cpa_flush_all(void *arg)
+{
+       /*
+        * Flush all to work around errata in early Athlons regarding
+        * large page flushing.
+        */
+       __flush_tlb_all();
+
+       if (boot_cpu_data.x86_model >= 4)
+               wbinvd();
+}
+
+static void cpa_flush_all(void)
+{
+       BUG_ON(irqs_disabled());
+
+       on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+       /*
+        * We could optimize that further and do individual per page
+        * tlb invalidates for a low number of pages. Caveat: we must
+        * flush the high aliases on 64bit as well.
+        */
+       __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages)
+{
+       unsigned int i, level;
+       unsigned long addr;
+
+       BUG_ON(irqs_disabled());
+       WARN_ON(PAGE_ALIGN(start) != start);
+
+       on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+       /*
+        * We only need to flush on one CPU,
+        * clflush is a MESI-coherent instruction that
+        * will cause all other CPUs to flush the same
+        * cachelines:
+        */
+       for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+               pte_t *pte = lookup_address(addr, &level);
+
+               /*
+                * Only flush present addresses:
+                */
+               if (pte && pte_present(*pte))
+                       clflush_cache_range((void *) addr, PAGE_SIZE);
+       }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
+{
+       pgprot_t forbidden = __pgprot(0);
+
+       /*
+        * The BIOS area between 640k and 1Mb needs to be executable for
+        * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+        */
+       if (within(__pa(address), BIOS_BEGIN, BIOS_END))
+               pgprot_val(forbidden) |= _PAGE_NX;
+
+       /*
+        * The kernel text needs to be executable for obvious reasons
+        * Does not cover __inittext since that is gone later on
+        */
+       if (within(address, (unsigned long)_text, (unsigned long)_etext))
+               pgprot_val(forbidden) |= _PAGE_NX;
+
+#ifdef CONFIG_DEBUG_RODATA
+       /* The .rodata section needs to be read-only */
+       if (within(address, (unsigned long)__start_rodata,
+                               (unsigned long)__end_rodata))
+               pgprot_val(forbidden) |= _PAGE_RW;
+#endif
+
+       prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+       return prot;
+}
+
+pte_t *lookup_address(unsigned long address, int *level)
+{
+       pgd_t *pgd = pgd_offset_k(address);
+       pud_t *pud;
+       pmd_t *pmd;
+
+       *level = PG_LEVEL_NONE;
+
+       if (pgd_none(*pgd))
+               return NULL;
+       pud = pud_offset(pgd, address);
+       if (pud_none(*pud))
+               return NULL;
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd))
+               return NULL;
+
+       *level = PG_LEVEL_2M;
+       if (pmd_large(*pmd))
+               return (pte_t *)pmd;
+
+       *level = PG_LEVEL_4K;
+       return pte_offset_kernel(pmd, address);
+}
+
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+       /* change init_mm */
+       set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+       if (!SHARED_KERNEL_PMD) {
+               struct page *page;
+
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       pud_t *pud;
+                       pmd_t *pmd;
+
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       pud = pud_offset(pgd, address);
+                       pmd = pmd_offset(pud, address);
+                       set_pte_atomic((pte_t *)pmd, pte);
+               }
+       }
+#endif
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+       pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+       gfp_t gfp_flags = GFP_KERNEL;
+       unsigned long flags;
+       unsigned long addr;
+       pte_t *pbase, *tmp;
+       struct page *base;
+       unsigned int i, level;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
+       gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
+#endif
+       base = alloc_pages(gfp_flags, 0);
+       if (!base)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&pgd_lock, flags);
+       /*
+        * Check for races, another CPU might have split this page
+        * up for us already:
+        */
+       tmp = lookup_address(address, &level);
+       if (tmp != kpte) {
+               WARN_ON_ONCE(1);
+               goto out_unlock;
+       }
+
+       address = __pa(address);
+       addr = address & LARGE_PAGE_MASK;
+       pbase = (pte_t *)page_address(base);
+#ifdef CONFIG_X86_32
+       paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+#endif
+
+       pgprot_val(ref_prot) &= ~_PAGE_NX;
+       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
+               set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+
+       /*
+        * Install the new, split up pagetable. Important detail here:
+        *
+        * On Intel the NX bit of all levels must be cleared to make a
+        * page executable. See section 4.13.2 of Intel 64 and IA-32
+        * Architectures Software Developer's Manual.
+        */
+       ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+       __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
+       base = NULL;
+
+out_unlock:
+       spin_unlock_irqrestore(&pgd_lock, flags);
+
+       if (base)
+               __free_pages(base, 0);
+
+       return 0;
+}
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn,
+                  pgprot_t mask_set, pgprot_t mask_clr)
+{
+       struct page *kpte_page;
+       int level, err = 0;
+       pte_t *kpte;
+
+#ifdef CONFIG_X86_32
+       BUG_ON(pfn > max_low_pfn);
+#endif
+
+repeat:
+       kpte = lookup_address(address, &level);
+       if (!kpte)
+               return -EINVAL;
+
+       kpte_page = virt_to_page(kpte);
+       BUG_ON(PageLRU(kpte_page));
+       BUG_ON(PageCompound(kpte_page));
+
+       if (level == PG_LEVEL_4K) {
+               pgprot_t new_prot = pte_pgprot(*kpte);
+               pte_t new_pte, old_pte = *kpte;
+
+               pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
+               pgprot_val(new_prot) |= pgprot_val(mask_set);
+
+               new_prot = static_protections(new_prot, address);
+
+               new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+               BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+
+               set_pte_atomic(kpte, new_pte);
+       } else {
+               err = split_large_page(kpte, address);
+               if (!err)
+                       goto repeat;
+       }
+       return err;
+}
+
+/**
+ * change_page_attr_addr - Change page table attributes in linear mapping
+ * @address: Virtual address in linear mapping.
+ * @prot:    New page table attribute (PAGE_*)
+ *
+ * Change page attributes of a page in the direct mapping. This is a variant
+ * of change_page_attr() that also works on memory holes that do not have
+ * mem_map entry (pfn_valid() is false).
+ *
+ * See change_page_attr() documentation for more details.
+ *
+ * Modules and drivers should use the set_memory_* APIs instead.
+ */
+
+#define HIGH_MAP_START __START_KERNEL_map
+#define HIGH_MAP_END   (__START_KERNEL_map + KERNEL_TEXT_SIZE)
+
+static int
+change_page_attr_addr(unsigned long address, pgprot_t mask_set,
+                     pgprot_t mask_clr)
+{
+       unsigned long phys_addr = __pa(address);
+       unsigned long pfn = phys_addr >> PAGE_SHIFT;
+       int err;
+
+#ifdef CONFIG_X86_64
+       /*
+        * If we are inside the high mapped kernel range, then we
+        * fixup the low mapping first. __va() returns the virtual
+        * address in the linear mapping:
+        */
+       if (within(address, HIGH_MAP_START, HIGH_MAP_END))
+               address = (unsigned long) __va(phys_addr);
+#endif
+
+       err = __change_page_attr(address, pfn, mask_set, mask_clr);
+       if (err)
+               return err;
+
+#ifdef CONFIG_X86_64
+       /*
+        * If the physical address is inside the kernel map, we need
+        * to touch the high mapped kernel as well:
+        */
+       if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
+               /*
+                * Calc the high mapping address. See __phys_addr()
+                * for the non obvious details.
+                */
+               address = phys_addr + HIGH_MAP_START - phys_base;
+               /* Make sure the kernel mappings stay executable */
+               pgprot_val(mask_clr) |= _PAGE_NX;
+
+               /*
+                * Our high aliases are imprecise, because we check
+                * everything between 0 and KERNEL_TEXT_SIZE, so do
+                * not propagate lookup failures back to users:
+                */
+               __change_page_attr(address, pfn, mask_set, mask_clr);
+       }
+#endif
+       return err;
+}
+
+static int __change_page_attr_set_clr(unsigned long addr, int numpages,
+                                     pgprot_t mask_set, pgprot_t mask_clr)
+{
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
+               ret = change_page_attr_addr(addr, mask_set, mask_clr);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int change_page_attr_set_clr(unsigned long addr, int numpages,
+                                   pgprot_t mask_set, pgprot_t mask_clr)
+{
+       int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
+                                            mask_clr);
+
+       /*
+        * On success we use clflush, when the CPU supports it, to
+        * avoid the wbinvd. If the CPU does not support it, and in the
+        * error case, we fall back to cpa_flush_all (which uses
+        * wbinvd):
+        */
+       if (!ret && cpu_has_clflush)
+               cpa_flush_range(addr, numpages);
+       else
+               cpa_flush_all();
+
+       return ret;
+}
+
+static inline int change_page_attr_set(unsigned long addr, int numpages,
+                                      pgprot_t mask)
+{
+       return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int change_page_attr_clear(unsigned long addr, int numpages,
+                                        pgprot_t mask)
+{
+       return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+       return change_page_attr_set(addr, numpages,
+                                   __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+       return change_page_attr_clear(addr, numpages,
+                                     __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+       return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return set_memory_rw(addr, numpages);
+}
+
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
+static inline int __change_page_attr_set(unsigned long addr, int numpages,
+                                        pgprot_t mask)
+{
+       return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int __change_page_attr_clear(unsigned long addr, int numpages,
+                                          pgprot_t mask)
+{
+       return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return __change_page_attr_set(addr, numpages,
+                                     __pgprot(_PAGE_PRESENT | _PAGE_RW));
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+       unsigned long addr = (unsigned long)page_address(page);
+
+       return __change_page_attr_clear(addr, numpages,
+                                       __pgprot(_PAGE_PRESENT));
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       if (PageHighMem(page))
+               return;
+       if (!enable) {
+               debug_check_no_locks_freed(page_address(page),
+                                          numpages * PAGE_SIZE);
+       }
+
+       /*
+        * If page allocator is not up yet then do not call c_p_a():
+        */
+       if (!debug_pagealloc_enabled)
+               return;
+
+       /*
+        * The return value is ignored - the calls cannot fail,
+        * large pages are disabled at boot time:
+        */
+       if (enable)
+               __set_pages_p(page, numpages);
+       else
+               __set_pages_np(page, numpages);
+
+       /*
+        * We should perform an IPI and flush all tlbs,
+        * but that can deadlock->flush only current cpu:
+        */
+       __flush_tlb_all();
+}
+#endif
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
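
A minimal usage sketch (not part of the patch) of the new typed helpers defined above: a hypothetical driver switches a few linear-map pages to uncached and restores write-back caching before freeing them. The set_memory_uc()/set_memory_wb() signatures are taken from the code above; the <asm/cacheflush.h> home of their prototypes is an assumption of this sketch.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>     /* assumed location of the set_memory_*() prototypes */

static unsigned long example_buf;

static int example_map_uncached(void)
{
        int err;

        example_buf = __get_free_pages(GFP_KERNEL, 2);  /* 4 contiguous pages */
        if (!example_buf)
                return -ENOMEM;

        /* Set _PAGE_PCD | _PAGE_PWT on the kernel linear mapping. */
        err = set_memory_uc(example_buf, 4);
        if (err)
                free_pages(example_buf, 2);
        return err;
}

static void example_release(void)
{
        /* Restore write-back caching before handing the pages back. */
        set_memory_wb(example_buf, 4);
        free_pages(example_buf, 2);
}

Compared with the removed change_page_attr() call, the intent (uncached vs. write-back) is explicit in the helper's name rather than encoded in a pgprot_t argument.
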
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
deleted file mode 100644 (file)
index 260073c..0000000
+++ /dev/null
@@ -1,278 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-
-static DEFINE_SPINLOCK(cpa_lock);
-static struct list_head df_list = LIST_HEAD_INIT(df_list);
-
-
-pte_t *lookup_address(unsigned long address) 
-{ 
-       pgd_t *pgd = pgd_offset_k(address);
-       pud_t *pud;
-       pmd_t *pmd;
-       if (pgd_none(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (pud_none(*pud))
-               return NULL;
-       pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd))
-               return NULL;
-       if (pmd_large(*pmd))
-               return (pte_t *)pmd;
-        return pte_offset_kernel(pmd, address);
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-                                       pgprot_t ref_prot)
-{ 
-       int i; 
-       unsigned long addr;
-       struct page *base;
-       pte_t *pbase;
-
-       spin_unlock_irq(&cpa_lock);
-       base = alloc_pages(GFP_KERNEL, 0);
-       spin_lock_irq(&cpa_lock);
-       if (!base) 
-               return NULL;
-
-       /*
-        * page_private is used to track the number of entries in
-        * the page table page that have non standard attributes.
-        */
-       SetPagePrivate(base);
-       page_private(base) = 0;
-
-       address = __pa(address);
-       addr = address & LARGE_PAGE_MASK; 
-       pbase = (pte_t *)page_address(base);
-       paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-               set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
-                                          addr == address ? prot : ref_prot));
-       }
-       return base;
-} 
-
-static void cache_flush_page(struct page *p)
-{ 
-       void *adr = page_address(p);
-       int i;
-       for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
-               clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
-       struct list_head *lh = (struct list_head *)arg;
-       struct page *p;
-
-       /* High level code is not ready for clflush yet */
-       if (0 && cpu_has_clflush) {
-               list_for_each_entry (p, lh, lru)
-                       cache_flush_page(p);
-       } else if (boot_cpu_data.x86_model >= 4)
-               wbinvd();
-
-       /* Flush all to work around Errata in early athlons regarding 
-        * large page flushing. 
-        */
-       __flush_tlb_all();      
-}
-
-static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
-{ 
-       struct page *page;
-       unsigned long flags;
-
-       set_pte_atomic(kpte, pte);      /* change init_mm */
-       if (SHARED_KERNEL_PMD)
-               return;
-
-       spin_lock_irqsave(&pgd_lock, flags);
-       for (page = pgd_list; page; page = (struct page *)page->index) {
-               pgd_t *pgd;
-               pud_t *pud;
-               pmd_t *pmd;
-               pgd = (pgd_t *)page_address(page) + pgd_index(address);
-               pud = pud_offset(pgd, address);
-               pmd = pmd_offset(pud, address);
-               set_pte_atomic((pte_t *)pmd, pte);
-       }
-       spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static inline void revert_page(struct page *kpte_page, unsigned long address)
-{
-       pgprot_t ref_prot;
-       pte_t *linear;
-
-       ref_prot =
-       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-               ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
-       linear = (pte_t *)
-               pmd_offset(pud_offset(pgd_offset_k(address), address), address);
-       set_pmd_pte(linear,  address,
-                   pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-                           ref_prot));
-}
-
-static inline void save_page(struct page *kpte_page)
-{
-       if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
-               list_add(&kpte_page->lru, &df_list);
-}
-
-static int
-__change_page_attr(struct page *page, pgprot_t prot)
-{ 
-       pte_t *kpte; 
-       unsigned long address;
-       struct page *kpte_page;
-
-       BUG_ON(PageHighMem(page));
-       address = (unsigned long)page_address(page);
-
-       kpte = lookup_address(address);
-       if (!kpte)
-               return -EINVAL;
-       kpte_page = virt_to_page(kpte);
-       BUG_ON(PageLRU(kpte_page));
-       BUG_ON(PageCompound(kpte_page));
-
-       if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
-               if (!pte_huge(*kpte)) {
-                       set_pte_atomic(kpte, mk_pte(page, prot)); 
-               } else {
-                       pgprot_t ref_prot;
-                       struct page *split;
-
-                       ref_prot =
-                       ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-                               ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
-                       split = split_large_page(address, prot, ref_prot);
-                       if (!split)
-                               return -ENOMEM;
-                       set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
-                       kpte_page = split;
-               }
-               page_private(kpte_page)++;
-       } else if (!pte_huge(*kpte)) {
-               set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
-               BUG_ON(page_private(kpte_page) == 0);
-               page_private(kpte_page)--;
-       } else
-               BUG();
-
-       /*
-        * If the pte was reserved, it means it was created at boot
-        * time (not via split_large_page) and in turn we must not
-        * replace it with a largepage.
-        */
-
-       save_page(kpte_page);
-       if (!PageReserved(kpte_page)) {
-               if (cpu_has_pse && (page_private(kpte_page) == 0)) {
-                       paravirt_release_pt(page_to_pfn(kpte_page));
-                       revert_page(kpte_page, address);
-               }
-       }
-       return 0;
-} 
-
-static inline void flush_map(struct list_head *l)
-{
-       on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-       int err = 0; 
-       int i; 
-       unsigned long flags;
-
-       spin_lock_irqsave(&cpa_lock, flags);
-       for (i = 0; i < numpages; i++, page++) { 
-               err = __change_page_attr(page, prot);
-               if (err) 
-                       break; 
-       }       
-       spin_unlock_irqrestore(&cpa_lock, flags);
-       return err;
-}
-
-void global_flush_tlb(void)
-{
-       struct list_head l;
-       struct page *pg, *next;
-
-       BUG_ON(irqs_disabled());
-
-       spin_lock_irq(&cpa_lock);
-       list_replace_init(&df_list, &l);
-       spin_unlock_irq(&cpa_lock);
-       flush_map(&l);
-       list_for_each_entry_safe(pg, next, &l, lru) {
-               list_del(&pg->lru);
-               clear_bit(PG_arch_1, &pg->flags);
-               if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
-                       continue;
-               ClearPagePrivate(pg);
-               __free_page(pg);
-       }
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
-       if (PageHighMem(page))
-               return;
-       if (!enable)
-               debug_check_no_locks_freed(page_address(page),
-                                          numpages * PAGE_SIZE);
-
-       /* the return value is ignored - the calls cannot fail,
-        * large pages are disabled at boot time.
-        */
-       change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
-       /* we should perform an IPI and flush all tlbs,
-        * but that can deadlock->flush only current cpu.
-        */
-       __flush_tlb_all();
-}
-#endif
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
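
For contrast, a hypothetical caller of the 32-bit API removed above: change_page_attr() only queued the PTE updates, and its documented contract required an explicit global_flush_tlb() afterwards. This is a sketch under the pre-removal declarations (assumed to come from <asm/cacheflush.h>), not code from the patch.

#include <linux/mm.h>
#include <asm/cacheflush.h>

/* Hypothetical helper: make a range of lowmem pages read-only. */
static int example_make_ro(struct page *page, int numpages)
{
        int err = change_page_attr(page, numpages, PAGE_KERNEL_RO);

        /* Old contract: flush TLBs (and revert/free split pages) afterwards. */
        if (!err)
                global_flush_tlb();
        return err;
}
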
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
deleted file mode 100644 (file)
index c40afba..0000000
+++ /dev/null
@@ -1,255 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-pte_t *lookup_address(unsigned long address)
-{ 
-       pgd_t *pgd = pgd_offset_k(address);
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       if (pgd_none(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return NULL; 
-       pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
-               return NULL; 
-       if (pmd_large(*pmd))
-               return (pte_t *)pmd;
-       pte = pte_offset_kernel(pmd, address);
-       if (pte && !pte_present(*pte))
-               pte = NULL; 
-       return pte;
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-                                    pgprot_t ref_prot)
-{ 
-       int i; 
-       unsigned long addr;
-       struct page *base = alloc_pages(GFP_KERNEL, 0);
-       pte_t *pbase;
-       if (!base) 
-               return NULL;
-       /*
-        * page_private is used to track the number of entries in
-        * the page table page have non standard attributes.
-        */
-       SetPagePrivate(base);
-       page_private(base) = 0;
-
-       address = __pa(address);
-       addr = address & LARGE_PAGE_MASK; 
-       pbase = (pte_t *)page_address(base);
-       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
-                                  addr == address ? prot : ref_prot);
-       }
-       return base;
-} 
-
-void clflush_cache_range(void *adr, int size)
-{
-       int i;
-       for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
-               clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
-       struct list_head *l = (struct list_head *)arg;
-       struct page *pg;
-
-       /* When clflush is available always use it because it is
-          much cheaper than WBINVD. */
-       /* clflush is still broken. Disable for now. */
-       if (1 || !cpu_has_clflush)
-               asm volatile("wbinvd" ::: "memory");
-       else list_for_each_entry(pg, l, lru) {
-               void *adr = page_address(pg);
-               clflush_cache_range(adr, PAGE_SIZE);
-       }
-       __flush_tlb_all();
-}
-
-static inline void flush_map(struct list_head *l)
-{      
-       on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
-
-static inline void save_page(struct page *fpage)
-{
-       if (!test_and_set_bit(PG_arch_1, &fpage->flags))
-               list_add(&fpage->lru, &deferred_pages);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t large_pte;
-       unsigned long pfn;
-
-       pgd = pgd_offset_k(address);
-       BUG_ON(pgd_none(*pgd));
-       pud = pud_offset(pgd,address);
-       BUG_ON(pud_none(*pud));
-       pmd = pmd_offset(pud, address);
-       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
-       pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
-       large_pte = pfn_pte(pfn, ref_prot);
-       large_pte = pte_mkhuge(large_pte);
-       set_pte((pte_t *)pmd, large_pte);
-}      
-
-static int
-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
-                                  pgprot_t ref_prot)
-{ 
-       pte_t *kpte; 
-       struct page *kpte_page;
-       pgprot_t ref_prot2;
-
-       kpte = lookup_address(address);
-       if (!kpte) return 0;
-       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
-       BUG_ON(PageLRU(kpte_page));
-       BUG_ON(PageCompound(kpte_page));
-       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
-               if (!pte_huge(*kpte)) {
-                       set_pte(kpte, pfn_pte(pfn, prot));
-               } else {
-                       /*
-                        * split_large_page will take the reference for this
-                        * change_page_attr on the split page.
-                        */
-                       struct page *split;
-                       ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-                       split = split_large_page(address, prot, ref_prot2);
-                       if (!split)
-                               return -ENOMEM;
-                       pgprot_val(ref_prot2) &= ~_PAGE_NX;
-                       set_pte(kpte, mk_pte(split, ref_prot2));
-                       kpte_page = split;
-               }
-               page_private(kpte_page)++;
-       } else if (!pte_huge(*kpte)) {
-               set_pte(kpte, pfn_pte(pfn, ref_prot));
-               BUG_ON(page_private(kpte_page) == 0);
-               page_private(kpte_page)--;
-       } else
-               BUG();
-
-       /* on x86-64 the direct mapping set at boot is not using 4k pages */
-       BUG_ON(PageReserved(kpte_page));
-
-       save_page(kpte_page);
-       if (page_private(kpte_page) == 0)
-               revert_page(address, ref_prot);
-       return 0;
-} 
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
-{
-       int err = 0, kernel_map = 0;
-       int i; 
-
-       if (address >= __START_KERNEL_map
-           && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
-               address = (unsigned long)__va(__pa(address));
-               kernel_map = 1;
-       }
-
-       down_write(&init_mm.mmap_sem);
-       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
-               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
-
-               if (!kernel_map || pte_present(pfn_pte(0, prot))) {
-                       err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
-                       if (err)
-                               break;
-               }
-               /* Handle kernel mapping too which aliases part of the
-                * lowmem */
-               if (__pa(address) < KERNEL_TEXT_SIZE) {
-                       unsigned long addr2;
-                       pgprot_t prot2;
-                       addr2 = __START_KERNEL_map + __pa(address);
-                       /* Make sure the kernel mappings stay executable */
-                       prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
-                       err = __change_page_attr(addr2, pfn, prot2,
-                                                PAGE_KERNEL_EXEC);
-               } 
-       }       
-       up_write(&init_mm.mmap_sem); 
-       return err;
-}
-
-/* Don't call this for MMIO areas that may not have a mem_map entry */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-       unsigned long addr = (unsigned long)page_address(page);
-       return change_page_attr_addr(addr, numpages, prot);
-}
-
-void global_flush_tlb(void)
-{ 
-       struct page *pg, *next;
-       struct list_head l;
-
-       /*
-        * Write-protect the semaphore, to exclude two contexts
-        * doing a list_replace_init() call in parallel and to
-        * exclude new additions to the deferred_pages list:
-        */
-       down_write(&init_mm.mmap_sem);
-       list_replace_init(&deferred_pages, &l);
-       up_write(&init_mm.mmap_sem);
-
-       flush_map(&l);
-
-       list_for_each_entry_safe(pg, next, &l, lru) {
-               list_del(&pg->lru);
-               clear_bit(PG_arch_1, &pg->flags);
-               if (page_private(pg) != 0)
-                       continue;
-               ClearPagePrivate(pg);
-               __free_page(pg);
-       } 
-} 
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
index be61a1d845a4a5e9fffbe5ccd9a80c9650d45e62..2ae5999a795adfb5cd56f160d0018aee4f0578ee 100644 (file)
@@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
        return pte;
 }
 
-void pmd_ctor(struct kmem_cache *cache, void *pmd)
-{
-       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
 /*
  * List of all pgd's needed for non-PAE so it can invalidate entries
  * in both cached and uncached pgd's; not needed for PAE since the
@@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd)
  * vmalloc faults work because attached pagetables are never freed.
  * -- wli
  */
-DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
-
 static inline void pgd_list_add(pgd_t *pgd)
 {
        struct page *page = virt_to_page(pgd);
-       page->index = (unsigned long)pgd_list;
-       if (pgd_list)
-               set_page_private(pgd_list, (unsigned long)&page->index);
-       pgd_list = page;
-       set_page_private(page, (unsigned long)&pgd_list);
+
+       list_add(&page->lru, &pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-       struct page *next, **pprev, *page = virt_to_page(pgd);
-       next = (struct page *)page->index;
-       pprev = (struct page **)page_private(page);
-       *pprev = next;
-       if (next)
-               set_page_private(next, (unsigned long)pprev);
+       struct page *page = virt_to_page(pgd);
+
+       list_del(&page->lru);
 }
 
 
@@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd)
        if (SHARED_KERNEL_PMD)
                return;
 
-       paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
@@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd)
 #define UNSHARED_PTRS_PER_PGD                          \
        (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 
-/* If we allocate a pmd for part of the kernel address space, then
-   make sure its initialized with the appropriate kernel mappings.
-   Otherwise use a cached zeroed pmd.  */
-static pmd_t *pmd_cache_alloc(int idx)
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(pgd_t *pgdp)
 {
-       pmd_t *pmd;
+       int i;
 
-       if (idx >= USER_PTRS_PER_PGD) {
-               pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+       for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+               pgd_t pgd = pgdp[i];
 
-               if (pmd)
-                       memcpy(pmd,
-                              (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+               if (pgd_val(pgd) != 0) {
+                       pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+                       pgdp[i] = native_make_pgd(0);
+
+                       paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+                       pmd_free(pmd);
+               }
+       }
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+       pud_t *pud;
+       unsigned long addr;
+       int i;
+
+       pud = pud_offset(pgd, 0);
+       for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+            i++, pud++, addr += PUD_SIZE) {
+               pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+               if (!pmd) {
+                       pgd_mop_up_pmds(pgd);
+                       return 0;
+               }
+
+               if (i >= USER_PTRS_PER_PGD)
+                       memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);
-       } else
-               pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
 
-       return pmd;
+               pud_populate(mm, pud, pmd);
+       }
+
+       return 1;
+}
+#else  /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+       return 1;
 }
 
-static void pmd_cache_free(pmd_t *pmd, int idx)
+static void pgd_mop_up_pmds(pgd_t *pgd)
 {
-       if (idx >= USER_PTRS_PER_PGD)
-               free_page((unsigned long)pmd);
-       else
-               kmem_cache_free(pmd_cache, pmd);
 }
+#endif /* CONFIG_X86_PAE */
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       int i;
        pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
 
-       if (PTRS_PER_PMD == 1 || !pgd)
-               return pgd;
+       mm->pgd = pgd;          /* so that alloc_pd can use it */
 
-       for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-               pmd_t *pmd = pmd_cache_alloc(i);
-
-               if (!pmd)
-                       goto out_oom;
-
-               paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
-               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+               quicklist_free(0, pgd_dtor, pgd);
+               pgd = NULL;
        }
-       return pgd;
 
-out_oom:
-       for (i--; i >= 0; i--) {
-               pgd_t pgdent = pgd[i];
-               void* pmd = (void *)__va(pgd_val(pgdent)-1);
-               paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-               pmd_cache_free(pmd, i);
-       }
-       quicklist_free(0, pgd_dtor, pgd);
-       return NULL;
+       return pgd;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-       int i;
-
-       /* in the PAE case user pgd entries are overwritten before usage */
-       if (PTRS_PER_PMD > 1)
-               for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-                       pgd_t pgdent = pgd[i];
-                       void* pmd = (void *)__va(pgd_val(pgdent)-1);
-                       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-                       pmd_cache_free(pmd, i);
-               }
-       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       pgd_mop_up_pmds(pgd);
        quicklist_free(0, pgd_dtor, pgd);
 }
 
@@ -372,4 +376,3 @@ void check_pgt_cache(void)
 {
        quicklist_trim(0, pgd_dtor, 25, 16);
 }
-
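
The pgd_list hunks above replace the hand-rolled chain threaded through page->index and page_private() with an ordinary struct list_head hung off page->lru. A minimal sketch of what a walker looks like with the new representation, assuming pgd_lock and the pgd_list list head are defined elsewhere in this series:

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/spinlock.h>

/* Hypothetical walker: apply fn to every pgd currently on pgd_list. */
static void example_for_each_pgd(void (*fn)(pgd_t *pgd))
{
        struct page *page;
        unsigned long flags;

        spin_lock_irqsave(&pgd_lock, flags);
        list_for_each_entry(page, &pgd_list, lru)
                fn((pgd_t *)page_address(page));
        spin_unlock_irqrestore(&pgd_lock, flags);
}
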
index ea85172fc0cc6c33fd20ec49981411ea9b73d031..65416f843e597b2908d64eee5ae321f1e29d7537 100644 (file)
@@ -130,6 +130,9 @@ void __init
 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 {
        int pxm, node;
+       int apic_id;
+
+       apic_id = pa->apic_id;
        if (srat_disabled())
                return;
        if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
                bad_srat();
                return;
        }
-       apicid_to_node[pa->apic_id] = node;
+       apicid_to_node[apic_id] = node;
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
-              pxm, pa->apic_id, node);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Protect against too large hotadd areas that would fill up memory.
- */
-static int hotadd_enough_memory(struct bootnode *nd)
-{
-       static unsigned long allocated;
-       static unsigned long last_area_end;
-       unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
-       long mem = pages * sizeof(struct page);
-       unsigned long addr;
-       unsigned long allowed;
-       unsigned long oldpages = pages;
-
-       if (mem < 0)
-               return 0;
-       allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
-       allowed = (allowed / 100) * hotadd_percent;
-       if (allocated + mem > allowed) {
-               unsigned long range;
-               /* Give them at least part of their hotadd memory upto hotadd_percent
-                  It would be better to spread the limit out
-                  over multiple hotplug areas, but that is too complicated
-                  right now */
-               if (allocated >= allowed)
-                       return 0;
-               range = allowed - allocated;
-               pages = (range / PAGE_SIZE);
-               mem = pages * sizeof(struct page);
-               nd->end = nd->start + range;
-       }
-       /* Not completely fool proof, but a good sanity check */
-       addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
-       if (addr == -1UL)
-               return 0;
-       if (pages != oldpages)
-               printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
-                       pages << PAGE_SHIFT);
-       last_area_end = addr + mem;
-       allocated += mem;
-       return 1;
-}
-
-static int update_end_of_memory(unsigned long end)
-{
-       found_add_area = 1;
-       if ((end >> PAGE_SHIFT) > end_pfn)
-               end_pfn = end >> PAGE_SHIFT;
-       return 1;
+              pxm, apic_id, node);
 }
 
-static inline int save_add_info(void)
-{
-       return hotadd_percent > 0;
-}
-#else
 int update_end_of_memory(unsigned long end) {return -1;}
 static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;}
 #else
 static inline int save_add_info(void) {return 0;}
 #endif
-#endif
 /*
  * Update nodes_add and decide whether to include the add area in the zone.
- * Both SPARSE and RESERVE need nodes_add infomation.
+ * Both SPARSE and RESERVE need nodes_add information.
  * This code supports one contiguous hot add area per node.
  */
 static int reserve_hotadd(int node, unsigned long start, unsigned long end)
@@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
        return 1;
 }
 
-static void unparse_node(int node)
+static void __init unparse_node(int node)
 {
        int i;
        node_clear(node, nodes_parsed);
@@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
        /* First clean up the node list */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cutoff_node(i, start, end);
-               if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+               /*
+                * don't confuse VM with a node that doesn't have the
+                * minimum memory.
+                */
+               if (nodes[i].end &&
+                       (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
                        unparse_node(i);
                        node_set_offline(i);
                }
@@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 
        for (i = 0; i < NR_CPUS; i++) {
-               if (cpu_to_node(i) == NUMA_NO_NODE)
+               int node = early_cpu_to_node(i);
+
+               if (node == NUMA_NO_NODE)
                        continue;
-               if (!node_isset(cpu_to_node(i), node_possible_map))
+               if (!node_isset(node, node_possible_map))
                        numa_set_node(i, NUMA_NO_NODE);
        }
        numa_init_array();
@@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 }
 
 #ifdef CONFIG_NUMA_EMU
+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
+       [0 ... MAX_NUMNODES-1] = PXM_INVAL
+};
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+       [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
 static int __init find_node_by_addr(unsigned long addr)
 {
        int ret = NUMA_NO_NODE;
@@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr)
                        break;
                }
        }
-       return i;
+       return ret;
 }
 
 /*
@@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr)
 void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
        int i, j;
-       int fake_node_to_pxm_map[MAX_NUMNODES] = {
-               [0 ... MAX_NUMNODES-1] = PXM_INVAL
-       };
-       unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
-               [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-       };
 
        printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
                         "topology.\n");
index 0ed046a187f77fb1d29228780c395f4e8d4636a1..e2095cba409f200b5b25e9b5bcdbce0ed6288492 100644 (file)
@@ -32,7 +32,7 @@ static int backtrace_stack(void *data, char *name)
        return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr)
+static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
        unsigned int *depth = data;
 
@@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = {
 };
 
 struct frame_head {
-       struct frame_head *ebp;
+       struct frame_head *bp;
        unsigned long ret;
 } __attribute__((packed));
 
@@ -67,21 +67,21 @@ dump_user_backtrace(struct frame_head * head)
 
        /* frame pointers should strictly progress back up the stack
         * (towards higher addresses) */
-       if (head >= bufhead[0].ebp)
+       if (head >= bufhead[0].bp)
                return NULL;
 
-       return bufhead[0].ebp;
+       return bufhead[0].bp;
 }
 
 void
 x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
        struct frame_head *head = (struct frame_head *)frame_pointer(regs);
-       unsigned long stack = stack_pointer(regs);
+       unsigned long stack = kernel_trap_sp(regs);
 
        if (!user_mode_vm(regs)) {
                if (depth)
-                       dump_trace(NULL, regs, (unsigned long *)stack,
+                       dump_trace(NULL, regs, (unsigned long *)stack, 0,
                                   &backtrace_ops, &depth);
                return;
        }
index c8ab79ef42761f1df1cfffc2b53a205e5a5282ce..1f11cf0a307f448e777f5a132e0de15ebc4ca7a3 100644 (file)
 #include <asm/nmi.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
+
 #include "op_counter.h"
 #include "op_x86_model.h"
 
-static struct op_x86_model_spec const * model;
+static struct op_x86_model_spec const *model;
 static struct op_msrs cpu_msrs[NR_CPUS];
 static unsigned long saved_lvtpc[NR_CPUS];
 
@@ -41,7 +41,6 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
        return 0;
 }
 
-
 static int nmi_resume(struct sys_device *dev)
 {
        if (nmi_enabled == 1)
@@ -49,29 +48,27 @@ static int nmi_resume(struct sys_device *dev)
        return 0;
 }
 
-
 static struct sysdev_class oprofile_sysclass = {
        .name           = "oprofile",
        .resume         = nmi_resume,
        .suspend        = nmi_suspend,
 };
 
-
 static struct sys_device device_oprofile = {
        .id     = 0,
        .cls    = &oprofile_sysclass,
 };
 
-
 static int __init init_sysfs(void)
 {
        int error;
-       if (!(error = sysdev_class_register(&oprofile_sysclass)))
+
+       error = sysdev_class_register(&oprofile_sysclass);
+       if (!error)
                error = sysdev_register(&device_oprofile);
        return error;
 }
 
-
 static void exit_sysfs(void)
 {
        sysdev_unregister(&device_oprofile);
@@ -90,7 +87,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
        int ret = NOTIFY_DONE;
        int cpu = smp_processor_id();
 
-       switch(val) {
+       switch (val) {
        case DIE_NMI:
                if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
                        ret = NOTIFY_STOP;
@@ -101,24 +98,24 @@ static int profile_exceptions_notify(struct notifier_block *self,
        return ret;
 }
 
-static void nmi_cpu_save_registers(struct op_msrs * msrs)
+static void nmi_cpu_save_registers(struct op_msrs *msrs)
 {
        unsigned int const nr_ctrs = model->num_counters;
-       unsigned int const nr_ctrls = model->num_controls; 
-       struct op_msr * counters = msrs->counters;
-       struct op_msr * controls = msrs->controls;
+       unsigned int const nr_ctrls = model->num_controls;
+       struct op_msr *counters = msrs->counters;
+       struct op_msr *controls = msrs->controls;
        unsigned int i;
 
        for (i = 0; i < nr_ctrs; ++i) {
-               if (counters[i].addr){
+               if (counters[i].addr) {
                        rdmsr(counters[i].addr,
                                counters[i].saved.low,
                                counters[i].saved.high);
                }
        }
+
        for (i = 0; i < nr_ctrls; ++i) {
-               if (controls[i].addr){
+               if (controls[i].addr) {
                        rdmsr(controls[i].addr,
                                controls[i].saved.low,
                                controls[i].saved.high);
@@ -126,15 +123,13 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs)
        }
 }
 
-
-static void nmi_save_registers(void * dummy)
+static void nmi_save_registers(void *dummy)
 {
        int cpu = smp_processor_id();
-       struct op_msrs * msrs = &cpu_msrs[cpu];
+       struct op_msrs *msrs = &cpu_msrs[cpu];
        nmi_cpu_save_registers(msrs);
 }
 
-
 static void free_msrs(void)
 {
        int i;
@@ -146,7 +141,6 @@ static void free_msrs(void)
        }
 }
 
-
 static int allocate_msrs(void)
 {
        int success = 1;
@@ -173,11 +167,10 @@ static int allocate_msrs(void)
        return success;
 }
 
-
-static void nmi_cpu_setup(void * dummy)
+static void nmi_cpu_setup(void *dummy)
 {
        int cpu = smp_processor_id();
-       struct op_msrs * msrs = &cpu_msrs[cpu];
+       struct op_msrs *msrs = &cpu_msrs[cpu];
        spin_lock(&oprofilefs_lock);
        model->setup_ctrs(msrs);
        spin_unlock(&oprofilefs_lock);
@@ -193,13 +186,14 @@ static struct notifier_block profile_exceptions_nb = {
 
 static int nmi_setup(void)
 {
-       int err=0;
+       int err = 0;
        int cpu;
 
        if (!allocate_msrs())
                return -ENOMEM;
 
-       if ((err = register_die_notifier(&profile_exceptions_nb))){
+       err = register_die_notifier(&profile_exceptions_nb);
+       if (err) {
                free_msrs();
                return err;
        }
@@ -210,7 +204,7 @@ static int nmi_setup(void)
 
        /* Assume saved/restored counters are the same on all CPUs */
        model->fill_in_addresses(&cpu_msrs[0]);
-       for_each_possible_cpu (cpu) {
+       for_each_possible_cpu(cpu) {
                if (cpu != 0) {
                        memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
                                sizeof(struct op_msr) * model->num_counters);
@@ -226,39 +220,37 @@ static int nmi_setup(void)
        return 0;
 }
 
-
-static void nmi_restore_registers(struct op_msrs * msrs)
+static void nmi_restore_registers(struct op_msrs *msrs)
 {
        unsigned int const nr_ctrs = model->num_counters;
-       unsigned int const nr_ctrls = model->num_controls; 
-       struct op_msr * counters = msrs->counters;
-       struct op_msr * controls = msrs->controls;
+       unsigned int const nr_ctrls = model->num_controls;
+       struct op_msr *counters = msrs->counters;
+       struct op_msr *controls = msrs->controls;
        unsigned int i;
 
        for (i = 0; i < nr_ctrls; ++i) {
-               if (controls[i].addr){
+               if (controls[i].addr) {
                        wrmsr(controls[i].addr,
                                controls[i].saved.low,
                                controls[i].saved.high);
                }
        }
+
        for (i = 0; i < nr_ctrs; ++i) {
-               if (counters[i].addr){
+               if (counters[i].addr) {
                        wrmsr(counters[i].addr,
                                counters[i].saved.low,
                                counters[i].saved.high);
                }
        }
 }
 
-static void nmi_cpu_shutdown(void * dummy)
+static void nmi_cpu_shutdown(void *dummy)
 {
        unsigned int v;
        int cpu = smp_processor_id();
-       struct op_msrs * msrs = &cpu_msrs[cpu];
+       struct op_msrs *msrs = &cpu_msrs[cpu];
+
        /* restoring APIC_LVTPC can trigger an apic error because the delivery
         * mode and vector nr combination can be illegal. That's by design: on
         * power on apic lvt contain a zero vector nr which are legal only for
@@ -271,7 +263,6 @@ static void nmi_cpu_shutdown(void * dummy)
        nmi_restore_registers(msrs);
 }
 
 static void nmi_shutdown(void)
 {
        nmi_enabled = 0;
@@ -281,45 +272,40 @@ static void nmi_shutdown(void)
        free_msrs();
 }
 
-static void nmi_cpu_start(void * dummy)
+static void nmi_cpu_start(void *dummy)
 {
-       struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
+       struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
        model->start(msrs);
 }
 
 static int nmi_start(void)
 {
        on_each_cpu(nmi_cpu_start, NULL, 0, 1);
        return 0;
 }
-static void nmi_cpu_stop(void * dummy)
+
+static void nmi_cpu_stop(void *dummy)
 {
-       struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()];
+       struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
        model->stop(msrs);
 }
+
 static void nmi_stop(void)
 {
        on_each_cpu(nmi_cpu_stop, NULL, 0, 1);
 }
 
-
 struct op_counter_config counter_config[OP_MAX_COUNTER];
 
-static int nmi_create_files(struct super_block * sb, struct dentry * root)
+static int nmi_create_files(struct super_block *sb, struct dentry *root)
 {
        unsigned int i;
 
        for (i = 0; i < model->num_counters; ++i) {
-               struct dentry * dir;
+               struct dentry *dir;
                char buf[4];
-               /* quick little hack to _not_ expose a counter if it is not
+
+               /* quick little hack to _not_ expose a counter if it is not
                 * available for use.  This should protect userspace app.
                 * NOTE:  assumes 1:1 mapping here (that counters are organized
                 *        sequentially in their struct assignment).
@@ -329,21 +315,21 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root)
 
                snprintf(buf,  sizeof(buf), "%d", i);
                dir = oprofilefs_mkdir(sb, root, buf);
-               oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); 
-               oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); 
-               oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); 
-               oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 
-               oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 
-               oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 
+               oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
+               oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
+               oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
+               oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
+               oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
+               oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
        }
 
        return 0;
 }
+
 static int p4force;
 module_param(p4force, int, 0);
-static int __init p4_init(char ** cpu_type)
+
+static int __init p4_init(char **cpu_type)
 {
        __u8 cpu_model = boot_cpu_data.x86_model;
 
@@ -356,15 +342,15 @@ static int __init p4_init(char ** cpu_type)
        return 1;
 #else
        switch (smp_num_siblings) {
-               case 1:
-                       *cpu_type = "i386/p4";
-                       model = &op_p4_spec;
-                       return 1;
-
-               case 2:
-                       *cpu_type = "i386/p4-ht";
-                       model = &op_p4_ht2_spec;
-                       return 1;
+       case 1:
+               *cpu_type = "i386/p4";
+               model = &op_p4_spec;
+               return 1;
+
+       case 2:
+               *cpu_type = "i386/p4-ht";
+               model = &op_p4_ht2_spec;
+               return 1;
        }
 #endif
 
@@ -373,8 +359,7 @@ static int __init p4_init(char ** cpu_type)
        return 0;
 }
 
-
-static int __init ppro_init(char ** cpu_type)
+static int __init ppro_init(char **cpu_type)
 {
        __u8 cpu_model = boot_cpu_data.x86_model;
 
@@ -409,52 +394,52 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
        if (!cpu_has_apic)
                return -ENODEV;
+
        switch (vendor) {
-               case X86_VENDOR_AMD:
-                       /* Needs to be at least an Athlon (or hammer in 32bit mode) */
+       case X86_VENDOR_AMD:
+               /* Needs to be at least an Athlon (or hammer in 32bit mode) */
 
-                       switch (family) {
-                       default:
+               switch (family) {
+               default:
+                       return -ENODEV;
+               case 6:
+                       model = &op_athlon_spec;
+                       cpu_type = "i386/athlon";
+                       break;
+               case 0xf:
+                       model = &op_athlon_spec;
+                       /* Actually it could be i386/hammer too, but give
+                        user space a consistent name. */
+                       cpu_type = "x86-64/hammer";
+                       break;
+               case 0x10:
+                       model = &op_athlon_spec;
+                       cpu_type = "x86-64/family10";
+                       break;
+               }
+               break;
+
+       case X86_VENDOR_INTEL:
+               switch (family) {
+                       /* Pentium IV */
+               case 0xf:
+                       if (!p4_init(&cpu_type))
                                return -ENODEV;
-                       case 6:
-                               model = &op_athlon_spec;
-                               cpu_type = "i386/athlon";
-                               break;
-                       case 0xf:
-                               model = &op_athlon_spec;
-                               /* Actually it could be i386/hammer too, but give
-                                  user space an consistent name. */
-                               cpu_type = "x86-64/hammer";
-                               break;
-                       case 0x10:
-                               model = &op_athlon_spec;
-                               cpu_type = "x86-64/family10";
-                               break;
-                       }
                        break;
-               case X86_VENDOR_INTEL:
-                       switch (family) {
-                               /* Pentium IV */
-                               case 0xf:
-                                       if (!p4_init(&cpu_type))
-                                               return -ENODEV;
-                                       break;
-
-                               /* A P6-class processor */
-                               case 6:
-                                       if (!ppro_init(&cpu_type))
-                                               return -ENODEV;
-                                       break;
-
-                               default:
-                                       return -ENODEV;
-                       }
+
+                       /* A P6-class processor */
+               case 6:
+                       if (!ppro_init(&cpu_type))
+                               return -ENODEV;
                        break;
 
                default:
                        return -ENODEV;
+               }
+               break;
+
+       default:
+               return -ENODEV;
        }
 
        init_sysfs();
@@ -469,7 +454,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
        return 0;
 }
 
-
 void op_nmi_exit(void)
 {
        if (using_nmi)
index 862746390666daa6da34942889321d6eebcbe9b2..52deabc72a6facaf7975436a7bba1cf5c99d403f 100644 (file)
@@ -109,6 +109,19 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
        }
 }
 
+static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
+{
+       struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
+
+       if (rom_r->parent)
+               return;
+       if (rom_r->start)
+               /* we deal with BIOS assigned ROM later */
+               return;
+       if (!(pci_probe & PCI_ASSIGN_ROMS))
+               rom_r->start = rom_r->end = rom_r->flags = 0;
+}
+
 /*
  *  Called after each bus is probed, but before its children
  *  are examined.
@@ -116,8 +129,12 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
 
 void __devinit  pcibios_fixup_bus(struct pci_bus *b)
 {
+       struct pci_dev *dev;
+
        pcibios_fixup_ghosts(b);
        pci_read_bridge_bases(b);
+       list_for_each_entry(dev, &b->devices, bus_list)
+               pcibios_fixup_device_resources(dev);
 }
 
 /*
index 6cff66dd0c91ac6d071701dd3f92e95a59d62da7..cb63007e20b2af4142a6cf1703987a48ee86c944 100644 (file)
@@ -19,7 +19,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 
        printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
        reg = 0xd0;
-       for(pxb=0; pxb<2; pxb++) {
+       for(pxb = 0; pxb < 2; pxb++) {
                pci_read_config_byte(d, reg++, &busno);
                pci_read_config_byte(d, reg++, &suba);
                pci_read_config_byte(d, reg++, &subb);
@@ -56,7 +56,7 @@ static void __devinit  pci_fixup_umc_ide(struct pci_dev *d)
        int i;
 
        printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d));
-       for(i=0; i<4; i++)
+       for(i = 0; i < 4; i++)
                d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
@@ -127,7 +127,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
                   NB latency to zero */
                pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
 
-               where = 0x95; /* the memory write queue timer register is 
+               where = 0x95; /* the memory write queue timer register is
                                different for the KT266x's: 0x95 not 0x55 */
        } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
                        (d->revision == VIA_8363_KL133_REVISION_ID ||
@@ -230,7 +230,7 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh
 
        if ((offset) && (where == offset))
                value = value & 0xfffffffc;
-       
+
        return raw_pci_ops->write(0, bus->number, devfn, where, size, value);
 }
 
@@ -271,8 +271,8 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
                 * after hot-remove, the pbus->devices is empty and this code
                 * will set the offsets to zero and the bus ops to parent's bus
                 * ops, which is unmodified.
-                */
-               for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
+                */
+               for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
                        quirk_aspm_offset[i] = 0;
 
                pbus->ops = pbus->parent->ops;
@@ -286,17 +286,17 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
                list_for_each_entry(dev, &pbus->devices, bus_list) {
                        /* There are 0 to 8 devices attached to this bus */
                        cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP);
-                       quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10;
+                       quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] = cap_base + 0x10;
                }
                pbus->ops = &quirk_pcie_aspm_ops;
        }
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PA,     pcie_rootport_aspm_quirk );
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PA1,    pcie_rootport_aspm_quirk );
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PB,     pcie_rootport_aspm_quirk );
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PB1,    pcie_rootport_aspm_quirk );
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PC,     pcie_rootport_aspm_quirk );
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PC1,    pcie_rootport_aspm_quirk );
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PA,     pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PA1,    pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PB,     pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PB1,    pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PC,     pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_MCH_PC1,    pcie_rootport_aspm_quirk);
 
 /*
  * Fixup to mark boot BIOS video selected by BIOS before it changes
@@ -336,8 +336,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
                 * PCI header type NORMAL.
                 */
                if (bridge
-                   &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
-                      ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
+                   && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+                      || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) {
                        pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
                                                &config);
                        if (!(config & PCI_BRIDGE_CTL_VGA))
index 88d8f5c0ecb5e2e8bb5bc05f5aff97790ddd667a..ed07ce6c171bef5cb3f869105271f1cdbc706b74 100644 (file)
@@ -200,6 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
        static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
 
+       WARN_ON_ONCE(pirq >= 16);
        return irqmap[read_config_nybble(router, 0x48, pirq-1)];
 }
 
@@ -207,7 +208,8 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
 {
        static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
        unsigned int val = irqmap[irq];
-               
+
+       WARN_ON_ONCE(pirq >= 16);
        if (val) {
                write_config_nybble(router, 0x48, pirq-1, val);
                return 1;
@@ -257,12 +259,16 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
 static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
        static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+       WARN_ON_ONCE(pirq >= 5);
        return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
 }
 
 static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
        static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+       WARN_ON_ONCE(pirq >= 5);
        write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
        return 1;
 }
@@ -275,12 +281,16 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq
 static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
        static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+       WARN_ON_ONCE(pirq >= 4);
        return read_config_nybble(router,0x43, pirqmap[pirq-1]);
 }
 
 static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
        static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+       WARN_ON_ONCE(pirq >= 4);
        write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
        return 1;
 }
@@ -419,6 +429,7 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
 
 static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
+       WARN_ON_ONCE(pirq >= 9);
        if (pirq > 8) {
                printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
                return 0;
@@ -428,6 +439,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 
 static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
+       WARN_ON_ONCE(pirq >= 9);
        if (pirq > 8) {
                printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
                return 0;
@@ -449,14 +461,14 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
  */
 static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
 {
-       outb_p(pirq, 0xc00);
+       outb(pirq, 0xc00);
        return inb(0xc01) & 0xf;
 }
 
 static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
 {
-       outb_p(pirq, 0xc00);
-       outb_p(irq, 0xc01);
+       outb(pirq, 0xc00);
+       outb(irq, 0xc01);
        return 1;
 }
 
index 998fd3ec0d68a5d252b9dc4f1e1a41dee4777a89..efcf620d1439490b7311fe4aa05f6d3ce4f7df75 100644 (file)
@@ -19,7 +19,7 @@ unsigned long saved_context_esp, saved_context_ebp;
 unsigned long saved_context_esi, saved_context_edi;
 unsigned long saved_context_eflags;
 
-void __save_processor_state(struct saved_context *ctxt)
+static void __save_processor_state(struct saved_context *ctxt)
 {
        mtrr_save_fixed_ranges(NULL);
        kernel_fpu_begin();
@@ -74,19 +74,19 @@ static void fix_processor_context(void)
        /*
         * Now maybe reload the debug registers
         */
-       if (current->thread.debugreg[7]){
-               set_debugreg(current->thread.debugreg[0], 0);
-               set_debugreg(current->thread.debugreg[1], 1);
-               set_debugreg(current->thread.debugreg[2], 2);
-               set_debugreg(current->thread.debugreg[3], 3);
+       if (current->thread.debugreg7) {
+               set_debugreg(current->thread.debugreg0, 0);
+               set_debugreg(current->thread.debugreg1, 1);
+               set_debugreg(current->thread.debugreg2, 2);
+               set_debugreg(current->thread.debugreg3, 3);
                /* no 4 and 5 */
-               set_debugreg(current->thread.debugreg[6], 6);
-               set_debugreg(current->thread.debugreg[7], 7);
+               set_debugreg(current->thread.debugreg6, 6);
+               set_debugreg(current->thread.debugreg7, 7);
        }
 
 }
 
-void __restore_processor_state(struct saved_context *ctxt)
+static void __restore_processor_state(struct saved_context *ctxt)
 {
        /*
         * control registers
index f8b69d84238eb4f7743c23b425e1a4900cb9cd7f..60274d5746e1020ff24c648c68e0bfbb70e66c59 100644
@@ -1 +1,6 @@
 vdso.lds
+vdso-syms.lds
+vdso32-syms.lds
+vdso32-syscall-syms.lds
+vdso32-sysenter-syms.lds
+vdso32-int80-syms.lds
index e7bff0fbac235715ca5e8601d510414c38e56a9c..d28dda574700949f12c6301cf7f5f85457256e8b 100644
@@ -1,39 +1,37 @@
 #
-# x86-64 vDSO.
+# Building vDSO images for x86.
 #
 
+VDSO64-$(CONFIG_X86_64)                := y
+VDSO32-$(CONFIG_X86_32)                := y
+VDSO32-$(CONFIG_COMPAT)                := y
+
+vdso-install-$(VDSO64-y)       += vdso.so
+vdso-install-$(VDSO32-y)       += $(vdso32-y:=.so)
+
+
 # files to link into the vdso
-# vdso-start.o has to be first
-vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
 
 # files to link into kernel
-obj-y := vma.o vdso.o vdso-syms.o
+obj-$(VDSO64-y)                        += vma.o vdso.o
+obj-$(VDSO32-y)                        += vdso32.o vdso32-setup.o
 
 vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
 
 $(obj)/vdso.o: $(obj)/vdso.so
 
-targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o
-
-# The DSO images are built using a special linker script.
-quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
-                         -Wl,-T,$(filter-out FORCE,$^) -o $@
+targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
 
 export CPPFLAGS_vdso.lds += -P -C
 
-vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
-                $(call ld-option, -Wl$(comma)--hash-style=sysv) \
-               -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
-SYSCFLAGS_vdso.so = $(vdso-flags)
-SYSCFLAGS_vdso.so.dbg = $(vdso-flags)
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
+                       -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
 
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
 
-$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
-
 $(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
-       $(call if_changed,syscall)
+       $(call if_changed,vdso)
 
 $(obj)/%.so: OBJCOPYFLAGS := -S
 $(obj)/%.so: $(obj)/%.so.dbg FORCE
@@ -41,24 +39,96 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 
 CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
 
-$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL)
-$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL)
+$(vobjs): KBUILD_CFLAGS = $(CFL)
+
+targets += vdso-syms.lds
+obj-$(VDSO64-y)                        += vdso-syms.lds
+
+#
+# Match symbols in the DSO that look like VDSO*; produce a file of constants.
+#
+sed-vdsosym := -e 's/^00*/0/' \
+       -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
+quiet_cmd_vdsosym = VDSOSYM $@
+      cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
+
+$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
+       $(call if_changed,vdsosym)
+
+#
+# Build multiple 32-bit vDSO images to choose from at boot time.
+#
+obj-$(VDSO32-y)                        += vdso32-syms.lds
+vdso32.so-$(CONFIG_X86_32)     += int80
+vdso32.so-$(CONFIG_COMPAT)     += syscall
+vdso32.so-$(VDSO32-y)          += sysenter
+
+CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
+
+# This makes sure the $(obj) subdirectory exists even though vdso32/
+# is not a kbuild sub-make subdirectory.
+override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
 
-# We also create a special relocatable object that should mirror the symbol
-# table and layout of the linked DSO.  With ld -R we can then refer to
-# these symbols in the kernel code rather than hand-coded addresses.
-extra-y += vdso-syms.o
-$(obj)/built-in.o: $(obj)/vdso-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+targets += vdso32/vdso32.lds
+targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so)
+targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o)
 
-SYSCFLAGS_vdso-syms.o = -r -d
-$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
-       $(call if_changed,syscall)
+extra-y        += $(vdso32.so-y:%=vdso32-%.so)
 
+$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so)
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32
+
+$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
+                                        $(obj)/vdso32/vdso32.lds \
+                                        $(obj)/vdso32/note.o \
+                                        $(obj)/vdso32/%.o
+       $(call if_changed,vdso)
+
+# Make vdso32-*-syms.lds from each image, and then make sure they match.
+# The only difference should be that some do not define VDSO32_SYSENTER_RETURN.
+
+targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds)
+
+quiet_cmd_vdso32sym = VDSOSYM $@
+define cmd_vdso32sym
+       if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \
+          $(foreach H,$(filter-out FORCE,$^),\
+                    if grep -q VDSO32_SYSENTER_RETURN $H; \
+                    then diff -u $(@D)/.tmp_$(@F) $H; \
+                    else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \
+                         diff -u - $H; fi &&) : ;\
+       then mv -f $(@D)/.tmp_$(@F) $@; \
+       else rm -f $(@D)/.tmp_$(@F); exit 1; \
+       fi
+endef
+
+$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
+       $(call if_changed,vdso32sym)
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO    $@
+      cmd_vdso = $(CC) -nostdlib -o $@ \
+                      $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+                      -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^)
+
+VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv)
+
+#
+# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
+#
 quiet_cmd_vdso_install = INSTALL $@
       cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-vdso.so:
+$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
        @mkdir -p $(MODLIB)/vdso
        $(call cmd,vdso_install)
 
-vdso_install: vdso.so
+PHONY += vdso_install $(vdso-install-y)
+vdso_install: $(vdso-install-y)
+
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
index 5b54cdfb2b07f989c34696ba1ca95c8dcd9f518d..23476c2ebfc4b38b817e5907e0c658e7d21a6715 100644
@@ -19,7 +19,6 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
-#include <asm/vgtod.h>
 #include "vextern.h"
 
 #define gtod vdso_vsyscall_gtod_data
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
new file mode 100644
index 0000000..634a2cf
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Linker script for vDSO.  This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+SECTIONS
+{
+       . = VDSO_PRELINK + SIZEOF_HEADERS;
+
+       .hash           : { *(.hash) }                  :text
+       .gnu.hash       : { *(.gnu.hash) }
+       .dynsym         : { *(.dynsym) }
+       .dynstr         : { *(.dynstr) }
+       .gnu.version    : { *(.gnu.version) }
+       .gnu.version_d  : { *(.gnu.version_d) }
+       .gnu.version_r  : { *(.gnu.version_r) }
+
+       .note           : { *(.note.*) }                :text   :note
+
+       .eh_frame_hdr   : { *(.eh_frame_hdr) }          :text   :eh_frame_hdr
+       .eh_frame       : { KEEP (*(.eh_frame)) }       :text
+
+       .dynamic        : { *(.dynamic) }               :text   :dynamic
+
+       .rodata         : { *(.rodata*) }               :text
+       .data           : {
+             *(.data*)
+             *(.sdata*)
+             *(.got.plt) *(.got)
+             *(.gnu.linkonce.d.*)
+             *(.bss*)
+             *(.dynbss*)
+             *(.gnu.linkonce.b.*)
+       }
+
+       .altinstructions        : { *(.altinstructions) }
+       .altinstr_replacement   : { *(.altinstr_replacement) }
+
+       /*
+        * Align the actual code well away from the non-instruction data.
+        * This is the best thing for the I-cache.
+        */
+       . = ALIGN(0x100);
+
+       .text           : { *(.text*) }                 :text   =0x90909090
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME        0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+       text            PT_LOAD         FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+       dynamic         PT_DYNAMIC      FLAGS(4);               /* PF_R */
+       note            PT_NOTE         FLAGS(4);               /* PF_R */
+       eh_frame_hdr    PT_GNU_EH_FRAME;
+}
diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S
deleted file mode 100644
index 2dc2cdb..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-       .globl vdso_kernel_start
-vdso_kernel_start:
index 667d3245d972753f6b8e64480e6935706511eee7..4e5dd3b4de7f64428db97c5ef7ef9c29ceecfbf6 100644
@@ -1,79 +1,37 @@
 /*
- * Linker script for vsyscall DSO.  The vsyscall page is an ELF shared
- * object prelinked to its virtual address, and with only one read-only
- * segment (that fits in one page).  This script controls its layout.
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ * Here we only choose the prelinked virtual address.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.  We can define local symbols here called VDSO* to make their
+ * values visible using the asm-x86/vdso.h macros from the kernel proper.
  */
-#include <asm/asm-offsets.h>
-#include "voffset.h"
 
 #define VDSO_PRELINK 0xffffffffff700000
-
-SECTIONS
-{
-  . = VDSO_PRELINK + SIZEOF_HEADERS;
-
-  .hash           : { *(.hash) }               :text
-  .gnu.hash       : { *(.gnu.hash) }
-  .dynsym         : { *(.dynsym) }
-  .dynstr         : { *(.dynstr) }
-  .gnu.version    : { *(.gnu.version) }
-  .gnu.version_d  : { *(.gnu.version_d) }
-  .gnu.version_r  : { *(.gnu.version_r) }
-
-  /* This linker script is used both with -r and with -shared.
-     For the layouts to match, we need to skip more than enough
-     space for the dynamic symbol table et al.  If this amount
-     is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
-
-  .text           : { *(.text*) }              :text
-  .rodata         : { *(.rodata*) }            :text
-  .data                  : {
-       *(.data*)
-       *(.sdata*)
-       *(.bss*)
-       *(.dynbss*)
-  }                                            :text
-
-  .altinstructions : { *(.altinstructions) }           :text
-  .altinstr_replacement  : { *(.altinstr_replacement) }        :text
-
-  .note                  : { *(.note.*) }              :text :note
-  .eh_frame_hdr   : { *(.eh_frame_hdr) }       :text :eh_frame_hdr
-  .eh_frame       : { KEEP (*(.eh_frame)) }    :text
-  .dynamic        : { *(.dynamic) }            :text :dynamic
-  .useless        : {
-       *(.got.plt) *(.got)
-       *(.gnu.linkonce.d.*)
-       *(.gnu.linkonce.b.*)
-  }                                            :text
-}
+#include "vdso-layout.lds.S"
 
 /*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ * This controls what userland symbols we export from the vDSO.
  */
-PHDRS
-{
-  text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
-  dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
-  note PT_NOTE FLAGS(4); /* PF_R */
-  eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+VERSION {
+       LINUX_2.6 {
+       global:
+               clock_gettime;
+               __vdso_clock_gettime;
+               gettimeofday;
+               __vdso_gettimeofday;
+               getcpu;
+               __vdso_getcpu;
+       local: *;
+       };
 }
 
+VDSO64_PRELINK = VDSO_PRELINK;
+
 /*
- * This controls what symbols we export from the DSO.
+ * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
  */
-VERSION
-{
-  LINUX_2.6 {
-    global:
-       clock_gettime;
-       __vdso_clock_gettime;
-       gettimeofday;
-       __vdso_gettimeofday;
-       getcpu;
-       __vdso_getcpu;
-    local: *;
-  };
-}
+#define VEXTERN(x)     VDSO64_ ## x = vdso_ ## x;
+#include "vextern.h"
+#undef VEXTERN
similarity index 66%
rename from arch/x86/kernel/sysenter_32.c
rename to arch/x86/vdso/vdso32-setup.c
index 5a2d951e26088e59263e6dc3df5223e7e9db7ad3..348f1341e1c8f4693c5615f9045c49244cc555bf 100644
@@ -23,6 +23,8 @@
 #include <asm/unistd.h>
 #include <asm/elf.h>
 #include <asm/tlbflush.h>
+#include <asm/vdso.h>
+#include <asm/proto.h>
 
 enum {
        VDSO_DISABLED = 0,
@@ -36,14 +38,24 @@ enum {
 #define VDSO_DEFAULT   VDSO_ENABLED
 #endif
 
+#ifdef CONFIG_X86_64
+#define vdso_enabled                   sysctl_vsyscall32
+#define arch_setup_additional_pages    syscall32_setup_pages
+#endif
+
+/*
+ * This is the difference between the prelinked addresses in the vDSO images
+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
+ * in the user address space.
+ */
+#define VDSO_ADDR_ADJUST       (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
+
 /*
  * Should the kernel map a VDSO page into processes and pass its
  * address down to glibc upon exec()?
  */
 unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
 
-EXPORT_SYMBOL_GPL(vdso_enabled);
-
 static int __init vdso_setup(char *s)
 {
        vdso_enabled = simple_strtoul(s, NULL, 0);
@@ -51,9 +63,18 @@ static int __init vdso_setup(char *s)
        return 1;
 }
 
-__setup("vdso=", vdso_setup);
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
 
-extern asmlinkage void sysenter_entry(void);
+EXPORT_SYMBOL_GPL(vdso_enabled);
+#endif
 
 static __init void reloc_symtab(Elf32_Ehdr *ehdr,
                                unsigned offset, unsigned size)
@@ -78,7 +99,7 @@ static __init void reloc_symtab(Elf32_Ehdr *ehdr,
                case STT_FUNC:
                case STT_SECTION:
                case STT_FILE:
-                       sym->st_value += VDSO_HIGH_BASE;
+                       sym->st_value += VDSO_ADDR_ADJUST;
                }
        }
 }
@@ -104,7 +125,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
                case DT_VERNEED:
                case DT_ADDRRNGLO ... DT_ADDRRNGHI:
                        /* definitely pointers needing relocation */
-                       dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+                       dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
                        break;
 
                case DT_ENCODING ... OLD_DT_LOOS-1:
@@ -113,7 +134,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
                           they're even */
                        if (dyn->d_tag >= DT_ENCODING &&
                            (dyn->d_tag & 1) == 0)
-                               dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+                               dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
                        break;
 
                case DT_VERDEFNUM:
@@ -142,15 +163,15 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
        int i;
 
        BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
-              !elf_check_arch(ehdr) ||
+              !elf_check_arch_ia32(ehdr) ||
               ehdr->e_type != ET_DYN);
 
-       ehdr->e_entry += VDSO_HIGH_BASE;
+       ehdr->e_entry += VDSO_ADDR_ADJUST;
 
        /* rebase phdrs */
        phdr = (void *)ehdr + ehdr->e_phoff;
        for (i = 0; i < ehdr->e_phnum; i++) {
-               phdr[i].p_vaddr += VDSO_HIGH_BASE;
+               phdr[i].p_vaddr += VDSO_ADDR_ADJUST;
 
                /* relocate dynamic stuff */
                if (phdr[i].p_type == PT_DYNAMIC)
@@ -163,7 +184,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
                if (!(shdr[i].sh_flags & SHF_ALLOC))
                        continue;
 
-               shdr[i].sh_addr += VDSO_HIGH_BASE;
+               shdr[i].sh_addr += VDSO_ADDR_ADJUST;
 
                if (shdr[i].sh_type == SHT_SYMTAB ||
                    shdr[i].sh_type == SHT_DYNSYM)
@@ -172,6 +193,45 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
        }
 }
 
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_default_start, vdso32_default_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+static struct page *vdso32_pages[1];
+
+#ifdef CONFIG_X86_64
+
+static int use_sysenter __read_mostly = -1;
+
+#define        vdso32_sysenter()       (use_sysenter > 0)
+
+/* May not be __init: called during resume */
+void syscall32_cpu_init(void)
+{
+       if (use_sysenter < 0)
+               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+
+       /* Load these always in case some future AMD CPU supports
+          SYSENTER from compat mode too. */
+       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
+       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+       wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
+
+#define compat_uses_vma                1
+
+static inline void map_compat_vdso(int map)
+{
+}
+
+#else  /* CONFIG_X86_32 */
+
+#define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
+
 void enable_sep_cpu(void)
 {
        int cpu = get_cpu();
@@ -183,10 +243,10 @@ void enable_sep_cpu(void)
        }
 
        tss->x86_tss.ss1 = __KERNEL_CS;
-       tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+       tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
        wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-       wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
-       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
+       wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+       wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
        put_cpu();      
 }
 
@@ -209,13 +269,7 @@ static int __init gate_vma_init(void)
        return 0;
 }
 
-/*
- * These symbols are defined by vsyscall.o to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vsyscall_int80_start, vsyscall_int80_end;
-extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
-static struct page *syscall_pages[1];
+#define compat_uses_vma                0
 
 static void map_compat_vdso(int map)
 {
@@ -226,31 +280,35 @@ static void map_compat_vdso(int map)
 
        vdso_mapped = map;
 
-       __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
+       __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT,
                     map ? PAGE_READONLY_EXEC : PAGE_NONE);
 
        /* flush stray tlbs */
        flush_tlb_all();
 }
 
+#endif /* CONFIG_X86_64 */
+
 int __init sysenter_setup(void)
 {
        void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
        const void *vsyscall;
        size_t vsyscall_len;
 
-       syscall_pages[0] = virt_to_page(syscall_page);
+       vdso32_pages[0] = virt_to_page(syscall_page);
 
+#ifdef CONFIG_X86_32
        gate_vma_init();
 
        printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#endif
 
-       if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               vsyscall = &vsyscall_int80_start;
-               vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
+       if (!vdso32_sysenter()) {
+               vsyscall = &vdso32_default_start;
+               vsyscall_len = &vdso32_default_end - &vdso32_default_start;
        } else {
-               vsyscall = &vsyscall_sysenter_start;
-               vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
+               vsyscall = &vdso32_sysenter_start;
+               vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
        }
 
        memcpy(syscall_page, vsyscall, vsyscall_len);
@@ -259,9 +317,6 @@ int __init sysenter_setup(void)
        return 0;
 }
 
-/* Defined in vsyscall-sysenter.S */
-extern void SYSENTER_RETURN;
-
 /* Setup a VMA at program startup for the vsyscall page */
 int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 {
@@ -286,7 +341,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
                        ret = addr;
                        goto up_fail;
                }
+       }
 
+       if (compat_uses_vma || !compat) {
                /*
                 * MAYWRITE to allow gdb to COW and set breakpoints
                 *
@@ -300,7 +357,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
                                              VM_READ|VM_EXEC|
                                              VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
                                              VM_ALWAYSDUMP,
-                                             syscall_pages);
+                                             vdso32_pages);
 
                if (ret)
                        goto up_fail;
@@ -308,7 +365,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 
        current->mm->context.vdso = (void *)addr;
        current_thread_info()->sysenter_return =
-               (void *)VDSO_SYM(&SYSENTER_RETURN);
+               VDSO32_SYMBOL(addr, SYSENTER_RETURN);
 
   up_fail:
        up_write(&mm->mmap_sem);
@@ -316,6 +373,45 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
        return ret;
 }
 
+#ifdef CONFIG_X86_64
+
+__initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static ctl_table abi_table2[] = {
+       {
+               .procname       = "vsyscall32",
+               .data           = &sysctl_vsyscall32,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
+       {}
+};
+
+static ctl_table abi_root_table2[] = {
+       {
+               .ctl_name = CTL_ABI,
+               .procname = "abi",
+               .mode = 0555,
+               .child = abi_table2
+       },
+       {}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+       register_sysctl_table(abi_root_table2);
+       return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
+
+#else  /* CONFIG_X86_32 */
+
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
        if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
@@ -344,3 +440,5 @@ int in_gate_area_no_task(unsigned long addr)
 {
        return 0;
 }
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
new file mode 100644 (file)
index 0000000..1e36f72
--- /dev/null
@@ -0,0 +1,19 @@
+#include <linux/init.h>
+
+__INITDATA
+
+       .globl vdso32_default_start, vdso32_default_end
+vdso32_default_start:
+#ifdef CONFIG_X86_32
+       .incbin "arch/x86/vdso/vdso32-int80.so"
+#else
+       .incbin "arch/x86/vdso/vdso32-syscall.so"
+#endif
+vdso32_default_end:
+
+       .globl vdso32_sysenter_start, vdso32_sysenter_end
+vdso32_sysenter_start:
+       .incbin "arch/x86/vdso/vdso32-sysenter.so"
+vdso32_sysenter_end:
+
+__FINIT
diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore
new file mode 100644
index 0000000..e45fba9
--- /dev/null
@@ -0,0 +1 @@
+vdso32.lds
similarity index 71%
rename from arch/x86/kernel/vsyscall-int80_32.S
rename to arch/x86/vdso/vdso32/int80.S
index 103cab6aa7c031141042a0e131a1f04d33d9bcd7..b15b7c01aedbeab4d832b462bbbfe73a341f4fe3 100644
@@ -1,15 +1,15 @@
 /*
- * Code for the vsyscall page.  This version uses the old int $0x80 method.
+ * Code for the vDSO.  This version uses the old int $0x80 method.
  *
- * NOTE:
- * 1) __kernel_vsyscall _must_ be first in this page.
- * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
- *    for details.
+ * First get the common code for the sigreturn entry points.
+ * This must come first.
  */
+#include "sigreturn.S"
 
        .text
        .globl __kernel_vsyscall
        .type __kernel_vsyscall,@function
+       ALIGN
 __kernel_vsyscall:
 .LSTART_vsyscall:
        int $0x80
@@ -47,7 +47,10 @@ __kernel_vsyscall:
 .LENDFDEDLSI:
        .previous
 
-/*
- * Get the common code for the sigreturn entry points.
- */
-#include "vsyscall-sigreturn_32.S"
+       /*
+        * Pad out the segment to match the size of the sysenter.S version.
+        */
+VDSO32_vsyscall_eh_frame_size = 0x40
+       .section .data,"aw",@progbits
+       .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
+       .previous
similarity index 92%
rename from arch/x86/kernel/vsyscall-note_32.S
rename to arch/x86/vdso/vdso32/note.S
index fcf376a37f79c142bb35216234e56e15adf0b41c..c83f257346966dc170f953bba1399f94ffc20742 100644
@@ -33,12 +33,11 @@ ELFNOTE_END
  * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
  */
 
-#include "../../x86/xen/vdso.h"        /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
+#include "../../xen/vdso.h"    /* Defines VDSO_NOTE_NONEGSEG_BIT.  */
 
-       .globl VDSO_NOTE_MASK
 ELFNOTE_START(GNU, 2, "a")
        .long 1                 /* ncaps */
-VDSO_NOTE_MASK:
+VDSO32_NOTE_MASK:              /* Symbol used by arch/x86/xen/setup.c */
        .long 0                 /* mask */
        .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
 ELFNOTE_END
similarity index 65%
rename from arch/x86/kernel/vsyscall-sigreturn_32.S
rename to arch/x86/vdso/vdso32/sigreturn.S
index a92262f4165993fbd76266ccdc798bb7c7d60355..31776d0efc8c40fa0aa6989731b1ddf23229029c 100644
@@ -1,41 +1,42 @@
 /*
- * Common code for the sigreturn entry points on the vsyscall page.
+ * Common code for the sigreturn entry points in vDSO images.
  * So far this code is the same for both int80 and sysenter versions.
- * This file is #include'd by vsyscall-*.S to define them after the
- * vsyscall entry point.  The kernel assumes that the addresses of these
- * routines are constant for all vsyscall implementations.
+ * This file is #include'd by int80.S et al to define them first thing.
+ * The kernel assumes that the addresses of these routines are constant
+ * for all vDSO implementations.
  */
 
-#include <asm/unistd.h>
+#include <linux/linkage.h>
+#include <asm/unistd_32.h>
 #include <asm/asm-offsets.h>
 
-
-/* XXX
-   Should these be named "_sigtramp" or something?
-*/
+#ifndef SYSCALL_ENTER_KERNEL
+#define        SYSCALL_ENTER_KERNEL    int $0x80
+#endif
 
        .text
-       .org __kernel_vsyscall+32,0x90
        .globl __kernel_sigreturn
        .type __kernel_sigreturn,@function
+       ALIGN
 __kernel_sigreturn:
 .LSTART_sigreturn:
        popl %eax               /* XXX does this mean it needs unwind info? */
        movl $__NR_sigreturn, %eax
-       int $0x80
+       SYSCALL_ENTER_KERNEL
 .LEND_sigreturn:
+       nop
        .size __kernel_sigreturn,.-.LSTART_sigreturn
 
-       .balign 32
        .globl __kernel_rt_sigreturn
        .type __kernel_rt_sigreturn,@function
+       ALIGN
 __kernel_rt_sigreturn:
 .LSTART_rt_sigreturn:
        movl $__NR_rt_sigreturn, %eax
-       int $0x80
+       SYSCALL_ENTER_KERNEL
 .LEND_rt_sigreturn:
+       nop
        .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
-       .balign 32
        .previous
 
        .section .eh_frame,"a",@progbits
@@ -70,9 +71,9 @@ __kernel_rt_sigreturn:
           be the value of the stack pointer in the caller.  This means
           that we must define the CFA of this body of code to be the
           saved value of the stack pointer in the sigcontext.  Which
-          also means that there is no fixed relation to the other 
+          also means that there is no fixed relation to the other
           saved registers, which means that we must use DW_CFA_expression
-          to compute their addresses.  It also means that when we 
+          to compute their addresses.  It also means that when we
           adjust the stack with the popl, we have to do it all over again.  */
 
 #define do_cfa_expr(offset)                                            \
@@ -91,27 +92,27 @@ __kernel_rt_sigreturn:
        .sleb128 offset;                /*       offset */              \
 1:
 
-       do_cfa_expr(SIGCONTEXT_esp+4)
-       do_expr(0, SIGCONTEXT_eax+4)
-       do_expr(1, SIGCONTEXT_ecx+4)
-       do_expr(2, SIGCONTEXT_edx+4)
-       do_expr(3, SIGCONTEXT_ebx+4)
-       do_expr(5, SIGCONTEXT_ebp+4)
-       do_expr(6, SIGCONTEXT_esi+4)
-       do_expr(7, SIGCONTEXT_edi+4)
-       do_expr(8, SIGCONTEXT_eip+4)
+       do_cfa_expr(IA32_SIGCONTEXT_sp+4)
+       do_expr(0, IA32_SIGCONTEXT_ax+4)
+       do_expr(1, IA32_SIGCONTEXT_cx+4)
+       do_expr(2, IA32_SIGCONTEXT_dx+4)
+       do_expr(3, IA32_SIGCONTEXT_bx+4)
+       do_expr(5, IA32_SIGCONTEXT_bp+4)
+       do_expr(6, IA32_SIGCONTEXT_si+4)
+       do_expr(7, IA32_SIGCONTEXT_di+4)
+       do_expr(8, IA32_SIGCONTEXT_ip+4)
 
        .byte 0x42      /* DW_CFA_advance_loc 2 -- nop; popl eax. */
 
-       do_cfa_expr(SIGCONTEXT_esp)
-       do_expr(0, SIGCONTEXT_eax)
-       do_expr(1, SIGCONTEXT_ecx)
-       do_expr(2, SIGCONTEXT_edx)
-       do_expr(3, SIGCONTEXT_ebx)
-       do_expr(5, SIGCONTEXT_ebp)
-       do_expr(6, SIGCONTEXT_esi)
-       do_expr(7, SIGCONTEXT_edi)
-       do_expr(8, SIGCONTEXT_eip)
+       do_cfa_expr(IA32_SIGCONTEXT_sp)
+       do_expr(0, IA32_SIGCONTEXT_ax)
+       do_expr(1, IA32_SIGCONTEXT_cx)
+       do_expr(2, IA32_SIGCONTEXT_dx)
+       do_expr(3, IA32_SIGCONTEXT_bx)
+       do_expr(5, IA32_SIGCONTEXT_bp)
+       do_expr(6, IA32_SIGCONTEXT_si)
+       do_expr(7, IA32_SIGCONTEXT_di)
+       do_expr(8, IA32_SIGCONTEXT_ip)
 
        .align 4
 .LENDFDEDLSI1:
@@ -128,15 +129,15 @@ __kernel_rt_sigreturn:
           slightly less complicated than the above, since we don't
           modify the stack pointer in the process.  */
 
-       do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
-       do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
-       do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
-       do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
-       do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
-       do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
-       do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
-       do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
-       do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
+       do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
+       do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
+       do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
+       do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
+       do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
+       do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
+       do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
+       do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
+       do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
 
        .align 4
 .LENDFDEDLSI2:
similarity index 79%
rename from arch/x86/ia32/vsyscall-syscall.S
rename to arch/x86/vdso/vdso32/syscall.S
index cf9ef678de3e3c68a9266bc5ab975e867b14655e..5415b5613d5545b68dc88ee08de03f1061c48f8c 100644
@@ -1,16 +1,18 @@
 /*
- * Code for the vsyscall page.  This version uses the syscall instruction.
+ * Code for the vDSO.  This version uses the syscall instruction.
+ *
+ * First get the common code for the sigreturn entry points.
+ * This must come first.
  */
+#define SYSCALL_ENTER_KERNEL   syscall
+#include "sigreturn.S"
 
-#include <asm/ia32_unistd.h>
-#include <asm/asm-offsets.h>
 #include <asm/segment.h>
 
-       .code32
        .text
-       .section .text.vsyscall,"ax"
        .globl __kernel_vsyscall
        .type __kernel_vsyscall,@function
+       ALIGN
 __kernel_vsyscall:
 .LSTART_vsyscall:
        push    %ebp
@@ -64,6 +66,12 @@ __kernel_vsyscall:
        .uleb128 4
        .align 4
 .LENDFDE1:
+       .previous
 
-#define SYSCALL_ENTER_KERNEL   syscall
-#include "vsyscall-sigreturn.S"
+       /*
+        * Pad out the segment to match the size of the sysenter.S version.
+        */
+VDSO32_vsyscall_eh_frame_size = 0x40
+       .section .data,"aw",@progbits
+       .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
+       .previous
similarity index 76%
rename from arch/x86/kernel/vsyscall-sysenter_32.S
rename to arch/x86/vdso/vdso32/sysenter.S
index ed879bf42995aa6a51ad955c2feff15d7644b28f..e2800affa754d66d8ac3533f25834f0412ec9aee 100644
@@ -1,11 +1,10 @@
 /*
- * Code for the vsyscall page.  This version uses the sysenter instruction.
+ * Code for the vDSO.  This version uses the sysenter instruction.
  *
- * NOTE:
- * 1) __kernel_vsyscall _must_ be first in this page.
- * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
- *    for details.
+ * First get the common code for the sigreturn entry points.
+ * This must come first.
  */
+#include "sigreturn.S"
 
 /*
  * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
  * arg6 from the stack.
  *
  * You can not use this vsyscall for the clone() syscall because the
- * three dwords on the parent stack do not get copied to the child.
+ * three words on the parent stack do not get copied to the child.
  */
        .text
        .globl __kernel_vsyscall
        .type __kernel_vsyscall,@function
+       ALIGN
 __kernel_vsyscall:
 .LSTART_vsyscall:
        push %ecx
@@ -45,8 +45,7 @@ __kernel_vsyscall:
        /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
        jmp .Lenter_kernel
        /* 16: System call normal return point is here! */
-       .globl SYSENTER_RETURN  /* Symbol used by sysenter.c  */
-SYSENTER_RETURN:
+VDSO32_SYSENTER_RETURN:        /* Symbol used by sysenter.c via vdso32-syms.h */
        pop %ebp
 .Lpop_ebp:
        pop %edx
@@ -85,38 +84,33 @@ SYSENTER_RETURN:
        .uleb128 0
        /* What follows are the instructions for the table generation.
           We have to record all changes of the stack pointer.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_ecx-.LSTART_vsyscall
+       .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpush_edx-.Lpush_ecx
+       .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x0c              /* RA at offset 12 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lenter_kernel-.Lpush_edx
+       .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x10              /* RA at offset 16 now */
        .byte 0x85, 0x04        /* DW_CFA_offset %ebp -16 */
        /* Finally the epilogue.  */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ebp-.Lenter_kernel
+       .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x0c              /* RA at offset 12 now */
        .byte 0xc5              /* DW_CFA_restore %ebp */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_edx-.Lpop_ebp
+       .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x08              /* RA at offset 8 now */
-       .byte 0x04              /* DW_CFA_advance_loc4 */
-       .long .Lpop_ecx-.Lpop_edx
+       .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
        .byte 0x0e              /* DW_CFA_def_cfa_offset */
        .byte 0x04              /* RA at offset 4 now */
        .align 4
 .LENDFDEDLSI:
        .previous
 
-/*
- * Get the common code for the sigreturn entry points.
- */
-#include "vsyscall-sigreturn_32.S"
+       /*
+        * Emit a symbol with the size of this .eh_frame data,
+        * to verify it matches the other versions.
+        */
+VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 0000000..976124b
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Linker script for 32-bit vDSO.
+ * We #include the file to define the layout details.
+ * Here we only choose the prelinked virtual address.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.  We can define local symbols here called VDSO* to make their
+ * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ */
+
+#define VDSO_PRELINK 0
+#include "../vdso-layout.lds.S"
+
+/* The ELF entry point can be used to set the AT_SYSINFO value.  */
+ENTRY(__kernel_vsyscall);
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+       LINUX_2.5 {
+       global:
+               __kernel_vsyscall;
+               __kernel_sigreturn;
+               __kernel_rt_sigreturn;
+       local: *;
+       };
+}
+
+/*
+ * Symbols we define here called VDSO* get their values into vdso32-syms.h.
+ */
+VDSO32_PRELINK         = VDSO_PRELINK;
+VDSO32_vsyscall                = __kernel_vsyscall;
+VDSO32_sigreturn       = __kernel_sigreturn;
+VDSO32_rt_sigreturn    = __kernel_rt_sigreturn;
index 3b1ae1abfba9abc1f00332ccb6c7af0805a8e403..c8097f17f8a978a5d956d74bbd8e6150fefb9ddb 100644
 
 long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
-       unsigned int dummy, p;
+       unsigned int p;
 
        if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
                /* Load per CPU data from RDTSCP */
-               rdtscp(dummy, dummy, p);
+               native_read_tscp(&p);
        } else {
                /* Load per CPU data from GDT */
                asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
index ff9333e5fb083af76a77a5707b1a0661ae76e1ff..3fdd51497a838b801cc69dbd8408ea849da75546 100644
 #include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
-#include "voffset.h"
+#include <asm/vdso.h>
 
-int vdso_enabled = 1;
-
-#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
-#include "vextern.h"
+#include "vextern.h"           /* Just for VMAGIC.  */
 #undef VEXTERN
 
-extern char vdso_kernel_start[], vdso_start[], vdso_end[];
+int vdso_enabled = 1;
+
+extern char vdso_start[], vdso_end[];
 extern unsigned short vdso_sync_cpuid;
 
 struct page **vdso_pages;
 
-static inline void *var_ref(void *vbase, char *var, char *name)
+static inline void *var_ref(void *p, char *name)
 {
-       unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
-       void *p = vbase + offset;
        if (*(void **)p != (void *)VMAGIC) {
                printk("VDSO: variable %s broken\n", name);
                vdso_enabled = 0;
@@ -62,9 +59,8 @@ static int __init init_vdso_vars(void)
                vdso_enabled = 0;
        }
 
-#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
 #define VEXTERN(x) \
-       V(vdso_ ## x) = &__ ## x;
+       *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
 #include "vextern.h"
 #undef VEXTERN
        return 0;
diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h
deleted file mode 100644
index 4af67c7..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#define VDSO_TEXT_OFFSET 0x600
index fbfa55ce0d5566fbb7634d695fe9709ab63fbe07..4d5f2649bee4e9e5dc0dd7f9cc0dab20c31c7d0d 100644
@@ -5,6 +5,7 @@
 config XEN
        bool "Xen guest support"
        select PARAVIRT
+       depends on X86_32
        depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
        help
          This is the Linux Xen port.  Enabling this will allow the
index 79ad1525215016aeafa3526daa8834d453950d40..de647bc6e74db4e8bf42169d5ae9c5f37a3ad4a5 100644
@@ -141,8 +141,8 @@ static void __init xen_banner(void)
        printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
 }
 
-static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
-                     unsigned int *ecx, unsigned int *edx)
+static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+                     unsigned int *cx, unsigned int *dx)
 {
        unsigned maskedx = ~0;
 
@@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx,
         * Mask out inconvenient features, to try and disable as many
         * unsupported kernel subsystems as possible.
         */
-       if (*eax == 1)
+       if (*ax == 1)
                maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
                            (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
                            (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
        asm(XEN_EMULATE_PREFIX "cpuid"
-               : "=a" (*eax),
-                 "=b" (*ebx),
-                 "=c" (*ecx),
-                 "=d" (*edx)
-               : "0" (*eax), "2" (*ecx));
-       *edx &= maskedx;
+               : "=a" (*ax),
+                 "=b" (*bx),
+                 "=c" (*cx),
+                 "=d" (*dx)
+               : "0" (*ax), "2" (*cx));
+       *dx &= maskedx;
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
@@ -275,19 +275,12 @@ static unsigned long xen_store_tr(void)
 
 static void xen_set_ldt(const void *addr, unsigned entries)
 {
-       unsigned long linear_addr = (unsigned long)addr;
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
        op = mcs.args;
        op->cmd = MMUEXT_SET_LDT;
-       if (linear_addr) {
-               /* ldt my be vmalloced, use arbitrary_virt_to_machine */
-               xmaddr_t maddr;
-               maddr = arbitrary_virt_to_machine((unsigned long)addr);
-               linear_addr = (unsigned long)maddr.maddr;
-       }
-       op->arg1.linear_addr = linear_addr;
+       op->arg1.linear_addr = (unsigned long)addr;
        op->arg2.nr_ents = entries;
 
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
@@ -295,7 +288,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
-static void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+static void xen_load_gdt(const struct desc_ptr *dtr)
 {
        unsigned long *frames;
        unsigned long va = dtr->address;
@@ -357,11 +350,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 }
 
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
-                               u32 low, u32 high)
+                               const void *ptr)
 {
        unsigned long lp = (unsigned long)&dt[entrynum];
        xmaddr_t mach_lp = virt_to_machine(lp);
-       u64 entry = (u64)high << 32 | low;
+       u64 entry = *(u64 *)ptr;
 
        preempt_disable();
 
@@ -395,12 +388,11 @@ static int cvt_gate_to_trap(int vector, u32 low, u32 high,
 }
 
 /* Locations of each CPU's IDT */
-static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
 
 /* Set an IDT entry.  If the entry is part of the current IDT, then
    also update Xen. */
-static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
-                               u32 low, u32 high)
+static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 {
        unsigned long p = (unsigned long)&dt[entrynum];
        unsigned long start, end;
@@ -412,14 +404,15 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 
        xen_mc_flush();
 
-       write_dt_entry(dt, entrynum, low, high);
+       native_write_idt_entry(dt, entrynum, g);
 
        if (p >= start && (p + 8) <= end) {
                struct trap_info info[2];
+               u32 *desc = (u32 *)g;
 
                info[1].address = 0;
 
-               if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+               if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
                        if (HYPERVISOR_set_trap_table(info))
                                BUG();
        }
@@ -427,7 +420,7 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
        preempt_enable();
 }
 
-static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+static void xen_convert_trap_info(const struct desc_ptr *desc,
                                  struct trap_info *traps)
 {
        unsigned in, out, count;
@@ -446,7 +439,7 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
 
 void xen_copy_trap_info(struct trap_info *traps)
 {
-       const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+       const struct desc_ptr *desc = &__get_cpu_var(idt_desc);
 
        xen_convert_trap_info(desc, traps);
 }
@@ -454,7 +447,7 @@ void xen_copy_trap_info(struct trap_info *traps)
 /* Load a new IDT into Xen.  In principle this can be per-CPU, so we
    hold a spinlock to protect the static traps[] array (static because
    it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
+static void xen_load_idt(const struct desc_ptr *desc)
 {
        static DEFINE_SPINLOCK(lock);
        static struct trap_info traps[257];
@@ -475,22 +468,21 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 /* Write a GDT descriptor entry.  Ignore LDT descriptors, since
    they're handled differently. */
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
-                               u32 low, u32 high)
+                               const void *desc, int type)
 {
        preempt_disable();
 
-       switch ((high >> 8) & 0xff) {
-       case DESCTYPE_LDT:
-       case DESCTYPE_TSS:
+       switch (type) {
+       case DESC_LDT:
+       case DESC_TSS:
                /* ignore */
                break;
 
        default: {
                xmaddr_t maddr = virt_to_machine(&dt[entry]);
-               u64 desc = (u64)high << 32 | low;
 
                xen_mc_flush();
-               if (HYPERVISOR_update_descriptor(maddr.maddr, desc))
+               if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
                        BUG();
        }
 
@@ -499,11 +491,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
        preempt_enable();
 }
 
-static void xen_load_esp0(struct tss_struct *tss,
+static void xen_load_sp0(struct tss_struct *tss,
                          struct thread_struct *thread)
 {
        struct multicall_space mcs = xen_mc_entry(0);
-       MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+       MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
@@ -521,12 +513,12 @@ static void xen_io_delay(void)
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_apic_read(unsigned long reg)
+static u32 xen_apic_read(unsigned long reg)
 {
        return 0;
 }
 
-static void xen_apic_write(unsigned long reg, unsigned long val)
+static void xen_apic_write(unsigned long reg, u32 val)
 {
        /* Warn to see if there's any stray references */
        WARN_ON(1);
@@ -666,6 +658,13 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+/* Early release_pt assumes that all pts are pinned, since there's
+   only init_mm and anything attached to that is pinned. */
+static void xen_release_pt_init(u32 pfn)
+{
+       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
 static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
 {
        struct mmuext_op op;
@@ -677,7 +676,7 @@ static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
 
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
 {
        struct page *page = pfn_to_page(pfn);
 
@@ -686,7 +685,7 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 
                if (!PageHighMem(page)) {
                        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-                       pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+                       pin_pagetable_pfn(level, pfn);
                } else
                        /* make sure there are no stray mappings of
                           this page */
@@ -694,6 +693,16 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
        }
 }
 
+static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+{
+       xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE);
+}
+
+static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
+{
+       xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE);
+}
+
 /* This should never happen until we're OK to use struct page */
 static void xen_release_pt(u32 pfn)
 {
@@ -796,6 +805,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
        /* This will work as long as patching hasn't happened yet
           (which it hasn't) */
        pv_mmu_ops.alloc_pt = xen_alloc_pt;
+       pv_mmu_ops.alloc_pd = xen_alloc_pd;
+       pv_mmu_ops.release_pt = xen_release_pt;
+       pv_mmu_ops.release_pd = xen_release_pt;
        pv_mmu_ops.set_pte = xen_set_pte;
 
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -953,7 +965,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .read_pmc = native_read_pmc,
 
        .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
-       .irq_enable_sysexit = NULL,  /* never called */
+       .irq_enable_syscall_ret = NULL,  /* never called */
 
        .load_tr_desc = paravirt_nop,
        .set_ldt = xen_set_ldt,
@@ -968,7 +980,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .write_ldt_entry = xen_write_ldt_entry,
        .write_gdt_entry = xen_write_gdt_entry,
        .write_idt_entry = xen_write_idt_entry,
-       .load_esp0 = xen_load_esp0,
+       .load_sp0 = xen_load_sp0,
 
        .set_iopl_mask = xen_set_iopl_mask,
        .io_delay = xen_io_delay,
@@ -1019,10 +1031,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pte_update_defer = paravirt_nop,
 
        .alloc_pt = xen_alloc_pt_init,
-       .release_pt = xen_release_pt,
-       .alloc_pd = paravirt_nop,
+       .release_pt = xen_release_pt_init,
+       .alloc_pd = xen_alloc_pt_init,
        .alloc_pd_clone = paravirt_nop,
-       .release_pd = paravirt_nop,
+       .release_pd = xen_release_pt_init,
 
 #ifdef CONFIG_HIGHPTE
        .kmap_atomic_pte = xen_kmap_atomic_pte,
index 6d1da5809e6fd34bcd3ba84fdff0335949f4b78b..dcf613e17581448926794286ac9c1fad05011857 100644
@@ -465,7 +465,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
  * a bitset of words which contain pending event bits.  The second
  * level is a bitset of pending events themselves.
  */
-fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+void xen_evtchn_do_upcall(struct pt_regs *regs)
 {
        int cpu = get_cpu();
        struct shared_info *s = HYPERVISOR_shared_info;
@@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
                        int irq = evtchn_to_irq[port];
 
                        if (irq != -1) {
-                               regs->orig_eax = ~irq;
+                               regs->orig_ax = ~irq;
                                do_IRQ(regs);
                        }
                }
index 0ac6c5dc49ba0216276c605f269a7bc3ea01efda..45aa771e73a9221792f703e99c980ada406c8fa9 100644
@@ -58,7 +58,8 @@
 
 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
 {
-       pte_t *pte = lookup_address(address);
+       int level;
+       pte_t *pte = lookup_address(address, &level);
        unsigned offset = address & PAGE_MASK;
 
        BUG_ON(pte == NULL);
@@ -70,8 +71,9 @@ void make_lowmem_page_readonly(void *vaddr)
 {
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
+       int level;
 
-       pte = lookup_address(address);
+       pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);
 
        ptev = pte_wrprotect(*pte);
@@ -84,8 +86,9 @@ void make_lowmem_page_readwrite(void *vaddr)
 {
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
+       int level;
 
-       pte = lookup_address(address);
+       pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);
 
        ptev = pte_mkwrite(*pte);
@@ -241,12 +244,12 @@ unsigned long long xen_pgd_val(pgd_t pgd)
 
 pte_t xen_make_pte(unsigned long long pte)
 {
-       if (pte & 1)
+       if (pte & _PAGE_PRESENT) {
                pte = phys_to_machine(XPADDR(pte)).maddr;
+               pte &= ~(_PAGE_PCD | _PAGE_PWT);
+       }
 
-       pte &= ~_PAGE_PCD;
-
-       return (pte_t){ pte, pte >> 32 };
+       return (pte_t){ .pte = pte };
 }
 
 pmd_t xen_make_pmd(unsigned long long pmd)
@@ -290,10 +293,10 @@ unsigned long xen_pgd_val(pgd_t pgd)
 
 pte_t xen_make_pte(unsigned long pte)
 {
-       if (pte & _PAGE_PRESENT)
+       if (pte & _PAGE_PRESENT) {
                pte = phys_to_machine(XPADDR(pte)).maddr;
-
-       pte &= ~_PAGE_PCD;
+               pte &= ~(_PAGE_PCD | _PAGE_PWT);
+       }
 
        return (pte_t){ pte };
 }
index f84e772266461aec8a76b4feeb2ba318e21e0547..3bad4773a2f3c36ff0eb8694014a010cfd903fb8 100644
@@ -10,6 +10,7 @@
 #include <linux/pm.h>
 
 #include <asm/elf.h>
+#include <asm/vdso.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 #include <asm/xen/hypervisor.h>
@@ -59,12 +60,10 @@ static void xen_idle(void)
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
  */
-static void fiddle_vdso(void)
+static void __init fiddle_vdso(void)
 {
-       extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S.  */
-       extern char vsyscall_int80_start;
-       u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK +
-                            &vsyscall_int80_start);
+       extern const char vdso32_default_start;
+       u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
        *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 }
 
index c1b131bcdcbe71f2adc14833339b9096d37e9964..aafc54437403f95e84d07f1cccad12ea873bd8a3 100644
@@ -146,7 +146,7 @@ void __init xen_smp_prepare_boot_cpu(void)
           old memory can be recycled */
        make_lowmem_page_readwrite(&per_cpu__gdt_page);
 
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+       for_each_possible_cpu(cpu) {
                cpus_clear(per_cpu(cpu_sibling_map, cpu));
                /*
                 * cpu_core_map lives in a per cpu area that is cleared
@@ -163,7 +163,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
        unsigned cpu;
 
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+       for_each_possible_cpu(cpu) {
                cpus_clear(per_cpu(cpu_sibling_map, cpu));
                /*
                 * cpu_core_map will be zeroed when the per
@@ -239,10 +239,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
 
        ctxt->user_regs.cs = __KERNEL_CS;
-       ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
+       ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 
        ctxt->kernel_ss = __KERNEL_DS;
-       ctxt->kernel_sp = idle->thread.esp0;
+       ctxt->kernel_sp = idle->thread.sp0;
 
        ctxt->event_callback_cs     = __KERNEL_CS;
        ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
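The two loops above move from scanning every index up to NR_CPUS to for_each_possible_cpu(), which only visits CPU ids that can ever come online on this machine. A small sketch of the pattern, mirroring the hunks; the function name is invented:

	/* Illustrative only: iterate possible CPUs rather than 0..NR_CPUS-1. */
	static void example_clear_sibling_maps(void)
	{
		unsigned int cpu;

		for_each_possible_cpu(cpu)
			cpus_clear(per_cpu(cpu_sibling_map, cpu));
	}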
index d083ff5ef088eba01ddebb86cabccf4fa9a4202b..b3721fd6877b1ea03f1d22cc12c5c1a8566ef04a 100644 (file)
@@ -592,7 +592,7 @@ __init void xen_time_init(void)
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);
 
-       tsc_disable = 0;
+       setup_force_cpu_cap(X86_FEATURE_TSC);
 
        xen_setup_timer(cpu);
        xen_setup_cpu_clockevents();
index f8d6937db2ec8d1eb85dabd10f15c96526225dc8..288d587ce73cef8562fbccf751dcd395c4131d25 100644 (file)
@@ -4,16 +4,18 @@
 #ifdef CONFIG_XEN
 
 #include <linux/elfnote.h>
+#include <linux/init.h>
 #include <asm/boot.h>
 #include <xen/interface/elfnote.h>
 
-.pushsection .init.text
+       __INIT
 ENTRY(startup_xen)
        movl %esi,xen_start_info
        cld
        movl $(init_thread_union+THREAD_SIZE),%esp
        jmp xen_start_kernel
-.popsection
+
+       __FINIT
 
 .pushsection .bss.page_aligned
        .align PAGE_SIZE_asm
index 826108190f00d3ed9afc1cba7869424584cd6dd4..5a43c7d795942d1b478483e9464f1d58d471a15d 100644 (file)
@@ -2,7 +2,9 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
+obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+                       blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+                       blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
index b201d16a71020f1261a42bc56ac36a9915da16ad..96036846a0017505691060d62734afc0dcad9529 100644 (file)
@@ -1275,9 +1275,13 @@ static void as_merged_requests(struct request_queue *q, struct request *req,
                         * Don't copy here but swap, because when anext is
                         * removed below, it must contain the unused context
                         */
-                       double_spin_lock(&rioc->lock, &nioc->lock, rioc < nioc);
-                       swap_io_context(&rioc, &nioc);
-                       double_spin_unlock(&rioc->lock, &nioc->lock, rioc < nioc);
+                       if (rioc != nioc) {
+                               double_spin_lock(&rioc->lock, &nioc->lock,
+                                                               rioc < nioc);
+                               swap_io_context(&rioc, &nioc);
+                               double_spin_unlock(&rioc->lock, &nioc->lock,
+                                                               rioc < nioc);
+                       }
                }
        }
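The fix above takes the two io_context locks only when rioc and nioc are distinct: double_spin_lock() orders its acquisitions by the rioc < nioc comparison, and taking the same spinlock twice would deadlock. A rough, hypothetical sketch of the address-ordering idea (lockdep nesting annotations omitted):

	/* Hypothetical helper: acquire two locks in address order, skipping the
	 * degenerate case where both pointers name the same lock. */
	static void example_lock_pair(spinlock_t *a, spinlock_t *b)
	{
		if (a == b) {
			spin_lock(a);		/* one acquisition is enough */
			return;
		}
		if (a < b) {
			spin_lock(a);
			spin_lock(b);
		} else {
			spin_lock(b);
			spin_lock(a);
		}
	}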
 
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
new file mode 100644 (file)
index 0000000..5f74fec
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * Functions related to barrier IO handling
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
+ * @prepare_flush_fn: rq setup helper for cache flush ordered writes
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ **/
+int blk_queue_ordered(struct request_queue *q, unsigned ordered,
+                     prepare_flush_fn *prepare_flush_fn)
+{
+       if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+           prepare_flush_fn == NULL) {
+               printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+               return -EINVAL;
+       }
+
+       if (ordered != QUEUE_ORDERED_NONE &&
+           ordered != QUEUE_ORDERED_DRAIN &&
+           ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+           ordered != QUEUE_ORDERED_DRAIN_FUA &&
+           ordered != QUEUE_ORDERED_TAG &&
+           ordered != QUEUE_ORDERED_TAG_FLUSH &&
+           ordered != QUEUE_ORDERED_TAG_FUA) {
+               printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+               return -EINVAL;
+       }
+
+       q->ordered = ordered;
+       q->next_ordered = ordered;
+       q->prepare_flush_fn = prepare_flush_fn;
+
+       return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/*
+ * Cache flushing for ordered writes handling
+ */
+inline unsigned blk_ordered_cur_seq(struct request_queue *q)
+{
+       if (!q->ordseq)
+               return 0;
+       return 1 << ffz(q->ordseq);
+}
+
+unsigned blk_ordered_req_seq(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+
+       BUG_ON(q->ordseq == 0);
+
+       if (rq == &q->pre_flush_rq)
+               return QUEUE_ORDSEQ_PREFLUSH;
+       if (rq == &q->bar_rq)
+               return QUEUE_ORDSEQ_BAR;
+       if (rq == &q->post_flush_rq)
+               return QUEUE_ORDSEQ_POSTFLUSH;
+
+       /*
+        * !fs requests don't need to follow barrier ordering.  Always
+        * put them at the front.  This fixes the following deadlock.
+        *
+        * http://thread.gmane.org/gmane.linux.kernel/537473
+        */
+       if (!blk_fs_request(rq))
+               return QUEUE_ORDSEQ_DRAIN;
+
+       if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
+           (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
+               return QUEUE_ORDSEQ_DRAIN;
+       else
+               return QUEUE_ORDSEQ_DONE;
+}
+
+void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+{
+       struct request *rq;
+
+       if (error && !q->orderr)
+               q->orderr = error;
+
+       BUG_ON(q->ordseq & seq);
+       q->ordseq |= seq;
+
+       if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+               return;
+
+       /*
+        * Okay, sequence complete.
+        */
+       q->ordseq = 0;
+       rq = q->orig_bar_rq;
+
+       if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
+               BUG();
+}
+
+static void pre_flush_end_io(struct request *rq, int error)
+{
+       elv_completed_request(rq->q, rq);
+       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
+
+static void bar_end_io(struct request *rq, int error)
+{
+       elv_completed_request(rq->q, rq);
+       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
+
+static void post_flush_end_io(struct request *rq, int error)
+{
+       elv_completed_request(rq->q, rq);
+       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
+
+static void queue_flush(struct request_queue *q, unsigned which)
+{
+       struct request *rq;
+       rq_end_io_fn *end_io;
+
+       if (which == QUEUE_ORDERED_PREFLUSH) {
+               rq = &q->pre_flush_rq;
+               end_io = pre_flush_end_io;
+       } else {
+               rq = &q->post_flush_rq;
+               end_io = post_flush_end_io;
+       }
+
+       rq->cmd_flags = REQ_HARDBARRIER;
+       rq_init(q, rq);
+       rq->elevator_private = NULL;
+       rq->elevator_private2 = NULL;
+       rq->rq_disk = q->bar_rq.rq_disk;
+       rq->end_io = end_io;
+       q->prepare_flush_fn(q, rq);
+
+       elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+}
+
+static inline struct request *start_ordered(struct request_queue *q,
+                                           struct request *rq)
+{
+       q->orderr = 0;
+       q->ordered = q->next_ordered;
+       q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+       /*
+        * Prep proxy barrier request.
+        */
+       blkdev_dequeue_request(rq);
+       q->orig_bar_rq = rq;
+       rq = &q->bar_rq;
+       rq->cmd_flags = 0;
+       rq_init(q, rq);
+       if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
+               rq->cmd_flags |= REQ_RW;
+       if (q->ordered & QUEUE_ORDERED_FUA)
+               rq->cmd_flags |= REQ_FUA;
+       rq->elevator_private = NULL;
+       rq->elevator_private2 = NULL;
+       init_request_from_bio(rq, q->orig_bar_rq->bio);
+       rq->end_io = bar_end_io;
+
+       /*
+        * Queue ordered sequence.  As we stack them at the head, we
+        * need to queue in reverse order.  Note that we rely on that
+        * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+        * request gets in between the ordered sequence. If this request
+        * is an empty barrier, we don't need to do a postflush, since
+        * there will be no data written between the pre and post flush.
+        * Hence a single flush will suffice.
+        */
+       if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
+               queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+       else
+               q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+       elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+
+       if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+               queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+               rq = &q->pre_flush_rq;
+       } else
+               q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
+
+       if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+               q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+       else
+               rq = NULL;
+
+       return rq;
+}
+
+int blk_do_ordered(struct request_queue *q, struct request **rqp)
+{
+       struct request *rq = *rqp;
+       const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
+
+       if (!q->ordseq) {
+               if (!is_barrier)
+                       return 1;
+
+               if (q->next_ordered != QUEUE_ORDERED_NONE) {
+                       *rqp = start_ordered(q, rq);
+                       return 1;
+               } else {
+                       /*
+                        * This can happen when the queue switches to
+                        * ORDERED_NONE while this request is on it.
+                        */
+                       blkdev_dequeue_request(rq);
+                       if (__blk_end_request(rq, -EOPNOTSUPP,
+                                             blk_rq_bytes(rq)))
+                               BUG();
+                       *rqp = NULL;
+                       return 0;
+               }
+       }
+
+       /*
+        * Ordered sequence in progress
+        */
+
+       /* Special requests are not subject to ordering rules. */
+       if (!blk_fs_request(rq) &&
+           rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
+               return 1;
+
+       if (q->ordered & QUEUE_ORDERED_TAG) {
+               /* Ordered by tag.  Blocking the next barrier is enough. */
+               if (is_barrier && rq != &q->bar_rq)
+                       *rqp = NULL;
+       } else {
+               /* Ordered by draining.  Wait for turn. */
+               WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
+               if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
+                       *rqp = NULL;
+       }
+
+       return 1;
+}
+
+static void bio_end_empty_barrier(struct bio *bio, int err)
+{
+       if (err)
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       complete(bio->bi_private);
+}
+
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:      blockdev to issue flush for
+ * @error_sector:      error sector
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to.  The helper itself waits for the flush to complete before returning.
+ */
+int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+       struct request_queue *q;
+       struct bio *bio;
+       int ret;
+
+       if (bdev->bd_disk == NULL)
+               return -ENXIO;
+
+       q = bdev_get_queue(bdev);
+       if (!q)
+               return -ENXIO;
+
+       bio = bio_alloc(GFP_KERNEL, 0);
+       if (!bio)
+               return -ENOMEM;
+
+       bio->bi_end_io = bio_end_empty_barrier;
+       bio->bi_private = &wait;
+       bio->bi_bdev = bdev;
+       submit_bio(1 << BIO_RW_BARRIER, bio);
+
+       wait_for_completion(&wait);
+
+       /*
+        * The driver must store the error location in ->bi_sector, if
+        * it supports it. For non-stacked drivers, this should be copied
+        * from rq->sector.
+        */
+       if (error_sector)
+               *error_sector = bio->bi_sector;
+
+       ret = 0;
+       if (!bio_flagged(bio, BIO_UPTODATE))
+               ret = -EIO;
+
+       bio_put(bio);
+       return ret;
+}
+
+EXPORT_SYMBOL(blkdev_issue_flush);
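Putting the two exported helpers together: a driver with a volatile write cache would declare its ordering mode once at queue-setup time, and upper layers can then force the cache out with blkdev_issue_flush(). A hedged sketch; the mydrv_* names and the empty flush callback are invented, and real hardware setup is omitted:

	/* Hypothetical: advertise drain+flush ordering for a write-back cache. */
	static void mydrv_prepare_flush(struct request_queue *q, struct request *rq)
	{
		/* fill in a device-specific cache-flush command for rq here */
	}

	static int mydrv_setup_ordering(struct request_queue *q)
	{
		return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
					 mydrv_prepare_flush);
	}

	/* An upper layer can then flush the device cache explicitly: */
	static int mydrv_flush_cache(struct block_device *bdev)
	{
		return blkdev_issue_flush(bdev, NULL);	/* no error sector wanted */
	}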
diff --git a/block/blk-core.c b/block/blk-core.c
new file mode 100644 (file)
index 0000000..8ff9944
--- /dev/null
@@ -0,0 +1,2034 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
+ * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
+ * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> -  July 2000
+ * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - May 2001
+ */
+
+/*
+ * This handles all read/write requests to block devices
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/blktrace_api.h>
+#include <linux/fault-inject.h>
+
+#include "blk.h"
+
+static int __make_request(struct request_queue *q, struct bio *bio);
+
+/*
+ * For the allocated request tables
+ */
+struct kmem_cache *request_cachep;
+
+/*
+ * For queue allocation
+ */
+struct kmem_cache *blk_requestq_cachep = NULL;
+
+/*
+ * Controlling structure to kblockd
+ */
+static struct workqueue_struct *kblockd_workqueue;
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+static void drive_stat_acct(struct request *rq, int new_io)
+{
+       int rw = rq_data_dir(rq);
+
+       if (!blk_fs_request(rq) || !rq->rq_disk)
+               return;
+
+       if (!new_io) {
+               __disk_stat_inc(rq->rq_disk, merges[rw]);
+       } else {
+               disk_round_stats(rq->rq_disk);
+               rq->rq_disk->in_flight++;
+       }
+}
+
+void blk_queue_congestion_threshold(struct request_queue *q)
+{
+       int nr;
+
+       nr = q->nr_requests - (q->nr_requests / 8) + 1;
+       if (nr > q->nr_requests)
+               nr = q->nr_requests;
+       q->nr_congestion_on = nr;
+
+       nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
+       if (nr < 1)
+               nr = 1;
+       q->nr_congestion_off = nr;
+}
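As a worked example of the thresholds computed above: with nr_requests = 128, the congestion-on mark works out to 128 - 128/8 + 1 = 113 and the congestion-off mark to 128 - 128/8 - 128/16 - 1 = 103, so the queue is reported congested once 113 requests are allocated and only reported uncongested again when the count falls back to 103, giving the transition some hysteresis.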
+
+/**
+ * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
+ * @bdev:      device
+ *
+ * Locates the passed device's request queue and returns the address of its
+ * backing_dev_info
+ *
+ * Will return NULL if the request queue cannot be located.
+ */
+struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
+{
+       struct backing_dev_info *ret = NULL;
+       struct request_queue *q = bdev_get_queue(bdev);
+
+       if (q)
+               ret = &q->backing_dev_info;
+       return ret;
+}
+EXPORT_SYMBOL(blk_get_backing_dev_info);
+
+void rq_init(struct request_queue *q, struct request *rq)
+{
+       INIT_LIST_HEAD(&rq->queuelist);
+       INIT_LIST_HEAD(&rq->donelist);
+
+       rq->errors = 0;
+       rq->bio = rq->biotail = NULL;
+       INIT_HLIST_NODE(&rq->hash);
+       RB_CLEAR_NODE(&rq->rb_node);
+       rq->ioprio = 0;
+       rq->buffer = NULL;
+       rq->ref_count = 1;
+       rq->q = q;
+       rq->special = NULL;
+       rq->data_len = 0;
+       rq->data = NULL;
+       rq->nr_phys_segments = 0;
+       rq->sense = NULL;
+       rq->end_io = NULL;
+       rq->end_io_data = NULL;
+       rq->completion_data = NULL;
+       rq->next_rq = NULL;
+}
+
+static void req_bio_endio(struct request *rq, struct bio *bio,
+                         unsigned int nbytes, int error)
+{
+       struct request_queue *q = rq->q;
+
+       if (&q->bar_rq != rq) {
+               if (error)
+                       clear_bit(BIO_UPTODATE, &bio->bi_flags);
+               else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+                       error = -EIO;
+
+               if (unlikely(nbytes > bio->bi_size)) {
+                       printk("%s: want %u bytes done, only %u left\n",
+                              __FUNCTION__, nbytes, bio->bi_size);
+                       nbytes = bio->bi_size;
+               }
+
+               bio->bi_size -= nbytes;
+               bio->bi_sector += (nbytes >> 9);
+               if (bio->bi_size == 0)
+                       bio_endio(bio, error);
+       } else {
+
+               /*
+                * Okay, this is the barrier request in progress, just
+                * record the error.
+                */
+               if (error && !q->orderr)
+                       q->orderr = error;
+       }
+}
+
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+       int bit;
+
+       printk("%s: dev %s: type=%x, flags=%x\n", msg,
+               rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
+               rq->cmd_flags);
+
+       printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
+                                                      rq->nr_sectors,
+                                                      rq->current_nr_sectors);
+       printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
+
+       if (blk_pc_request(rq)) {
+               printk("cdb: ");
+               for (bit = 0; bit < sizeof(rq->cmd); bit++)
+                       printk("%02x ", rq->cmd[bit]);
+               printk("\n");
+       }
+}
+
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
+/*
+ * "plug" the device if there are no outstanding requests: this will
+ * force the transfer to start only after we have put all the requests
+ * on the list.
+ *
+ * This is called with interrupts off and no requests on the queue and
+ * with the queue lock held.
+ */
+void blk_plug_device(struct request_queue *q)
+{
+       WARN_ON(!irqs_disabled());
+
+       /*
+        * don't plug a stopped queue, it must be paired with blk_start_queue()
+        * which will restart the queueing
+        */
+       if (blk_queue_stopped(q))
+               return;
+
+       if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
+               mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+               blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+       }
+}
+
+EXPORT_SYMBOL(blk_plug_device);
+
+/*
+ * remove the queue from the plugged list, if present. called with
+ * queue lock held and interrupts disabled.
+ */
+int blk_remove_plug(struct request_queue *q)
+{
+       WARN_ON(!irqs_disabled());
+
+       if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+               return 0;
+
+       del_timer(&q->unplug_timer);
+       return 1;
+}
+
+EXPORT_SYMBOL(blk_remove_plug);
+
+/*
+ * remove the plug and let it rip..
+ */
+void __generic_unplug_device(struct request_queue *q)
+{
+       if (unlikely(blk_queue_stopped(q)))
+               return;
+
+       if (!blk_remove_plug(q))
+               return;
+
+       q->request_fn(q);
+}
+EXPORT_SYMBOL(__generic_unplug_device);
+
+/**
+ * generic_unplug_device - fire a request queue
+ * @q:    The &struct request_queue in question
+ *
+ * Description:
+ *   Linux uses plugging to build bigger request queues before letting
+ *   the device have at them. If a queue is plugged, the I/O scheduler
+ *   is still adding and merging requests on the queue. Once the queue
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
+ **/
+void generic_unplug_device(struct request_queue *q)
+{
+       spin_lock_irq(q->queue_lock);
+       __generic_unplug_device(q);
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(generic_unplug_device);
+
+static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
+                                  struct page *page)
+{
+       struct request_queue *q = bdi->unplug_io_data;
+
+       blk_unplug(q);
+}
+
+void blk_unplug_work(struct work_struct *work)
+{
+       struct request_queue *q =
+               container_of(work, struct request_queue, unplug_work);
+
+       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+                               q->rq.count[READ] + q->rq.count[WRITE]);
+
+       q->unplug_fn(q);
+}
+
+void blk_unplug_timeout(unsigned long data)
+{
+       struct request_queue *q = (struct request_queue *)data;
+
+       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+                               q->rq.count[READ] + q->rq.count[WRITE]);
+
+       kblockd_schedule_work(&q->unplug_work);
+}
+
+void blk_unplug(struct request_queue *q)
+{
+       /*
+        * devices don't necessarily have an ->unplug_fn defined
+        */
+       if (q->unplug_fn) {
+               blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+                                       q->rq.count[READ] + q->rq.count[WRITE]);
+
+               q->unplug_fn(q);
+       }
+}
+EXPORT_SYMBOL(blk_unplug);
+
+/**
+ * blk_start_queue - restart a previously stopped queue
+ * @q:    The &struct request_queue in question
+ *
+ * Description:
+ *   blk_start_queue() will clear the stop flag on the queue, and call
+ *   the request_fn for the queue if it was in a stopped state when
+ *   entered. Also see blk_stop_queue(). Queue lock must be held.
+ **/
+void blk_start_queue(struct request_queue *q)
+{
+       WARN_ON(!irqs_disabled());
+
+       clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
+
+       /*
+        * one level of recursion is ok and is much faster than kicking
+        * the unplug handling
+        */
+       if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
+               q->request_fn(q);
+               clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
+       } else {
+               blk_plug_device(q);
+               kblockd_schedule_work(&q->unplug_work);
+       }
+}
+
+EXPORT_SYMBOL(blk_start_queue);
+
+/**
+ * blk_stop_queue - stop a queue
+ * @q:    The &struct request_queue in question
+ *
+ * Description:
+ *   The Linux block layer assumes that a block driver will consume all
+ *   entries on the request queue when the request_fn strategy is called.
+ *   Often this will not happen, because of hardware limitations (queue
+ *   depth settings). If a device driver gets a 'queue full' response,
+ *   or if it simply chooses not to queue more I/O at one point, it can
+ *   call this function to prevent the request_fn from being called until
+ *   the driver has signalled it's ready to go again. This happens by calling
+ *   blk_start_queue() to restart queue operations. Queue lock must be held.
+ **/
+void blk_stop_queue(struct request_queue *q)
+{
+       blk_remove_plug(q);
+       set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
+}
+EXPORT_SYMBOL(blk_stop_queue);
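The stop/start pair documented above is normally driven from a driver's request_fn and its completion path. A hedged sketch with invented exdrv_* names; both call sites are assumed to hold the queue lock with interrupts disabled, as the comments above require:

	/* Hypothetical request_fn: halt dispatch while the hardware is full. */
	static void exdrv_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			if (exdrv_hw_full()) {		/* invented helper */
				blk_stop_queue(q);
				break;
			}
			blkdev_dequeue_request(rq);
			exdrv_issue(rq);		/* invented helper */
		}
	}

	/* Hypothetical completion path: room is available again. */
	static void exdrv_on_completion(struct request_queue *q)
	{
		blk_start_queue(q);	/* re-runs exdrv_request_fn */
	}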
+
+/**
+ * blk_sync_queue - cancel any pending callbacks on a queue
+ * @q: the queue
+ *
+ * Description:
+ *     The block layer may perform asynchronous callback activity
+ *     on a queue, such as calling the unplug function after a timeout.
+ *     A block device may call blk_sync_queue to ensure that any
+ *     such activity is cancelled, thus allowing it to release resources
+ *     that the callbacks might use. The caller must already have made sure
+ *     that its ->make_request_fn will not re-add plugging prior to calling
+ *     this function.
+ *
+ */
+void blk_sync_queue(struct request_queue *q)
+{
+       del_timer_sync(&q->unplug_timer);
+       kblockd_flush_work(&q->unplug_work);
+}
+EXPORT_SYMBOL(blk_sync_queue);
+
+/**
+ * blk_run_queue - run a single device queue
+ * @q: The queue to run
+ */
+void blk_run_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       blk_remove_plug(q);
+
+       /*
+        * Only recurse once to avoid overrunning the stack, let the unplug
+        * handling reinvoke the handler shortly if we already got there.
+        */
+       if (!elv_queue_empty(q)) {
+               if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
+                       q->request_fn(q);
+                       clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
+               } else {
+                       blk_plug_device(q);
+                       kblockd_schedule_work(&q->unplug_work);
+               }
+       }
+
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL(blk_run_queue);
+
+void blk_put_queue(struct request_queue *q)
+{
+       kobject_put(&q->kobj);
+}
+EXPORT_SYMBOL(blk_put_queue);
+
+void blk_cleanup_queue(struct request_queue * q)
+{
+       mutex_lock(&q->sysfs_lock);
+       set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
+       mutex_unlock(&q->sysfs_lock);
+
+       if (q->elevator)
+               elevator_exit(q->elevator);
+
+       blk_put_queue(q);
+}
+
+EXPORT_SYMBOL(blk_cleanup_queue);
+
+static int blk_init_free_list(struct request_queue *q)
+{
+       struct request_list *rl = &q->rq;
+
+       rl->count[READ] = rl->count[WRITE] = 0;
+       rl->starved[READ] = rl->starved[WRITE] = 0;
+       rl->elvpriv = 0;
+       init_waitqueue_head(&rl->wait[READ]);
+       init_waitqueue_head(&rl->wait[WRITE]);
+
+       rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
+                               mempool_free_slab, request_cachep, q->node);
+
+       if (!rl->rq_pool)
+               return -ENOMEM;
+
+       return 0;
+}
+
+struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
+{
+       return blk_alloc_queue_node(gfp_mask, -1);
+}
+EXPORT_SYMBOL(blk_alloc_queue);
+
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+{
+       struct request_queue *q;
+       int err;
+
+       q = kmem_cache_alloc_node(blk_requestq_cachep,
+                               gfp_mask | __GFP_ZERO, node_id);
+       if (!q)
+               return NULL;
+
+       q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+       q->backing_dev_info.unplug_io_data = q;
+       err = bdi_init(&q->backing_dev_info);
+       if (err) {
+               kmem_cache_free(blk_requestq_cachep, q);
+               return NULL;
+       }
+
+       init_timer(&q->unplug_timer);
+
+       kobject_init(&q->kobj, &blk_queue_ktype);
+
+       mutex_init(&q->sysfs_lock);
+
+       return q;
+}
+EXPORT_SYMBOL(blk_alloc_queue_node);
+
+/**
+ * blk_init_queue  - prepare a request queue for use with a block device
+ * @rfn:  The function to be called to process requests that have been
+ *        placed on the queue.
+ * @lock: Request queue spin lock
+ *
+ * Description:
+ *    If a block device wishes to use the standard request handling procedures,
+ *    which sorts requests and coalesces adjacent requests, then it must
+ *    call blk_init_queue().  The function @rfn will be called when there
+ *    are requests on the queue that need to be processed.  If the device
+ *    supports plugging, then @rfn may not be called immediately when requests
+ *    are available on the queue, but may be called at some time later instead.
+ *    Plugged queues are generally unplugged when a buffer belonging to one
+ *    of the requests on the queue is needed, or due to memory pressure.
+ *
+ *    @rfn is not required, or even expected, to remove all requests off the
+ *    queue, but only as many as it can handle at a time.  If it does leave
+ *    requests on the queue, it is responsible for arranging that the requests
+ *    get dealt with eventually.
+ *
+ *    The queue spin lock must be held while manipulating the requests on the
+ *    request queue; this lock will also be taken from interrupt context, so irq
+ *    disabling is needed for it.
+ *
+ *    Function returns a pointer to the initialized request queue, or NULL if
+ *    it didn't succeed.
+ *
+ * Note:
+ *    blk_init_queue() must be paired with a blk_cleanup_queue() call
+ *    when the block device is deactivated (such as at module unload).
+ **/
+
+struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
+{
+       return blk_init_queue_node(rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_queue);
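A minimal, hypothetical user of the blk_init_queue()/blk_cleanup_queue() pairing described above might look like the following; the mydrv_* names are invented, gendisk registration and real hardware I/O are omitted, and requests are completed inline purely for brevity:

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/blkdev.h>

	static DEFINE_SPINLOCK(mydrv_lock);
	static struct request_queue *mydrv_queue;

	/* Called with mydrv_lock held; drain as much as the device can take. */
	static void mydrv_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			blkdev_dequeue_request(rq);
			/* hand rq to the hardware here ... */
			__blk_end_request(rq, 0, blk_rq_bytes(rq));
		}
	}

	static int __init mydrv_init(void)
	{
		mydrv_queue = blk_init_queue(mydrv_request_fn, &mydrv_lock);
		return mydrv_queue ? 0 : -ENOMEM;
	}

	static void __exit mydrv_exit(void)
	{
		blk_cleanup_queue(mydrv_queue);
	}

	module_init(mydrv_init);
	module_exit(mydrv_exit);
	MODULE_LICENSE("GPL");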
+
+struct request_queue *
+blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
+{
+       struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+
+       if (!q)
+               return NULL;
+
+       q->node = node_id;
+       if (blk_init_free_list(q)) {
+               kmem_cache_free(blk_requestq_cachep, q);
+               return NULL;
+       }
+
+       /*
+        * if caller didn't supply a lock, they get per-queue locking with
+        * our embedded lock
+        */
+       if (!lock) {
+               spin_lock_init(&q->__queue_lock);
+               lock = &q->__queue_lock;
+       }
+
+       q->request_fn           = rfn;
+       q->prep_rq_fn           = NULL;
+       q->unplug_fn            = generic_unplug_device;
+       q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
+       q->queue_lock           = lock;
+
+       blk_queue_segment_boundary(q, 0xffffffff);
+
+       blk_queue_make_request(q, __make_request);
+       blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
+
+       blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+       blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+
+       q->sg_reserved_size = INT_MAX;
+
+       /*
+        * all done
+        */
+       if (!elevator_init(q, NULL)) {
+               blk_queue_congestion_threshold(q);
+               return q;
+       }
+
+       blk_put_queue(q);
+       return NULL;
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+int blk_get_queue(struct request_queue *q)
+{
+       if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+               kobject_get(&q->kobj);
+               return 0;
+       }
+
+       return 1;
+}
+
+EXPORT_SYMBOL(blk_get_queue);
+
+static inline void blk_free_request(struct request_queue *q, struct request *rq)
+{
+       if (rq->cmd_flags & REQ_ELVPRIV)
+               elv_put_request(q, rq);
+       mempool_free(rq, q->rq.rq_pool);
+}
+
+static struct request *
+blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
+{
+       struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+
+       if (!rq)
+               return NULL;
+
+       /*
+        * first three bits are identical in rq->cmd_flags and bio->bi_rw,
+        * see bio.h and blkdev.h
+        */
+       rq->cmd_flags = rw | REQ_ALLOCED;
+
+       if (priv) {
+               if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+                       mempool_free(rq, q->rq.rq_pool);
+                       return NULL;
+               }
+               rq->cmd_flags |= REQ_ELVPRIV;
+       }
+
+       return rq;
+}
+
+/*
+ * ioc_batching returns true if the ioc is a valid batching request and
+ * should be given priority access to a request.
+ */
+static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
+{
+       if (!ioc)
+               return 0;
+
+       /*
+        * Make sure the process is able to allocate at least 1 request
+        * even if the batch times out, otherwise we could theoretically
+        * lose wakeups.
+        */
+       return ioc->nr_batch_requests == q->nr_batching ||
+               (ioc->nr_batch_requests > 0
+               && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
+}
+
+/*
+ * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
+ * will cause the process to be a "batcher" on all queues in the system. This
+ * is the behaviour we want though - once it gets a wakeup it should be given
+ * a nice run.
+ */
+static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
+{
+       if (!ioc || ioc_batching(q, ioc))
+               return;
+
+       ioc->nr_batch_requests = q->nr_batching;
+       ioc->last_waited = jiffies;
+}
+
+static void __freed_request(struct request_queue *q, int rw)
+{
+       struct request_list *rl = &q->rq;
+
+       if (rl->count[rw] < queue_congestion_off_threshold(q))
+               blk_clear_queue_congested(q, rw);
+
+       if (rl->count[rw] + 1 <= q->nr_requests) {
+               if (waitqueue_active(&rl->wait[rw]))
+                       wake_up(&rl->wait[rw]);
+
+               blk_clear_queue_full(q, rw);
+       }
+}
+
+/*
+ * A request has just been released.  Account for it, update the full and
+ * congestion status, wake up any waiters.   Called under q->queue_lock.
+ */
+static void freed_request(struct request_queue *q, int rw, int priv)
+{
+       struct request_list *rl = &q->rq;
+
+       rl->count[rw]--;
+       if (priv)
+               rl->elvpriv--;
+
+       __freed_request(q, rw);
+
+       if (unlikely(rl->starved[rw ^ 1]))
+               __freed_request(q, rw ^ 1);
+}
+
+#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
+/*
+ * Get a free request, queue_lock must be held.
+ * Returns NULL on failure, with queue_lock held.
+ * Returns !NULL on success, with queue_lock *not held*.
+ */
+static struct request *get_request(struct request_queue *q, int rw_flags,
+                                  struct bio *bio, gfp_t gfp_mask)
+{
+       struct request *rq = NULL;
+       struct request_list *rl = &q->rq;
+       struct io_context *ioc = NULL;
+       const int rw = rw_flags & 0x01;
+       int may_queue, priv;
+
+       may_queue = elv_may_queue(q, rw_flags);
+       if (may_queue == ELV_MQUEUE_NO)
+               goto rq_starved;
+
+       if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
+               if (rl->count[rw]+1 >= q->nr_requests) {
+                       ioc = current_io_context(GFP_ATOMIC, q->node);
+                       /*
+                        * The queue will fill after this allocation, so set
+                        * it as full, and mark this process as "batching".
+                        * This process will be allowed to complete a batch of
+                        * requests, others will be blocked.
+                        */
+                       if (!blk_queue_full(q, rw)) {
+                               ioc_set_batching(q, ioc);
+                               blk_set_queue_full(q, rw);
+                       } else {
+                               if (may_queue != ELV_MQUEUE_MUST
+                                               && !ioc_batching(q, ioc)) {
+                                       /*
+                                        * The queue is full and the allocating
+                                        * process is not a "batcher", and not
+                                        * exempted by the IO scheduler
+                                        */
+                                       goto out;
+                               }
+                       }
+               }
+               blk_set_queue_congested(q, rw);
+       }
+
+       /*
+        * Only allow batching queuers to allocate up to 50% over the defined
+        * limit of requests, otherwise we could have thousands of requests
+        * allocated with any setting of ->nr_requests
+        */
+       if (rl->count[rw] >= (3 * q->nr_requests / 2))
+               goto out;
+
+       rl->count[rw]++;
+       rl->starved[rw] = 0;
+
+       priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+       if (priv)
+               rl->elvpriv++;
+
+       spin_unlock_irq(q->queue_lock);
+
+       rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
+       if (unlikely(!rq)) {
+               /*
+                * Allocation failed presumably due to memory. Undo anything
+                * we might have messed up.
+                *
+                * Allocating task should really be put onto the front of the
+                * wait queue, but this is pretty rare.
+                */
+               spin_lock_irq(q->queue_lock);
+               freed_request(q, rw, priv);
+
+               /*
+                * in the very unlikely event that allocation failed and no
+                * requests for this direction were pending, mark us starved
+                * so that freeing of a request in the other direction will
+                * notice us.  Another possible fix would be to split the
+                * rq mempool into READ and WRITE.
+                */
+rq_starved:
+               if (unlikely(rl->count[rw] == 0))
+                       rl->starved[rw] = 1;
+
+               goto out;
+       }
+
+       /*
+        * ioc may be NULL here, and ioc_batching will be false. That's
+        * OK, if the queue is under the request limit then requests need
+        * not count toward the nr_batch_requests limit. There will always
+        * be some limit enforced by BLK_BATCH_TIME.
+        */
+       if (ioc_batching(q, ioc))
+               ioc->nr_batch_requests--;
+
+       rq_init(q, rq);
+
+       blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+out:
+       return rq;
+}
+
+/*
+ * No available requests for this queue, unplug the device and wait for some
+ * requests to become available.
+ *
+ * Called with q->queue_lock held, and returns with it unlocked.
+ */
+static struct request *get_request_wait(struct request_queue *q, int rw_flags,
+                                       struct bio *bio)
+{
+       const int rw = rw_flags & 0x01;
+       struct request *rq;
+
+       rq = get_request(q, rw_flags, bio, GFP_NOIO);
+       while (!rq) {
+               DEFINE_WAIT(wait);
+               struct request_list *rl = &q->rq;
+
+               prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+                               TASK_UNINTERRUPTIBLE);
+
+               rq = get_request(q, rw_flags, bio, GFP_NOIO);
+
+               if (!rq) {
+                       struct io_context *ioc;
+
+                       blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
+                       __generic_unplug_device(q);
+                       spin_unlock_irq(q->queue_lock);
+                       io_schedule();
+
+                       /*
+                        * After sleeping, we become a "batching" process and
+                        * will be able to allocate at least one request, and
+                        * up to a big batch of them for a small period of time.
+                        * See ioc_batching, ioc_set_batching
+                        */
+                       ioc = current_io_context(GFP_NOIO, q->node);
+                       ioc_set_batching(q, ioc);
+
+                       spin_lock_irq(q->queue_lock);
+               }
+               finish_wait(&rl->wait[rw], &wait);
+       }
+
+       return rq;
+}
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+       struct request *rq;
+
+       BUG_ON(rw != READ && rw != WRITE);
+
+       spin_lock_irq(q->queue_lock);
+       if (gfp_mask & __GFP_WAIT) {
+               rq = get_request_wait(q, rw, NULL);
+       } else {
+               rq = get_request(q, rw, NULL, gfp_mask);
+               if (!rq)
+                       spin_unlock_irq(q->queue_lock);
+       }
+       /* q->queue_lock is unlocked at this point */
+
+       return rq;
+}
+EXPORT_SYMBOL(blk_get_request);
+
+/**
+ * blk_start_queueing - initiate dispatch of requests to device
+ * @q:         request queue to kick into gear
+ *
+ * This is basically a helper to remove the need to know whether a queue
+ * is plugged or not if someone just wants to initiate dispatch of requests
+ * for this queue.
+ *
+ * The queue lock must be held with interrupts disabled.
+ */
+void blk_start_queueing(struct request_queue *q)
+{
+       if (!blk_queue_plugged(q))
+               q->request_fn(q);
+       else
+               __generic_unplug_device(q);
+}
+EXPORT_SYMBOL(blk_start_queueing);
+
+/**
+ * blk_requeue_request - put a request back on queue
+ * @q:         request queue where request should be inserted
+ * @rq:                request to be inserted
+ *
+ * Description:
+ *    Drivers often keep queueing requests until the hardware cannot accept
+ *    more, when that condition happens we need to put the request back
+ *    on the queue. Must be called with queue lock held.
+ */
+void blk_requeue_request(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
+       if (blk_rq_tagged(rq))
+               blk_queue_end_tag(q, rq);
+
+       elv_requeue_request(q, rq);
+}
+
+EXPORT_SYMBOL(blk_requeue_request);
+
+/**
+ * blk_insert_request - insert a special request in to a request queue
+ * @q:         request queue where request should be inserted
+ * @rq:                request to be inserted
+ * @at_head:   insert request at head or tail of queue
+ * @data:      private data
+ *
+ * Description:
+ *    Many block devices need to execute commands asynchronously, so they don't
+ *    block the whole kernel from preemption during request execution.  This is
+ *    normally accomplished by inserting artificial requests tagged as
+ *    REQ_SPECIAL into the corresponding request queue, and letting them be
+ *    scheduled for actual execution by the request queue.
+ *
+ *    We have the option of inserting the head or the tail of the queue.
+ *    Typically we use the tail for new ioctls and so forth.  We use the head
+ *    of the queue for things like a QUEUE_FULL message from a device, or a
+ *    host that is unable to accept a particular command.
+ */
+void blk_insert_request(struct request_queue *q, struct request *rq,
+                       int at_head, void *data)
+{
+       int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+       unsigned long flags;
+
+       /*
+        * tell the I/O scheduler that this isn't a regular read/write (i.e. it
+        * must not attempt merges on this) and that it acts as a soft
+        * barrier
+        */
+       rq->cmd_type = REQ_TYPE_SPECIAL;
+       rq->cmd_flags |= REQ_SOFTBARRIER;
+
+       rq->special = data;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+
+       /*
+        * If command is tagged, release the tag
+        */
+       if (blk_rq_tagged(rq))
+               blk_queue_end_tag(q, rq);
+
+       drive_stat_acct(rq, 1);
+       __elv_add_request(q, rq, where, 0);
+       blk_start_queueing(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+EXPORT_SYMBOL(blk_insert_request);
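For instance, a driver that wants to slip a device-private command around the normal I/O stream could pair blk_get_request() with the head insertion described above. A hedged sketch with an invented function name, assuming the driver's request_fn later recognizes and completes the REQ_TYPE_SPECIAL request:

	/* Hypothetical: queue a driver-private command at the head of the queue. */
	static int example_send_special(struct request_queue *q, void *cmd_data)
	{
		struct request *rq;

		rq = blk_get_request(q, READ, GFP_KERNEL);
		if (!rq)
			return -ENOMEM;

		blk_insert_request(q, rq, 1 /* at_head */, cmd_data);
		return 0;
	}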
+
+/*
+ * add-request adds a request to the linked list.
+ * queue lock is held and interrupts disabled, as we muck with the
+ * request queue list.
+ */
+static inline void add_request(struct request_queue * q, struct request * req)
+{
+       drive_stat_acct(req, 1);
+
+       /*
+        * elevator indicated where it wants this request to be
+        * inserted at elevator_merge time
+        */
+       __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
+}
+/*
+ * disk_round_stats()  - Round off the performance stats on a struct
+ * disk_stats.
+ *
+ * The average IO queue length and utilisation statistics are maintained
+ * by observing the current state of the queue length and the amount of
+ * time it has been in this state for.
+ *
+ * Normally, that accounting is done on IO completion, but that can result
+ * in more than a second's worth of IO being accounted for within any one
+ * second, leading to >100% utilisation.  To deal with that, we call this
+ * function to do a round-off before returning the results when reading
+ * /proc/diskstats.  This accounts immediately for all queue usage up to
+ * the current jiffies and restarts the counters again.
+ */
+void disk_round_stats(struct gendisk *disk)
+{
+       unsigned long now = jiffies;
+
+       if (now == disk->stamp)
+               return;
+
+       if (disk->in_flight) {
+               __disk_stat_add(disk, time_in_queue,
+                               disk->in_flight * (now - disk->stamp));
+               __disk_stat_add(disk, io_ticks, (now - disk->stamp));
+       }
+       disk->stamp = now;
+}
+
+EXPORT_SYMBOL_GPL(disk_round_stats);
+
+/*
+ * queue lock must be held
+ */
+void __blk_put_request(struct request_queue *q, struct request *req)
+{
+       if (unlikely(!q))
+               return;
+       if (unlikely(--req->ref_count))
+               return;
+
+       elv_completed_request(q, req);
+
+       /*
+        * Request may not have originated from ll_rw_blk.  If not,
+        * it didn't come out of our reserved rq pools.
+        */
+       if (req->cmd_flags & REQ_ALLOCED) {
+               int rw = rq_data_dir(req);
+               int priv = req->cmd_flags & REQ_ELVPRIV;
+
+               BUG_ON(!list_empty(&req->queuelist));
+               BUG_ON(!hlist_unhashed(&req->hash));
+
+               blk_free_request(q, req);
+               freed_request(q, rw, priv);
+       }
+}
+
+EXPORT_SYMBOL_GPL(__blk_put_request);
+
+void blk_put_request(struct request *req)
+{
+       unsigned long flags;
+       struct request_queue *q = req->q;
+
+       /*
+        * Gee, IDE calls in w/ NULL q.  Fix IDE and remove the
+        * following if (q) test.
+        */
+       if (q) {
+               spin_lock_irqsave(q->queue_lock, flags);
+               __blk_put_request(q, req);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       }
+}
+
+EXPORT_SYMBOL(blk_put_request);
+
+void init_request_from_bio(struct request *req, struct bio *bio)
+{
+       req->cmd_type = REQ_TYPE_FS;
+
+       /*
+        * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+        */
+       if (bio_rw_ahead(bio) || bio_failfast(bio))
+               req->cmd_flags |= REQ_FAILFAST;
+
+       /*
+        * REQ_BARRIER implies no merging, but let's make it explicit
+        */
+       if (unlikely(bio_barrier(bio)))
+               req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+
+       if (bio_sync(bio))
+               req->cmd_flags |= REQ_RW_SYNC;
+       if (bio_rw_meta(bio))
+               req->cmd_flags |= REQ_RW_META;
+
+       req->errors = 0;
+       req->hard_sector = req->sector = bio->bi_sector;
+       req->ioprio = bio_prio(bio);
+       req->start_time = jiffies;
+       blk_rq_bio_prep(req->q, req, bio);
+}
+
+static int __make_request(struct request_queue *q, struct bio *bio)
+{
+       struct request *req;
+       int el_ret, nr_sectors, barrier, err;
+       const unsigned short prio = bio_prio(bio);
+       const int sync = bio_sync(bio);
+       int rw_flags;
+
+       nr_sectors = bio_sectors(bio);
+
+       /*
+        * low level driver can indicate that it wants pages above a
+        * certain limit bounced to low memory (ie for highmem, or even
+        * ISA dma in theory)
+        */
+       blk_queue_bounce(q, &bio);
+
+       barrier = bio_barrier(bio);
+       if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
+
+       spin_lock_irq(q->queue_lock);
+
+       if (unlikely(barrier) || elv_queue_empty(q))
+               goto get_rq;
+
+       el_ret = elv_merge(q, &req, bio);
+       switch (el_ret) {
+               case ELEVATOR_BACK_MERGE:
+                       BUG_ON(!rq_mergeable(req));
+
+                       if (!ll_back_merge_fn(q, req, bio))
+                               break;
+
+                       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
+                       req->biotail->bi_next = bio;
+                       req->biotail = bio;
+                       req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+                       req->ioprio = ioprio_best(req->ioprio, prio);
+                       drive_stat_acct(req, 0);
+                       if (!attempt_back_merge(q, req))
+                               elv_merged_request(q, req, el_ret);
+                       goto out;
+
+               case ELEVATOR_FRONT_MERGE:
+                       BUG_ON(!rq_mergeable(req));
+
+                       if (!ll_front_merge_fn(q, req, bio))
+                               break;
+
+                       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
+                       bio->bi_next = req->bio;
+                       req->bio = bio;
+
+                       /*
+                        * may not be valid.  If the low level driver said
+                        * it didn't need a bounce buffer then it had better
+                        * not touch req->buffer either...
+                        */
+                       req->buffer = bio_data(bio);
+                       req->current_nr_sectors = bio_cur_sectors(bio);
+                       req->hard_cur_sectors = req->current_nr_sectors;
+                       req->sector = req->hard_sector = bio->bi_sector;
+                       req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+                       req->ioprio = ioprio_best(req->ioprio, prio);
+                       drive_stat_acct(req, 0);
+                       if (!attempt_front_merge(q, req))
+                               elv_merged_request(q, req, el_ret);
+                       goto out;
+
+               /* ELV_NO_MERGE: elevator says don't/can't merge. */
+               default:
+                       ;
+       }
+
+get_rq:
+       /*
+        * This sync check and mask will be re-done in init_request_from_bio(),
+        * but we need to set it earlier to expose the sync flag to the
+        * rq allocator and io schedulers.
+        */
+       rw_flags = bio_data_dir(bio);
+       if (sync)
+               rw_flags |= REQ_RW_SYNC;
+
+       /*
+        * Grab a free request. This may sleep but cannot fail.
+        * Returns with the queue unlocked.
+        */
+       req = get_request_wait(q, rw_flags, bio);
+
+       /*
+        * After dropping the lock and possibly sleeping here, our request
+        * may now be mergeable after it had proven unmergeable (above).
+        * We don't worry about that case for efficiency. It won't happen
+        * often, and the elevators are able to handle it.
+        */
+       init_request_from_bio(req, bio);
+
+       spin_lock_irq(q->queue_lock);
+       if (elv_queue_empty(q))
+               blk_plug_device(q);
+       add_request(q, req);
+out:
+       if (sync)
+               __generic_unplug_device(q);
+
+       spin_unlock_irq(q->queue_lock);
+       return 0;
+
+end_io:
+       bio_endio(bio, err);
+       return 0;
+}
+
+/*
+ * If bio->bi_bdev points to a partition, remap the location
+ */
+static inline void blk_partition_remap(struct bio *bio)
+{
+       struct block_device *bdev = bio->bi_bdev;
+
+       if (bio_sectors(bio) && bdev != bdev->bd_contains) {
+               struct hd_struct *p = bdev->bd_part;
+               const int rw = bio_data_dir(bio);
+
+               p->sectors[rw] += bio_sectors(bio);
+               p->ios[rw]++;
+
+               bio->bi_sector += p->start_sect;
+               bio->bi_bdev = bdev->bd_contains;
+
+               blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+                                   bdev->bd_dev, bio->bi_sector,
+                                   bio->bi_sector - p->start_sect);
+       }
+}
+
+static void handle_bad_sector(struct bio *bio)
+{
+       char b[BDEVNAME_SIZE];
+
+       printk(KERN_INFO "attempt to access beyond end of device\n");
+       printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
+                       bdevname(bio->bi_bdev, b),
+                       bio->bi_rw,
+                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
+                       (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
+
+       set_bit(BIO_EOF, &bio->bi_flags);
+}
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+
+static DECLARE_FAULT_ATTR(fail_make_request);
+
+static int __init setup_fail_make_request(char *str)
+{
+       return setup_fault_attr(&fail_make_request, str);
+}
+__setup("fail_make_request=", setup_fail_make_request);
+
+static int should_fail_request(struct bio *bio)
+{
+       if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
+           (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
+               return should_fail(&fail_make_request, bio->bi_size);
+
+       return 0;
+}
+
+static int __init fail_make_request_debugfs(void)
+{
+       return init_fault_attr_dentries(&fail_make_request,
+                                       "fail_make_request");
+}
+
+late_initcall(fail_make_request_debugfs);
+
+#else /* CONFIG_FAIL_MAKE_REQUEST */
+
+static inline int should_fail_request(struct bio *bio)
+{
+       return 0;
+}
+
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
+
+/*
+ * Check whether this bio extends beyond the end of the device.
+ */
+static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
+{
+       sector_t maxsector;
+
+       if (!nr_sectors)
+               return 0;
+
+       /* Test device or partition size, when known. */
+       maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+       if (maxsector) {
+               sector_t sector = bio->bi_sector;
+
+               if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
+                       /*
+                        * This may well happen - the kernel calls bread()
+                        * without checking the size of the device, e.g., when
+                        * mounting a device.
+                        */
+                       handle_bad_sector(bio);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * generic_make_request: hand a buffer to its device driver for I/O
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status.  The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) elsewhere.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * is set up to describe the memory buffer, that bi_bdev and bi_sector
+ * are set to describe the device address, and that bi_end_io and
+ * optionally bi_private are set to describe how completion
+ * notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may change bi_bdev and
+ * bi_sector for remaps as they see fit.  So the values of these fields
+ * should NOT be depended on after the call to generic_make_request.
+ */
+static inline void __generic_make_request(struct bio *bio)
+{
+       struct request_queue *q;
+       sector_t old_sector;
+       int ret, nr_sectors = bio_sectors(bio);
+       dev_t old_dev;
+       int err = -EIO;
+
+       might_sleep();
+
+       if (bio_check_eod(bio, nr_sectors))
+               goto end_io;
+
+       /*
+        * Resolve the mapping until finished. (drivers are
+        * still free to implement/resolve their own stacking
+        * by explicitly returning 0)
+        *
+        * NOTE: we don't repeat the blk_size check for each new device.
+        * Stacking drivers are expected to know what they are doing.
+        */
+       old_sector = -1;
+       old_dev = 0;
+       do {
+               char b[BDEVNAME_SIZE];
+
+               q = bdev_get_queue(bio->bi_bdev);
+               if (!q) {
+                       printk(KERN_ERR
+                              "generic_make_request: Trying to access "
+                               "nonexistent block-device %s (%Lu)\n",
+                               bdevname(bio->bi_bdev, b),
+                               (long long) bio->bi_sector);
+end_io:
+                       bio_endio(bio, err);
+                       break;
+               }
+
+               if (unlikely(nr_sectors > q->max_hw_sectors)) {
+                       printk("bio too big device %s (%u > %u)\n", 
+                               bdevname(bio->bi_bdev, b),
+                               bio_sectors(bio),
+                               q->max_hw_sectors);
+                       goto end_io;
+               }
+
+               if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+                       goto end_io;
+
+               if (should_fail_request(bio))
+                       goto end_io;
+
+               /*
+                * If this device has partitions, remap block n
+                * of partition p to block n+start(p) of the disk.
+                */
+               blk_partition_remap(bio);
+
+               if (old_sector != -1)
+                       blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+                                           old_sector);
+
+               blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+               old_sector = bio->bi_sector;
+               old_dev = bio->bi_bdev->bd_dev;
+
+               if (bio_check_eod(bio, nr_sectors))
+                       goto end_io;
+               if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+                       err = -EOPNOTSUPP;
+                       goto end_io;
+               }
+
+               ret = q->make_request_fn(q, bio);
+       } while (ret);
+}
+
+/*
+ * We only want one ->make_request_fn to be active at a time,
+ * else stack usage with stacked devices could be a problem.
+ * So use current->bio_{list,tail} to keep a list of requests
+ * submitted by a make_request_fn function.
+ * current->bio_tail is also used as a flag to say if
+ * generic_make_request is currently active in this task or not.
+ * If it is NULL, then no make_request is active.  If it is non-NULL,
+ * then a make_request is active, and new requests should be added
+ * at the tail
+ */
+void generic_make_request(struct bio *bio)
+{
+       if (current->bio_tail) {
+               /* make_request is active */
+               *(current->bio_tail) = bio;
+               bio->bi_next = NULL;
+               current->bio_tail = &bio->bi_next;
+               return;
+       }
+       /* following loop may be a bit non-obvious, and so deserves some
+        * explanation.
+        * Before entering the loop, bio->bi_next is NULL (as all callers
+        * ensure that) so we have a list with a single bio.
+        * We pretend that we have just taken it off a longer list, so
+        * we assign bio_list to the next (which is NULL) and bio_tail
+        * to &bio_list, thus initialising the bio_list of new bios to be
+        * added.  __generic_make_request may indeed add some more bios
+        * through a recursive call to generic_make_request.  If it
+        * did, we find a non-NULL value in bio_list and re-enter the loop
+        * from the top.  In this case we really did just take the bio
+        * off the top of the list (no pretending) and so fix up bio_list and
+        * bio_tail or bi_next, and call into __generic_make_request again.
+        *
+        * The loop was structured like this to make only one call to
+        * __generic_make_request (which is important as it is large and
+        * inlined) and to keep the structure simple.
+        */
+       BUG_ON(bio->bi_next);
+       do {
+               current->bio_list = bio->bi_next;
+               if (bio->bi_next == NULL)
+                       current->bio_tail = &current->bio_list;
+               else
+                       bio->bi_next = NULL;
+               __generic_make_request(bio);
+               bio = current->bio_list;
+       } while (bio);
+       current->bio_tail = NULL; /* deactivate */
+}
+
+EXPORT_SYMBOL(generic_make_request);
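
For illustration only (not part of this commit): a minimal stacking make_request_fn of the kind the loop above caters for. The linear_dev structure and its fields are invented; the point is the remap-and-resubmit pattern, which leans on the current->bio_list handling above to keep recursion shallow.

#include <linux/bio.h>
#include <linux/blkdev.h>

struct linear_dev {
        struct block_device *lower_bdev;        /* device the I/O is forwarded to */
        sector_t start;                         /* offset on the lower device */
};

static int linear_make_request(struct request_queue *q, struct bio *bio)
{
        struct linear_dev *dev = q->queuedata;

        /* remap onto the underlying device */
        bio->bi_bdev = dev->lower_bdev;
        bio->bi_sector += dev->start;

        /*
         * Resubmit: if this call nests, generic_make_request() only queues
         * the bio on current->bio_list and unwinds, so stack usage stays
         * bounded no matter how deeply devices are stacked.
         */
        generic_make_request(bio);

        /* returning 0 tells __generic_make_request() the bio was consumed */
        return 0;
}
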
+
+/**
+ * submit_bio: submit a bio to the block device layer for I/O
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * submit_bio() is very similar in purpose to generic_make_request(), and
+ * uses that function to do most of the work. Both are fairly rough
+ * interfaces; @bio must be set up and ready for I/O.
+ *
+ */
+void submit_bio(int rw, struct bio *bio)
+{
+       int count = bio_sectors(bio);
+
+       bio->bi_rw |= rw;
+
+       /*
+        * If it's a regular read/write or a barrier with data attached,
+        * go through the normal accounting stuff before submission.
+        */
+       if (!bio_empty_barrier(bio)) {
+
+               BIO_BUG_ON(!bio->bi_size);
+               BIO_BUG_ON(!bio->bi_io_vec);
+
+               if (rw & WRITE) {
+                       count_vm_events(PGPGOUT, count);
+               } else {
+                       task_io_account_read(bio->bi_size);
+                       count_vm_events(PGPGIN, count);
+               }
+
+               if (unlikely(block_dump)) {
+                       char b[BDEVNAME_SIZE];
+                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+                       current->comm, task_pid_nr(current),
+                               (rw & WRITE) ? "WRITE" : "READ",
+                               (unsigned long long)bio->bi_sector,
+                               bdevname(bio->bi_bdev,b));
+               }
+       }
+
+       generic_make_request(bio);
+}
+
+EXPORT_SYMBOL(submit_bio);
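
A usage sketch (editor's example, not from this patch): building a single-page READ and handing it to submit_bio(). The example_* names are hypothetical; the end_io signature follows bio_end_io_t as used in this tree.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

static void example_end_io(struct bio *bio, int error)
{
        complete((struct completion *) bio->bi_private);
        bio_put(bio);
}

static int example_read_sector(struct block_device *bdev, sector_t sector,
                               struct page *page)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;                    /* device address */
        bio->bi_sector = sector;                /* starting sector */
        bio_add_page(bio, page, 512, 0);        /* memory buffer */
        bio->bi_end_io = example_end_io;        /* completion notification */
        bio->bi_private = &done;

        submit_bio(READ, bio);
        wait_for_completion(&done);
        return 0;
}
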
+
+/**
+ * __end_that_request_first - end I/O on a request
+ * @req:      the request being processed
+ * @error:    0 for success, < 0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @req, and sets it up
+ *     for the next range of segments (if any) in the cluster.
+ *
+ * Return:
+ *     0 - we are done with this request, call end_that_request_last()
+ *     1 - still buffers pending for this request
+ **/
+static int __end_that_request_first(struct request *req, int error,
+                                   int nr_bytes)
+{
+       int total_bytes, bio_nbytes, next_idx = 0;
+       struct bio *bio;
+
+       blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
+       /*
+        * for a REQ_BLOCK_PC request, we want to carry any
+        * sense key with us all the way through
+        */
+       if (!blk_pc_request(req))
+               req->errors = 0;
+
+       if (error) {
+               if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
+                       printk("end_request: I/O error, dev %s, sector %llu\n",
+                               req->rq_disk ? req->rq_disk->disk_name : "?",
+                               (unsigned long long)req->sector);
+       }
+
+       if (blk_fs_request(req) && req->rq_disk) {
+               const int rw = rq_data_dir(req);
+
+               disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
+       }
+
+       total_bytes = bio_nbytes = 0;
+       while ((bio = req->bio) != NULL) {
+               int nbytes;
+
+               /*
+                * For an empty barrier request, the low level driver must
+                * store a potential error location in ->sector. We pass
+                * that back up in ->bi_sector.
+                */
+               if (blk_empty_barrier(req))
+                       bio->bi_sector = req->sector;
+
+               if (nr_bytes >= bio->bi_size) {
+                       req->bio = bio->bi_next;
+                       nbytes = bio->bi_size;
+                       req_bio_endio(req, bio, nbytes, error);
+                       next_idx = 0;
+                       bio_nbytes = 0;
+               } else {
+                       int idx = bio->bi_idx + next_idx;
+
+                       if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+                               blk_dump_rq_flags(req, "__end_that");
+                               printk("%s: bio idx %d >= vcnt %d\n",
+                                               __FUNCTION__,
+                                               bio->bi_idx, bio->bi_vcnt);
+                               break;
+                       }
+
+                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
+                       BIO_BUG_ON(nbytes > bio->bi_size);
+
+                       /*
+                        * not a complete bvec done
+                        */
+                       if (unlikely(nbytes > nr_bytes)) {
+                               bio_nbytes += nr_bytes;
+                               total_bytes += nr_bytes;
+                               break;
+                       }
+
+                       /*
+                        * advance to the next vector
+                        */
+                       next_idx++;
+                       bio_nbytes += nbytes;
+               }
+
+               total_bytes += nbytes;
+               nr_bytes -= nbytes;
+
+               if ((bio = req->bio)) {
+                       /*
+                        * end more in this run, or just return 'not-done'
+                        */
+                       if (unlikely(nr_bytes <= 0))
+                               break;
+               }
+       }
+
+       /*
+        * completely done
+        */
+       if (!req->bio)
+               return 0;
+
+       /*
+        * if the request wasn't completed, update state
+        */
+       if (bio_nbytes) {
+               req_bio_endio(req, bio, bio_nbytes, error);
+               bio->bi_idx += next_idx;
+               bio_iovec(bio)->bv_offset += nr_bytes;
+               bio_iovec(bio)->bv_len -= nr_bytes;
+       }
+
+       blk_recalc_rq_sectors(req, total_bytes >> 9);
+       blk_recalc_rq_segments(req);
+       return 1;
+}
+
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+       struct list_head *cpu_list, local_list;
+
+       local_irq_disable();
+       cpu_list = &__get_cpu_var(blk_cpu_done);
+       list_replace_init(cpu_list, &local_list);
+       local_irq_enable();
+
+       while (!list_empty(&local_list)) {
+               struct request *rq = list_entry(local_list.next, struct request, donelist);
+
+               list_del_init(&rq->donelist);
+               rq->q->softirq_done_fn(rq);
+       }
+}
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
+                         void *hcpu)
+{
+       /*
+        * If a CPU goes away, splice its entries to the current CPU
+        * and trigger a run of the softirq
+        */
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               int cpu = (unsigned long) hcpu;
+
+               local_irq_disable();
+               list_splice_init(&per_cpu(blk_cpu_done, cpu),
+                                &__get_cpu_var(blk_cpu_done));
+               raise_softirq_irqoff(BLOCK_SOFTIRQ);
+               local_irq_enable();
+       }
+
+       return NOTIFY_OK;
+}
+
+
+static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+       .notifier_call  = blk_cpu_notify,
+};
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completion callback
+ *     through requeueing. The actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+       struct list_head *cpu_list;
+       unsigned long flags;
+
+       BUG_ON(!req->q->softirq_done_fn);
+               
+       local_irq_save(flags);
+
+       cpu_list = &__get_cpu_var(blk_cpu_done);
+       list_add_tail(&req->donelist, cpu_list);
+       raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+       local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(blk_complete_request);
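
A hedged sketch of the deferred-completion flow described above (not part of this commit; the mydrv_* names are hypothetical): the hard interrupt handler only queues the request, and the softirq_done_fn registered with blk_queue_softirq_done() finishes it later.

#include <linux/blkdev.h>
#include <linux/interrupt.h>

/* registered once at setup time: blk_queue_softirq_done(q, mydrv_softirq_done); */
static void mydrv_softirq_done(struct request *rq)
{
        /* runs in softirq context, queue lock not held */
        blk_end_request(rq, rq->errors ? -EIO : 0, blk_rq_bytes(rq));
}

static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
{
        struct request *rq = dev_id;    /* however the driver tracks its in-flight request */

        blk_complete_request(rq);       /* the real completion runs from the block softirq */
        return IRQ_HANDLED;
}
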
+       
+/*
+ * queue lock must be held
+ */
+static void end_that_request_last(struct request *req, int error)
+{
+       struct gendisk *disk = req->rq_disk;
+
+       if (blk_rq_tagged(req))
+               blk_queue_end_tag(req->q, req);
+
+       if (blk_queued_rq(req))
+               blkdev_dequeue_request(req);
+
+       if (unlikely(laptop_mode) && blk_fs_request(req))
+               laptop_io_completion();
+
+       /*
+        * Account IO completion.  bar_rq isn't accounted as a normal
+        * IO on queueing or completion.  Accounting the containing
+        * request is enough.
+        */
+       if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
+               unsigned long duration = jiffies - req->start_time;
+               const int rw = rq_data_dir(req);
+
+               __disk_stat_inc(disk, ios[rw]);
+               __disk_stat_add(disk, ticks[rw], duration);
+               disk_round_stats(disk);
+               disk->in_flight--;
+       }
+
+       if (req->end_io)
+               req->end_io(req, error);
+       else {
+               if (blk_bidi_rq(req))
+                       __blk_put_request(req->next_rq->q, req->next_rq);
+
+               __blk_put_request(req->q, req);
+       }
+}
+
+static inline void __end_request(struct request *rq, int uptodate,
+                                unsigned int nr_bytes)
+{
+       int error = 0;
+
+       if (uptodate <= 0)
+               error = uptodate ? uptodate : -EIO;
+
+       __blk_end_request(rq, error, nr_bytes);
+}
+
+/**
+ * blk_rq_bytes - Returns bytes left to complete in the entire request
+ **/
+unsigned int blk_rq_bytes(struct request *rq)
+{
+       if (blk_fs_request(rq))
+               return rq->hard_nr_sectors << 9;
+
+       return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_bytes);
+
+/**
+ * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
+ **/
+unsigned int blk_rq_cur_bytes(struct request *rq)
+{
+       if (blk_fs_request(rq))
+               return rq->current_nr_sectors << 9;
+
+       if (rq->bio)
+               return rq->bio->bi_size;
+
+       return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
+
+/**
+ * end_queued_request - end all I/O on a queued request
+ * @rq:                the request being processed
+ * @uptodate:  error value or 0/1 uptodate flag
+ *
+ * Description:
+ *     Ends all I/O on a request, and removes it from the block layer queues.
+ *     Not suitable for normal IO completion, unless the driver still has
+ *     the request attached to the block layer.
+ *
+ **/
+void end_queued_request(struct request *rq, int uptodate)
+{
+       __end_request(rq, uptodate, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(end_queued_request);
+
+/**
+ * end_dequeued_request - end all I/O on a dequeued request
+ * @rq:                the request being processed
+ * @uptodate:  error value or 0/1 uptodate flag
+ *
+ * Description:
+ *     Ends all I/O on a request. The request must already have been
+ *     dequeued using blkdev_dequeue_request(), as is normally the case
+ *     for most drivers.
+ *
+ **/
+void end_dequeued_request(struct request *rq, int uptodate)
+{
+       __end_request(rq, uptodate, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(end_dequeued_request);
+
+
+/**
+ * end_request - end I/O on the current segment of the request
+ * @req:       the request being processed
+ * @uptodate:  error value or 0/1 uptodate flag
+ *
+ * Description:
+ *     Ends I/O on the current segment of a request. If that is the only
+ *     remaining segment, the request is also completed and freed.
+ *
+ *     This is a remnant of how older block drivers handled IO completions.
+ *     Modern drivers typically end IO on the full request in one go, unless
+ *     they have a residual value to account for. For that case this function
+ *     isn't really useful, unless the residual just happens to be the
+ *     full current segment. In other words, don't use this function in new
+ *     code. Either use end_request_completely(), or the
+ *     end_that_request_chunk() (along with end_that_request_last()) for
+ *     partial completions.
+ *
+ **/
+void end_request(struct request *req, int uptodate)
+{
+       __end_request(req, uptodate, req->hard_cur_sectors << 9);
+}
+EXPORT_SYMBOL(end_request);
+
+/**
+ * blk_end_io - Generic end_io function to complete a request.
+ * @rq:           the request being processed
+ * @error:        0 for success, < 0 for error
+ * @nr_bytes:     number of bytes to complete @rq
+ * @bidi_bytes:   number of bytes to complete @rq->next_rq
+ * @drv_callback: function called between completion of bios in the request
+ *                and completion of the request.
+ *                If the callback returns non 0, this helper returns without
+ *                completion of the request.
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     0 - we are done with this request
+ *     1 - this request is not freed yet, it still has pending buffers.
+ **/
+static int blk_end_io(struct request *rq, int error, int nr_bytes,
+                     int bidi_bytes, int (drv_callback)(struct request *))
+{
+       struct request_queue *q = rq->q;
+       unsigned long flags = 0UL;
+
+       if (blk_fs_request(rq) || blk_pc_request(rq)) {
+               if (__end_that_request_first(rq, error, nr_bytes))
+                       return 1;
+
+               /* Bidi request must be completed as a whole */
+               if (blk_bidi_rq(rq) &&
+                   __end_that_request_first(rq->next_rq, error, bidi_bytes))
+                       return 1;
+       }
+
+       /* Special feature for tricky drivers */
+       if (drv_callback && drv_callback(rq))
+               return 1;
+
+       add_disk_randomness(rq->rq_disk);
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       end_that_request_last(rq, error);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       return 0;
+}
+
+/**
+ * blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    0 for success, < 0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     0 - we are done with this request
+ *     1 - still buffers pending for this request
+ **/
+int blk_end_request(struct request *rq, int error, int nr_bytes)
+{
+       return blk_end_io(rq, error, nr_bytes, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_end_request);
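
Typical driver-side use, sketched for illustration (names hypothetical): complete however many bytes have finished, and keep the request around if blk_end_request() reports buffers still pending.

#include <linux/blkdev.h>

static void mydrv_finish(struct request *rq, int error, unsigned int bytes)
{
        if (blk_end_request(rq, error, bytes)) {
                /*
                 * Non-zero return: only part of the request completed; it
                 * has been set up for the remaining segments and must be
                 * finished by a later call.
                 */
                return;
        }
        /* zero return: the request was fully completed and released */
}
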
+
+/**
+ * __blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    0 for success, < 0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Must be called with queue lock held unlike blk_end_request().
+ *
+ * Return:
+ *     0 - we are done with this request
+ *     1 - still buffers pending for this request
+ **/
+int __blk_end_request(struct request *rq, int error, int nr_bytes)
+{
+       if (blk_fs_request(rq) || blk_pc_request(rq)) {
+               if (__end_that_request_first(rq, error, nr_bytes))
+                       return 1;
+       }
+
+       add_disk_randomness(rq->rq_disk);
+
+       end_that_request_last(rq, error);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__blk_end_request);
+
+/**
+ * blk_end_bidi_request - Helper function for drivers to complete bidi request.
+ * @rq:         the bidi request being processed
+ * @error:      0 for success, < 0 for error
+ * @nr_bytes:   number of bytes to complete @rq
+ * @bidi_bytes: number of bytes to complete @rq->next_rq
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
+ *
+ * Return:
+ *     0 - we are done with this request
+ *     1 - still buffers pending for this request
+ **/
+int blk_end_bidi_request(struct request *rq, int error, int nr_bytes,
+                        int bidi_bytes)
+{
+       return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_end_bidi_request);
+
+/**
+ * blk_end_request_callback - Special helper function for tricky drivers
+ * @rq:           the request being processed
+ * @error:        0 for success, < 0 for error
+ * @nr_bytes:     number of bytes to complete
+ * @drv_callback: function called between completion of bios in the request
+ *                and completion of the request.
+ *                If the callback returns non 0, this helper returns without
+ *                completion of the request.
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ *     This special helper function is used only for existing tricky drivers.
+ *     (e.g. cdrom_newpc_intr() of ide-cd)
+ *     This interface will be removed when such drivers are rewritten.
+ *     Don't use this interface in other places anymore.
+ *
+ * Return:
+ *     0 - we are done with this request
+ *     1 - this request is not freed yet.
+ *         this request still has pending buffers or
+ *         the driver doesn't want to finish this request yet.
+ **/
+int blk_end_request_callback(struct request *rq, int error, int nr_bytes,
+                            int (drv_callback)(struct request *))
+{
+       return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
+}
+EXPORT_SYMBOL_GPL(blk_end_request_callback);
+
+void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
+                    struct bio *bio)
+{
+       /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+       rq->cmd_flags |= (bio->bi_rw & 3);
+
+       rq->nr_phys_segments = bio_phys_segments(q, bio);
+       rq->nr_hw_segments = bio_hw_segments(q, bio);
+       rq->current_nr_sectors = bio_cur_sectors(bio);
+       rq->hard_cur_sectors = rq->current_nr_sectors;
+       rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
+       rq->buffer = bio_data(bio);
+       rq->data_len = bio->bi_size;
+
+       rq->bio = rq->biotail = bio;
+
+       if (bio->bi_bdev)
+               rq->rq_disk = bio->bi_bdev->bd_disk;
+}
+
+int kblockd_schedule_work(struct work_struct *work)
+{
+       return queue_work(kblockd_workqueue, work);
+}
+
+EXPORT_SYMBOL(kblockd_schedule_work);
+
+void kblockd_flush_work(struct work_struct *work)
+{
+       cancel_work_sync(work);
+}
+EXPORT_SYMBOL(kblockd_flush_work);
+
+int __init blk_dev_init(void)
+{
+       int i;
+
+       kblockd_workqueue = create_workqueue("kblockd");
+       if (!kblockd_workqueue)
+               panic("Failed to create kblockd\n");
+
+       request_cachep = kmem_cache_create("blkdev_requests",
+                       sizeof(struct request), 0, SLAB_PANIC, NULL);
+
+       blk_requestq_cachep = kmem_cache_create("blkdev_queue",
+                       sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+       register_hotcpu_notifier(&blk_cpu_notifier);
+
+       return 0;
+}
+
diff --git a/block/blk-exec.c b/block/blk-exec.c
new file mode 100644 (file)
index 0000000..ebfb44e
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * Functions related to executing requests on a block device queue
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+
+/*
+ * for max sense size
+ */
+#include <scsi/scsi_cmnd.h>
+
+/**
+ * blk_end_sync_rq - executes a completion event on a request
+ * @rq: request to complete
+ * @error: end io status of the request
+ */
+void blk_end_sync_rq(struct request *rq, int error)
+{
+       struct completion *waiting = rq->end_io_data;
+
+       rq->end_io_data = NULL;
+       __blk_put_request(rq->q, rq);
+
+       /*
+        * complete last; if this is a stack request, the process (and thus
+        * the rq pointer) could be invalid right after this complete()
+        */
+       complete(waiting);
+}
+EXPORT_SYMBOL(blk_end_sync_rq);
+
+/**
+ * blk_execute_rq_nowait - insert a request into queue for execution
+ * @q:         queue to insert the request in
+ * @bd_disk:   matching gendisk
+ * @rq:                request to insert
+ * @at_head:    insert request at head or tail of queue
+ * @done:      I/O completion handler
+ *
+ * Description:
+ *    Insert a fully prepared request at the back of the io scheduler queue
+ *    for execution.  Don't wait for completion.
+ */
+void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
+                          struct request *rq, int at_head,
+                          rq_end_io_fn *done)
+{
+       int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+
+       rq->rq_disk = bd_disk;
+       rq->cmd_flags |= REQ_NOMERGE;
+       rq->end_io = done;
+       WARN_ON(irqs_disabled());
+       spin_lock_irq(q->queue_lock);
+       __elv_add_request(q, rq, where, 1);
+       __generic_unplug_device(q);
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
+/**
+ * blk_execute_rq - insert a request into queue for execution
+ * @q:         queue to insert the request in
+ * @bd_disk:   matching gendisk
+ * @rq:                request to insert
+ * @at_head:    insert request at head or tail of queue
+ *
+ * Description:
+ *    Insert a fully prepared request at the back of the io scheduler queue
+ *    for execution and wait for completion.
+ */
+int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+                  struct request *rq, int at_head)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+       char sense[SCSI_SENSE_BUFFERSIZE];
+       int err = 0;
+
+       /*
+        * we need an extra reference to the request, so we can look at
+        * it after io completion
+        */
+       rq->ref_count++;
+
+       if (!rq->sense) {
+               memset(sense, 0, sizeof(sense));
+               rq->sense = sense;
+               rq->sense_len = 0;
+       }
+
+       rq->end_io_data = &wait;
+       blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
+       wait_for_completion(&wait);
+
+       if (rq->errors)
+               err = -EIO;
+
+       return err;
+}
+
+EXPORT_SYMBOL(blk_execute_rq);
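
A usage sketch for the synchronous path (illustrative only; the SCSI opcode and timeout are assumptions): issue a no-data BLOCK_PC command and wait for it with blk_execute_rq().

#include <linux/blkdev.h>
#include <scsi/scsi.h>

static int example_test_unit_ready(struct request_queue *q, struct gendisk *disk)
{
        struct request *rq;
        int err;

        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        rq->cmd[0] = TEST_UNIT_READY;
        rq->cmd_len = 6;
        rq->timeout = 10 * HZ;

        err = blk_execute_rq(q, disk, rq, 0);   /* sleeps until completion */
        blk_put_request(rq);
        return err;
}
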
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
new file mode 100644 (file)
index 0000000..6d16755
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * Functions related to io context handling
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/bootmem.h>     /* for max_pfn/max_low_pfn */
+
+#include "blk.h"
+
+/*
+ * For io context allocations
+ */
+static struct kmem_cache *iocontext_cachep;
+
+static void cfq_dtor(struct io_context *ioc)
+{
+       struct cfq_io_context *cic[1];
+       int r;
+
+       /*
+        * We don't have a specific key to look up with, so use the gang
+        * lookup to just retrieve the first item stored. The cfq exit
+        * function will iterate the full tree, so any member will do.
+        */
+       r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
+       if (r > 0)
+               cic[0]->dtor(ioc);
+}
+
+/*
+ * IO Context helper functions. put_io_context() returns 1 if there are no
+ * more users of this io context, 0 otherwise.
+ */
+int put_io_context(struct io_context *ioc)
+{
+       if (ioc == NULL)
+               return 1;
+
+       BUG_ON(atomic_read(&ioc->refcount) == 0);
+
+       if (atomic_dec_and_test(&ioc->refcount)) {
+               rcu_read_lock();
+               if (ioc->aic && ioc->aic->dtor)
+                       ioc->aic->dtor(ioc->aic);
+               rcu_read_unlock();
+               cfq_dtor(ioc);
+
+               kmem_cache_free(iocontext_cachep, ioc);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(put_io_context);
+
+static void cfq_exit(struct io_context *ioc)
+{
+       struct cfq_io_context *cic[1];
+       int r;
+
+       rcu_read_lock();
+       /*
+        * See comment for cfq_dtor()
+        */
+       r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
+       rcu_read_unlock();
+
+       if (r > 0)
+               cic[0]->exit(ioc);
+}
+
+/* Called by the exiting task */
+void exit_io_context(void)
+{
+       struct io_context *ioc;
+
+       task_lock(current);
+       ioc = current->io_context;
+       current->io_context = NULL;
+       task_unlock(current);
+
+       if (atomic_dec_and_test(&ioc->nr_tasks)) {
+               if (ioc->aic && ioc->aic->exit)
+                       ioc->aic->exit(ioc->aic);
+               cfq_exit(ioc);
+
+               put_io_context(ioc);
+       }
+}
+
+struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
+{
+       struct io_context *ret;
+
+       ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
+       if (ret) {
+               atomic_set(&ret->refcount, 1);
+               atomic_set(&ret->nr_tasks, 1);
+               spin_lock_init(&ret->lock);
+               ret->ioprio_changed = 0;
+               ret->ioprio = 0;
+               ret->last_waited = jiffies; /* doesn't matter... */
+               ret->nr_batch_requests = 0; /* because this is 0 */
+               ret->aic = NULL;
+               INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
+               ret->ioc_data = NULL;
+       }
+
+       return ret;
+}
+
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * Otherwise, return its existing IO context.
+ *
+ * This returned IO context doesn't have a specifically elevated refcount,
+ * but since the current task itself holds a reference, the context can be
+ * used in general code, so long as it stays within `current` context.
+ */
+struct io_context *current_io_context(gfp_t gfp_flags, int node)
+{
+       struct task_struct *tsk = current;
+       struct io_context *ret;
+
+       ret = tsk->io_context;
+       if (likely(ret))
+               return ret;
+
+       ret = alloc_io_context(gfp_flags, node);
+       if (ret) {
+               /* make sure set_task_ioprio() sees the settings above */
+               smp_wmb();
+               tsk->io_context = ret;
+       }
+
+       return ret;
+}
+
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * If it does have a context, take a ref on it.
+ *
+ * This is always called in the context of the task which submitted the I/O.
+ */
+struct io_context *get_io_context(gfp_t gfp_flags, int node)
+{
+       struct io_context *ret = NULL;
+
+       /*
+        * Check for unlikely race with exiting task. ioc ref count is
+        * zero when ioc is being detached.
+        */
+       do {
+               ret = current_io_context(gfp_flags, node);
+               if (unlikely(!ret))
+                       break;
+       } while (!atomic_inc_not_zero(&ret->refcount));
+
+       return ret;
+}
+EXPORT_SYMBOL(get_io_context);
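
A small sketch of the reference discipline these helpers implement (illustrative, not from this patch): take a reference with get_io_context() before handing the context to an I/O submission path, and drop it with put_io_context() afterwards.

#include <linux/blkdev.h>

static void example_ioc_reference(void)
{
        struct io_context *ioc;

        ioc = get_io_context(GFP_NOIO, -1);     /* -1: no NUMA node preference */
        if (!ioc)
                return;

        /* ... hand ioc to the I/O being issued on behalf of current ... */

        put_io_context(ioc);                    /* drop the reference taken above */
}
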
+
+void copy_io_context(struct io_context **pdst, struct io_context **psrc)
+{
+       struct io_context *src = *psrc;
+       struct io_context *dst = *pdst;
+
+       if (src) {
+               BUG_ON(atomic_read(&src->refcount) == 0);
+               atomic_inc(&src->refcount);
+               put_io_context(dst);
+               *pdst = src;
+       }
+}
+EXPORT_SYMBOL(copy_io_context);
+
+void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
+{
+       struct io_context *temp;
+       temp = *ioc1;
+       *ioc1 = *ioc2;
+       *ioc2 = temp;
+}
+EXPORT_SYMBOL(swap_io_context);
+
+int __init blk_ioc_init(void)
+{
+       iocontext_cachep = kmem_cache_create("blkdev_ioc",
+                       sizeof(struct io_context), 0, SLAB_PANIC, NULL);
+       return 0;
+}
+subsys_initcall(blk_ioc_init);
diff --git a/block/blk-map.c b/block/blk-map.c
new file mode 100644 (file)
index 0000000..916cfc9
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * Functions related to mapping data to requests
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+
+int blk_rq_append_bio(struct request_queue *q, struct request *rq,
+                     struct bio *bio)
+{
+       if (!rq->bio)
+               blk_rq_bio_prep(q, rq, bio);
+       else if (!ll_back_merge_fn(q, rq, bio))
+               return -EINVAL;
+       else {
+               rq->biotail->bi_next = bio;
+               rq->biotail = bio;
+
+               rq->data_len += bio->bi_size;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(blk_rq_append_bio);
+
+static int __blk_rq_unmap_user(struct bio *bio)
+{
+       int ret = 0;
+
+       if (bio) {
+               if (bio_flagged(bio, BIO_USER_MAPPED))
+                       bio_unmap_user(bio);
+               else
+                       ret = bio_uncopy_user(bio);
+       }
+
+       return ret;
+}
+
+static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
+                            void __user *ubuf, unsigned int len)
+{
+       unsigned long uaddr;
+       struct bio *bio, *orig_bio;
+       int reading, ret;
+
+       reading = rq_data_dir(rq) == READ;
+
+       /*
+        * if alignment requirement is satisfied, map in user pages for
+        * direct dma. else, set up kernel bounce buffers
+        */
+       uaddr = (unsigned long) ubuf;
+       if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
+               bio = bio_map_user(q, NULL, uaddr, len, reading);
+       else
+               bio = bio_copy_user(q, uaddr, len, reading);
+
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       orig_bio = bio;
+       blk_queue_bounce(q, &bio);
+
+       /*
+        * We link the bounce buffer in and could have to traverse it
+        * later so we have to get a ref to prevent it from being freed
+        */
+       bio_get(bio);
+
+       ret = blk_rq_append_bio(q, rq, bio);
+       if (!ret)
+               return bio->bi_size;
+
+       /* if it was bounced we must call the end io function */
+       bio_endio(bio, 0);
+       __blk_rq_unmap_user(orig_bio);
+       bio_put(bio);
+       return ret;
+}
+
+/**
+ * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
+ * @q:         request queue where request should be inserted
+ * @rq:                request structure to fill
+ * @ubuf:      the user buffer
+ * @len:       length of user data
+ *
+ * Description:
+ *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    a kernel bounce buffer is used.
+ *
+ *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    still in process context.
+ *
+ *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
+ *    before being submitted to the device, as pages mapped may be out of
+ *    reach. It's the caller's responsibility to make sure this happens. The
+ *    original bio must be passed back in to blk_rq_unmap_user() for proper
+ *    unmapping.
+ */
+int blk_rq_map_user(struct request_queue *q, struct request *rq,
+                   void __user *ubuf, unsigned long len)
+{
+       unsigned long bytes_read = 0;
+       struct bio *bio = NULL;
+       int ret;
+
+       if (len > (q->max_hw_sectors << 9))
+               return -EINVAL;
+       if (!len || !ubuf)
+               return -EINVAL;
+
+       while (bytes_read != len) {
+               unsigned long map_len, end, start;
+
+               map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
+               end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
+                                                               >> PAGE_SHIFT;
+               start = (unsigned long)ubuf >> PAGE_SHIFT;
+
+               /*
+                * A bad offset could cause us to require BIO_MAX_PAGES + 1
+                * pages. If this happens we just lower the requested
+                * mapping len by a page so that we can fit
+                */
+               if (end - start > BIO_MAX_PAGES)
+                       map_len -= PAGE_SIZE;
+
+               ret = __blk_rq_map_user(q, rq, ubuf, map_len);
+               if (ret < 0)
+                       goto unmap_rq;
+               if (!bio)
+                       bio = rq->bio;
+               bytes_read += ret;
+               ubuf += ret;
+       }
+
+       rq->buffer = rq->data = NULL;
+       return 0;
+unmap_rq:
+       blk_rq_unmap_user(bio);
+       return ret;
+}
+
+EXPORT_SYMBOL(blk_rq_map_user);
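
A sketch of the contract spelled out above, loosely following the SG_IO-style pattern (illustrative; names and the elided command setup are assumptions): save rq->bio right after mapping, execute, then unmap with that saved bio while still in process context.

#include <linux/blkdev.h>

static int example_pc_io(struct request_queue *q, struct gendisk *disk,
                         void __user *ubuf, unsigned long len)
{
        struct request *rq;
        struct bio *bio;
        int err;

        rq = blk_get_request(q, WRITE, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        /* rq->cmd, rq->cmd_len and rq->timeout would be filled in here */

        err = blk_rq_map_user(q, rq, ubuf, len);
        if (err)
                goto out;

        bio = rq->bio;                  /* keep the original bio for unmapping */
        err = blk_execute_rq(q, disk, rq, 0);

        if (blk_rq_unmap_user(bio))
                err = err ? err : -EFAULT;
out:
        blk_put_request(rq);
        return err;
}
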
+
+/**
+ * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * @q:         request queue where request should be inserted
+ * @rq:                request to map data to
+ * @iov:       pointer to the iovec
+ * @iov_count: number of elements in the iovec
+ * @len:       I/O byte count
+ *
+ * Description:
+ *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    a kernel bounce buffer is used.
+ *
+ *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    still in process context.
+ *
+ *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
+ *    before being submitted to the device, as pages mapped may be out of
+ *    reach. It's the caller's responsibility to make sure this happens. The
+ *    original bio must be passed back in to blk_rq_unmap_user() for proper
+ *    unmapping.
+ */
+int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
+                       struct sg_iovec *iov, int iov_count, unsigned int len)
+{
+       struct bio *bio;
+
+       if (!iov || iov_count <= 0)
+               return -EINVAL;
+
+       /* we don't allow misaligned data like bio_map_user() does.  If the
+        * user is using sg, they're expected to know the alignment constraints
+        * and respect them accordingly */
+       bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       if (bio->bi_size != len) {
+               bio_endio(bio, 0);
+               bio_unmap_user(bio);
+               return -EINVAL;
+       }
+
+       bio_get(bio);
+       blk_rq_bio_prep(q, rq, bio);
+       rq->buffer = rq->data = NULL;
+       return 0;
+}
+
+EXPORT_SYMBOL(blk_rq_map_user_iov);
+
+/**
+ * blk_rq_unmap_user - unmap a request with user data
+ * @bio:              start of bio list
+ *
+ * Description:
+ *    Unmap a rq previously mapped by blk_rq_map_user(). The caller must
+ *    supply the original rq->bio from the blk_rq_map_user() return, since
+ *    the io completion may have changed rq->bio.
+ */
+int blk_rq_unmap_user(struct bio *bio)
+{
+       struct bio *mapped_bio;
+       int ret = 0, ret2;
+
+       while (bio) {
+               mapped_bio = bio;
+               if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
+                       mapped_bio = bio->bi_private;
+
+               ret2 = __blk_rq_unmap_user(mapped_bio);
+               if (ret2 && !ret)
+                       ret = ret2;
+
+               mapped_bio = bio;
+               bio = bio->bi_next;
+               bio_put(mapped_bio);
+       }
+
+       return ret;
+}
+
+EXPORT_SYMBOL(blk_rq_unmap_user);
+
+/**
+ * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
+ * @q:         request queue where request should be inserted
+ * @rq:                request to fill
+ * @kbuf:      the kernel buffer
+ * @len:       length of user data
+ * @gfp_mask:  memory allocation flags
+ */
+int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
+                   unsigned int len, gfp_t gfp_mask)
+{
+       struct bio *bio;
+
+       if (len > (q->max_hw_sectors << 9))
+               return -EINVAL;
+       if (!len || !kbuf)
+               return -EINVAL;
+
+       bio = bio_map_kern(q, kbuf, len, gfp_mask);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       if (rq_data_dir(rq) == WRITE)
+               bio->bi_rw |= (1 << BIO_RW);
+
+       blk_rq_bio_prep(q, rq, bio);
+       blk_queue_bounce(q, &rq->bio);
+       rq->buffer = rq->data = NULL;
+       return 0;
+}
+
+EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
new file mode 100644 (file)
index 0000000..5023f0b
--- /dev/null
@@ -0,0 +1,485 @@
+/*
+ * Functions related to segment and merge handling
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+void blk_recalc_rq_sectors(struct request *rq, int nsect)
+{
+       if (blk_fs_request(rq)) {
+               rq->hard_sector += nsect;
+               rq->hard_nr_sectors -= nsect;
+
+               /*
+                * Move the I/O submission pointers ahead if required.
+                */
+               if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
+                   (rq->sector <= rq->hard_sector)) {
+                       rq->sector = rq->hard_sector;
+                       rq->nr_sectors = rq->hard_nr_sectors;
+                       rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
+                       rq->current_nr_sectors = rq->hard_cur_sectors;
+                       rq->buffer = bio_data(rq->bio);
+               }
+
+               /*
+                * if total number of sectors is less than the first segment
+                * size, something has gone terribly wrong
+                */
+               if (rq->nr_sectors < rq->current_nr_sectors) {
+                       printk("blk: request botched\n");
+                       rq->nr_sectors = rq->current_nr_sectors;
+               }
+       }
+}
+
+void blk_recalc_rq_segments(struct request *rq)
+{
+       int nr_phys_segs;
+       int nr_hw_segs;
+       unsigned int phys_size;
+       unsigned int hw_size;
+       struct bio_vec *bv, *bvprv = NULL;
+       int seg_size;
+       int hw_seg_size;
+       int cluster;
+       struct req_iterator iter;
+       int high, highprv = 1;
+       struct request_queue *q = rq->q;
+
+       if (!rq->bio)
+               return;
+
+       cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
+       hw_seg_size = seg_size = 0;
+       phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+       rq_for_each_segment(bv, rq, iter) {
+               /*
+                * the trick here is making sure that a high page is never
+                * considered part of another segment, since that might
+                * change with the bounce page.
+                */
+               high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+               if (high || highprv)
+                       goto new_hw_segment;
+               if (cluster) {
+                       if (seg_size + bv->bv_len > q->max_segment_size)
+                               goto new_segment;
+                       if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+                               goto new_segment;
+                       if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+                               goto new_segment;
+                       if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+                               goto new_hw_segment;
+
+                       seg_size += bv->bv_len;
+                       hw_seg_size += bv->bv_len;
+                       bvprv = bv;
+                       continue;
+               }
+new_segment:
+               if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
+                   !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+                       hw_seg_size += bv->bv_len;
+               else {
+new_hw_segment:
+                       if (nr_hw_segs == 1 &&
+                           hw_seg_size > rq->bio->bi_hw_front_size)
+                               rq->bio->bi_hw_front_size = hw_seg_size;
+                       hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
+                       nr_hw_segs++;
+               }
+
+               nr_phys_segs++;
+               bvprv = bv;
+               seg_size = bv->bv_len;
+               highprv = high;
+       }
+
+       if (nr_hw_segs == 1 &&
+           hw_seg_size > rq->bio->bi_hw_front_size)
+               rq->bio->bi_hw_front_size = hw_seg_size;
+       if (hw_seg_size > rq->biotail->bi_hw_back_size)
+               rq->biotail->bi_hw_back_size = hw_seg_size;
+       rq->nr_phys_segments = nr_phys_segs;
+       rq->nr_hw_segments = nr_hw_segs;
+}
+
+void blk_recount_segments(struct request_queue *q, struct bio *bio)
+{
+       struct request rq;
+       struct bio *nxt = bio->bi_next;
+       rq.q = q;
+       rq.bio = rq.biotail = bio;
+       bio->bi_next = NULL;
+       blk_recalc_rq_segments(&rq);
+       bio->bi_next = nxt;
+       bio->bi_phys_segments = rq.nr_phys_segments;
+       bio->bi_hw_segments = rq.nr_hw_segments;
+       bio->bi_flags |= (1 << BIO_SEG_VALID);
+}
+EXPORT_SYMBOL(blk_recount_segments);
+
+static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
+                                  struct bio *nxt)
+{
+       if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
+               return 0;
+
+       if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+               return 0;
+       if (bio->bi_size + nxt->bi_size > q->max_segment_size)
+               return 0;
+
+       /*
+        * bio and nxt are contiguous in memory; check if the queue allows
+        * these two to be merged into one
+        */
+       if (BIO_SEG_BOUNDARY(q, bio, nxt))
+               return 1;
+
+       return 0;
+}
+
+static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
+                                struct bio *nxt)
+{
+       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+               blk_recount_segments(q, bio);
+       if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
+               blk_recount_segments(q, nxt);
+       if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
+           BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
+               return 0;
+       if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries
+ */
+int blk_rq_map_sg(struct request_queue *q, struct request *rq,
+                 struct scatterlist *sglist)
+{
+       struct bio_vec *bvec, *bvprv;
+       struct req_iterator iter;
+       struct scatterlist *sg;
+       int nsegs, cluster;
+
+       nsegs = 0;
+       cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
+
+       /*
+        * for each bio in rq
+        */
+       bvprv = NULL;
+       sg = NULL;
+       rq_for_each_segment(bvec, rq, iter) {
+               int nbytes = bvec->bv_len;
+
+               if (bvprv && cluster) {
+                       if (sg->length + nbytes > q->max_segment_size)
+                               goto new_segment;
+
+                       if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
+                               goto new_segment;
+                       if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
+                               goto new_segment;
+
+                       sg->length += nbytes;
+               } else {
+new_segment:
+                       if (!sg)
+                               sg = sglist;
+                       else {
+                               /*
+                                * If the driver previously mapped a shorter
+                                * list, we could see a termination bit
+                                * prematurely unless it fully inits the sg
+                                * table on each mapping. We KNOW that there
+                                * must be more entries here or the driver
+                                * would be buggy, so force clear the
+                                * termination bit to avoid doing a full
+                                * sg_init_table() in drivers for each command.
+                                */
+                               sg->page_link &= ~0x02;
+                               sg = sg_next(sg);
+                       }
+
+                       sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
+                       nsegs++;
+               }
+               bvprv = bvec;
+       } /* segments in rq */
+
+       if (q->dma_drain_size) {
+               sg->page_link &= ~0x02;
+               sg = sg_next(sg);
+               sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
+                           q->dma_drain_size,
+                           ((unsigned long)q->dma_drain_buffer) &
+                           (PAGE_SIZE - 1));
+               nsegs++;
+       }
+
+       if (sg)
+               sg_mark_end(sg);
+
+       return nsegs;
+}
+
+EXPORT_SYMBOL(blk_rq_map_sg);
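
Illustrative driver-side use (not part of this commit; the mydrv_* name is hypothetical): map a request into a preallocated scatterlist and walk the resulting entries.

#include <linux/blkdev.h>
#include <linux/scatterlist.h>

static unsigned int mydrv_map_request(struct request_queue *q, struct request *rq,
                                      struct scatterlist *sglist)
{
        struct scatterlist *sg;
        unsigned int total = 0;
        int nsegs, i;

        /* sglist must have room for rq->nr_phys_segments entries */
        nsegs = blk_rq_map_sg(q, rq, sglist);

        for_each_sg(sglist, sg, nsegs, i) {
                /* a real driver would program one DMA descriptor per segment here */
                total += sg->length;
        }
        return total;
}
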
+
+static inline int ll_new_mergeable(struct request_queue *q,
+                                  struct request *req,
+                                  struct bio *bio)
+{
+       int nr_phys_segs = bio_phys_segments(q, bio);
+
+       if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+               req->cmd_flags |= REQ_NOMERGE;
+               if (req == q->last_merge)
+                       q->last_merge = NULL;
+               return 0;
+       }
+
+       /*
+        * A hw segment is just getting larger, bump just the phys
+        * counter.
+        */
+       req->nr_phys_segments += nr_phys_segs;
+       return 1;
+}
+
+static inline int ll_new_hw_segment(struct request_queue *q,
+                                   struct request *req,
+                                   struct bio *bio)
+{
+       int nr_hw_segs = bio_hw_segments(q, bio);
+       int nr_phys_segs = bio_phys_segments(q, bio);
+
+       if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
+           || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+               req->cmd_flags |= REQ_NOMERGE;
+               if (req == q->last_merge)
+                       q->last_merge = NULL;
+               return 0;
+       }
+
+       /*
+        * This will form the start of a new hw segment.  Bump both
+        * counters.
+        */
+       req->nr_hw_segments += nr_hw_segs;
+       req->nr_phys_segments += nr_phys_segs;
+       return 1;
+}
+
+int ll_back_merge_fn(struct request_queue *q, struct request *req,
+                    struct bio *bio)
+{
+       unsigned short max_sectors;
+       int len;
+
+       if (unlikely(blk_pc_request(req)))
+               max_sectors = q->max_hw_sectors;
+       else
+               max_sectors = q->max_sectors;
+
+       if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+               req->cmd_flags |= REQ_NOMERGE;
+               if (req == q->last_merge)
+                       q->last_merge = NULL;
+               return 0;
+       }
+       if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
+               blk_recount_segments(q, req->biotail);
+       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+               blk_recount_segments(q, bio);
+       len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
+       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
+           !BIOVEC_VIRT_OVERSIZE(len)) {
+               int mergeable =  ll_new_mergeable(q, req, bio);
+
+               if (mergeable) {
+                       if (req->nr_hw_segments == 1)
+                               req->bio->bi_hw_front_size = len;
+                       if (bio->bi_hw_segments == 1)
+                               bio->bi_hw_back_size = len;
+               }
+               return mergeable;
+       }
+
+       return ll_new_hw_segment(q, req, bio);
+}
+
+int ll_front_merge_fn(struct request_queue *q, struct request *req, 
+                     struct bio *bio)
+{
+       unsigned short max_sectors;
+       int len;
+
+       if (unlikely(blk_pc_request(req)))
+               max_sectors = q->max_hw_sectors;
+       else
+               max_sectors = q->max_sectors;
+
+
+       if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+               req->cmd_flags |= REQ_NOMERGE;
+               if (req == q->last_merge)
+                       q->last_merge = NULL;
+               return 0;
+       }
+       len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
+       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
+               blk_recount_segments(q, bio);
+       if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
+               blk_recount_segments(q, req->bio);
+       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
+           !BIOVEC_VIRT_OVERSIZE(len)) {
+               int mergeable =  ll_new_mergeable(q, req, bio);
+
+               if (mergeable) {
+                       if (bio->bi_hw_segments == 1)
+                               bio->bi_hw_front_size = len;
+                       if (req->nr_hw_segments == 1)
+                               req->biotail->bi_hw_back_size = len;
+               }
+               return mergeable;
+       }
+
+       return ll_new_hw_segment(q, req, bio);
+}
+
+static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
+                               struct request *next)
+{
+       int total_phys_segments;
+       int total_hw_segments;
+
+       /*
+        * First check if either of the requests is a re-queued request.
+        * We can't merge them if so.
+        */
+       if (req->special || next->special)
+               return 0;
+
+       /*
+        * Will it become too large?
+        */
+       if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
+               return 0;
+
+       total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
+       if (blk_phys_contig_segment(q, req->biotail, next->bio))
+               total_phys_segments--;
+
+       if (total_phys_segments > q->max_phys_segments)
+               return 0;
+
+       total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
+       if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
+               int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
+               /*
+                * propagate the combined length to the end of the requests
+                */
+               if (req->nr_hw_segments == 1)
+                       req->bio->bi_hw_front_size = len;
+               if (next->nr_hw_segments == 1)
+                       next->biotail->bi_hw_back_size = len;
+               total_hw_segments--;
+       }
+
+       if (total_hw_segments > q->max_hw_segments)
+               return 0;
+
+       /* Merge is OK... */
+       req->nr_phys_segments = total_phys_segments;
+       req->nr_hw_segments = total_hw_segments;
+       return 1;
+}
+
+/*
+ * Has to be called with the request spinlock acquired
+ */
+static int attempt_merge(struct request_queue *q, struct request *req,
+                         struct request *next)
+{
+       if (!rq_mergeable(req) || !rq_mergeable(next))
+               return 0;
+
+       /*
+        * not contiguous
+        */
+       if (req->sector + req->nr_sectors != next->sector)
+               return 0;
+
+       if (rq_data_dir(req) != rq_data_dir(next)
+           || req->rq_disk != next->rq_disk
+           || next->special)
+               return 0;
+
+       /*
+        * If we are allowed to merge, then append bio list
+        * from next to rq and release next. merge_requests_fn
+        * will have updated segment counts, update sector
+        * counts here.
+        */
+       if (!ll_merge_requests_fn(q, req, next))
+               return 0;
+
+       /*
+        * At this point we have either done a back merge
+        * or front merge. We need the smaller start_time of
+        * the merged requests to be the current request
+        * for accounting purposes.
+        */
+       if (time_after(req->start_time, next->start_time))
+               req->start_time = next->start_time;
+
+       req->biotail->bi_next = next->bio;
+       req->biotail = next->biotail;
+
+       req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
+
+       elv_merge_requests(q, req, next);
+
+       if (req->rq_disk) {
+               disk_round_stats(req->rq_disk);
+               req->rq_disk->in_flight--;
+       }
+
+       req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+
+       __blk_put_request(q, next);
+       return 1;
+}
+
+int attempt_back_merge(struct request_queue *q, struct request *rq)
+{
+       struct request *next = elv_latter_request(q, rq);
+
+       if (next)
+               return attempt_merge(q, rq, next);
+
+       return 0;
+}
+
+int attempt_front_merge(struct request_queue *q, struct request *rq)
+{
+       struct request *prev = elv_former_request(q, rq);
+
+       if (prev)
+               return attempt_merge(q, prev, rq);
+
+       return 0;
+}
diff --git a/block/blk-settings.c b/block/blk-settings.c
new file mode 100644 (file)
index 0000000..4df09a1
--- /dev/null
@@ -0,0 +1,402 @@
+/*
+ * Functions related to setting various queue properties from drivers
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/bootmem.h>     /* for max_pfn/max_low_pfn */
+
+#include "blk.h"
+
+unsigned long blk_max_low_pfn, blk_max_pfn;
+EXPORT_SYMBOL(blk_max_low_pfn);
+EXPORT_SYMBOL(blk_max_pfn);
+
+/**
+ * blk_queue_prep_rq - set a prepare_request function for queue
+ * @q:         queue
+ * @pfn:       prepare_request function
+ *
+ * It's possible for a queue to register a prepare_request callback which
+ * is invoked before the request is handed to the request_fn. The goal of
+ * the function is to prepare a request for I/O; it can be used to build a
+ * CDB from the request data, for instance.
+ *
+ */
+void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
+{
+       q->prep_rq_fn = pfn;
+}
+
+EXPORT_SYMBOL(blk_queue_prep_rq);
+
+/**
+ * blk_queue_merge_bvec - set a merge_bvec function for queue
+ * @q:         queue
+ * @mbfn:      merge_bvec_fn
+ *
+ * Usually queues have static limitations on the max sectors or segments that
+ * we can put in a request. Stacking drivers may have some settings that
+ * are dynamic, and thus we have to query the queue whether it is ok to
+ * add a new bio_vec to a bio at a given offset or not. If the block device
+ * has such limitations, it needs to register a merge_bvec_fn to control
+ * the size of bios sent to it. Note that a block device *must* allow a
+ * single page to be added to an empty bio. The block device driver may want
+ * to use the bio_split() function to deal with these bios. By default
+ * no merge_bvec_fn is defined for a queue, and only the fixed limits are
+ * honored.
+ */
+void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
+{
+       q->merge_bvec_fn = mbfn;
+}
+
+EXPORT_SYMBOL(blk_queue_merge_bvec);
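
A hypothetical sketch of such a callback, assuming the three-argument merge_bvec_fn prototype in use at this point in the tree and an invented 64 KiB chunk constraint; it is illustrative only, not code from this patch:

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /*
     * Hypothetical stacking driver: never let a bio grow across a 64 KiB
     * chunk boundary.  Returns how many bytes of @biovec may be added.
     */
    static int example_merge_bvec(struct request_queue *q, struct bio *bio,
                                  struct bio_vec *biovec)
    {
            unsigned int chunk_sectors = 128;   /* 64 KiB in 512-byte sectors */
            unsigned int used = bio->bi_sector & (chunk_sectors - 1);
            int room = (int)((chunk_sectors - used) << 9) - (int)bio->bi_size;

            if (room < 0)
                    room = 0;
            /* A queue must always accept at least one page into an empty bio. */
            if (!bio->bi_size && room < biovec->bv_len)
                    return biovec->bv_len;
            return min_t(int, room, biovec->bv_len);
    }

    /* Registered during queue setup: blk_queue_merge_bvec(q, example_merge_bvec); */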
+
+void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
+{
+       q->softirq_done_fn = fn;
+}
+
+EXPORT_SYMBOL(blk_queue_softirq_done);
+
+/**
+ * blk_queue_make_request - define an alternate make_request function for a device
+ * @q:  the request queue for the device to be affected
+ * @mfn: the alternate make_request function
+ *
+ * Description:
+ *    The normal way for &struct bios to be passed to a device
+ *    driver is for them to be collected into requests on a request
+ *    queue, and then to allow the device driver to select requests
+ *    off that queue when it is ready.  This works well for many block
+ *    devices. However some block devices (typically virtual devices
+ *    such as md or lvm) do not benefit from the processing on the
+ *    request queue, and are served best by having the requests passed
+ *    directly to them.  This can be achieved by providing a function
+ *    to blk_queue_make_request().
+ *
+ * Caveat:
+ *    The driver that does this *must* be able to deal appropriately
+ *    with buffers in "highmemory". This can be accomplished by either calling
+ *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
+ *    blk_queue_bounce() to create a buffer in normal memory.
+ **/
+void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
+{
+       /*
+        * set defaults
+        */
+       q->nr_requests = BLKDEV_MAX_RQ;
+       blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
+       blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
+       q->make_request_fn = mfn;
+       q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+       q->backing_dev_info.state = 0;
+       q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+       blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
+       blk_queue_hardsect_size(q, 512);
+       blk_queue_dma_alignment(q, 511);
+       blk_queue_congestion_threshold(q);
+       q->nr_batching = BLK_BATCH_REQ;
+
+       q->unplug_thresh = 4;           /* hmm */
+       q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
+       if (q->unplug_delay == 0)
+               q->unplug_delay = 1;
+
+       INIT_WORK(&q->unplug_work, blk_unplug_work);
+
+       q->unplug_timer.function = blk_unplug_timeout;
+       q->unplug_timer.data = (unsigned long)q;
+
+       /*
+        * by default assume old behaviour and bounce for any highmem page
+        */
+       blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
+}
+
+EXPORT_SYMBOL(blk_queue_make_request);
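
As a usage sketch (illustrative, not from this patch): a bio-based virtual driver would allocate a queue with blk_alloc_queue() and hand in its own make_request function during setup, roughly like this:

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /* Hypothetical bio-based driver: complete every bio immediately. */
    static int example_make_request(struct request_queue *q, struct bio *bio)
    {
            /* A real driver would remap or service the bio here. */
            bio_endio(bio, 0);
            return 0;
    }

    static struct request_queue *example_setup_queue(void)
    {
            struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

            if (!q)
                    return NULL;
            blk_queue_make_request(q, example_make_request);
            return q;
    }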
+
+/**
+ * blk_queue_bounce_limit - set bounce buffer limit for queue
+ * @q:  the request queue for the device
+ * @dma_addr:   bus address limit
+ *
+ * Description:
+ *    Different hardware can have different requirements as to what pages
+ *    it can do I/O directly to. A low level driver can call
+ *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
+ *    buffers for doing I/O to pages residing above @page.
+ **/
+void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
+{
+       unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
+       int dma = 0;
+
+       q->bounce_gfp = GFP_NOIO;
+#if BITS_PER_LONG == 64
+       /* Assume anything <= 4GB can be handled by IOMMU.
+          Actually some IOMMUs can handle everything, but I don't
+          know of a way to test this here. */
+       if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
+               dma = 1;
+       q->bounce_pfn = max_low_pfn;
+#else
+       if (bounce_pfn < blk_max_low_pfn)
+               dma = 1;
+       q->bounce_pfn = bounce_pfn;
+#endif
+       if (dma) {
+               init_emergency_isa_pool();
+               q->bounce_gfp = GFP_NOIO | GFP_DMA;
+               q->bounce_pfn = bounce_pfn;
+       }
+}
+
+EXPORT_SYMBOL(blk_queue_bounce_limit);
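
For example, a driver for a hypothetical controller that can only address the low 4 GB might cap the bounce limit accordingly (sketch only):

    #include <linux/blkdev.h>

    /* Hypothetical 32-bit-only controller: bounce anything above 4 GB. */
    static void example_set_bounce(struct request_queue *q)
    {
            blk_queue_bounce_limit(q, 0xffffffffULL);
    }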
+
+/**
+ * blk_queue_max_sectors - set max sectors for a request for this queue
+ * @q:  the request queue for the device
+ * @max_sectors:  max sectors in the usual 512b unit
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the size of
+ *    received requests.
+ **/
+void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
+{
+       if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
+               max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+               printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
+       }
+
+       if (BLK_DEF_MAX_SECTORS > max_sectors)
+               q->max_hw_sectors = q->max_sectors = max_sectors;
+       else {
+               q->max_sectors = BLK_DEF_MAX_SECTORS;
+               q->max_hw_sectors = max_sectors;
+       }
+}
+
+EXPORT_SYMBOL(blk_queue_max_sectors);
+
+/**
+ * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * @q:  the request queue for the device
+ * @max_segments:  max number of segments
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the number of
+ *    physical data segments in a request.  This would be the largest sized
+ *    scatter list the driver could handle.
+ **/
+void blk_queue_max_phys_segments(struct request_queue *q,
+                                unsigned short max_segments)
+{
+       if (!max_segments) {
+               max_segments = 1;
+               printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+       }
+
+       q->max_phys_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_phys_segments);
+
+/**
+ * blk_queue_max_hw_segments - set max hw segments for a request for this queue
+ * @q:  the request queue for the device
+ * @max_segments:  max number of segments
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the number of
+ *    hw data segments in a request.  This would be the largest number of
+ *    address/length pairs the host adapter can actually give at once
+ *    to the device.
+ **/
+void blk_queue_max_hw_segments(struct request_queue *q,
+                              unsigned short max_segments)
+{
+       if (!max_segments) {
+               max_segments = 1;
+               printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
+       }
+
+       q->max_hw_segments = max_segments;
+}
+
+EXPORT_SYMBOL(blk_queue_max_hw_segments);
+
+/**
+ * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
+ * @q:  the request queue for the device
+ * @max_size:  max size of segment in bytes
+ *
+ * Description:
+ *    Enables a low level driver to set an upper limit on the size of a
+ *    coalesced segment
+ **/
+void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
+{
+       if (max_size < PAGE_CACHE_SIZE) {
+               max_size = PAGE_CACHE_SIZE;
+               printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
+       }
+
+       q->max_segment_size = max_size;
+}
+
+EXPORT_SYMBOL(blk_queue_max_segment_size);
+
+/**
+ * blk_queue_hardsect_size - set hardware sector size for the queue
+ * @q:  the request queue for the device
+ * @size:  the hardware sector size, in bytes
+ *
+ * Description:
+ *   This should typically be set to the lowest possible sector size
+ *   that the hardware can operate on (without resorting to internal
+ *   read-modify-write operations). Usually the default
+ *   of 512 covers most hardware.
+ **/
+void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
+{
+       q->hardsect_size = size;
+}
+
+EXPORT_SYMBOL(blk_queue_hardsect_size);
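
Taken together, a low-level driver typically applies these setters once when the queue is created. A sketch for an invented adapter that handles up to 128 scatter/gather entries of 64 KiB each, 256 sectors per request, with 512-byte sectors:

    #include <linux/blkdev.h>

    /* Hypothetical adapter limits, applied once at queue setup time. */
    static void example_apply_limits(struct request_queue *q)
    {
            blk_queue_max_sectors(q, 256);              /* 128 KiB per request */
            blk_queue_max_phys_segments(q, 128);
            blk_queue_max_hw_segments(q, 128);
            blk_queue_max_segment_size(q, 64 * 1024);
            blk_queue_hardsect_size(q, 512);
    }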
+
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+
+/**
+ * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
+ * @t: the stacking driver (top)
+ * @b:  the underlying device (bottom)
+ **/
+void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
+{
+       /* zero is "infinity" */
+       t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
+       t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
+
+       t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
+       t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
+       t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
+       t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
+       if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+               clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_stack_limits);
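
A stacking driver in the md/dm mould would fold in each member device's limits as it is added; a minimal sketch, with the member device purely hypothetical:

    #include <linux/blkdev.h>

    /* Sketch: tighten the top-level queue to the weakest member device. */
    static void example_inherit_limits(struct request_queue *top,
                                       struct block_device *member)
    {
            blk_queue_stack_limits(top, bdev_get_queue(member));
    }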
+
+/**
+ * blk_queue_dma_drain - Set up a drain buffer for excess dma.
+ *
+ * @q:  the request queue for the device
+ * @buf:       physically contiguous buffer
+ * @size:      size of the buffer in bytes
+ *
+ * Some devices have excess DMA problems and can't simply discard (or
+ * zero fill) the unwanted piece of the transfer.  They have to have a
+ * real area of memory to transfer it into.  The use case for this is
+ * ATAPI devices in DMA mode.  If the packet command causes a transfer
+ * bigger than the transfer size, some HBAs will lock up if there
+ * aren't DMA elements to contain the excess transfer.  What this API
+ * does is adjust the queue so that the buf is always appended
+ * silently to the scatterlist.
+ *
+ * Note: This routine adjusts max_hw_segments to make room for
+ * appending the drain buffer.  If you call
+ * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
+ * calling this routine, you must set the limit to one fewer than your
+ * device can support otherwise there won't be room for the drain
+ * buffer.
+ */
+int blk_queue_dma_drain(struct request_queue *q, void *buf,
+                               unsigned int size)
+{
+       if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
+               return -EINVAL;
+       /* make room for appending the drain */
+       --q->max_hw_segments;
+       --q->max_phys_segments;
+       q->dma_drain_buffer = buf;
+       q->dma_drain_size = size;
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
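
A sketch of how an ATAPI-style driver might wire this up, with an invented drain size; the segment limits are set to the full hardware capability first, and blk_queue_dma_drain() then reserves the extra entry as described above:

    #include <linux/blkdev.h>
    #include <linux/slab.h>

    #define EXAMPLE_DRAIN_SIZE 256      /* invented excess-transfer size, in bytes */

    static int example_setup_drain(struct request_queue *q)
    {
            void *buf = kmalloc(EXAMPLE_DRAIN_SIZE, GFP_KERNEL);

            if (!buf)
                    return -ENOMEM;
            /* Full hardware limits first; blk_queue_dma_drain() reserves one entry. */
            blk_queue_max_hw_segments(q, 128);
            blk_queue_max_phys_segments(q, 128);
            return blk_queue_dma_drain(q, buf, EXAMPLE_DRAIN_SIZE);
    }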
+
+/**
+ * blk_queue_segment_boundary - set boundary rules for segment merging
+ * @q:  the request queue for the device
+ * @mask:  the memory boundary mask
+ **/
+void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
+{
+       if (mask < PAGE_CACHE_SIZE - 1) {
+               mask = PAGE_CACHE_SIZE - 1;
+               printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
+       }
+
+       q->seg_boundary_mask = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_segment_boundary);
+
+/**
+ * blk_queue_dma_alignment - set dma length and memory alignment
+ * @q:     the request queue for the device
+ * @mask:  alignment mask
+ *
+ * Description:
+ *    Set required memory and length alignment for direct DMA transactions.
+ *    This is used when building direct I/O requests for the queue.
+ *
+ **/
+void blk_queue_dma_alignment(struct request_queue *q, int mask)
+{
+       q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_dma_alignment);
+
+/**
+ * blk_queue_update_dma_alignment - update dma length and memory alignment
+ * @q:     the request queue for the device
+ * @mask:  alignment mask
+ *
+ * Description:
+ *    Update required memory and length alignment for direct DMA transactions.
+ *    If the requested alignment is larger than the current alignment, then
+ *    the current queue alignment is updated to the new value, otherwise it
+ *    is left alone.  The design of this is to allow multiple objects
+ *    (driver, device, transport etc) to set their respective
+ *    alignments without having them interfere.
+ *
+ **/
+void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
+{
+       BUG_ON(mask > PAGE_SIZE);
+
+       if (mask > q->dma_alignment)
+               q->dma_alignment = mask;
+}
+
+EXPORT_SYMBOL(blk_queue_update_dma_alignment);
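
Because the update variant only ever grows the mask, several layers can state their requirements in any order; a tiny sketch with made-up values:

    #include <linux/blkdev.h>

    static void example_set_alignment(struct request_queue *q)
    {
            blk_queue_update_dma_alignment(q, 0x3);     /* transport: 4-byte alignment */
            blk_queue_update_dma_alignment(q, 0x7);     /* device: 8-byte alignment */
            /* q->dma_alignment is now 0x7; the stricter requirement wins. */
    }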
+
+int __init blk_settings_init(void)
+{
+       blk_max_low_pfn = max_low_pfn - 1;
+       blk_max_pfn = max_pfn - 1;
+       return 0;
+}
+subsys_initcall(blk_settings_init);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
new file mode 100644 (file)
index 0000000..bc28776
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * Functions related to sysfs handling
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+
+#include "blk.h"
+
+struct queue_sysfs_entry {
+       struct attribute attr;
+       ssize_t (*show)(struct request_queue *, char *);
+       ssize_t (*store)(struct request_queue *, const char *, size_t);
+};
+
+static ssize_t
+queue_var_show(unsigned int var, char *page)
+{
+       return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+queue_var_store(unsigned long *var, const char *page, size_t count)
+{
+       char *p = (char *) page;
+
+       *var = simple_strtoul(p, &p, 10);
+       return count;
+}
+
+static ssize_t queue_requests_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->nr_requests, (page));
+}
+
+static ssize_t
+queue_requests_store(struct request_queue *q, const char *page, size_t count)
+{
+       struct request_list *rl = &q->rq;
+       unsigned long nr;
+       int ret = queue_var_store(&nr, page, count);
+       if (nr < BLKDEV_MIN_RQ)
+               nr = BLKDEV_MIN_RQ;
+
+       spin_lock_irq(q->queue_lock);
+       q->nr_requests = nr;
+       blk_queue_congestion_threshold(q);
+
+       if (rl->count[READ] >= queue_congestion_on_threshold(q))
+               blk_set_queue_congested(q, READ);
+       else if (rl->count[READ] < queue_congestion_off_threshold(q))
+               blk_clear_queue_congested(q, READ);
+
+       if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
+               blk_set_queue_congested(q, WRITE);
+       else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
+               blk_clear_queue_congested(q, WRITE);
+
+       if (rl->count[READ] >= q->nr_requests) {
+               blk_set_queue_full(q, READ);
+       } else if (rl->count[READ]+1 <= q->nr_requests) {
+               blk_clear_queue_full(q, READ);
+               wake_up(&rl->wait[READ]);
+       }
+
+       if (rl->count[WRITE] >= q->nr_requests) {
+               blk_set_queue_full(q, WRITE);
+       } else if (rl->count[WRITE]+1 <= q->nr_requests) {
+               blk_clear_queue_full(q, WRITE);
+               wake_up(&rl->wait[WRITE]);
+       }
+       spin_unlock_irq(q->queue_lock);
+       return ret;
+}
+
+static ssize_t queue_ra_show(struct request_queue *q, char *page)
+{
+       int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
+
+       return queue_var_show(ra_kb, (page));
+}
+
+static ssize_t
+queue_ra_store(struct request_queue *q, const char *page, size_t count)
+{
+       unsigned long ra_kb;
+       ssize_t ret = queue_var_store(&ra_kb, page, count);
+
+       spin_lock_irq(q->queue_lock);
+       q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+       spin_unlock_irq(q->queue_lock);
+
+       return ret;
+}
+
+static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
+{
+       int max_sectors_kb = q->max_sectors >> 1;
+
+       return queue_var_show(max_sectors_kb, (page));
+}
+
+static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->hardsect_size, page);
+}
+
+static ssize_t
+queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
+{
+       unsigned long max_sectors_kb,
+                       max_hw_sectors_kb = q->max_hw_sectors >> 1,
+                       page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+       ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+
+       if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
+               return -EINVAL;
+       /*
+        * Take the queue lock to update the readahead and max_sectors
+        * values synchronously:
+        */
+       spin_lock_irq(q->queue_lock);
+       q->max_sectors = max_sectors_kb << 1;
+       spin_unlock_irq(q->queue_lock);
+
+       return ret;
+}
+
+static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
+{
+       int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+
+       return queue_var_show(max_hw_sectors_kb, (page));
+}
+
+
+static struct queue_sysfs_entry queue_requests_entry = {
+       .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_requests_show,
+       .store = queue_requests_store,
+};
+
+static struct queue_sysfs_entry queue_ra_entry = {
+       .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_ra_show,
+       .store = queue_ra_store,
+};
+
+static struct queue_sysfs_entry queue_max_sectors_entry = {
+       .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_max_sectors_show,
+       .store = queue_max_sectors_store,
+};
+
+static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
+       .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+       .show = queue_max_hw_sectors_show,
+};
+
+static struct queue_sysfs_entry queue_iosched_entry = {
+       .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
+       .show = elv_iosched_show,
+       .store = elv_iosched_store,
+};
+
+static struct queue_sysfs_entry queue_hw_sector_size_entry = {
+       .attr = {.name = "hw_sector_size", .mode = S_IRUGO },
+       .show = queue_hw_sector_size_show,
+};
+
+static struct attribute *default_attrs[] = {
+       &queue_requests_entry.attr,
+       &queue_ra_entry.attr,
+       &queue_max_hw_sectors_entry.attr,
+       &queue_max_sectors_entry.attr,
+       &queue_iosched_entry.attr,
+       &queue_hw_sector_size_entry.attr,
+       NULL,
+};
+
+#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
+
+static ssize_t
+queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+       struct queue_sysfs_entry *entry = to_queue(attr);
+       struct request_queue *q =
+               container_of(kobj, struct request_queue, kobj);
+       ssize_t res;
+
+       if (!entry->show)
+               return -EIO;
+       mutex_lock(&q->sysfs_lock);
+       if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+               mutex_unlock(&q->sysfs_lock);
+               return -ENOENT;
+       }
+       res = entry->show(q, page);
+       mutex_unlock(&q->sysfs_lock);
+       return res;
+}
+
+static ssize_t
+queue_attr_store(struct kobject *kobj, struct attribute *attr,
+                   const char *page, size_t length)
+{
+       struct queue_sysfs_entry *entry = to_queue(attr);
+       struct request_queue *q = container_of(kobj, struct request_queue, kobj);
+
+       ssize_t res;
+
+       if (!entry->store)
+               return -EIO;
+       mutex_lock(&q->sysfs_lock);
+       if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+               mutex_unlock(&q->sysfs_lock);
+               return -ENOENT;
+       }
+       res = entry->store(q, page, length);
+       mutex_unlock(&q->sysfs_lock);
+       return res;
+}
+
+/**
+ * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
+ * @kobj:    the kobj belonging to the request queue to be released
+ *
+ * Description:
+ *     blk_cleanup_queue is the pair to blk_init_queue() or
+ *     blk_queue_make_request().  It should be called when a request queue is
+ *     being released; typically when a block device is being de-registered.
+ *     Currently, its primary task is to free all the &struct request
+ *     structures that were allocated to the queue and the queue itself.
+ *
+ * Caveat:
+ *     Hopefully the low level driver will have finished any
+ *     outstanding requests first...
+ **/
+static void blk_release_queue(struct kobject *kobj)
+{
+       struct request_queue *q =
+               container_of(kobj, struct request_queue, kobj);
+       struct request_list *rl = &q->rq;
+
+       blk_sync_queue(q);
+
+       if (rl->rq_pool)
+               mempool_destroy(rl->rq_pool);
+
+       if (q->queue_tags)
+               __blk_queue_free_tags(q);
+
+       blk_trace_shutdown(q);
+
+       bdi_destroy(&q->backing_dev_info);
+       kmem_cache_free(blk_requestq_cachep, q);
+}
+
+static struct sysfs_ops queue_sysfs_ops = {
+       .show   = queue_attr_show,
+       .store  = queue_attr_store,
+};
+
+struct kobj_type blk_queue_ktype = {
+       .sysfs_ops      = &queue_sysfs_ops,
+       .default_attrs  = default_attrs,
+       .release        = blk_release_queue,
+};
+
+int blk_register_queue(struct gendisk *disk)
+{
+       int ret;
+
+       struct request_queue *q = disk->queue;
+
+       if (!q || !q->request_fn)
+               return -ENXIO;
+
+       ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj),
+                         "%s", "queue");
+       if (ret < 0)
+               return ret;
+
+       kobject_uevent(&q->kobj, KOBJ_ADD);
+
+       ret = elv_register_queue(q);
+       if (ret) {
+               kobject_uevent(&q->kobj, KOBJ_REMOVE);
+               kobject_del(&q->kobj);
+               return ret;
+       }
+
+       return 0;
+}
+
+void blk_unregister_queue(struct gendisk *disk)
+{
+       struct request_queue *q = disk->queue;
+
+       if (q && q->request_fn) {
+               elv_unregister_queue(q);
+
+               kobject_uevent(&q->kobj, KOBJ_REMOVE);
+               kobject_del(&q->kobj);
+               kobject_put(&disk->dev.kobj);
+       }
+}
diff --git a/block/blk-tag.c b/block/blk-tag.c
new file mode 100644 (file)
index 0000000..d1fd300
--- /dev/null
@@ -0,0 +1,396 @@
+/*
+ * Functions related to tagged command queuing
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+/**
+ * blk_queue_find_tag - find a request by its tag and queue
+ * @q:  The request queue for the device
+ * @tag: The tag of the request
+ *
+ * Notes:
+ *    Should be used when a device returns a tag and you want to match
+ *    it with a request.
+ *
+ *    no locks need be held.
+ **/
+struct request *blk_queue_find_tag(struct request_queue *q, int tag)
+{
+       return blk_map_queue_find_tag(q->queue_tags, tag);
+}
+
+EXPORT_SYMBOL(blk_queue_find_tag);
+
+/**
+ * __blk_free_tags - release a given set of tag maintenance info
+ * @bqt:       the tag map to free
+ *
+ * Tries to free the specified @bqt@.  Returns true if it was
+ * actually freed and false if there are still references using it
+ */
+static int __blk_free_tags(struct blk_queue_tag *bqt)
+{
+       int retval;
+
+       retval = atomic_dec_and_test(&bqt->refcnt);
+       if (retval) {
+               BUG_ON(bqt->busy);
+
+               kfree(bqt->tag_index);
+               bqt->tag_index = NULL;
+
+               kfree(bqt->tag_map);
+               bqt->tag_map = NULL;
+
+               kfree(bqt);
+       }
+
+       return retval;
+}
+
+/**
+ * __blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *    blk_cleanup_queue() will take care of calling this function, if tagging
+ *    has been used. So there's no need to call this directly.
+ **/
+void __blk_queue_free_tags(struct request_queue *q)
+{
+       struct blk_queue_tag *bqt = q->queue_tags;
+
+       if (!bqt)
+               return;
+
+       __blk_free_tags(bqt);
+
+       q->queue_tags = NULL;
+       q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
+}
+
+/**
+ * blk_free_tags - release a given set of tag maintenance info
+ * @bqt:       the tag map to free
+ *
+ * For an externally managed @bqt@, frees the map.  Callers of this
+ * function must guarantee to have released all the queues that
+ * might have been using this tag map.
+ */
+void blk_free_tags(struct blk_queue_tag *bqt)
+{
+       if (unlikely(!__blk_free_tags(bqt)))
+               BUG();
+}
+EXPORT_SYMBOL(blk_free_tags);
+
+/**
+ * blk_queue_free_tags - release tag maintenance info
+ * @q:  the request queue for the device
+ *
+ *  Notes:
+ *     This is used to disable tagged queuing on a device, yet leave the
+ *     queue in function.
+ **/
+void blk_queue_free_tags(struct request_queue *q)
+{
+       clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_free_tags);
+
+static int
+init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
+{
+       struct request **tag_index;
+       unsigned long *tag_map;
+       int nr_ulongs;
+
+       if (q && depth > q->nr_requests * 2) {
+               depth = q->nr_requests * 2;
+               printk(KERN_ERR "%s: adjusted depth to %d\n",
+                               __FUNCTION__, depth);
+       }
+
+       tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+       if (!tag_index)
+               goto fail;
+
+       nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
+       tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
+       if (!tag_map)
+               goto fail;
+
+       tags->real_max_depth = depth;
+       tags->max_depth = depth;
+       tags->tag_index = tag_index;
+       tags->tag_map = tag_map;
+
+       return 0;
+fail:
+       kfree(tag_index);
+       return -ENOMEM;
+}
+
+static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
+                                                  int depth)
+{
+       struct blk_queue_tag *tags;
+
+       tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
+       if (!tags)
+               goto fail;
+
+       if (init_tag_map(q, tags, depth))
+               goto fail;
+
+       tags->busy = 0;
+       atomic_set(&tags->refcnt, 1);
+       return tags;
+fail:
+       kfree(tags);
+       return NULL;
+}
+
+/**
+ * blk_init_tags - initialize the tag info for an external tag map
+ * @depth:     the maximum queue depth supported
+ * @tags: the tag to use
+ **/
+struct blk_queue_tag *blk_init_tags(int depth)
+{
+       return __blk_queue_init_tags(NULL, depth);
+}
+EXPORT_SYMBOL(blk_init_tags);
+
+/**
+ * blk_queue_init_tags - initialize the queue tag info
+ * @q:  the request queue for the device
+ * @depth:  the maximum queue depth supported
+ * @tags: the tag to use
+ **/
+int blk_queue_init_tags(struct request_queue *q, int depth,
+                       struct blk_queue_tag *tags)
+{
+       int rc;
+
+       BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
+
+       if (!tags && !q->queue_tags) {
+               tags = __blk_queue_init_tags(q, depth);
+
+               if (!tags)
+                       goto fail;
+       } else if (q->queue_tags) {
+               if ((rc = blk_queue_resize_tags(q, depth)))
+                       return rc;
+               set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+               return 0;
+       } else
+               atomic_inc(&tags->refcnt);
+
+       /*
+        * assign it, all done
+        */
+       q->queue_tags = tags;
+       q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
+       INIT_LIST_HEAD(&q->tag_busy_list);
+       return 0;
+fail:
+       kfree(tags);
+       return -ENOMEM;
+}
+
+EXPORT_SYMBOL(blk_queue_init_tags);
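
Enabling tagged queuing from a driver then comes down to one call; a sketch with a hypothetical depth of 64, letting the block layer own the tag map by passing NULL:

    #include <linux/blkdev.h>

    static int example_enable_tcq(struct request_queue *q)
    {
            /* NULL lets the block layer allocate and own the tag map. */
            return blk_queue_init_tags(q, 64, NULL);
    }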
+
+/**
+ * blk_queue_resize_tags - change the queueing depth
+ * @q:  the request queue for the device
+ * @new_depth: the new max command queueing depth
+ *
+ *  Notes:
+ *    Must be called with the queue lock held.
+ **/
+int blk_queue_resize_tags(struct request_queue *q, int new_depth)
+{
+       struct blk_queue_tag *bqt = q->queue_tags;
+       struct request **tag_index;
+       unsigned long *tag_map;
+       int max_depth, nr_ulongs;
+
+       if (!bqt)
+               return -ENXIO;
+
+       /*
+        * If we already have a large enough real_max_depth, just
+        * adjust max_depth.  *NOTE*: as requests with tag values
+        * between new_depth and real_max_depth can be in flight, the
+        * tag map cannot be shrunk blindly here.
+        */
+       if (new_depth <= bqt->real_max_depth) {
+               bqt->max_depth = new_depth;
+               return 0;
+       }
+
+       /*
+        * Currently cannot replace a shared tag map with a new
+        * one, so error out if this is the case
+        */
+       if (atomic_read(&bqt->refcnt) != 1)
+               return -EBUSY;
+
+       /*
+        * save the old state info, so we can copy it back
+        */
+       tag_index = bqt->tag_index;
+       tag_map = bqt->tag_map;
+       max_depth = bqt->real_max_depth;
+
+       if (init_tag_map(q, bqt, new_depth))
+               return -ENOMEM;
+
+       memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
+       nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
+       memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
+
+       kfree(tag_index);
+       kfree(tag_map);
+       return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_resize_tags);
+
+/**
+ * blk_queue_end_tag - end tag operations for a request
+ * @q:  the request queue for the device
+ * @rq: the request that has completed
+ *
+ *  Description:
+ *    Typically called when end_that_request_first() returns 0, meaning
+ *    all transfers have been done for a request. It's important to call
+ *    this function before end_that_request_last(), as that will put the
+ *    request back on the free list thus corrupting the internal tag list.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+void blk_queue_end_tag(struct request_queue *q, struct request *rq)
+{
+       struct blk_queue_tag *bqt = q->queue_tags;
+       int tag = rq->tag;
+
+       BUG_ON(tag == -1);
+
+       if (unlikely(tag >= bqt->real_max_depth))
+               /*
+                * This can happen after tag depth has been reduced.
+                * FIXME: how about a warning or info message here?
+                */
+               return;
+
+       list_del_init(&rq->queuelist);
+       rq->cmd_flags &= ~REQ_QUEUED;
+       rq->tag = -1;
+
+       if (unlikely(bqt->tag_index[tag] == NULL))
+               printk(KERN_ERR "%s: tag %d is missing\n",
+                      __FUNCTION__, tag);
+
+       bqt->tag_index[tag] = NULL;
+
+       if (unlikely(!test_bit(tag, bqt->tag_map))) {
+               printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
+                      __FUNCTION__, tag);
+               return;
+       }
+       /*
+        * The tag_map bit acts as a lock for tag_index[bit], so we need
+        * unlock memory barrier semantics.
+        */
+       clear_bit_unlock(tag, bqt->tag_map);
+       bqt->busy--;
+}
+
+EXPORT_SYMBOL(blk_queue_end_tag);
+
+/**
+ * blk_queue_start_tag - find a free tag and assign it
+ * @q:  the request queue for the device
+ * @rq:  the block request that needs tagging
+ *
+ *  Description:
+ *    This can either be used as a stand-alone helper, or possibly be
+ *    assigned as the queue &prep_rq_fn (in which case &struct request
+ *    automagically gets a tag assigned). Note that this function
+ *    assumes that any type of request can be queued! if this is not
+ *    true for your device, you must check the request type before
+ *    calling this function.  The request will also be removed from
+ *    the request queue, so it's the drivers responsibility to readd
+ *    it if it should need to be restarted for some reason.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+int blk_queue_start_tag(struct request_queue *q, struct request *rq)
+{
+       struct blk_queue_tag *bqt = q->queue_tags;
+       int tag;
+
+       if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
+               printk(KERN_ERR 
+                      "%s: request %p for device [%s] already tagged %d",
+                      __FUNCTION__, rq,
+                      rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
+               BUG();
+       }
+
+       /*
+        * Protect against shared tag maps, as we may not have exclusive
+        * access to the tag map.
+        */
+       do {
+               tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
+               if (tag >= bqt->max_depth)
+                       return 1;
+
+       } while (test_and_set_bit_lock(tag, bqt->tag_map));
+       /*
+        * We need lock ordering semantics given by test_and_set_bit_lock.
+        * See blk_queue_end_tag for details.
+        */
+
+       rq->cmd_flags |= REQ_QUEUED;
+       rq->tag = tag;
+       bqt->tag_index[tag] = rq;
+       blkdev_dequeue_request(rq);
+       list_add(&rq->queuelist, &q->tag_busy_list);
+       bqt->busy++;
+       return 0;
+}
+
+EXPORT_SYMBOL(blk_queue_start_tag);
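
A rough sketch of the intended calling pattern from a driver's request_fn (queue lock already held by the block layer), paired with a completion path that maps the hardware tag back to the request via blk_queue_find_tag(); the issue-to-hardware step is left hypothetical:

    #include <linux/blkdev.h>

    static void example_request_fn(struct request_queue *q)
    {
            struct request *rq;

            while ((rq = elv_next_request(q)) != NULL) {
                    /* Takes a tag and dequeues rq; non-zero means tags ran out. */
                    if (blk_queue_start_tag(q, rq))
                            break;
                    /* example_issue_to_hw(rq) would go here (hypothetical). */
            }
    }

    /* Completion side, called with the queue lock held. */
    static void example_complete(struct request_queue *q, int tag)
    {
            struct request *rq = blk_queue_find_tag(q, tag);

            if (rq)
                    blk_queue_end_tag(q, rq);   /* before finally ending rq */
    }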
+
+/**
+ * blk_queue_invalidate_tags - invalidate all pending tags
+ * @q:  the request queue for the device
+ *
+ *  Description:
+ *   Hardware conditions may dictate a need to stop all pending requests.
+ *   In this case, we will safely clear the block side of the tag queue and
+ *   re-add all requests to the request queue in the right order.
+ *
+ *  Notes:
+ *   queue lock must be held.
+ **/
+void blk_queue_invalidate_tags(struct request_queue *q)
+{
+       struct list_head *tmp, *n;
+
+       list_for_each_safe(tmp, n, &q->tag_busy_list)
+               blk_requeue_request(q, list_entry_rq(tmp));
+}
+
+EXPORT_SYMBOL(blk_queue_invalidate_tags);
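
For completeness, a hypothetical reset handler would push everything outstanding back onto the queue, taking the queue lock as required:

    #include <linux/blkdev.h>
    #include <linux/spinlock.h>

    static void example_handle_reset(struct request_queue *q)
    {
            unsigned long flags;

            spin_lock_irqsave(q->queue_lock, flags);
            blk_queue_invalidate_tags(q);
            spin_unlock_irqrestore(q->queue_lock, flags);
    }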
diff --git a/block/blk.h b/block/blk.h
new file mode 100644 (file)
index 0000000..ec898dd
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef BLK_INTERNAL_H
+#define BLK_INTERNAL_H
+
+/* Amount of time in which a process may batch requests */
+#define BLK_BATCH_TIME (HZ/50UL)
+
+/* Number of requests a "batching" process may submit */
+#define BLK_BATCH_REQ  32
+
+extern struct kmem_cache *blk_requestq_cachep;
+extern struct kobj_type blk_queue_ktype;
+
+void rq_init(struct request_queue *q, struct request *rq);
+void init_request_from_bio(struct request *req, struct bio *bio);
+void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
+                       struct bio *bio);
+void __blk_queue_free_tags(struct request_queue *q);
+
+void blk_unplug_work(struct work_struct *work);
+void blk_unplug_timeout(unsigned long data);
+
+struct io_context *current_io_context(gfp_t gfp_flags, int node);
+
+int ll_back_merge_fn(struct request_queue *q, struct request *req,
+                    struct bio *bio);
+int ll_front_merge_fn(struct request_queue *q, struct request *req, 
+                     struct bio *bio);
+int attempt_back_merge(struct request_queue *q, struct request *rq);
+int attempt_front_merge(struct request_queue *q, struct request *rq);
+void blk_recalc_rq_segments(struct request *rq);
+void blk_recalc_rq_sectors(struct request *rq, int nsect);
+
+void blk_queue_congestion_threshold(struct request_queue *q);
+
+/*
+ * Return the threshold (number of used requests) at which the queue is
+ * considered to be congested.  It includes a little hysteresis to keep the
+ * context switch rate down.
+ */
+static inline int queue_congestion_on_threshold(struct request_queue *q)
+{
+       return q->nr_congestion_on;
+}
+
+/*
+ * The threshold at which a queue is considered to be uncongested
+ */
+static inline int queue_congestion_off_threshold(struct request_queue *q)
+{
+       return q->nr_congestion_off;
+}
+
+#endif
index 69b0a9d333064c782676a4f84fd79e4006c2c879..8917c5174dc2646c5ad8d4d67eb67529a98783f5 100644 (file)
@@ -279,6 +279,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr)
                        goto out;
                }
                rq->next_rq = next_rq;
+               next_rq->cmd_type = rq->cmd_type;
 
                dxferp = (void*)(unsigned long)hdr->din_xferp;
                ret =  blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
index 5e4ab4b37d9f4f94e05a8c741693cafd177d28f0..de2ebb2fab436119cf16b862c720c15be6d991c4 100644 (file)
@@ -337,7 +337,7 @@ static int show_partition(struct seq_file *part, void *v)
        return 0;
 }
 
-struct seq_operations partitions_op = {
+const struct seq_operations partitions_op = {
        .start  = part_start,
        .next   = part_next,
        .stop   = part_stop,
@@ -595,7 +595,7 @@ static int diskstats_show(struct seq_file *s, void *v)
        return 0;
 }
 
-struct seq_operations diskstats_op = {
+const struct seq_operations diskstats_op = {
        .start  = diskstats_start,
        .next   = diskstats_next,
        .stop   = diskstats_stop,
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
deleted file mode 100644 (file)
index 1932a56..0000000
+++ /dev/null
@@ -1,4457 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
- * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
- * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
- * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> -  July2000
- * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
- */
-
-/*
- * This handles all read/write requests to block devices
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/backing-dev.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>     /* for max_pfn/max_low_pfn */
-#include <linux/completion.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/blktrace_api.h>
-#include <linux/fault-inject.h>
-#include <linux/scatterlist.h>
-
-/*
- * for max sense size
- */
-#include <scsi/scsi_cmnd.h>
-
-static void blk_unplug_work(struct work_struct *work);
-static void blk_unplug_timeout(unsigned long data);
-static void drive_stat_acct(struct request *rq, int new_io);
-static void init_request_from_bio(struct request *req, struct bio *bio);
-static int __make_request(struct request_queue *q, struct bio *bio);
-static struct io_context *current_io_context(gfp_t gfp_flags, int node);
-static void blk_recalc_rq_segments(struct request *rq);
-static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
-                           struct bio *bio);
-
-/*
- * For the allocated request tables
- */
-static struct kmem_cache *request_cachep;
-
-/*
- * For queue allocation
- */
-static struct kmem_cache *requestq_cachep;
-
-/*
- * For io context allocations
- */
-static struct kmem_cache *iocontext_cachep;
-
-/*
- * Controlling structure to kblockd
- */
-static struct workqueue_struct *kblockd_workqueue;
-
-unsigned long blk_max_low_pfn, blk_max_pfn;
-
-EXPORT_SYMBOL(blk_max_low_pfn);
-EXPORT_SYMBOL(blk_max_pfn);
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
-/* Amount of time in which a process may batch requests */
-#define BLK_BATCH_TIME (HZ/50UL)
-
-/* Number of requests a "batching" process may submit */
-#define BLK_BATCH_REQ  32
-
-/*
- * Return the threshold (number of used requests) at which the queue is
- * considered to be congested.  It include a little hysteresis to keep the
- * context switch rate down.
- */
-static inline int queue_congestion_on_threshold(struct request_queue *q)
-{
-       return q->nr_congestion_on;
-}
-
-/*
- * The threshold at which a queue is considered to be uncongested
- */
-static inline int queue_congestion_off_threshold(struct request_queue *q)
-{
-       return q->nr_congestion_off;
-}
-
-static void blk_queue_congestion_threshold(struct request_queue *q)
-{
-       int nr;
-
-       nr = q->nr_requests - (q->nr_requests / 8) + 1;
-       if (nr > q->nr_requests)
-               nr = q->nr_requests;
-       q->nr_congestion_on = nr;
-
-       nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
-       if (nr < 1)
-               nr = 1;
-       q->nr_congestion_off = nr;
-}
-
-/**
- * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
- * @bdev:      device
- *
- * Locates the passed device's request queue and returns the address of its
- * backing_dev_info
- *
- * Will return NULL if the request queue cannot be located.
- */
-struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
-{
-       struct backing_dev_info *ret = NULL;
-       struct request_queue *q = bdev_get_queue(bdev);
-
-       if (q)
-               ret = &q->backing_dev_info;
-       return ret;
-}
-EXPORT_SYMBOL(blk_get_backing_dev_info);
-
-/**
- * blk_queue_prep_rq - set a prepare_request function for queue
- * @q:         queue
- * @pfn:       prepare_request function
- *
- * It's possible for a queue to register a prepare_request callback which
- * is invoked before the request is handed to the request_fn. The goal of
- * the function is to prepare a request for I/O, it can be used to build a
- * cdb from the request data for instance.
- *
- */
-void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
-{
-       q->prep_rq_fn = pfn;
-}
-
-EXPORT_SYMBOL(blk_queue_prep_rq);
-
-/**
- * blk_queue_merge_bvec - set a merge_bvec function for queue
- * @q:         queue
- * @mbfn:      merge_bvec_fn
- *
- * Usually queues have static limitations on the max sectors or segments that
- * we can put in a request. Stacking drivers may have some settings that
- * are dynamic, and thus we have to query the queue whether it is ok to
- * add a new bio_vec to a bio at a given offset or not. If the block device
- * has such limitations, it needs to register a merge_bvec_fn to control
- * the size of bio's sent to it. Note that a block device *must* allow a
- * single page to be added to an empty bio. The block device driver may want
- * to use the bio_split() function to deal with these bio's. By default
- * no merge_bvec_fn is defined for a queue, and only the fixed limits are
- * honored.
- */
-void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
-{
-       q->merge_bvec_fn = mbfn;
-}
-
-EXPORT_SYMBOL(blk_queue_merge_bvec);
-
-void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
-{
-       q->softirq_done_fn = fn;
-}
-
-EXPORT_SYMBOL(blk_queue_softirq_done);
-
-/**
- * blk_queue_make_request - define an alternate make_request function for a device
- * @q:  the request queue for the device to be affected
- * @mfn: the alternate make_request function
- *
- * Description:
- *    The normal way for &struct bios to be passed to a device
- *    driver is for them to be collected into requests on a request
- *    queue, and then to allow the device driver to select requests
- *    off that queue when it is ready.  This works well for many block
- *    devices. However some block devices (typically virtual devices
- *    such as md or lvm) do not benefit from the processing on the
- *    request queue, and are served best by having the requests passed
- *    directly to them.  This can be achieved by providing a function
- *    to blk_queue_make_request().
- *
- * Caveat:
- *    The driver that does this *must* be able to deal appropriately
- *    with buffers in "highmemory". This can be accomplished by either calling
- *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
- *    blk_queue_bounce() to create a buffer in normal memory.
- **/
-void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
-{
-       /*
-        * set defaults
-        */
-       q->nr_requests = BLKDEV_MAX_RQ;
-       blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
-       blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
-       q->make_request_fn = mfn;
-       q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-       q->backing_dev_info.state = 0;
-       q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
-       blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
-       blk_queue_hardsect_size(q, 512);
-       blk_queue_dma_alignment(q, 511);
-       blk_queue_congestion_threshold(q);
-       q->nr_batching = BLK_BATCH_REQ;
-
-       q->unplug_thresh = 4;           /* hmm */
-       q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
-       if (q->unplug_delay == 0)
-               q->unplug_delay = 1;
-
-       INIT_WORK(&q->unplug_work, blk_unplug_work);
-
-       q->unplug_timer.function = blk_unplug_timeout;
-       q->unplug_timer.data = (unsigned long)q;
-
-       /*
-        * by default assume old behaviour and bounce for any highmem page
-        */
-       blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
-}
-
-EXPORT_SYMBOL(blk_queue_make_request);
-
-static void rq_init(struct request_queue *q, struct request *rq)
-{
-       INIT_LIST_HEAD(&rq->queuelist);
-       INIT_LIST_HEAD(&rq->donelist);
-
-       rq->errors = 0;
-       rq->bio = rq->biotail = NULL;
-       INIT_HLIST_NODE(&rq->hash);
-       RB_CLEAR_NODE(&rq->rb_node);
-       rq->ioprio = 0;
-       rq->buffer = NULL;
-       rq->ref_count = 1;
-       rq->q = q;
-       rq->special = NULL;
-       rq->data_len = 0;
-       rq->data = NULL;
-       rq->nr_phys_segments = 0;
-       rq->sense = NULL;
-       rq->end_io = NULL;
-       rq->end_io_data = NULL;
-       rq->completion_data = NULL;
-       rq->next_rq = NULL;
-}
-
-/**
- * blk_queue_ordered - does this queue support ordered writes
- * @q:        the request queue
- * @ordered:  one of QUEUE_ORDERED_*
- * @prepare_flush_fn: rq setup helper for cache flush ordered writes
- *
- * Description:
- *   For journalled file systems, doing ordered writes on a commit
- *   block instead of explicitly doing wait_on_buffer (which is bad
- *   for performance) can be a big win. Block drivers supporting this
- *   feature should call this function and indicate so.
- *
- **/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered,
-                     prepare_flush_fn *prepare_flush_fn)
-{
-       if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
-           prepare_flush_fn == NULL) {
-               printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
-               return -EINVAL;
-       }
-
-       if (ordered != QUEUE_ORDERED_NONE &&
-           ordered != QUEUE_ORDERED_DRAIN &&
-           ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-           ordered != QUEUE_ORDERED_DRAIN_FUA &&
-           ordered != QUEUE_ORDERED_TAG &&
-           ordered != QUEUE_ORDERED_TAG_FLUSH &&
-           ordered != QUEUE_ORDERED_TAG_FUA) {
-               printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
-               return -EINVAL;
-       }
-
-       q->ordered = ordered;
-       q->next_ordered = ordered;
-       q->prepare_flush_fn = prepare_flush_fn;
-
-       return 0;
-}
-
-EXPORT_SYMBOL(blk_queue_ordered);
-
-/*
- * Cache flushing for ordered writes handling
- */
-inline unsigned blk_ordered_cur_seq(struct request_queue *q)
-{
-       if (!q->ordseq)
-               return 0;
-       return 1 << ffz(q->ordseq);
-}
-
-unsigned blk_ordered_req_seq(struct request *rq)
-{
-       struct request_queue *q = rq->q;
-
-       BUG_ON(q->ordseq == 0);
-
-       if (rq == &q->pre_flush_rq)
-               return QUEUE_ORDSEQ_PREFLUSH;
-       if (rq == &q->bar_rq)
-               return QUEUE_ORDSEQ_BAR;
-       if (rq == &q->post_flush_rq)
-               return QUEUE_ORDSEQ_POSTFLUSH;
-
-       /*
-        * !fs requests don't need to follow barrier ordering.  Always
-        * put them at the front.  This fixes the following deadlock.
-        *
-        * http://thread.gmane.org/gmane.linux.kernel/537473
-        */
-       if (!blk_fs_request(rq))
-               return QUEUE_ORDSEQ_DRAIN;
-
-       if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
-           (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
-               return QUEUE_ORDSEQ_DRAIN;
-       else
-               return QUEUE_ORDSEQ_DONE;
-}
-
-void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
-{
-       struct request *rq;
-
-       if (error && !q->orderr)
-               q->orderr = error;
-
-       BUG_ON(q->ordseq & seq);
-       q->ordseq |= seq;
-
-       if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-               return;
-
-       /*
-        * Okay, sequence complete.
-        */
-       q->ordseq = 0;
-       rq = q->orig_bar_rq;
-
-       if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
-               BUG();
-}
-
-static void pre_flush_end_io(struct request *rq, int error)
-{
-       elv_completed_request(rq->q, rq);
-       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
-}
-
-static void bar_end_io(struct request *rq, int error)
-{
-       elv_completed_request(rq->q, rq);
-       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
-}
-
-static void post_flush_end_io(struct request *rq, int error)
-{
-       elv_completed_request(rq->q, rq);
-       blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
-}
-
-static void queue_flush(struct request_queue *q, unsigned which)
-{
-       struct request *rq;
-       rq_end_io_fn *end_io;
-
-       if (which == QUEUE_ORDERED_PREFLUSH) {
-               rq = &q->pre_flush_rq;
-               end_io = pre_flush_end_io;
-       } else {
-               rq = &q->post_flush_rq;
-               end_io = post_flush_end_io;
-       }
-
-       rq->cmd_flags = REQ_HARDBARRIER;
-       rq_init(q, rq);
-       rq->elevator_private = NULL;
-       rq->elevator_private2 = NULL;
-       rq->rq_disk = q->bar_rq.rq_disk;
-       rq->end_io = end_io;
-       q->prepare_flush_fn(q, rq);
-
-       elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-}
-
-static inline struct request *start_ordered(struct request_queue *q,
-                                           struct request *rq)
-{
-       q->orderr = 0;
-       q->ordered = q->next_ordered;
-       q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
-       /*
-        * Prep proxy barrier request.
-        */
-       blkdev_dequeue_request(rq);
-       q->orig_bar_rq = rq;
-       rq = &q->bar_rq;
-       rq->cmd_flags = 0;
-       rq_init(q, rq);
-       if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-               rq->cmd_flags |= REQ_RW;
-       if (q->ordered & QUEUE_ORDERED_FUA)
-               rq->cmd_flags |= REQ_FUA;
-       rq->elevator_private = NULL;
-       rq->elevator_private2 = NULL;
-       init_request_from_bio(rq, q->orig_bar_rq->bio);
-       rq->end_io = bar_end_io;
-
-       /*
-        * Queue ordered sequence.  As we stack them at the head, we
-        * need to queue in reverse order.  Note that we rely on that
-        * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-        * request gets inbetween ordered sequence. If this request is
-        * an empty barrier, we don't need to do a postflush ever since
-        * there will be no data written between the pre and post flush.
-        * Hence a single flush will suffice.
-        */
-       if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
-               queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
-       else
-               q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
-
-       elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-
-       if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
-               queue_flush(q, QUEUE_ORDERED_PREFLUSH);
-               rq = &q->pre_flush_rq;
-       } else
-               q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
-
-       if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
-               q->ordseq |= QUEUE_ORDSEQ_DRAIN;
-       else
-               rq = NULL;
-
-       return rq;
-}
-
-int blk_do_ordered(struct request_queue *q, struct request **rqp)
-{
-       struct request *rq = *rqp;
-       const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
-
-       if (!q->ordseq) {
-               if (!is_barrier)
-                       return 1;
-
-               if (q->next_ordered != QUEUE_ORDERED_NONE) {
-                       *rqp = start_ordered(q, rq);
-                       return 1;
-               } else {
-                       /*
-                        * This can happen when the queue switches to
-                        * ORDERED_NONE while this request is on it.
-                        */
-                       blkdev_dequeue_request(rq);
-                       if (__blk_end_request(rq, -EOPNOTSUPP,
-                                             blk_rq_bytes(rq)))
-                               BUG();
-                       *rqp = NULL;
-                       return 0;
-               }
-       }
-
-       /*
-        * Ordered sequence in progress
-        */
-
-       /* Special requests are not subject to ordering rules. */
-       if (!blk_fs_request(rq) &&
-           rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-               return 1;
-
-       if (q->ordered & QUEUE_ORDERED_TAG) {
-               /* Ordered by tag.  Blocking the next barrier is enough. */
-               if (is_barrier && rq != &q->bar_rq)
-                       *rqp = NULL;
-       } else {
-               /* Ordered by draining.  Wait for turn. */
-               WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-               if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-                       *rqp = NULL;
-       }
-
-       return 1;
-}
-
-static void req_bio_endio(struct request *rq, struct bio *bio,
-                         unsigned int nbytes, int error)
-{
-       struct request_queue *q = rq->q;
-
-       if (&q->bar_rq != rq) {
-               if (error)
-                       clear_bit(BIO_UPTODATE, &bio->bi_flags);
-               else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-                       error = -EIO;
-
-               if (unlikely(nbytes > bio->bi_size)) {
-                       printk("%s: want %u bytes done, only %u left\n",
-                              __FUNCTION__, nbytes, bio->bi_size);
-                       nbytes = bio->bi_size;
-               }
-
-               bio->bi_size -= nbytes;
-               bio->bi_sector += (nbytes >> 9);
-               if (bio->bi_size == 0)
-                       bio_endio(bio, error);
-       } else {
-
-               /*
-                * Okay, this is the barrier request in progress, just
-                * record the error;
-                */
-               if (error && !q->orderr)
-                       q->orderr = error;
-       }
-}
-
-/**
- * blk_queue_bounce_limit - set bounce buffer limit for queue
- * @q:  the request queue for the device
- * @dma_addr:   bus address limit
- *
- * Description:
- *    Different hardware can have different requirements as to what pages
- *    it can do I/O directly to. A low level driver can call
- *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
- *    buffers for doing I/O to pages residing above @dma_addr.
- **/
-void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
-{
-       unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
-       int dma = 0;
-
-       q->bounce_gfp = GFP_NOIO;
-#if BITS_PER_LONG == 64
-       /* Assume anything <= 4GB can be handled by IOMMU.
-          Actually some IOMMUs can handle everything, but I don't
-          know of a way to test this here. */
-       if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
-               dma = 1;
-       q->bounce_pfn = max_low_pfn;
-#else
-       if (bounce_pfn < blk_max_low_pfn)
-               dma = 1;
-       q->bounce_pfn = bounce_pfn;
-#endif
-       if (dma) {
-               init_emergency_isa_pool();
-               q->bounce_gfp = GFP_NOIO | GFP_DMA;
-               q->bounce_pfn = bounce_pfn;
-       }
-}
-
-EXPORT_SYMBOL(blk_queue_bounce_limit);
-
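A minimal usage sketch, assuming a driver-private q obtained from blk_init_queue() and a device that cannot address pages above the 4GB boundary:

        /* keep bounce buffers below what the device can DMA to;
         * BLK_BOUNCE_HIGH would instead only bounce highmem pages */
        blk_queue_bounce_limit(q, 0xffffffffULL);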
-/**
- * blk_queue_max_sectors - set max sectors for a request for this queue
- * @q:  the request queue for the device
- * @max_sectors:  max sectors in the usual 512b unit
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the size of
- *    received requests.
- **/
-void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
-{
-       if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
-               max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
-               printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
-       }
-
-       if (BLK_DEF_MAX_SECTORS > max_sectors)
-               q->max_hw_sectors = q->max_sectors = max_sectors;
-       else {
-               q->max_sectors = BLK_DEF_MAX_SECTORS;
-               q->max_hw_sectors = max_sectors;
-       }
-}
-
-EXPORT_SYMBOL(blk_queue_max_sectors);
-
-/**
- * blk_queue_max_phys_segments - set max phys segments for a request for this queue
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    physical data segments in a request.  This would be the largest sized
- *    scatter list the driver could handle.
- **/
-void blk_queue_max_phys_segments(struct request_queue *q,
-                                unsigned short max_segments)
-{
-       if (!max_segments) {
-               max_segments = 1;
-               printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
-       }
-
-       q->max_phys_segments = max_segments;
-}
-
-EXPORT_SYMBOL(blk_queue_max_phys_segments);
-
-/**
- * blk_queue_max_hw_segments - set max hw segments for a request for this queue
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    hw data segments in a request.  This would be the largest number of
- *    address/length pairs the host adapter can actually give at once
- *    to the device.
- **/
-void blk_queue_max_hw_segments(struct request_queue *q,
-                              unsigned short max_segments)
-{
-       if (!max_segments) {
-               max_segments = 1;
-               printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
-       }
-
-       q->max_hw_segments = max_segments;
-}
-
-EXPORT_SYMBOL(blk_queue_max_hw_segments);
-
-/**
- * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
- * @q:  the request queue for the device
- * @max_size:  max size of segment in bytes
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the size of a
- *    coalesced segment.
- **/
-void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
-{
-       if (max_size < PAGE_CACHE_SIZE) {
-               max_size = PAGE_CACHE_SIZE;
-               printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
-       }
-
-       q->max_segment_size = max_size;
-}
-
-EXPORT_SYMBOL(blk_queue_max_segment_size);
-
-/**
- * blk_queue_hardsect_size - set hardware sector size for the queue
- * @q:  the request queue for the device
- * @size:  the hardware sector size, in bytes
- *
- * Description:
- *   This should typically be set to the lowest possible sector size
- *   that the hardware can operate on without resorting to internal
- *   read-modify-write operations. Usually the default of 512 covers
- *   most hardware.
- **/
-void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
-{
-       q->hardsect_size = size;
-}
-
-EXPORT_SYMBOL(blk_queue_hardsect_size);
-
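The setters above are normally called together once at queue setup time; a sketch with purely illustrative limits, again assuming a driver-private q:

        blk_queue_max_sectors(q, 256);          /* at most 128KB per request */
        blk_queue_max_phys_segments(q, 32);     /* longest scatterlist built */
        blk_queue_max_hw_segments(q, 32);       /* longest list the HBA takes */
        blk_queue_max_segment_size(q, 65536);   /* per-segment byte limit */
        blk_queue_hardsect_size(q, 512);        /* device sector size */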
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))
-
-/**
- * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
- * @t: the stacking driver (top)
- * @b:  the underlying device (bottom)
- **/
-void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
-{
-       /* zero is "infinity" */
-       t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
-       t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
-
-       t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments);
-       t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments);
-       t->max_segment_size = min(t->max_segment_size, b->max_segment_size);
-       t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
-       if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
-               clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
-}
-
-EXPORT_SYMBOL(blk_queue_stack_limits);
-
-/**
- * blk_queue_dma_drain - Set up a drain buffer for excess dma.
- *
- * @q:  the request queue for the device
- * @buf:       physically contiguous buffer
- * @size:      size of the buffer in bytes
- *
- * Some devices have excess DMA problems and can't simply discard (or
- * zero fill) the unwanted piece of the transfer.  They have to have a
- * real area of memory to transfer it into.  The use case for this is
- * ATAPI devices in DMA mode.  If the packet command causes a transfer
- * bigger than the transfer size some HBAs will lock up if there
- * aren't DMA elements to contain the excess transfer.  What this API
- * does is adjust the queue so that the buf is always appended
- * silently to the scatterlist.
- *
- * Note: This routine adjusts max_hw_segments to make room for
- * appending the drain buffer.  If you call
- * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
- * calling this routine, you must set the limit to one fewer than your
- * device can support otherwise there won't be room for the drain
- * buffer.
- */
-int blk_queue_dma_drain(struct request_queue *q, void *buf,
-                               unsigned int size)
-{
-       if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
-               return -EINVAL;
-       /* make room for appending the drain */
-       --q->max_hw_segments;
-       --q->max_phys_segments;
-       q->dma_drain_buffer = buf;
-       q->dma_drain_size = size;
-
-       return 0;
-}
-
-EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
-
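A sketch of the intended ATAPI-style use, assuming the driver has a physically contiguous drain_buf of drain_len bytes set aside at probe time (both names are illustrative):

        /* reserve a scatterlist entry so excess packet data has a landing
         * place; must run before the driver trims its segment limits */
        if (blk_queue_dma_drain(q, drain_buf, drain_len))
                printk(KERN_WARNING "mydrv: queue too small for drain buffer\n");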
-/**
- * blk_queue_segment_boundary - set boundary rules for segment merging
- * @q:  the request queue for the device
- * @mask:  the memory boundary mask
- **/
-void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
-{
-       if (mask < PAGE_CACHE_SIZE - 1) {
-               mask = PAGE_CACHE_SIZE - 1;
-               printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
-       }
-
-       q->seg_boundary_mask = mask;
-}
-
-EXPORT_SYMBOL(blk_queue_segment_boundary);
-
-/**
- * blk_queue_dma_alignment - set dma length and memory alignment
- * @q:     the request queue for the device
- * @mask:  alignment mask
- *
- * Description:
- *    Set required memory and length alignment for direct DMA transactions.
- *    This is used when building direct I/O requests for the queue.
- *
- **/
-void blk_queue_dma_alignment(struct request_queue *q, int mask)
-{
-       q->dma_alignment = mask;
-}
-
-EXPORT_SYMBOL(blk_queue_dma_alignment);
-
-/**
- * blk_queue_update_dma_alignment - update dma length and memory alignment
- * @q:     the request queue for the device
- * @mask:  alignment mask
- *
- * Description:
- *    Update required memory and length alignment for direct DMA transactions.
- *    If the requested alignment is larger than the current alignment, then
- *    the current queue alignment is updated to the new value, otherwise it
- *    is left alone.  The design of this is to allow multiple objects
- *    (driver, device, transport etc) to set their respective
- *    alignments without having them interfere.
- *
- **/
-void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
-{
-       BUG_ON(mask > PAGE_SIZE);
-
-       if (mask > q->dma_alignment)
-               q->dma_alignment = mask;
-}
-
-EXPORT_SYMBOL(blk_queue_update_dma_alignment);
-
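For example, a transport that needs 4-byte-aligned buffers for direct I/O could record that without overriding a stricter limit set elsewhere (sketch only; the mask is the alignment minus one):

        blk_queue_update_dma_alignment(q, 3);   /* 4-byte aligned buffers */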
-/**
- * blk_queue_find_tag - find a request by its tag and queue
- * @q:  The request queue for the device
- * @tag: The tag of the request
- *
- * Notes:
- *    Should be used when a device returns a tag and you want to match
- *    it with a request.
- *
- *    no locks need be held.
- **/
-struct request *blk_queue_find_tag(struct request_queue *q, int tag)
-{
-       return blk_map_queue_find_tag(q->queue_tags, tag);
-}
-
-EXPORT_SYMBOL(blk_queue_find_tag);
-
-/**
- * __blk_free_tags - release a given set of tag maintenance info
- * @bqt:       the tag map to free
- *
- * Tries to free the specified @bqt@.  Returns true if it was
- * actually freed and false if there are still references using it
- */
-static int __blk_free_tags(struct blk_queue_tag *bqt)
-{
-       int retval;
-
-       retval = atomic_dec_and_test(&bqt->refcnt);
-       if (retval) {
-               BUG_ON(bqt->busy);
-
-               kfree(bqt->tag_index);
-               bqt->tag_index = NULL;
-
-               kfree(bqt->tag_map);
-               bqt->tag_map = NULL;
-
-               kfree(bqt);
-
-       }
-
-       return retval;
-}
-
-/**
- * __blk_queue_free_tags - release tag maintenance info
- * @q:  the request queue for the device
- *
- *  Notes:
- *    blk_cleanup_queue() will take care of calling this function, if tagging
- *    has been used. So there's no need to call this directly.
- **/
-static void __blk_queue_free_tags(struct request_queue *q)
-{
-       struct blk_queue_tag *bqt = q->queue_tags;
-
-       if (!bqt)
-               return;
-
-       __blk_free_tags(bqt);
-
-       q->queue_tags = NULL;
-       q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
-}
-
-
-/**
- * blk_free_tags - release a given set of tag maintenance info
- * @bqt:       the tag map to free
- *
- * For an externally managed @bqt@ this frees the map.  Callers of this
- * function must guarantee that all queues which might have been using
- * this tag map have already been released.
- */
-void blk_free_tags(struct blk_queue_tag *bqt)
-{
-       if (unlikely(!__blk_free_tags(bqt)))
-               BUG();
-}
-EXPORT_SYMBOL(blk_free_tags);
-
-/**
- * blk_queue_free_tags - release tag maintenance info
- * @q:  the request queue for the device
- *
- *  Notes:
- *     This is used to disable tagged queuing on a device, yet leave the
- *     queue in function.
- **/
-void blk_queue_free_tags(struct request_queue *q)
-{
-       clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
-}
-
-EXPORT_SYMBOL(blk_queue_free_tags);
-
-static int
-init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
-{
-       struct request **tag_index;
-       unsigned long *tag_map;
-       int nr_ulongs;
-
-       if (q && depth > q->nr_requests * 2) {
-               depth = q->nr_requests * 2;
-               printk(KERN_ERR "%s: adjusted depth to %d\n",
-                               __FUNCTION__, depth);
-       }
-
-       tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
-       if (!tag_index)
-               goto fail;
-
-       nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
-       tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
-       if (!tag_map)
-               goto fail;
-
-       tags->real_max_depth = depth;
-       tags->max_depth = depth;
-       tags->tag_index = tag_index;
-       tags->tag_map = tag_map;
-
-       return 0;
-fail:
-       kfree(tag_index);
-       return -ENOMEM;
-}
-
-static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
-                                                  int depth)
-{
-       struct blk_queue_tag *tags;
-
-       tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
-       if (!tags)
-               goto fail;
-
-       if (init_tag_map(q, tags, depth))
-               goto fail;
-
-       tags->busy = 0;
-       atomic_set(&tags->refcnt, 1);
-       return tags;
-fail:
-       kfree(tags);
-       return NULL;
-}
-
-/**
- * blk_init_tags - initialize the tag info for an external tag map
- * @depth:     the maximum queue depth supported
- **/
-struct blk_queue_tag *blk_init_tags(int depth)
-{
-       return __blk_queue_init_tags(NULL, depth);
-}
-EXPORT_SYMBOL(blk_init_tags);
-
-/**
- * blk_queue_init_tags - initialize the queue tag info
- * @q:  the request queue for the device
- * @depth:  the maximum queue depth supported
- * @tags: the tag to use
- **/
-int blk_queue_init_tags(struct request_queue *q, int depth,
-                       struct blk_queue_tag *tags)
-{
-       int rc;
-
-       BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
-
-       if (!tags && !q->queue_tags) {
-               tags = __blk_queue_init_tags(q, depth);
-
-               if (!tags)
-                       goto fail;
-       } else if (q->queue_tags) {
-               if ((rc = blk_queue_resize_tags(q, depth)))
-                       return rc;
-               set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
-               return 0;
-       } else
-               atomic_inc(&tags->refcnt);
-
-       /*
-        * assign it, all done
-        */
-       q->queue_tags = tags;
-       q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
-       INIT_LIST_HEAD(&q->tag_busy_list);
-       return 0;
-fail:
-       kfree(tags);
-       return -ENOMEM;
-}
-
-EXPORT_SYMBOL(blk_queue_init_tags);
-
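A sketch of switching a queue to tagged operation, assuming the hardware supports MY_QUEUE_DEPTH outstanding commands (an illustrative constant); passing NULL lets the block layer allocate a private tag map:

        if (blk_queue_init_tags(q, MY_QUEUE_DEPTH, NULL))
                printk(KERN_ERR "mydrv: failed to set up tag map\n");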
-/**
- * blk_queue_resize_tags - change the queueing depth
- * @q:  the request queue for the device
- * @new_depth: the new max command queueing depth
- *
- *  Notes:
- *    Must be called with the queue lock held.
- **/
-int blk_queue_resize_tags(struct request_queue *q, int new_depth)
-{
-       struct blk_queue_tag *bqt = q->queue_tags;
-       struct request **tag_index;
-       unsigned long *tag_map;
-       int max_depth, nr_ulongs;
-
-       if (!bqt)
-               return -ENXIO;
-
-       /*
-        * If real_max_depth is already large enough, just adjust
-        * max_depth.  *NOTE* requests with tag values between
-        * new_depth and real_max_depth can be in flight, so the tag
-        * map cannot be shrunk blindly here.
-        */
-       if (new_depth <= bqt->real_max_depth) {
-               bqt->max_depth = new_depth;
-               return 0;
-       }
-
-       /*
-        * Currently cannot replace a shared tag map with a new
-        * one, so error out if this is the case
-        */
-       if (atomic_read(&bqt->refcnt) != 1)
-               return -EBUSY;
-
-       /*
-        * save the old state info, so we can copy it back
-        */
-       tag_index = bqt->tag_index;
-       tag_map = bqt->tag_map;
-       max_depth = bqt->real_max_depth;
-
-       if (init_tag_map(q, bqt, new_depth))
-               return -ENOMEM;
-
-       memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
-       nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
-       memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
-
-       kfree(tag_index);
-       kfree(tag_map);
-       return 0;
-}
-
-EXPORT_SYMBOL(blk_queue_resize_tags);
-
-/**
- * blk_queue_end_tag - end tag operations for a request
- * @q:  the request queue for the device
- * @rq: the request that has completed
- *
- *  Description:
- *    Typically called when end_that_request_first() returns 0, meaning
- *    all transfers have been done for a request. It's important to call
- *    this function before end_that_request_last(), as that will put the
- *    request back on the free list thus corrupting the internal tag list.
- *
- *  Notes:
- *   queue lock must be held.
- **/
-void blk_queue_end_tag(struct request_queue *q, struct request *rq)
-{
-       struct blk_queue_tag *bqt = q->queue_tags;
-       int tag = rq->tag;
-
-       BUG_ON(tag == -1);
-
-       if (unlikely(tag >= bqt->real_max_depth))
-               /*
-                * This can happen after tag depth has been reduced.
-                * FIXME: how about a warning or info message here?
-                */
-               return;
-
-       list_del_init(&rq->queuelist);
-       rq->cmd_flags &= ~REQ_QUEUED;
-       rq->tag = -1;
-
-       if (unlikely(bqt->tag_index[tag] == NULL))
-               printk(KERN_ERR "%s: tag %d is missing\n",
-                      __FUNCTION__, tag);
-
-       bqt->tag_index[tag] = NULL;
-
-       if (unlikely(!test_bit(tag, bqt->tag_map))) {
-               printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
-                      __FUNCTION__, tag);
-               return;
-       }
-       /*
-        * The tag_map bit acts as a lock for tag_index[bit], so we need
-        * unlock memory barrier semantics.
-        */
-       clear_bit_unlock(tag, bqt->tag_map);
-       bqt->busy--;
-}
-
-EXPORT_SYMBOL(blk_queue_end_tag);
-
-/**
- * blk_queue_start_tag - find a free tag and assign it
- * @q:  the request queue for the device
- * @rq:  the block request that needs tagging
- *
- *  Description:
- *    This can either be used as a stand-alone helper, or possibly be
- *    assigned as the queue &prep_rq_fn (in which case &struct request
- *    automagically gets a tag assigned). Note that this function
- *    assumes that any type of request can be queued!  If this is not
- *    true for your device, you must check the request type before
- *    calling this function.  The request will also be removed from
- *    the request queue, so it is the driver's responsibility to re-add
- *    it if it should need to be restarted for some reason.
- *
- *  Notes:
- *   queue lock must be held.
- **/
-int blk_queue_start_tag(struct request_queue *q, struct request *rq)
-{
-       struct blk_queue_tag *bqt = q->queue_tags;
-       int tag;
-
-       if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
-               printk(KERN_ERR 
-                      "%s: request %p for device [%s] already tagged %d\n",
-                      __FUNCTION__, rq,
-                      rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
-               BUG();
-       }
-
-       /*
-        * Protect against shared tag maps, as we may not have exclusive
-        * access to the tag map.
-        */
-       do {
-               tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
-               if (tag >= bqt->max_depth)
-                       return 1;
-
-       } while (test_and_set_bit_lock(tag, bqt->tag_map));
-       /*
-        * We need lock ordering semantics given by test_and_set_bit_lock.
-        * See blk_queue_end_tag for details.
-        */
-
-       rq->cmd_flags |= REQ_QUEUED;
-       rq->tag = tag;
-       bqt->tag_index[tag] = rq;
-       blkdev_dequeue_request(rq);
-       list_add(&rq->queuelist, &q->tag_busy_list);
-       bqt->busy++;
-       return 0;
-}
-
-EXPORT_SYMBOL(blk_queue_start_tag);
-
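Inside a request_fn the tag calls pair up roughly as below; issue_to_hardware() is an illustrative driver hook and error handling is elided. blk_queue_start_tag() already dequeues the request, so the driver must not dequeue it again:

        while ((rq = elv_next_request(q)) != NULL) {
                if (blk_queue_start_tag(q, rq))
                        break;                  /* no free tag, retry later */
                issue_to_hardware(rq);          /* the completion path later
                                                 * calls blk_queue_end_tag()
                                                 * under the queue lock */
        }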
-/**
- * blk_queue_invalidate_tags - invalidate all pending tags
- * @q:  the request queue for the device
- *
- *  Description:
- *   Hardware conditions may dictate a need to stop all pending requests.
- *   In this case, we will safely clear the block side of the tag queue and
- *   re-add all requests to the request queue in the right order.
- *
- *  Notes:
- *   queue lock must be held.
- **/
-void blk_queue_invalidate_tags(struct request_queue *q)
-{
-       struct list_head *tmp, *n;
-
-       list_for_each_safe(tmp, n, &q->tag_busy_list)
-               blk_requeue_request(q, list_entry_rq(tmp));
-}
-
-EXPORT_SYMBOL(blk_queue_invalidate_tags);
-
-void blk_dump_rq_flags(struct request *rq, char *msg)
-{
-       int bit;
-
-       printk("%s: dev %s: type=%x, flags=%x\n", msg,
-               rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
-               rq->cmd_flags);
-
-       printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
-                                                      rq->nr_sectors,
-                                                      rq->current_nr_sectors);
-       printk("bio %p, biotail %p, buffer %p, data %p, len %u\n",
-              rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
-
-       if (blk_pc_request(rq)) {
-               printk("cdb: ");
-               for (bit = 0; bit < sizeof(rq->cmd); bit++)
-                       printk("%02x ", rq->cmd[bit]);
-               printk("\n");
-       }
-}
-
-EXPORT_SYMBOL(blk_dump_rq_flags);
-
-void blk_recount_segments(struct request_queue *q, struct bio *bio)
-{
-       struct request rq;
-       struct bio *nxt = bio->bi_next;
-       rq.q = q;
-       rq.bio = rq.biotail = bio;
-       bio->bi_next = NULL;
-       blk_recalc_rq_segments(&rq);
-       bio->bi_next = nxt;
-       bio->bi_phys_segments = rq.nr_phys_segments;
-       bio->bi_hw_segments = rq.nr_hw_segments;
-       bio->bi_flags |= (1 << BIO_SEG_VALID);
-}
-EXPORT_SYMBOL(blk_recount_segments);
-
-static void blk_recalc_rq_segments(struct request *rq)
-{
-       int nr_phys_segs;
-       int nr_hw_segs;
-       unsigned int phys_size;
-       unsigned int hw_size;
-       struct bio_vec *bv, *bvprv = NULL;
-       int seg_size;
-       int hw_seg_size;
-       int cluster;
-       struct req_iterator iter;
-       int high, highprv = 1;
-       struct request_queue *q = rq->q;
-
-       if (!rq->bio)
-               return;
-
-       cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
-       hw_seg_size = seg_size = 0;
-       phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
-       rq_for_each_segment(bv, rq, iter) {
-               /*
-                * the trick here is making sure that a high page is never
-                * considered part of another segment, since that might
-                * change with the bounce page.
-                */
-               high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
-               if (high || highprv)
-                       goto new_hw_segment;
-               if (cluster) {
-                       if (seg_size + bv->bv_len > q->max_segment_size)
-                               goto new_segment;
-                       if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
-                               goto new_segment;
-                       if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
-                               goto new_segment;
-                       if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-                               goto new_hw_segment;
-
-                       seg_size += bv->bv_len;
-                       hw_seg_size += bv->bv_len;
-                       bvprv = bv;
-                       continue;
-               }
-new_segment:
-               if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
-                   !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-                       hw_seg_size += bv->bv_len;
-               else {
-new_hw_segment:
-                       if (nr_hw_segs == 1 &&
-                           hw_seg_size > rq->bio->bi_hw_front_size)
-                               rq->bio->bi_hw_front_size = hw_seg_size;
-                       hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
-                       nr_hw_segs++;
-               }
-
-               nr_phys_segs++;
-               bvprv = bv;
-               seg_size = bv->bv_len;
-               highprv = high;
-       }
-
-       if (nr_hw_segs == 1 &&
-           hw_seg_size > rq->bio->bi_hw_front_size)
-               rq->bio->bi_hw_front_size = hw_seg_size;
-       if (hw_seg_size > rq->biotail->bi_hw_back_size)
-               rq->biotail->bi_hw_back_size = hw_seg_size;
-       rq->nr_phys_segments = nr_phys_segs;
-       rq->nr_hw_segments = nr_hw_segs;
-}
-
-static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
-                                  struct bio *nxt)
-{
-       if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
-               return 0;
-
-       if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
-               return 0;
-       if (bio->bi_size + nxt->bi_size > q->max_segment_size)
-               return 0;
-
-       /*
-        * bio and nxt are contiguous in memory, check if the queue allows
-        * these two to be merged into one
-        */
-       if (BIO_SEG_BOUNDARY(q, bio, nxt))
-               return 1;
-
-       return 0;
-}
-
-static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
-                                struct bio *nxt)
-{
-       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-               blk_recount_segments(q, bio);
-       if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
-               blk_recount_segments(q, nxt);
-       if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
-           BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
-               return 0;
-       if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
-               return 0;
-
-       return 1;
-}
-
-/*
- * map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries
- */
-int blk_rq_map_sg(struct request_queue *q, struct request *rq,
-                 struct scatterlist *sglist)
-{
-       struct bio_vec *bvec, *bvprv;
-       struct req_iterator iter;
-       struct scatterlist *sg;
-       int nsegs, cluster;
-
-       nsegs = 0;
-       cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
-
-       /*
-        * for each bio in rq
-        */
-       bvprv = NULL;
-       sg = NULL;
-       rq_for_each_segment(bvec, rq, iter) {
-               int nbytes = bvec->bv_len;
-
-               if (bvprv && cluster) {
-                       if (sg->length + nbytes > q->max_segment_size)
-                               goto new_segment;
-
-                       if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-                               goto new_segment;
-                       if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-                               goto new_segment;
-
-                       sg->length += nbytes;
-               } else {
-new_segment:
-                       if (!sg)
-                               sg = sglist;
-                       else {
-                               /*
-                                * If the driver previously mapped a shorter
-                                * list, we could see a termination bit
-                                * prematurely unless it fully inits the sg
-                                * table on each mapping. We KNOW that there
-                                * must be more entries here or the driver
-                                * would be buggy, so force clear the
-                                * termination bit to avoid doing a full
-                                * sg_init_table() in drivers for each command.
-                                */
-                               sg->page_link &= ~0x02;
-                               sg = sg_next(sg);
-                       }
-
-                       sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-                       nsegs++;
-               }
-               bvprv = bvec;
-       } /* segments in rq */
-
-       if (q->dma_drain_size) {
-               sg->page_link &= ~0x02;
-               sg = sg_next(sg);
-               sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
-                           q->dma_drain_size,
-                           ((unsigned long)q->dma_drain_buffer) &
-                           (PAGE_SIZE - 1));
-               nsegs++;
-       }
-
-       if (sg)
-               sg_mark_end(sg);
-
-       return nsegs;
-}
-
-EXPORT_SYMBOL(blk_rq_map_sg);
-
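A typical caller sizes the scatterlist for its queue limits and hands the mapped entries to the DMA API; a sketch in which MY_MAX_SEGMENTS and the dma_map_sg() step are assumptions:

        struct scatterlist sgl[MY_MAX_SEGMENTS];
        int count;

        sg_init_table(sgl, MY_MAX_SEGMENTS);
        count = blk_rq_map_sg(q, rq, sgl);
        /* sgl[0..count-1] is now filled in and end-marked; map it with
         * dma_map_sg(dev, sgl, count, direction) before programming DMA */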
-/*
- * the standard queue merge functions, can be overridden with device
- * specific ones if so desired
- */
-
-static inline int ll_new_mergeable(struct request_queue *q,
-                                  struct request *req,
-                                  struct bio *bio)
-{
-       int nr_phys_segs = bio_phys_segments(q, bio);
-
-       if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
-               req->cmd_flags |= REQ_NOMERGE;
-               if (req == q->last_merge)
-                       q->last_merge = NULL;
-               return 0;
-       }
-
-       /*
-        * A hw segment is just getting larger, bump just the phys
-        * counter.
-        */
-       req->nr_phys_segments += nr_phys_segs;
-       return 1;
-}
-
-static inline int ll_new_hw_segment(struct request_queue *q,
-                                   struct request *req,
-                                   struct bio *bio)
-{
-       int nr_hw_segs = bio_hw_segments(q, bio);
-       int nr_phys_segs = bio_phys_segments(q, bio);
-
-       if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
-           || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
-               req->cmd_flags |= REQ_NOMERGE;
-               if (req == q->last_merge)
-                       q->last_merge = NULL;
-               return 0;
-       }
-
-       /*
-        * This will form the start of a new hw segment.  Bump both
-        * counters.
-        */
-       req->nr_hw_segments += nr_hw_segs;
-       req->nr_phys_segments += nr_phys_segs;
-       return 1;
-}
-
-static int ll_back_merge_fn(struct request_queue *q, struct request *req,
-                           struct bio *bio)
-{
-       unsigned short max_sectors;
-       int len;
-
-       if (unlikely(blk_pc_request(req)))
-               max_sectors = q->max_hw_sectors;
-       else
-               max_sectors = q->max_sectors;
-
-       if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
-               req->cmd_flags |= REQ_NOMERGE;
-               if (req == q->last_merge)
-                       q->last_merge = NULL;
-               return 0;
-       }
-       if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
-               blk_recount_segments(q, req->biotail);
-       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-               blk_recount_segments(q, bio);
-       len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
-       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
-           !BIOVEC_VIRT_OVERSIZE(len)) {
-               int mergeable =  ll_new_mergeable(q, req, bio);
-
-               if (mergeable) {
-                       if (req->nr_hw_segments == 1)
-                               req->bio->bi_hw_front_size = len;
-                       if (bio->bi_hw_segments == 1)
-                               bio->bi_hw_back_size = len;
-               }
-               return mergeable;
-       }
-
-       return ll_new_hw_segment(q, req, bio);
-}
-
-static int ll_front_merge_fn(struct request_queue *q, struct request *req, 
-                            struct bio *bio)
-{
-       unsigned short max_sectors;
-       int len;
-
-       if (unlikely(blk_pc_request(req)))
-               max_sectors = q->max_hw_sectors;
-       else
-               max_sectors = q->max_sectors;
-
-
-       if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
-               req->cmd_flags |= REQ_NOMERGE;
-               if (req == q->last_merge)
-                       q->last_merge = NULL;
-               return 0;
-       }
-       len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
-       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-               blk_recount_segments(q, bio);
-       if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
-               blk_recount_segments(q, req->bio);
-       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
-           !BIOVEC_VIRT_OVERSIZE(len)) {
-               int mergeable =  ll_new_mergeable(q, req, bio);
-
-               if (mergeable) {
-                       if (bio->bi_hw_segments == 1)
-                               bio->bi_hw_front_size = len;
-                       if (req->nr_hw_segments == 1)
-                               req->biotail->bi_hw_back_size = len;
-               }
-               return mergeable;
-       }
-
-       return ll_new_hw_segment(q, req, bio);
-}
-
-static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
-                               struct request *next)
-{
-       int total_phys_segments;
-       int total_hw_segments;
-
-       /*
-        * First check whether either of the requests is a re-queued
-        * request.  Can't merge them if so.
-        */
-       if (req->special || next->special)
-               return 0;
-
-       /*
-        * Will it become too large?
-        */
-       if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
-               return 0;
-
-       total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
-       if (blk_phys_contig_segment(q, req->biotail, next->bio))
-               total_phys_segments--;
-
-       if (total_phys_segments > q->max_phys_segments)
-               return 0;
-
-       total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
-       if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
-               int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
-               /*
-                * propagate the combined length to the end of the requests
-                */
-               if (req->nr_hw_segments == 1)
-                       req->bio->bi_hw_front_size = len;
-               if (next->nr_hw_segments == 1)
-                       next->biotail->bi_hw_back_size = len;
-               total_hw_segments--;
-       }
-
-       if (total_hw_segments > q->max_hw_segments)
-               return 0;
-
-       /* Merge is OK... */
-       req->nr_phys_segments = total_phys_segments;
-       req->nr_hw_segments = total_hw_segments;
-       return 1;
-}
-
-/*
- * "plug" the device if there are no outstanding requests: this will
- * force the transfer to start only after we have put all the requests
- * on the list.
- *
- * This is called with interrupts off and no requests on the queue and
- * with the queue lock held.
- */
-void blk_plug_device(struct request_queue *q)
-{
-       WARN_ON(!irqs_disabled());
-
-       /*
-        * don't plug a stopped queue, it must be paired with blk_start_queue()
-        * which will restart the queueing
-        */
-       if (blk_queue_stopped(q))
-               return;
-
-       if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
-               mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-               blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
-       }
-}
-
-EXPORT_SYMBOL(blk_plug_device);
-
-/*
- * remove the queue from the plugged list, if present. called with
- * queue lock held and interrupts disabled.
- */
-int blk_remove_plug(struct request_queue *q)
-{
-       WARN_ON(!irqs_disabled());
-
-       if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
-               return 0;
-
-       del_timer(&q->unplug_timer);
-       return 1;
-}
-
-EXPORT_SYMBOL(blk_remove_plug);
-
-/*
- * remove the plug and let it rip..
- */
-void __generic_unplug_device(struct request_queue *q)
-{
-       if (unlikely(blk_queue_stopped(q)))
-               return;
-
-       if (!blk_remove_plug(q))
-               return;
-
-       q->request_fn(q);
-}
-EXPORT_SYMBOL(__generic_unplug_device);
-
-/**
- * generic_unplug_device - fire a request queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   Linux uses plugging to build bigger request queues before letting
- *   the device have at them. If a queue is plugged, the I/O scheduler
- *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged, the request_fn defined for the queue is invoked and
- *   transfers started.
- **/
-void generic_unplug_device(struct request_queue *q)
-{
-       spin_lock_irq(q->queue_lock);
-       __generic_unplug_device(q);
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(generic_unplug_device);
-
-static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
-                                  struct page *page)
-{
-       struct request_queue *q = bdi->unplug_io_data;
-
-       blk_unplug(q);
-}
-
-static void blk_unplug_work(struct work_struct *work)
-{
-       struct request_queue *q =
-               container_of(work, struct request_queue, unplug_work);
-
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
-       q->unplug_fn(q);
-}
-
-static void blk_unplug_timeout(unsigned long data)
-{
-       struct request_queue *q = (struct request_queue *)data;
-
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
-       kblockd_schedule_work(&q->unplug_work);
-}
-
-void blk_unplug(struct request_queue *q)
-{
-       /*
-        * devices don't necessarily have an ->unplug_fn defined
-        */
-       if (q->unplug_fn) {
-               blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                                       q->rq.count[READ] + q->rq.count[WRITE]);
-
-               q->unplug_fn(q);
-       }
-}
-EXPORT_SYMBOL(blk_unplug);
-
-/**
- * blk_start_queue - restart a previously stopped queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   blk_start_queue() will clear the stop flag on the queue, and call
- *   the request_fn for the queue if it was in a stopped state when
- *   entered. Also see blk_stop_queue(). Queue lock must be held.
- **/
-void blk_start_queue(struct request_queue *q)
-{
-       WARN_ON(!irqs_disabled());
-
-       clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
-
-       /*
-        * one level of recursion is ok and is much faster than kicking
-        * the unplug handling
-        */
-       if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
-               q->request_fn(q);
-               clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
-       } else {
-               blk_plug_device(q);
-               kblockd_schedule_work(&q->unplug_work);
-       }
-}
-
-EXPORT_SYMBOL(blk_start_queue);
-
-/**
- * blk_stop_queue - stop a queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   The Linux block layer assumes that a block driver will consume all
- *   entries on the request queue when the request_fn strategy is called.
- *   Often this will not happen, because of hardware limitations (queue
- *   depth settings). If a device driver gets a 'queue full' response,
- *   or if it simply chooses not to queue more I/O at one point, it can
- *   call this function to prevent the request_fn from being called until
- *   the driver has signalled it's ready to go again. This happens by calling
- *   blk_start_queue() to restart queue operations. Queue lock must be held.
- **/
-void blk_stop_queue(struct request_queue *q)
-{
-       blk_remove_plug(q);
-       set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
-}
-EXPORT_SYMBOL(blk_stop_queue);
-
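The usual 'queue full' pattern, sketched assuming a driver-private hardware_queue_full() predicate and a request_fn running with the queue lock held:

        /* in the request_fn, queue lock held */
        if (hardware_queue_full(dev)) {
                blk_requeue_request(q, rq);
                blk_stop_queue(q);
                return;
        }

        /* later, from the completion interrupt, again under the queue lock */
        blk_start_queue(q);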
-/**
- * blk_sync_queue - cancel any pending callbacks on a queue
- * @q: the queue
- *
- * Description:
- *     The block layer may perform asynchronous callback activity
- *     on a queue, such as calling the unplug function after a timeout.
- *     A block device may call blk_sync_queue to ensure that any
- *     such activity is cancelled, thus allowing it to release resources
- *     that the callbacks might use. The caller must already have made sure
- *     that its ->make_request_fn will not re-add plugging prior to calling
- *     this function.
- *
- */
-void blk_sync_queue(struct request_queue *q)
-{
-       del_timer_sync(&q->unplug_timer);
-       kblockd_flush_work(&q->unplug_work);
-}
-EXPORT_SYMBOL(blk_sync_queue);
-
-/**
- * blk_run_queue - run a single device queue
- * @q: The queue to run
- */
-void blk_run_queue(struct request_queue *q)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       blk_remove_plug(q);
-
-       /*
-        * Only recurse once to avoid overrunning the stack, let the unplug
-        * handling reinvoke the handler shortly if we already got there.
-        */
-       if (!elv_queue_empty(q)) {
-               if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
-                       q->request_fn(q);
-                       clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
-               } else {
-                       blk_plug_device(q);
-                       kblockd_schedule_work(&q->unplug_work);
-               }
-       }
-
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_run_queue);
-
-/**
- * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
- * @kobj:    the kobj belonging to the request queue to be released
- *
- * Description:
- *     blk_cleanup_queue is the pair to blk_init_queue() or
- *     blk_queue_make_request().  It should be called when a request queue is
- *     being released; typically when a block device is being de-registered.
- *     Currently, its primary task is to free all the &struct request
- *     structures that were allocated to the queue and the queue itself.
- *
- * Caveat:
- *     Hopefully the low level driver will have finished any
- *     outstanding requests first...
- **/
-static void blk_release_queue(struct kobject *kobj)
-{
-       struct request_queue *q =
-               container_of(kobj, struct request_queue, kobj);
-       struct request_list *rl = &q->rq;
-
-       blk_sync_queue(q);
-
-       if (rl->rq_pool)
-               mempool_destroy(rl->rq_pool);
-
-       if (q->queue_tags)
-               __blk_queue_free_tags(q);
-
-       blk_trace_shutdown(q);
-
-       bdi_destroy(&q->backing_dev_info);
-       kmem_cache_free(requestq_cachep, q);
-}
-
-void blk_put_queue(struct request_queue *q)
-{
-       kobject_put(&q->kobj);
-}
-EXPORT_SYMBOL(blk_put_queue);
-
-void blk_cleanup_queue(struct request_queue * q)
-{
-       mutex_lock(&q->sysfs_lock);
-       set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
-       mutex_unlock(&q->sysfs_lock);
-
-       if (q->elevator)
-               elevator_exit(q->elevator);
-
-       blk_put_queue(q);
-}
-
-EXPORT_SYMBOL(blk_cleanup_queue);
-
-static int blk_init_free_list(struct request_queue *q)
-{
-       struct request_list *rl = &q->rq;
-
-       rl->count[READ] = rl->count[WRITE] = 0;
-       rl->starved[READ] = rl->starved[WRITE] = 0;
-       rl->elvpriv = 0;
-       init_waitqueue_head(&rl->wait[READ]);
-       init_waitqueue_head(&rl->wait[WRITE]);
-
-       rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-                               mempool_free_slab, request_cachep, q->node);
-
-       if (!rl->rq_pool)
-               return -ENOMEM;
-
-       return 0;
-}
-
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
-{
-       return blk_alloc_queue_node(gfp_mask, -1);
-}
-EXPORT_SYMBOL(blk_alloc_queue);
-
-static struct kobj_type queue_ktype;
-
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
-{
-       struct request_queue *q;
-       int err;
-
-       q = kmem_cache_alloc_node(requestq_cachep,
-                               gfp_mask | __GFP_ZERO, node_id);
-       if (!q)
-               return NULL;
-
-       q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
-       q->backing_dev_info.unplug_io_data = q;
-       err = bdi_init(&q->backing_dev_info);
-       if (err) {
-               kmem_cache_free(requestq_cachep, q);
-               return NULL;
-       }
-
-       init_timer(&q->unplug_timer);
-
-       kobject_init(&q->kobj, &queue_ktype);
-
-       mutex_init(&q->sysfs_lock);
-
-       return q;
-}
-EXPORT_SYMBOL(blk_alloc_queue_node);
-
-/**
- * blk_init_queue  - prepare a request queue for use with a block device
- * @rfn:  The function to be called to process requests that have been
- *        placed on the queue.
- * @lock: Request queue spin lock
- *
- * Description:
- *    If a block device wishes to use the standard request handling procedures,
- *    which sorts requests and coalesces adjacent requests, then it must
- *    call blk_init_queue().  The function @rfn will be called when there
- *    are requests on the queue that need to be processed.  If the device
- *    supports plugging, then @rfn may not be called immediately when requests
- *    are available on the queue, but may be called at some time later instead.
- *    Plugged queues are generally unplugged when a buffer belonging to one
- *    of the requests on the queue is needed, or due to memory pressure.
- *
- *    @rfn is not required, or even expected, to remove all requests off the
- *    queue, but only as many as it can handle at a time.  If it does leave
- *    requests on the queue, it is responsible for arranging that the requests
- *    get dealt with eventually.
- *
- *    The queue spin lock must be held while manipulating the requests on the
- *    request queue; this lock will be taken also from interrupt context, so irq
- *    disabling is needed for it.
- *
- *    Function returns a pointer to the initialized request queue, or NULL if
- *    it didn't succeed.
- *
- * Note:
- *    blk_init_queue() must be paired with a blk_cleanup_queue() call
- *    when the block device is deactivated (such as at module unload).
- **/
-
-struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
-{
-       return blk_init_queue_node(rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_queue);
-
-struct request_queue *
-blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
-{
-       struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
-
-       if (!q)
-               return NULL;
-
-       q->node = node_id;
-       if (blk_init_free_list(q)) {
-               kmem_cache_free(requestq_cachep, q);
-               return NULL;
-       }
-
-       /*
-        * if caller didn't supply a lock, they get per-queue locking with
-        * our embedded lock
-        */
-       if (!lock) {
-               spin_lock_init(&q->__queue_lock);
-               lock = &q->__queue_lock;
-       }
-
-       q->request_fn           = rfn;
-       q->prep_rq_fn           = NULL;
-       q->unplug_fn            = generic_unplug_device;
-       q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
-       q->queue_lock           = lock;
-
-       blk_queue_segment_boundary(q, 0xffffffff);
-
-       blk_queue_make_request(q, __make_request);
-       blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
-
-       blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
-       blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
-
-       q->sg_reserved_size = INT_MAX;
-
-       /*
-        * all done
-        */
-       if (!elevator_init(q, NULL)) {
-               blk_queue_congestion_threshold(q);
-               return q;
-       }
-
-       blk_put_queue(q);
-       return NULL;
-}
-EXPORT_SYMBOL(blk_init_queue_node);
-
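A minimal probe-time sketch of the pairing described above; mydrv_request() and mydrv_lock are illustrative driver-private names:

        static DEFINE_SPINLOCK(mydrv_lock);

        q = blk_init_queue(mydrv_request, &mydrv_lock);
        if (!q)
                return -ENOMEM;
        /* configure limits here; call blk_cleanup_queue(q) on teardown */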
-int blk_get_queue(struct request_queue *q)
-{
-       if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-               kobject_get(&q->kobj);
-               return 0;
-       }
-
-       return 1;
-}
-
-EXPORT_SYMBOL(blk_get_queue);
-
-static inline void blk_free_request(struct request_queue *q, struct request *rq)
-{
-       if (rq->cmd_flags & REQ_ELVPRIV)
-               elv_put_request(q, rq);
-       mempool_free(rq, q->rq.rq_pool);
-}
-
-static struct request *
-blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
-{
-       struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
-
-       if (!rq)
-               return NULL;
-
-       /*
-        * first three bits are identical in rq->cmd_flags and bio->bi_rw,
-        * see bio.h and blkdev.h
-        */
-       rq->cmd_flags = rw | REQ_ALLOCED;
-
-       if (priv) {
-               if (unlikely(elv_set_request(q, rq, gfp_mask))) {
-                       mempool_free(rq, q->rq.rq_pool);
-                       return NULL;
-               }
-               rq->cmd_flags |= REQ_ELVPRIV;
-       }
-
-       return rq;
-}
-
-/*
- * ioc_batching returns true if the ioc is a valid batching request and
- * should be given priority access to a request.
- */
-static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
-{
-       if (!ioc)
-               return 0;
-
-       /*
-        * Make sure the process is able to allocate at least 1 request
-        * even if the batch times out, otherwise we could theoretically
-        * lose wakeups.
-        */
-       return ioc->nr_batch_requests == q->nr_batching ||
-               (ioc->nr_batch_requests > 0
-               && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
-}
-
-/*
- * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
- * will cause the process to be a "batcher" on all queues in the system. This
- * is the behaviour we want though - once it gets a wakeup it should be given
- * a nice run.
- */
-static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
-{
-       if (!ioc || ioc_batching(q, ioc))
-               return;
-
-       ioc->nr_batch_requests = q->nr_batching;
-       ioc->last_waited = jiffies;
-}
-
-static void __freed_request(struct request_queue *q, int rw)
-{
-       struct request_list *rl = &q->rq;
-
-       if (rl->count[rw] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, rw);
-
-       if (rl->count[rw] + 1 <= q->nr_requests) {
-               if (waitqueue_active(&rl->wait[rw]))
-                       wake_up(&rl->wait[rw]);
-
-               blk_clear_queue_full(q, rw);
-       }
-}
-
-/*
- * A request has just been released.  Account for it, update the full and
- * congestion status, wake up any waiters.   Called under q->queue_lock.
- */
-static void freed_request(struct request_queue *q, int rw, int priv)
-{
-       struct request_list *rl = &q->rq;
-
-       rl->count[rw]--;
-       if (priv)
-               rl->elvpriv--;
-
-       __freed_request(q, rw);
-
-       if (unlikely(rl->starved[rw ^ 1]))
-               __freed_request(q, rw ^ 1);
-}
-
-#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
-/*
- * Get a free request, queue_lock must be held.
- * Returns NULL on failure, with queue_lock held.
- * Returns !NULL on success, with queue_lock *not held*.
- */
-static struct request *get_request(struct request_queue *q, int rw_flags,
-                                  struct bio *bio, gfp_t gfp_mask)
-{
-       struct request *rq = NULL;
-       struct request_list *rl = &q->rq;
-       struct io_context *ioc = NULL;
-       const int rw = rw_flags & 0x01;
-       int may_queue, priv;
-
-       may_queue = elv_may_queue(q, rw_flags);
-       if (may_queue == ELV_MQUEUE_NO)
-               goto rq_starved;
-
-       if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
-               if (rl->count[rw]+1 >= q->nr_requests) {
-                       ioc = current_io_context(GFP_ATOMIC, q->node);
-                       /*
-                        * The queue will fill after this allocation, so set
-                        * it as full, and mark this process as "batching".
-                        * This process will be allowed to complete a batch of
-                        * requests, others will be blocked.
-                        */
-                       if (!blk_queue_full(q, rw)) {
-                               ioc_set_batching(q, ioc);
-                               blk_set_queue_full(q, rw);
-                       } else {
-                               if (may_queue != ELV_MQUEUE_MUST
-                                               && !ioc_batching(q, ioc)) {
-                                       /*
-                                        * The queue is full and the allocating
-                                        * process is not a "batcher", and not
-                                        * exempted by the IO scheduler
-                                        */
-                                       goto out;
-                               }
-                       }
-               }
-               blk_set_queue_congested(q, rw);
-       }
-
-       /*
-        * Only allow batching queuers to allocate up to 50% over the defined
-        * limit of requests, otherwise we could have thousands of requests
-        * allocated with any setting of ->nr_requests
-        */
-       if (rl->count[rw] >= (3 * q->nr_requests / 2))
-               goto out;
-
-       rl->count[rw]++;
-       rl->starved[rw] = 0;
-
-       priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-       if (priv)
-               rl->elvpriv++;
-
-       spin_unlock_irq(q->queue_lock);
-
-       rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
-       if (unlikely(!rq)) {
-               /*
-                * Allocation failed presumably due to memory. Undo anything
-                * we might have messed up.
-                *
-                * Allocating task should really be put onto the front of the
-                * wait queue, but this is pretty rare.
-                */
-               spin_lock_irq(q->queue_lock);
-               freed_request(q, rw, priv);
-
-               /*
-                * In the very unlikely event that allocation failed and no
-                * requests for this direction were pending, mark us starved
-                * so that freeing of a request in the other direction will
-                * notice us.  Another possible fix would be to split the
-                * rq mempool into READ and WRITE.
-                */
-rq_starved:
-               if (unlikely(rl->count[rw] == 0))
-                       rl->starved[rw] = 1;
-
-               goto out;
-       }
-
-       /*
-        * ioc may be NULL here, and ioc_batching will be false. That's
-        * OK, if the queue is under the request limit then requests need
-        * not count toward the nr_batch_requests limit. There will always
-        * be some limit enforced by BLK_BATCH_TIME.
-        */
-       if (ioc_batching(q, ioc))
-               ioc->nr_batch_requests--;
-       
-       rq_init(q, rq);
-
-       blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
-out:
-       return rq;
-}
-
-/*
- * No available requests for this queue, unplug the device and wait for some
- * requests to become available.
- *
- * Called with q->queue_lock held, and returns with it unlocked.
- */
-static struct request *get_request_wait(struct request_queue *q, int rw_flags,
-                                       struct bio *bio)
-{
-       const int rw = rw_flags & 0x01;
-       struct request *rq;
-
-       rq = get_request(q, rw_flags, bio, GFP_NOIO);
-       while (!rq) {
-               DEFINE_WAIT(wait);
-               struct request_list *rl = &q->rq;
-
-               prepare_to_wait_exclusive(&rl->wait[rw], &wait,
-                               TASK_UNINTERRUPTIBLE);
-
-               rq = get_request(q, rw_flags, bio, GFP_NOIO);
-
-               if (!rq) {
-                       struct io_context *ioc;
-
-                       blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
-
-                       __generic_unplug_device(q);
-                       spin_unlock_irq(q->queue_lock);
-                       io_schedule();
-
-                       /*
-                        * After sleeping, we become a "batching" process and
-                        * will be able to allocate at least one request, and
-                        * up to a big batch of them for a small period of time.
-                        * See ioc_batching, ioc_set_batching
-                        */
-                       ioc = current_io_context(GFP_NOIO, q->node);
-                       ioc_set_batching(q, ioc);
-
-                       spin_lock_irq(q->queue_lock);
-               }
-               finish_wait(&rl->wait[rw], &wait);
-       }
-
-       return rq;
-}
-
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
-{
-       struct request *rq;
-
-       BUG_ON(rw != READ && rw != WRITE);
-
-       spin_lock_irq(q->queue_lock);
-       if (gfp_mask & __GFP_WAIT) {
-               rq = get_request_wait(q, rw, NULL);
-       } else {
-               rq = get_request(q, rw, NULL, gfp_mask);
-               if (!rq)
-                       spin_unlock_irq(q->queue_lock);
-       }
-       /* q->queue_lock is unlocked at this point */
-
-       return rq;
-}
-EXPORT_SYMBOL(blk_get_request);
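For illustration, a minimal sketch (not from this changeset) of how a driver might pair blk_get_request() with blk_put_request(); the queue pointer and the REQ_TYPE_BLOCK_PC setup are assumptions, not code from this patch:

    /* Sketch only: allocate a request for a passthrough command and release it.
     * GFP_KERNEL implies __GFP_WAIT, so this path sleeps instead of failing. */
    static int example_alloc_request(struct request_queue *q)
    {
            struct request *rq;

            rq = blk_get_request(q, WRITE, GFP_KERNEL);
            if (!rq)
                    return -ENOMEM;         /* only possible without __GFP_WAIT */

            rq->cmd_type = REQ_TYPE_BLOCK_PC;       /* treat as a passthrough */
            /* ... fill in rq->cmd[], rq->timeout, data, etc. ... */

            blk_put_request(rq);            /* drops the reference taken above */
            return 0;
    }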
-
-/**
- * blk_start_queueing - initiate dispatch of requests to device
- * @q:         request queue to kick into gear
- *
- * This is basically a helper to remove the need to know whether a queue
- * is plugged or not if someone just wants to initiate dispatch of requests
- * for this queue.
- *
- * The queue lock must be held with interrupts disabled.
- */
-void blk_start_queueing(struct request_queue *q)
-{
-       if (!blk_queue_plugged(q))
-               q->request_fn(q);
-       else
-               __generic_unplug_device(q);
-}
-EXPORT_SYMBOL(blk_start_queueing);
-
-/**
- * blk_requeue_request - put a request back on queue
- * @q:         request queue where request should be inserted
- * @rq:                request to be inserted
- *
- * Description:
- *    Drivers often keep queueing requests until the hardware cannot accept
- *    more, when that condition happens we need to put the request back
- *    on the queue. Must be called with queue lock held.
- */
-void blk_requeue_request(struct request_queue *q, struct request *rq)
-{
-       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
-
-       if (blk_rq_tagged(rq))
-               blk_queue_end_tag(q, rq);
-
-       elv_requeue_request(q, rq);
-}
-
-EXPORT_SYMBOL(blk_requeue_request);
-
-/**
- * blk_insert_request - insert a special request in to a request queue
- * @q:         request queue where request should be inserted
- * @rq:                request to be inserted
- * @at_head:   insert request at head or tail of queue
- * @data:      private data
- *
- * Description:
- *    Many block devices need to execute commands asynchronously, so they don't
- *    block the whole kernel from preemption during request execution.  This is
- *    accomplished normally by inserting artificial requests tagged as
- *    REQ_SPECIAL into the corresponding request queue, and letting them be
- *    scheduled for actual execution by the request queue.
- *
- *    We have the option of inserting the head or the tail of the queue.
- *    Typically we use the tail for new ioctls and so forth.  We use the head
- *    of the queue for things like a QUEUE_FULL message from a device, or a
- *    host that is unable to accept a particular command.
- */
-void blk_insert_request(struct request_queue *q, struct request *rq,
-                       int at_head, void *data)
-{
-       int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-       unsigned long flags;
-
-       /*
-        * tell I/O scheduler that this isn't a regular read/write (ie it
-        * must not attempt merges on this) and that it acts as a soft
-        * barrier
-        */
-       rq->cmd_type = REQ_TYPE_SPECIAL;
-       rq->cmd_flags |= REQ_SOFTBARRIER;
-
-       rq->special = data;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-
-       /*
-        * If command is tagged, release the tag
-        */
-       if (blk_rq_tagged(rq))
-               blk_queue_end_tag(q, rq);
-
-       drive_stat_acct(rq, 1);
-       __elv_add_request(q, rq, where, 0);
-       blk_start_queueing(q);
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-EXPORT_SYMBOL(blk_insert_request);
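A hedged sketch of the QUEUE_FULL-style usage the description mentions: push a driver-private command to the head of the queue. my_disk and my_cmd are hypothetical driver objects:

    static void example_insert_special(struct request_queue *q,
                                       struct gendisk *my_disk, void *my_cmd)
    {
            struct request *rq = blk_get_request(q, READ, GFP_ATOMIC);

            if (!rq)
                    return;                 /* GFP_ATOMIC allocations may fail */

            rq->rq_disk = my_disk;
            /* blk_insert_request() marks rq REQ_TYPE_SPECIAL and stores my_cmd
             * in rq->special; at_head = 1 queues it ahead of normal I/O. */
            blk_insert_request(q, rq, 1, my_cmd);
    }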
-
-static int __blk_rq_unmap_user(struct bio *bio)
-{
-       int ret = 0;
-
-       if (bio) {
-               if (bio_flagged(bio, BIO_USER_MAPPED))
-                       bio_unmap_user(bio);
-               else
-                       ret = bio_uncopy_user(bio);
-       }
-
-       return ret;
-}
-
-int blk_rq_append_bio(struct request_queue *q, struct request *rq,
-                     struct bio *bio)
-{
-       if (!rq->bio)
-               blk_rq_bio_prep(q, rq, bio);
-       else if (!ll_back_merge_fn(q, rq, bio))
-               return -EINVAL;
-       else {
-               rq->biotail->bi_next = bio;
-               rq->biotail = bio;
-
-               rq->data_len += bio->bi_size;
-       }
-       return 0;
-}
-EXPORT_SYMBOL(blk_rq_append_bio);
-
-static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
-                            void __user *ubuf, unsigned int len)
-{
-       unsigned long uaddr;
-       struct bio *bio, *orig_bio;
-       int reading, ret;
-
-       reading = rq_data_dir(rq) == READ;
-
-       /*
-        * if alignment requirement is satisfied, map in user pages for
-        * direct dma. else, set up kernel bounce buffers
-        */
-       uaddr = (unsigned long) ubuf;
-       if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
-               bio = bio_map_user(q, NULL, uaddr, len, reading);
-       else
-               bio = bio_copy_user(q, uaddr, len, reading);
-
-       if (IS_ERR(bio))
-               return PTR_ERR(bio);
-
-       orig_bio = bio;
-       blk_queue_bounce(q, &bio);
-
-       /*
-        * We link the bounce buffer in and could have to traverse it
-        * later so we have to get a ref to prevent it from being freed
-        */
-       bio_get(bio);
-
-       ret = blk_rq_append_bio(q, rq, bio);
-       if (!ret)
-               return bio->bi_size;
-
-       /* if it was bounced we must call the end io function */
-       bio_endio(bio, 0);
-       __blk_rq_unmap_user(orig_bio);
-       bio_put(bio);
-       return ret;
-}
-
-/**
- * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
- * @q:         request queue where request should be inserted
- * @rq:                request structure to fill
- * @ubuf:      the user buffer
- * @len:       length of user data
- *
- * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
- *    a kernel bounce buffer is used.
- *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
- *    still in process context.
- *
- *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
- *    before being submitted to the device, as pages mapped may be out of
- *    reach. It's the caller's responsibility to make sure this happens. The
- *    original bio must be passed back in to blk_rq_unmap_user() for proper
- *    unmapping.
- */
-int blk_rq_map_user(struct request_queue *q, struct request *rq,
-                   void __user *ubuf, unsigned long len)
-{
-       unsigned long bytes_read = 0;
-       struct bio *bio = NULL;
-       int ret;
-
-       if (len > (q->max_hw_sectors << 9))
-               return -EINVAL;
-       if (!len || !ubuf)
-               return -EINVAL;
-
-       while (bytes_read != len) {
-               unsigned long map_len, end, start;
-
-               map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
-               end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
-                                                               >> PAGE_SHIFT;
-               start = (unsigned long)ubuf >> PAGE_SHIFT;
-
-               /*
-                * A bad offset could cause us to require BIO_MAX_PAGES + 1
-                * pages. If this happens we just lower the requested
-                * mapping len by a page so that we can fit
-                */
-               if (end - start > BIO_MAX_PAGES)
-                       map_len -= PAGE_SIZE;
-
-               ret = __blk_rq_map_user(q, rq, ubuf, map_len);
-               if (ret < 0)
-                       goto unmap_rq;
-               if (!bio)
-                       bio = rq->bio;
-               bytes_read += ret;
-               ubuf += ret;
-       }
-
-       rq->buffer = rq->data = NULL;
-       return 0;
-unmap_rq:
-       blk_rq_unmap_user(bio);
-       return ret;
-}
-
-EXPORT_SYMBOL(blk_rq_map_user);
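A hedged sketch of the full map/execute/unmap cycle the note above requires, loosely following how SG_IO-style ioctls drive this API; error handling is abbreviated and the passthrough setup is assumed:

    static int example_map_user(struct request_queue *q, struct gendisk *disk,
                                void __user *ubuf, unsigned long len)
    {
            struct request *rq = blk_get_request(q, WRITE, GFP_KERNEL);
            struct bio *bio;
            int ret;

            rq->cmd_type = REQ_TYPE_BLOCK_PC;
            ret = blk_rq_map_user(q, rq, ubuf, len);
            if (ret)
                    goto out;

            bio = rq->bio;          /* save now; completion may change rq->bio */
            ret = blk_execute_rq(q, disk, rq, 0);
            blk_rq_unmap_user(bio); /* unmap, or copy back the bounce buffer */
    out:
            blk_put_request(rq);
            return ret;
    }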
-
-/**
- * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
- * @q:         request queue where request should be inserted
- * @rq:                request to map data to
- * @iov:       pointer to the iovec
- * @iov_count: number of elements in the iovec
- * @len:       I/O byte count
- *
- * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
- *    a kernel bounce buffer is used.
- *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
- *    still in process context.
- *
- *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
- *    before being submitted to the device, as pages mapped may be out of
- *    reach. It's the caller's responsibility to make sure this happens. The
- *    original bio must be passed back in to blk_rq_unmap_user() for proper
- *    unmapping.
- */
-int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
-                       struct sg_iovec *iov, int iov_count, unsigned int len)
-{
-       struct bio *bio;
-
-       if (!iov || iov_count <= 0)
-               return -EINVAL;
-
-       /* we don't allow misaligned data like bio_map_user() does.  If the
-        * user is using sg, they're expected to know the alignment constraints
-        * and respect them accordingly */
-       bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
-       if (IS_ERR(bio))
-               return PTR_ERR(bio);
-
-       if (bio->bi_size != len) {
-               bio_endio(bio, 0);
-               bio_unmap_user(bio);
-               return -EINVAL;
-       }
-
-       bio_get(bio);
-       blk_rq_bio_prep(q, rq, bio);
-       rq->buffer = rq->data = NULL;
-       return 0;
-}
-
-EXPORT_SYMBOL(blk_rq_map_user_iov);
-
-/**
- * blk_rq_unmap_user - unmap a request with user data
- * @bio:              start of bio list
- *
- * Description:
- *    Unmap a rq previously mapped by blk_rq_map_user(). The caller must
- *    supply the original rq->bio from the blk_rq_map_user() return, since
- *    the io completion may have changed rq->bio.
- */
-int blk_rq_unmap_user(struct bio *bio)
-{
-       struct bio *mapped_bio;
-       int ret = 0, ret2;
-
-       while (bio) {
-               mapped_bio = bio;
-               if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
-                       mapped_bio = bio->bi_private;
-
-               ret2 = __blk_rq_unmap_user(mapped_bio);
-               if (ret2 && !ret)
-                       ret = ret2;
-
-               mapped_bio = bio;
-               bio = bio->bi_next;
-               bio_put(mapped_bio);
-       }
-
-       return ret;
-}
-
-EXPORT_SYMBOL(blk_rq_unmap_user);
-
-/**
- * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
- * @q:         request queue where request should be inserted
- * @rq:                request to fill
- * @kbuf:      the kernel buffer
- * @len:       length of user data
- * @gfp_mask:  memory allocation flags
- */
-int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
-                   unsigned int len, gfp_t gfp_mask)
-{
-       struct bio *bio;
-
-       if (len > (q->max_hw_sectors << 9))
-               return -EINVAL;
-       if (!len || !kbuf)
-               return -EINVAL;
-
-       bio = bio_map_kern(q, kbuf, len, gfp_mask);
-       if (IS_ERR(bio))
-               return PTR_ERR(bio);
-
-       if (rq_data_dir(rq) == WRITE)
-               bio->bi_rw |= (1 << BIO_RW);
-
-       blk_rq_bio_prep(q, rq, bio);
-       blk_queue_bounce(q, &rq->bio);
-       rq->buffer = rq->data = NULL;
-       return 0;
-}
-
-EXPORT_SYMBOL(blk_rq_map_kern);
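A hedged sketch of the kernel-buffer variant; buffer and buf_len stand in for driver-owned data, and there is nothing to unmap afterwards:

    static int example_map_kern(struct request_queue *q, struct request *rq,
                                void *buffer, unsigned int buf_len)
    {
            int ret;

            /* Bounces internally if buffer doesn't meet the queue's DMA
             * alignment; fails if buf_len exceeds q->max_hw_sectors. */
            ret = blk_rq_map_kern(q, rq, buffer, buf_len, GFP_KERNEL);
            if (ret)
                    return ret;

            /* rq->bio now carries the mapped (or bounced) buffer */
            return 0;
    }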
-
-/**
- * blk_execute_rq_nowait - insert a request into queue for execution
- * @q:         queue to insert the request in
- * @bd_disk:   matching gendisk
- * @rq:                request to insert
- * @at_head:    insert request at head or tail of queue
- * @done:      I/O completion handler
- *
- * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
- *    for execution.  Don't wait for completion.
- */
-void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
-                          struct request *rq, int at_head,
-                          rq_end_io_fn *done)
-{
-       int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-
-       rq->rq_disk = bd_disk;
-       rq->cmd_flags |= REQ_NOMERGE;
-       rq->end_io = done;
-       WARN_ON(irqs_disabled());
-       spin_lock_irq(q->queue_lock);
-       __elv_add_request(q, rq, where, 1);
-       __generic_unplug_device(q);
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
-
-/**
- * blk_execute_rq - insert a request into queue for execution
- * @q:         queue to insert the request in
- * @bd_disk:   matching gendisk
- * @rq:                request to insert
- * @at_head:    insert request at head or tail of queue
- *
- * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
- *    for execution and wait for completion.
- */
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
-                  struct request *rq, int at_head)
-{
-       DECLARE_COMPLETION_ONSTACK(wait);
-       char sense[SCSI_SENSE_BUFFERSIZE];
-       int err = 0;
-
-       /*
-        * we need an extra reference to the request, so we can look at
-        * it after io completion
-        */
-       rq->ref_count++;
-
-       if (!rq->sense) {
-               memset(sense, 0, sizeof(sense));
-               rq->sense = sense;
-               rq->sense_len = 0;
-       }
-
-       rq->end_io_data = &wait;
-       blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
-       wait_for_completion(&wait);
-
-       if (rq->errors)
-               err = -EIO;
-
-       return err;
-}
-
-EXPORT_SYMBOL(blk_execute_rq);
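Putting the pieces together, a hedged sketch of a synchronous passthrough roughly in the style of scsi_execute(); the CDB bytes, timeout and retry count are illustrative only:

    static int example_sync_passthrough(struct request_queue *q,
                                        struct gendisk *disk)
    {
            struct request *rq = blk_get_request(q, READ, GFP_KERNEL);
            int err;

            rq->cmd_type = REQ_TYPE_BLOCK_PC;
            memset(rq->cmd, 0, sizeof(rq->cmd));
            rq->cmd[0] = 0x00;              /* e.g. TEST UNIT READY */
            rq->cmd_len = 6;
            rq->timeout = 30 * HZ;
            rq->retries = 3;

            err = blk_execute_rq(q, disk, rq, 0);   /* sleeps until completion */
            blk_put_request(rq);
            return err;                             /* 0 on success, -EIO on error */
    }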
-
-static void bio_end_empty_barrier(struct bio *bio, int err)
-{
-       if (err)
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-
-       complete(bio->bi_private);
-}
-
-/**
- * blkdev_issue_flush - queue a flush
- * @bdev:      blockdev to issue flush for
- * @error_sector:      error sector
- *
- * Description:
- *    Issue a flush for the block device in question. Caller can supply
- *    room for storing the error offset in case of a flush error, if they
- *    wish to.  Caller must run wait_for_completion() on its own.
- */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
-{
-       DECLARE_COMPLETION_ONSTACK(wait);
-       struct request_queue *q;
-       struct bio *bio;
-       int ret;
-
-       if (bdev->bd_disk == NULL)
-               return -ENXIO;
-
-       q = bdev_get_queue(bdev);
-       if (!q)
-               return -ENXIO;
-
-       bio = bio_alloc(GFP_KERNEL, 0);
-       if (!bio)
-               return -ENOMEM;
-
-       bio->bi_end_io = bio_end_empty_barrier;
-       bio->bi_private = &wait;
-       bio->bi_bdev = bdev;
-       submit_bio(1 << BIO_RW_BARRIER, bio);
-
-       wait_for_completion(&wait);
-
-       /*
-        * The driver must store the error location in ->bi_sector, if
-        * it supports it. For non-stacked drivers, this should be copied
-        * from rq->sector.
-        */
-       if (error_sector)
-               *error_sector = bio->bi_sector;
-
-       ret = 0;
-       if (!bio_flagged(bio, BIO_UPTODATE))
-               ret = -EIO;
-
-       bio_put(bio);
-       return ret;
-}
-
-EXPORT_SYMBOL(blkdev_issue_flush);
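A hedged sketch of a caller, for example a filesystem flushing the device's volatile cache on fsync; bdev is assumed to be an already opened block device:

    static int example_flush(struct block_device *bdev)
    {
            sector_t error_sector = 0;
            int ret;

            ret = blkdev_issue_flush(bdev, &error_sector);
            if (ret)
                    /* error_sector is only meaningful if the driver filled it in */
                    printk(KERN_WARNING "cache flush failed (err %d, sector %llu)\n",
                           ret, (unsigned long long)error_sector);
            return ret;
    }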
-
-static void drive_stat_acct(struct request *rq, int new_io)
-{
-       int rw = rq_data_dir(rq);
-
-       if (!blk_fs_request(rq) || !rq->rq_disk)
-               return;
-
-       if (!new_io) {
-               __disk_stat_inc(rq->rq_disk, merges[rw]);
-       } else {
-               disk_round_stats(rq->rq_disk);
-               rq->rq_disk->in_flight++;
-       }
-}
-
-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue * q, struct request * req)
-{
-       drive_stat_acct(req, 1);
-
-       /*
-        * elevator indicated where it wants this request to be
-        * inserted at elevator_merge time
-        */
-       __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-/*
- * disk_round_stats()  - Round off the performance stats on a struct
- * disk_stats.
- *
- * The average IO queue length and utilisation statistics are maintained
- * by observing the current state of the queue length and the amount of
- * time it has been in this state for.
- *
- * Normally, that accounting is done on IO completion, but that can result
- * in more than a second's worth of IO being accounted for within any one
- * second, leading to >100% utilisation.  To deal with that, we call this
- * function to do a round-off before returning the results when reading
- * /proc/diskstats.  This accounts immediately for all queue usage up to
- * the current jiffies and restarts the counters again.
- */
-void disk_round_stats(struct gendisk *disk)
-{
-       unsigned long now = jiffies;
-
-       if (now == disk->stamp)
-               return;
-
-       if (disk->in_flight) {
-               __disk_stat_add(disk, time_in_queue,
-                               disk->in_flight * (now - disk->stamp));
-               __disk_stat_add(disk, io_ticks, (now - disk->stamp));
-       }
-       disk->stamp = now;
-}
-
-EXPORT_SYMBOL_GPL(disk_round_stats);
-
-/*
- * queue lock must be held
- */
-void __blk_put_request(struct request_queue *q, struct request *req)
-{
-       if (unlikely(!q))
-               return;
-       if (unlikely(--req->ref_count))
-               return;
-
-       elv_completed_request(q, req);
-
-       /*
-        * Request may not have originated from ll_rw_blk. If not,
-        * it didn't come out of our reserved rq pools
-        */
-       if (req->cmd_flags & REQ_ALLOCED) {
-               int rw = rq_data_dir(req);
-               int priv = req->cmd_flags & REQ_ELVPRIV;
-
-               BUG_ON(!list_empty(&req->queuelist));
-               BUG_ON(!hlist_unhashed(&req->hash));
-
-               blk_free_request(q, req);
-               freed_request(q, rw, priv);
-       }
-}
-
-EXPORT_SYMBOL_GPL(__blk_put_request);
-
-void blk_put_request(struct request *req)
-{
-       unsigned long flags;
-       struct request_queue *q = req->q;
-
-       /*
-        * Gee, IDE calls in w/ NULL q.  Fix IDE and remove the
-        * following if (q) test.
-        */
-       if (q) {
-               spin_lock_irqsave(q->queue_lock, flags);
-               __blk_put_request(q, req);
-               spin_unlock_irqrestore(q->queue_lock, flags);
-       }
-}
-
-EXPORT_SYMBOL(blk_put_request);
-
-/**
- * blk_end_sync_rq - executes a completion event on a request
- * @rq: request to complete
- * @error: end io status of the request
- */
-void blk_end_sync_rq(struct request *rq, int error)
-{
-       struct completion *waiting = rq->end_io_data;
-
-       rq->end_io_data = NULL;
-       __blk_put_request(rq->q, rq);
-
-       /*
-        * complete last, if this is a stack request the process (and thus
-        * the rq pointer) could be invalid right after this complete()
-        */
-       complete(waiting);
-}
-EXPORT_SYMBOL(blk_end_sync_rq);
-
-/*
- * Has to be called with the request spinlock acquired
- */
-static int attempt_merge(struct request_queue *q, struct request *req,
-                         struct request *next)
-{
-       if (!rq_mergeable(req) || !rq_mergeable(next))
-               return 0;
-
-       /*
-        * not contiguous
-        */
-       if (req->sector + req->nr_sectors != next->sector)
-               return 0;
-
-       if (rq_data_dir(req) != rq_data_dir(next)
-           || req->rq_disk != next->rq_disk
-           || next->special)
-               return 0;
-
-       /*
-        * If we are allowed to merge, then append bio list
-        * from next to rq and release next. merge_requests_fn
-        * will have updated segment counts, update sector
-        * counts here.
-        */
-       if (!ll_merge_requests_fn(q, req, next))
-               return 0;
-
-       /*
-        * At this point we have either done a back merge
-        * or front merge. We need the smaller start_time of
-        * the merged requests to be the current request
-        * for accounting purposes.
-        */
-       if (time_after(req->start_time, next->start_time))
-               req->start_time = next->start_time;
-
-       req->biotail->bi_next = next->bio;
-       req->biotail = next->biotail;
-
-       req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
-
-       elv_merge_requests(q, req, next);
-
-       if (req->rq_disk) {
-               disk_round_stats(req->rq_disk);
-               req->rq_disk->in_flight--;
-       }
-
-       req->ioprio = ioprio_best(req->ioprio, next->ioprio);
-
-       __blk_put_request(q, next);
-       return 1;
-}
-
-static inline int attempt_back_merge(struct request_queue *q,
-                                    struct request *rq)
-{
-       struct request *next = elv_latter_request(q, rq);
-
-       if (next)
-               return attempt_merge(q, rq, next);
-
-       return 0;
-}
-
-static inline int attempt_front_merge(struct request_queue *q,
-                                     struct request *rq)
-{
-       struct request *prev = elv_former_request(q, rq);
-
-       if (prev)
-               return attempt_merge(q, prev, rq);
-
-       return 0;
-}
-
-static void init_request_from_bio(struct request *req, struct bio *bio)
-{
-       req->cmd_type = REQ_TYPE_FS;
-
-       /*
-        * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
-        */
-       if (bio_rw_ahead(bio) || bio_failfast(bio))
-               req->cmd_flags |= REQ_FAILFAST;
-
-       /*
-        * REQ_BARRIER implies no merging, but let's make it explicit
-        */
-       if (unlikely(bio_barrier(bio)))
-               req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
-
-       if (bio_sync(bio))
-               req->cmd_flags |= REQ_RW_SYNC;
-       if (bio_rw_meta(bio))
-               req->cmd_flags |= REQ_RW_META;
-
-       req->errors = 0;
-       req->hard_sector = req->sector = bio->bi_sector;
-       req->ioprio = bio_prio(bio);
-       req->start_time = jiffies;
-       blk_rq_bio_prep(req->q, req, bio);
-}
-
-static int __make_request(struct request_queue *q, struct bio *bio)
-{
-       struct request *req;
-       int el_ret, nr_sectors, barrier, err;
-       const unsigned short prio = bio_prio(bio);
-       const int sync = bio_sync(bio);
-       int rw_flags;
-
-       nr_sectors = bio_sectors(bio);
-
-       /*
-        * low level driver can indicate that it wants pages above a
-        * certain limit bounced to low memory (ie for highmem, or even
-        * ISA dma in theory)
-        */
-       blk_queue_bounce(q, &bio);
-
-       barrier = bio_barrier(bio);
-       if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
-               err = -EOPNOTSUPP;
-               goto end_io;
-       }
-
-       spin_lock_irq(q->queue_lock);
-
-       if (unlikely(barrier) || elv_queue_empty(q))
-               goto get_rq;
-
-       el_ret = elv_merge(q, &req, bio);
-       switch (el_ret) {
-               case ELEVATOR_BACK_MERGE:
-                       BUG_ON(!rq_mergeable(req));
-
-                       if (!ll_back_merge_fn(q, req, bio))
-                               break;
-
-                       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
-
-                       req->biotail->bi_next = bio;
-                       req->biotail = bio;
-                       req->nr_sectors = req->hard_nr_sectors += nr_sectors;
-                       req->ioprio = ioprio_best(req->ioprio, prio);
-                       drive_stat_acct(req, 0);
-                       if (!attempt_back_merge(q, req))
-                               elv_merged_request(q, req, el_ret);
-                       goto out;
-
-               case ELEVATOR_FRONT_MERGE:
-                       BUG_ON(!rq_mergeable(req));
-
-                       if (!ll_front_merge_fn(q, req, bio))
-                               break;
-
-                       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
-
-                       bio->bi_next = req->bio;
-                       req->bio = bio;
-
-                       /*
-                        * may not be valid. If the low level driver said
-                        * it didn't need a bounce buffer then it better
-                        * not touch req->buffer either...
-                        */
-                       req->buffer = bio_data(bio);
-                       req->current_nr_sectors = bio_cur_sectors(bio);
-                       req->hard_cur_sectors = req->current_nr_sectors;
-                       req->sector = req->hard_sector = bio->bi_sector;
-                       req->nr_sectors = req->hard_nr_sectors += nr_sectors;
-                       req->ioprio = ioprio_best(req->ioprio, prio);
-                       drive_stat_acct(req, 0);
-                       if (!attempt_front_merge(q, req))
-                               elv_merged_request(q, req, el_ret);
-                       goto out;
-
-               /* ELV_NO_MERGE: elevator says don't/can't merge. */
-               default:
-                       ;
-       }
-
-get_rq:
-       /*
-        * This sync check and mask will be re-done in init_request_from_bio(),
-        * but we need to set it earlier to expose the sync flag to the
-        * rq allocator and io schedulers.
-        */
-       rw_flags = bio_data_dir(bio);
-       if (sync)
-               rw_flags |= REQ_RW_SYNC;
-
-       /*
-        * Grab a free request. This might sleep but cannot fail.
-        * Returns with the queue unlocked.
-        */
-       req = get_request_wait(q, rw_flags, bio);
-
-       /*
-        * After dropping the lock and possibly sleeping here, our request
-        * may now be mergeable after it had proven unmergeable (above).
-        * We don't worry about that case for efficiency. It won't happen
-        * often, and the elevators are able to handle it.
-        */
-       init_request_from_bio(req, bio);
-
-       spin_lock_irq(q->queue_lock);
-       if (elv_queue_empty(q))
-               blk_plug_device(q);
-       add_request(q, req);
-out:
-       if (sync)
-               __generic_unplug_device(q);
-
-       spin_unlock_irq(q->queue_lock);
-       return 0;
-
-end_io:
-       bio_endio(bio, err);
-       return 0;
-}
-
-/*
- * If bio->bi_bdev is a partition, remap the location
- */
-static inline void blk_partition_remap(struct bio *bio)
-{
-       struct block_device *bdev = bio->bi_bdev;
-
-       if (bio_sectors(bio) && bdev != bdev->bd_contains) {
-               struct hd_struct *p = bdev->bd_part;
-               const int rw = bio_data_dir(bio);
-
-               p->sectors[rw] += bio_sectors(bio);
-               p->ios[rw]++;
-
-               bio->bi_sector += p->start_sect;
-               bio->bi_bdev = bdev->bd_contains;
-
-               blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev, bio->bi_sector,
-                                   bio->bi_sector - p->start_sect);
-       }
-}
-
-static void handle_bad_sector(struct bio *bio)
-{
-       char b[BDEVNAME_SIZE];
-
-       printk(KERN_INFO "attempt to access beyond end of device\n");
-       printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
-                       bdevname(bio->bi_bdev, b),
-                       bio->bi_rw,
-                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
-                       (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
-
-       set_bit(BIO_EOF, &bio->bi_flags);
-}
-
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-
-static DECLARE_FAULT_ATTR(fail_make_request);
-
-static int __init setup_fail_make_request(char *str)
-{
-       return setup_fault_attr(&fail_make_request, str);
-}
-__setup("fail_make_request=", setup_fail_make_request);
-
-static int should_fail_request(struct bio *bio)
-{
-       if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
-           (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
-               return should_fail(&fail_make_request, bio->bi_size);
-
-       return 0;
-}
-
-static int __init fail_make_request_debugfs(void)
-{
-       return init_fault_attr_dentries(&fail_make_request,
-                                       "fail_make_request");
-}
-
-late_initcall(fail_make_request_debugfs);
-
-#else /* CONFIG_FAIL_MAKE_REQUEST */
-
-static inline int should_fail_request(struct bio *bio)
-{
-       return 0;
-}
-
-#endif /* CONFIG_FAIL_MAKE_REQUEST */
-
-/*
- * Check whether this bio extends beyond the end of the device.
- */
-static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
-{
-       sector_t maxsector;
-
-       if (!nr_sectors)
-               return 0;
-
-       /* Test device or partition size, when known. */
-       maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
-       if (maxsector) {
-               sector_t sector = bio->bi_sector;
-
-               if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
-                       /*
-                        * This may well happen - the kernel calls bread()
-                        * without checking the size of the device, e.g., when
-                        * mounting a device.
-                        */
-                       handle_bad_sector(bio);
-                       return 1;
-               }
-       }
-
-       return 0;
-}
-
-/**
- * generic_make_request: hand a buffer to its device driver for I/O
- * @bio:  The bio describing the location in memory and on the device.
- *
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status.  The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) elsewhere.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_bdev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may change bi_bdev and
- * bi_sector for remaps as it sees fit.  So the values of these fields
- * should NOT be depended on after the call to generic_make_request.
- */
-static inline void __generic_make_request(struct bio *bio)
-{
-       struct request_queue *q;
-       sector_t old_sector;
-       int ret, nr_sectors = bio_sectors(bio);
-       dev_t old_dev;
-       int err = -EIO;
-
-       might_sleep();
-
-       if (bio_check_eod(bio, nr_sectors))
-               goto end_io;
-
-       /*
-        * Resolve the mapping until finished. (drivers are
-        * still free to implement/resolve their own stacking
-        * by explicitly returning 0)
-        *
-        * NOTE: we don't repeat the blk_size check for each new device.
-        * Stacking drivers are expected to know what they are doing.
-        */
-       old_sector = -1;
-       old_dev = 0;
-       do {
-               char b[BDEVNAME_SIZE];
-
-               q = bdev_get_queue(bio->bi_bdev);
-               if (!q) {
-                       printk(KERN_ERR
-                              "generic_make_request: Trying to access "
-                               "nonexistent block-device %s (%Lu)\n",
-                               bdevname(bio->bi_bdev, b),
-                               (long long) bio->bi_sector);
-end_io:
-                       bio_endio(bio, err);
-                       break;
-               }
-
-               if (unlikely(nr_sectors > q->max_hw_sectors)) {
-                       printk("bio too big device %s (%u > %u)\n", 
-                               bdevname(bio->bi_bdev, b),
-                               bio_sectors(bio),
-                               q->max_hw_sectors);
-                       goto end_io;
-               }
-
-               if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-                       goto end_io;
-
-               if (should_fail_request(bio))
-                       goto end_io;
-
-               /*
-                * If this device has partitions, remap block n
-                * of partition p to block n+start(p) of the disk.
-                */
-               blk_partition_remap(bio);
-
-               if (old_sector != -1)
-                       blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
-                                           old_sector);
-
-               blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
-
-               old_sector = bio->bi_sector;
-               old_dev = bio->bi_bdev->bd_dev;
-
-               if (bio_check_eod(bio, nr_sectors))
-                       goto end_io;
-               if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
-                       err = -EOPNOTSUPP;
-                       goto end_io;
-               }
-
-               ret = q->make_request_fn(q, bio);
-       } while (ret);
-}
-
-/*
- * We only want one ->make_request_fn to be active at a time,
- * else stack usage with stacked devices could be a problem.
- * So use current->bio_{list,tail} to keep a list of requests
- * submitted by a make_request_fn function.
- * current->bio_tail is also used as a flag to say if
- * generic_make_request is currently active in this task or not.
- * If it is NULL, then no make_request is active.  If it is non-NULL,
- * then a make_request is active, and new requests should be added
- * at the tail
- */
-void generic_make_request(struct bio *bio)
-{
-       if (current->bio_tail) {
-               /* make_request is active */
-               *(current->bio_tail) = bio;
-               bio->bi_next = NULL;
-               current->bio_tail = &bio->bi_next;
-               return;
-       }
-       /* following loop may be a bit non-obvious, and so deserves some
-        * explanation.
-        * Before entering the loop, bio->bi_next is NULL (as all callers
-        * ensure that) so we have a list with a single bio.
-        * We pretend that we have just taken it off a longer list, so
-        * we assign bio_list to the next (which is NULL) and bio_tail
-        * to &bio_list, thus initialising the bio_list of new bios to be
-        * added.  __generic_make_request may indeed add some more bios
-        * through a recursive call to generic_make_request.  If it
-        * did, we find a non-NULL value in bio_list and re-enter the loop
-        * from the top.  In this case we really did just take the bio
-        * off the top of the list (no pretending) and so fix up bio_list and
-        * bio_tail or bi_next, and call into __generic_make_request again.
-        *
-        * The loop was structured like this to make only one call to
-        * __generic_make_request (which is important as it is large and
-        * inlined) and to keep the structure simple.
-        */
-       BUG_ON(bio->bi_next);
-       do {
-               current->bio_list = bio->bi_next;
-               if (bio->bi_next == NULL)
-                       current->bio_tail = &current->bio_list;
-               else
-                       bio->bi_next = NULL;
-               __generic_make_request(bio);
-               bio = current->bio_list;
-       } while (bio);
-       current->bio_tail = NULL; /* deactivate */
-}
-
-EXPORT_SYMBOL(generic_make_request);
-
-/**
- * submit_bio: submit a bio to the block device layer for I/O
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
- * @bio: The &struct bio which describes the I/O
- *
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be set up and ready for I/O.
- *
- */
-void submit_bio(int rw, struct bio *bio)
-{
-       int count = bio_sectors(bio);
-
-       bio->bi_rw |= rw;
-
-       /*
-        * If it's a regular read/write or a barrier with data attached,
-        * go through the normal accounting stuff before submission.
-        */
-       if (!bio_empty_barrier(bio)) {
-
-               BIO_BUG_ON(!bio->bi_size);
-               BIO_BUG_ON(!bio->bi_io_vec);
-
-               if (rw & WRITE) {
-                       count_vm_events(PGPGOUT, count);
-               } else {
-                       task_io_account_read(bio->bi_size);
-                       count_vm_events(PGPGIN, count);
-               }
-
-               if (unlikely(block_dump)) {
-                       char b[BDEVNAME_SIZE];
-                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
-                       current->comm, task_pid_nr(current),
-                               (rw & WRITE) ? "WRITE" : "READ",
-                               (unsigned long long)bio->bi_sector,
-                               bdevname(bio->bi_bdev,b));
-               }
-       }
-
-       generic_make_request(bio);
-}
-
-EXPORT_SYMBOL(submit_bio);
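A hedged sketch of the raw bio path described above: read one page from sector 0 of a device. my_end_io and my_ctx are hypothetical, and real callers also handle bio_add_page() refusing the page:

    static void example_read_page(struct block_device *bdev, struct page *page,
                                  bio_end_io_t *my_end_io, void *my_ctx)
    {
            struct bio *bio = bio_alloc(GFP_KERNEL, 1);     /* room for 1 bvec */

            bio->bi_bdev = bdev;
            bio->bi_sector = 0;
            bio->bi_end_io = my_end_io;     /* may run from interrupt context */
            bio->bi_private = my_ctx;

            if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
                    bio_put(bio);           /* queue refused the page */
                    return;
            }

            submit_bio(READ, bio);          /* completion reported via my_end_io */
    }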
-
-static void blk_recalc_rq_sectors(struct request *rq, int nsect)
-{
-       if (blk_fs_request(rq)) {
-               rq->hard_sector += nsect;
-               rq->hard_nr_sectors -= nsect;
-
-               /*
-                * Move the I/O submission pointers ahead if required.
-                */
-               if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
-                   (rq->sector <= rq->hard_sector)) {
-                       rq->sector = rq->hard_sector;
-                       rq->nr_sectors = rq->hard_nr_sectors;
-                       rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
-                       rq->current_nr_sectors = rq->hard_cur_sectors;
-                       rq->buffer = bio_data(rq->bio);
-               }
-
-               /*
-                * if total number of sectors is less than the first segment
-                * size, something has gone terribly wrong
-                */
-               if (rq->nr_sectors < rq->current_nr_sectors) {
-                       printk("blk: request botched\n");
-                       rq->nr_sectors = rq->current_nr_sectors;
-               }
-       }
-}
-
-/**
- * __end_that_request_first - end I/O on a request
- * @req:      the request being processed
- * @error:    0 for success, < 0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @req, and sets it up
- *     for the next range of segments (if any) in the cluster.
- *
- * Return:
- *     0 - we are done with this request, call end_that_request_last()
- *     1 - still buffers pending for this request
- **/
-static int __end_that_request_first(struct request *req, int error,
-                                   int nr_bytes)
-{
-       int total_bytes, bio_nbytes, next_idx = 0;
-       struct bio *bio;
-
-       blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
-
-       /*
-        * for a REQ_BLOCK_PC request, we want to carry any eventual
-        * sense key with us all the way through
-        */
-       if (!blk_pc_request(req))
-               req->errors = 0;
-
-       if (error) {
-               if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
-                       printk("end_request: I/O error, dev %s, sector %llu\n",
-                               req->rq_disk ? req->rq_disk->disk_name : "?",
-                               (unsigned long long)req->sector);
-       }
-
-       if (blk_fs_request(req) && req->rq_disk) {
-               const int rw = rq_data_dir(req);
-
-               disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
-       }
-
-       total_bytes = bio_nbytes = 0;
-       while ((bio = req->bio) != NULL) {
-               int nbytes;
-
-               /*
-                * For an empty barrier request, the low level driver must
-                * store a potential error location in ->sector. We pass
-                * that back up in ->bi_sector.
-                */
-               if (blk_empty_barrier(req))
-                       bio->bi_sector = req->sector;
-
-               if (nr_bytes >= bio->bi_size) {
-                       req->bio = bio->bi_next;
-                       nbytes = bio->bi_size;
-                       req_bio_endio(req, bio, nbytes, error);
-                       next_idx = 0;
-                       bio_nbytes = 0;
-               } else {
-                       int idx = bio->bi_idx + next_idx;
-
-                       if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
-                               blk_dump_rq_flags(req, "__end_that");
-                               printk("%s: bio idx %d >= vcnt %d\n",
-                                               __FUNCTION__,
-                                               bio->bi_idx, bio->bi_vcnt);
-                               break;
-                       }
-
-                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
-                       BIO_BUG_ON(nbytes > bio->bi_size);
-
-                       /*
-                        * not a complete bvec done
-                        */
-                       if (unlikely(nbytes > nr_bytes)) {
-                               bio_nbytes += nr_bytes;
-                               total_bytes += nr_bytes;
-                               break;
-                       }
-
-                       /*
-                        * advance to the next vector
-                        */
-                       next_idx++;
-                       bio_nbytes += nbytes;
-               }
-
-               total_bytes += nbytes;
-               nr_bytes -= nbytes;
-
-               if ((bio = req->bio)) {
-                       /*
-                        * end more in this run, or just return 'not-done'
-                        */
-                       if (unlikely(nr_bytes <= 0))
-                               break;
-               }
-       }
-
-       /*
-        * completely done
-        */
-       if (!req->bio)
-               return 0;
-
-       /*
-        * if the request wasn't completed, update state
-        */
-       if (bio_nbytes) {
-               req_bio_endio(req, bio, bio_nbytes, error);
-               bio->bi_idx += next_idx;
-               bio_iovec(bio)->bv_offset += nr_bytes;
-               bio_iovec(bio)->bv_len -= nr_bytes;
-       }
-
-       blk_recalc_rq_sectors(req, total_bytes >> 9);
-       blk_recalc_rq_segments(req);
-       return 1;
-}
-
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-       struct list_head *cpu_list, local_list;
-
-       local_irq_disable();
-       cpu_list = &__get_cpu_var(blk_cpu_done);
-       list_replace_init(cpu_list, &local_list);
-       local_irq_enable();
-
-       while (!list_empty(&local_list)) {
-               struct request *rq = list_entry(local_list.next, struct request, donelist);
-
-               list_del_init(&rq->donelist);
-               rq->q->softirq_done_fn(rq);
-       }
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
-                         void *hcpu)
-{
-       /*
-        * If a CPU goes away, splice its entries to the current CPU
-        * and trigger a run of the softirq
-        */
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               int cpu = (unsigned long) hcpu;
-
-               local_irq_disable();
-               list_splice_init(&per_cpu(blk_cpu_done, cpu),
-                                &__get_cpu_var(blk_cpu_done));
-               raise_softirq_irqoff(BLOCK_SOFTIRQ);
-               local_irq_enable();
-       }
-
-       return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
-       .notifier_call  = blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
-       struct list_head *cpu_list;
-       unsigned long flags;
-
-       BUG_ON(!req->q->softirq_done_fn);
-
-       local_irq_save(flags);
-
-       cpu_list = &__get_cpu_var(blk_cpu_done);
-       list_add_tail(&req->donelist, cpu_list);
-       raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
-       local_irq_restore(flags);
-}
-
-EXPORT_SYMBOL(blk_complete_request);
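blk_complete_request() only works together with a softirq-done callback registered via blk_queue_softirq_done() at queue setup time. A hedged sketch of the two halves, with my_softirq_done and the interrupt-side request lookup left hypothetical:

    /* Registered once when the queue is created:
     *     blk_queue_softirq_done(q, my_softirq_done);
     */
    static void my_softirq_done(struct request *rq)
    {
            /* runs from BLOCK_SOFTIRQ, not in the hard interrupt handler */
            blk_end_request(rq, rq->errors ? -EIO : 0, blk_rq_bytes(rq));
    }

    /* In the hard interrupt handler, just hand the request off: */
    static void example_irq_complete(struct request *rq)
    {
            blk_complete_request(rq);       /* my_softirq_done(rq) runs later */
    }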
-       
-/*
- * queue lock must be held
- */
-static void end_that_request_last(struct request *req, int error)
-{
-       struct gendisk *disk = req->rq_disk;
-
-       if (blk_rq_tagged(req))
-               blk_queue_end_tag(req->q, req);
-
-       if (blk_queued_rq(req))
-               blkdev_dequeue_request(req);
-
-       if (unlikely(laptop_mode) && blk_fs_request(req))
-               laptop_io_completion();
-
-       /*
-        * Account IO completion.  bar_rq isn't accounted as a normal
-        * IO on queueing or completion.  Accounting the containing
-        * request is enough.
-        */
-       if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
-               unsigned long duration = jiffies - req->start_time;
-               const int rw = rq_data_dir(req);
-
-               __disk_stat_inc(disk, ios[rw]);
-               __disk_stat_add(disk, ticks[rw], duration);
-               disk_round_stats(disk);
-               disk->in_flight--;
-       }
-
-       if (req->end_io)
-               req->end_io(req, error);
-       else {
-               if (blk_bidi_rq(req))
-                       __blk_put_request(req->next_rq->q, req->next_rq);
-
-               __blk_put_request(req->q, req);
-       }
-}
-
-static inline void __end_request(struct request *rq, int uptodate,
-                                unsigned int nr_bytes)
-{
-       int error = 0;
-
-       if (uptodate <= 0)
-               error = uptodate ? uptodate : -EIO;
-
-       __blk_end_request(rq, error, nr_bytes);
-}
-
-/**
- * blk_rq_bytes - Returns bytes left to complete in the entire request
- **/
-unsigned int blk_rq_bytes(struct request *rq)
-{
-       if (blk_fs_request(rq))
-               return rq->hard_nr_sectors << 9;
-
-       return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_bytes);
-
-/**
- * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
- **/
-unsigned int blk_rq_cur_bytes(struct request *rq)
-{
-       if (blk_fs_request(rq))
-               return rq->current_nr_sectors << 9;
-
-       if (rq->bio)
-               return rq->bio->bi_size;
-
-       return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
-
-/**
- * end_queued_request - end all I/O on a queued request
- * @rq:                the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
- *
- * Description:
- *     Ends all I/O on a request, and removes it from the block layer queues.
- *     Not suitable for normal IO completion, unless the driver still has
- *     the request attached to the block layer.
- *
- **/
-void end_queued_request(struct request *rq, int uptodate)
-{
-       __end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_queued_request);
-
-/**
- * end_dequeued_request - end all I/O on a dequeued request
- * @rq:                the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
- *
- * Description:
- *     Ends all I/O on a request. The request must already have been
- *     dequeued using blkdev_dequeue_request(), as is normally the case
- *     for most drivers.
- *
- **/
-void end_dequeued_request(struct request *rq, int uptodate)
-{
-       __end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_dequeued_request);
-
-
-/**
- * end_request - end I/O on the current segment of the request
- * @req:       the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
- *
- * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled IO completions.
- *     Modern drivers typically end IO on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Either use end_request_completely(), or the
- *     end_that_request_chunk() (along with end_that_request_last()) for
- *     partial completions.
- *
- **/
-void end_request(struct request *req, int uptodate)
-{
-       __end_request(req, uptodate, req->hard_cur_sectors << 9);
-}
-EXPORT_SYMBOL(end_request);
-
-/**
- * blk_end_io - Generic end_io function to complete a request.
- * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
- * @nr_bytes:     number of bytes to complete @rq
- * @bidi_bytes:   number of bytes to complete @rq->next_rq
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
- *                completion of the request.
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet, it still has pending buffers.
- **/
-static int blk_end_io(struct request *rq, int error, int nr_bytes,
-                     int bidi_bytes, int (drv_callback)(struct request *))
-{
-       struct request_queue *q = rq->q;
-       unsigned long flags = 0UL;
-
-       if (blk_fs_request(rq) || blk_pc_request(rq)) {
-               if (__end_that_request_first(rq, error, nr_bytes))
-                       return 1;
-
-               /* Bidi request must be completed as a whole */
-               if (blk_bidi_rq(rq) &&
-                   __end_that_request_first(rq->next_rq, error, bidi_bytes))
-                       return 1;
-       }
-
-       /* Special feature for tricky drivers */
-       if (drv_callback && drv_callback(rq))
-               return 1;
-
-       add_disk_randomness(rq->rq_disk);
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       end_that_request_last(rq, error);
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       return 0;
-}
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
- **/
-int blk_end_request(struct request *rq, int error, int nr_bytes)
-{
-       return blk_end_io(rq, error, nr_bytes, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(blk_end_request);
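The 0/1 return contract above is what drivers key off when completing requests piecemeal; a hedged sketch of a completion path finishing 'bytes' of the current request:

    static void example_complete_some(struct request *rq, int error,
                                      unsigned int bytes)
    {
            if (blk_end_request(rq, error, bytes)) {
                    /* 1: the request still has pending buffers; it stays
                     * owned by the driver and must be completed again later. */
                    return;
            }
            /* 0: the request was fully completed and released by the
             * block layer; rq must not be touched any more. */
    }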
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
- **/
-int __blk_end_request(struct request *rq, int error, int nr_bytes)
-{
-       if (blk_fs_request(rq) || blk_pc_request(rq)) {
-               if (__end_that_request_first(rq, error, nr_bytes))
-                       return 1;
-       }
-
-       add_disk_randomness(rq->rq_disk);
-
-       end_that_request_last(rq, error);
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(__blk_end_request);
-
-/**
- * blk_end_bidi_request - Helper function for drivers to complete bidi request.
- * @rq:         the bidi request being processed
- * @error:      0 for success, < 0 for error
- * @nr_bytes:   number of bytes to complete @rq
- * @bidi_bytes: number of bytes to complete @rq->next_rq
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *
- * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
- **/
-int blk_end_bidi_request(struct request *rq, int error, int nr_bytes,
-                        int bidi_bytes)
-{
-       return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
-}
-EXPORT_SYMBOL_GPL(blk_end_bidi_request);
-
-/**
- * blk_end_request_callback - Special helper function for tricky drivers
- * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
- * @nr_bytes:     number of bytes to complete
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
- *                completion of the request.
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- *     This special helper function is used only for existing tricky drivers.
- *     (e.g. cdrom_newpc_intr() of ide-cd)
- *     This interface will be removed when such drivers are rewritten.
- *     Don't use this interface in other places anymore.
- *
- * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet.
- *         this request still has pending buffers or
- *         the driver doesn't want to finish this request yet.
- **/
-int blk_end_request_callback(struct request *rq, int error, int nr_bytes,
-                            int (drv_callback)(struct request *))
-{
-       return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
-}
-EXPORT_SYMBOL_GPL(blk_end_request_callback);
-
-static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
-                           struct bio *bio)
-{
-       /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
-       rq->cmd_flags |= (bio->bi_rw & 3);
-
-       rq->nr_phys_segments = bio_phys_segments(q, bio);
-       rq->nr_hw_segments = bio_hw_segments(q, bio);
-       rq->current_nr_sectors = bio_cur_sectors(bio);
-       rq->hard_cur_sectors = rq->current_nr_sectors;
-       rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-       rq->buffer = bio_data(bio);
-       rq->data_len = bio->bi_size;
-
-       rq->bio = rq->biotail = bio;
-
-       if (bio->bi_bdev)
-               rq->rq_disk = bio->bi_bdev->bd_disk;
-}
-
-int kblockd_schedule_work(struct work_struct *work)
-{
-       return queue_work(kblockd_workqueue, work);
-}
-
-EXPORT_SYMBOL(kblockd_schedule_work);
-
-void kblockd_flush_work(struct work_struct *work)
-{
-       cancel_work_sync(work);
-}
-EXPORT_SYMBOL(kblockd_flush_work);
-
-int __init blk_dev_init(void)
-{
-       int i;
-
-       kblockd_workqueue = create_workqueue("kblockd");
-       if (!kblockd_workqueue)
-               panic("Failed to create kblockd\n");
-
-       request_cachep = kmem_cache_create("blkdev_requests",
-                       sizeof(struct request), 0, SLAB_PANIC, NULL);
-
-       requestq_cachep = kmem_cache_create("blkdev_queue",
-                       sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
-
-       iocontext_cachep = kmem_cache_create("blkdev_ioc",
-                       sizeof(struct io_context), 0, SLAB_PANIC, NULL);
-
-       for_each_possible_cpu(i)
-               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
-       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
-       register_hotcpu_notifier(&blk_cpu_notifier);
-
-       blk_max_low_pfn = max_low_pfn - 1;
-       blk_max_pfn = max_pfn - 1;
-
-       return 0;
-}
-
-static void cfq_dtor(struct io_context *ioc)
-{
-       struct cfq_io_context *cic[1];
-       int r;
-
-       /*
-        * We don't have a specific key to lookup with, so use the gang
-        * lookup to just retrieve the first item stored. The cfq exit
-        * function will iterate the full tree, so any member will do.
-        */
-       r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
-       if (r > 0)
-               cic[0]->dtor(ioc);
-}
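[Editor's illustration] The comment in cfq_dtor() leans on radix_tree_gang_lookup() returning at most one arbitrary member. Below is a stand-alone sketch of that idiom, generic rather than cfq-specific; the helper name is invented.

    #include <linux/radix-tree.h>

    /* Fetch any one item stored in @root, or NULL if the tree is empty. */
    static void *example_first_radix_item(struct radix_tree_root *root)
    {
            void *results[1];

            if (radix_tree_gang_lookup(root, results, 0, 1) > 0)
                    return results[0];
            return NULL;
    }
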
-
-/*
- * IO Context helper functions. put_io_context() returns 1 if there are no
- * more users of this io context, 0 otherwise.
- */
-int put_io_context(struct io_context *ioc)
-{
-       if (ioc == NULL)
-               return 1;
-
-       BUG_ON(atomic_read(&ioc->refcount) == 0);
-
-       if (atomic_dec_and_test(&ioc->refcount)) {
-               rcu_read_lock();
-               if (ioc->aic && ioc->aic->dtor)
-                       ioc->aic->dtor(ioc->aic);
-               rcu_read_unlock();
-               cfq_dtor(ioc);
-
-               kmem_cache_free(iocontext_cachep, ioc);
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL(put_io_context);
-
-static void cfq_exit(struct io_context *ioc)
-{
-       struct cfq_io_context *cic[1];
-       int r;
-
-       rcu_read_lock();
-       /*
-        * See comment for cfq_dtor()
-        */
-       r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1);
-       rcu_read_unlock();
-
-       if (r > 0)
-               cic[0]->exit(ioc);
-}
-
-/* Called by the exitting task */
-void exit_io_context(void)
-{
-       struct io_context *ioc;
-
-       task_lock(current);
-       ioc = current->io_context;
-       current->io_context = NULL;
-       task_unlock(current);
-
-       if (atomic_dec_and_test(&ioc->nr_tasks)) {
-               if (ioc->aic && ioc->aic->exit)
-                       ioc->aic->exit(ioc->aic);
-               cfq_exit(ioc);
-
-               put_io_context(ioc);
-       }
-}
-
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
-{
-       struct io_context *ret;
-
-       ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-       if (ret) {
-               atomic_set(&ret->refcount, 1);
-               atomic_set(&ret->nr_tasks, 1);
-               spin_lock_init(&ret->lock);
-               ret->ioprio_changed = 0;
-               ret->ioprio = 0;
-               ret->last_waited = jiffies; /* doesn't matter... */
-               ret->nr_batch_requests = 0; /* because this is 0 */
-               ret->aic = NULL;
-               INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
-               ret->ioc_data = NULL;
-       }
-
-       return ret;
-}
-
-/*
- * If the current task has no IO context then create one and initialise it.
- * Otherwise, return its existing IO context.
- *
- * This returned IO context doesn't have a specifically elevated refcount,
- * but since the current task itself holds a reference, the context can be
- * used in general code, so long as it stays within `current` context.
- */
-static struct io_context *current_io_context(gfp_t gfp_flags, int node)
-{
-       struct task_struct *tsk = current;
-       struct io_context *ret;
-
-       ret = tsk->io_context;
-       if (likely(ret))
-               return ret;
-
-       ret = alloc_io_context(gfp_flags, node);
-       if (ret) {
-               /* make sure set_task_ioprio() sees the settings above */
-               smp_wmb();
-               tsk->io_context = ret;
-       }
-
-       return ret;
-}
-
-/*
- * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
- *
- * This is always called in the context of the task which submitted the I/O.
- */
-struct io_context *get_io_context(gfp_t gfp_flags, int node)
-{
-       struct io_context *ret = NULL;
-
-       /*
-        * Check for unlikely race with exiting task. ioc ref count is
-        * zero when ioc is being detached.
-        */
-       do {
-               ret = current_io_context(gfp_flags, node);
-               if (unlikely(!ret))
-                       break;
-       } while (!atomic_inc_not_zero(&ret->refcount));
-
-       return ret;
-}
-EXPORT_SYMBOL(get_io_context);
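[Editor's illustration] To make the refcount contract of get_io_context()/put_io_context() above concrete, here is a small hypothetical caller; the node value and the fields it would inspect are illustrative only.

    #include <linux/blkdev.h>

    static void example_peek_current_ioc(void)
    {
            struct io_context *ioc;

            /* returns with ioc->refcount elevated, or NULL on allocation failure */
            ioc = get_io_context(GFP_KERNEL, -1);
            if (!ioc)
                    return;

            /* ... inspect ioc->ioprio or ioc->ioprio_changed here ... */

            /* drop the reference taken above; frees ioc if it was the last one */
            put_io_context(ioc);
    }
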
-
-void copy_io_context(struct io_context **pdst, struct io_context **psrc)
-{
-       struct io_context *src = *psrc;
-       struct io_context *dst = *pdst;
-
-       if (src) {
-               BUG_ON(atomic_read(&src->refcount) == 0);
-               atomic_inc(&src->refcount);
-               put_io_context(dst);
-               *pdst = src;
-       }
-}
-EXPORT_SYMBOL(copy_io_context);
-
-void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
-{
-       struct io_context *temp;
-       temp = *ioc1;
-       *ioc1 = *ioc2;
-       *ioc2 = temp;
-}
-EXPORT_SYMBOL(swap_io_context);
-
-/*
- * sysfs parts below
- */
-struct queue_sysfs_entry {
-       struct attribute attr;
-       ssize_t (*show)(struct request_queue *, char *);
-       ssize_t (*store)(struct request_queue *, const char *, size_t);
-};
-
-static ssize_t
-queue_var_show(unsigned int var, char *page)
-{
-       return sprintf(page, "%d\n", var);
-}
-
-static ssize_t
-queue_var_store(unsigned long *var, const char *page, size_t count)
-{
-       char *p = (char *) page;
-
-       *var = simple_strtoul(p, &p, 10);
-       return count;
-}
-
-static ssize_t queue_requests_show(struct request_queue *q, char *page)
-{
-       return queue_var_show(q->nr_requests, (page));
-}
-
-static ssize_t
-queue_requests_store(struct request_queue *q, const char *page, size_t count)
-{
-       struct request_list *rl = &q->rq;
-       unsigned long nr;
-       int ret = queue_var_store(&nr, page, count);
-       if (nr < BLKDEV_MIN_RQ)
-               nr = BLKDEV_MIN_RQ;
-
-       spin_lock_irq(q->queue_lock);
-       q->nr_requests = nr;
-       blk_queue_congestion_threshold(q);
-
-       if (rl->count[READ] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, READ);
-       else if (rl->count[READ] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, READ);
-
-       if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, WRITE);
-       else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, WRITE);
-
-       if (rl->count[READ] >= q->nr_requests) {
-               blk_set_queue_full(q, READ);
-       } else if (rl->count[READ]+1 <= q->nr_requests) {
-               blk_clear_queue_full(q, READ);
-               wake_up(&rl->wait[READ]);
-       }
-
-       if (rl->count[WRITE] >= q->nr_requests) {
-               blk_set_queue_full(q, WRITE);
-       } else if (rl->count[WRITE]+1 <= q->nr_requests) {
-               blk_clear_queue_full(q, WRITE);
-               wake_up(&rl->wait[WRITE]);
-       }
-       spin_unlock_irq(q->queue_lock);
-       return ret;
-}
-
-static ssize_t queue_ra_show(struct request_queue *q, char *page)
-{
-       int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
-
-       return queue_var_show(ra_kb, (page));
-}
-
-static ssize_t
-queue_ra_store(struct request_queue *q, const char *page, size_t count)
-{
-       unsigned long ra_kb;
-       ssize_t ret = queue_var_store(&ra_kb, page, count);
-
-       spin_lock_irq(q->queue_lock);
-       q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
-       spin_unlock_irq(q->queue_lock);
-
-       return ret;
-}
-
-static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
-{
-       int max_sectors_kb = q->max_sectors >> 1;
-
-       return queue_var_show(max_sectors_kb, (page));
-}
-
-static ssize_t
-queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
-{
-       unsigned long max_sectors_kb,
-                       max_hw_sectors_kb = q->max_hw_sectors >> 1,
-                       page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
-       ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
-
-       if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
-               return -EINVAL;
-       /*
-        * Take the queue lock to update the readahead and max_sectors
-        * values synchronously:
-        */
-       spin_lock_irq(q->queue_lock);
-       q->max_sectors = max_sectors_kb << 1;
-       spin_unlock_irq(q->queue_lock);
-
-       return ret;
-}
-
-static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
-{
-       int max_hw_sectors_kb = q->max_hw_sectors >> 1;
-
-       return queue_var_show(max_hw_sectors_kb, (page));
-}
-
-
-static struct queue_sysfs_entry queue_requests_entry = {
-       .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
-       .show = queue_requests_show,
-       .store = queue_requests_store,
-};
-
-static struct queue_sysfs_entry queue_ra_entry = {
-       .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
-       .show = queue_ra_show,
-       .store = queue_ra_store,
-};
-
-static struct queue_sysfs_entry queue_max_sectors_entry = {
-       .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
-       .show = queue_max_sectors_show,
-       .store = queue_max_sectors_store,
-};
-
-static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
-       .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
-       .show = queue_max_hw_sectors_show,
-};
-
-static struct queue_sysfs_entry queue_iosched_entry = {
-       .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
-       .show = elv_iosched_show,
-       .store = elv_iosched_store,
-};
-
-static struct attribute *default_attrs[] = {
-       &queue_requests_entry.attr,
-       &queue_ra_entry.attr,
-       &queue_max_hw_sectors_entry.attr,
-       &queue_max_sectors_entry.attr,
-       &queue_iosched_entry.attr,
-       NULL,
-};
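[Editor's illustration] Every entry in default_attrs[] above follows the same shape: a queue_sysfs_entry couples a named sysfs attribute with show/store helpers built on queue_var_show()/queue_var_store(). Below is a hypothetical extra read-only attribute written in the style it would take alongside the entries above; the attribute name is invented and it reuses nr_requests purely as a stand-in value.

    /* Hypothetical read-only attribute; reports q->nr_requests as its value. */
    static ssize_t queue_example_show(struct request_queue *q, char *page)
    {
            return queue_var_show(q->nr_requests, page);
    }

    static struct queue_sysfs_entry queue_example_entry = {
            .attr = { .name = "example_limit", .mode = S_IRUGO },
            .show = queue_example_show,
    };

    /* ...and &queue_example_entry.attr would then be listed in default_attrs[]. */
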
-
-#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
-
-static ssize_t
-queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-       struct queue_sysfs_entry *entry = to_queue(attr);
-       struct request_queue *q =
-               container_of(kobj, struct request_queue, kobj);
-       ssize_t res;
-
-       if (!entry->show)
-               return -EIO;
-       mutex_lock(&q->sysfs_lock);
-       if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
-               mutex_unlock(&q->sysfs_lock);
-               return -ENOENT;
-       }
-       res = entry->show(q, page);
-       mutex_unlock(&q->sysfs_lock);
-       return res;
-}
-
-static ssize_t
-queue_attr_store(struct kobject *kobj, struct attribute *attr,
-                   const char *page, size_t length)
-{
-       struct queue_sysfs_entry *entry = to_queue(attr);
-       struct request_queue *q = container_of(kobj, struct request_queue, kobj);
-
-       ssize_t res;
-
-       if (!entry->store)
-               return -EIO;
-       mutex_lock(&q->sysfs_lock);
-       if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
-               mutex_unlock(&q->sysfs_lock);
-               return -ENOENT;
-       }
-       res = entry->store(q, page, length);
-       mutex_unlock(&q->sysfs_lock);
-       return res;
-}
-
-static struct sysfs_ops queue_sysfs_ops = {
-       .show   = queue_attr_show,
-       .store  = queue_attr_store,
-};
-
-static struct kobj_type queue_ktype = {
-       .sysfs_ops      = &queue_sysfs_ops,
-       .default_attrs  = default_attrs,
-       .release        = blk_release_queue,
-};
-
-int blk_register_queue(struct gendisk *disk)
-{
-       int ret;
-
-       struct request_queue *q = disk->queue;
-
-       if (!q || !q->request_fn)
-               return -ENXIO;
-
-       ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj),
-                         "%s", "queue");
-       if (ret < 0)
-               return ret;
-
-       kobject_uevent(&q->kobj, KOBJ_ADD);
-
-       ret = elv_register_queue(q);
-       if (ret) {
-               kobject_uevent(&q->kobj, KOBJ_REMOVE);
-               kobject_del(&q->kobj);
-               return ret;
-       }
-
-       return 0;
-}
-
-void blk_unregister_queue(struct gendisk *disk)
-{
-       struct request_queue *q = disk->queue;
-
-       if (q && q->request_fn) {
-               elv_unregister_queue(q);
-
-               kobject_uevent(&q->kobj, KOBJ_REMOVE);
-               kobject_del(&q->kobj);
-               kobject_put(&disk->dev.kobj);
-       }
-}
index f4076d9e9902b88981c840097a9f283526c1e522..08d4ae201597cde366f07efc2a4a8f4f8e08d83e 100644 (file)
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
index 8cb37e3557d490910efbea741a26ac9465f1db00..0ee9a8a4095e6a5106789717dc57516a73d81fa8 100644 (file)
@@ -38,7 +38,7 @@ obj-$(CONFIG_SCSI)            += scsi/
 obj-$(CONFIG_ATA)              += ata/
 obj-$(CONFIG_FUSION)           += message/
 obj-$(CONFIG_FIREWIRE)         += firewire/
-obj-$(CONFIG_IEEE1394)         += ieee1394/
+obj-y                          += ieee1394/
 obj-$(CONFIG_UIO)              += uio/
 obj-y                          += cdrom/
 obj-y                          += auxdisplay/
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI)             += spi/
 obj-$(CONFIG_PCCARD)           += pcmcia/
 obj-$(CONFIG_DIO)              += dio/
 obj-$(CONFIG_SBUS)             += sbus/
-obj-$(CONFIG_KVM)              += kvm/
 obj-$(CONFIG_ZORRO)            += zorro/
 obj-$(CONFIG_MAC)              += macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)     += block/aoe/
@@ -73,7 +72,7 @@ obj-$(CONFIG_ISDN)            += isdn/
 obj-$(CONFIG_EDAC)             += edac/
 obj-$(CONFIG_MCA)              += mca/
 obj-$(CONFIG_EISA)             += eisa/
-obj-$(CONFIG_LGUEST_GUEST)     += lguest/
+obj-y                          += lguest/
 obj-$(CONFIG_CPU_FREQ)         += cpufreq/
 obj-$(CONFIG_CPU_IDLE)         += cpuidle/
 obj-$(CONFIG_MMC)              += mmc/
index 2235f4e02d26f46267b512a0d3c4c14badc34526..eb1f82f79153c0d3c6f9136b8a88faf829c886d1 100644 (file)
@@ -357,6 +357,26 @@ int acpi_processor_resume(struct acpi_device * device)
        return 0;
 }
 
+#if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
+static int tsc_halts_in_c(int state)
+{
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_AMD:
+               /*
+                * AMD Fam10h TSC will tick in all
+                * C/P/S0/S1 states when this bit is set.
+                */
+               if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+                       return 0;
+               /*FALL THROUGH*/
+       case X86_VENDOR_INTEL:
+               /* Several cases known where TSC halts in C2 too */
+       default:
+               return state > ACPI_STATE_C1;
+       }
+}
+#endif
+
 #ifndef CONFIG_CPU_IDLE
 static void acpi_processor_idle(void)
 {
@@ -516,7 +536,8 @@ static void acpi_processor_idle(void)
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
                /* TSC halts in C2, so notify users */
-               mark_tsc_unstable("possible TSC halt in C2");
+               if (tsc_halts_in_c(ACPI_STATE_C2))
+                       mark_tsc_unstable("possible TSC halt in C2");
 #endif
                /* Compute time (ticks) that we were actually asleep */
                sleep_ticks = ticks_elapsed(t1, t2);
@@ -534,6 +555,7 @@ static void acpi_processor_idle(void)
                break;
 
        case ACPI_STATE_C3:
+               acpi_unlazy_tlb(smp_processor_id());
                /*
                 * Must be done before busmaster disable as we might
                 * need to access HPET !
@@ -579,7 +601,8 @@ static void acpi_processor_idle(void)
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
                /* TSC halts in C3, so notify users */
-               mark_tsc_unstable("TSC halts in C3");
+               if (tsc_halts_in_c(ACPI_STATE_C3))
+                       mark_tsc_unstable("TSC halts in C3");
 #endif
                /* Compute time (ticks) that we were actually asleep */
                sleep_ticks = ticks_elapsed(t1, t2);
@@ -1423,6 +1446,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
                return 0;
        }
 
+       acpi_unlazy_tlb(smp_processor_id());
        /*
         * Must be done before busmaster disable as we might need to
         * access HPET !
@@ -1443,7 +1467,8 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
        /* TSC could halt in idle, so notify users */
-       mark_tsc_unstable("TSC halts in idle");;
+       if (tsc_halts_in_c(cx->type))
+               mark_tsc_unstable("TSC halts in idle");;
 #endif
        sleep_ticks = ticks_elapsed(t1, t2);
 
@@ -1554,7 +1579,8 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86_TSC)
        /* TSC could halt in idle, so notify users */
-       mark_tsc_unstable("TSC halts in idle");
+       if (tsc_halts_in_c(ACPI_STATE_C3))
+               mark_tsc_unstable("TSC halts in idle");
 #endif
        sleep_ticks = ticks_elapsed(t1, t2);
        /* Tell the scheduler how much we idled: */
index f484495b2ad1bd27d83fae5cc661349e9cbdb78c..055989e94799eff3fb6e0b44823835257e8feb28 100644 (file)
@@ -163,15 +163,6 @@ static struct kset *bus_kset;
 
 #ifdef CONFIG_HOTPLUG
 /* Manually detach a device from its associated driver. */
-static int driver_helper(struct device *dev, void *data)
-{
-       const char *name = data;
-
-       if (strcmp(name, dev->bus_id) == 0)
-               return 1;
-       return 0;
-}
-
 static ssize_t driver_unbind(struct device_driver *drv,
                             const char *buf, size_t count)
 {
@@ -179,7 +170,7 @@ static ssize_t driver_unbind(struct device_driver *drv,
        struct device *dev;
        int err = -ENODEV;
 
-       dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+       dev = bus_find_device_by_name(bus, NULL, buf);
        if (dev && dev->driver == drv) {
                if (dev->parent)        /* Needed for USB */
                        down(&dev->parent->sem);
@@ -206,7 +197,7 @@ static ssize_t driver_bind(struct device_driver *drv,
        struct device *dev;
        int err = -ENODEV;
 
-       dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+       dev = bus_find_device_by_name(bus, NULL, buf);
        if (dev && dev->driver == NULL) {
                if (dev->parent)        /* Needed for USB */
                        down(&dev->parent->sem);
@@ -250,7 +241,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus,
 {
        struct device *dev;
 
-       dev = bus_find_device(bus, NULL, (void *)buf, driver_helper);
+       dev = bus_find_device_by_name(bus, NULL, buf);
        if (!dev)
                return -ENODEV;
        if (bus_rescan_devices_helper(dev, NULL) != 0)
@@ -338,6 +329,32 @@ struct device *bus_find_device(struct bus_type *bus,
 }
 EXPORT_SYMBOL_GPL(bus_find_device);
 
+static int match_name(struct device *dev, void *data)
+{
+       const char *name = data;
+
+       if (strcmp(name, dev->bus_id) == 0)
+               return 1;
+       return 0;
+}
+
+/**
+ * bus_find_device_by_name - device iterator for locating a particular device of a specific name
+ * @bus: bus type
+ * @start: Device to begin with
+ * @name: name of the device to match
+ *
+ * This is similar to the bus_find_device() function above, but it handles
+ * searching by a name automatically, no need to write another strcmp matching
+ * function.
+ */
+struct device *bus_find_device_by_name(struct bus_type *bus,
+                                      struct device *start, const char *name)
+{
+       return bus_find_device(bus, start, (void *)name, match_name);
+}
+EXPORT_SYMBOL_GPL(bus_find_device_by_name);
+
 static struct device_driver *next_driver(struct klist_iter *i)
 {
        struct klist_node *n = klist_next(i);
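[Editor's illustration] As a usage note for the newly added bus_find_device_by_name(): callers get back a referenced device, since the underlying bus_find_device() takes a reference while iterating, and must drop it with put_device() when done. The bus pointer and name in this sketch are purely illustrative.

    #include <linux/device.h>

    static void example_poke_named_device(struct bus_type *bus, const char *name)
    {
            struct device *dev;

            dev = bus_find_device_by_name(bus, NULL, name);
            if (!dev)
                    return;

            dev_info(dev, "found by bus_id\n");

            put_device(dev);        /* release the reference the lookup took */
    }
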
index 59cf35894cfce832c814128baa495a2e35f4971b..9d915376c313d1049e9dc0c8cdd65d2c788db3bd 100644 (file)
@@ -149,7 +149,7 @@ int class_register(struct class *cls)
        if (error)
                return error;
 
-#ifdef CONFIG_SYSFS_DEPRECATED
+#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
        /* let the block class directory show up in the root of sysfs */
        if (cls != &block_class)
                cls->subsys.kobj.kset = class_kset;
@@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
  * The callback should return 0 if the device doesn't match and non-zero
  * if it does.  If the callback returns non-zero, this function will
  * return to the caller and not iterate over any more devices.
-
+ *
  * Note, you will need to drop the reference with put_device() after use.
  *
  * We hold class->sem in this function, so it can not be
index edf3bbeb8d6a0af7438119b915aafb3f0111617d..b1727876182cdff178550856fc0696a770db9bdc 100644 (file)
 int (*platform_notify)(struct device *dev) = NULL;
 int (*platform_notify_remove)(struct device *dev) = NULL;
 
-/*
- * sysfs bindings for devices.
- */
+#ifdef CONFIG_BLOCK
+static inline int device_is_not_partition(struct device *dev)
+{
+       return !(dev->type == &part_type);
+}
+#else
+static inline int device_is_not_partition(struct device *dev)
+{
+       return 1;
+}
+#endif
 
 /**
  * dev_driver_string - Return a device's driver name, if at all possible
@@ -652,14 +660,14 @@ static int device_add_class_symlinks(struct device *dev)
 #ifdef CONFIG_SYSFS_DEPRECATED
        /* stacked class devices need a symlink in the class directory */
        if (dev->kobj.parent != &dev->class->subsys.kobj &&
-           dev->type != &part_type) {
+           device_is_not_partition(dev)) {
                error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj,
                                          dev->bus_id);
                if (error)
                        goto out_subsys;
        }
 
-       if (dev->parent && dev->type != &part_type) {
+       if (dev->parent && device_is_not_partition(dev)) {
                struct device *parent = dev->parent;
                char *class_name;
 
@@ -688,11 +696,11 @@ static int device_add_class_symlinks(struct device *dev)
        return 0;
 
 out_device:
-       if (dev->parent && dev->type != &part_type)
+       if (dev->parent && device_is_not_partition(dev))
                sysfs_remove_link(&dev->kobj, "device");
 out_busid:
        if (dev->kobj.parent != &dev->class->subsys.kobj &&
-           dev->type != &part_type)
+           device_is_not_partition(dev))
                sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
 #else
        /* link in the class directory pointing to the device */
@@ -701,7 +709,7 @@ out_busid:
        if (error)
                goto out_subsys;
 
-       if (dev->parent && dev->type != &part_type) {
+       if (dev->parent && device_is_not_partition(dev)) {
                error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
                                          "device");
                if (error)
@@ -725,7 +733,7 @@ static void device_remove_class_symlinks(struct device *dev)
                return;
 
 #ifdef CONFIG_SYSFS_DEPRECATED
-       if (dev->parent && dev->type != &part_type) {
+       if (dev->parent && device_is_not_partition(dev)) {
                char *class_name;
 
                class_name = make_class_name(dev->class->name, &dev->kobj);
@@ -737,10 +745,10 @@ static void device_remove_class_symlinks(struct device *dev)
        }
 
        if (dev->kobj.parent != &dev->class->subsys.kobj &&
-           dev->type != &part_type)
+           device_is_not_partition(dev))
                sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
 #else
-       if (dev->parent && dev->type != &part_type)
+       if (dev->parent && device_is_not_partition(dev))
                sysfs_remove_link(&dev->kobj, "device");
 
        sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id);
index ef50068def8896e6e21b8c4a24f44bcc516c7b47..855ce8e5efbaf3aadca40a5f2405b026b751541f 100644 (file)
@@ -2524,7 +2524,6 @@ after_error_processing:
                resend_cciss_cmd(h, cmd);
                return;
        }
-       cmd->rq->data_len = 0;
        cmd->rq->completion_data = cmd;
        blk_complete_request(cmd->rq);
 }
index 2c81465fd60c9c10c6b6b0ee689cc58b01e458a1..78ebfffc77e33627b788e52e72094329e226f91a 100644 (file)
@@ -483,7 +483,6 @@ static void ace_fsm_dostate(struct ace_device *ace)
        u32 status;
        u16 val;
        int count;
-       int i;
 
 #if defined(DEBUG)
        dev_dbg(ace->dev, "fsm_state=%i, id_req_count=%i\n",
@@ -688,7 +687,6 @@ static void ace_fsm_dostate(struct ace_device *ace)
                }
 
                /* Transfer the next buffer */
-               i = 16;
                if (ace->fsm_task == ACE_TASK_WRITE)
                        ace->reg_ops->dataout(ace);
                else
@@ -702,8 +700,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
                }
 
                /* bio finished; is there another one? */
-               i = ace->req->current_nr_sectors;
-               if (__blk_end_request(ace->req, 0, i)) {
+               if (__blk_end_request(ace->req, 0,
+                                       blk_rq_cur_bytes(ace->req))) {
                        /* dev_dbg(ace->dev, "next block; h=%li c=%i\n",
                         *      ace->req->hard_nr_sectors,
                         *      ace->req->current_nr_sectors);
index aa5ddb716ffb4e613d6018b94db5651af9137ea8..1ffb381130c3188c4e89e501e454aeee58c70b40 100644 (file)
@@ -145,7 +145,6 @@ static void *m1541_alloc_page(struct agp_bridge_data *bridge)
        void *addr = agp_generic_alloc_page(agp_bridge);
        u32 temp;
 
-       global_flush_tlb();
        if (!addr)
                return NULL;
 
@@ -162,7 +161,6 @@ static void ali_destroy_page(void * addr, int flags)
                if (flags & AGP_PAGE_DESTROY_UNMAP) {
                        global_cache_flush();   /* is this really needed?  --hch */
                        agp_generic_destroy_page(addr, flags);
-                       global_flush_tlb();
                } else
                        agp_generic_destroy_page(addr, flags);
        }
index 832ded20fe70f60e2aa77ef3730beb94aac02cd8..2720882e66fec157dd987f0c665ae732171aa8df 100644 (file)
@@ -147,7 +147,6 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
                        printk(KERN_ERR PFX "unable to get memory for scratch page.\n");
                        return -ENOMEM;
                }
-               flush_agp_mappings();
 
                bridge->scratch_page_real = virt_to_gart(addr);
                bridge->scratch_page =
@@ -191,7 +190,6 @@ err_out:
        if (bridge->driver->needs_scratch_page) {
                bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
                                                 AGP_PAGE_DESTROY_UNMAP);
-               flush_agp_mappings();
                bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
                                                 AGP_PAGE_DESTROY_FREE);
        }
@@ -219,7 +217,6 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge)
            bridge->driver->needs_scratch_page) {
                bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
                                                 AGP_PAGE_DESTROY_UNMAP);
-               flush_agp_mappings();
                bridge->driver->agp_destroy_page(gart_to_virt(bridge->scratch_page_real),
                                                 AGP_PAGE_DESTROY_FREE);
        }
index 64b2f6d7059dc96f494e49ace44a2601937d5a85..1a4674ce0c718f4c1c3a9bceb8342551fc5e73b8 100644 (file)
@@ -197,7 +197,6 @@ void agp_free_memory(struct agp_memory *curr)
                for (i = 0; i < curr->page_count; i++) {
                        curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_UNMAP);
                }
-               flush_agp_mappings();
                for (i = 0; i < curr->page_count; i++) {
                        curr->bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i]), AGP_PAGE_DESTROY_FREE);
                }
@@ -267,8 +266,6 @@ struct agp_memory *agp_allocate_memory(struct agp_bridge_data *bridge,
        }
        new->bridge = bridge;
 
-       flush_agp_mappings();
-
        return new;
 }
 EXPORT_SYMBOL(agp_allocate_memory);
index e72a83e2bad54bb46c8a426ebb18add2872aa30d..76f581c85a7d5697120668c3bfcea4a5b1b445ce 100644 (file)
@@ -527,7 +527,6 @@ static void *i460_alloc_page (struct agp_bridge_data *bridge)
 
        if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
                page = agp_generic_alloc_page(agp_bridge);
-               global_flush_tlb();
        } else
                /* Returning NULL would cause problems */
                /* AK: really dubious code. */
@@ -539,7 +538,6 @@ static void i460_destroy_page (void *page, int flags)
 {
        if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
                agp_generic_destroy_page(page, flags);
-               global_flush_tlb();
        }
 }
 
index 03eac1eb8e0fc863356905611bf344a5828a1554..189efb6ef970e81b07aa69e90847ddabd14b161e 100644 (file)
@@ -210,13 +210,11 @@ static void *i8xx_alloc_pages(void)
        if (page == NULL)
                return NULL;
 
-       if (change_page_attr(page, 4, PAGE_KERNEL_NOCACHE) < 0) {
-               change_page_attr(page, 4, PAGE_KERNEL);
-               global_flush_tlb();
+       if (set_pages_uc(page, 4) < 0) {
+               set_pages_wb(page, 4);
                __free_pages(page, 2);
                return NULL;
        }
-       global_flush_tlb();
        get_page(page);
        atomic_inc(&agp_bridge->current_memory_agp);
        return page_address(page);
@@ -230,8 +228,7 @@ static void i8xx_destroy_pages(void *addr)
                return;
 
        page = virt_to_page(addr);
-       change_page_attr(page, 4, PAGE_KERNEL);
-       global_flush_tlb();
+       set_pages_wb(page, 4);
        put_page(page);
        __free_pages(page, 2);
        atomic_dec(&agp_bridge->current_memory_agp);
@@ -341,7 +338,6 @@ static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type)
 
        switch (pg_count) {
        case 1: addr = agp_bridge->driver->agp_alloc_page(agp_bridge);
-               global_flush_tlb();
                break;
        case 4:
                /* kludge to get 4 physical pages for ARGB cursor */
@@ -404,7 +400,6 @@ static void intel_i810_free_by_type(struct agp_memory *curr)
                else {
                        agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
                                                             AGP_PAGE_DESTROY_UNMAP);
-                       global_flush_tlb();
                        agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0]),
                                                             AGP_PAGE_DESTROY_FREE);
                }
index 4c16778e3f846dd97937a79e2c4b38bd30f5accc..465ad35ed38f63e66c9035538054269d3202b9a3 100644 (file)
@@ -600,63 +600,6 @@ static int hpet_is_known(struct hpet_data *hdp)
        return 0;
 }
 
-EXPORT_SYMBOL(hpet_alloc);
-EXPORT_SYMBOL(hpet_register);
-EXPORT_SYMBOL(hpet_unregister);
-EXPORT_SYMBOL(hpet_control);
-
-int hpet_register(struct hpet_task *tp, int periodic)
-{
-       unsigned int i;
-       u64 mask;
-       struct hpet_timer __iomem *timer;
-       struct hpet_dev *devp;
-       struct hpets *hpetp;
-
-       switch (periodic) {
-       case 1:
-               mask = Tn_PER_INT_CAP_MASK;
-               break;
-       case 0:
-               mask = 0;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       tp->ht_opaque = NULL;
-
-       spin_lock_irq(&hpet_task_lock);
-       spin_lock(&hpet_lock);
-
-       for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next)
-               for (timer = hpetp->hp_hpet->hpet_timers, i = 0;
-                    i < hpetp->hp_ntimer; i++, timer++) {
-                       if ((readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK)
-                           != mask)
-                               continue;
-
-                       devp = &hpetp->hp_dev[i];
-
-                       if (devp->hd_flags & HPET_OPEN || devp->hd_task) {
-                               devp = NULL;
-                               continue;
-                       }
-
-                       tp->ht_opaque = devp;
-                       devp->hd_task = tp;
-                       break;
-               }
-
-       spin_unlock(&hpet_lock);
-       spin_unlock_irq(&hpet_task_lock);
-
-       if (tp->ht_opaque)
-               return 0;
-       else
-               return -EBUSY;
-}
-
 static inline int hpet_tpcheck(struct hpet_task *tp)
 {
        struct hpet_dev *devp;
@@ -706,24 +649,6 @@ int hpet_unregister(struct hpet_task *tp)
        return 0;
 }
 
-int hpet_control(struct hpet_task *tp, unsigned int cmd, unsigned long arg)
-{
-       struct hpet_dev *devp;
-       int err;
-
-       if ((err = hpet_tpcheck(tp)))
-               return err;
-
-       spin_lock_irq(&hpet_lock);
-       devp = tp->ht_opaque;
-       if (devp->hd_task != tp) {
-               spin_unlock_irq(&hpet_lock);
-               return -ENXIO;
-       }
-       spin_unlock_irq(&hpet_lock);
-       return hpet_ioctl_common(devp, cmd, arg, 1);
-}
-
 static ctl_table hpet_table[] = {
        {
         .ctl_name = CTL_UNNUMBERED,
@@ -806,14 +731,14 @@ static unsigned long hpet_calibrate(struct hpets *hpetp)
 
 int hpet_alloc(struct hpet_data *hdp)
 {
-       u64 cap, mcfg;
+       u64 cap, mcfg, hpet_config;
        struct hpet_dev *devp;
-       u32 i, ntimer;
+       u32 i, ntimer, irq;
        struct hpets *hpetp;
        size_t siz;
        struct hpet __iomem *hpet;
        static struct hpets *last = NULL;
-       unsigned long period;
+       unsigned long period, irq_bitmap;
        unsigned long long temp;
 
        /*
@@ -840,11 +765,47 @@ int hpet_alloc(struct hpet_data *hdp)
        hpetp->hp_hpet_phys = hdp->hd_phys_address;
 
        hpetp->hp_ntimer = hdp->hd_nirqs;
+       hpet = hpetp->hp_hpet;
 
-       for (i = 0; i < hdp->hd_nirqs; i++)
-               hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i];
+       /* Assign IRQs statically for legacy devices */
+       hpetp->hp_dev[0].hd_hdwirq = hdp->hd_irq[0];
+       hpetp->hp_dev[1].hd_hdwirq = hdp->hd_irq[1];
 
-       hpet = hpetp->hp_hpet;
+       /* Assign IRQs dynamically for the others */
+       for (i = 2, devp = &hpetp->hp_dev[2]; i < hdp->hd_nirqs; i++, devp++) {
+               struct hpet_timer __iomem *timer;
+
+               timer = &hpet->hpet_timers[devp - hpetp->hp_dev];
+
+               /* Check if there's already an IRQ assigned to the timer */
+               if (hdp->hd_irq[i]) {
+                       hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i];
+                       continue;
+               }
+
+               hpet_config = readq(&timer->hpet_config);
+               irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK)
+                       >> Tn_INT_ROUTE_CAP_SHIFT;
+               if (!irq_bitmap)
+                       irq = 0;        /* No valid IRQ Assignable */
+               else {
+                       irq = find_first_bit(&irq_bitmap, 32);
+                       do {
+                               hpet_config |= irq << Tn_INT_ROUTE_CNF_SHIFT;
+                               writeq(hpet_config, &timer->hpet_config);
+
+                               /*
+                                * Verify whether we have written a valid
+                                * IRQ number by reading it back again
+                                */
+                               hpet_config = readq(&timer->hpet_config);
+                               if (irq == (hpet_config & Tn_INT_ROUTE_CNF_MASK)
+                                               >> Tn_INT_ROUTE_CNF_SHIFT)
+                                       break;  /* Success */
+                       } while ((irq = (find_next_bit(&irq_bitmap, 32, irq))));
+               }
+               hpetp->hp_dev[i].hd_hdwirq = irq;
+       }
 
        cap = readq(&hpet->hpet_cap);
 
@@ -875,7 +836,8 @@ int hpet_alloc(struct hpet_data *hdp)
                hpetp->hp_which, hdp->hd_phys_address,
                hpetp->hp_ntimer > 1 ? "s" : "");
        for (i = 0; i < hpetp->hp_ntimer; i++)
-               printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]);
+               printk("%s %d", i > 0 ? "," : "",
+                               hpetp->hp_dev[i].hd_hdwirq);
        printk("\n");
 
        printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n",
index 0c66b802736a10540bca3ed9d7ef34f5b0208b4a..78b151c4d20f94b0bccecc1acb3540e14b77cdc9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *     Real Time Clock interface for Linux     
+ *     Real Time Clock interface for Linux
  *
  *     Copyright (C) 1996 Paul Gortmaker
  *
@@ -17,7 +17,7 @@
  *     has been received. If a RTC interrupt has already happened,
  *     it will output an unsigned long and then block. The output value
  *     contains the interrupt status in the low byte and the number of
- *     interrupts since the last read in the remaining high bytes. The 
+ *     interrupts since the last read in the remaining high bytes. The
  *     /dev/rtc interface can also be used with the select(2) call.
  *
  *     This program is free software; you can redistribute it and/or
@@ -104,12 +104,14 @@ static int rtc_has_irq = 1;
 
 #ifndef CONFIG_HPET_EMULATE_RTC
 #define is_hpet_enabled()                      0
-#define hpet_set_alarm_time(hrs, min, sec)     0
-#define hpet_set_periodic_freq(arg)            0
-#define hpet_mask_rtc_irq_bit(arg)             0
-#define hpet_set_rtc_irq_bit(arg)              0
-#define hpet_rtc_timer_init()                  do { } while (0)
-#define hpet_rtc_dropped_irq()                         0
+#define hpet_set_alarm_time(hrs, min, sec)     0
+#define hpet_set_periodic_freq(arg)            0
+#define hpet_mask_rtc_irq_bit(arg)             0
+#define hpet_set_rtc_irq_bit(arg)              0
+#define hpet_rtc_timer_init()                  do { } while (0)
+#define hpet_rtc_dropped_irq()                 0
+#define hpet_register_irq_handler(h)           0
+#define hpet_unregister_irq_handler(h)         0
 #ifdef RTC_IRQ
 static irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 {
@@ -147,7 +149,7 @@ static int rtc_ioctl(struct inode *inode, struct file *file,
 static unsigned int rtc_poll(struct file *file, poll_table *wait);
 #endif
 
-static void get_rtc_alm_time (struct rtc_time *alm_tm);
+static void get_rtc_alm_time(struct rtc_time *alm_tm);
 #ifdef RTC_IRQ
 static void set_rtc_irq_bit_locked(unsigned char bit);
 static void mask_rtc_irq_bit_locked(unsigned char bit);
@@ -185,9 +187,9 @@ static int rtc_proc_open(struct inode *inode, struct file *file);
  * rtc_status but before mod_timer is called, which would then reenable the
  * timer (but you would need to have an awful timing before you'd trip on it)
  */
-static unsigned long rtc_status = 0;   /* bitmapped status byte.       */
-static unsigned long rtc_freq = 0;     /* Current periodic IRQ rate    */
-static unsigned long rtc_irq_data = 0; /* our output to the world      */
+static unsigned long rtc_status;       /* bitmapped status byte.       */
+static unsigned long rtc_freq;         /* Current periodic IRQ rate    */
+static unsigned long rtc_irq_data;     /* our output to the world      */
 static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
 
 #ifdef RTC_IRQ
@@ -195,7 +197,7 @@ static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */
  * rtc_task_lock nests inside rtc_lock.
  */
 static DEFINE_SPINLOCK(rtc_task_lock);
-static rtc_task_t *rtc_callback = NULL;
+static rtc_task_t *rtc_callback;
 #endif
 
 /*
@@ -205,7 +207,7 @@ static rtc_task_t *rtc_callback = NULL;
 
 static unsigned long epoch = 1900;     /* year corresponding to 0x00   */
 
-static const unsigned char days_in_mo[] = 
+static const unsigned char days_in_mo[] =
 {0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
 
 /*
@@ -242,7 +244,7 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
         *      the last read in the remainder of rtc_irq_data.
         */
 
-       spin_lock (&rtc_lock);
+       spin_lock(&rtc_lock);
        rtc_irq_data += 0x100;
        rtc_irq_data &= ~0xff;
        if (is_hpet_enabled()) {
@@ -259,16 +261,16 @@ irqreturn_t rtc_interrupt(int irq, void *dev_id)
        if (rtc_status & RTC_TIMER_ON)
                mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
 
-       spin_unlock (&rtc_lock);
+       spin_unlock(&rtc_lock);
 
        /* Now do the rest of the actions */
        spin_lock(&rtc_task_lock);
        if (rtc_callback)
                rtc_callback->func(rtc_callback->private_data);
        spin_unlock(&rtc_task_lock);
-       wake_up_interruptible(&rtc_wait);       
+       wake_up_interruptible(&rtc_wait);
 
-       kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+       kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
 
        return IRQ_HANDLED;
 }
@@ -335,7 +337,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
        DECLARE_WAITQUEUE(wait, current);
        unsigned long data;
        ssize_t retval;
-       
+
        if (rtc_has_irq == 0)
                return -EIO;
 
@@ -358,11 +360,11 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
                 * confusing. And no, xchg() is not the answer. */
 
                __set_current_state(TASK_INTERRUPTIBLE);
-               
-               spin_lock_irq (&rtc_lock);
+
+               spin_lock_irq(&rtc_lock);
                data = rtc_irq_data;
                rtc_irq_data = 0;
-               spin_unlock_irq (&rtc_lock);
+               spin_unlock_irq(&rtc_lock);
 
                if (data != 0)
                        break;
@@ -378,10 +380,13 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
                schedule();
        } while (1);
 
-       if (count == sizeof(unsigned int))
-               retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int);
-       else
-               retval = put_user(data, (unsigned long __user *)buf) ?: sizeof(long);
+       if (count == sizeof(unsigned int)) {
+               retval = put_user(data,
+                                 (unsigned int __user *)buf) ?: sizeof(int);
+       } else {
+               retval = put_user(data,
+                                 (unsigned long __user *)buf) ?: sizeof(long);
+       }
        if (!retval)
                retval = count;
  out:
@@ -394,7 +399,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
 
 static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
 {
-       struct rtc_time wtime; 
+       struct rtc_time wtime;
 
 #ifdef RTC_IRQ
        if (rtc_has_irq == 0) {
@@ -426,35 +431,41 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
        }
        case RTC_PIE_OFF:       /* Mask periodic int. enab. bit */
        {
-               unsigned long flags; /* can be called from isr via rtc_control() */
-               spin_lock_irqsave (&rtc_lock, flags);
+               /* can be called from isr via rtc_control() */
+               unsigned long flags;
+
+               spin_lock_irqsave(&rtc_lock, flags);
                mask_rtc_irq_bit_locked(RTC_PIE);
                if (rtc_status & RTC_TIMER_ON) {
                        rtc_status &= ~RTC_TIMER_ON;
                        del_timer(&rtc_irq_timer);
                }
-               spin_unlock_irqrestore (&rtc_lock, flags);
+               spin_unlock_irqrestore(&rtc_lock, flags);
+
                return 0;
        }
        case RTC_PIE_ON:        /* Allow periodic ints          */
        {
-               unsigned long flags; /* can be called from isr via rtc_control() */
+               /* can be called from isr via rtc_control() */
+               unsigned long flags;
+
                /*
                 * We don't really want Joe User enabling more
                 * than 64Hz of interrupts on a multi-user machine.
                 */
                if (!kernel && (rtc_freq > rtc_max_user_freq) &&
-                       (!capable(CAP_SYS_RESOURCE)))
+                                               (!capable(CAP_SYS_RESOURCE)))
                        return -EACCES;
 
-               spin_lock_irqsave (&rtc_lock, flags);
+               spin_lock_irqsave(&rtc_lock, flags);
                if (!(rtc_status & RTC_TIMER_ON)) {
                        mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq +
                                        2*HZ/100);
                        rtc_status |= RTC_TIMER_ON;
                }
                set_rtc_irq_bit_locked(RTC_PIE);
-               spin_unlock_irqrestore (&rtc_lock, flags);
+               spin_unlock_irqrestore(&rtc_lock, flags);
+
                return 0;
        }
        case RTC_UIE_OFF:       /* Mask ints from RTC updates.  */
@@ -477,7 +488,7 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
                 */
                memset(&wtime, 0, sizeof(struct rtc_time));
                get_rtc_alm_time(&wtime);
-               break; 
+               break;
        }
        case RTC_ALM_SET:       /* Store a time into the alarm */
        {
@@ -505,16 +516,21 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
                         */
                }
                if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) ||
-                   RTC_ALWAYS_BCD)
-               {
-                       if (sec < 60) BIN_TO_BCD(sec);
-                       else sec = 0xff;
-
-                       if (min < 60) BIN_TO_BCD(min);
-                       else min = 0xff;
-
-                       if (hrs < 24) BIN_TO_BCD(hrs);
-                       else hrs = 0xff;
+                                                       RTC_ALWAYS_BCD) {
+                       if (sec < 60)
+                               BIN_TO_BCD(sec);
+                       else
+                               sec = 0xff;
+
+                       if (min < 60)
+                               BIN_TO_BCD(min);
+                       else
+                               min = 0xff;
+
+                       if (hrs < 24)
+                               BIN_TO_BCD(hrs);
+                       else
+                               hrs = 0xff;
                }
                CMOS_WRITE(hrs, RTC_HOURS_ALARM);
                CMOS_WRITE(min, RTC_MINUTES_ALARM);
@@ -563,11 +579,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
 
                if (day > (days_in_mo[mon] + ((mon == 2) && leap_yr)))
                        return -EINVAL;
-                       
+
                if ((hrs >= 24) || (min >= 60) || (sec >= 60))
                        return -EINVAL;
 
-               if ((yrs -= epoch) > 255)    /* They are unsigned */
+               yrs -= epoch;
+               if (yrs > 255)          /* They are unsigned */
                        return -EINVAL;
 
                spin_lock_irq(&rtc_lock);
@@ -635,9 +652,10 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
        {
                int tmp = 0;
                unsigned char val;
-               unsigned long flags; /* can be called from isr via rtc_control() */
+               /* can be called from isr via rtc_control() */
+               unsigned long flags;
 
-               /* 
+               /*
                 * The max we can do is 8192Hz.
                 */
                if ((arg < 2) || (arg > 8192))
@@ -646,7 +664,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
                 * We don't really want Joe User generating more
                 * than 64Hz of interrupts on a multi-user machine.
                 */
-               if (!kernel && (arg > rtc_max_user_freq) && (!capable(CAP_SYS_RESOURCE)))
+               if (!kernel && (arg > rtc_max_user_freq) &&
+                                       !capable(CAP_SYS_RESOURCE))
                        return -EACCES;
 
                while (arg > (1<<tmp))
@@ -674,11 +693,11 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
 #endif
        case RTC_EPOCH_READ:    /* Read the epoch.      */
        {
-               return put_user (epoch, (unsigned long __user *)arg);
+               return put_user(epoch, (unsigned long __user *)arg);
        }
        case RTC_EPOCH_SET:     /* Set the epoch.       */
        {
-               /* 
+               /*
                 * There were no RTC clocks before 1900.
                 */
                if (arg < 1900)
@@ -693,7 +712,8 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
        default:
                return -ENOTTY;
        }
-       return copy_to_user((void __user *)arg, &wtime, sizeof wtime) ? -EFAULT : 0;
+       return copy_to_user((void __user *)arg,
+                           &wtime, sizeof wtime) ? -EFAULT : 0;
 }
 
 static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
@@ -712,26 +732,25 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
  * needed here. Or anywhere else in this driver. */
 static int rtc_open(struct inode *inode, struct file *file)
 {
-       spin_lock_irq (&rtc_lock);
+       spin_lock_irq(&rtc_lock);
 
-       if(rtc_status & RTC_IS_OPEN)
+       if (rtc_status & RTC_IS_OPEN)
                goto out_busy;
 
        rtc_status |= RTC_IS_OPEN;
 
        rtc_irq_data = 0;
-       spin_unlock_irq (&rtc_lock);
+       spin_unlock_irq(&rtc_lock);
        return 0;
 
 out_busy:
-       spin_unlock_irq (&rtc_lock);
+       spin_unlock_irq(&rtc_lock);
        return -EBUSY;
 }
 
-static int rtc_fasync (int fd, struct file *filp, int on)
-
+static int rtc_fasync(int fd, struct file *filp, int on)
 {
-       return fasync_helper (fd, filp, on, &rtc_async_queue);
+       return fasync_helper(fd, filp, on, &rtc_async_queue);
 }
 
 static int rtc_release(struct inode *inode, struct file *file)
@@ -762,16 +781,16 @@ static int rtc_release(struct inode *inode, struct file *file)
        }
        spin_unlock_irq(&rtc_lock);
 
-       if (file->f_flags & FASYNC) {
-               rtc_fasync (-1, file, 0);
-       }
+       if (file->f_flags & FASYNC)
+               rtc_fasync(-1, file, 0);
 no_irq:
 #endif
 
-       spin_lock_irq (&rtc_lock);
+       spin_lock_irq(&rtc_lock);
        rtc_irq_data = 0;
        rtc_status &= ~RTC_IS_OPEN;
-       spin_unlock_irq (&rtc_lock);
+       spin_unlock_irq(&rtc_lock);
+
        return 0;
 }
 
@@ -786,9 +805,9 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
 
        poll_wait(file, &rtc_wait, wait);
 
-       spin_lock_irq (&rtc_lock);
+       spin_lock_irq(&rtc_lock);
        l = rtc_irq_data;
-       spin_unlock_irq (&rtc_lock);
+       spin_unlock_irq(&rtc_lock);
 
        if (l != 0)
                return POLLIN | POLLRDNORM;
@@ -796,14 +815,6 @@ static unsigned int rtc_poll(struct file *file, poll_table *wait)
 }
 #endif
 
-/*
- * exported stuffs
- */
-
-EXPORT_SYMBOL(rtc_register);
-EXPORT_SYMBOL(rtc_unregister);
-EXPORT_SYMBOL(rtc_control);
-
 int rtc_register(rtc_task_t *task)
 {
 #ifndef RTC_IRQ
@@ -829,6 +840,7 @@ int rtc_register(rtc_task_t *task)
        return 0;
 #endif
 }
+EXPORT_SYMBOL(rtc_register);
 
 int rtc_unregister(rtc_task_t *task)
 {
@@ -845,7 +857,7 @@ int rtc_unregister(rtc_task_t *task)
                return -ENXIO;
        }
        rtc_callback = NULL;
-       
+
        /* disable controls */
        if (!hpet_mask_rtc_irq_bit(RTC_PIE | RTC_AIE | RTC_UIE)) {
                tmp = CMOS_READ(RTC_CONTROL);
@@ -865,6 +877,7 @@ int rtc_unregister(rtc_task_t *task)
        return 0;
 #endif
 }
+EXPORT_SYMBOL(rtc_unregister);
 
 int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
 {
@@ -883,7 +896,7 @@ int rtc_control(rtc_task_t *task, unsigned int cmd, unsigned long arg)
        return rtc_do_ioctl(cmd, arg, 1);
 #endif
 }
-
+EXPORT_SYMBOL(rtc_control);
 
 /*
  *     The various file operations we support.
@@ -910,11 +923,11 @@ static struct miscdevice rtc_dev = {
 
 #ifdef CONFIG_PROC_FS
 static const struct file_operations rtc_proc_fops = {
-       .owner = THIS_MODULE,
-       .open = rtc_proc_open,
-       .read  = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
+       .owner          = THIS_MODULE,
+       .open           = rtc_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
 };
 #endif
 
@@ -965,7 +978,7 @@ static int __init rtc_init(void)
 #ifdef CONFIG_SPARC32
        for_each_ebus(ebus) {
                for_each_ebusdev(edev, ebus) {
-                       if(strcmp(edev->prom_node->name, "rtc") == 0) {
+                       if (strcmp(edev->prom_node->name, "rtc") == 0) {
                                rtc_port = edev->resource[0].start;
                                rtc_irq = edev->irqs[0];
                                goto found;
@@ -986,7 +999,8 @@ found:
         * XXX Interrupt pin #7 in Espresso is shared between RTC and
         * PCI Slot 2 INTA# (and some INTx# in Slot 1).
         */
-       if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc", (void *)&rtc_port)) {
+       if (request_irq(rtc_irq, rtc_interrupt, IRQF_SHARED, "rtc",
+                       (void *)&rtc_port)) {
                rtc_has_irq = 0;
                printk(KERN_ERR "rtc: cannot register IRQ %d\n", rtc_irq);
                return -EIO;
@@ -1015,16 +1029,26 @@ no_irq:
 
 #ifdef RTC_IRQ
        if (is_hpet_enabled()) {
+               int err;
+
                rtc_int_handler_ptr = hpet_rtc_interrupt;
+               err = hpet_register_irq_handler(rtc_interrupt);
+               if (err != 0) {
+                       printk(KERN_WARNING "hpet_register_irq_handler failed "
+                                       "in rtc_init().");
+                       return err;
+               }
        } else {
                rtc_int_handler_ptr = rtc_interrupt;
        }
 
-       if(request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED, "rtc", NULL)) {
+       if (request_irq(RTC_IRQ, rtc_int_handler_ptr, IRQF_DISABLED,
+                       "rtc", NULL)) {
                /* Yeah right, seeing as irq 8 doesn't even hit the bus. */
                rtc_has_irq = 0;
                printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ);
                rtc_release_region();
+
                return -EIO;
        }
        hpet_rtc_timer_init();
@@ -1036,6 +1060,7 @@ no_irq:
        if (misc_register(&rtc_dev)) {
 #ifdef RTC_IRQ
                free_irq(RTC_IRQ, NULL);
+               hpet_unregister_irq_handler(rtc_interrupt);
                rtc_has_irq = 0;
 #endif
                rtc_release_region();
@@ -1052,21 +1077,21 @@ no_irq:
 
 #if defined(__alpha__) || defined(__mips__)
        rtc_freq = HZ;
-       
+
        /* Each operating system on an Alpha uses its own epoch.
           Let's try to guess which one we are using now. */
-       
+
        if (rtc_is_updating() != 0)
                msleep(20);
-       
+
        spin_lock_irq(&rtc_lock);
        year = CMOS_READ(RTC_YEAR);
        ctrl = CMOS_READ(RTC_CONTROL);
        spin_unlock_irq(&rtc_lock);
-       
+
        if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
                BCD_TO_BIN(year);       /* This should never happen... */
-       
+
        if (year < 20) {
                epoch = 2000;
                guess = "SRM (post-2000)";
@@ -1087,7 +1112,8 @@ no_irq:
 #endif
        }
        if (guess)
-               printk(KERN_INFO "rtc: %s epoch (%lu) detected\n", guess, epoch);
+               printk(KERN_INFO "rtc: %s epoch (%lu) detected\n",
+                       guess, epoch);
 #endif
 #ifdef RTC_IRQ
        if (rtc_has_irq == 0)
@@ -1096,8 +1122,12 @@ no_irq:
        spin_lock_irq(&rtc_lock);
        rtc_freq = 1024;
        if (!hpet_set_periodic_freq(rtc_freq)) {
-               /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */
-               CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT);
+               /*
+                * Initialize periodic frequency to CMOS reset default,
+                * which is 1024Hz
+                */
+               CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06),
+                          RTC_FREQ_SELECT);
        }
        spin_unlock_irq(&rtc_lock);
 no_irq2:
@@ -1110,20 +1140,22 @@ no_irq2:
        return 0;
 }
 
-static void __exit rtc_exit (void)
+static void __exit rtc_exit(void)
 {
        cleanup_sysctl();
-       remove_proc_entry ("driver/rtc", NULL);
+       remove_proc_entry("driver/rtc", NULL);
        misc_deregister(&rtc_dev);
 
 #ifdef CONFIG_SPARC32
        if (rtc_has_irq)
-               free_irq (rtc_irq, &rtc_port);
+               free_irq(rtc_irq, &rtc_port);
 #else
        rtc_release_region();
 #ifdef RTC_IRQ
-       if (rtc_has_irq)
-               free_irq (RTC_IRQ, NULL);
+       if (rtc_has_irq) {
+               free_irq(RTC_IRQ, NULL);
+               hpet_unregister_irq_handler(hpet_rtc_interrupt);
+       }
 #endif
 #endif /* CONFIG_SPARC32 */
 }
@@ -1133,14 +1165,14 @@ module_exit(rtc_exit);
 
 #ifdef RTC_IRQ
 /*
- *     At IRQ rates >= 4096Hz, an interrupt may get lost altogether.
+ *     At IRQ rates >= 4096Hz, an interrupt may get lost altogether.
  *     (usually during an IDE disk interrupt, with IRQ unmasking off)
  *     Since the interrupt handler doesn't get called, the IRQ status
  *     byte doesn't get read, and the RTC stops generating interrupts.
  *     A timer is set, and will call this function if/when that happens.
  *     To get it out of this stalled state, we just read the status.
  *     At least a jiffy of interrupts (rtc_freq/HZ) will have been lost.
- *     (You *really* shouldn't be trying to use a non-realtime system 
+ *     (You *really* shouldn't be trying to use a non-realtime system
  *     for something that requires a steady > 1KHz signal anyways.)
  */
 
@@ -1148,7 +1180,7 @@ static void rtc_dropped_irq(unsigned long data)
 {
        unsigned long freq;
 
-       spin_lock_irq (&rtc_lock);
+       spin_lock_irq(&rtc_lock);
 
        if (hpet_rtc_dropped_irq()) {
                spin_unlock_irq(&rtc_lock);
@@ -1167,13 +1199,15 @@ static void rtc_dropped_irq(unsigned long data)
 
        spin_unlock_irq(&rtc_lock);
 
-       if (printk_ratelimit())
-               printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq);
+       if (printk_ratelimit()) {
+               printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+                       freq);
+       }
 
        /* Now we have new data */
        wake_up_interruptible(&rtc_wait);
 
-       kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+       kill_fasync(&rtc_async_queue, SIGIO, POLL_IN);
 }
 #endif
 
@@ -1277,7 +1311,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
         * can take just over 2ms. We wait 20ms. There is no need to
         * poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
         * If you need to know *exactly* when a second has started, enable
-        * periodic update complete interrupts, (via ioctl) and then 
+        * periodic update complete interrupts, (via ioctl) and then
         * immediately read /dev/rtc which will block until you get the IRQ.
         * Once the read clears, read the RTC time (again via ioctl). Easy.
         */
@@ -1307,8 +1341,7 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
        ctrl = CMOS_READ(RTC_CONTROL);
        spin_unlock_irqrestore(&rtc_lock, flags);
 
-       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-       {
+       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
                BCD_TO_BIN(rtc_tm->tm_sec);
                BCD_TO_BIN(rtc_tm->tm_min);
                BCD_TO_BIN(rtc_tm->tm_hour);
@@ -1326,7 +1359,8 @@ void rtc_get_rtc_time(struct rtc_time *rtc_tm)
         * Account for differences between how the RTC uses the values
         * and how they are defined in a struct rtc_time;
         */
-       if ((rtc_tm->tm_year += (epoch - 1900)) <= 69)
+       rtc_tm->tm_year += epoch - 1900;
+       if (rtc_tm->tm_year <= 69)
                rtc_tm->tm_year += 100;
 
        rtc_tm->tm_mon--;
@@ -1347,8 +1381,7 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm)
        ctrl = CMOS_READ(RTC_CONTROL);
        spin_unlock_irq(&rtc_lock);
 
-       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-       {
+       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
                BCD_TO_BIN(alm_tm->tm_sec);
                BCD_TO_BIN(alm_tm->tm_min);
                BCD_TO_BIN(alm_tm->tm_hour);
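
The rtc_get_rtc_time() hunk above splits the old compound assignment into two statements without changing its meaning: the CMOS year is offset by the configured epoch and then windowed into the 1900-based count that struct rtc_time expects. A minimal standalone sketch of that arithmetic (the helper name is illustrative, not part of the driver):

static int cmos_year_to_tm_year(int cmos_year, unsigned long epoch)
{
        int tm_year = cmos_year + (epoch - 1900);

        if (tm_year <= 69)      /* e.g. epoch 1900, CMOS year 05 ...   */
                tm_year += 100; /* ... becomes 105, i.e. the year 2005 */
        return tm_year;
}
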
index 5efd5550f4ca7a4afcf1a606d29e293eea5b4c9e..b730d670952957871f2eecf3f35989e673c70855 100644 (file)
@@ -1604,7 +1604,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data,
        memcpy(&policy->cpuinfo, &data->cpuinfo,
                                sizeof(struct cpufreq_cpuinfo));
 
-       if (policy->min > data->min && policy->min > policy->max) {
+       if (policy->min > data->max || policy->max < data->min) {
                ret = -EINVAL;
                goto error_out;
        }
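
The corrected test above accepts the requested policy only when its [min, max] interval overlaps the [min, max] range held in data, replacing a condition that did not express that requirement. A minimal sketch of the same overlap check, with an illustrative helper name:

/* Illustrative helper, not kernel code: non-zero when the two
 * frequency ranges share at least one value, which is what the
 * corrected condition requires before the policy is accepted. */
static int freq_ranges_overlap(unsigned int pmin, unsigned int pmax,
                               unsigned int dmin, unsigned int dmax)
{
        return !(pmin > dmax || pmax < dmin);
}
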
index 5e596a7e36013dc2d24082726a0eab5b792fc354..9008ed5ef4ce2e384b41c591cdc7c629a08af324 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/slab.h>
 #include <asm/dmi.h>
 
+static char dmi_empty_string[] = "        ";
+
 static char * __init dmi_string(const struct dmi_header *dm, u8 s)
 {
        const u8 *bp = ((u8 *) dm) + dm->length;
@@ -21,11 +23,16 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s)
                }
 
                if (*bp != 0) {
-                       str = dmi_alloc(strlen(bp) + 1);
+                       size_t len = strlen(bp)+1;
+                       size_t cmp_len = len > 8 ? 8 : len;
+
+                       if (!memcmp(bp, dmi_empty_string, cmp_len))
+                               return dmi_empty_string;
+                       str = dmi_alloc(len);
                        if (str != NULL)
                                strcpy(str, bp);
                        else
-                               printk(KERN_ERR "dmi_string: out of memory.\n");
+                               printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);
                }
        }
 
@@ -175,12 +182,23 @@ static void __init dmi_save_devices(const struct dmi_header *dm)
        }
 }
 
+static struct dmi_device empty_oem_string_dev = {
+       .name = dmi_empty_string,
+};
+
 static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
 {
        int i, count = *(u8 *)(dm + 1);
        struct dmi_device *dev;
 
        for (i = 1; i <= count; i++) {
+               char *devname = dmi_string(dm, i);
+
+               if (!strcmp(devname, dmi_empty_string)) {
+                       list_add(&empty_oem_string_dev.list, &dmi_devices);
+                       continue;
+               }
+
                dev = dmi_alloc(sizeof(*dev));
                if (!dev) {
                        printk(KERN_ERR
@@ -189,7 +207,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm)
                }
 
                dev->type = DMI_DEV_TYPE_OEM_STRING;
-               dev->name = dmi_string(dm, i);
+               dev->name = devname;
                dev->device_data = NULL;
 
                list_add(&dev->list, &dmi_devices);
@@ -331,9 +349,11 @@ void __init dmi_scan_machine(void)
                        rc = dmi_present(q);
                        if (!rc) {
                                dmi_available = 1;
+                               dmi_iounmap(p, 0x10000);
                                return;
                        }
                }
+               dmi_iounmap(p, 0x10000);
        }
  out:  printk(KERN_INFO "DMI not present or invalid.\n");
 }
index 489c133664d5a1d7c9db7bbdf40dc244704402c4..1f8153b57503310fc9f70563aea664fba289a922 100644 (file)
@@ -15,3 +15,4 @@ obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o
 obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o
 obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o
 
+obj-$(CONFIG_PROVIDE_OHCI1394_DMA_INIT) += init_ohci1394_dma.o
diff --git a/drivers/ieee1394/init_ohci1394_dma.c b/drivers/ieee1394/init_ohci1394_dma.c
new file mode 100644 (file)
index 0000000..ddaab6e
--- /dev/null
@@ -0,0 +1,285 @@
+/*
+ * init_ohci1394_dma.c - Initializes physical DMA on all OHCI 1394 controllers
+ *
+ * Copyright (C) 2006-2007      Bernhard Kaindl <bk@suse.de>
+ *
+ * Derived from drivers/ieee1394/ohci1394.c and arch/x86/kernel/early-quirks.c
+ * this file has functions to:
+ * - scan the PCI very early on boot for all OHCI 1394-compliant controllers
+ * - reset and initialize them and make them join the IEEE1394 bus and
+ * - enable physical DMA on them to allow remote debugging
+ *
+ * All code and data are marked as __init and __initdata, respectively,
+ * because later in boot all OHCI1394 controllers may be claimed by the
+ * firewire stack, at which point this code should not touch them anymore.
+ *
+ * To use physical DMA after the initialization of the firewire stack,
+ * be sure that the stack enables it and (re-)attach after the bus reset
+ * which may be caused by the firewire stack initialization.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/interrupt.h>   /* for ohci1394.h */
+#include <linux/delay.h>
+#include <linux/pci.h>         /* for PCI defines */
+#include <linux/init_ohci1394_dma.h>
+#include <asm/pci-direct.h>    /* for direct PCI config space access */
+#include <asm/fixmap.h>
+
+#include "ieee1394_types.h"
+#include "ohci1394.h"
+
+int __initdata init_ohci1394_dma_early;
+
+/* Reads a PHY register of an OHCI-1394 controller */
+static inline u8 __init get_phy_reg(struct ti_ohci *ohci, u8 addr)
+{
+       int i;
+       quadlet_t r;
+
+       reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | 0x00008000);
+
+       for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+               if (reg_read(ohci, OHCI1394_PhyControl) & 0x80000000)
+                       break;
+               mdelay(1);
+       }
+       r = reg_read(ohci, OHCI1394_PhyControl);
+
+       return (r & 0x00ff0000) >> 16;
+}
+
+/* Writes to a PHY register of an OHCI-1394 controller */
+static inline void __init set_phy_reg(struct ti_ohci *ohci, u8 addr, u8 data)
+{
+       int i;
+
+       reg_write(ohci, OHCI1394_PhyControl, (addr << 8) | data | 0x00004000);
+
+       for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+               u32 r = reg_read(ohci, OHCI1394_PhyControl);
+               if (!(r & 0x00004000))
+                       break;
+               mdelay(1);
+       }
+}
+
+/* Resets an OHCI-1394 controller (for sane state before initialization) */
+static inline void __init init_ohci1394_soft_reset(struct ti_ohci *ohci) {
+       int i;
+
+       reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_softReset);
+
+       for (i = 0; i < OHCI_LOOP_COUNT; i++) {
+               if (!(reg_read(ohci, OHCI1394_HCControlSet)
+                                  & OHCI1394_HCControl_softReset))
+                       break;
+               mdelay(1);
+       }
+}
+
+/* Basic OHCI-1394 register and port initialization */
+static inline void __init init_ohci1394_initialize(struct ti_ohci *ohci)
+{
+       quadlet_t bus_options;
+       int num_ports, i;
+
+       /* Put some defaults to these undefined bus options */
+       bus_options = reg_read(ohci, OHCI1394_BusOptions);
+       bus_options |=  0x60000000; /* Enable CMC and ISC */
+       bus_options &= ~0x00ff0000; /* XXX: Set cyc_clk_acc to zero for now */
+       bus_options &= ~0x18000000; /* Disable PMC and BMC */
+       reg_write(ohci, OHCI1394_BusOptions, bus_options);
+
+       /* Set the bus number */
+       reg_write(ohci, OHCI1394_NodeID, 0x0000ffc0);
+
+       /* Enable posted writes */
+       reg_write(ohci, OHCI1394_HCControlSet,
+                       OHCI1394_HCControl_postedWriteEnable);
+
+       /* Clear link control register */
+       reg_write(ohci, OHCI1394_LinkControlClear, 0xffffffff);
+
+       /* enable phys */
+       reg_write(ohci, OHCI1394_LinkControlSet,
+                       OHCI1394_LinkControl_RcvPhyPkt);
+
+       /* Don't accept phy packets into AR request context */
+       reg_write(ohci, OHCI1394_LinkControlClear, 0x00000400);
+
+       /* Clear the isochronous interrupt masks */
+       reg_write(ohci, OHCI1394_IsoRecvIntMaskClear, 0xffffffff);
+       reg_write(ohci, OHCI1394_IsoRecvIntEventClear, 0xffffffff);
+       reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, 0xffffffff);
+       reg_write(ohci, OHCI1394_IsoXmitIntEventClear, 0xffffffff);
+
+       /* Accept asynchronous transfer requests from all nodes for now */
+       reg_write(ohci, OHCI1394_AsReqFilterHiSet, 0x80000000);
+
+       /* Specify asynchronous transfer retries */
+       reg_write(ohci, OHCI1394_ATRetries,
+                 OHCI1394_MAX_AT_REQ_RETRIES |
+                 (OHCI1394_MAX_AT_RESP_RETRIES<<4) |
+                 (OHCI1394_MAX_PHYS_RESP_RETRIES<<8));
+
+       /* We don't want hardware swapping */
+       reg_write(ohci, OHCI1394_HCControlClear, OHCI1394_HCControl_noByteSwap);
+
+       /* Enable link */
+       reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_linkEnable);
+
+       /* If anything is connected to a port, make sure it is enabled */
+       num_ports = get_phy_reg(ohci, 2) & 0xf;
+       for (i = 0; i < num_ports; i++) {
+               unsigned int status;
+
+               set_phy_reg(ohci, 7, i);
+               status = get_phy_reg(ohci, 8);
+
+               if (status & 0x20)
+                       set_phy_reg(ohci, 8, status & ~1);
+       }
+}
+
+/**
+ * init_ohci1394_wait_for_busresets - wait until bus resets are completed
+ *
+ * OHCI1394 initialization itself and any device going on- or offline
+ * and any cable issue cause an IEEE1394 bus reset. The OHCI1394 spec
+ * specifies that physical DMA is disabled on each bus reset and it
+ * has to be enabled after each bus reset when needed. We resort
+ * to polling here because on early boot, we have no interrupts.
+ */
+static inline void __init init_ohci1394_wait_for_busresets(struct ti_ohci *ohci)
+{
+       int i, events;
+
+       for (i=0; i < 9; i++) {
+               mdelay(200);
+               events = reg_read(ohci, OHCI1394_IntEventSet);
+               if (events & OHCI1394_busReset)
+                       reg_write(ohci, OHCI1394_IntEventClear,
+                                       OHCI1394_busReset);
+       }
+}
+
+/**
+ * init_ohci1394_enable_physical_dma - Enable physical DMA for remote debugging
+ * This enables remote DMA access over IEEE1394 from every host for the low
+ * 4GB of address space. DMA accesses above 4GB are not available currently.
+ */
+static inline void __init init_ohci1394_enable_physical_dma(struct ti_ohci *hci)
+{
+       reg_write(hci, OHCI1394_PhyReqFilterHiSet, 0xffffffff);
+       reg_write(hci, OHCI1394_PhyReqFilterLoSet, 0xffffffff);
+       reg_write(hci, OHCI1394_PhyUpperBound, 0xffff0000);
+}
+
+/**
+ * init_ohci1394_reset_and_init_dma - init controller and enable DMA
+ * This initializes the given controller and enables physical DMA engine in it.
+ */
+static inline void __init init_ohci1394_reset_and_init_dma(struct ti_ohci *ohci)
+{
+       /* Start off with a soft reset, clears everything to a sane state. */
+       init_ohci1394_soft_reset(ohci);
+
+       /* Accessing some registers without LPS enabled may cause lock up */
+       reg_write(ohci, OHCI1394_HCControlSet, OHCI1394_HCControl_LPS);
+
+       /* Disable and clear interrupts */
+       reg_write(ohci, OHCI1394_IntEventClear, 0xffffffff);
+       reg_write(ohci, OHCI1394_IntMaskClear, 0xffffffff);
+
+       mdelay(50); /* Wait 50msec to make sure we have full link enabled */
+
+       init_ohci1394_initialize(ohci);
+       /*
+        * The initialization causes at least one IEEE1394 bus reset. Enabling
+        * physical DMA only works *after* *all* bus resets have calmed down:
+        */
+       init_ohci1394_wait_for_busresets(ohci);
+
+       /* We had to wait and do this now if we want to debug early problems */
+       init_ohci1394_enable_physical_dma(ohci);
+}
+
+/**
+ * init_ohci1394_controller - Map the registers of the controller and init DMA
+ * This maps the registers of the specified controller and initializes it
+ */
+static inline void __init init_ohci1394_controller(int num, int slot, int func)
+{
+       unsigned long ohci_base;
+       struct ti_ohci ohci;
+
+       printk(KERN_INFO "init_ohci1394_dma: initializing OHCI-1394"
+                        " at %02x:%02x.%x\n", num, slot, func);
+
+       ohci_base = read_pci_config(num, slot, func, PCI_BASE_ADDRESS_0+(0<<2))
+                                                  & PCI_BASE_ADDRESS_MEM_MASK;
+
+       set_fixmap_nocache(FIX_OHCI1394_BASE, ohci_base);
+
+       ohci.registers = (void *)fix_to_virt(FIX_OHCI1394_BASE);
+
+       init_ohci1394_reset_and_init_dma(&ohci);
+}
+
+/**
+ * init_ohci1394_dma_on_all_controllers - scan for OHCI1394 controllers and init DMA on them
+ * Scans the whole PCI space for OHCI1394 controllers and inits DMA on them
+ */
+void __init init_ohci1394_dma_on_all_controllers(void)
+{
+       int num, slot, func;
+
+       if (!early_pci_allowed())
+               return;
+
+       /* Poor man's PCI discovery, the only thing we can do at early boot */
+       for (num = 0; num < 32; num++) {
+               for (slot = 0; slot < 32; slot++) {
+                       for (func = 0; func < 8; func++) {
+                               u32 class = read_pci_config(num, slot, func,
+                                                       PCI_CLASS_REVISION);
+                               if (class == 0xffffffff)
+                                       continue; /* No device at this func */
+
+                               if (class>>8 != PCI_CLASS_SERIAL_FIREWIRE_OHCI)
+                                       continue; /* Not an OHCI-1394 device */
+
+                               init_ohci1394_controller(num, slot, func);
+                               break; /* Assume one controller per device */
+                       }
+               }
+       }
+       printk(KERN_INFO "init_ohci1394_dma: finished initializing OHCI DMA\n");
+}
+
+/**
+ * setup_ohci1394_dma - enables early OHCI1394 DMA initialization
+ */
+static int __init setup_ohci1394_dma(char *opt)
+{
+       if (!strcmp(opt, "early"))
+               init_ohci1394_dma_early = 1;
+       return 0;
+}
+
+/* passing ohci1394_dma=early on boot causes early OHCI1394 DMA initialization */
+early_param("ohci1394_dma", setup_ohci1394_dma);
index f2d2c7e2c76b9cd1b778da992a481b65e575bf7a..195ce7c123191896aa2946b0dc2499f13d23cbfb 100644 (file)
@@ -1571,7 +1571,6 @@ static struct scsi_host_template srp_template = {
        .this_id                        = -1,
        .cmd_per_lun                    = SRP_SQ_SIZE,
        .use_clustering                 = ENABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
        .shost_attrs                    = srp_host_attrs
 };
 
index 8991ab0b4fe3b3caed329b0a32aea07d29c6d303..61cff8374e6c0047f54779e358563745ee7b8226 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
+#include <linux/delay.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -62,8 +63,10 @@ static irqreturn_t pc110pad_interrupt(int irq, void *ptr)
        int value     = inb_p(pc110pad_io);
        int handshake = inb_p(pc110pad_io + 2);
 
-       outb_p(handshake |  1, pc110pad_io + 2);
-       outb_p(handshake & ~1, pc110pad_io + 2);
+       outb(handshake |  1, pc110pad_io + 2);
+       udelay(2);
+       outb(handshake & ~1, pc110pad_io + 2);
+       udelay(2);
        inb_p(0x64);
 
        pc110pad_data[pc110pad_count++] = value;
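
The hunk above replaces the paced outb_p() writes with plain outb() plus an explicit udelay(), so the settle time for the handshake toggle is stated rather than implied by outb_p()'s internal pause. A small wrapper showing the resulting sequence (the wrapper name is illustrative):

#include <linux/delay.h>
#include <asm/io.h>

static inline void pad_toggle_handshake(int io, int handshake)
{
        outb(handshake |  1, io + 2);   /* assert the handshake bit */
        udelay(2);                      /* explicit settle delay    */
        outb(handshake & ~1, io + 2);   /* deassert it again        */
        udelay(2);
}
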
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644 (file)
index 11fc014..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- * Authors:
- *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include "kvm.h"
-
-typedef void irq_request_func(void *opaque, int level);
-
-struct kvm_kpic_state {
-       u8 last_irr;    /* edge detection */
-       u8 irr;         /* interrupt request register */
-       u8 imr;         /* interrupt mask register */
-       u8 isr;         /* interrupt service register */
-       u8 priority_add;        /* highest irq priority */
-       u8 irq_base;
-       u8 read_reg_select;
-       u8 poll;
-       u8 special_mask;
-       u8 init_state;
-       u8 auto_eoi;
-       u8 rotate_on_auto_eoi;
-       u8 special_fully_nested_mode;
-       u8 init4;               /* true if 4 byte init */
-       u8 elcr;                /* PIIX edge/trigger selection */
-       u8 elcr_mask;
-       struct kvm_pic *pics_state;
-};
-
-struct kvm_pic {
-       struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-       irq_request_func *irq_request;
-       void *irq_request_opaque;
-       int output;             /* intr from master PIC */
-       struct kvm_io_device dev;
-};
-
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-void kvm_pic_update_irq(struct kvm_pic *s);
-
-#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
-#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
-#define IOAPIC_EDGE_TRIG  0
-#define IOAPIC_LEVEL_TRIG 1
-
-#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
-#define IOAPIC_MEM_LENGTH            0x100
-
-/* Direct registers. */
-#define IOAPIC_REG_SELECT  0x00
-#define IOAPIC_REG_WINDOW  0x10
-#define IOAPIC_REG_EOI     0x40        /* IA64 IOSAPIC only */
-
-/* Indirect registers. */
-#define IOAPIC_REG_APIC_ID 0x00        /* x86 IOAPIC only */
-#define IOAPIC_REG_VERSION 0x01
-#define IOAPIC_REG_ARB_ID  0x02        /* x86 IOAPIC only */
-
-struct kvm_ioapic {
-       u64 base_address;
-       u32 ioregsel;
-       u32 id;
-       u32 irr;
-       u32 pad;
-       union ioapic_redir_entry {
-               u64 bits;
-               struct {
-                       u8 vector;
-                       u8 delivery_mode:3;
-                       u8 dest_mode:1;
-                       u8 delivery_status:1;
-                       u8 polarity:1;
-                       u8 remote_irr:1;
-                       u8 trig_mode:1;
-                       u8 mask:1;
-                       u8 reserve:7;
-                       u8 reserved[4];
-                       u8 dest_id;
-               } fields;
-       } redirtbl[IOAPIC_NUM_PINS];
-       struct kvm_io_device dev;
-       struct kvm *kvm;
-};
-
-struct kvm_lapic {
-       unsigned long base_address;
-       struct kvm_io_device dev;
-       struct {
-               atomic_t pending;
-               s64 period;     /* unit: ns */
-               u32 divide_count;
-               ktime_t last_update;
-               struct hrtimer dev;
-       } timer;
-       struct kvm_vcpu *vcpu;
-       struct page *regs_page;
-       void *regs;
-};
-
-#ifdef DEBUG
-#define ASSERT(x)                                                      \
-do {                                                                   \
-       if (!(x)) {                                                     \
-               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
-                      __FILE__, __LINE__, #x);                         \
-               BUG();                                                  \
-       }                                                               \
-} while (0)
-#else
-#define ASSERT(x) do { } while (0)
-#endif
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
-int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
-void kvm_free_apic(struct kvm_lapic *apic);
-u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-                                      unsigned long bitmap);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
-void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
-
-#endif
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644 (file)
index feb5ac9..0000000
+++ /dev/null
@@ -1,1498 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "vmx.h"
-#include "kvm.h"
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 1;
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x)                                                      \
-       if (!(x)) {                                                     \
-               printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
-                      __FILE__, __LINE__, #x);                         \
-       }
-#endif
-
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
-
-#define PT_WRITABLE_SHIFT 1
-
-#define PT_PRESENT_MASK (1ULL << 0)
-#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
-#define PT_PWT_MASK (1ULL << 3)
-#define PT_PCD_MASK (1ULL << 4)
-#define PT_ACCESSED_MASK (1ULL << 5)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
-#define PT_PAT_MASK (1ULL << 7)
-#define PT_GLOBAL_MASK (1ULL << 8)
-#define PT64_NX_MASK (1ULL << 63)
-
-#define PT_PAT_SHIFT 7
-#define PT_DIR_PAT_SHIFT 12
-#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
-               ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
-
-#define PT64_LEVEL_MASK(level) \
-               (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
-       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
-               ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
-
-#define PT32_LEVEL_MASK(level) \
-               (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-
-#define PT32_INDEX(address, level)\
-       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
-       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
-       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-
-
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_FETCH_MASK (1U << 4)
-
-#define PT64_ROOT_LEVEL 4
-#define PT32_ROOT_LEVEL 2
-#define PT32E_ROOT_LEVEL 3
-
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
-
-#define RMAP_EXT 4
-
-struct kvm_rmap_desc {
-       u64 *shadow_ptes[RMAP_EXT];
-       struct kvm_rmap_desc *more;
-};
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static int is_write_protection(struct kvm_vcpu *vcpu)
-{
-       return vcpu->cr0 & X86_CR0_WP;
-}
-
-static int is_cpuid_PSE36(void)
-{
-       return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
-       return vcpu->shadow_efer & EFER_NX;
-}
-
-static int is_present_pte(unsigned long pte)
-{
-       return pte & PT_PRESENT_MASK;
-}
-
-static int is_writeble_pte(unsigned long pte)
-{
-       return pte & PT_WRITABLE_MASK;
-}
-
-static int is_io_pte(unsigned long pte)
-{
-       return pte & PT_SHADOW_IO_MARK;
-}
-
-static int is_rmap_pte(u64 pte)
-{
-       return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
-               == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
-}
-
-static void set_shadow_pte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
-       set_64bit((unsigned long *)sptep, spte);
-#else
-       set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-                                 struct kmem_cache *base_cache, int min)
-{
-       void *obj;
-
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
-               if (!obj)
-                       return -ENOMEM;
-               cache->objects[cache->nobjs++] = obj;
-       }
-       return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-       while (mc->nobjs)
-               kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
-                                      int min)
-{
-       struct page *page;
-
-       if (cache->nobjs >= min)
-               return 0;
-       while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-               page = alloc_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               set_page_private(page, 0);
-               cache->objects[cache->nobjs++] = page_address(page);
-       }
-       return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
-       while (mc->nobjs)
-               free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
-       int r;
-
-       kvm_mmu_free_some_pages(vcpu);
-       r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
-                                  pte_chain_cache, 4);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
-                                  rmap_desc_cache, 1);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
-       if (r)
-               goto out;
-       r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
-                                  mmu_page_header_cache, 4);
-out:
-       return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-       mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
-       mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
-       mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
-       mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-                                   size_t size)
-{
-       void *p;
-
-       BUG_ON(!mc->nobjs);
-       p = mc->objects[--mc->nobjs];
-       memset(p, 0, size);
-       return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
-                                     sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
-       kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
-       return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
-                                     sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-       kfree(rd);
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If page->private bit zero is zero, then page->private points to the
- * shadow page table entry that points to page_address(page).
- *
- * If page->private bit zero is one, (then page->private & ~1) points
- * to a struct kvm_rmap_desc containing more mappings.
- */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
-{
-       struct page *page;
-       struct kvm_rmap_desc *desc;
-       int i;
-
-       if (!is_rmap_pte(*spte))
-               return;
-       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-       if (!page_private(page)) {
-               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-               set_page_private(page,(unsigned long)spte);
-       } else if (!(page_private(page) & 1)) {
-               rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-               desc = mmu_alloc_rmap_desc(vcpu);
-               desc->shadow_ptes[0] = (u64 *)page_private(page);
-               desc->shadow_ptes[1] = spte;
-               set_page_private(page,(unsigned long)desc | 1);
-       } else {
-               rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
-                       desc = desc->more;
-               if (desc->shadow_ptes[RMAP_EXT-1]) {
-                       desc->more = mmu_alloc_rmap_desc(vcpu);
-                       desc = desc->more;
-               }
-               for (i = 0; desc->shadow_ptes[i]; ++i)
-                       ;
-               desc->shadow_ptes[i] = spte;
-       }
-}
-
-static void rmap_desc_remove_entry(struct page *page,
-                                  struct kvm_rmap_desc *desc,
-                                  int i,
-                                  struct kvm_rmap_desc *prev_desc)
-{
-       int j;
-
-       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
-               ;
-       desc->shadow_ptes[i] = desc->shadow_ptes[j];
-       desc->shadow_ptes[j] = NULL;
-       if (j != 0)
-               return;
-       if (!prev_desc && !desc->more)
-               set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
-       else
-               if (prev_desc)
-                       prev_desc->more = desc->more;
-               else
-                       set_page_private(page,(unsigned long)desc->more | 1);
-       mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(u64 *spte)
-{
-       struct page *page;
-       struct kvm_rmap_desc *desc;
-       struct kvm_rmap_desc *prev_desc;
-       int i;
-
-       if (!is_rmap_pte(*spte))
-               return;
-       page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
-       if (!page_private(page)) {
-               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
-               BUG();
-       } else if (!(page_private(page) & 1)) {
-               rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
-               if ((u64 *)page_private(page) != spte) {
-                       printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
-                              spte, *spte);
-                       BUG();
-               }
-               set_page_private(page,0);
-       } else {
-               rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
-               desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-               prev_desc = NULL;
-               while (desc) {
-                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
-                               if (desc->shadow_ptes[i] == spte) {
-                                       rmap_desc_remove_entry(page,
-                                                              desc, i,
-                                                              prev_desc);
-                                       return;
-                               }
-                       prev_desc = desc;
-                       desc = desc->more;
-               }
-               BUG();
-       }
-}
-
-static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
-{
-       struct kvm *kvm = vcpu->kvm;
-       struct page *page;
-       struct kvm_rmap_desc *desc;
-       u64 *spte;
-
-       page = gfn_to_page(kvm, gfn);
-       BUG_ON(!page);
-
-       while (page_private(page)) {
-               if (!(page_private(page) & 1))
-                       spte = (u64 *)page_private(page);
-               else {
-                       desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
-                       spte = desc->shadow_ptes[0];
-               }
-               BUG_ON(!spte);
-               BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
-                      != page_to_pfn(page));
-               BUG_ON(!(*spte & PT_PRESENT_MASK));
-               BUG_ON(!(*spte & PT_WRITABLE_MASK));
-               rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
-               rmap_remove(spte);
-               set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
-               kvm_flush_remote_tlbs(vcpu->kvm);
-       }
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
-       u64 *pos;
-       u64 *end;
-
-       for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-               if (*pos != 0) {
-                       printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
-                              pos, *pos);
-                       return 0;
-               }
-       return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm,
-                             struct kvm_mmu_page *page_head)
-{
-       ASSERT(is_empty_shadow_page(page_head->spt));
-       list_del(&page_head->link);
-       __free_page(virt_to_page(page_head->spt));
-       kfree(page_head);
-       ++kvm->n_free_mmu_pages;
-}
-
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
-       return gfn;
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-                                              u64 *parent_pte)
-{
-       struct kvm_mmu_page *page;
-
-       if (!vcpu->kvm->n_free_mmu_pages)
-               return NULL;
-
-       page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
-                                     sizeof *page);
-       page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
-       set_page_private(virt_to_page(page->spt), (unsigned long)page);
-       list_add(&page->link, &vcpu->kvm->active_mmu_pages);
-       ASSERT(is_empty_shadow_page(page->spt));
-       page->slot_bitmap = 0;
-       page->multimapped = 0;
-       page->parent_pte = parent_pte;
-       --vcpu->kvm->n_free_mmu_pages;
-       return page;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
-                                   struct kvm_mmu_page *page, u64 *parent_pte)
-{
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!parent_pte)
-               return;
-       if (!page->multimapped) {
-               u64 *old = page->parent_pte;
-
-               if (!old) {
-                       page->parent_pte = parent_pte;
-                       return;
-               }
-               page->multimapped = 1;
-               pte_chain = mmu_alloc_pte_chain(vcpu);
-               INIT_HLIST_HEAD(&page->parent_ptes);
-               hlist_add_head(&pte_chain->link, &page->parent_ptes);
-               pte_chain->parent_ptes[0] = old;
-       }
-       hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
-               if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-                       continue;
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-                       if (!pte_chain->parent_ptes[i]) {
-                               pte_chain->parent_ptes[i] = parent_pte;
-                               return;
-                       }
-       }
-       pte_chain = mmu_alloc_pte_chain(vcpu);
-       BUG_ON(!pte_chain);
-       hlist_add_head(&pte_chain->link, &page->parent_ptes);
-       pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
-                                      u64 *parent_pte)
-{
-       struct kvm_pte_chain *pte_chain;
-       struct hlist_node *node;
-       int i;
-
-       if (!page->multimapped) {
-               BUG_ON(page->parent_pte != parent_pte);
-               page->parent_pte = NULL;
-               return;
-       }
-       hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
-               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-                       if (!pte_chain->parent_ptes[i])
-                               break;
-                       if (pte_chain->parent_ptes[i] != parent_pte)
-                               continue;
-                       while (i + 1 < NR_PTE_CHAIN_ENTRIES
-                               && pte_chain->parent_ptes[i + 1]) {
-                               pte_chain->parent_ptes[i]
-                                       = pte_chain->parent_ptes[i + 1];
-                               ++i;
-                       }
-                       pte_chain->parent_ptes[i] = NULL;
-                       if (i == 0) {
-                               hlist_del(&pte_chain->link);
-                               mmu_free_pte_chain(pte_chain);
-                               if (hlist_empty(&page->parent_ptes)) {
-                                       page->multimapped = 0;
-                                       page->parent_pte = NULL;
-                               }
-                       }
-                       return;
-               }
-       BUG();
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
-                                               gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *page;
-       struct hlist_node *node;
-
-       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->mmu_page_hash[index];
-       hlist_for_each_entry(page, node, bucket, hash_link)
-               if (page->gfn == gfn && !page->role.metaphysical) {
-                       pgprintk("%s: found role %x\n",
-                                __FUNCTION__, page->role.word);
-                       return page;
-               }
-       return NULL;
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
-                                            gfn_t gfn,
-                                            gva_t gaddr,
-                                            unsigned level,
-                                            int metaphysical,
-                                            unsigned hugepage_access,
-                                            u64 *parent_pte)
-{
-       union kvm_mmu_page_role role;
-       unsigned index;
-       unsigned quadrant;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *page;
-       struct hlist_node *node;
-
-       role.word = 0;
-       role.glevels = vcpu->mmu.root_level;
-       role.level = level;
-       role.metaphysical = metaphysical;
-       role.hugepage_access = hugepage_access;
-       if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
-               quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-               quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
-               role.quadrant = quadrant;
-       }
-       pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
-                gfn, role.word);
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->mmu_page_hash[index];
-       hlist_for_each_entry(page, node, bucket, hash_link)
-               if (page->gfn == gfn && page->role.word == role.word) {
-                       mmu_page_add_parent_pte(vcpu, page, parent_pte);
-                       pgprintk("%s: found\n", __FUNCTION__);
-                       return page;
-               }
-       page = kvm_mmu_alloc_page(vcpu, parent_pte);
-       if (!page)
-               return page;
-       pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
-       page->gfn = gfn;
-       page->role = role;
-       hlist_add_head(&page->hash_link, bucket);
-       if (!metaphysical)
-               rmap_write_protect(vcpu, gfn);
-       return page;
-}
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
-                                        struct kvm_mmu_page *page)
-{
-       unsigned i;
-       u64 *pt;
-       u64 ent;
-
-       pt = page->spt;
-
-       if (page->role.level == PT_PAGE_TABLE_LEVEL) {
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       if (pt[i] & PT_PRESENT_MASK)
-                               rmap_remove(&pt[i]);
-                       pt[i] = 0;
-               }
-               kvm_flush_remote_tlbs(kvm);
-               return;
-       }
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-               ent = pt[i];
-
-               pt[i] = 0;
-               if (!(ent & PT_PRESENT_MASK))
-                       continue;
-               ent &= PT64_BASE_ADDR_MASK;
-               mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
-       }
-       kvm_flush_remote_tlbs(kvm);
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *page,
-                            u64 *parent_pte)
-{
-       mmu_page_remove_parent_pte(page, parent_pte);
-}
-
-static void kvm_mmu_zap_page(struct kvm *kvm,
-                            struct kvm_mmu_page *page)
-{
-       u64 *parent_pte;
-
-       while (page->multimapped || page->parent_pte) {
-               if (!page->multimapped)
-                       parent_pte = page->parent_pte;
-               else {
-                       struct kvm_pte_chain *chain;
-
-                       chain = container_of(page->parent_ptes.first,
-                                            struct kvm_pte_chain, link);
-                       parent_pte = chain->parent_ptes[0];
-               }
-               BUG_ON(!parent_pte);
-               kvm_mmu_put_page(page, parent_pte);
-               set_shadow_pte(parent_pte, 0);
-       }
-       kvm_mmu_page_unlink_children(kvm, page);
-       if (!page->root_count) {
-               hlist_del(&page->hash_link);
-               kvm_mmu_free_page(kvm, page);
-       } else
-               list_move(&page->link, &kvm->active_mmu_pages);
-}
-
-static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-       unsigned index;
-       struct hlist_head *bucket;
-       struct kvm_mmu_page *page;
-       struct hlist_node *node, *n;
-       int r;
-
-       pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
-       r = 0;
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->mmu_page_hash[index];
-       hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
-               if (page->gfn == gfn && !page->role.metaphysical) {
-                       pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
-                                page->role.word);
-                       kvm_mmu_zap_page(vcpu->kvm, page);
-                       r = 1;
-               }
-       return r;
-}
-
-static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-       struct kvm_mmu_page *page;
-
-       while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
-               pgprintk("%s: zap %lx %x\n",
-                        __FUNCTION__, gfn, page->role.word);
-               kvm_mmu_zap_page(vcpu->kvm, page);
-       }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
-{
-       int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
-       struct kvm_mmu_page *page_head = page_header(__pa(pte));
-
-       __set_bit(slot, &page_head->slot_bitmap);
-}
-
-hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-       hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
-       return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
-}
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-       struct page *page;
-
-       ASSERT((gpa & HPA_ERR_MASK) == 0);
-       page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       if (!page)
-               return gpa | HPA_ERR_MASK;
-       return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
-               | (gpa & (PAGE_SIZE-1));
-}
-
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-       if (gpa == UNMAPPED_GVA)
-               return UNMAPPED_GVA;
-       return gpa_to_hpa(vcpu, gpa);
-}
-
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-       if (gpa == UNMAPPED_GVA)
-               return NULL;
-       return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
-{
-       int level = PT32E_ROOT_LEVEL;
-       hpa_t table_addr = vcpu->mmu.root_hpa;
-
-       for (; ; level--) {
-               u32 index = PT64_INDEX(v, level);
-               u64 *table;
-               u64 pte;
-
-               ASSERT(VALID_PAGE(table_addr));
-               table = __va(table_addr);
-
-               if (level == 1) {
-                       pte = table[index];
-                       if (is_present_pte(pte) && is_writeble_pte(pte))
-                               return 0;
-                       mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
-                       page_header_update_slot(vcpu->kvm, table, v);
-                       table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-                                                               PT_USER_MASK;
-                       rmap_add(vcpu, &table[index]);
-                       return 0;
-               }
-
-               if (table[index] == 0) {
-                       struct kvm_mmu_page *new_table;
-                       gfn_t pseudo_gfn;
-
-                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-                                                    v, level - 1,
-                                                    1, 0, &table[index]);
-                       if (!new_table) {
-                               pgprintk("nonpaging_map: ENOMEM\n");
-                               return -ENOMEM;
-                       }
-
-                       table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
-                               | PT_WRITABLE_MASK | PT_USER_MASK;
-               }
-               table_addr = table[index] & PT64_BASE_ADDR_MASK;
-       }
-}
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
-       int i;
-       struct kvm_mmu_page *page;
-
-       if (!VALID_PAGE(vcpu->mmu.root_hpa))
-               return;
-#ifdef CONFIG_X86_64
-       if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->mmu.root_hpa;
-
-               page = page_header(root);
-               --page->root_count;
-               vcpu->mmu.root_hpa = INVALID_PAGE;
-               return;
-       }
-#endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->mmu.pae_root[i];
-
-               if (root) {
-                       root &= PT64_BASE_ADDR_MASK;
-                       page = page_header(root);
-                       --page->root_count;
-               }
-               vcpu->mmu.pae_root[i] = INVALID_PAGE;
-       }
-       vcpu->mmu.root_hpa = INVALID_PAGE;
-}
-
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
-       int i;
-       gfn_t root_gfn;
-       struct kvm_mmu_page *page;
-
-       root_gfn = vcpu->cr3 >> PAGE_SHIFT;
-
-#ifdef CONFIG_X86_64
-       if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-               hpa_t root = vcpu->mmu.root_hpa;
-
-               ASSERT(!VALID_PAGE(root));
-               page = kvm_mmu_get_page(vcpu, root_gfn, 0,
-                                       PT64_ROOT_LEVEL, 0, 0, NULL);
-               root = __pa(page->spt);
-               ++page->root_count;
-               vcpu->mmu.root_hpa = root;
-               return;
-       }
-#endif
-       for (i = 0; i < 4; ++i) {
-               hpa_t root = vcpu->mmu.pae_root[i];
-
-               ASSERT(!VALID_PAGE(root));
-               if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
-                       if (!is_present_pte(vcpu->pdptrs[i])) {
-                               vcpu->mmu.pae_root[i] = 0;
-                               continue;
-                       }
-                       root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
-               } else if (vcpu->mmu.root_level == 0)
-                       root_gfn = 0;
-               page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
-                                       PT32_ROOT_LEVEL, !is_paging(vcpu),
-                                       0, NULL);
-               root = __pa(page->spt);
-               ++page->root_count;
-               vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
-       }
-       vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-       return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                              u32 error_code)
-{
-       gpa_t addr = gva;
-       hpa_t paddr;
-       int r;
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       ASSERT(vcpu);
-       ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
-
-
-       paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
-
-       if (is_error_hpa(paddr))
-               return 1;
-
-       return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu *context = &vcpu->mmu;
-
-       context->new_cr3 = nonpaging_new_cr3;
-       context->page_fault = nonpaging_page_fault;
-       context->gva_to_gpa = nonpaging_gva_to_gpa;
-       context->free = nonpaging_free;
-       context->root_level = 0;
-       context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-       ++vcpu->stat.tlb_flush;
-       kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
-       pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
-       mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-                             u64 addr,
-                             u32 err_code)
-{
-       kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
-       nonpaging_free(vcpu);
-}
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
-       struct kvm_mmu *context = &vcpu->mmu;
-
-       ASSERT(is_pae(vcpu));
-       context->new_cr3 = paging_new_cr3;
-       context->page_fault = paging64_page_fault;
-       context->gva_to_gpa = paging64_gva_to_gpa;
-       context->free = paging_free;
-       context->root_level = level;
-       context->shadow_root_level = level;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
-       return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu *context = &vcpu->mmu;
-
-       context->new_cr3 = paging_new_cr3;
-       context->page_fault = paging32_page_fault;
-       context->gva_to_gpa = paging32_gva_to_gpa;
-       context->free = paging_free;
-       context->root_level = PT32_ROOT_LEVEL;
-       context->shadow_root_level = PT32E_ROOT_LEVEL;
-       context->root_hpa = INVALID_PAGE;
-       return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
-       return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-       if (!is_paging(vcpu))
-               return nonpaging_init_context(vcpu);
-       else if (is_long_mode(vcpu))
-               return paging64_init_context(vcpu);
-       else if (is_pae(vcpu))
-               return paging32E_init_context(vcpu);
-       else
-               return paging32_init_context(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       if (VALID_PAGE(vcpu->mmu.root_hpa)) {
-               vcpu->mmu.free(vcpu);
-               vcpu->mmu.root_hpa = INVALID_PAGE;
-       }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
-       destroy_kvm_mmu(vcpu);
-       return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
-       int r;
-
-       mutex_lock(&vcpu->kvm->lock);
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               goto out;
-       mmu_alloc_roots(vcpu);
-       kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
-       kvm_mmu_flush_tlb(vcpu);
-out:
-       mutex_unlock(&vcpu->kvm->lock);
-       return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
-       mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *page,
-                                 u64 *spte)
-{
-       u64 pte;
-       struct kvm_mmu_page *child;
-
-       pte = *spte;
-       if (is_present_pte(pte)) {
-               if (page->role.level == PT_PAGE_TABLE_LEVEL)
-                       rmap_remove(spte);
-               else {
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, spte);
-               }
-       }
-       set_shadow_pte(spte, 0);
-       kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-                                 struct kvm_mmu_page *page,
-                                 u64 *spte,
-                                 const void *new, int bytes)
-{
-       if (page->role.level != PT_PAGE_TABLE_LEVEL)
-               return;
-
-       if (page->role.glevels == PT32_ROOT_LEVEL)
-               paging32_update_pte(vcpu, page, spte, new, bytes);
-       else
-               paging64_update_pte(vcpu, page, spte, new, bytes);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
-{
-       gfn_t gfn = gpa >> PAGE_SHIFT;
-       struct kvm_mmu_page *page;
-       struct hlist_node *node, *n;
-       struct hlist_head *bucket;
-       unsigned index;
-       u64 *spte;
-       unsigned offset = offset_in_page(gpa);
-       unsigned pte_size;
-       unsigned page_offset;
-       unsigned misaligned;
-       unsigned quadrant;
-       int level;
-       int flooded = 0;
-       int npte;
-
-       pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
-       if (gfn == vcpu->last_pt_write_gfn) {
-               ++vcpu->last_pt_write_count;
-               if (vcpu->last_pt_write_count >= 3)
-                       flooded = 1;
-       } else {
-               vcpu->last_pt_write_gfn = gfn;
-               vcpu->last_pt_write_count = 1;
-       }
-       index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
-       bucket = &vcpu->kvm->mmu_page_hash[index];
-       hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
-               if (page->gfn != gfn || page->role.metaphysical)
-                       continue;
-               pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
-               misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-               misaligned |= bytes < 4;
-               if (misaligned || flooded) {
-                       /*
-                        * Misaligned accesses are too much trouble to fix
-                        * up; also, they usually indicate a page is not used
-                        * as a page table.
-                        *
-                        * If we're seeing too many writes to a page,
-                        * it may no longer be a page table, or we may be
-                        * forking, in which case it is better to unmap the
-                        * page.
-                        */
-                       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-                                gpa, bytes, page->role.word);
-                       kvm_mmu_zap_page(vcpu->kvm, page);
-                       continue;
-               }
-               page_offset = offset;
-               level = page->role.level;
-               npte = 1;
-               if (page->role.glevels == PT32_ROOT_LEVEL) {
-                       page_offset <<= 1;      /* 32->64 */
-                       /*
-                        * A 32-bit pde maps 4MB while the shadow pdes map
-                        * only 2MB.  So we need to double the offset again
-                        * and zap two pdes instead of one.
-                        */
-                       if (level == PT32_ROOT_LEVEL) {
-                               page_offset &= ~7; /* kill rounding error */
-                               page_offset <<= 1;
-                               npte = 2;
-                       }
-                       quadrant = page_offset >> PAGE_SHIFT;
-                       page_offset &= ~PAGE_MASK;
-                       if (quadrant != page->role.quadrant)
-                               continue;
-               }
-               spte = &page->spt[page_offset / sizeof(*spte)];
-               while (npte--) {
-                       mmu_pte_write_zap_pte(vcpu, page, spte);
-                       mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
-                       ++spte;
-               }
-       }
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
-
-       return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
-}
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-       while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
-               struct kvm_mmu_page *page;
-
-               page = container_of(vcpu->kvm->active_mmu_pages.prev,
-                                   struct kvm_mmu_page, link);
-               kvm_mmu_zap_page(vcpu->kvm, page);
-       }
-}
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *page;
-
-       while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
-               page = container_of(vcpu->kvm->active_mmu_pages.next,
-                                   struct kvm_mmu_page, link);
-               kvm_mmu_zap_page(vcpu->kvm, page);
-       }
-       free_page((unsigned long)vcpu->mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
-       struct page *page;
-       int i;
-
-       ASSERT(vcpu);
-
-       vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
-
-       /*
-        * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-        * Therefore we need to allocate shadow page tables in the first
-        * 4GB of memory, which happens to fit the DMA32 zone.
-        */
-       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-       if (!page)
-               goto error_1;
-       vcpu->mmu.pae_root = page_address(page);
-       for (i = 0; i < 4; ++i)
-               vcpu->mmu.pae_root[i] = INVALID_PAGE;
-
-       return 0;
-
-error_1:
-       free_mmu_pages(vcpu);
-       return -ENOMEM;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-       return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-       ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
-
-       return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
-       ASSERT(vcpu);
-
-       destroy_kvm_mmu(vcpu);
-       free_mmu_pages(vcpu);
-       mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
-       struct kvm_mmu_page *page;
-
-       list_for_each_entry(page, &kvm->active_mmu_pages, link) {
-               int i;
-               u64 *pt;
-
-               if (!test_bit(slot, &page->slot_bitmap))
-                       continue;
-
-               pt = page->spt;
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-                       /* avoid RMW */
-                       if (pt[i] & PT_WRITABLE_MASK) {
-                               rmap_remove(&pt[i]);
-                               pt[i] &= ~PT_WRITABLE_MASK;
-                       }
-       }
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-       struct kvm_mmu_page *page, *node;
-
-       list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
-               kvm_mmu_zap_page(kvm, page);
-
-       kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_module_exit(void)
-{
-       if (pte_chain_cache)
-               kmem_cache_destroy(pte_chain_cache);
-       if (rmap_desc_cache)
-               kmem_cache_destroy(rmap_desc_cache);
-       if (mmu_page_header_cache)
-               kmem_cache_destroy(mmu_page_header_cache);
-}
-
-int kvm_mmu_module_init(void)
-{
-       pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-                                           sizeof(struct kvm_pte_chain),
-                                           0, 0, NULL);
-       if (!pte_chain_cache)
-               goto nomem;
-       rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-                                           sizeof(struct kvm_rmap_desc),
-                                           0, 0, NULL);
-       if (!rmap_desc_cache)
-               goto nomem;
-
-       mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
-                                                 sizeof(struct kvm_mmu_page),
-                                                 0, 0, NULL);
-       if (!mmu_page_header_cache)
-               goto nomem;
-
-       return 0;
-
-nomem:
-       kvm_mmu_module_exit();
-       return -ENOMEM;
-}
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
-       gva = (long long)(gva << 16) >> 16;
-#endif
-       return gva;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
-                               gva_t va, int level)
-{
-       u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
-       int i;
-       gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
-       for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
-               u64 ent = pt[i];
-
-               if (!(ent & PT_PRESENT_MASK))
-                       continue;
-
-               va = canonicalize(va);
-               if (level > 1)
-                       audit_mappings_page(vcpu, ent, va, level - 1);
-               else {
-                       gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
-                       hpa_t hpa = gpa_to_hpa(vcpu, gpa);
-
-                       if ((ent & PT_PRESENT_MASK)
-                           && (ent & PT64_BASE_ADDR_MASK) != hpa)
-                               printk(KERN_ERR "audit error: (%s) levels %d"
-                                      " gva %lx gpa %llx hpa %llx ent %llx\n",
-                                      audit_msg, vcpu->mmu.root_level,
-                                      va, gpa, hpa, ent);
-               }
-       }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
-       unsigned i;
-
-       if (vcpu->mmu.root_level == 4)
-               audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
-       else
-               for (i = 0; i < 4; ++i)
-                       if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
-                               audit_mappings_page(vcpu,
-                                                   vcpu->mmu.pae_root[i],
-                                                   i << 30,
-                                                   2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
-       int nmaps = 0;
-       int i, j, k;
-
-       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-               struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
-               struct kvm_rmap_desc *d;
-
-               for (j = 0; j < m->npages; ++j) {
-                       struct page *page = m->phys_mem[j];
-
-                       if (!page->private)
-                               continue;
-                       if (!(page->private & 1)) {
-                               ++nmaps;
-                               continue;
-                       }
-                       d = (struct kvm_rmap_desc *)(page->private & ~1ul);
-                       while (d) {
-                               for (k = 0; k < RMAP_EXT; ++k)
-                                       if (d->shadow_ptes[k])
-                                               ++nmaps;
-                                       else
-                                               break;
-                               d = d->more;
-                       }
-               }
-       }
-       return nmaps;
-}
-
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
-{
-       int nmaps = 0;
-       struct kvm_mmu_page *page;
-       int i;
-
-       list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-               u64 *pt = page->spt;
-
-               if (page->role.level != PT_PAGE_TABLE_LEVEL)
-                       continue;
-
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       u64 ent = pt[i];
-
-                       if (!(ent & PT_PRESENT_MASK))
-                               continue;
-                       if (!(ent & PT_WRITABLE_MASK))
-                               continue;
-                       ++nmaps;
-               }
-       }
-       return nmaps;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
-       int n_rmap = count_rmaps(vcpu);
-       int n_actual = count_writable_mappings(vcpu);
-
-       if (n_rmap != n_actual)
-               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-                      __FUNCTION__, audit_msg, n_rmap, n_actual);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
-       struct kvm_mmu_page *page;
-
-       list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
-               hfn_t hfn;
-               struct page *pg;
-
-               if (page->role.metaphysical)
-                       continue;
-
-               hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
-                       >> PAGE_SHIFT;
-               pg = pfn_to_page(hfn);
-               if (pg->private)
-                       printk(KERN_ERR "%s: (%s) shadow page has writable"
-                              " mappings: gfn %lx role %x\n",
-                              __FUNCTION__, audit_msg, page->gfn,
-                              page->role.word);
-       }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
-       int olddbg = dbg;
-
-       dbg = 0;
-       audit_msg = msg;
-       audit_rmap(vcpu);
-       audit_write_protection(vcpu);
-       audit_mappings(vcpu);
-       dbg = olddbg;
-}
-
-#endif
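The kvm_mmu_pte_write() path removed above has to guess whether a guest write is updating a live page table (in which case the matching shadow PTE is patched) or merely reusing the page as ordinary memory (in which case the shadow page is zapped). The alignment test it uses is compact enough to be easy to misread, so a small stand-alone sketch of just that test follows; it is not part of the diff, and the helper name misaligned_pte_write is invented here for illustration. A write counts as misaligned when it does not fall entirely inside one guest PTE of pte_size bytes, or when it is narrower than four bytes.

#include <stdio.h>

/*
 * Same test as in kvm_mmu_pte_write(): the guest write covers bytes
 * [offset, offset + bytes) within the page; it is "misaligned" if it
 * straddles a pte boundary or is narrower than four bytes.
 */
static unsigned int misaligned_pte_write(unsigned int offset,
					 unsigned int bytes,
					 unsigned int pte_size)
{
	unsigned int misaligned;

	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	misaligned |= bytes < 4;
	return misaligned;
}

int main(void)
{
	/* 64-bit ptes (pte_size == 8), as for a PAE or long-mode guest. */
	printf("%u\n", misaligned_pte_write(8, 8, 8));	/* 0: exactly one pte  */
	printf("%u\n", misaligned_pte_write(4, 8, 8));	/* !=0: spans two ptes */
	printf("%u\n", misaligned_pte_write(16, 2, 8));	/* !=0: partial write  */
	return 0;
}

Writes that fail this test, or pages that have taken three or more consecutive writes (the "flooded" case above), are assumed to no longer be page tables, so the shadow page is zapped rather than patched.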
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644 (file)
index 6b094b4..0000000
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
-       #define pt_element_t u64
-       #define guest_walker guest_walker64
-       #define FNAME(name) paging##64_##name
-       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
-       #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
-       #ifdef CONFIG_X86_64
-       #define PT_MAX_FULL_LEVELS 4
-       #else
-       #define PT_MAX_FULL_LEVELS 2
-       #endif
-#elif PTTYPE == 32
-       #define pt_element_t u32
-       #define guest_walker guest_walker32
-       #define FNAME(name) paging##32_##name
-       #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
-       #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
-       #define PT_MAX_FULL_LEVELS 2
-#else
-       #error Invalid PTTYPE value
-#endif
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
-       int level;
-       gfn_t table_gfn[PT_MAX_FULL_LEVELS];
-       pt_element_t *table;
-       pt_element_t pte;
-       pt_element_t *ptep;
-       struct page *page;
-       int index;
-       pt_element_t inherited_ar;
-       gfn_t gfn;
-       u32 error_code;
-};
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
-                           struct kvm_vcpu *vcpu, gva_t addr,
-                           int write_fault, int user_fault, int fetch_fault)
-{
-       hpa_t hpa;
-       struct kvm_memory_slot *slot;
-       pt_element_t *ptep;
-       pt_element_t root;
-       gfn_t table_gfn;
-
-       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
-       walker->level = vcpu->mmu.root_level;
-       walker->table = NULL;
-       walker->page = NULL;
-       walker->ptep = NULL;
-       root = vcpu->cr3;
-#if PTTYPE == 64
-       if (!is_long_mode(vcpu)) {
-               walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
-               root = *walker->ptep;
-               walker->pte = root;
-               if (!(root & PT_PRESENT_MASK))
-                       goto not_present;
-               --walker->level;
-       }
-#endif
-       table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-       walker->table_gfn[walker->level - 1] = table_gfn;
-       pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-                walker->level - 1, table_gfn);
-       slot = gfn_to_memslot(vcpu->kvm, table_gfn);
-       hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
-       walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
-       walker->table = kmap_atomic(walker->page, KM_USER0);
-
-       ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
-              (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
-       walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
-
-       for (;;) {
-               int index = PT_INDEX(addr, walker->level);
-               hpa_t paddr;
-
-               ptep = &walker->table[index];
-               walker->index = index;
-               ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
-                      ((unsigned long)ptep & PAGE_MASK));
-
-               if (!is_present_pte(*ptep))
-                       goto not_present;
-
-               if (write_fault && !is_writeble_pte(*ptep))
-                       if (user_fault || is_write_protection(vcpu))
-                               goto access_error;
-
-               if (user_fault && !(*ptep & PT_USER_MASK))
-                       goto access_error;
-
-#if PTTYPE == 64
-               if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
-                       goto access_error;
-#endif
-
-               if (!(*ptep & PT_ACCESSED_MASK)) {
-                       mark_page_dirty(vcpu->kvm, table_gfn);
-                       *ptep |= PT_ACCESSED_MASK;
-               }
-
-               if (walker->level == PT_PAGE_TABLE_LEVEL) {
-                       walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-                       break;
-               }
-
-               if (walker->level == PT_DIRECTORY_LEVEL
-                   && (*ptep & PT_PAGE_SIZE_MASK)
-                   && (PTTYPE == 64 || is_pse(vcpu))) {
-                       walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
-                       break;
-               }
-
-               walker->inherited_ar &= walker->table[index];
-               table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-               kunmap_atomic(walker->table, KM_USER0);
-               paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
-               walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
-               walker->table = kmap_atomic(walker->page, KM_USER0);
-               --walker->level;
-               walker->table_gfn[walker->level - 1 ] = table_gfn;
-               pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
-                        walker->level - 1, table_gfn);
-       }
-       walker->pte = *ptep;
-       if (walker->page)
-               walker->ptep = NULL;
-       if (walker->table)
-               kunmap_atomic(walker->table, KM_USER0);
-       pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
-       return 1;
-
-not_present:
-       walker->error_code = 0;
-       goto err;
-
-access_error:
-       walker->error_code = PFERR_PRESENT_MASK;
-
-err:
-       if (write_fault)
-               walker->error_code |= PFERR_WRITE_MASK;
-       if (user_fault)
-               walker->error_code |= PFERR_USER_MASK;
-       if (fetch_fault)
-               walker->error_code |= PFERR_FETCH_MASK;
-       if (walker->table)
-               kunmap_atomic(walker->table, KM_USER0);
-       return 0;
-}
-
-static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
-                                       struct guest_walker *walker)
-{
-       mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
-}
-
-static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
-                                 u64 *shadow_pte,
-                                 gpa_t gaddr,
-                                 pt_element_t gpte,
-                                 u64 access_bits,
-                                 int user_fault,
-                                 int write_fault,
-                                 int *ptwrite,
-                                 struct guest_walker *walker,
-                                 gfn_t gfn)
-{
-       hpa_t paddr;
-       int dirty = gpte & PT_DIRTY_MASK;
-       u64 spte = *shadow_pte;
-       int was_rmapped = is_rmap_pte(spte);
-
-       pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
-                " user_fault %d gfn %lx\n",
-                __FUNCTION__, spte, (u64)gpte, access_bits,
-                write_fault, user_fault, gfn);
-
-       if (write_fault && !dirty) {
-               pt_element_t *guest_ent, *tmp = NULL;
-
-               if (walker->ptep)
-                       guest_ent = walker->ptep;
-               else {
-                       tmp = kmap_atomic(walker->page, KM_USER0);
-                       guest_ent = &tmp[walker->index];
-               }
-
-               *guest_ent |= PT_DIRTY_MASK;
-               if (!walker->ptep)
-                       kunmap_atomic(tmp, KM_USER0);
-               dirty = 1;
-               FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
-       }
-
-       spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
-       spte |= gpte & PT64_NX_MASK;
-       if (!dirty)
-               access_bits &= ~PT_WRITABLE_MASK;
-
-       paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
-       spte |= PT_PRESENT_MASK;
-       if (access_bits & PT_USER_MASK)
-               spte |= PT_USER_MASK;
-
-       if (is_error_hpa(paddr)) {
-               spte |= gaddr;
-               spte |= PT_SHADOW_IO_MARK;
-               spte &= ~PT_PRESENT_MASK;
-               set_shadow_pte(shadow_pte, spte);
-               return;
-       }
-
-       spte |= paddr;
-
-       if ((access_bits & PT_WRITABLE_MASK)
-           || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-               struct kvm_mmu_page *shadow;
-
-               spte |= PT_WRITABLE_MASK;
-               if (user_fault) {
-                       mmu_unshadow(vcpu, gfn);
-                       goto unshadowed;
-               }
-
-               shadow = kvm_mmu_lookup_page(vcpu, gfn);
-               if (shadow) {
-                       pgprintk("%s: found shadow page for %lx, marking ro\n",
-                                __FUNCTION__, gfn);
-                       access_bits &= ~PT_WRITABLE_MASK;
-                       if (is_writeble_pte(spte)) {
-                               spte &= ~PT_WRITABLE_MASK;
-                               kvm_x86_ops->tlb_flush(vcpu);
-                       }
-                       if (write_fault)
-                               *ptwrite = 1;
-               }
-       }
-
-unshadowed:
-
-       if (access_bits & PT_WRITABLE_MASK)
-               mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
-
-       set_shadow_pte(shadow_pte, spte);
-       page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
-       if (!was_rmapped)
-               rmap_add(vcpu, shadow_pte);
-}
-
-static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-                          u64 *shadow_pte, u64 access_bits,
-                          int user_fault, int write_fault, int *ptwrite,
-                          struct guest_walker *walker, gfn_t gfn)
-{
-       access_bits &= gpte;
-       FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
-                             gpte, access_bits, user_fault, write_fault,
-                             ptwrite, walker, gfn);
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-                             u64 *spte, const void *pte, int bytes)
-{
-       pt_element_t gpte;
-
-       if (bytes < sizeof(pt_element_t))
-               return;
-       gpte = *(const pt_element_t *)pte;
-       if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
-               return;
-       pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
-       FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
-                      0, NULL, NULL,
-                      (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
-}
-
-static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
-                          u64 *shadow_pte, u64 access_bits,
-                          int user_fault, int write_fault, int *ptwrite,
-                          struct guest_walker *walker, gfn_t gfn)
-{
-       gpa_t gaddr;
-
-       access_bits &= gpde;
-       gaddr = (gpa_t)gfn << PAGE_SHIFT;
-       if (PTTYPE == 32 && is_cpuid_PSE36())
-               gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
-                       (32 - PT32_DIR_PSE36_SHIFT);
-       FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
-                             gpde, access_bits, user_fault, write_fault,
-                             ptwrite, walker, gfn);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-                        struct guest_walker *walker,
-                        int user_fault, int write_fault, int *ptwrite)
-{
-       hpa_t shadow_addr;
-       int level;
-       u64 *shadow_ent;
-       u64 *prev_shadow_ent = NULL;
-
-       if (!is_present_pte(walker->pte))
-               return NULL;
-
-       shadow_addr = vcpu->mmu.root_hpa;
-       level = vcpu->mmu.shadow_root_level;
-       if (level == PT32E_ROOT_LEVEL) {
-               shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
-               shadow_addr &= PT64_BASE_ADDR_MASK;
-               --level;
-       }
-
-       for (; ; level--) {
-               u32 index = SHADOW_PT_INDEX(addr, level);
-               struct kvm_mmu_page *shadow_page;
-               u64 shadow_pte;
-               int metaphysical;
-               gfn_t table_gfn;
-               unsigned hugepage_access = 0;
-
-               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-               if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
-                       if (level == PT_PAGE_TABLE_LEVEL)
-                               break;
-                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-                       prev_shadow_ent = shadow_ent;
-                       continue;
-               }
-
-               if (level == PT_PAGE_TABLE_LEVEL)
-                       break;
-
-               if (level - 1 == PT_PAGE_TABLE_LEVEL
-                   && walker->level == PT_DIRECTORY_LEVEL) {
-                       metaphysical = 1;
-                       hugepage_access = walker->pte;
-                       hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
-                       if (walker->pte & PT64_NX_MASK)
-                               hugepage_access |= (1 << 2);
-                       hugepage_access >>= PT_WRITABLE_SHIFT;
-                       table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-               } else {
-                       metaphysical = 0;
-                       table_gfn = walker->table_gfn[level - 2];
-               }
-               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-                                              metaphysical, hugepage_access,
-                                              shadow_ent);
-               shadow_addr = __pa(shadow_page->spt);
-               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-                       | PT_WRITABLE_MASK | PT_USER_MASK;
-               *shadow_ent = shadow_pte;
-               prev_shadow_ent = shadow_ent;
-       }
-
-       if (walker->level == PT_DIRECTORY_LEVEL) {
-               FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
-                              walker->inherited_ar, user_fault, write_fault,
-                              ptwrite, walker, walker->gfn);
-       } else {
-               ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
-               FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
-                              walker->inherited_ar, user_fault, write_fault,
-                              ptwrite, walker, walker->gfn);
-       }
-       return shadow_ent;
-}
-
-/*
- * Page fault handler.  There are several causes for a page fault:
- *   - there is no shadow pte for the guest pte
- *   - write access through a shadow pte marked read only so that we can set
- *     the dirty bit
- *   - write access to a shadow pte marked read only so we can update the page
- *     dirty bitmap, when userspace requests it
- *   - mmio access; in this case we will never install a present shadow pte
- *   - normal guest page fault due to the guest pte marked not present, not
- *     writable, or not executable
- *
- *  Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- *           a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
-                              u32 error_code)
-{
-       int write_fault = error_code & PFERR_WRITE_MASK;
-       int user_fault = error_code & PFERR_USER_MASK;
-       int fetch_fault = error_code & PFERR_FETCH_MASK;
-       struct guest_walker walker;
-       u64 *shadow_pte;
-       int write_pt = 0;
-       int r;
-
-       pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
-       kvm_mmu_audit(vcpu, "pre page fault");
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       /*
-        * Look up the shadow pte for the faulting address.
-        */
-       r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
-                            fetch_fault);
-
-       /*
-        * The page is not mapped by the guest.  Let the guest handle it.
-        */
-       if (!r) {
-               pgprintk("%s: guest page fault\n", __FUNCTION__);
-               inject_page_fault(vcpu, addr, walker.error_code);
-               vcpu->last_pt_write_count = 0; /* reset fork detector */
-               return 0;
-       }
-
-       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                 &write_pt);
-       pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
-                shadow_pte, *shadow_pte, write_pt);
-
-       if (!write_pt)
-               vcpu->last_pt_write_count = 0; /* reset fork detector */
-
-       /*
-        * mmio: emulate if accessible, otherwise it's a guest fault.
-        */
-       if (is_io_pte(*shadow_pte))
-               return 1;
-
-       ++vcpu->stat.pf_fixed;
-       kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
-       return write_pt;
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
-{
-       struct guest_walker walker;
-       gpa_t gpa = UNMAPPED_GVA;
-       int r;
-
-       r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
-
-       if (r) {
-               gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
-               gpa |= vaddr & ~PAGE_MASK;
-       }
-
-       return gpa;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef SHADOW_PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
-#undef PT_MAX_FULL_LEVELS
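The #define/#undef blocks that open and close the file removed above are what make paging_tmpl.h a poor man's template: mmu.c includes it twice, once with PTTYPE 64 and once with PTTYPE 32, so a single body yields both paging64_*() and paging32_*() versions of each FNAME() function, each with the right pt_element_t width and index width. The stand-alone sketch below shows the same parameterisation for a PT_INDEX()-style calculation only; it collapses the include-twice trick into one macro for brevity, and the names DEFINE_PT_INDEX, pt64_index and pt32_index are invented here rather than taken from the kernel.

#include <stdio.h>
#include <stdint.h>

/*
 * One body, instantiated for both guest pte widths: BITS is 9 for
 * 64-bit ptes (512 entries per page) and 10 for 32-bit ptes (1024
 * entries per page); the page shift is 12 in both cases.
 */
#define DEFINE_PT_INDEX(NAME, BITS)					\
	static unsigned int NAME(uint64_t addr, int level)		\
	{								\
		unsigned int shift = 12 + (BITS) * (level - 1);		\
		return (addr >> shift) & ((1u << (BITS)) - 1);		\
	}

DEFINE_PT_INDEX(pt64_index, 9)		/* analogue of the PTTYPE == 64 build */
DEFINE_PT_INDEX(pt32_index, 10)		/* analogue of the PTTYPE == 32 build */

int main(void)
{
	uint64_t addr = 0x00007f1234567000ull;
	int level;

	/* Top-down walk: print the table index chosen at each level. */
	for (level = 4; level >= 1; --level)
		printf("64-bit ptes, level %d: index %u\n",
		       level, pt64_index(addr, level));
	for (level = 2; level >= 1; --level)
		printf("32-bit ptes, level %d: index %u\n",
		       level, pt32_index(addr & 0xffffffffull, level));
	return 0;
}

Note that in the file above SHADOW_PT_INDEX() maps to PT64_INDEX() for both pte sizes: the shadow page tables built by the host are always 64-bit, even when the guest runs with 32-bit ptes.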
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644 (file)
index bd46de6..0000000
+++ /dev/null
@@ -1,1662 +0,0 @@
-/******************************************************************************
- * x86_emulate.c
- *
- * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
- *
- * Copyright (c) 2005 Keir Fraser
- *
- * Linux coding style, mod r/m decoder, segment base fixes, real-mode
- * privileged instructions:
- *
- * Copyright (C) 2006 Qumranet
- *
- *   Avi Kivity <avi@qumranet.com>
- *   Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
- */
-
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf( _f , ## _a )
-#else
-#include "kvm.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include "x86_emulate.h"
-#include <linux/module.h>
-
-/*
- * Opcode effective-address decode tables.
- * Note that we only emulate instructions that have at least one memory
- * operand (excluding implicit stack references). We assume that stack
- * references and instruction fetches will never occur in special memory
- * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
- * not be handled.
- */
-
-/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp      (1<<0)     /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
-#define DstReg      (2<<1)     /* Register operand. */
-#define DstMem      (3<<1)     /* Memory operand. */
-#define DstMask     (3<<1)
-/* Source operand type. */
-#define SrcNone     (0<<3)     /* No source operand. */
-#define SrcImplicit (0<<3)     /* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)     /* Register operand. */
-#define SrcMem      (2<<3)     /* Memory operand. */
-#define SrcMem16    (3<<3)     /* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)     /* Memory operand (32-bit). */
-#define SrcImm      (5<<3)     /* Immediate operand. */
-#define SrcImmByte  (6<<3)     /* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
-/* Generic ModRM decode. */
-#define ModRM       (1<<6)
-/* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-
-static u8 opcode_table[256] = {
-       /* 0x00 - 0x07 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x08 - 0x0F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x10 - 0x17 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x18 - 0x1F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x20 - 0x27 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       SrcImmByte, SrcImm, 0, 0,
-       /* 0x28 - 0x2F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x30 - 0x37 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x38 - 0x3F */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
-       /* 0x40 - 0x4F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x50 - 0x57 */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x58 - 0x5F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x60 - 0x67 */
-       0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
-       0, 0, 0, 0,
-       /* 0x68 - 0x6F */
-       0, 0, ImplicitOps|Mov, 0,
-       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-       SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
-       /* 0x70 - 0x77 */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x78 - 0x7F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x80 - 0x87 */
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
-       /* 0x88 - 0x8F */
-       ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
-       ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
-       /* 0x90 - 0x9F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
-       /* 0xA0 - 0xA7 */
-       ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
-       ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
-       ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-       ByteOp | ImplicitOps, ImplicitOps,
-       /* 0xA8 - 0xAF */
-       0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-       ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
-       ByteOp | ImplicitOps, ImplicitOps,
-       /* 0xB0 - 0xBF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xC0 - 0xC7 */
-       ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
-       0, ImplicitOps, 0, 0,
-       ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
-       /* 0xC8 - 0xCF */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xD0 - 0xD7 */
-       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-       ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-       0, 0, 0, 0,
-       /* 0xD8 - 0xDF */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE0 - 0xE7 */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE8 - 0xEF */
-       ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
-       /* 0xF0 - 0xF7 */
-       0, 0, 0, 0,
-       ImplicitOps, 0,
-       ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
-       /* 0xF8 - 0xFF */
-       0, 0, 0, 0,
-       0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
-};
-
-static u16 twobyte_table[256] = {
-       /* 0x00 - 0x0F */
-       0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
-       ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
-       /* 0x10 - 0x1F */
-       0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x20 - 0x2F */
-       ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x30 - 0x3F */
-       ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x40 - 0x47 */
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       /* 0x48 - 0x4F */
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-       /* 0x50 - 0x5F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x60 - 0x6F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x70 - 0x7F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0x80 - 0x8F */
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-       /* 0x90 - 0x9F */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xA0 - 0xA7 */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-       /* 0xA8 - 0xAF */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
-       /* 0xB0 - 0xB7 */
-       ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
-           DstMem | SrcReg | ModRM | BitOp,
-       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-           DstReg | SrcMem16 | ModRM | Mov,
-       /* 0xB8 - 0xBF */
-       0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
-       0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
-           DstReg | SrcMem16 | ModRM | Mov,
-       /* 0xC0 - 0xCF */
-       0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
-       0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xD0 - 0xDF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xE0 - 0xEF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       /* 0xF0 - 0xFF */
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-/* Type, address-of, and value of an instruction's operand. */
-struct operand {
-       enum { OP_REG, OP_MEM, OP_IMM } type;
-       unsigned int bytes;
-       unsigned long val, orig_val, *ptr;
-};
-
-/* EFLAGS bit definitions. */
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"              /* force 32-bit operand */
-#define _STK  "%%rsp"          /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""               /* force 32-bit operand */
-#define _STK  "%%esp"          /* stack pointer */
-#endif
-
-/*
- * These EFLAGS bits are restored from saved value during emulation, and
- * any changes are written back to the saved value after emulation.
- */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
-       /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */        \
-       "push %"_sav"; "                                        \
-       "movl %"_msk",%"_LO32 _tmp"; "                          \
-       "andl %"_LO32 _tmp",("_STK"); "                         \
-       "pushf; "                                               \
-       "notl %"_LO32 _tmp"; "                                  \
-       "andl %"_LO32 _tmp",("_STK"); "                         \
-       "pop  %"_tmp"; "                                        \
-       "orl  %"_LO32 _tmp",("_STK"); "                         \
-       "popf; "                                                \
-       /* _sav &= ~msk; */                                     \
-       "movl %"_msk",%"_LO32 _tmp"; "                          \
-       "notl %"_LO32 _tmp"; "                                  \
-       "andl %"_LO32 _tmp",%"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-       /* _sav |= EFLAGS & _msk; */            \
-       "pushf; "                               \
-       "pop  %"_tmp"; "                        \
-       "andl %"_msk",%"_LO32 _tmp"; "          \
-       "orl  %"_LO32 _tmp",%"_sav"; "
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-       do {                                                                \
-               unsigned long _tmp;                                         \
-                                                                           \
-               switch ((_dst).bytes) {                                     \
-               case 2:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
-                               _op"w %"_wx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _wy ((_src).val), "i" (EFLAGS_MASK) );    \
-                       break;                                              \
-               case 4:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0","4","2")                    \
-                               _op"l %"_lx"3,%1; "                         \
-                               _POST_EFLAGS("0","4","2")                   \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _ly ((_src).val), "i" (EFLAGS_MASK) );    \
-                       break;                                              \
-               case 8:                                                     \
-                       __emulate_2op_8byte(_op, _src, _dst,                \
-                                           _eflags, _qx, _qy);             \
-                       break;                                              \
-               }                                                           \
-       } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
-       do {                                                                 \
-               unsigned long _tmp;                                          \
-               switch ( (_dst).bytes )                                      \
-               {                                                            \
-               case 1:                                                      \
-                       __asm__ __volatile__ (                               \
-                               _PRE_EFLAGS("0","4","2")                     \
-                               _op"b %"_bx"3,%1; "                          \
-                               _POST_EFLAGS("0","4","2")                    \
-                               : "=m" (_eflags), "=m" ((_dst).val),         \
-                                 "=&r" (_tmp)                               \
-                               : _by ((_src).val), "i" (EFLAGS_MASK) );     \
-                       break;                                               \
-               default:                                                     \
-                       __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
-                                            _wx, _wy, _lx, _ly, _qx, _qy);  \
-                       break;                                               \
-               }                                                            \
-       } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
-       __emulate_2op(_op, _src, _dst, _eflags,                         \
-                     "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
-       __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
-                            "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               switch ( (_dst).bytes )                                 \
-               {                                                       \
-               case 1:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
-                               _op"b %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
-                       break;                                          \
-               case 2:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
-                               _op"w %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
-                       break;                                          \
-               case 4:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0","3","2")                \
-                               _op"l %1; "                             \
-                               _POST_EFLAGS("0","3","2")               \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK) );                  \
-                       break;                                          \
-               case 8:                                                 \
-                       __emulate_1op_8byte(_op, _dst, _eflags);        \
-                       break;                                          \
-               }                                                       \
-       } while (0)
-
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","4","2")                          \
-                       _op"q %"_qx"3,%1; "                               \
-                       _POST_EFLAGS("0","4","2")                         \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : _qy ((_src).val), "i" (EFLAGS_MASK) );          \
-       } while (0)
-
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0","3","2")                          \
-                       _op"q %1; "                                       \
-                       _POST_EFLAGS("0","3","2")                         \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : "i" (EFLAGS_MASK) );                            \
-       } while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif                         /* __i386__ */
-
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({     unsigned long _x;                                               \
-       rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x,  \
-                                                  (_size), ctxt->vcpu); \
-       if ( rc != 0 )                                                  \
-               goto done;                                              \
-       (_eip) += (_size);                                              \
-       (_type)_x;                                                      \
-})
-
-/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg)                                              \
-       ((ad_bytes == sizeof(unsigned long)) ?                          \
-               (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
-#define register_address(base, reg)                                     \
-       ((base) + address_mask(reg))
-#define register_address_increment(reg, inc)                            \
-       do {                                                            \
-               /* signed type ensures sign extension to long */        \
-               int _inc = (inc);                                       \
-               if ( ad_bytes == sizeof(unsigned long) )                \
-                       (reg) += _inc;                                  \
-               else                                                    \
-                       (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
-                          (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
-       } while (0)
-
-#define JMP_REL(rel)                                                   \
-       do {                                                            \
-               register_address_increment(_eip, rel);                  \
-       } while (0)
-
-/*
- * Given the 'reg' portion of a ModRM byte, and a register block, return a
- * pointer into the block that addresses the relevant register.
- * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
- */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
-                            int highbyte_regs)
-{
-       void *p;
-
-       p = &regs[modrm_reg];
-       if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
-               p = (unsigned char *)&regs[modrm_reg & 3] + 1;
-       return p;
-}
-
-static int read_descriptor(struct x86_emulate_ctxt *ctxt,
-                          struct x86_emulate_ops *ops,
-                          void *ptr,
-                          u16 *size, unsigned long *address, int op_bytes)
-{
-       int rc;
-
-       if (op_bytes == 2)
-               op_bytes = 3;
-       *address = 0;
-       rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
-                          ctxt->vcpu);
-       if (rc)
-               return rc;
-       rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
-                          ctxt->vcpu);
-       return rc;
-}
-
-static int test_cc(unsigned int condition, unsigned int flags)
-{
-       int rc = 0;
-
-       switch ((condition & 15) >> 1) {
-       case 0: /* o */
-               rc |= (flags & EFLG_OF);
-               break;
-       case 1: /* b/c/nae */
-               rc |= (flags & EFLG_CF);
-               break;
-       case 2: /* z/e */
-               rc |= (flags & EFLG_ZF);
-               break;
-       case 3: /* be/na */
-               rc |= (flags & (EFLG_CF|EFLG_ZF));
-               break;
-       case 4: /* s */
-               rc |= (flags & EFLG_SF);
-               break;
-       case 5: /* p/pe */
-               rc |= (flags & EFLG_PF);
-               break;
-       case 7: /* le/ng */
-               rc |= (flags & EFLG_ZF);
-               /* fall through */
-       case 6: /* l/nge */
-               rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
-               break;
-       }
-
-       /* Odd condition identifiers (lsb == 1) have inverted sense. */
-       return (!!rc ^ (condition & 1));
-}
-
-int
-x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
-{
-       unsigned d;
-       u8 b, sib, twobyte = 0, rex_prefix = 0;
-       u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
-       unsigned long *override_base = NULL;
-       unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
-       int rc = 0;
-       struct operand src, dst;
-       unsigned long cr2 = ctxt->cr2;
-       int mode = ctxt->mode;
-       unsigned long modrm_ea;
-       int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
-       int no_wb = 0;
-       u64 msr_data;
-
-       /* Shadow copy of register state. Committed on successful emulation. */
-       unsigned long _regs[NR_VCPU_REGS];
-       unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
-       unsigned long modrm_val = 0;
-
-       memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
-
-       switch (mode) {
-       case X86EMUL_MODE_REAL:
-       case X86EMUL_MODE_PROT16:
-               op_bytes = ad_bytes = 2;
-               break;
-       case X86EMUL_MODE_PROT32:
-               op_bytes = ad_bytes = 4;
-               break;
-#ifdef CONFIG_X86_64
-       case X86EMUL_MODE_PROT64:
-               op_bytes = 4;
-               ad_bytes = 8;
-               break;
-#endif
-       default:
-               return -1;
-       }
-
-       /* Legacy prefixes. */
-       for (i = 0; i < 8; i++) {
-               switch (b = insn_fetch(u8, 1, _eip)) {
-               case 0x66:      /* operand-size override */
-                       op_bytes ^= 6;  /* switch between 2/4 bytes */
-                       break;
-               case 0x67:      /* address-size override */
-                       if (mode == X86EMUL_MODE_PROT64)
-                               ad_bytes ^= 12; /* switch between 4/8 bytes */
-                       else
-                               ad_bytes ^= 6;  /* switch between 2/4 bytes */
-                       break;
-               case 0x2e:      /* CS override */
-                       override_base = &ctxt->cs_base;
-                       break;
-               case 0x3e:      /* DS override */
-                       override_base = &ctxt->ds_base;
-                       break;
-               case 0x26:      /* ES override */
-                       override_base = &ctxt->es_base;
-                       break;
-               case 0x64:      /* FS override */
-                       override_base = &ctxt->fs_base;
-                       break;
-               case 0x65:      /* GS override */
-                       override_base = &ctxt->gs_base;
-                       break;
-               case 0x36:      /* SS override */
-                       override_base = &ctxt->ss_base;
-                       break;
-               case 0xf0:      /* LOCK */
-                       lock_prefix = 1;
-                       break;
-               case 0xf2:      /* REPNE/REPNZ */
-               case 0xf3:      /* REP/REPE/REPZ */
-                       rep_prefix = 1;
-                       break;
-               default:
-                       goto done_prefixes;
-               }
-       }
-
-done_prefixes:
-
-       /* REX prefix. */
-       if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
-               rex_prefix = b;
-               if (b & 8)
-                       op_bytes = 8;   /* REX.W */
-               modrm_reg = (b & 4) << 1;       /* REX.R */
-               index_reg = (b & 2) << 2; /* REX.X */
-               modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
-               b = insn_fetch(u8, 1, _eip);
-       }
-
-       /* Opcode byte(s). */
-       d = opcode_table[b];
-       if (d == 0) {
-               /* Two-byte opcode? */
-               if (b == 0x0f) {
-                       twobyte = 1;
-                       b = insn_fetch(u8, 1, _eip);
-                       d = twobyte_table[b];
-               }
-
-               /* Unrecognised? */
-               if (d == 0)
-                       goto cannot_emulate;
-       }
-
-       /* ModRM and SIB bytes. */
-       if (d & ModRM) {
-               modrm = insn_fetch(u8, 1, _eip);
-               modrm_mod |= (modrm & 0xc0) >> 6;
-               modrm_reg |= (modrm & 0x38) >> 3;
-               modrm_rm |= (modrm & 0x07);
-               modrm_ea = 0;
-               use_modrm_ea = 1;
-
-               if (modrm_mod == 3) {
-                       modrm_val = *(unsigned long *)
-                               decode_register(modrm_rm, _regs, d & ByteOp);
-                       goto modrm_done;
-               }
-
-               if (ad_bytes == 2) {
-                       unsigned bx = _regs[VCPU_REGS_RBX];
-                       unsigned bp = _regs[VCPU_REGS_RBP];
-                       unsigned si = _regs[VCPU_REGS_RSI];
-                       unsigned di = _regs[VCPU_REGS_RDI];
-
-                       /* 16-bit ModR/M decode. */
-                       switch (modrm_mod) {
-                       case 0:
-                               if (modrm_rm == 6)
-                                       modrm_ea += insn_fetch(u16, 2, _eip);
-                               break;
-                       case 1:
-                               modrm_ea += insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               modrm_ea += insn_fetch(u16, 2, _eip);
-                               break;
-                       }
-                       switch (modrm_rm) {
-                       case 0:
-                               modrm_ea += bx + si;
-                               break;
-                       case 1:
-                               modrm_ea += bx + di;
-                               break;
-                       case 2:
-                               modrm_ea += bp + si;
-                               break;
-                       case 3:
-                               modrm_ea += bp + di;
-                               break;
-                       case 4:
-                               modrm_ea += si;
-                               break;
-                       case 5:
-                               modrm_ea += di;
-                               break;
-                       case 6:
-                               if (modrm_mod != 0)
-                                       modrm_ea += bp;
-                               break;
-                       case 7:
-                               modrm_ea += bx;
-                               break;
-                       }
-                       if (modrm_rm == 2 || modrm_rm == 3 ||
-                           (modrm_rm == 6 && modrm_mod != 0))
-                               if (!override_base)
-                                       override_base = &ctxt->ss_base;
-                       modrm_ea = (u16)modrm_ea;
-               } else {
-                       /* 32/64-bit ModR/M decode. */
-                       switch (modrm_rm) {
-                       case 4:
-                       case 12:
-                               sib = insn_fetch(u8, 1, _eip);
-                               index_reg |= (sib >> 3) & 7;
-                               base_reg |= sib & 7;
-                               scale = sib >> 6;
-
-                               switch (base_reg) {
-                               case 5:
-                                       if (modrm_mod != 0)
-                                               modrm_ea += _regs[base_reg];
-                                       else
-                                               modrm_ea += insn_fetch(s32, 4, _eip);
-                                       break;
-                               default:
-                                       modrm_ea += _regs[base_reg];
-                               }
-                               switch (index_reg) {
-                               case 4:
-                                       break;
-                               default:
-                                       modrm_ea += _regs[index_reg] << scale;
-
-                               }
-                               break;
-                       case 5:
-                               if (modrm_mod != 0)
-                                       modrm_ea += _regs[modrm_rm];
-                               else if (mode == X86EMUL_MODE_PROT64)
-                                       rip_relative = 1;
-                               break;
-                       default:
-                               modrm_ea += _regs[modrm_rm];
-                               break;
-                       }
-                       switch (modrm_mod) {
-                       case 0:
-                               if (modrm_rm == 5)
-                                       modrm_ea += insn_fetch(s32, 4, _eip);
-                               break;
-                       case 1:
-                               modrm_ea += insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               modrm_ea += insn_fetch(s32, 4, _eip);
-                               break;
-                       }
-               }
-               if (!override_base)
-                       override_base = &ctxt->ds_base;
-               if (mode == X86EMUL_MODE_PROT64 &&
-                   override_base != &ctxt->fs_base &&
-                   override_base != &ctxt->gs_base)
-                       override_base = NULL;
-
-               if (override_base)
-                       modrm_ea += *override_base;
-
-               if (rip_relative) {
-                       modrm_ea += _eip;
-                       switch (d & SrcMask) {
-                       case SrcImmByte:
-                               modrm_ea += 1;
-                               break;
-                       case SrcImm:
-                               if (d & ByteOp)
-                                       modrm_ea += 1;
-                               else
-                                       if (op_bytes == 8)
-                                               modrm_ea += 4;
-                                       else
-                                               modrm_ea += op_bytes;
-                       }
-               }
-               if (ad_bytes != 8)
-                       modrm_ea = (u32)modrm_ea;
-               cr2 = modrm_ea;
-       modrm_done:
-               ;
-       }
-
-       /*
-        * Decode and fetch the source operand: register, memory
-        * or immediate.
-        */
-       switch (d & SrcMask) {
-       case SrcNone:
-               break;
-       case SrcReg:
-               src.type = OP_REG;
-               if (d & ByteOp) {
-                       src.ptr = decode_register(modrm_reg, _regs,
-                                                 (rex_prefix == 0));
-                       src.val = src.orig_val = *(u8 *) src.ptr;
-                       src.bytes = 1;
-               } else {
-                       src.ptr = decode_register(modrm_reg, _regs, 0);
-                       switch ((src.bytes = op_bytes)) {
-                       case 2:
-                               src.val = src.orig_val = *(u16 *) src.ptr;
-                               break;
-                       case 4:
-                               src.val = src.orig_val = *(u32 *) src.ptr;
-                               break;
-                       case 8:
-                               src.val = src.orig_val = *(u64 *) src.ptr;
-                               break;
-                       }
-               }
-               break;
-       case SrcMem16:
-               src.bytes = 2;
-               goto srcmem_common;
-       case SrcMem32:
-               src.bytes = 4;
-               goto srcmem_common;
-       case SrcMem:
-               src.bytes = (d & ByteOp) ? 1 : op_bytes;
-               /* Don't fetch the address for invlpg: it could be unmapped. */
-               if (twobyte && b == 0x01 && modrm_reg == 7)
-                       break;
-             srcmem_common:
-               /*
-                * For instructions with a ModR/M byte, switch to register
-                * access if Mod = 3.
-                */
-               if ((d & ModRM) && modrm_mod == 3) {
-                       src.type = OP_REG;
-                       break;
-               }
-               src.type = OP_MEM;
-               src.ptr = (unsigned long *)cr2;
-               src.val = 0;
-               if ((rc = ops->read_emulated((unsigned long)src.ptr,
-                                            &src.val, src.bytes, ctxt->vcpu)) != 0)
-                       goto done;
-               src.orig_val = src.val;
-               break;
-       case SrcImm:
-               src.type = OP_IMM;
-               src.ptr = (unsigned long *)_eip;
-               src.bytes = (d & ByteOp) ? 1 : op_bytes;
-               if (src.bytes == 8)
-                       src.bytes = 4;
-               /* NB. Immediates are sign-extended as necessary. */
-               switch (src.bytes) {
-               case 1:
-                       src.val = insn_fetch(s8, 1, _eip);
-                       break;
-               case 2:
-                       src.val = insn_fetch(s16, 2, _eip);
-                       break;
-               case 4:
-                       src.val = insn_fetch(s32, 4, _eip);
-                       break;
-               }
-               break;
-       case SrcImmByte:
-               src.type = OP_IMM;
-               src.ptr = (unsigned long *)_eip;
-               src.bytes = 1;
-               src.val = insn_fetch(s8, 1, _eip);
-               break;
-       }
-
-       /* Decode and fetch the destination operand: register or memory. */
-       switch (d & DstMask) {
-       case ImplicitOps:
-               /* Special instructions do their own operand decoding. */
-               goto special_insn;
-       case DstReg:
-               dst.type = OP_REG;
-               if ((d & ByteOp)
-                   && !(twobyte && (b == 0xb6 || b == 0xb7))) {
-                       dst.ptr = decode_register(modrm_reg, _regs,
-                                                 (rex_prefix == 0));
-                       dst.val = *(u8 *) dst.ptr;
-                       dst.bytes = 1;
-               } else {
-                       dst.ptr = decode_register(modrm_reg, _regs, 0);
-                       switch ((dst.bytes = op_bytes)) {
-                       case 2:
-                               dst.val = *(u16 *)dst.ptr;
-                               break;
-                       case 4:
-                               dst.val = *(u32 *)dst.ptr;
-                               break;
-                       case 8:
-                               dst.val = *(u64 *)dst.ptr;
-                               break;
-                       }
-               }
-               break;
-       case DstMem:
-               dst.type = OP_MEM;
-               dst.ptr = (unsigned long *)cr2;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.val = 0;
-               /*
-                * For instructions with a ModR/M byte, switch to register
-                * access if Mod = 3.
-                */
-               if ((d & ModRM) && modrm_mod == 3) {
-                       dst.type = OP_REG;
-                       break;
-               }
-               if (d & BitOp) {
-                       unsigned long mask = ~(dst.bytes * 8 - 1);
-
-                       dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
-               }
-               if (!(d & Mov) && /* optimisation - avoid slow emulated read */
-                   ((rc = ops->read_emulated((unsigned long)dst.ptr,
-                                             &dst.val, dst.bytes, ctxt->vcpu)) != 0))
-                       goto done;
-               break;
-       }
-       dst.orig_val = dst.val;
-
-       if (twobyte)
-               goto twobyte_insn;
-
-       switch (b) {
-       case 0x00 ... 0x05:
-             add:              /* add */
-               emulate_2op_SrcV("add", src, dst, _eflags);
-               break;
-       case 0x08 ... 0x0d:
-             or:               /* or */
-               emulate_2op_SrcV("or", src, dst, _eflags);
-               break;
-       case 0x10 ... 0x15:
-             adc:              /* adc */
-               emulate_2op_SrcV("adc", src, dst, _eflags);
-               break;
-       case 0x18 ... 0x1d:
-             sbb:              /* sbb */
-               emulate_2op_SrcV("sbb", src, dst, _eflags);
-               break;
-       case 0x20 ... 0x23:
-             and:              /* and */
-               emulate_2op_SrcV("and", src, dst, _eflags);
-               break;
-       case 0x24:              /* and al imm8 */
-               dst.type = OP_REG;
-               dst.ptr = &_regs[VCPU_REGS_RAX];
-               dst.val = *(u8 *)dst.ptr;
-               dst.bytes = 1;
-               dst.orig_val = dst.val;
-               goto and;
-       case 0x25:              /* and ax imm16, or eax imm32 */
-               dst.type = OP_REG;
-               dst.bytes = op_bytes;
-               dst.ptr = &_regs[VCPU_REGS_RAX];
-               if (op_bytes == 2)
-                       dst.val = *(u16 *)dst.ptr;
-               else
-                       dst.val = *(u32 *)dst.ptr;
-               dst.orig_val = dst.val;
-               goto and;
-       case 0x28 ... 0x2d:
-             sub:              /* sub */
-               emulate_2op_SrcV("sub", src, dst, _eflags);
-               break;
-       case 0x30 ... 0x35:
-             xor:              /* xor */
-               emulate_2op_SrcV("xor", src, dst, _eflags);
-               break;
-       case 0x38 ... 0x3d:
-             cmp:              /* cmp */
-               emulate_2op_SrcV("cmp", src, dst, _eflags);
-               break;
-       case 0x63:              /* movsxd */
-               if (mode != X86EMUL_MODE_PROT64)
-                       goto cannot_emulate;
-               dst.val = (s32) src.val;
-               break;
-       case 0x80 ... 0x83:     /* Grp1 */
-               switch (modrm_reg) {
-               case 0:
-                       goto add;
-               case 1:
-                       goto or;
-               case 2:
-                       goto adc;
-               case 3:
-                       goto sbb;
-               case 4:
-                       goto and;
-               case 5:
-                       goto sub;
-               case 6:
-                       goto xor;
-               case 7:
-                       goto cmp;
-               }
-               break;
-       case 0x84 ... 0x85:
-             test:             /* test */
-               emulate_2op_SrcV("test", src, dst, _eflags);
-               break;
-       case 0x86 ... 0x87:     /* xchg */
-               /* Write back the register source. */
-               switch (dst.bytes) {
-               case 1:
-                       *(u8 *) src.ptr = (u8) dst.val;
-                       break;
-               case 2:
-                       *(u16 *) src.ptr = (u16) dst.val;
-                       break;
-               case 4:
-                       *src.ptr = (u32) dst.val;
-                       break;  /* 64b reg: zero-extend */
-               case 8:
-                       *src.ptr = dst.val;
-                       break;
-               }
-               /*
-                * Write back the memory destination with implicit LOCK
-                * prefix.
-                */
-               dst.val = src.val;
-               lock_prefix = 1;
-               break;
-       case 0x88 ... 0x8b:     /* mov */
-               goto mov;
-       case 0x8d: /* lea r16/r32, m */
-               dst.val = modrm_val;
-               break;
-       case 0x8f:              /* pop (sole member of Grp1a) */
-               /* 64-bit mode: POP always pops a 64-bit operand. */
-               if (mode == X86EMUL_MODE_PROT64)
-                       dst.bytes = 8;
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                                                        _regs[VCPU_REGS_RSP]),
-                                       &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-                       goto done;
-               register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
-               break;
-       case 0xa0 ... 0xa1:     /* mov */
-               dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-               dst.val = src.val;
-               _eip += ad_bytes;       /* skip src displacement */
-               break;
-       case 0xa2 ... 0xa3:     /* mov */
-               dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
-               _eip += ad_bytes;       /* skip dst displacement */
-               break;
-       case 0xc0 ... 0xc1:
-             grp2:             /* Grp2 */
-               switch (modrm_reg) {
-               case 0: /* rol */
-                       emulate_2op_SrcB("rol", src, dst, _eflags);
-                       break;
-               case 1: /* ror */
-                       emulate_2op_SrcB("ror", src, dst, _eflags);
-                       break;
-               case 2: /* rcl */
-                       emulate_2op_SrcB("rcl", src, dst, _eflags);
-                       break;
-               case 3: /* rcr */
-                       emulate_2op_SrcB("rcr", src, dst, _eflags);
-                       break;
-               case 4: /* sal/shl */
-               case 6: /* sal/shl */
-                       emulate_2op_SrcB("sal", src, dst, _eflags);
-                       break;
-               case 5: /* shr */
-                       emulate_2op_SrcB("shr", src, dst, _eflags);
-                       break;
-               case 7: /* sar */
-                       emulate_2op_SrcB("sar", src, dst, _eflags);
-                       break;
-               }
-               break;
-       case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
-       mov:
-               dst.val = src.val;
-               break;
-       case 0xd0 ... 0xd1:     /* Grp2 */
-               src.val = 1;
-               goto grp2;
-       case 0xd2 ... 0xd3:     /* Grp2 */
-               src.val = _regs[VCPU_REGS_RCX];
-               goto grp2;
-       case 0xf6 ... 0xf7:     /* Grp3 */
-               switch (modrm_reg) {
-               case 0 ... 1:   /* test */
-                       /*
-                        * Special case in Grp3: test has an immediate
-                        * source operand.
-                        */
-                       src.type = OP_IMM;
-                       src.ptr = (unsigned long *)_eip;
-                       src.bytes = (d & ByteOp) ? 1 : op_bytes;
-                       if (src.bytes == 8)
-                               src.bytes = 4;
-                       switch (src.bytes) {
-                       case 1:
-                               src.val = insn_fetch(s8, 1, _eip);
-                               break;
-                       case 2:
-                               src.val = insn_fetch(s16, 2, _eip);
-                               break;
-                       case 4:
-                               src.val = insn_fetch(s32, 4, _eip);
-                               break;
-                       }
-                       goto test;
-               case 2: /* not */
-                       dst.val = ~dst.val;
-                       break;
-               case 3: /* neg */
-                       emulate_1op("neg", dst, _eflags);
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
-               break;
-       case 0xfe ... 0xff:     /* Grp4/Grp5 */
-               switch (modrm_reg) {
-               case 0: /* inc */
-                       emulate_1op("inc", dst, _eflags);
-                       break;
-               case 1: /* dec */
-                       emulate_1op("dec", dst, _eflags);
-                       break;
-               case 4: /* jmp abs */
-                       if (b == 0xff)
-                               _eip = dst.val;
-                       else
-                               goto cannot_emulate;
-                       break;
-               case 6: /* push */
-                       /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-                       if (mode == X86EMUL_MODE_PROT64) {
-                               dst.bytes = 8;
-                               if ((rc = ops->read_std((unsigned long)dst.ptr,
-                                                       &dst.val, 8,
-                                                       ctxt->vcpu)) != 0)
-                                       goto done;
-                       }
-                       register_address_increment(_regs[VCPU_REGS_RSP],
-                                                  -dst.bytes);
-                       if ((rc = ops->write_emulated(
-                                    register_address(ctxt->ss_base,
-                                                     _regs[VCPU_REGS_RSP]),
-                                    &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-                               goto done;
-                       no_wb = 1;
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
-               break;
-       }
-
-writeback:
-       if (!no_wb) {
-               switch (dst.type) {
-               case OP_REG:
-                       /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-                       switch (dst.bytes) {
-                       case 1:
-                               *(u8 *)dst.ptr = (u8)dst.val;
-                               break;
-                       case 2:
-                               *(u16 *)dst.ptr = (u16)dst.val;
-                               break;
-                       case 4:
-                               *dst.ptr = (u32)dst.val;
-                               break;  /* 64b: zero-ext */
-                       case 8:
-                               *dst.ptr = dst.val;
-                               break;
-                       }
-                       break;
-               case OP_MEM:
-                       if (lock_prefix)
-                               rc = ops->cmpxchg_emulated((unsigned long)dst.
-                                                          ptr, &dst.orig_val,
-                                                          &dst.val, dst.bytes,
-                                                          ctxt->vcpu);
-                       else
-                               rc = ops->write_emulated((unsigned long)dst.ptr,
-                                                        &dst.val, dst.bytes,
-                                                        ctxt->vcpu);
-                       if (rc != 0)
-                               goto done;
-               default:
-                       break;
-               }
-       }
-
-       /* Commit shadow register state. */
-       memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
-       ctxt->eflags = _eflags;
-       ctxt->vcpu->rip = _eip;
-
-done:
-       return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-
-special_insn:
-       if (twobyte)
-               goto twobyte_special_insn;
-       switch(b) {
-       case 0x50 ... 0x57:  /* push reg */
-               if (op_bytes == 2)
-                       src.val = (u16) _regs[b & 0x7];
-               else
-                       src.val = (u32) _regs[b & 0x7];
-               dst.type  = OP_MEM;
-               dst.bytes = op_bytes;
-               dst.val = src.val;
-               register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-               dst.ptr = (void *) register_address(
-                       ctxt->ss_base, _regs[VCPU_REGS_RSP]);
-               break;
-       case 0x58 ... 0x5f: /* pop reg */
-               dst.ptr = (unsigned long *)&_regs[b & 0x7];
-       pop_instruction:
-               if ((rc = ops->read_std(register_address(ctxt->ss_base,
-                       _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
-                       != 0)
-                       goto done;
-
-               register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
-               no_wb = 1; /* Disable writeback. */
-               break;
-       case 0x6a: /* push imm8 */
-               src.val = 0L;
-               src.val = insn_fetch(s8, 1, _eip);
-       push:
-               dst.type  = OP_MEM;
-               dst.bytes = op_bytes;
-               dst.val = src.val;
-               register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
-               dst.ptr = (void *) register_address(ctxt->ss_base,
-                                                       _regs[VCPU_REGS_RSP]);
-               break;
-       case 0x6c:              /* insb */
-       case 0x6d:              /* insw/insd */
-                if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               1,                                      /* in */
-                               (d & ByteOp) ? 1 : op_bytes,            /* size */
-                               rep_prefix ?
-                               address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
-                               (_eflags & EFLG_DF),                    /* down */
-                               register_address(ctxt->es_base,
-                                                _regs[VCPU_REGS_RDI]), /* address */
-                               rep_prefix,
-                               _regs[VCPU_REGS_RDX]                    /* port */
-                               ) == 0)
-                       return -1;
-               return 0;
-       case 0x6e:              /* outsb */
-       case 0x6f:              /* outsw/outsd */
-               if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
-                               0,                                      /* in */
-                               (d & ByteOp) ? 1 : op_bytes,            /* size */
-                               rep_prefix ?
-                               address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
-                               (_eflags & EFLG_DF),                    /* down */
-                               register_address(override_base ?
-                                                *override_base : ctxt->ds_base,
-                                                _regs[VCPU_REGS_RSI]), /* address */
-                               rep_prefix,
-                               _regs[VCPU_REGS_RDX]                    /* port */
-                               ) == 0)
-                       return -1;
-               return 0;
-       case 0x70 ... 0x7f: /* jcc (short) */ {
-               int rel = insn_fetch(s8, 1, _eip);
-
-               if (test_cc(b, _eflags))
-               JMP_REL(rel);
-               break;
-       }
-       case 0x9c: /* pushf */
-               src.val =  (unsigned long) _eflags;
-               goto push;
-       case 0x9d: /* popf */
-               dst.ptr = (unsigned long *) &_eflags;
-               goto pop_instruction;
-       case 0xc3: /* ret */
-               dst.ptr = &_eip;
-               goto pop_instruction;
-       case 0xf4:              /* hlt */
-               ctxt->vcpu->halt_request = 1;
-               goto done;
-       }
-       if (rep_prefix) {
-               if (_regs[VCPU_REGS_RCX] == 0) {
-                       ctxt->vcpu->rip = _eip;
-                       goto done;
-               }
-               _regs[VCPU_REGS_RCX]--;
-               _eip = ctxt->vcpu->rip;
-       }
-       switch (b) {
-       case 0xa4 ... 0xa5:     /* movs */
-               dst.type = OP_MEM;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)register_address(ctxt->es_base,
-                                                       _regs[VCPU_REGS_RDI]);
-               if ((rc = ops->read_emulated(register_address(
-                     override_base ? *override_base : ctxt->ds_base,
-                     _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
-                       goto done;
-               register_address_increment(_regs[VCPU_REGS_RSI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-               register_address_increment(_regs[VCPU_REGS_RDI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-               break;
-       case 0xa6 ... 0xa7:     /* cmps */
-               DPRINTF("Urk! I don't handle CMPS.\n");
-               goto cannot_emulate;
-       case 0xaa ... 0xab:     /* stos */
-               dst.type = OP_MEM;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)cr2;
-               dst.val = _regs[VCPU_REGS_RAX];
-               register_address_increment(_regs[VCPU_REGS_RDI],
-                            (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-               break;
-       case 0xac ... 0xad:     /* lods */
-               dst.type = OP_REG;
-               dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-               dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-               if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
-                                            ctxt->vcpu)) != 0)
-                       goto done;
-               register_address_increment(_regs[VCPU_REGS_RSI],
-                          (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
-               break;
-       case 0xae ... 0xaf:     /* scas */
-               DPRINTF("Urk! I don't handle SCAS.\n");
-               goto cannot_emulate;
-       case 0xe8: /* call (near) */ {
-               long int rel;
-               switch (op_bytes) {
-               case 2:
-                       rel = insn_fetch(s16, 2, _eip);
-                       break;
-               case 4:
-                       rel = insn_fetch(s32, 4, _eip);
-                       break;
-               case 8:
-                       rel = insn_fetch(s64, 8, _eip);
-                       break;
-               default:
-                       DPRINTF("Call: Invalid op_bytes\n");
-                       goto cannot_emulate;
-               }
-               src.val = (unsigned long) _eip;
-               JMP_REL(rel);
-               op_bytes = ad_bytes;
-               goto push;
-       }
-       case 0xe9: /* jmp rel */
-       case 0xeb: /* jmp rel short */
-               JMP_REL(src.val);
-               no_wb = 1; /* Disable writeback. */
-               break;
-
-
-       }
-       goto writeback;
-
-twobyte_insn:
-       switch (b) {
-       case 0x01: /* lgdt, lidt, lmsw */
-               /* Disable writeback. */
-               no_wb = 1;
-               switch (modrm_reg) {
-                       u16 size;
-                       unsigned long address;
-
-               case 2: /* lgdt */
-                       rc = read_descriptor(ctxt, ops, src.ptr,
-                                            &size, &address, op_bytes);
-                       if (rc)
-                               goto done;
-                       realmode_lgdt(ctxt->vcpu, size, address);
-                       break;
-               case 3: /* lidt */
-                       rc = read_descriptor(ctxt, ops, src.ptr,
-                                            &size, &address, op_bytes);
-                       if (rc)
-                               goto done;
-                       realmode_lidt(ctxt->vcpu, size, address);
-                       break;
-               case 4: /* smsw */
-                       if (modrm_mod != 3)
-                               goto cannot_emulate;
-                       *(u16 *)&_regs[modrm_rm]
-                               = realmode_get_cr(ctxt->vcpu, 0);
-                       break;
-               case 6: /* lmsw */
-                       if (modrm_mod != 3)
-                               goto cannot_emulate;
-                       realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
-                       break;
-               case 7: /* invlpg*/
-                       emulate_invlpg(ctxt->vcpu, cr2);
-                       break;
-               default:
-                       goto cannot_emulate;
-               }
-               break;
-       case 0x21: /* mov from dr to reg */
-               no_wb = 1;
-               if (modrm_mod != 3)
-                       goto cannot_emulate;
-               rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
-               break;
-       case 0x23: /* mov from reg to dr */
-               no_wb = 1;
-               if (modrm_mod != 3)
-                       goto cannot_emulate;
-               rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
-               break;
-       case 0x40 ... 0x4f:     /* cmov */
-               dst.val = dst.orig_val = src.val;
-               no_wb = 1;
-               /*
-                * First, assume we're decoding an even cmov opcode
-                * (lsb == 0).
-                */
-               switch ((b & 15) >> 1) {
-               case 0: /* cmovo */
-                       no_wb = (_eflags & EFLG_OF) ? 0 : 1;
-                       break;
-               case 1: /* cmovb/cmovc/cmovnae */
-                       no_wb = (_eflags & EFLG_CF) ? 0 : 1;
-                       break;
-               case 2: /* cmovz/cmove */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       break;
-               case 3: /* cmovbe/cmovna */
-                       no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
-                       break;
-               case 4: /* cmovs */
-                       no_wb = (_eflags & EFLG_SF) ? 0 : 1;
-                       break;
-               case 5: /* cmovp/cmovpe */
-                       no_wb = (_eflags & EFLG_PF) ? 0 : 1;
-                       break;
-               case 7: /* cmovle/cmovng */
-                       no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
-                       /* fall through */
-               case 6: /* cmovl/cmovnge */
-                       no_wb &= (!(_eflags & EFLG_SF) !=
-                             !(_eflags & EFLG_OF)) ? 0 : 1;
-                       break;
-               }
-               /* Odd cmov opcodes (lsb == 1) have inverted sense. */
-               no_wb ^= b & 1;
-               break;
-       case 0xa3:
-             bt:               /* bt */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
-               break;
-       case 0xab:
-             bts:              /* bts */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
-               break;
-       case 0xb0 ... 0xb1:     /* cmpxchg */
-               /*
-                * Save real source value, then compare EAX against
-                * destination.
-                */
-               src.orig_val = src.val;
-               src.val = _regs[VCPU_REGS_RAX];
-               emulate_2op_SrcV("cmp", src, dst, _eflags);
-               if (_eflags & EFLG_ZF) {
-                       /* Success: write back to memory. */
-                       dst.val = src.orig_val;
-               } else {
-                       /* Failure: write the value we saw to EAX. */
-                       dst.type = OP_REG;
-                       dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
-               }
-               break;
-       case 0xb3:
-             btr:              /* btr */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
-               break;
-       case 0xb6 ... 0xb7:     /* movzx */
-               dst.bytes = op_bytes;
-               dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
-               break;
-       case 0xba:              /* Grp8 */
-               switch (modrm_reg & 3) {
-               case 0:
-                       goto bt;
-               case 1:
-                       goto bts;
-               case 2:
-                       goto btr;
-               case 3:
-                       goto btc;
-               }
-               break;
-       case 0xbb:
-             btc:              /* btc */
-               src.val &= (dst.bytes << 3) - 1; /* only subword offset */
-               emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
-               break;
-       case 0xbe ... 0xbf:     /* movsx */
-               dst.bytes = op_bytes;
-               dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
-               break;
-       case 0xc3:              /* movnti */
-               dst.bytes = op_bytes;
-               dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
-               break;
-       }
-       goto writeback;
-
-twobyte_special_insn:
-       /* Disable writeback. */
-       no_wb = 1;
-       switch (b) {
-       case 0x06:
-               emulate_clts(ctxt->vcpu);
-               break;
-       case 0x08:              /* invd */
-               break;
-       case 0x09:              /* wbinvd */
-               break;
-       case 0x0d:              /* GrpP (prefetch) */
-       case 0x18:              /* Grp16 (prefetch/nop) */
-               break;
-       case 0x20: /* mov cr, reg */
-               if (modrm_mod != 3)
-                       goto cannot_emulate;
-               _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
-               break;
-       case 0x22: /* mov reg, cr */
-               if (modrm_mod != 3)
-                       goto cannot_emulate;
-               realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
-               break;
-       case 0x30:
-               /* wrmsr */
-               msr_data = (u32)_regs[VCPU_REGS_RAX]
-                       | ((u64)_regs[VCPU_REGS_RDX] << 32);
-               rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
-               if (rc) {
-                       kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-                       _eip = ctxt->vcpu->rip;
-               }
-               rc = X86EMUL_CONTINUE;
-               break;
-       case 0x32:
-               /* rdmsr */
-               rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
-               if (rc) {
-                       kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
-                       _eip = ctxt->vcpu->rip;
-               } else {
-                       _regs[VCPU_REGS_RAX] = (u32)msr_data;
-                       _regs[VCPU_REGS_RDX] = msr_data >> 32;
-               }
-               rc = X86EMUL_CONTINUE;
-               break;
-       case 0x80 ... 0x8f: /* jnz rel, etc*/ {
-               long int rel;
-
-               switch (op_bytes) {
-               case 2:
-                       rel = insn_fetch(s16, 2, _eip);
-                       break;
-               case 4:
-                       rel = insn_fetch(s32, 4, _eip);
-                       break;
-               case 8:
-                       rel = insn_fetch(s64, 8, _eip);
-                       break;
-               default:
-                       DPRINTF("jnz: Invalid op_bytes\n");
-                       goto cannot_emulate;
-               }
-               if (test_cc(b, _eflags))
-                       JMP_REL(rel);
-               break;
-       }
-       case 0xc7:              /* Grp9 (cmpxchg8b) */
-               {
-                       u64 old, new;
-                       if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
-                                                                       != 0)
-                               goto done;
-                       if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
-                           ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
-                               _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-                               _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
-                               _eflags &= ~EFLG_ZF;
-                       } else {
-                               new = ((u64)_regs[VCPU_REGS_RCX] << 32)
-                                       | (u32) _regs[VCPU_REGS_RBX];
-                               if ((rc = ops->cmpxchg_emulated(cr2, &old,
-                                                         &new, 8, ctxt->vcpu)) != 0)
-                                       goto done;
-                               _eflags |= EFLG_ZF;
-                       }
-                       break;
-               }
-       }
-       goto writeback;
-
-cannot_emulate:
-       DPRINTF("Cannot emulate %02x\n", b);
-       return -1;
-}
-
-#ifdef __XEN__
-
-#include <asm/mm.h>
-#include <asm/uaccess.h>
-
-int
-x86_emulate_read_std(unsigned long addr,
-                    unsigned long *val,
-                    unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-       unsigned int rc;
-
-       *val = 0;
-
-       if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
-               propagate_page_fault(addr + bytes - rc, 0);     /* read fault */
-               return X86EMUL_PROPAGATE_FAULT;
-       }
-
-       return X86EMUL_CONTINUE;
-}
-
-int
-x86_emulate_write_std(unsigned long addr,
-                     unsigned long val,
-                     unsigned int bytes, struct x86_emulate_ctxt *ctxt)
-{
-       unsigned int rc;
-
-       if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
-               propagate_page_fault(addr + bytes - rc, PGERR_write_access);
-               return X86EMUL_PROPAGATE_FAULT;
-       }
-
-       return X86EMUL_CONTINUE;
-}
-
-#endif
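
The emulator removed above drives nearly all of its operand decoding off the ModRM byte, splitting it into mod/reg/rm fields with the shifts and masks seen in x86_emulate_memop(). A minimal user-space sketch of that decomposition (illustration only, not part of this commit):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t modrm = 0xd8;                 /* example byte: 11 011 000 */
	unsigned mod = (modrm & 0xc0) >> 6;   /* top two bits             */
	unsigned reg = (modrm & 0x38) >> 3;   /* middle three bits        */
	unsigned rm  = modrm & 0x07;          /* low three bits           */

	/* mod == 3 means the r/m field names a register, not memory. */
	printf("mod=%u reg=%u rm=%u (%s operand)\n",
	       mod, reg, rm, mod == 3 ? "register" : "memory");
	return 0;
}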
index cb4c67025d52ae5c6c4826a855695ec09dcf29e6..7743d73768df273c008b700f8165ec5dafd668ec 100644 (file)
@@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg,
 /* This routine copies memory from the Guest.  Here we can see how useful the
  * kill_lguest() routine we met in the Launcher can be: we return a random
  * value (all zeroes) instead of needing to return an error. */
-void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
 {
-       if (!lguest_address_ok(lg, addr, bytes)
-           || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
+       if (!lguest_address_ok(cpu->lg, addr, bytes)
+           || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
                /* copy_from_user should do this, but as we rely on it... */
                memset(b, 0, bytes);
-               kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
+               kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
        }
 }
 
 /* This is the write (copy into guest) version. */
-void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
               unsigned bytes)
 {
-       if (!lguest_address_ok(lg, addr, bytes)
-           || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
-               kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
+       if (!lguest_address_ok(cpu->lg, addr, bytes)
+           || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
+               kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
 }
 /*:*/
 
 /*H:030 Let's jump straight to the main loop which runs the Guest.
  * Remember, this is called by the Launcher reading /dev/lguest, and we keep
  * going around and around until something interesting happens. */
-int run_guest(struct lguest *lg, unsigned long __user *user)
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
        /* We stop running once the Guest is dead. */
-       while (!lg->dead) {
+       while (!cpu->lg->dead) {
                /* First we run any hypercalls the Guest wants done. */
-               if (lg->hcall)
-                       do_hypercalls(lg);
+               if (cpu->hcall)
+                       do_hypercalls(cpu);
 
                /* It's possible the Guest did a NOTIFY hypercall to the
                 * Launcher, in which case we return from the read() now. */
-               if (lg->pending_notify) {
-                       if (put_user(lg->pending_notify, user))
+               if (cpu->pending_notify) {
+                       if (put_user(cpu->pending_notify, user))
                                return -EFAULT;
-                       return sizeof(lg->pending_notify);
+                       return sizeof(cpu->pending_notify);
                }
 
                /* Check for signals */
@@ -195,13 +195,13 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
                        return -ERESTARTSYS;
 
                /* If Waker set break_out, return to Launcher. */
-               if (lg->break_out)
+               if (cpu->break_out)
                        return -EAGAIN;
 
                /* Check if there are any interrupts which can be delivered
                 * now: if so, this sets up the handler to be executed when we
                 * next run the Guest. */
-               maybe_do_interrupt(lg);
+               maybe_do_interrupt(cpu);
 
                /* All long-lived kernel loops need to check with this horrible
                 * thing called the freezer.  If the Host is trying to suspend,
@@ -210,12 +210,12 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
 
                /* Just make absolutely sure the Guest is still alive.  One of
                 * those hypercalls could have been fatal, for example. */
-               if (lg->dead)
+               if (cpu->lg->dead)
                        break;
 
                /* If the Guest asked to be stopped, we sleep.  The Guest's
                 * clock timer or LHCALL_BREAK from the Waker will wake us. */
-               if (lg->halted) {
+               if (cpu->halted) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule();
                        continue;
@@ -226,15 +226,17 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
                local_irq_disable();
 
                /* Actually run the Guest until something happens. */
-               lguest_arch_run_guest(lg);
+               lguest_arch_run_guest(cpu);
 
                /* Now we're ready to be interrupted or moved to other CPUs */
                local_irq_enable();
 
                /* Now we deal with whatever happened to the Guest. */
-               lguest_arch_handle_trap(lg);
+               lguest_arch_handle_trap(cpu);
        }
 
+       if (cpu->lg->dead == ERR_PTR(-ERESTART))
+               return -ERESTART;
        /* The Guest is dead => "No such file or directory" */
        return -ENOENT;
 }
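
For orientation, every one of those return values feeds straight back into the Launcher's read() loop on /dev/lguest. The sketch below is a hypothetical Launcher-side loop, not part of this patch; lguest_fd, handle_notify() and restart_guest() are assumed names modelled on Documentation/lguest/lguest.c, and the usual <err.h>, <errno.h> and <unistd.h> includes are taken for granted.

    /* Hypothetical Launcher loop: keep re-entering the Guest and act on the
     * reason run_guest() gives for coming back. */
    for (;;) {
            unsigned long notify_addr;
            int readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));

            if (readval == sizeof(notify_addr)) {
                    handle_notify(notify_addr);        /* LHCALL_NOTIFY */
            } else if (errno == ENOENT) {              /* Guest is dead */
                    char why[1024] = { 0 };
                    read(lguest_fd, why, sizeof(why) - 1);
                    errx(1, "%s", why);
            } else if (errno == ERESTART) {            /* new in this patch */
                    restart_guest(lguest_fd);
            } else if (errno != EAGAIN) {              /* EAGAIN: Waker break */
                    err(1, "Running guest failed");
            }
    }
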
@@ -253,7 +255,7 @@ static int __init init(void)
 
        /* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
        if (paravirt_enabled()) {
-               printk("lguest is afraid of %s\n", pv_info.name);
+               printk("lguest is afraid of being a guest\n");
                return -EPERM;
        }
 
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index b478affe8f91331733a288fa039fce50012e8409..0f2cb4fd7c6980448e01770476e262d6ea708d70 100644 (file)
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/mm.h>
+#include <linux/ktime.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include "lg.h"
 
 /*H:120 This is the core hypercall routine: where the Guest gets what it wants.
  * Or gets killed.  Or, in the case of LHCALL_CRASH, both. */
-static void do_hcall(struct lguest *lg, struct hcall_args *args)
+static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 {
        switch (args->arg0) {
        case LHCALL_FLUSH_ASYNC:
@@ -39,60 +40,62 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
        case LHCALL_LGUEST_INIT:
                /* You can't get here unless you're already initialized.  Don't
                 * do that. */
-               kill_guest(lg, "already have lguest_data");
+               kill_guest(cpu, "already have lguest_data");
                break;
-       case LHCALL_CRASH: {
-               /* Crash is such a trivial hypercall that we do it in four
+       case LHCALL_SHUTDOWN: {
+               /* Shutdown is such a trivial hypercall that we do it in four
                 * lines right here. */
                char msg[128];
                /* If the lgread fails, it will call kill_guest() itself; the
                 * kill_guest() with the message will be ignored. */
-               __lgread(lg, msg, args->arg1, sizeof(msg));
+               __lgread(cpu, msg, args->arg1, sizeof(msg));
                msg[sizeof(msg)-1] = '\0';
-               kill_guest(lg, "CRASH: %s", msg);
+               kill_guest(cpu, "CRASH: %s", msg);
+               if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
+                       cpu->lg->dead = ERR_PTR(-ERESTART);
                break;
        }
        case LHCALL_FLUSH_TLB:
                /* FLUSH_TLB comes in two flavors, depending on the
                 * argument: */
                if (args->arg1)
-                       guest_pagetable_clear_all(lg);
+                       guest_pagetable_clear_all(cpu);
                else
-                       guest_pagetable_flush_user(lg);
+                       guest_pagetable_flush_user(cpu);
                break;
 
        /* All these calls simply pass the arguments through to the right
         * routines. */
        case LHCALL_NEW_PGTABLE:
-               guest_new_pagetable(lg, args->arg1);
+               guest_new_pagetable(cpu, args->arg1);
                break;
        case LHCALL_SET_STACK:
-               guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
+               guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
                break;
        case LHCALL_SET_PTE:
-               guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
+               guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
                break;
        case LHCALL_SET_PMD:
-               guest_set_pmd(lg, args->arg1, args->arg2);
+               guest_set_pmd(cpu->lg, args->arg1, args->arg2);
                break;
        case LHCALL_SET_CLOCKEVENT:
-               guest_set_clockevent(lg, args->arg1);
+               guest_set_clockevent(cpu, args->arg1);
                break;
        case LHCALL_TS:
                /* This sets the TS flag, as we saw used in run_guest(). */
-               lg->ts = args->arg1;
+               cpu->ts = args->arg1;
                break;
        case LHCALL_HALT:
                /* Similarly, this sets the halted flag for run_guest(). */
-               lg->halted = 1;
+               cpu->halted = 1;
                break;
        case LHCALL_NOTIFY:
-               lg->pending_notify = args->arg1;
+               cpu->pending_notify = args->arg1;
                break;
        default:
                /* It should be an architecture-specific hypercall. */
-               if (lguest_arch_do_hcall(lg, args))
-                       kill_guest(lg, "Bad hypercall %li\n", args->arg0);
+               if (lguest_arch_do_hcall(cpu, args))
+                       kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
        }
 }
 /*:*/
@@ -104,13 +107,13 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
  * Guest put them in the ring, but we also promise the Guest that they will
  * happen before any normal hypercall (which is why we check this before
  * checking for a normal hcall). */
-static void do_async_hcalls(struct lguest *lg)
+static void do_async_hcalls(struct lg_cpu *cpu)
 {
        unsigned int i;
        u8 st[LHCALL_RING_SIZE];
 
        /* For simplicity, we copy the entire call status array in at once. */
-       if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+       if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
                return;
 
        /* We process "struct lguest_data"s hcalls[] ring once. */
@@ -119,7 +122,7 @@ static void do_async_hcalls(struct lguest *lg)
                /* We remember where we were up to from last time.  This makes
                 * sure that the hypercalls are done in the order the Guest
                 * places them in the ring. */
-               unsigned int n = lg->next_hcall;
+               unsigned int n = cpu->next_hcall;
 
                /* 0xFF means there's no call here (yet). */
                if (st[n] == 0xFF)
@@ -127,65 +130,65 @@ static void do_async_hcalls(struct lguest *lg)
 
                /* OK, we have a hypercall.  Increment the "next_hcall" cursor,
                 * and wrap back to 0 if we reach the end. */
-               if (++lg->next_hcall == LHCALL_RING_SIZE)
-                       lg->next_hcall = 0;
+               if (++cpu->next_hcall == LHCALL_RING_SIZE)
+                       cpu->next_hcall = 0;
 
                /* Copy the hypercall arguments into a local copy of
                 * the hcall_args struct. */
-               if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
+               if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
                                   sizeof(struct hcall_args))) {
-                       kill_guest(lg, "Fetching async hypercalls");
+                       kill_guest(cpu, "Fetching async hypercalls");
                        break;
                }
 
                /* Do the hypercall, same as a normal one. */
-               do_hcall(lg, &args);
+               do_hcall(cpu, &args);
 
                /* Mark the hypercall done. */
-               if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
-                       kill_guest(lg, "Writing result for async hypercall");
+               if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
+                       kill_guest(cpu, "Writing result for async hypercall");
                        break;
                }
 
                /* Stop doing hypercalls if they want to notify the Launcher:
                 * it needs to service this first. */
-               if (lg->pending_notify)
+               if (cpu->pending_notify)
                        break;
        }
 }
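
The ring being drained here is filled in from the other side by the Guest. A rough sketch of that Guest-side producer, modelled on the async_hcall() helper in the lguest paravirt code (simplified and uniprocessor-only, so treat the details as assumptions), looks like this:

    static unsigned int next_call;

    static void async_hcall(unsigned long call, unsigned long arg1,
                            unsigned long arg2, unsigned long arg3)
    {
            /* If this slot is still in use (status != 0xFF), fall back to a
             * normal hypercall, which drains the whole ring first. */
            if (lguest_data.hcall_status[next_call] != 0xFF) {
                    hcall(call, arg1, arg2, arg3);
                    return;
            }
            lguest_data.hcalls[next_call].arg0 = call;
            lguest_data.hcalls[next_call].arg1 = arg1;
            lguest_data.hcalls[next_call].arg2 = arg2;
            lguest_data.hcalls[next_call].arg3 = arg3;
            wmb();                                 /* args before "go" flag */
            lguest_data.hcall_status[next_call] = 0;
            if (++next_call == LHCALL_RING_SIZE)
                    next_call = 0;
    }
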
 
 /* Last of all, we look at what happens first of all.  The very first time the
  * Guest makes a hypercall, we end up here to set things up: */
-static void initialize(struct lguest *lg)
+static void initialize(struct lg_cpu *cpu)
 {
        /* You can't do anything until you're initialized.  The Guest knows the
         * rules, so we're unforgiving here. */
-       if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
-               kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
+       if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
+               kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
                return;
        }
 
-       if (lguest_arch_init_hypercalls(lg))
-               kill_guest(lg, "bad guest page %p", lg->lguest_data);
+       if (lguest_arch_init_hypercalls(cpu))
+               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
        /* The Guest tells us where we're not to deliver interrupts by putting
         * the range of addresses into "struct lguest_data". */
-       if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
-           || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
-               kill_guest(lg, "bad guest page %p", lg->lguest_data);
+       if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
+           || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
+               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
        /* We write the current time into the Guest's data page once so it can
         * set its clock. */
-       write_timestamp(lg);
+       write_timestamp(cpu);
 
        /* page_tables.c will also do some setup. */
-       page_table_guest_data_init(lg);
+       page_table_guest_data_init(cpu);
 
        /* This is the one case where the above accesses might have been the
         * first write to a Guest page.  This may have caused a copy-on-write
         * fault, but the old page might be (read-only) in the Guest
         * pagetable. */
-       guest_pagetable_clear_all(lg);
+       guest_pagetable_clear_all(cpu);
 }
 
 /*H:100
@@ -194,27 +197,27 @@ static void initialize(struct lguest *lg)
  * Remember from the Guest, hypercalls come in two flavors: normal and
  * asynchronous.  This file handles both types.
  */
-void do_hypercalls(struct lguest *lg)
+void do_hypercalls(struct lg_cpu *cpu)
 {
        /* Not initialized yet?  This hypercall must do it. */
-       if (unlikely(!lg->lguest_data)) {
+       if (unlikely(!cpu->lg->lguest_data)) {
                /* Set up the "struct lguest_data" */
-               initialize(lg);
+               initialize(cpu);
                /* Hcall is done. */
-               lg->hcall = NULL;
+               cpu->hcall = NULL;
                return;
        }
 
        /* The Guest has initialized.
         *
         * Look in the hypercall ring for the async hypercalls: */
-       do_async_hcalls(lg);
+       do_async_hcalls(cpu);
 
        /* If we stopped reading the hypercall ring because the Guest did a
         * NOTIFY to the Launcher, we want to return now.  Otherwise we do
         * the hypercall. */
-       if (!lg->pending_notify) {
-               do_hcall(lg, lg->hcall);
+       if (!cpu->pending_notify) {
+               do_hcall(cpu, cpu->hcall);
                /* Tricky point: we reset the hcall pointer to mark the
                 * hypercall as "done".  We use the hcall pointer rather than
                 * the trap number to indicate a hypercall is pending.
@@ -225,16 +228,17 @@ void do_hypercalls(struct lguest *lg)
                 * Launcher, the run_guest() loop will exit without running the
                 * Guest.  When it comes back it would try to re-run the
                 * hypercall. */
-               lg->hcall = NULL;
+               cpu->hcall = NULL;
        }
 }
 
 /* This routine supplies the Guest with time: it's used for wallclock time at
  * initial boot and as a rough time source if the TSC isn't available. */
-void write_timestamp(struct lguest *lg)
+void write_timestamp(struct lg_cpu *cpu)
 {
        struct timespec now;
        ktime_get_real_ts(&now);
-       if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
-               kill_guest(lg, "Writing timestamp");
+       if (copy_to_user(&cpu->lg->lguest_data->time,
+                        &now, sizeof(struct timespec)))
+               kill_guest(cpu, "Writing timestamp");
 }
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 2b66f79c208b519d19937af625ae64a8b8fc70a5..32e97c1858e571a2608c678e602755a7f3b7aa38 100644 (file)
@@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi)
 
 /* We need a helper to "push" a value onto the Guest's stack, since that's a
  * big part of what delivering an interrupt does. */
-static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
+static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
 {
        /* Stack grows upwards: move stack then write value. */
        *gstack -= 4;
-       lgwrite(lg, *gstack, u32, val);
+       lgwrite(cpu, *gstack, u32, val);
 }
 
 /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
@@ -60,7 +60,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
  * We set up the stack just like the CPU does for a real interrupt, so it's
  * identical for the Guest (and the standard "iret" instruction will undo
  * it). */
-static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err)
 {
        unsigned long gstack, origstack;
        u32 eflags, ss, irq_enable;
@@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
        /* There are two cases for interrupts: one where the Guest is already
         * in the kernel, and a more complex one where the Guest is in
         * userspace.  We check the privilege level to find out. */
-       if ((lg->regs->ss&0x3) != GUEST_PL) {
+       if ((cpu->regs->ss&0x3) != GUEST_PL) {
                /* The Guest told us their kernel stack with the SET_STACK
                 * hypercall: both the virtual address and the segment */
-               virtstack = lg->esp1;
-               ss = lg->ss1;
+               virtstack = cpu->esp1;
+               ss = cpu->ss1;
 
-               origstack = gstack = guest_pa(lg, virtstack);
+               origstack = gstack = guest_pa(cpu, virtstack);
                /* We push the old stack segment and pointer onto the new
                 * stack: when the Guest does an "iret" back from the interrupt
                 * handler the CPU will notice they're dropping privilege
                 * levels and expect these here. */
-               push_guest_stack(lg, &gstack, lg->regs->ss);
-               push_guest_stack(lg, &gstack, lg->regs->esp);
+               push_guest_stack(cpu, &gstack, cpu->regs->ss);
+               push_guest_stack(cpu, &gstack, cpu->regs->esp);
        } else {
                /* We're staying on the same Guest (kernel) stack. */
-               virtstack = lg->regs->esp;
-               ss = lg->regs->ss;
+               virtstack = cpu->regs->esp;
+               ss = cpu->regs->ss;
 
-               origstack = gstack = guest_pa(lg, virtstack);
+               origstack = gstack = guest_pa(cpu, virtstack);
        }
 
        /* Remember that we never let the Guest actually disable interrupts, so
         * the "Interrupt Flag" bit is always set.  We copy that bit from the
         * Guest's "irq_enabled" field into the eflags word: we saw the Guest
         * copy it back in "lguest_iret". */
-       eflags = lg->regs->eflags;
-       if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
+       eflags = cpu->regs->eflags;
+       if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
            && !(irq_enable & X86_EFLAGS_IF))
                eflags &= ~X86_EFLAGS_IF;
 
        /* An interrupt is expected to push three things on the stack: the old
         * "eflags" word, the old code segment, and the old instruction
         * pointer. */
-       push_guest_stack(lg, &gstack, eflags);
-       push_guest_stack(lg, &gstack, lg->regs->cs);
-       push_guest_stack(lg, &gstack, lg->regs->eip);
+       push_guest_stack(cpu, &gstack, eflags);
+       push_guest_stack(cpu, &gstack, cpu->regs->cs);
+       push_guest_stack(cpu, &gstack, cpu->regs->eip);
 
        /* For the six traps which supply an error code, we push that, too. */
        if (has_err)
-               push_guest_stack(lg, &gstack, lg->regs->errcode);
+               push_guest_stack(cpu, &gstack, cpu->regs->errcode);
 
        /* Now we've pushed all the old state, we change the stack, the code
         * segment and the address to execute. */
-       lg->regs->ss = ss;
-       lg->regs->esp = virtstack + (gstack - origstack);
-       lg->regs->cs = (__KERNEL_CS|GUEST_PL);
-       lg->regs->eip = idt_address(lo, hi);
+       cpu->regs->ss = ss;
+       cpu->regs->esp = virtstack + (gstack - origstack);
+       cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
+       cpu->regs->eip = idt_address(lo, hi);
 
        /* There are two kinds of interrupt handlers: 0xE is an "interrupt
         * gate" which expects interrupts to be disabled on entry. */
        if (idt_type(lo, hi) == 0xE)
-               if (put_user(0, &lg->lguest_data->irq_enabled))
-                       kill_guest(lg, "Disabling interrupts");
+               if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
+                       kill_guest(cpu, "Disabling interrupts");
 }
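
Pulling that apart: the frame the Guest handler finds at its new esp is the ordinary x86 interrupt frame. No such struct exists in the source; this is purely an illustration of the layout set_guest_interrupt() just built, lowest address first:

    struct guest_irq_frame {          /* illustration only, not in the code */
            u32 errcode;   /* pushed only when has_err(trap) is true        */
            u32 eip;       /* where the Guest will resume after "iret"      */
            u32 cs;        /* old code segment                              */
            u32 eflags;    /* IF reflects lguest_data->irq_enabled          */
            u32 old_esp;   /* these two are present only when the Guest was */
            u32 old_ss;    /* in userspace, i.e. (ss & 3) != GUEST_PL       */
    };
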
 
 /*H:205
@@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
  *
  * maybe_do_interrupt() gets called before every entry to the Guest, to see if
  * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lguest *lg)
+void maybe_do_interrupt(struct lg_cpu *cpu)
 {
        unsigned int irq;
        DECLARE_BITMAP(blk, LGUEST_IRQS);
        struct desc_struct *idt;
 
        /* If the Guest hasn't even initialized yet, we can do nothing. */
-       if (!lg->lguest_data)
+       if (!cpu->lg->lguest_data)
                return;
 
        /* Take our "irqs_pending" array and remove any interrupts the Guest
         * wants blocked: the result ends up in "blk". */
-       if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
+       if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
                           sizeof(blk)))
                return;
 
-       bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+       bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 
        /* Find the first interrupt. */
        irq = find_first_bit(blk, LGUEST_IRQS);
@@ -155,19 +155,20 @@ void maybe_do_interrupt(struct lguest *lg)
 
        /* They may be in the middle of an iret, where they asked us never to
         * deliver interrupts. */
-       if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+       if (cpu->regs->eip >= cpu->lg->noirq_start &&
+          (cpu->regs->eip < cpu->lg->noirq_end))
                return;
 
        /* If they're halted, interrupts restart them. */
-       if (lg->halted) {
+       if (cpu->halted) {
                /* Re-enable interrupts. */
-               if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
-                       kill_guest(lg, "Re-enabling interrupts");
-               lg->halted = 0;
+               if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
+                       kill_guest(cpu, "Re-enabling interrupts");
+               cpu->halted = 0;
        } else {
                /* Otherwise we check if they have interrupts disabled. */
                u32 irq_enabled;
-               if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
+               if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
                        irq_enabled = 0;
                if (!irq_enabled)
                        return;
@@ -176,15 +177,15 @@ void maybe_do_interrupt(struct lguest *lg)
        /* Look at the IDT entry the Guest gave us for this interrupt.  The
         * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
         * over them. */
-       idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
+       idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
        /* If they don't have a handler (yet?), we just ignore it */
        if (idt_present(idt->a, idt->b)) {
                /* OK, mark it no longer pending and deliver it. */
-               clear_bit(irq, lg->irqs_pending);
+               clear_bit(irq, cpu->irqs_pending);
                /* set_guest_interrupt() takes the interrupt descriptor and a
                 * flag to say whether this interrupt pushes an error code onto
                 * the stack as well: virtual interrupts never do. */
-               set_guest_interrupt(lg, idt->a, idt->b, 0);
+               set_guest_interrupt(cpu, idt->a, idt->b, 0);
        }
 
        /* Every time we deliver an interrupt, we update the timestamp in the
@@ -192,7 +193,7 @@ void maybe_do_interrupt(struct lguest *lg)
         * did this more often, but it can actually be quite slow: doing it
         * here is a compromise which means at least it gets updated every
         * timer interrupt. */
-       write_timestamp(lg);
+       write_timestamp(cpu);
 }
 /*:*/
 
@@ -245,19 +246,19 @@ static int has_err(unsigned int trap)
 }
 
 /* deliver_trap() returns true if it could deliver the trap. */
-int deliver_trap(struct lguest *lg, unsigned int num)
+int deliver_trap(struct lg_cpu *cpu, unsigned int num)
 {
        /* Trap numbers are always 8 bit, but we set an impossible trap number
         * for traps inside the Switcher, so check that here. */
-       if (num >= ARRAY_SIZE(lg->arch.idt))
+       if (num >= ARRAY_SIZE(cpu->arch.idt))
                return 0;
 
        /* Early on the Guest hasn't set the IDT entries (or maybe it put a
         * bogus one in): if we fail here, the Guest will be killed. */
-       if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
+       if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
                return 0;
-       set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
-                           has_err(num));
+       set_guest_interrupt(cpu, cpu->arch.idt[num].a,
+                           cpu->arch.idt[num].b, has_err(num));
        return 1;
 }
 
@@ -309,18 +310,18 @@ static int direct_trap(unsigned int num)
  * the Guest.
  *
  * Which is deeply unfair, because (literally!) it wasn't the Guest's fault. */
-void pin_stack_pages(struct lguest *lg)
+void pin_stack_pages(struct lg_cpu *cpu)
 {
        unsigned int i;
 
        /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or
         * two pages of stack space. */
-       for (i = 0; i < lg->stack_pages; i++)
+       for (i = 0; i < cpu->lg->stack_pages; i++)
                /* The stack grows *upwards*, so the address we're given is the
                 * start of the page after the kernel stack.  Subtract one to
                 * get back onto the first stack page, and keep subtracting to
                 * get to the rest of the stack pages. */
-               pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE);
+               pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
 }
 
 /* Direct traps also mean that we need to know whenever the Guest wants to use
@@ -331,21 +332,21 @@ void pin_stack_pages(struct lguest *lg)
  *
  * In Linux each process has its own kernel stack, so this happens a lot: we
  * change stacks on each context switch. */
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
 {
        /* You are not allowed to have a stack segment with privilege level 0: bad
         * Guest! */
        if ((seg & 0x3) != GUEST_PL)
-               kill_guest(lg, "bad stack segment %i", seg);
+               kill_guest(cpu, "bad stack segment %i", seg);
        /* We only expect one or two stack pages. */
        if (pages > 2)
-               kill_guest(lg, "bad stack pages %u", pages);
+               kill_guest(cpu, "bad stack pages %u", pages);
        /* Save where the stack is, and how many pages */
-       lg->ss1 = seg;
-       lg->esp1 = esp;
-       lg->stack_pages = pages;
+       cpu->ss1 = seg;
+       cpu->esp1 = esp;
+       cpu->lg->stack_pages = pages;
        /* Make sure the new stack pages are mapped */
-       pin_stack_pages(lg);
+       pin_stack_pages(cpu);
 }
 
 /* All this reference to mapping stacks leads us neatly into the other complex
@@ -353,7 +354,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
 
 /*H:235 This is the routine which actually checks the Guest's IDT entry and
  * transfers it into the entry in "struct lguest": */
-static void set_trap(struct lguest *lg, struct desc_struct *trap,
+static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
                     unsigned int num, u32 lo, u32 hi)
 {
        u8 type = idt_type(lo, hi);
@@ -366,7 +367,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
 
        /* We only support interrupt and trap gates. */
        if (type != 0xE && type != 0xF)
-               kill_guest(lg, "bad IDT type %i", type);
+               kill_guest(cpu, "bad IDT type %i", type);
 
        /* We only copy the handler address, present bit, privilege level and
         * type.  The privilege level controls where the trap can be triggered
@@ -383,7 +384,7 @@ static void set_trap(struct lguest *lg, struct desc_struct *trap,
  *
  * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
  * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */
-void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
 {
        /* Guest never handles: NMI, doublefault, spurious interrupt or
         * hypercall.  We ignore when it tries to set them. */
@@ -392,13 +393,13 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
 
        /* Mark the IDT as changed: next time the Guest runs we'll know we have
         * to copy this again. */
-       lg->changed |= CHANGED_IDT;
+       cpu->changed |= CHANGED_IDT;
 
        /* Check that the Guest doesn't try to step outside the bounds. */
-       if (num >= ARRAY_SIZE(lg->arch.idt))
-               kill_guest(lg, "Setting idt entry %u", num);
+       if (num >= ARRAY_SIZE(cpu->arch.idt))
+               kill_guest(cpu, "Setting idt entry %u", num);
        else
-               set_trap(lg, &lg->arch.idt[num], num, lo, hi);
+               set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
 }
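
For completeness, the hypercall arrives from the Guest's paravirt IDT hook. The two lines below are only a hedged sketch of that Guest side; the exact hook signature varies across kernel versions, so treat the names as illustrative:

    /* The Guest keeps its own copy of the IDT entry, then tells the Host. */
    write_dt_entry(dt, entrynum, low, high);
    hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
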
 
 /* The default entry for each interrupt points into the Switcher routines which
@@ -434,14 +435,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
 /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead
  * we copy them into the IDT which we've set up for Guests on this CPU, just
  * before we run the Guest.  This routine does that copy. */
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
                const unsigned long *def)
 {
        unsigned int i;
 
        /* We can simply copy the direct traps, otherwise we use the default
         * ones in the Switcher: they will return to the Host. */
-       for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
+       for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
                /* If no Guest can ever override this trap, leave it alone. */
                if (!direct_trap(i))
                        continue;
@@ -450,8 +451,8 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
                 * Interrupt gates (type 14) disable interrupts as they are
                 * entered, which we never let the Guest do.  Not present
                 * entries (type 0x0) also can't go direct, of course. */
-               if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
-                       idt[i] = lg->arch.idt[i];
+               if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF)
+                       idt[i] = cpu->arch.idt[i];
                else
                        /* Reset it to the default. */
                        default_idt_entry(&idt[i], i, def[i]);
@@ -470,13 +471,13 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
  * infrastructure to set a callback at that time.
  *
  * 0 means "turn off the clock". */
-void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
 {
        ktime_t expires;
 
        if (unlikely(delta == 0)) {
                /* Clock event device is shutting down. */
-               hrtimer_cancel(&lg->hrt);
+               hrtimer_cancel(&cpu->hrt);
                return;
        }
 
@@ -484,25 +485,25 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
         * all the time between now and the timer interrupt it asked for.  This
         * is almost always the right thing to do. */
        expires = ktime_add_ns(ktime_get_real(), delta);
-       hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
+       hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
 }
 
 /* This is the function called when the Guest's timer expires. */
 static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 {
-       struct lguest *lg = container_of(timer, struct lguest, hrt);
+       struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
 
        /* Remember the first interrupt is the timer interrupt. */
-       set_bit(0, lg->irqs_pending);
+       set_bit(0, cpu->irqs_pending);
        /* If the Guest is actually stopped, we need to wake it up. */
-       if (lg->halted)
-               wake_up_process(lg->tsk);
+       if (cpu->halted)
+               wake_up_process(cpu->tsk);
        return HRTIMER_NORESTART;
 }
 
 /* This sets up the timer for this Guest. */
-void init_clockdev(struct lguest *lg)
+void init_clockdev(struct lg_cpu *cpu)
 {
-       hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-       lg->hrt.function = clockdev_fn;
+       hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+       cpu->hrt.function = clockdev_fn;
 }
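
The delta handled above originates from the Guest's clock_event_device, which simply forwards it in a hypercall. A minimal sketch of that Guest side (it lives in the lguest boot code, not in this patch, so the exact names are assumptions):

    static int lguest_clockevent_set_next_event(unsigned long delta,
                                                struct clock_event_device *evt)
    {
            /* delta is in nanoseconds; a delta of 0 is reserved for shutdown,
             * which cancels the hrtimer programmed above. */
            hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
            return 0;
    }
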
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 86924891b5eb21e3f8f3f3b19b7c2353290f5351..2337e1a06f023ffd226c3ad9748ca3104c41d9e4 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
 #include <linux/wait.h>
+#include <linux/hrtimer.h>
 #include <linux/err.h>
 #include <asm/semaphore.h>
 
@@ -38,58 +39,72 @@ struct lguest_pages
 #define CHANGED_GDT_TLS                4 /* Actually a subset of CHANGED_GDT */
 #define CHANGED_ALL            3
 
-/* The private info the thread maintains about the guest. */
-struct lguest
-{
-       /* At end of a page shared mapped over lguest_pages in guest.  */
-       unsigned long regs_page;
-       struct lguest_regs *regs;
-       struct lguest_data __user *lguest_data;
+struct lguest;
+
+struct lg_cpu {
+       unsigned int id;
+       struct lguest *lg;
        struct task_struct *tsk;
        struct mm_struct *mm;   /* == tsk->mm, but that becomes NULL on exit */
-       u32 pfn_limit;
-       /* This provides the offset to the base of guest-physical
-        * memory in the Launcher. */
-       void __user *mem_base;
-       unsigned long kernel_address;
+
        u32 cr2;
-       int halted;
        int ts;
-       u32 next_hcall;
        u32 esp1;
        u8 ss1;
 
+       /* Bitmap of what has changed: see CHANGED_* above. */
+       int changed;
+
+       unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+
+       /* At end of a page shared mapped over lguest_pages in guest.  */
+       unsigned long regs_page;
+       struct lguest_regs *regs;
+
+       struct lguest_pages *last_pages;
+
+       int cpu_pgd; /* which pgd this cpu is currently using */
+
        /* If a hypercall was asked for, this points to the arguments. */
        struct hcall_args *hcall;
+       u32 next_hcall;
+
+       /* Virtual clock device */
+       struct hrtimer hrt;
 
        /* Do we need to stop what we're doing and return to userspace? */
        int break_out;
        wait_queue_head_t break_wq;
+       int halted;
 
-       /* Bitmap of what has changed: see CHANGED_* above. */
-       int changed;
-       struct lguest_pages *last_pages;
+       /* Pending virtual interrupts */
+       DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+
+       struct lg_cpu_arch arch;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+       struct lguest_data __user *lguest_data;
+       struct lg_cpu cpus[NR_CPUS];
+       unsigned int nr_cpus;
+
+       u32 pfn_limit;
+       /* This provides the offset to the base of guest-physical
+        * memory in the Launcher. */
+       void __user *mem_base;
+       unsigned long kernel_address;
 
-       /* We keep a small number of these. */
-       u32 pgdidx;
        struct pgdir pgdirs[4];
 
        unsigned long noirq_start, noirq_end;
-       unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
 
        unsigned int stack_pages;
        u32 tsc_khz;
 
        /* Dead? */
        const char *dead;
-
-       struct lguest_arch arch;
-
-       /* Virtual clock device */
-       struct hrtimer hrt;
-
-       /* Pending virtual interrupts */
-       DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
 };
 
 extern struct mutex lguest_lock;
@@ -97,26 +112,26 @@ extern struct mutex lguest_lock;
 /* core.c: */
 int lguest_address_ok(const struct lguest *lg,
                      unsigned long addr, unsigned long len);
-void __lgread(struct lguest *, void *, unsigned long, unsigned);
-void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
+void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
+void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
 
 /*H:035 Using memory-copy operations like that is usually inconvenient, so we
  * have the following helper macros which read and write a specific type (often
  * an unsigned long).
  *
  * This reads into a variable of the given type then returns that. */
-#define lgread(lg, addr, type)                                         \
-       ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })
+#define lgread(cpu, addr, type)                                                \
+       ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
 
 /* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(lg, addr, type, val)                           \
+#define lgwrite(cpu, addr, type, val)                          \
        do {                                                    \
                typecheck(type, val);                           \
-               __lgwrite((lg), (addr), &(val), sizeof(val));   \
+               __lgwrite((cpu), (addr), &(val), sizeof(val));  \
        } while(0)
 /* (end of memory access helper routines) :*/
 
-int run_guest(struct lguest *lg, unsigned long __user *user);
+int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 
 /* Helper macros to obtain the first 12 or the last 20 bits, this is only the
  * first step in the migration to the kernel types.  pte_pfn is already defined
@@ -126,52 +141,53 @@ int run_guest(struct lguest *lg, unsigned long __user *user);
 #define pgd_pfn(x)     (pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lguest *lg);
-int deliver_trap(struct lguest *lg, unsigned int num);
-void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
-void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lguest *lg);
+void maybe_do_interrupt(struct lg_cpu *cpu);
+int deliver_trap(struct lg_cpu *cpu, unsigned int num);
+void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
+                         u32 low, u32 hi);
+void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
+void pin_stack_pages(struct lg_cpu *cpu);
 void setup_default_idt_entries(struct lguest_ro_state *state,
                               const unsigned long *def);
-void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
                const unsigned long *def);
-void guest_set_clockevent(struct lguest *lg, unsigned long delta);
-void init_clockdev(struct lguest *lg);
+void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+void init_clockdev(struct lg_cpu *cpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
 void free_interrupts(void);
 
 /* segments.c: */
 void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lguest *lg);
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
-void guest_load_tls(struct lguest *lg, unsigned long tls_array);
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
+void setup_guest_gdt(struct lg_cpu *cpu);
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num);
+void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 
 /* page_tables.c: */
 int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
 void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-void guest_pagetable_clear_all(struct lguest *lg);
-void guest_pagetable_flush_user(struct lguest *lg);
-void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
+void guest_pagetable_clear_all(struct lg_cpu *cpu);
+void guest_pagetable_flush_user(struct lg_cpu *cpu);
+void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
                   unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
-int demand_page(struct lguest *info, unsigned long cr2, int errcode);
-void pin_page(struct lguest *lg, unsigned long vaddr);
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
-void page_table_guest_data_init(struct lguest *lg);
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
+int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
+void page_table_guest_data_init(struct lg_cpu *cpu);
 
 /* <arch>/core.c: */
 void lguest_arch_host_init(void);
 void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lguest *lg);
-void lguest_arch_handle_trap(struct lguest *lg);
-int lguest_arch_init_hypercalls(struct lguest *lg);
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
+void lguest_arch_run_guest(struct lg_cpu *cpu);
+void lguest_arch_handle_trap(struct lg_cpu *cpu);
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
 
 /* <arch>/switcher.S: */
 extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
@@ -181,8 +197,8 @@ int lguest_device_init(void);
 void lguest_device_remove(void);
 
 /* hypercalls.c: */
-void do_hypercalls(struct lguest *lg);
-void write_timestamp(struct lguest *lg);
+void do_hypercalls(struct lg_cpu *cpu);
+void write_timestamp(struct lg_cpu *cpu);
 
 /*L:035
  * Let's step aside for the moment, to study one important routine that's used
@@ -208,12 +224,12 @@ void write_timestamp(struct lguest *lg);
  * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
  * } while(0)".
  */
-#define kill_guest(lg, fmt...)                                 \
+#define kill_guest(cpu, fmt...)                                        \
 do {                                                           \
-       if (!(lg)->dead) {                                      \
-               (lg)->dead = kasprintf(GFP_ATOMIC, fmt);        \
-               if (!(lg)->dead)                                \
-                       (lg)->dead = ERR_PTR(-ENOMEM);          \
+       if (!(cpu)->lg->dead) {                                 \
+               (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt);   \
+               if (!(cpu)->lg->dead)                           \
+                       (cpu)->lg->dead = ERR_PTR(-ENOMEM);     \
        }                                                       \
 } while(0)
 /* (End of aside) :*/
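
One concrete reason for the do { } while(0) wrapper mentioned above: kill_guest() is routinely used as a single statement under an if, and without the wrapper the dangling else in a call site like the one below would silently bind to the if hidden inside the macro. (Hypothetical call site, shown only to illustrate the point.)

    if (pages > 2)
            kill_guest(cpu, "bad stack pages %u", pages);
    else
            cpu->lg->stack_pages = pages;
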
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 3b92a61ba8d23b14e1ac8e5b342929a314b669be..85d42d3d01a9a9bbeb739ce422271848cd4be3ff 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
 #include "lg.h"
 
 /*L:055 When something happens, the Waker process needs a way to stop the
@@ -13,7 +14,7 @@
  * LHREQ_BREAK and the value "1" to /dev/lguest to do this.  Once the Launcher
  * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
  * the Waker. */
-static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
+static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 {
        unsigned long on;
 
@@ -22,21 +23,21 @@ static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
                return -EFAULT;
 
        if (on) {
-               lg->break_out = 1;
+               cpu->break_out = 1;
                /* Pop it out of the Guest (may be running on different CPU) */
-               wake_up_process(lg->tsk);
+               wake_up_process(cpu->tsk);
                /* Wait for them to reset it */
-               return wait_event_interruptible(lg->break_wq, !lg->break_out);
+               return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
        } else {
-               lg->break_out = 0;
-               wake_up(&lg->break_wq);
+               cpu->break_out = 0;
+               wake_up(&cpu->break_wq);
                return 0;
        }
 }
 
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
  * number to /dev/lguest. */
-static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
+static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 {
        unsigned long irq;
 
@@ -46,7 +47,7 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
                return -EINVAL;
        /* Next time the Guest runs, the core code will see if it can deliver
         * this interrupt. */
-       set_bit(irq, lg->irqs_pending);
+       set_bit(irq, cpu->irqs_pending);
        return 0;
 }
 
@@ -55,13 +56,21 @@ static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
 static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 {
        struct lguest *lg = file->private_data;
+       struct lg_cpu *cpu;
+       unsigned int cpu_id = *o;
 
        /* You must write LHREQ_INITIALIZE first! */
        if (!lg)
                return -EINVAL;
 
+       /* Watch out for arbitrary vcpu indexes! */
+       if (cpu_id >= lg->nr_cpus)
+               return -EINVAL;
+
+       cpu = &lg->cpus[cpu_id];
+
        /* If you're not the task which owns the Guest, go away. */
-       if (current != lg->tsk)
+       if (current != cpu->tsk)
                return -EPERM;
 
        /* If the guest is already dead, we indicate why */
@@ -81,11 +90,53 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 
        /* If we returned from read() last time because the Guest notified,
         * clear the flag. */
-       if (lg->pending_notify)
-               lg->pending_notify = 0;
+       if (cpu->pending_notify)
+               cpu->pending_notify = 0;
 
        /* Run the Guest until something interesting happens. */
-       return run_guest(lg, (unsigned long __user *)user);
+       return run_guest(cpu, (unsigned long __user *)user);
+}
+
+static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
+{
+       if (id >= NR_CPUS)
+               return -EINVAL;
+
+       cpu->id = id;
+       cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+       cpu->lg->nr_cpus++;
+       init_clockdev(cpu);
+
+       /* We need a complete page for the Guest registers: they are accessible
+        * to the Guest and we can only grant it access to whole pages. */
+       cpu->regs_page = get_zeroed_page(GFP_KERNEL);
+       if (!cpu->regs_page)
+               return -ENOMEM;
+
+       /* We actually put the registers at the bottom of the page. */
+       cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
+
+       /* Now we initialize the Guest's registers, handing it the start
+        * address. */
+       lguest_arch_setup_regs(cpu, start_ip);
+
+       /* Initialize the queue for the waker to wait on */
+       init_waitqueue_head(&cpu->break_wq);
+
+       /* We keep a pointer to the Launcher task (ie. current task) for when
+        * other Guests want to wake this one (inter-Guest I/O). */
+       cpu->tsk = current;
+
+       /* We need to keep a pointer to the Launcher's memory map, because if
+        * the Launcher dies we need to clean it up.  If we don't keep a
+        * reference, it is destroyed before close() is called. */
+       cpu->mm = get_task_mm(cpu->tsk);
+
+       /* We remember which CPU's pages this Guest used last, for optimization
+        * when the same Guest runs on the same CPU twice. */
+       cpu->last_pages = NULL;
+
+       return 0;
 }
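
The container_of() arithmetic in lg_cpu_start() deserves a second look: &lg->cpus[id] minus id elements is &lg->cpus[0], so the enclosing struct lguest can be recovered without passing it in. A standalone sketch of the same trick, using demo types that are not from the patch:

    #include <stddef.h>

    struct demo_guest;
    struct demo_cpu { unsigned int id; struct demo_guest *lg; };
    struct demo_guest { struct demo_cpu cpus[4]; };

    /* The kernel's container_of(), spelled out for the demo. */
    #define demo_container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct demo_guest *cpu_to_guest(struct demo_cpu *cpu, unsigned int id)
    {
            /* cpu == &guest->cpus[id], so cpu - id == &guest->cpus[0]. */
            return demo_container_of(cpu - id, struct demo_guest, cpus[0]);
    }
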
 
 /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
@@ -134,15 +185,10 @@ static int initialize(struct file *file, const unsigned long __user *input)
        lg->mem_base = (void __user *)(long)args[0];
        lg->pfn_limit = args[1];
 
-       /* We need a complete page for the Guest registers: they are accessible
-        * to the Guest and we can only grant it access to whole pages. */
-       lg->regs_page = get_zeroed_page(GFP_KERNEL);
-       if (!lg->regs_page) {
-               err = -ENOMEM;
+       /* This is the first cpu */
+       err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
+       if (err)
                goto release_guest;
-       }
-       /* We actually put the registers at the bottom of the page. */
-       lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
 
        /* Initialize the Guest's shadow page tables, using the toplevel
         * address the Launcher gave us.  This allocates memory, so can
@@ -151,28 +197,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
        if (err)
                goto free_regs;
 
-       /* Now we initialize the Guest's registers, handing it the start
-        * address. */
-       lguest_arch_setup_regs(lg, args[3]);
-
-       /* The timer for lguest's clock needs initialization. */
-       init_clockdev(lg);
-
-       /* We keep a pointer to the Launcher task (ie. current task) for when
-        * other Guests want to wake this one (inter-Guest I/O). */
-       lg->tsk = current;
-       /* We need to keep a pointer to the Launcher's memory map, because if
-        * the Launcher dies we need to clean it up.  If we don't keep a
-        * reference, it is destroyed before close() is called. */
-       lg->mm = get_task_mm(lg->tsk);
-
-       /* Initialize the queue for the waker to wait on */
-       init_waitqueue_head(&lg->break_wq);
-
-       /* We remember which CPU's pages this Guest used last, for optimization
-        * when the same Guest runs on the same CPU twice. */
-       lg->last_pages = NULL;
-
        /* We keep our "struct lguest" in the file's private_data. */
        file->private_data = lg;
 
@@ -182,7 +206,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
        return sizeof(args);
 
 free_regs:
-       free_page(lg->regs_page);
+       /* FIXME: This should be in free_vcpu */
+       free_page(lg->cpus[0].regs_page);
 release_guest:
        kfree(lg);
 unlock:
@@ -202,30 +227,37 @@ static ssize_t write(struct file *file, const char __user *in,
        struct lguest *lg = file->private_data;
        const unsigned long __user *input = (const unsigned long __user *)in;
        unsigned long req;
+       struct lg_cpu *uninitialized_var(cpu);
+       unsigned int cpu_id = *off;
 
        if (get_user(req, input) != 0)
                return -EFAULT;
        input++;
 
        /* If you haven't initialized, you must do that first. */
-       if (req != LHREQ_INITIALIZE && !lg)
-               return -EINVAL;
+       if (req != LHREQ_INITIALIZE) {
+               if (!lg || (cpu_id >= lg->nr_cpus))
+                       return -EINVAL;
+               cpu = &lg->cpus[cpu_id];
+               if (!cpu)
+                       return -EINVAL;
+       }
 
        /* Once the Guest is dead, all you can do is read() why it died. */
        if (lg && lg->dead)
                return -ENOENT;
 
        /* If you're not the task which owns the Guest, you can only break */
-       if (lg && current != lg->tsk && req != LHREQ_BREAK)
+       if (lg && current != cpu->tsk && req != LHREQ_BREAK)
                return -EPERM;
 
        switch (req) {
        case LHREQ_INITIALIZE:
                return initialize(file, input);
        case LHREQ_IRQ:
-               return user_send_irq(lg, input);
+               return user_send_irq(cpu, input);
        case LHREQ_BREAK:
-               return break_guest_out(lg, input);
+               return break_guest_out(cpu, input);
        default:
                return -EINVAL;
        }
@@ -241,6 +273,7 @@ static ssize_t write(struct file *file, const char __user *in,
 static int close(struct inode *inode, struct file *file)
 {
        struct lguest *lg = file->private_data;
+       unsigned int i;
 
        /* If we never successfully initialized, there's nothing to clean up */
        if (!lg)
@@ -249,19 +282,23 @@ static int close(struct inode *inode, struct file *file)
        /* We need the big lock, to protect from inter-guest I/O and other
         * Launchers initializing guests. */
        mutex_lock(&lguest_lock);
-       /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
-       hrtimer_cancel(&lg->hrt);
+
        /* Free up the shadow page tables for the Guest. */
        free_guest_pagetable(lg);
-       /* Now all the memory cleanups are done, it's safe to release the
-        * Launcher's memory management structure. */
-       mmput(lg->mm);
+
+       for (i = 0; i < lg->nr_cpus; i++) {
+               /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
+               hrtimer_cancel(&lg->cpus[i].hrt);
+               /* We can free up the register page we allocated. */
+               free_page(lg->cpus[i].regs_page);
+               /* Now all the memory cleanups are done, it's safe to release
+                * the Launcher's memory management structure. */
+               mmput(lg->cpus[i].mm);
+       }
        /* If lg->dead doesn't contain an error code it will be NULL or a
         * kmalloc()ed string, either of which is ok to hand to kfree(). */
        if (!IS_ERR(lg->dead))
                kfree(lg->dead);
-       /* We can free up the register page we allocated. */
-       free_page(lg->regs_page);
        /* We clear the entire structure, which also marks it as free for the
         * next user. */
        memset(lg, 0, sizeof(*lg));
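
With the vcpu array in place, note how read() and write() above derive cpu_id from the file offset. Below is a hedged Launcher-side sketch of targeting vcpu 1 via pread()/pwrite(); it is not part of this patch, and lguest_fd and the interrupt number are assumptions:

    /* Inject interrupt 4 into vcpu 1: the offset argument selects the vcpu. */
    unsigned long cmd[2] = { LHREQ_IRQ, 4 };
    if (pwrite(lguest_fd, cmd, sizeof(cmd), 1) < 0)
            err(1, "LHREQ_IRQ to vcpu 1");

    /* Run vcpu 1 until it notifies, breaks out or dies. */
    unsigned long notify_addr;
    ssize_t r = pread(lguest_fd, &notify_addr, sizeof(notify_addr), 1);
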
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index fffabb3271573d4f979cb36e98b5103c1d1246a7..74b4cf2a6c417743c9556610961ef66b9dbf0bbf 100644 (file)
@@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
  * page directory entry (PGD) for that address.  Since we keep track of several
  * page tables, the "i" argument tells us which one we're interested in (it's
  * usually the current one). */
-static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
+static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
        unsigned int index = pgd_index(vaddr);
 
        /* We kill any Guest trying to touch the Switcher addresses. */
        if (index >= SWITCHER_PGD_INDEX) {
-               kill_guest(lg, "attempt to access switcher pages");
+               kill_guest(cpu, "attempt to access switcher pages");
                index = 0;
        }
        /* Return a pointer to the index'th pgd entry for the i'th page table. */
-       return &lg->pgdirs[i].pgdir[index];
+       return &cpu->lg->pgdirs[i].pgdir[index];
 }
 
 /* This routine then takes the page directory entry returned above, which
  * contains the address of the page table entry (PTE) page.  It then returns a
  * pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
 {
        pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
        /* You should never call this if the PGD entry wasn't valid */
@@ -94,14 +94,13 @@ static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
 
 /* These two functions just like the above two, except they access the Guest
  * page tables.  Hence they return a Guest address. */
-static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
+static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 {
        unsigned int index = vaddr >> (PGDIR_SHIFT);
-       return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
+       return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
 }
 
-static unsigned long gpte_addr(struct lguest *lg,
-                              pgd_t gpgd, unsigned long vaddr)
+static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
 {
        unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
        BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
@@ -138,7 +137,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
  * entry can be a little tricky.  The flags are (almost) the same, but the
  * Guest PTE contains a virtual page number: the CPU needs the real page
  * number. */
-static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
+static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
 {
        unsigned long pfn, base, flags;
 
@@ -149,7 +148,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
        flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
 
        /* The Guest's pages are offset inside the Launcher. */
-       base = (unsigned long)lg->mem_base / PAGE_SIZE;
+       base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
 
        /* We need a temporary "unsigned long" variable to hold the answer from
         * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
@@ -157,7 +156,7 @@ static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
         * page, given the virtual number. */
        pfn = get_pfn(base + pte_pfn(gpte), write);
        if (pfn == -1UL) {
-               kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
+               kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
                /* When we destroy the Guest, we'll go through the shadow page
                 * tables and release_pte() them.  Make sure we don't think
                 * this one is valid! */
@@ -177,17 +176,18 @@ static void release_pte(pte_t pte)
 }
 /*:*/
 
-static void check_gpte(struct lguest *lg, pte_t gpte)
+static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
        if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
-           || pte_pfn(gpte) >= lg->pfn_limit)
-               kill_guest(lg, "bad page table entry");
+           || pte_pfn(gpte) >= cpu->lg->pfn_limit)
+               kill_guest(cpu, "bad page table entry");
 }
 
-static void check_gpgd(struct lguest *lg, pgd_t gpgd)
+static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
-       if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
-               kill_guest(lg, "bad page directory entry");
+       if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+          (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+               kill_guest(cpu, "bad page directory entry");
 }
 
 /*H:330
@@ -200,7 +200,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
  *
  * If we fixed up the fault (ie. we mapped the address), this routine returns
  * true.  Otherwise, it was a real fault and we need to tell the Guest. */
-int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
+int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 {
        pgd_t gpgd;
        pgd_t *spgd;
@@ -209,24 +209,24 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
        pte_t *spte;
 
        /* First step: get the top-level Guest page table entry. */
-       gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+       gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
        /* Toplevel not present?  We can't map it in. */
        if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
                return 0;
 
        /* Now look at the matching shadow entry. */
-       spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+       spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
        if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
                /* No shadow entry: allocate a new shadow PTE page. */
                unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
                /* This is not really the Guest's fault, but killing it is
                 * simple for this corner case. */
                if (!ptepage) {
-                       kill_guest(lg, "out of memory allocating pte page");
+                       kill_guest(cpu, "out of memory allocating pte page");
                        return 0;
                }
                /* We check that the Guest pgd is OK. */
-               check_gpgd(lg, gpgd);
+               check_gpgd(cpu, gpgd);
                /* And we copy the flags to the shadow PGD entry.  The page
                 * number in the shadow PGD is the page we just allocated. */
                *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
@@ -234,8 +234,8 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 
        /* OK, now we look at the lower level in the Guest page table: keep its
         * address, because we might update it later. */
-       gpte_ptr = gpte_addr(lg, gpgd, vaddr);
-       gpte = lgread(lg, gpte_ptr, pte_t);
+       gpte_ptr = gpte_addr(gpgd, vaddr);
+       gpte = lgread(cpu, gpte_ptr, pte_t);
 
        /* If this page isn't in the Guest page tables, we can't page it in. */
        if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -252,7 +252,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 
        /* Check that the Guest PTE flags are OK, and the page number is below
         * the pfn_limit (ie. not mapping the Launcher binary). */
-       check_gpte(lg, gpte);
+       check_gpte(cpu, gpte);
 
        /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
        gpte = pte_mkyoung(gpte);
@@ -260,7 +260,7 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
                gpte = pte_mkdirty(gpte);
 
        /* Get the pointer to the shadow PTE entry we're going to set. */
-       spte = spte_addr(lg, *spgd, vaddr);
+       spte = spte_addr(*spgd, vaddr);
        /* If there was a valid shadow PTE entry here before, we release it.
         * This can happen with a write to a previously read-only entry. */
        release_pte(*spte);
@@ -268,17 +268,17 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
        /* If this is a write, we insist that the Guest page is writable (the
         * final arg to gpte_to_spte()). */
        if (pte_dirty(gpte))
-               *spte = gpte_to_spte(lg, gpte, 1);
+               *spte = gpte_to_spte(cpu, gpte, 1);
        else
                /* If this is a read, don't set the "writable" bit in the page
                 * table entry, even if the Guest says it's writable.  That way
                 * we will come back here when a write does actually occur, so
                 * we can update the Guest's _PAGE_DIRTY flag. */
-               *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
+               *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
 
        /* Finally, we write the Guest PTE entry back: we've set the
         * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
-       lgwrite(lg, gpte_ptr, pte_t, gpte);
+       lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
        /* The fault is fixed, the page table is populated, the mapping
         * manipulated, the result returned and the code complete.  A small
@@ -297,19 +297,19 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
  *
  * This is a quick version which answers the question: is this virtual address
  * mapped by the shadow page tables, and is it writable? */
-static int page_writable(struct lguest *lg, unsigned long vaddr)
+static int page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 {
        pgd_t *spgd;
        unsigned long flags;
 
        /* Look at the current top level entry: is it present? */
-       spgd = spgd_addr(lg, lg->pgdidx, vaddr);
+       spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
        if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
                return 0;
 
        /* Check the flags on the pte entry itself: it must be present and
         * writable. */
-       flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));
+       flags = pte_flags(*(spte_addr(*spgd, vaddr)));
 
        return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
@@ -317,10 +317,10 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
 /* So, when pin_stack_pages() asks us to pin a page, we check if it's already
  * in the page tables, and if not, we call demand_page() with error code 2
  * (meaning "write"). */
-void pin_page(struct lguest *lg, unsigned long vaddr)
+void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 {
-       if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
-               kill_guest(lg, "bad stack page %#lx", vaddr);
+       if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+               kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
@@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx)
  *
  * The Guest has a hypercall to throw away the page tables: it's used when a
  * large number of mappings have been changed. */
-void guest_pagetable_flush_user(struct lguest *lg)
+void guest_pagetable_flush_user(struct lg_cpu *cpu)
 {
        /* Drop the userspace part of the current page table. */
-       flush_user_mappings(lg, lg->pgdidx);
+       flush_user_mappings(cpu->lg, cpu->cpu_pgd);
 }
 /*:*/
 
 /* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 {
        pgd_t gpgd;
        pte_t gpte;
 
        /* First step: get the top-level Guest page table entry. */
-       gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
+       gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
        /* Toplevel not present?  We can't map it in. */
        if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-               kill_guest(lg, "Bad address %#lx", vaddr);
+               kill_guest(cpu, "Bad address %#lx", vaddr);
 
-       gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
+       gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
        if (!(pte_flags(gpte) & _PAGE_PRESENT))
-               kill_guest(lg, "Bad address %#lx", vaddr);
+               kill_guest(cpu, "Bad address %#lx", vaddr);
 
        return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
 }
@@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
 /*H:435 And this is us, creating the new page directory.  If we really do
  * allocate a new one (and so the kernel parts are not there), we set
  * blank_pgdir. */
-static unsigned int new_pgdir(struct lguest *lg,
+static unsigned int new_pgdir(struct lg_cpu *cpu,
                              unsigned long gpgdir,
                              int *blank_pgdir)
 {
@@ -407,22 +407,23 @@ static unsigned int new_pgdir(struct lguest *lg,
 
        /* We pick one entry at random to throw out.  Choosing the Least
         * Recently Used might be better, but this is easy. */
-       next = random32() % ARRAY_SIZE(lg->pgdirs);
+       next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
        /* If it's never been allocated at all before, try now. */
-       if (!lg->pgdirs[next].pgdir) {
-               lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+       if (!cpu->lg->pgdirs[next].pgdir) {
+               cpu->lg->pgdirs[next].pgdir =
+                                       (pgd_t *)get_zeroed_page(GFP_KERNEL);
                /* If the allocation fails, just keep using the one we have */
-               if (!lg->pgdirs[next].pgdir)
-                       next = lg->pgdidx;
+               if (!cpu->lg->pgdirs[next].pgdir)
+                       next = cpu->cpu_pgd;
                else
                        /* This is a blank page, so there are no kernel
                         * mappings: caller must map the stack! */
                        *blank_pgdir = 1;
        }
        /* Record which Guest toplevel this shadows. */
-       lg->pgdirs[next].gpgdir = gpgdir;
+       cpu->lg->pgdirs[next].gpgdir = gpgdir;
        /* Release all the non-kernel mappings. */
-       flush_user_mappings(lg, next);
+       flush_user_mappings(cpu->lg, next);
 
        return next;
 }
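The shadow top-levels behave like a tiny cache with random replacement: look for an existing shadow of the Guest's pgdir, and if there is none, take a random slot, allocating it on first use (the real code falls back to the current slot if that allocation fails). A compact userspace sketch of the policy, with made-up sizes and a plain array standing in for the pgdir structures:

#include <stdio.h>
#include <stdlib.h>

#define NPGDIRS 4

struct shadow { unsigned long gpgdir; int allocated; };

static struct shadow pgdirs[NPGDIRS];
static int current_pgd;                 /* stand-in for cpu->cpu_pgd */

static int find_pgdir(unsigned long gpgdir)
{
        int i;
        for (i = 0; i < NPGDIRS; i++)
                if (pgdirs[i].allocated && pgdirs[i].gpgdir == gpgdir)
                        return i;
        return NPGDIRS;                 /* "not found", like ARRAY_SIZE() */
}

static int new_pgdir(unsigned long gpgdir)
{
        int next = rand() % NPGDIRS;    /* random replacement, as in the patch */
        if (!pgdirs[next].allocated)
                pgdirs[next].allocated = 1;     /* first use: "allocate" it */
        pgdirs[next].gpgdir = gpgdir;           /* record what it shadows */
        return next;
}

int main(void)
{
        unsigned long guest_toplevel = 0x400000;
        int idx = find_pgdir(guest_toplevel);
        if (idx == NPGDIRS)
                idx = new_pgdir(guest_toplevel);
        current_pgd = idx;
        printf("now using shadow slot %d\n", current_pgd);
        return 0;
}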
@@ -432,21 +433,21 @@ static unsigned int new_pgdir(struct lguest *lg,
  * Now we've seen all the page table setting and manipulation, let's see what
  * happens when the Guest changes page tables (ie. changes the top-level
  * pgdir).  This occurs on almost every context switch. */
-void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
 {
        int newpgdir, repin = 0;
 
        /* Look to see if we have this one already. */
-       newpgdir = find_pgdir(lg, pgtable);
+       newpgdir = find_pgdir(cpu->lg, pgtable);
        /* If not, we allocate or mug an existing one: if it's a fresh one,
         * repin gets set to 1. */
-       if (newpgdir == ARRAY_SIZE(lg->pgdirs))
-               newpgdir = new_pgdir(lg, pgtable, &repin);
+       if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+               newpgdir = new_pgdir(cpu, pgtable, &repin);
        /* Change the current pgd index to the new one. */
-       lg->pgdidx = newpgdir;
+       cpu->cpu_pgd = newpgdir;
        /* If it was completely blank, we map in the Guest kernel stack */
        if (repin)
-               pin_stack_pages(lg);
+               pin_stack_pages(cpu);
 }
 
 /*H:470 Finally, a routine which throws away everything: all PGD entries in all
@@ -468,11 +469,11 @@ static void release_all_pagetables(struct lguest *lg)
  * mapping.  Since kernel mappings are in every page table, it's easiest to
  * throw them all away.  This traps the Guest in amber for a while as
  * everything faults back in, but it's rare. */
-void guest_pagetable_clear_all(struct lguest *lg)
+void guest_pagetable_clear_all(struct lg_cpu *cpu)
 {
-       release_all_pagetables(lg);
+       release_all_pagetables(cpu->lg);
        /* We need the Guest kernel stack mapped again. */
-       pin_stack_pages(lg);
+       pin_stack_pages(cpu);
 }
 /*:*/
 /*M:009 Since we throw away all mappings when a kernel mapping changes, our
@@ -497,24 +498,24 @@ void guest_pagetable_clear_all(struct lguest *lg)
  * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
  * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
  */
-static void do_set_pte(struct lguest *lg, int idx,
+static void do_set_pte(struct lg_cpu *cpu, int idx,
                       unsigned long vaddr, pte_t gpte)
 {
        /* Look up the matching shadow page directory entry. */
-       pgd_t *spgd = spgd_addr(lg, idx, vaddr);
+       pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
 
        /* If the top level isn't present, there's no entry to update. */
        if (pgd_flags(*spgd) & _PAGE_PRESENT) {
                /* Otherwise, we start by releasing the existing entry. */
-               pte_t *spte = spte_addr(lg, *spgd, vaddr);
+               pte_t *spte = spte_addr(*spgd, vaddr);
                release_pte(*spte);
 
                /* If they're setting this entry as dirty or accessed, we might
                 * as well put that entry they've given us in now.  This shaves
                 * 10% off a copy-on-write micro-benchmark. */
                if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
-                       check_gpte(lg, gpte);
-                       *spte = gpte_to_spte(lg, gpte,
+                       check_gpte(cpu, gpte);
+                       *spte = gpte_to_spte(cpu, gpte,
                                             pte_flags(gpte) & _PAGE_DIRTY);
                } else
                        /* Otherwise kill it and we can demand_page() it in
@@ -533,22 +534,22 @@ static void do_set_pte(struct lguest *lg, int idx,
  *
  * The benefit is that when we have to track a new page table, we can keep
  * all the kernel mappings.  This speeds up context switch immensely. */
-void guest_set_pte(struct lguest *lg,
+void guest_set_pte(struct lg_cpu *cpu,
                   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
 {
        /* Kernel mappings must be changed on all top levels.  Slow, but
         * doesn't happen often. */
-       if (vaddr >= lg->kernel_address) {
+       if (vaddr >= cpu->lg->kernel_address) {
                unsigned int i;
-               for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-                       if (lg->pgdirs[i].pgdir)
-                               do_set_pte(lg, i, vaddr, gpte);
+               for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
+                       if (cpu->lg->pgdirs[i].pgdir)
+                               do_set_pte(cpu, i, vaddr, gpte);
        } else {
                /* Is this page table one we have a shadow for? */
-               int pgdir = find_pgdir(lg, gpgdir);
-               if (pgdir != ARRAY_SIZE(lg->pgdirs))
+               int pgdir = find_pgdir(cpu->lg, gpgdir);
+               if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
                        /* If so, do the update. */
-                       do_set_pte(lg, pgdir, vaddr, gpte);
+                       do_set_pte(cpu, pgdir, vaddr, gpte);
        }
 }
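The split in guest_set_pte() is the key point: a PTE at or above kernel_address must be mirrored into every shadow top-level (they all share the kernel mappings), while a user-space PTE only matters if a shadow of that particular page table is currently held. A rough self-contained sketch of the dispatch (the kernel_address value and sizes are assumptions):

#include <stdio.h>

#define NPGDIRS        4
#define KERNEL_ADDRESS 0xC0000000UL

struct shadow { unsigned long gpgdir; int in_use; };
static struct shadow pgdirs[NPGDIRS];

/* Stand-in for do_set_pte(): just report which shadow copy would be touched. */
static void do_set_pte(int idx, unsigned long vaddr)
{
        printf("update shadow %d for %#lx\n", idx, vaddr);
}

static void guest_set_pte(unsigned long gpgdir, unsigned long vaddr)
{
        int i;

        if (vaddr >= KERNEL_ADDRESS) {
                /* Kernel mappings live in every shadow top-level. */
                for (i = 0; i < NPGDIRS; i++)
                        if (pgdirs[i].in_use)
                                do_set_pte(i, vaddr);
        } else {
                /* User mappings only matter in the matching shadow, if any. */
                for (i = 0; i < NPGDIRS; i++)
                        if (pgdirs[i].in_use && pgdirs[i].gpgdir == gpgdir)
                                do_set_pte(i, vaddr);
        }
}

int main(void)
{
        pgdirs[0] = (struct shadow){ .gpgdir = 0x400000, .in_use = 1 };
        pgdirs[2] = (struct shadow){ .gpgdir = 0x800000, .in_use = 1 };

        guest_set_pte(0x400000, 0xC0123000UL);  /* kernel: hits both shadows */
        guest_set_pte(0x800000, 0x08048000UL);  /* user: hits only shadow 2 */
        return 0;
}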
 
@@ -590,30 +591,32 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
 {
        /* We start on the first shadow page table, and give it a blank PGD
         * page. */
-       lg->pgdidx = 0;
-       lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
-       lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
-       if (!lg->pgdirs[lg->pgdidx].pgdir)
+       lg->pgdirs[0].gpgdir = pgtable;
+       lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+       if (!lg->pgdirs[0].pgdir)
                return -ENOMEM;
+       lg->cpus[0].cpu_pgd = 0;
        return 0;
 }
 
 /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lguest *lg)
+void page_table_guest_data_init(struct lg_cpu *cpu)
 {
        /* We get the kernel address: above this is all kernel memory. */
-       if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
+       if (get_user(cpu->lg->kernel_address,
+                    &cpu->lg->lguest_data->kernel_address)
            /* We tell the Guest that it can't use the top 4MB of virtual
             * addresses used by the Switcher. */
-           || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
-           || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
-               kill_guest(lg, "bad guest page %p", lg->lguest_data);
+           || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
+           || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+               kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
        /* In flush_user_mappings() we loop from 0 to
         * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
         * Switcher mappings, so check that now. */
-       if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
-               kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
+       if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+               kill_guest(cpu, "bad kernel address %#lx",
+                                cpu->lg->kernel_address);
 }
 
 /* When a Guest dies, our cleanup is fairly simple. */
@@ -634,17 +637,18 @@ void free_guest_pagetable(struct lguest *lg)
  * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
  * for each CPU already set up, we just need to hook them in now we know which
  * Guest is about to run on this CPU. */
-void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
        pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
        pgd_t switcher_pgd;
        pte_t regs_pte;
+       unsigned long pfn;
 
        /* Make the last PGD entry for this Guest point to the Switcher's PTE
         * page for this CPU (with appropriate flags). */
-       switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
+       switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
 
-       lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+       cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
        /* We also change the Switcher PTE page.  When we're running the Guest,
         * we want the Guest's "regs" page to appear where the first Switcher
@@ -653,7 +657,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
         * CPU's "struct lguest_pages": if we make sure the Guest's register
         * page is already mapped there, we don't have to copy them out
         * again. */
-       regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
+       pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
+       regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
        switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
 }
 /*:*/
index 9e189cbec7dda8640ffea4eda82e35f499c42236..ec6aa3f1c36b349464e946375de109cc606c82a2 100644 (file)
@@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num)
  * Protection Fault in the Switcher when it restores a Guest segment register
  * which tries to use that entry.  Then we kill the Guest for causing such a
  * mess: the message will be "unhandled trap 256". */
-static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
+static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
 {
        unsigned int i;
 
@@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
                /* Segment descriptors contain a privilege level: the Guest is
                 * sometimes careless and leaves this as 0, even though it's
                 * running at privilege level 1.  If so, we fix it here. */
-               if ((lg->arch.gdt[i].b & 0x00006000) == 0)
-                       lg->arch.gdt[i].b |= (GUEST_PL << 13);
+               if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
+                       cpu->arch.gdt[i].b |= (GUEST_PL << 13);
 
                /* Each descriptor has an "accessed" bit.  If we don't set it
                 * now, the CPU will try to set it when the Guest first loads
                 * that entry into a segment register.  But the GDT isn't
                 * writable by the Guest, so bad things can happen. */
-               lg->arch.gdt[i].b |= 0x00000100;
+               cpu->arch.gdt[i].b |= 0x00000100;
        }
 }
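The two fixups in fixup_gdt_table() are plain bit operations on the descriptor's high word: bits 13-14 hold the privilege level (forced to the Guest's ring if left at 0) and bit 8 is the accessed bit. A standalone sketch with illustrative values (GUEST_PL assumed to be 1, the ring lguest Guests run at):

#include <stdio.h>

#define GUEST_PL 1      /* assumed: the Guest kernel runs at ring 1 */

int main(void)
{
        /* High word of a flat 32-bit code descriptor, DPL 0, accessed clear. */
        unsigned int b = 0x00C09A00;

        if ((b & 0x00006000) == 0)      /* DPL field (bits 13-14) still zero? */
                b |= (GUEST_PL << 13);  /* drop the segment to the Guest's ring */

        b |= 0x00000100;                /* pre-set the accessed bit (bit 8) */

        printf("fixed descriptor high word: %#010x\n", b);     /* 0x00c0bb00 */
        return 0;
}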
 
@@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
 
 /* This routine sets up the initial Guest GDT for booting.  All entries start
  * as 0 (unusable). */
-void setup_guest_gdt(struct lguest *lg)
+void setup_guest_gdt(struct lg_cpu *cpu)
 {
        /* Start with full 0-4G segments... */
-       lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-       lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+       cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+       cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
        /* ...except the Guest is allowed to use them, so set the privilege
         * level appropriately in the flags. */
-       lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
-       lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+       cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
+       cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
 }
 
 /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
  * entries. */
-void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
 {
        unsigned int i;
 
        for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-               gdt[i] = lg->arch.gdt[i];
+               gdt[i] = cpu->arch.gdt[i];
 }
 
 /*H:640 When the Guest is run on a different CPU, or the GDT entries have
  * changed, copy_gdt() is called to copy the Guest's GDT entries across to this
  * CPU's GDT. */
-void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
+void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
 {
        unsigned int i;
 
@@ -141,38 +141,38 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
         * replaced.  See ignored_gdt() above. */
        for (i = 0; i < GDT_ENTRIES; i++)
                if (!ignored_gdt(i))
-                       gdt[i] = lg->arch.gdt[i];
+                       gdt[i] = cpu->arch.gdt[i];
 }
 
 /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
  * We copy it from the Guest and tweak the entries. */
-void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
+void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num)
 {
        /* We assume the Guest has the same number of GDT entries as the
         * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
-       if (num > ARRAY_SIZE(lg->arch.gdt))
-               kill_guest(lg, "too many gdt entries %i", num);
+       if (num > ARRAY_SIZE(cpu->arch.gdt))
+               kill_guest(cpu, "too many gdt entries %i", num);
 
        /* We read the whole thing in, then fix it up. */
-       __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
-       fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
+       __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0]));
+       fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt));
        /* Mark that the GDT changed so the core knows it has to copy it again,
         * even if the Guest is run on the same CPU. */
-       lg->changed |= CHANGED_GDT;
+       cpu->changed |= CHANGED_GDT;
 }
 
 /* This is the fast-track version for just changing the three TLS entries.
  * Remember that this happens on every context switch, so it's worth
  * optimizing.  But wouldn't it be neater to have a single hypercall to cover
  * both cases? */
-void guest_load_tls(struct lguest *lg, unsigned long gtls)
+void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
 {
-       struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
+       struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
 
-       __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-       fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
+       __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+       fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
        /* Note that just the TLS entries have changed. */
-       lg->changed |= CHANGED_GDT_TLS;
+       cpu->changed |= CHANGED_GDT_TLS;
 }
 /*:*/
 
index 96d0fd07c57d5702fe9666a587df05a2ced29c51..61f2f8eb8cad7edb744559c873a854b483a3fcdc 100644 (file)
@@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
                  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
 }
 
-static DEFINE_PER_CPU(struct lguest *, last_guest);
+static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
 
 /*S:010
  * We approach the Switcher.
@@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lguest *, last_guest);
  * since it last ran.  We saw this set in interrupts_and_traps.c and
  * segments.c.
  */
-static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
        /* Copying all this data can be quite expensive.  We usually run the
         * same Guest we ran last time (and that Guest hasn't run anywhere else
         * meanwhile).  If that's not the case, we pretend everything in the
         * Guest has changed. */
-       if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
-               __get_cpu_var(last_guest) = lg;
-               lg->last_pages = pages;
-               lg->changed = CHANGED_ALL;
+       if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
+               __get_cpu_var(last_cpu) = cpu;
+               cpu->last_pages = pages;
+               cpu->changed = CHANGED_ALL;
        }
 
        /* These copies are pretty cheap, so we do them unconditionally: */
@@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
        pages->state.host_cr3 = __pa(current->mm->pgd);
        /* Set up the Guest's page tables to see this CPU's pages (and no
         * other CPU's pages). */
-       map_switcher_in_guest(lg, pages);
+       map_switcher_in_guest(cpu, pages);
        /* Set up the two "TSS" members which tell the CPU what stack to use
         * for traps which go directly into the Guest (ie. traps at privilege
         * level 1). */
-       pages->state.guest_tss.esp1 = lg->esp1;
-       pages->state.guest_tss.ss1 = lg->ss1;
+       pages->state.guest_tss.esp1 = cpu->esp1;
+       pages->state.guest_tss.ss1 = cpu->ss1;
 
        /* Copy direct-to-Guest trap entries. */
-       if (lg->changed & CHANGED_IDT)
-               copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+       if (cpu->changed & CHANGED_IDT)
+               copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
 
        /* Copy all GDT entries which the Guest can change. */
-       if (lg->changed & CHANGED_GDT)
-               copy_gdt(lg, pages->state.guest_gdt);
+       if (cpu->changed & CHANGED_GDT)
+               copy_gdt(cpu, pages->state.guest_gdt);
        /* If only the TLS entries have changed, copy them. */
-       else if (lg->changed & CHANGED_GDT_TLS)
-               copy_gdt_tls(lg, pages->state.guest_gdt);
+       else if (cpu->changed & CHANGED_GDT_TLS)
+               copy_gdt_tls(cpu, pages->state.guest_gdt);
 
        /* Mark the Guest as unchanged for next time. */
-       lg->changed = 0;
+       cpu->changed = 0;
 }
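The pattern in copy_in_guest_info() is a per-CPU one-entry cache plus a change bitmask: if this physical CPU last ran the same virtual CPU on the same pages, only the pieces flagged as changed are recopied; otherwise everything is treated as changed. A minimal sketch of that bookkeeping (flag values and names here are illustrative, not the kernel's):

#include <stdio.h>

#define CHANGED_IDT      0x1
#define CHANGED_GDT      0x2
#define CHANGED_GDT_TLS  0x4
#define CHANGED_ALL      (CHANGED_IDT | CHANGED_GDT | CHANGED_GDT_TLS)

struct vcpu { unsigned int changed; };

static struct vcpu *last_cpu;           /* would be per-physical-CPU in the kernel */

static void copy_in_guest_info(struct vcpu *cpu)
{
        if (last_cpu != cpu) {          /* someone else ran here last time */
                last_cpu = cpu;
                cpu->changed = CHANGED_ALL;
        }

        if (cpu->changed & CHANGED_IDT)
                printf("copy IDT\n");
        if (cpu->changed & CHANGED_GDT)
                printf("copy whole GDT\n");
        else if (cpu->changed & CHANGED_GDT_TLS)
                printf("copy just the TLS entries\n");

        cpu->changed = 0;               /* clean until the Guest changes something */
}

int main(void)
{
        struct vcpu a = { 0 }, b = { 0 };

        copy_in_guest_info(&a);         /* first run: copies everything */
        copy_in_guest_info(&a);         /* same vcpu again: copies nothing */
        a.changed = CHANGED_GDT_TLS;
        copy_in_guest_info(&a);         /* Guest context switch: TLS only */
        copy_in_guest_info(&b);         /* different vcpu: everything again */
        return 0;
}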
 
 /* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
        /* This is a dummy value we need for GCC's sake. */
        unsigned int clobber;
 
        /* Copy the guest-specific information into this CPU's "struct
         * lguest_pages". */
-       copy_in_guest_info(lg, pages);
+       copy_in_guest_info(cpu, pages);
 
        /* Set the trap number to 256 (impossible value).  If we fault while
         * switching to the Guest (bad segment registers or bug), this will
         * cause us to abort the Guest. */
-       lg->regs->trapnum = 256;
+       cpu->regs->trapnum = 256;
 
        /* Now: we push the "eflags" register on the stack, then do an "lcall".
         * This is how we change from using the kernel code segment to using
@@ -143,7 +143,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
                      * 0-th argument above, ie "a").  %ebx contains the
                      * physical address of the Guest's top-level page
                      * directory. */
-                    : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+                    : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
                     /* We tell gcc that all these registers could change,
                      * which means we don't have to save and restore them in
                      * the Switcher. */
@@ -161,12 +161,12 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
 
 /*H:040 This is the i386-specific code to setup and run the Guest.  Interrupts
  * are disabled: we own the CPU. */
-void lguest_arch_run_guest(struct lguest *lg)
+void lguest_arch_run_guest(struct lg_cpu *cpu)
 {
        /* Remember the awfully-named TS bit?  If the Guest has asked to set it
         * we set it now, so we can trap and pass that trap to the Guest if it
         * uses the FPU. */
-       if (lg->ts)
+       if (cpu->ts)
                lguest_set_ts();
 
        /* SYSENTER is an optimized way of doing system calls.  We can't allow
@@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lguest *lg)
        /* Now we actually run the Guest.  It will return when something
         * interesting happens, and we can examine its registers to see what it
         * was doing. */
-       run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+       run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
 
        /* Note that the "regs" pointer contains two extra entries which are
         * not really registers: a trap number which says what interrupt or
@@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lguest *lg)
         * bad virtual address.  We have to grab this now, because once we
         * re-enable interrupts an interrupt could fault and thus overwrite
         * cr2, or we could even move off to a different CPU. */
-       if (lg->regs->trapnum == 14)
-               lg->arch.last_pagefault = read_cr2();
+       if (cpu->regs->trapnum == 14)
+               cpu->arch.last_pagefault = read_cr2();
        /* Similarly, if we took a trap because the Guest used the FPU,
         * we have to restore the FPU it expects to see. */
-       else if (lg->regs->trapnum == 7)
+       else if (cpu->regs->trapnum == 7)
                math_state_restore();
 
        /* Restore SYSENTER if it's supposed to be on. */
@@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lguest *lg)
  * When the Guest uses one of these instructions, we get a trap (General
  * Protection Fault) and come here.  We see if it's one of those troublesome
  * instructions and skip over it.  We return true if we did. */
-static int emulate_insn(struct lguest *lg)
+static int emulate_insn(struct lg_cpu *cpu)
 {
        u8 insn;
        unsigned int insnlen = 0, in = 0, shift = 0;
        /* The eip contains the *virtual* address of the Guest's instruction:
         * guest_pa just subtracts the Guest's page_offset. */
-       unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+       unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
 
        /* This must be the Guest kernel trying to do something, not userspace!
         * The bottom two bits of the CS segment register are the privilege
         * level. */
-       if ((lg->regs->cs & 3) != GUEST_PL)
+       if ((cpu->regs->cs & 3) != GUEST_PL)
                return 0;
 
        /* Decoding x86 instructions is icky. */
-       insn = lgread(lg, physaddr, u8);
+       insn = lgread(cpu, physaddr, u8);
 
        /* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
           of the eax register. */
@@ -237,7 +237,7 @@ static int emulate_insn(struct lguest *lg)
                shift = 16;
                /* The instruction is 1 byte so far, read the next byte. */
                insnlen = 1;
-               insn = lgread(lg, physaddr + insnlen, u8);
+               insn = lgread(cpu, physaddr + insnlen, u8);
        }
 
        /* We can ignore the lower bit for the moment and decode the 4 opcodes
@@ -268,26 +268,26 @@ static int emulate_insn(struct lguest *lg)
        if (in) {
                /* Lower bit tells us whether it's a 16 or 32 bit access */
                if (insn & 0x1)
-                       lg->regs->eax = 0xFFFFFFFF;
+                       cpu->regs->eax = 0xFFFFFFFF;
                else
-                       lg->regs->eax |= (0xFFFF << shift);
+                       cpu->regs->eax |= (0xFFFF << shift);
        }
        /* Finally, we've "done" the instruction, so move past it. */
-       lg->regs->eip += insnlen;
+       cpu->regs->eip += insnlen;
        /* Success! */
        return 1;
 }
 
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lguest *lg)
+void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
-       switch (lg->regs->trapnum) {
+       switch (cpu->regs->trapnum) {
        case 13: /* We've intercepted a General Protection Fault. */
                /* Check if this was one of those annoying IN or OUT
                 * instructions which we need to emulate.  If so, we just go
                 * back into the Guest after we've done it. */
-               if (lg->regs->errcode == 0) {
-                       if (emulate_insn(lg))
+               if (cpu->regs->errcode == 0) {
+                       if (emulate_insn(cpu))
                                return;
                }
                break;
@@ -301,7 +301,8 @@ void lguest_arch_handle_trap(struct lguest *lg)
                 *
                 * The errcode tells whether this was a read or a write, and
                 * whether kernel or userspace code. */
-               if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
+               if (demand_page(cpu, cpu->arch.last_pagefault,
+                               cpu->regs->errcode))
                        return;
 
                /* OK, it's really not there (or not OK): the Guest needs to
@@ -311,15 +312,16 @@ void lguest_arch_handle_trap(struct lguest *lg)
                 * Note that if the Guest were really messed up, this could
                 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
                 * lg->lguest_data could be NULL */
-               if (lg->lguest_data &&
-                   put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
-                       kill_guest(lg, "Writing cr2");
+               if (cpu->lg->lguest_data &&
+                   put_user(cpu->arch.last_pagefault,
+                            &cpu->lg->lguest_data->cr2))
+                       kill_guest(cpu, "Writing cr2");
                break;
        case 7: /* We've intercepted a Device Not Available fault. */
                /* If the Guest doesn't want to know, we already restored the
                 * Floating Point Unit, so we just continue without telling
                 * it. */
-               if (!lg->ts)
+               if (!cpu->ts)
                        return;
                break;
        case 32 ... 255:
@@ -332,19 +334,19 @@ void lguest_arch_handle_trap(struct lguest *lg)
        case LGUEST_TRAP_ENTRY:
                /* Our 'struct hcall_args' maps directly over our regs: we set
                 * up the pointer now to indicate a hypercall is pending. */
-               lg->hcall = (struct hcall_args *)lg->regs;
+               cpu->hcall = (struct hcall_args *)cpu->regs;
                return;
        }
 
        /* We didn't handle the trap, so it needs to go to the Guest. */
-       if (!deliver_trap(lg, lg->regs->trapnum))
+       if (!deliver_trap(cpu, cpu->regs->trapnum))
                /* If the Guest doesn't have a handler (either it hasn't
                 * registered any yet, or it's one of the faults we don't let
                 * it handle), it dies with a cryptic error message. */
-               kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
-                          lg->regs->trapnum, lg->regs->eip,
-                          lg->regs->trapnum == 14 ? lg->arch.last_pagefault
-                          : lg->regs->errcode);
+               kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
+                          cpu->regs->trapnum, cpu->regs->eip,
+                          cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
+                          : cpu->regs->errcode);
 }
 
 /* Now we can look at each of the routines this calls, in increasing order of
@@ -416,7 +418,7 @@ void __init lguest_arch_host_init(void)
                /* We know where we want the stack to be when the Guest enters
                 * the switcher: in pages->regs.  The stack grows upwards, so
                 * we start it at the end of that structure. */
-               state->guest_tss.esp0 = (long)(&pages->regs + 1);
+               state->guest_tss.sp0 = (long)(&pages->regs + 1);
                /* And this is the GDT entry to use for the stack: we keep a
                 * couple of special LGUEST entries. */
                state->guest_tss.ss0 = LGUEST_DS;
@@ -487,17 +489,17 @@ void __exit lguest_arch_host_fini(void)
 
 
 /*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
+int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 {
        switch (args->arg0) {
        case LHCALL_LOAD_GDT:
-               load_guest_gdt(lg, args->arg1, args->arg2);
+               load_guest_gdt(cpu, args->arg1, args->arg2);
                break;
        case LHCALL_LOAD_IDT_ENTRY:
-               load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
+               load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
                break;
        case LHCALL_LOAD_TLS:
-               guest_load_tls(lg, args->arg1);
+               guest_load_tls(cpu, args->arg1);
                break;
        default:
                /* Bad Guest.  Bad! */
@@ -507,13 +509,14 @@ int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
 }
 
 /*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lguest *lg)
+int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
 {
        u32 tsc_speed;
 
        /* The pointer to the Guest's "struct lguest_data" is the only
         * argument.  We check that address now. */
-       if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
+       if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
+                              sizeof(*cpu->lg->lguest_data)))
                return -EFAULT;
 
        /* Having checked it, we simply set lg->lguest_data to point straight
@@ -521,7 +524,7 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
         * copy_to_user/from_user from now on, instead of lgread/write.  I put
         * this in to show that I'm not immune to writing stupid
         * optimizations. */
-       lg->lguest_data = lg->mem_base + lg->hcall->arg1;
+       cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
 
        /* We insist that the Time Stamp Counter exist and doesn't change with
         * cpu frequency.  Some devious chip manufacturers decided that TSC
@@ -534,12 +537,12 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
                tsc_speed = tsc_khz;
        else
                tsc_speed = 0;
-       if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
+       if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
                return -EFAULT;
 
        /* The interrupt code might not like the system call vector. */
-       if (!check_syscall_vector(lg))
-               kill_guest(lg, "bad syscall vector");
+       if (!check_syscall_vector(cpu->lg))
+               kill_guest(cpu, "bad syscall vector");
 
        return 0;
 }
@@ -548,9 +551,9 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
  *
  * Most of the Guest's registers are left alone: we used get_zeroed_page() to
  * allocate the structure, so they will be 0. */
-void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
+void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
 {
-       struct lguest_regs *regs = lg->regs;
+       struct lguest_regs *regs = cpu->regs;
 
        /* There are four "segment" registers which the Guest needs to boot:
         * The "code segment" register (cs) refers to the kernel code segment
@@ -577,5 +580,5 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
 
        /* There are a couple of GDT entries the Guest expects when first
         * booting. */
-       setup_guest_gdt(lg);
+       setup_guest_gdt(cpu);
 }
index af40ff434def6cddb55b30711d2627cef3d6affe..6c575403bd39baa739654b81f9bb35b1e01e7a37 100644 (file)
@@ -2009,6 +2009,9 @@ config E1000E
          To compile this driver as a module, choose M here. The module
          will be called e1000e.
 
+config E1000E_ENABLED
+       def_bool E1000E != n
+
 config IP1000
        tristate "IP1000 Gigabit Ethernet support"
        depends on PCI && EXPERIMENTAL
index 7f5b2ae70d5d56cdc056fe2a15daceb63d75a537..8c87940a9ce86341eacda17d1a3ad94106e32921 100644 (file)
@@ -47,6 +47,12 @@ static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation
  * Macro expands to...
  *   {PCI_DEVICE(PCI_VENDOR_ID_INTEL, device_id)}
  */
+#ifdef CONFIG_E1000E_ENABLED
+  #define PCIE(x) 
+#else
+  #define PCIE(x) x,
+#endif
+
 static struct pci_device_id e1000_pci_tbl[] = {
        INTEL_E1000_ETHERNET_DEVICE(0x1000),
        INTEL_E1000_ETHERNET_DEVICE(0x1001),
@@ -73,6 +79,14 @@ static struct pci_device_id e1000_pci_tbl[] = {
        INTEL_E1000_ETHERNET_DEVICE(0x1026),
        INTEL_E1000_ETHERNET_DEVICE(0x1027),
        INTEL_E1000_ETHERNET_DEVICE(0x1028),
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x1049))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x104A))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x104B))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x104C))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x104D))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x105E))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x105F))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x1060))
        INTEL_E1000_ETHERNET_DEVICE(0x1075),
        INTEL_E1000_ETHERNET_DEVICE(0x1076),
        INTEL_E1000_ETHERNET_DEVICE(0x1077),
@@ -81,9 +95,28 @@ static struct pci_device_id e1000_pci_tbl[] = {
        INTEL_E1000_ETHERNET_DEVICE(0x107A),
        INTEL_E1000_ETHERNET_DEVICE(0x107B),
        INTEL_E1000_ETHERNET_DEVICE(0x107C),
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x107D))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x107E))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x107F))
        INTEL_E1000_ETHERNET_DEVICE(0x108A),
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x108B))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x108C))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x1096))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x1098))
        INTEL_E1000_ETHERNET_DEVICE(0x1099),
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x109A))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10A4))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10A5))
        INTEL_E1000_ETHERNET_DEVICE(0x10B5),
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10B9))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10BA))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10BB))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10BC))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10C4))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10C5))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10D5))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10D9))
+PCIE(  INTEL_E1000_ETHERNET_DEVICE(0x10DA))
        /* required last entry */
        {0,}
 };
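The PCIE() macro added near the top of this hunk is a compile-time filter: when CONFIG_E1000E_ENABLED is set the PCI Express device IDs expand to nothing (so the newer e1000e driver claims them), otherwise they expand to ordinary table entries and e1000 keeps handling them. A self-contained sketch of the same trick (the config symbol and IDs below are placeholders):

#include <stdio.h>

/* Uncomment to mimic CONFIG_E1000E_ENABLED being set. */
/* #define E1000E_ENABLED 1 */

#ifdef E1000E_ENABLED
#define PCIE(x)         /* PCIe IDs vanish: the newer driver owns them */
#else
#define PCIE(x) x,      /* no e1000e: keep the PCIe IDs in this table */
#endif

struct id { unsigned int device; };
#define DEVICE(d) { .device = (d) }

static const struct id tbl[] = {
        DEVICE(0x1000),
PCIE(   DEVICE(0x105E))
        DEVICE(0x1075),
        { 0 }           /* required last entry */
};

int main(void)
{
        int i;
        for (i = 0; tbl[i].device; i++)
                printf("claiming 0x%04x\n", tbl[i].device);
        return 0;
}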
index 5dba68fe33f59590bb43c8ac23ec4f4cac4522e0..a8364d8152225e43960c6a20e5139bdf8dcc9b94 100644 (file)
@@ -61,7 +61,7 @@ set_base(gdt[(selname) >> 3], (u32)(address)); \
 set_limit(gdt[(selname) >> 3], size); \
 } while(0)
 
-static struct desc_struct bad_bios_desc = { 0, 0x00409200 };
+static struct desc_struct bad_bios_desc;
 
 /*
  * At some point we want to use this stack frame pointer to unwind
@@ -477,6 +477,9 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
        pnp_bios_callpoint.offset = header->fields.pm16offset;
        pnp_bios_callpoint.segment = PNP_CS16;
 
+       bad_bios_desc.a = 0;
+       bad_bios_desc.b = 0x00409200;
+
        set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
        _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
        for (i = 0; i < NR_CPUS; i++) {
index e45f85f7c7ed815d1c4017baf16465762200235b..0dff05840ee2e1ffcdb222cdb3805c01054fbd65 100644 (file)
@@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req)
 
                ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n",
                               fcp_rsp_iu->fcp_sns_len);
-               memcpy(&scpnt->sense_buffer,
+               memcpy(scpnt->sense_buffer,
                       zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len);
                ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE,
-                             (void *) &scpnt->sense_buffer, sns_len);
+                             (void *)scpnt->sense_buffer, sns_len);
        }
 
        /* check for overrun */
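One way to read the zfcp change above: it stops taking the address of sense_buffer and passes it directly, which only changes behaviour if the member is (or becomes) a pointer rather than an embedded array. A small illustration of why that distinction matters (both struct layouts here are hypothetical):

#include <stdio.h>
#include <string.h>

struct cmd_array   { char sense_buffer[96]; };
struct cmd_pointer { char *sense_buffer; };

int main(void)
{
        struct cmd_array a;
        char storage[96];
        struct cmd_pointer p = { .sense_buffer = storage };

        /* Array member: &a.sense_buffer and a.sense_buffer name the same
         * bytes (only the types differ), so the old "&" was harmless. */
        printf("array member:   %p vs %p\n",
               (void *)&a.sense_buffer, (void *)a.sense_buffer);

        /* Pointer member: &p.sense_buffer is the address of the pointer
         * itself, not of the sense data -- copying there corrupts it. */
        printf("pointer member: %p vs %p\n",
               (void *)&p.sense_buffer, (void *)p.sense_buffer);

        memcpy(p.sense_buffer, "ok", 3);        /* the correct destination */
        return 0;
}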
index 1c244832c6c8aa0049d560883c3b959845050b83..b4912d1cee2aa164e81a82ffaefd9e01d991b2b0 100644 (file)
@@ -1990,7 +1990,6 @@ static struct scsi_host_template driver_template = {
        .max_sectors            = TW_MAX_SECTORS,
        .cmd_per_lun            = TW_MAX_CMDS_PER_LUN,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .shost_attrs            = twa_host_attrs,
        .emulated               = 1
 };
index 59716ebeb10c1a5c100d42d5941f18e2ba1769ff..d09532162217b5f04376845546bbced6ada5b05a 100644 (file)
@@ -2261,7 +2261,6 @@ static struct scsi_host_template driver_template = {
        .max_sectors            = TW_MAX_SECTORS,
        .cmd_per_lun            = TW_MAX_CMDS_PER_LUN,  
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .shost_attrs            = tw_host_attrs,
        .emulated               = 1
 };
index ead47c143ce04fd9792da0d30c9a7b31f7d0db03..4d3ebb1af49022e9777a6472f368d96e1a6dec45 100644 (file)
@@ -3575,7 +3575,6 @@ static struct scsi_host_template Bus_Logic_template = {
        .unchecked_isa_dma = 1,
        .max_sectors = 128,
        .use_clustering = ENABLE_CLUSTERING,
-       .use_sg_chaining = ENABLE_SG_CHAINING,
 };
 
 /*
index 3e161cd664637292927686aaf954e23dfc3b1a02..14fc7f39e83e0be1ad26267daad4853544775223 100644 (file)
@@ -345,7 +345,7 @@ config ISCSI_TCP
 
 config SGIWD93_SCSI
        tristate "SGI WD93C93 SCSI Driver"
-       depends on SGI_IP22 && SCSI
+       depends on SGI_HAS_WD93 && SCSI
        help
          If you have a Western Digital WD93 SCSI controller on
          an SGI MIPS system, say Y.  Otherwise, say N.
index 137d065db3da037f350a53c0afb1afa4ad8a187b..6961f78742aedbc86de34d810891737eeece4ec9 100644 (file)
@@ -1065,7 +1065,6 @@ static struct scsi_host_template driver_template =
      .cmd_per_lun              = 1                     /* commands per lun */, 
      .unchecked_isa_dma        = 1                     /* unchecked_isa_dma */,
      .use_clustering           = ENABLE_CLUSTERING,
-     .use_sg_chaining           = ENABLE_SG_CHAINING,
 };
 
 #include "scsi_module.c"
index d3a6d15fb77af90ead7ee78bfd1deed94e5bef8f..f608d4a1d6daeaa8b44ca46d25cb3fe2b3cc2276 100644 (file)
@@ -1071,7 +1071,6 @@ static struct scsi_host_template inia100_template = {
        .sg_tablesize           = SG_ALL,
        .cmd_per_lun            = 1,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 static int __devinit inia100_probe_one(struct pci_dev *pdev,
index 851a7e599c500bc5f4e24b2a3aba4a23a4e153f9..f8afa358b6b67fba3ba9aa36293ecfdbfd9be01d 100644 (file)
@@ -243,7 +243,6 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
         *      Search the list of AdapterFibContext addresses on the adapter
         *      to be sure this is a valid address
         */
-       spin_lock_irqsave(&dev->fib_lock, flags);
        entry = dev->fib_list.next;
        fibctx = NULL;
 
@@ -252,25 +251,24 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg)
                /*
                 *      Extract the AdapterFibContext from the Input parameters.
                 */
-               if (fibctx->unique == f.fibctx) { /* We found a winner */
+               if (fibctx->unique == f.fibctx) {   /* We found a winner */
                        break;
                }
                entry = entry->next;
                fibctx = NULL;
        }
        if (!fibctx) {
-               spin_unlock_irqrestore(&dev->fib_lock, flags);
                dprintk ((KERN_INFO "Fib Context not found\n"));
                return -EINVAL;
        }
 
        if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) ||
                 (fibctx->size != sizeof(struct aac_fib_context))) {
-               spin_unlock_irqrestore(&dev->fib_lock, flags);
                dprintk ((KERN_INFO "Fib Context corrupt?\n"));
                return -EINVAL;
        }
        status = 0;
+       spin_lock_irqsave(&dev->fib_lock, flags);
        /*
         *      If there are no fibs to send back, then either wait or return
         *      -EAGAIN
@@ -328,9 +326,7 @@ return_fib:
 int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
 {
        struct fib *fib;
-       unsigned long flags;
 
-       spin_lock_irqsave(&dev->fib_lock, flags);
        /*
         *      First free any FIBs that have not been consumed.
         */
@@ -353,7 +349,6 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx)
         *      Remove the Context from the AdapterFibContext List
         */
        list_del(&fibctx->next);
-       spin_unlock_irqrestore(&dev->fib_lock, flags);
        /*
         *      Invalidate context
         */
@@ -419,8 +414,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg)
  *     @arg: ioctl arguments
  *
  *     This routine returns the driver version.
- *     Under Linux, there have been no version incompatibilities, so this is
- *     simple!
+ *      Under Linux, there have been no version incompatibilities, so this is
+ *      simple!
  */
 
 static int check_revision(struct aac_dev *dev, void __user *arg)
@@ -468,7 +463,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
        u32 data_dir;
        void __user *sg_user[32];
        void *sg_list[32];
-       u32 sg_indx = 0;
+       u32   sg_indx = 0;
        u32 byte_count = 0;
        u32 actual_fibsize64, actual_fibsize = 0;
        int i;
@@ -522,11 +517,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
        // Fix up srb for endian and force some values
 
        srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi);       // Force this
-       srbcmd->channel  = cpu_to_le32(user_srbcmd->channel);
+       srbcmd->channel  = cpu_to_le32(user_srbcmd->channel);
        srbcmd->id       = cpu_to_le32(user_srbcmd->id);
-       srbcmd->lun      = cpu_to_le32(user_srbcmd->lun);
-       srbcmd->timeout  = cpu_to_le32(user_srbcmd->timeout);
-       srbcmd->flags    = cpu_to_le32(flags);
+       srbcmd->lun      = cpu_to_le32(user_srbcmd->lun);
+       srbcmd->timeout  = cpu_to_le32(user_srbcmd->timeout);
+       srbcmd->flags    = cpu_to_le32(flags);
        srbcmd->retry_limit = 0; // Obsolete parameter
        srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size);
        memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb));
@@ -791,9 +786,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg)
        pci_info.bus = dev->pdev->bus->number;
        pci_info.slot = PCI_SLOT(dev->pdev->devfn);
 
-       if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
-               dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
-               return -EFAULT;
+       if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) {
+              dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n"));
+              return -EFAULT;
        }
        return 0;
 }
index 61be22774e99786390ca0702cb932b3aa1812799..0e8267c1e9155a29df404823454a47268442a667 100644 (file)
@@ -1032,7 +1032,6 @@ static struct scsi_host_template aac_driver_template = {
        .cmd_per_lun                    = AAC_NUM_IO_FIB,
 #endif
        .use_clustering                 = ENABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
        .emulated                       = 1,
 };
 
index be58a0b097c76752d0827ee097c4c198db09fe17..7c45d88a205bfd2a9fd2d916f228f796d42f2cd5 100644 (file)
@@ -563,7 +563,6 @@ static struct scsi_host_template aha1740_template = {
        .sg_tablesize     = AHA1740_SCATTER,
        .cmd_per_lun      = AHA1740_CMDLUN,
        .use_clustering   = ENABLE_CLUSTERING,
-       .use_sg_chaining  = ENABLE_SG_CHAINING,
        .eh_abort_handler = aha1740_eh_abort_handler,
 };
 
index ce638aa6005ad93537cf4b86b15c6cc453b31f09..2f00467b6b8c8bd12418614d186e475f2fd24590 100644 (file)
@@ -1340,8 +1340,10 @@ struct   ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t);
 int                      ahd_pci_config(struct ahd_softc *,
                                         struct ahd_pci_identity *);
 int    ahd_pci_test_register_access(struct ahd_softc *);
+#ifdef CONFIG_PM
 void   ahd_pci_suspend(struct ahd_softc *);
 void   ahd_pci_resume(struct ahd_softc *);
+#endif
 
 /************************** SCB and SCB queue management **********************/
 void           ahd_qinfifo_requeue_tail(struct ahd_softc *ahd,
@@ -1352,8 +1354,10 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name);
 int                     ahd_softc_init(struct ahd_softc *);
 void                    ahd_controller_info(struct ahd_softc *ahd, char *buf);
 int                     ahd_init(struct ahd_softc *ahd);
+#ifdef CONFIG_PM
 int                     ahd_suspend(struct ahd_softc *ahd);
 void                    ahd_resume(struct ahd_softc *ahd);
+#endif
 int                     ahd_default_config(struct ahd_softc *ahd);
 int                     ahd_parse_vpddata(struct ahd_softc *ahd,
                                           struct vpd_config *vpd);
@@ -1361,7 +1365,6 @@ int                        ahd_parse_cfgdata(struct ahd_softc *ahd,
                                           struct seeprom_config *sc);
 void                    ahd_intr_enable(struct ahd_softc *ahd, int enable);
 void                    ahd_pause_and_flushwork(struct ahd_softc *ahd);
-int                     ahd_suspend(struct ahd_softc *ahd); 
 void                    ahd_set_unit(struct ahd_softc *, int);
 void                    ahd_set_name(struct ahd_softc *, char *);
 struct scb             *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx);
index a7dd8cdda472d6f31ac5ea57dbd7688da4a94c8e..ade0fb8fbdb23fa9264174e9b6b2c08a972e42f2 100644 (file)
@@ -7175,6 +7175,7 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd)
        ahd->flags &= ~AHD_ALL_INTERRUPTS;
 }
 
+#ifdef CONFIG_PM
 int
 ahd_suspend(struct ahd_softc *ahd)
 {
@@ -7197,6 +7198,7 @@ ahd_resume(struct ahd_softc *ahd)
        ahd_intr_enable(ahd, TRUE); 
        ahd_restart(ahd);
 }
+#endif
 
 /************************** Busy Target Table *********************************/
 /*
index 0e4708fd43c8e00d861b80392c976683d43145fa..01465479290135bcd4d03719880bc360fc28947e 100644 (file)
@@ -766,7 +766,6 @@ struct scsi_host_template aic79xx_driver_template = {
        .max_sectors            = 8192,
        .cmd_per_lun            = 2,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .slave_alloc            = ahd_linux_slave_alloc,
        .slave_configure        = ahd_linux_slave_configure,
        .target_alloc           = ahd_linux_target_alloc,
@@ -1922,7 +1921,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd)
                                struct scsi_sense_data *sense;
                                
                                sense = (struct scsi_sense_data *)
-                                       &cmd->sense_buffer;
+                                       cmd->sense_buffer;
                                if (sense->extra_len >= 5 &&
                                    (sense->add_sense_code == 0x47
                                     || sense->add_sense_code == 0x48))
index 66f0259edb697c34f25ea3e410a963ce2b814e70..4150c8a8fdc2546cdd21224dd0f1dea987ee2416 100644 (file)
 #include "aic79xx_inline.h"
 #include "aic79xx_pci.h"
 
-static int     ahd_linux_pci_dev_probe(struct pci_dev *pdev,
-                                       const struct pci_device_id *ent);
-static int     ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd,
-                                                u_long *base, u_long *base2);
-static int     ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd,
-                                                u_long *bus_addr,
-                                                uint8_t __iomem **maddr);
-static int     ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int     ahd_linux_pci_dev_resume(struct pci_dev *pdev);
-static void    ahd_linux_pci_dev_remove(struct pci_dev *pdev);
-
 /* Define the macro locally since it's different for different class of chips.
  */
 #define ID(x)            \
@@ -85,17 +74,7 @@ static struct pci_device_id ahd_linux_pci_id_table[] = {
 
 MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table);
 
-static struct pci_driver aic79xx_pci_driver = {
-       .name           = "aic79xx",
-       .probe          = ahd_linux_pci_dev_probe,
 #ifdef CONFIG_PM
-       .suspend        = ahd_linux_pci_dev_suspend,
-       .resume         = ahd_linux_pci_dev_resume,
-#endif
-       .remove         = ahd_linux_pci_dev_remove,
-       .id_table       = ahd_linux_pci_id_table
-};
-
 static int
 ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
 {
@@ -139,6 +118,7 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev)
 
        return rc;
 }
+#endif
 
 static void
 ahd_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -245,6 +225,17 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        return (0);
 }
 
+static struct pci_driver aic79xx_pci_driver = {
+       .name           = "aic79xx",
+       .probe          = ahd_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+       .suspend        = ahd_linux_pci_dev_suspend,
+       .resume         = ahd_linux_pci_dev_resume,
+#endif
+       .remove         = ahd_linux_pci_dev_remove,
+       .id_table       = ahd_linux_pci_id_table
+};
+
 int
 ahd_linux_pci_init(void)
 {
index 7a203a90601ae06c4e434f36b3f719e5a3eea204..df853676e66a8d7174c81862e65559c99fd45121 100644 (file)
@@ -389,6 +389,7 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry)
        return error;
 }
 
+#ifdef CONFIG_PM
 void
 ahd_pci_suspend(struct ahd_softc *ahd)
 {
@@ -415,6 +416,7 @@ ahd_pci_resume(struct ahd_softc *ahd)
        ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME,
                             ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1);
 }
+#endif
 
 /*
  * Perform some simple tests that should catch situations where
index 3d4e42d904521c2f01768506aee696ed799174f6..c0344e61765112195bb711ad25137e34da58fc61 100644 (file)
@@ -1143,7 +1143,9 @@ struct ahc_pci_identity   *ahc_find_pci_device(ahc_dev_softc_t);
 int                     ahc_pci_config(struct ahc_softc *,
                                        struct ahc_pci_identity *);
 int                     ahc_pci_test_register_access(struct ahc_softc *);
+#ifdef CONFIG_PM
 void                    ahc_pci_resume(struct ahc_softc *ahc);
+#endif
 
 /*************************** EISA/VL Front End ********************************/
 struct aic7770_identity *aic7770_find_device(uint32_t);
@@ -1170,8 +1172,10 @@ int                       ahc_chip_init(struct ahc_softc *ahc);
 int                     ahc_init(struct ahc_softc *ahc);
 void                    ahc_intr_enable(struct ahc_softc *ahc, int enable);
 void                    ahc_pause_and_flushwork(struct ahc_softc *ahc);
+#ifdef CONFIG_PM
 int                     ahc_suspend(struct ahc_softc *ahc); 
 int                     ahc_resume(struct ahc_softc *ahc);
+#endif
 void                    ahc_set_unit(struct ahc_softc *, int);
 void                    ahc_set_name(struct ahc_softc *, char *);
 void                    ahc_alloc_scbs(struct ahc_softc *ahc);
index f350b5e89e76d01485cb9fb22cb16c8a8d2c967c..6d2ae641273c9fe56826043fb623761fe1063bb9 100644 (file)
@@ -5078,6 +5078,7 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc)
        ahc->flags &= ~AHC_ALL_INTERRUPTS;
 }
 
+#ifdef CONFIG_PM
 int
 ahc_suspend(struct ahc_softc *ahc)
 {
@@ -5113,7 +5114,7 @@ ahc_resume(struct ahc_softc *ahc)
        ahc_restart(ahc);
        return (0);
 }
-
+#endif
 /************************** Busy Target Table *********************************/
 /*
  * Return the untagged transaction id for a given target/channel lun.
index e310e414067fc767ef0e8ede1a6864d90650ca6a..99a3b33a3233591834e670c95b8152af9df3d12d 100644 (file)
@@ -747,7 +747,6 @@ struct scsi_host_template aic7xxx_driver_template = {
        .max_sectors            = 8192,
        .cmd_per_lun            = 2,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .slave_alloc            = ahc_linux_slave_alloc,
        .slave_configure        = ahc_linux_slave_configure,
        .target_alloc           = ahc_linux_target_alloc,
@@ -1658,9 +1657,12 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb)
                untagged_q = &(ahc->untagged_queues[target_offset]);
                TAILQ_REMOVE(untagged_q, scb, links.tqe);
                BUG_ON(!TAILQ_EMPTY(untagged_q));
-       }
-
-       if ((scb->flags & SCB_ACTIVE) == 0) {
+       } else if ((scb->flags & SCB_ACTIVE) == 0) {
+               /*
+                * Transactions aborted from the untagged queue may
+                * not have been dispatched to the controller, so
+                * only check the SCB_ACTIVE flag for tagged transactions.
+                */
                printf("SCB %d done'd twice\n", scb->hscb->tag);
                ahc_dump_card_state(ahc);
                panic("Stopping for safety");
index 4488946cff2e518e501686852090b02f44fc6b62..dd6e21d6f1dd36ae12ec4b18eebb1e7e53c6ce9b 100644 (file)
 #include "aic7xxx_osm.h"
 #include "aic7xxx_pci.h"
 
-static int     ahc_linux_pci_dev_probe(struct pci_dev *pdev,
-                                       const struct pci_device_id *ent);
-static int     ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc,
-                                               u_long *base);
-static int     ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc,
-                                                u_long *bus_addr,
-                                                uint8_t __iomem **maddr);
-static int     ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg);
-static int     ahc_linux_pci_dev_resume(struct pci_dev *pdev);
-static void    ahc_linux_pci_dev_remove(struct pci_dev *pdev);
-
 /* Define the macro locally since it's different for different class of chips.
 */
 #define ID(x)  ID_C(x, PCI_CLASS_STORAGE_SCSI)
@@ -132,17 +121,7 @@ static struct pci_device_id ahc_linux_pci_id_table[] = {
 
 MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table);
 
-static struct pci_driver aic7xxx_pci_driver = {
-       .name           = "aic7xxx",
-       .probe          = ahc_linux_pci_dev_probe,
 #ifdef CONFIG_PM
-       .suspend        = ahc_linux_pci_dev_suspend,
-       .resume         = ahc_linux_pci_dev_resume,
-#endif
-       .remove         = ahc_linux_pci_dev_remove,
-       .id_table       = ahc_linux_pci_id_table
-};
-
 static int
 ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg)
 {
@@ -182,6 +161,7 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev)
 
        return (ahc_resume(ahc));
 }
+#endif
 
 static void
 ahc_linux_pci_dev_remove(struct pci_dev *pdev)
@@ -289,6 +269,17 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        return (0);
 }
 
+static struct pci_driver aic7xxx_pci_driver = {
+       .name           = "aic7xxx",
+       .probe          = ahc_linux_pci_dev_probe,
+#ifdef CONFIG_PM
+       .suspend        = ahc_linux_pci_dev_suspend,
+       .resume         = ahc_linux_pci_dev_resume,
+#endif
+       .remove         = ahc_linux_pci_dev_remove,
+       .id_table       = ahc_linux_pci_id_table
+};
+
 int
 ahc_linux_pci_init(void)
 {
index ae35937b8055b346f394dcb994006f8465c2abcf..56848f41e4f961f7a70d66964a8d770671ff0663 100644 (file)
@@ -2020,6 +2020,7 @@ ahc_pci_chip_init(struct ahc_softc *ahc)
        return (ahc_chip_init(ahc));
 }
 
+#ifdef CONFIG_PM
 void
 ahc_pci_resume(struct ahc_softc *ahc)
 {
@@ -2051,6 +2052,7 @@ ahc_pci_resume(struct ahc_softc *ahc)
                ahc_release_seeprom(&sd);
        }
 }
+#endif
 
 static int
 ahc_aic785X_setup(struct ahc_softc *ahc)
index bcb0b870320c166b56f2c08c6ee07113d6fa915d..3bfd9296bbfaaa75dc314d9be939c05eb2688b82 100644 (file)
@@ -11141,7 +11141,6 @@ static struct scsi_host_template driver_template = {
        .max_sectors            = 2048,
        .cmd_per_lun            = 3,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 #include "scsi_module.c"
index d80dba913a750c8e051b16ec28215348cb55ef41..f4a202e8df267329e44b15cbe48ad546911de118 100644 (file)
@@ -122,7 +122,6 @@ static struct scsi_host_template arcmsr_scsi_host_template = {
        .max_sectors            = ARCMSR_MAX_XFER_SECTORS,
        .cmd_per_lun            = ARCMSR_MAX_CMD_PERLUN,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .shost_attrs            = arcmsr_host_attrs,
 };
 #ifdef CONFIG_SCSI_ARCMSR_AER
index f93c73c0ba53d7081a187f17e5c17bf7224da77d..22ef3716e7864168977b4e7967358bf0b6496b1a 100644 (file)
@@ -4763,7 +4763,6 @@ static struct scsi_host_template dc395x_driver_template = {
        .eh_bus_reset_handler   = dc395x_eh_bus_reset,
        .unchecked_isa_dma      = 0,
        .use_clustering         = DISABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 
index 19cce125124c8d423fc9cd500175f2b70bd9e7ce..c9dd8392aab20f138916d6b46150aba96b336483 100644 (file)
@@ -3340,7 +3340,6 @@ static struct scsi_host_template driver_template = {
        .this_id                = 7,
        .cmd_per_lun            = 1,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 #include "scsi_module.c"
 MODULE_LICENSE("GPL");
index 05163cefec12f7f2f806009de76da5d77767a0ed..8be3d76656faab6fdbfa92548febc77bb4db03e5 100644 (file)
@@ -524,7 +524,6 @@ static struct scsi_host_template driver_template = {
        .this_id = 7,
        .unchecked_isa_dma = 1,
        .use_clustering = ENABLE_CLUSTERING,
-       .use_sg_chaining = ENABLE_SG_CHAINING,
 };
 
 #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
index 5ea1f986220cf052e2786d6f91a5d7e86aebcf39..880c78bff0e14a35e2ac6d6677b213946eca7ee8 100644 (file)
@@ -342,7 +342,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
        shost->use_clustering = sht->use_clustering;
        shost->ordered_tag = sht->ordered_tag;
        shost->active_mode = sht->supported_mode;
-       shost->use_sg_chaining = sht->use_sg_chaining;
 
        if (sht->supported_mode == MODE_UNKNOWN)
                /* means we didn't set it ... default to INITIATOR */
index e7b2f3575ce99466c1ddf50991744205b4d0c0fa..ff149ad6bc4ed103e44f7497b174391a94ede468 100644 (file)
@@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag,
                scsi_set_resid(scp,
                        scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length));
                scp->result = SAM_STAT_CHECK_CONDITION;
-               memcpy(&scp->sense_buffer, &req->sg_list,
+               memcpy(scp->sense_buffer, &req->sg_list,
                                min_t(size_t, SCSI_SENSE_BUFFERSIZE,
                                        le32_to_cpu(req->dataxfer_length)));
                break;
@@ -906,7 +906,6 @@ static struct scsi_host_template driver_template = {
        .unchecked_isa_dma          = 0,
        .emulated                   = 0,
        .use_clustering             = ENABLE_CLUSTERING,
-       .use_sg_chaining            = ENABLE_SG_CHAINING,
        .proc_name                  = driver_name,
        .shost_attrs                = hptiop_attrs,
        .this_id                    = -1,
index db004a45073277edbd946cff38e941893de01a95..4d15a62914e92f9ea322557be2a798c90b154706 100644 (file)
@@ -1501,7 +1501,6 @@ static struct scsi_host_template ibmmca_driver_template = {
           .sg_tablesize   = 16,
           .cmd_per_lun    = 1,
           .use_clustering = ENABLE_CLUSTERING,
-          .use_sg_chaining = ENABLE_SG_CHAINING,
 };
 
 static int ibmmca_probe(struct device *dev)
index 30819012898fca80eb828e81ed2a3aa2c3d308cd..78d46a900bb51a5e43d8877bb0882f983e6ae13f 100644 (file)
@@ -1600,7 +1600,6 @@ static struct scsi_host_template driver_template = {
        .this_id = -1,
        .sg_tablesize = SG_ALL,
        .use_clustering = ENABLE_CLUSTERING,
-       .use_sg_chaining = ENABLE_SG_CHAINING,
        .shost_attrs = ibmvscsi_attrs,
 };
 
index a10a5c74b48db6d2150905b252f52fbc3b0d3362..0cc8868ea35d03de01ebfbf25df8ab25f634f5b8 100644 (file)
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
        .sg_tablesize           = SG_ALL,
        .cmd_per_lun            = 1,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 static int initio_probe_one(struct pci_dev *pdev,
index e5be5fd4ef583f10ba907721da19249999d609bb..b6f99dfbb038b91d02ef59d473a1052d4ab7b82b 100644 (file)
@@ -1933,7 +1933,6 @@ static struct scsi_host_template iscsi_sht = {
        .eh_device_reset_handler= iscsi_eh_device_reset,
        .eh_host_reset_handler  = iscsi_eh_host_reset,
        .use_clustering         = DISABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .slave_configure        = iscsi_tcp_slave_configure,
        .proc_name              = "iscsi_tcp",
        .this_id                = -1,
index 5cff0204227dc8267ff6443cefa76eb248360f16..6d6a76e65a6c3fc9e391c17b2fe34cf61176f718 100644 (file)
@@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info,
 
        sc->SCp.ptr = info;
        memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE);
-       sc->request_bufflen = len;
-       sc->request_buffer = (void *) (unsigned long) addr;
+       sc->sdb.length = len;
+       sc->sdb.table.sgl = (void *) (unsigned long) addr;
        sc->tag = tag;
        err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun,
                                     cmd->tag);
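
This hunk pokes the new fields directly because the target path builds its own command; everywhere else in the series the drivers go through accessors over the scsi_data_buffer now embedded in struct scsi_cmnd. Judging from how the hunks below use sdb.length, sdb.table.sgl, sdb.table.nents, and sdb.resid, the accessor layer presumably reduces to something like the following sketch; the stand-in types are illustrative, not copied from the headers:

    struct scatterlist;                      /* opaque here */

    struct sg_table_sketch {
            struct scatterlist *sgl;         /* the (possibly chained) list  */
            unsigned int nents;              /* number of entries in use     */
    };

    struct scsi_data_buffer_sketch {
            struct sg_table_sketch table;
            unsigned length;                 /* total transfer length        */
            int resid;                       /* residual count after the I/O */
    };

    struct scsi_cmnd_sketch {                /* stand-in for struct scsi_cmnd */
            struct scsi_data_buffer_sketch sdb;
    };

    static inline unsigned scsi_bufflen(struct scsi_cmnd_sketch *cmd)
    { return cmd->sdb.length; }

    static inline unsigned scsi_sg_count(struct scsi_cmnd_sketch *cmd)
    { return cmd->sdb.table.nents; }

    static inline struct scatterlist *scsi_sglist(struct scsi_cmnd_sketch *cmd)
    { return cmd->sdb.table.sgl; }

    static inline void scsi_set_resid(struct scsi_cmnd_sketch *cmd, int resid)
    { cmd->sdb.resid = resid; }

    static inline int scsi_get_resid(struct scsi_cmnd_sketch *cmd)
    { return cmd->sdb.resid; }

    int main(void)
    {
            struct scsi_cmnd_sketch c = { .sdb = { .length = 512, .resid = 0 } };
            return scsi_bufflen(&c) == 512 ? 0 : 1;
    }

This is why hunks such as the scsi.c and sd.c ones further down can swap cmd->request_bufflen for scsi_bufflen(cmd) without any behavioural change.
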
index 6483c62730b3b8b71c55b10389c881ce2c222d23..fc5c3a42b05a67c0b9b37f8ae03a68521529f2e1 100644 (file)
@@ -1459,7 +1459,6 @@ struct scsi_host_template lpfc_template = {
        .scan_finished          = lpfc_scan_finished,
        .this_id                = -1,
        .sg_tablesize           = LPFC_DEFAULT_SG_SEG_CNT,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .cmd_per_lun            = LPFC_CMD_PER_LUN,
        .use_clustering         = ENABLE_CLUSTERING,
        .shost_attrs            = lpfc_hba_attrs,
@@ -1482,7 +1481,6 @@ struct scsi_host_template lpfc_vport_template = {
        .sg_tablesize           = LPFC_DEFAULT_SG_SEG_CNT,
        .cmd_per_lun            = LPFC_CMD_PER_LUN,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .shost_attrs            = lpfc_vport_attrs,
        .max_sectors            = 0xFFFF,
 };
index a035001f44386a36e49abcd3d292fb31239cf85e..b12ad7c7c6736908419987896a2402898ba2620e 100644 (file)
@@ -402,7 +402,6 @@ static struct scsi_host_template mac53c94_template = {
        .sg_tablesize   = SG_ALL,
        .cmd_per_lun    = 1,
        .use_clustering = DISABLE_CLUSTERING,
-       .use_sg_chaining = ENABLE_SG_CHAINING,
 };
 
 static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
index 765c24d2bc3842d54da9ffcc214cff782d46eaaa..4d59ae8491a4edac9f6fae0138202e639ee82b93 100644 (file)
@@ -4490,7 +4490,6 @@ static struct scsi_host_template megaraid_template = {
        .sg_tablesize                   = MAX_SGLIST,
        .cmd_per_lun                    = DEF_CMD_PER_LUN,
        .use_clustering                 = ENABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
        .eh_abort_handler               = megaraid_abort,
        .eh_device_reset_handler        = megaraid_reset,
        .eh_bus_reset_handler           = megaraid_reset,
index 24e32e446e76048ed461c105539e730a9376ad98..6db77c00e3eed4b802eeab7b9ec662194522d526 100644 (file)
@@ -361,7 +361,6 @@ static struct scsi_host_template megaraid_template_g = {
        .eh_host_reset_handler          = megaraid_reset_handler,
        .change_queue_depth             = megaraid_change_queue_depth,
        .use_clustering                 = ENABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
        .sdev_attrs                     = megaraid_sdev_attrs,
        .shost_attrs                    = megaraid_shost_attrs,
 };
index d7ec921865c4fdf4a911fd6c46f97e6b12ff9cd5..672c759ac24d7719c3371c191d572e25bd8c4d4f 100644 (file)
@@ -1192,7 +1192,6 @@ static struct scsi_host_template megasas_template = {
        .eh_timed_out = megasas_reset_timer,
        .bios_param = megasas_bios_param,
        .use_clustering = ENABLE_CLUSTERING,
-       .use_sg_chaining = ENABLE_SG_CHAINING,
 };
 
 /**
index 7470ff39ab22f59bd09e8140ba42811730b50230..651d09b08f2a22eb6283da21830736271d1d4cd8 100644 (file)
@@ -1843,7 +1843,6 @@ static struct scsi_host_template mesh_template = {
        .sg_tablesize                   = SG_ALL,
        .cmd_per_lun                    = 2,
        .use_clustering                 = DISABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
 };
 
 static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
index c02771aa6c9b4d76d6abc52c509ec9c0327b83f3..c5ebf018b378fc3d026b3bb02eda29f8731be27a 100644 (file)
@@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp)
                             sizeof(cp->sense_buf)));
 
                if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) {
-                       u_char * p = (u_char*) & cmd->sense_buffer;
+                       u_char *p = cmd->sense_buffer;
                        int i;
                        PRINT_ADDR(cmd, "sense data:");
                        for (i=0; i<14; i++) printk (" %x", *p++);
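
This hunk, like the aic79xx and hptiop ones above, drops a stray & in front of cmd->sense_buffer / scp->sense_buffer. With the old embedded sense array the extra & changed only the expression's type, not its value; once the sense buffer becomes a separately allocated pointer member (as it appears to elsewhere in this series), &cmd->sense_buffer is the address of the pointer itself and the old code would read the wrong memory. A standalone illustration with a hypothetical struct:

    #include <stdio.h>

    struct with_array   { unsigned char sense_buffer[96]; };
    struct with_pointer { unsigned char *sense_buffer; };

    int main(void)
    {
            static unsigned char backing[96];
            struct with_array   a;
            struct with_pointer p = { .sense_buffer = backing };

            /* Array member: both expressions refer to the same bytes. */
            printf("array:   %p vs %p\n",
                   (void *)a.sense_buffer, (void *)&a.sense_buffer);

            /* Pointer member: &p.sense_buffer is where the pointer lives,
             * not the buffer it points to, so copying from it is a bug. */
            printf("pointer: %p vs %p\n",
                   (void *)p.sense_buffer, (void *)&p.sense_buffer);
            return 0;
    }
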
index 28161dc95e0d7107613933da5186e50f265a0ded..7fed35372150afe68d619d61d4faa4a2a4588437 100644 (file)
@@ -281,7 +281,6 @@ static struct scsi_host_template nsp32_template = {
        .cmd_per_lun                    = 1,
        .this_id                        = NSP32_HOST_SCSIID,
        .use_clustering                 = DISABLE_CLUSTERING,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
        .eh_abort_handler               = nsp32_eh_abort,
        .eh_bus_reset_handler           = nsp32_eh_bus_reset,
        .eh_host_reset_handler          = nsp32_eh_host_reset,
index 969b9387a0c370c8990ff2608d1238441df5fa94..3454a57147496cddc1d012699fbd8334b8e8e2e8 100644 (file)
@@ -692,7 +692,6 @@ static struct scsi_host_template sym53c500_driver_template = {
      .sg_tablesize             = 32,
      .cmd_per_lun              = 1,
      .use_clustering           = ENABLE_CLUSTERING,
-     .use_sg_chaining          = ENABLE_SG_CHAINING,
      .shost_attrs              = SYM53C500_shost_attrs
 };
 
index c94906abfee353137b4483dbcfcc2346cc4aeab9..68c0d09ffe786fe6416f69e948fd0a82e85d5e28 100644 (file)
@@ -4204,7 +4204,6 @@ static struct scsi_host_template qla1280_driver_template = {
        .sg_tablesize           = SG_ALL,
        .cmd_per_lun            = 1,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 
index aba1e6d480669ff6e9f932e55e6364f397d050df..3954ed2d7b517acbd183f30049625bf43a3750b2 100644 (file)
@@ -131,7 +131,6 @@ static struct scsi_host_template qla2x00_driver_template = {
        .this_id                = -1,
        .cmd_per_lun            = 3,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .sg_tablesize           = SG_ALL,
 
        /*
@@ -163,7 +162,6 @@ struct scsi_host_template qla24xx_driver_template = {
        .this_id                = -1,
        .cmd_per_lun            = 3,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .sg_tablesize           = SG_ALL,
 
        .max_sectors            = 0xFFFF,
index d3f86646cb088b0fecbbd9edcd0b26c8b51bebd1..2e2b9fedffcc7c763b1f1d8f70df0faa38e27f61 100644 (file)
@@ -94,7 +94,6 @@ static struct scsi_host_template qla4xxx_driver_template = {
        .this_id                = -1,
        .cmd_per_lun            = 3,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .sg_tablesize           = SG_ALL,
 
        .max_sectors            = 0xFFFF,
index 1769f965eedf496e9e0602365220641c5caa2fd1..1e874f1fb5c64a403fd6e7da8365edce51b929df 100644 (file)
@@ -197,7 +197,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
        .sg_tablesize           = SG_ALL,
        .cmd_per_lun            = 1,
        .use_clustering         = DISABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 static __init int qlogicfas_init(void)
index 1a9fba6a9f92ac7ad60d05f1923f8fb45ccca574..b35d19472caa080a7a8614a8e211216b001e8f62 100644 (file)
@@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
                                "Notifying upper driver of completion "
                                "(result %x)\n", cmd->result));
 
-       good_bytes = cmd->request_bufflen;
+       good_bytes = scsi_bufflen(cmd);
         if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) {
                drv = scsi_cmd_to_driver(cmd);
                if (drv->done)
index 82c06f0a9d0201ae42deb14bec2f2f0f4a239fa9..1541c174937ac81017502253116baeda7350f7c5 100644 (file)
@@ -280,6 +280,8 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba,
                      unsigned int num, struct sdebug_dev_info * devip);
 static int resp_report_luns(struct scsi_cmnd * SCpnt,
                            struct sdebug_dev_info * devip);
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+                           unsigned int num, struct sdebug_dev_info *devip);
 static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
                                 int arr_len);
 static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
@@ -311,12 +313,48 @@ static void sdebug_max_tgts_luns(void);
 static struct device pseudo_primary;
 static struct bus_type pseudo_lld_bus;
 
+static void get_data_transfer_info(unsigned char *cmd,
+                                  unsigned long long *lba, unsigned int *num)
+{
+       int i;
+
+       switch (*cmd) {
+       case WRITE_16:
+       case READ_16:
+               for (*lba = 0, i = 0; i < 8; ++i) {
+                       if (i > 0)
+                               *lba <<= 8;
+                       *lba += cmd[2 + i];
+               }
+               *num = cmd[13] + (cmd[12] << 8) +
+                       (cmd[11] << 16) + (cmd[10] << 24);
+               break;
+       case WRITE_12:
+       case READ_12:
+               *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+               *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24);
+               break;
+       case WRITE_10:
+       case READ_10:
+       case XDWRITEREAD_10:
+               *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
+               *num = cmd[8] + (cmd[7] << 8);
+               break;
+       case WRITE_6:
+       case READ_6:
+               *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16);
+               *num = (0 == cmd[4]) ? 256 : cmd[4];
+               break;
+       default:
+               break;
+       }
+}
 
 static
 int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
 {
        unsigned char *cmd = (unsigned char *) SCpnt->cmnd;
-       int len, k, j;
+       int len, k;
        unsigned int num;
        unsigned long long lba;
        int errsts = 0;
@@ -452,28 +490,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
                        break;
                if (scsi_debug_fake_rw)
                        break;
-               if ((*cmd) == READ_16) {
-                       for (lba = 0, j = 0; j < 8; ++j) {
-                               if (j > 0)
-                                       lba <<= 8;
-                               lba += cmd[2 + j];
-                       }
-                       num = cmd[13] + (cmd[12] << 8) +
-                               (cmd[11] << 16) + (cmd[10] << 24);
-               } else if ((*cmd) == READ_12) {
-                       lba = cmd[5] + (cmd[4] << 8) +
-                               (cmd[3] << 16) + (cmd[2] << 24);
-                       num = cmd[9] + (cmd[8] << 8) +
-                               (cmd[7] << 16) + (cmd[6] << 24);
-               } else if ((*cmd) == READ_10) {
-                       lba = cmd[5] + (cmd[4] << 8) +
-                               (cmd[3] << 16) + (cmd[2] << 24);
-                       num = cmd[8] + (cmd[7] << 8);
-               } else {        /* READ (6) */
-                       lba = cmd[3] + (cmd[2] << 8) +
-                               ((cmd[1] & 0x1f) << 16);
-                       num = (0 == cmd[4]) ? 256 : cmd[4];
-               }
+               get_data_transfer_info(cmd, &lba, &num);
                errsts = resp_read(SCpnt, lba, num, devip);
                if (inj_recovered && (0 == errsts)) {
                        mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -500,28 +517,7 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
                        break;
                if (scsi_debug_fake_rw)
                        break;
-               if ((*cmd) == WRITE_16) {
-                       for (lba = 0, j = 0; j < 8; ++j) {
-                               if (j > 0)
-                                       lba <<= 8;
-                               lba += cmd[2 + j];
-                       }
-                       num = cmd[13] + (cmd[12] << 8) +
-                               (cmd[11] << 16) + (cmd[10] << 24);
-               } else if ((*cmd) == WRITE_12) {
-                       lba = cmd[5] + (cmd[4] << 8) +
-                               (cmd[3] << 16) + (cmd[2] << 24);
-                       num = cmd[9] + (cmd[8] << 8) +
-                               (cmd[7] << 16) + (cmd[6] << 24);
-               } else if ((*cmd) == WRITE_10) {
-                       lba = cmd[5] + (cmd[4] << 8) +
-                               (cmd[3] << 16) + (cmd[2] << 24);
-                       num = cmd[8] + (cmd[7] << 8);
-               } else {        /* WRITE (6) */
-                       lba = cmd[3] + (cmd[2] << 8) +
-                               ((cmd[1] & 0x1f) << 16);
-                       num = (0 == cmd[4]) ? 256 : cmd[4];
-               }
+               get_data_transfer_info(cmd, &lba, &num);
                errsts = resp_write(SCpnt, lba, num, devip);
                if (inj_recovered && (0 == errsts)) {
                        mk_sense_buffer(devip, RECOVERED_ERROR,
@@ -549,6 +545,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done)
        case WRITE_BUFFER:
                errsts = check_readiness(SCpnt, 1, devip);
                break;
+       case XDWRITEREAD_10:
+               if (!scsi_bidi_cmnd(SCpnt)) {
+                       mk_sense_buffer(devip, ILLEGAL_REQUEST,
+                                       INVALID_FIELD_IN_CDB, 0);
+                       errsts = check_condition_result;
+                       break;
+               }
+
+               errsts = check_readiness(SCpnt, 0, devip);
+               if (errsts)
+                       break;
+               if (scsi_debug_fake_rw)
+                       break;
+               get_data_transfer_info(cmd, &lba, &num);
+               errsts = resp_read(SCpnt, lba, num, devip);
+               if (errsts)
+                       break;
+               errsts = resp_write(SCpnt, lba, num, devip);
+               if (errsts)
+                       break;
+               errsts = resp_xdwriteread(SCpnt, lba, num, devip);
+               break;
        default:
                if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
                        printk(KERN_INFO "scsi_debug: Opcode: 0x%x not "
@@ -601,18 +619,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
        int k, req_len, act_len, len, active;
        void * kaddr;
        void * kaddr_off;
-       struct scatterlist * sg;
+       struct scatterlist *sg;
+       struct scsi_data_buffer *sdb = scsi_in(scp);
 
-       if (0 == scsi_bufflen(scp))
+       if (!sdb->length)
                return 0;
-       if (NULL == scsi_sglist(scp))
+       if (!sdb->table.sgl)
                return (DID_ERROR << 16);
-       if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
-             (scp->sc_data_direction == DMA_FROM_DEVICE)))
+       if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE))
                return (DID_ERROR << 16);
        active = 1;
        req_len = act_len = 0;
-       scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
+       for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) {
                if (active) {
                        kaddr = (unsigned char *)
                                kmap_atomic(sg_page(sg), KM_USER0);
@@ -630,10 +648,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
                }
                req_len += sg->length;
        }
-       if (scsi_get_resid(scp))
-               scsi_set_resid(scp, scsi_get_resid(scp) - act_len);
+       if (sdb->resid)
+               sdb->resid -= act_len;
        else
-               scsi_set_resid(scp, req_len - act_len);
+               sdb->resid = req_len - act_len;
        return 0;
 }
 
@@ -650,8 +668,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr,
                return 0;
        if (NULL == scsi_sglist(scp))
                return -1;
-       if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) ||
-             (scp->sc_data_direction == DMA_TO_DEVICE)))
+       if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE))
                return -1;
        req_len = fin = 0;
        scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) {
@@ -1956,6 +1973,50 @@ static int resp_report_luns(struct scsi_cmnd * scp,
                                    min((int)alloc_len, SDEBUG_RLUN_ARR_SZ));
 }
 
+static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba,
+                           unsigned int num, struct sdebug_dev_info *devip)
+{
+       int i, j, ret = -1;
+       unsigned char *kaddr, *buf;
+       unsigned int offset;
+       struct scatterlist *sg;
+       struct scsi_data_buffer *sdb = scsi_in(scp);
+
+       /* it would be better not to use a temporary buffer here */
+       buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC);
+       if (!buf)
+               return ret;
+
+       offset = 0;
+       scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) {
+               kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+               if (!kaddr)
+                       goto out;
+
+               memcpy(buf + offset, kaddr + sg->offset, sg->length);
+               offset += sg->length;
+               kunmap_atomic(kaddr, KM_USER0);
+       }
+
+       offset = 0;
+       for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) {
+               kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0);
+               if (!kaddr)
+                       goto out;
+
+               for (j = 0; j < sg->length; j++)
+                       *(kaddr + sg->offset + j) ^= *(buf + offset + j);
+
+               offset += sg->length;
+               kunmap_atomic(kaddr, KM_USER0);
+       }
+       ret = 0;
+out:
+       kfree(buf);
+
+       return ret;
+}
+
 /* When timer goes off this function is called. */
 static void timer_intr_handler(unsigned long indx)
 {
@@ -1989,6 +2050,7 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp)
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
                printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
+       set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags);
        return 0;
 }
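
The new get_data_transfer_info() above simply centralises the big-endian CDB decoding that the READ and WRITE branches used to duplicate, and XDWRITEREAD(10) reuses the READ(10)/WRITE(10) layout: bytes 2-5 carry the LBA, bytes 7-8 the transfer length. A standalone check of that arithmetic on a sample CDB (the values are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
            /* READ(10): opcode 0x28, LBA 0x00010203, transfer length 8 */
            unsigned char cmd[10] = { 0x28, 0, 0x00, 0x01, 0x02, 0x03,
                                      0, 0x00, 0x08, 0 };
            unsigned long long lba;
            unsigned int num;

            /* Same expressions as the READ_10/WRITE_10/XDWRITEREAD_10 case. */
            lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24);
            num = cmd[8] + (cmd[7] << 8);

            printf("lba=%llu num=%u\n", lba, num);  /* lba=66051 num=8 */
            return 0;
    }
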
 
index 547e85aa414f2b06a27a5b298e917c456e472f60..045a0868fc7b4540ac53676206ab1f5136355f13 100644 (file)
@@ -617,29 +617,27 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses,
        ses->cmd_len = scmd->cmd_len;
        memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd));
        ses->data_direction = scmd->sc_data_direction;
-       ses->bufflen = scmd->request_bufflen;
-       ses->buffer = scmd->request_buffer;
-       ses->use_sg = scmd->use_sg;
-       ses->resid = scmd->resid;
+       ses->sdb = scmd->sdb;
+       ses->next_rq = scmd->request->next_rq;
        ses->result = scmd->result;
 
+       memset(&scmd->sdb, 0, sizeof(scmd->sdb));
+       scmd->request->next_rq = NULL;
+
        if (sense_bytes) {
-               scmd->request_bufflen = min_t(unsigned,
-                                      SCSI_SENSE_BUFFERSIZE, sense_bytes);
+               scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE,
+                                        sense_bytes);
                sg_init_one(&ses->sense_sgl, scmd->sense_buffer,
-                                                      scmd->request_bufflen);
-               scmd->request_buffer = &ses->sense_sgl;
+                           scmd->sdb.length);
+               scmd->sdb.table.sgl = &ses->sense_sgl;
                scmd->sc_data_direction = DMA_FROM_DEVICE;
-               scmd->use_sg = 1;
+               scmd->sdb.table.nents = 1;
                memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
                scmd->cmnd[0] = REQUEST_SENSE;
-               scmd->cmnd[4] = scmd->request_bufflen;
+               scmd->cmnd[4] = scmd->sdb.length;
                scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]);
        } else {
-               scmd->request_buffer = NULL;
-               scmd->request_bufflen = 0;
                scmd->sc_data_direction = DMA_NONE;
-               scmd->use_sg = 0;
                if (cmnd) {
                        memset(scmd->cmnd, 0, sizeof(scmd->cmnd));
                        memcpy(scmd->cmnd, cmnd, cmnd_size);
@@ -676,10 +674,8 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses)
        scmd->cmd_len = ses->cmd_len;
        memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd));
        scmd->sc_data_direction = ses->data_direction;
-       scmd->request_bufflen = ses->bufflen;
-       scmd->request_buffer = ses->buffer;
-       scmd->use_sg = ses->use_sg;
-       scmd->resid = ses->resid;
+       scmd->sdb = ses->sdb;
+       scmd->request->next_rq = ses->next_rq;
        scmd->result = ses->result;
 }
 EXPORT_SYMBOL(scsi_eh_restore_cmnd);
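
Because the whole data-buffer state now sits in one embedded struct, scsi_eh_prep_cmnd() can snapshot it with a single struct assignment (ses->sdb = scmd->sdb), zero the live copy, and point it at the one-entry sense scatterlist, and scsi_eh_restore_cmnd() undoes that the same way, instead of juggling the four separate bufflen/buffer/use_sg/resid fields it saved before. A tiny standalone sketch of that save/override/restore shape, with hypothetical types:

    #include <stdio.h>
    #include <string.h>

    struct data_buf { void *sgl; unsigned nents; unsigned length; };
    struct cmd      { struct data_buf sdb; };

    int main(void)
    {
            unsigned char sense[96];
            struct cmd c = { .sdb = { .sgl = NULL, .nents = 4, .length = 4096 } };
            struct data_buf saved;

            saved = c.sdb;                    /* one struct copy saves it all      */
            memset(&c.sdb, 0, sizeof(c.sdb)); /* as scsi_eh_prep_cmnd() does       */
            c.sdb.sgl = sense;                /* temporary REQUEST SENSE buffer    */
            c.sdb.nents = 1;
            c.sdb.length = sizeof(sense);

            c.sdb = saved;                    /* scsi_eh_restore_cmnd() equivalent */
            printf("restored: length=%u nents=%u\n", c.sdb.length, c.sdb.nents);
            return 0;
    }
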
@@ -1700,8 +1696,7 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
        memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd));
     
        scmd->scsi_done         = scsi_reset_provider_done_command;
-       scmd->request_buffer            = NULL;
-       scmd->request_bufflen           = 0;
+       memset(&scmd->sdb, 0, sizeof(scmd->sdb));
 
        scmd->cmd_len                   = 0;
 
index 7c4c889c5221e09ed9cb00380c5c3053256c8545..b12fb310e3999af497c21778b62f40c46892e8fd 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <linux/completion.h>
 #include <linux/kernel.h>
 #define SG_MEMPOOL_NR          ARRAY_SIZE(scsi_sg_pools)
 #define SG_MEMPOOL_SIZE                2
 
-/*
- * The maximum number of SG segments that we will put inside a scatterlist
- * (unless chaining is used). Should ideally fit inside a single page, to
- * avoid a higher order allocation.
- */
-#define SCSI_MAX_SG_SEGMENTS   128
-
 struct scsi_host_sg_pool {
        size_t          size;
        char            *name;
@@ -48,22 +42,31 @@ struct scsi_host_sg_pool {
        mempool_t       *pool;
 };
 
-#define SP(x) { x, "sgpool-" #x }
+#define SP(x) { x, "sgpool-" __stringify(x) }
+#if (SCSI_MAX_SG_SEGMENTS < 32)
+#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater)
+#endif
 static struct scsi_host_sg_pool scsi_sg_pools[] = {
        SP(8),
        SP(16),
-#if (SCSI_MAX_SG_SEGMENTS > 16)
-       SP(32),
 #if (SCSI_MAX_SG_SEGMENTS > 32)
-       SP(64),
+       SP(32),
 #if (SCSI_MAX_SG_SEGMENTS > 64)
+       SP(64),
+#if (SCSI_MAX_SG_SEGMENTS > 128)
        SP(128),
+#if (SCSI_MAX_SG_SEGMENTS > 256)
+#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX)
+#endif
 #endif
 #endif
 #endif
+       SP(SCSI_MAX_SG_SEGMENTS)
 };
 #undef SP
 
+static struct kmem_cache *scsi_bidi_sdb_cache;
+
 static void scsi_run_queue(struct request_queue *q);
 
 /*
@@ -440,7 +443,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async);
 static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
 {
        cmd->serial_number = 0;
-       cmd->resid = 0;
+       scsi_set_resid(cmd, 0);
        memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
        if (cmd->cmd_len == 0)
                cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]);
@@ -690,42 +693,16 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
        return NULL;
 }
 
-/*
- * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
- * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
- */
-#define SCSI_MAX_SG_CHAIN_SEGMENTS     2048
-
 static inline unsigned int scsi_sgtable_index(unsigned short nents)
 {
        unsigned int index;
 
-       switch (nents) {
-       case 1 ... 8:
+       BUG_ON(nents > SCSI_MAX_SG_SEGMENTS);
+
+       if (nents <= 8)
                index = 0;
-               break;
-       case 9 ... 16:
-               index = 1;
-               break;
-#if (SCSI_MAX_SG_SEGMENTS > 16)
-       case 17 ... 32:
-               index = 2;
-               break;
-#if (SCSI_MAX_SG_SEGMENTS > 32)
-       case 33 ... 64:
-               index = 3;
-               break;
-#if (SCSI_MAX_SG_SEGMENTS > 64)
-       case 65 ... 128:
-               index = 4;
-               break;
-#endif
-#endif
-#endif
-       default:
-               printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
-               BUG();
-       }
+       else
+               index = get_count_order(nents) - 3;
 
        return index;
 }
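
The rewritten scsi_sgtable_index() leans on the sg pools being sized in powers of two (8, 16, 32, ... up to SCSI_MAX_SG_SEGMENTS), so for more than 8 entries the pool index is just ceil(log2(nents)) - 3, which is what get_count_order() - 3 computes. A standalone check of the mapping (count_order() below is a plain-C stand-in for the kernel helper):

    #include <stdio.h>

    /* ceil(log2(n)) for n > 0, mimicking get_count_order() */
    static int count_order(unsigned int n)
    {
            int order = 0;
            while ((1u << order) < n)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned int samples[] = { 1, 8, 9, 16, 17, 32, 33, 64, 65, 128 };
            unsigned int i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    unsigned int n = samples[i];
                    int index = (n <= 8) ? 0 : count_order(n) - 3;
                    /* expected: 8 -> 0, 16 -> 1, 32 -> 2, 64 -> 3, 128 -> 4 */
                    printf("nents=%3u -> sgpool index %d\n", n, index);
            }
            return 0;
    }
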
@@ -746,31 +723,27 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask)
        return mempool_alloc(sgp->pool, gfp_mask);
 }
 
-int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents,
+                             gfp_t gfp_mask)
 {
        int ret;
 
-       BUG_ON(!cmd->use_sg);
+       BUG_ON(!nents);
 
-       ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg,
-                              SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc);
+       ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS,
+                              gfp_mask, scsi_sg_alloc);
        if (unlikely(ret))
-               __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS,
+               __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS,
                                scsi_sg_free);
 
-       cmd->request_buffer = cmd->sg_table.sgl;
        return ret;
 }
 
-EXPORT_SYMBOL(scsi_alloc_sgtable);
-
-void scsi_free_sgtable(struct scsi_cmnd *cmd)
+static void scsi_free_sgtable(struct scsi_data_buffer *sdb)
 {
-       __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
+       __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free);
 }
 
-EXPORT_SYMBOL(scsi_free_sgtable);
-
 /*
  * Function:    scsi_release_buffers()
  *
@@ -788,17 +761,49 @@ EXPORT_SYMBOL(scsi_free_sgtable);
  *             the scatter-gather table, and potentially any bounce
  *             buffers.
  */
-static void scsi_release_buffers(struct scsi_cmnd *cmd)
+void scsi_release_buffers(struct scsi_cmnd *cmd)
+{
+       if (cmd->sdb.table.nents)
+               scsi_free_sgtable(&cmd->sdb);
+
+       memset(&cmd->sdb, 0, sizeof(cmd->sdb));
+
+       if (scsi_bidi_cmnd(cmd)) {
+               struct scsi_data_buffer *bidi_sdb =
+                       cmd->request->next_rq->special;
+               scsi_free_sgtable(bidi_sdb);
+               kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb);
+               cmd->request->next_rq->special = NULL;
+       }
+}
+EXPORT_SYMBOL(scsi_release_buffers);
+
+/*
+ * Bidi commands must be completed as a whole, both sides at once.
+ * If only part of the bytes were written and the LLD returned
+ * scsi_in()->resid and/or scsi_out()->resid, this information is left
+ * in req->data_len and req->next_rq->data_len.  The upper-layer driver
+ * can decide what to do with this information.
+ */
+void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 {
-       if (cmd->use_sg)
-               scsi_free_sgtable(cmd);
+       struct request *req = cmd->request;
+       unsigned int dlen = req->data_len;
+       unsigned int next_dlen = req->next_rq->data_len;
+
+       req->data_len = scsi_out(cmd)->resid;
+       req->next_rq->data_len = scsi_in(cmd)->resid;
+
+       /* The req and req->next_rq have not been completed */
+       BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
+
+       scsi_release_buffers(cmd);
 
        /*
-        * Zero these out.  They now point to freed memory, and it is
-        * dangerous to hang onto the pointers.
+        * This will goose the queue request function at the end, so we don't
+        * need to worry about launching another command.
         */
-       cmd->request_buffer = NULL;
-       cmd->request_bufflen = 0;
+       scsi_next_command(cmd);
 }
 
 /*
@@ -832,7 +837,7 @@ static void scsi_release_buffers(struct scsi_cmnd *cmd)
 void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 {
        int result = cmd->result;
-       int this_count = cmd->request_bufflen;
+       int this_count = scsi_bufflen(cmd);
        struct request_queue *q = cmd->device->request_queue;
        struct request *req = cmd->request;
        int clear_errors = 1;
@@ -840,8 +845,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
        int sense_valid = 0;
        int sense_deferred = 0;
 
-       scsi_release_buffers(cmd);
-
        if (result) {
                sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
                if (sense_valid)
@@ -864,9 +867,17 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
                                req->sense_len = len;
                        }
                }
-               req->data_len = cmd->resid;
+               if (scsi_bidi_cmnd(cmd)) {
+                       /* will also release_buffers */
+                       scsi_end_bidi_request(cmd);
+                       return;
+               }
+               req->data_len = scsi_get_resid(cmd);
        }
 
+       BUG_ON(blk_bidi_rq(req)); /* bidi is not yet supported for !blk_pc_request */
+       scsi_release_buffers(cmd);
+
        /*
         * Next deal with any sectors which we were able to correctly
         * handle.
@@ -874,7 +885,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
        SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
                                      "%d bytes done.\n",
                                      req->nr_sectors, good_bytes));
-       SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg));
 
        if (clear_errors)
                req->errors = 0;
@@ -991,52 +1001,80 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
        scsi_end_request(cmd, -EIO, this_count, !result);
 }
 
-/*
- * Function:    scsi_init_io()
- *
- * Purpose:     SCSI I/O initialize function.
- *
- * Arguments:   cmd   - Command descriptor we wish to initialize
- *
- * Returns:     0 on success
- *             BLKPREP_DEFER if the failure is retryable
- */
-static int scsi_init_io(struct scsi_cmnd *cmd)
+static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
+                            gfp_t gfp_mask)
 {
-       struct request     *req = cmd->request;
-       int                count;
-
-       /*
-        * We used to not use scatter-gather for single segment request,
-        * but now we do (it makes highmem I/O easier to support without
-        * kmapping pages)
-        */
-       cmd->use_sg = req->nr_phys_segments;
+       int count;
 
        /*
         * If sg table allocation fails, requeue request later.
         */
-       if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) {
-               scsi_unprep_request(req);
+       if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments,
+                                       gfp_mask))) {
                return BLKPREP_DEFER;
        }
 
        req->buffer = NULL;
        if (blk_pc_request(req))
-               cmd->request_bufflen = req->data_len;
+               sdb->length = req->data_len;
        else
-               cmd->request_bufflen = req->nr_sectors << 9;
+               sdb->length = req->nr_sectors << 9;
 
        /* 
         * Next, walk the list, and fill in the addresses and sizes of
         * each segment.
         */
-       count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
-       BUG_ON(count > cmd->use_sg);
-       cmd->use_sg = count;
+       count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
+       BUG_ON(count > sdb->table.nents);
+       sdb->table.nents = count;
        return BLKPREP_OK;
 }
 
+/*
+ * Function:    scsi_init_io()
+ *
+ * Purpose:     SCSI I/O initialize function.
+ *
+ * Arguments:   cmd   - Command descriptor we wish to initialize
+ *
+ * Returns:     0 on success
+ *             BLKPREP_DEFER if the failure is retryable
+ *             BLKPREP_KILL if the failure is fatal
+ */
+int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+{
+       int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+       if (error)
+               goto err_exit;
+
+       if (blk_bidi_rq(cmd->request)) {
+               struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
+                       scsi_bidi_sdb_cache, GFP_ATOMIC);
+               if (!bidi_sdb) {
+                       error = BLKPREP_DEFER;
+                       goto err_exit;
+               }
+
+               cmd->request->next_rq->special = bidi_sdb;
+               error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
+                                                                   GFP_ATOMIC);
+               if (error)
+                       goto err_exit;
+       }
+
+       return BLKPREP_OK;
+
+err_exit:
+       scsi_release_buffers(cmd);
+       if (error == BLKPREP_KILL)
+               scsi_put_command(cmd);
+       else /* BLKPREP_DEFER */
+               scsi_unprep_request(cmd->request);
+
+       return error;
+}
+EXPORT_SYMBOL(scsi_init_io);
+
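
For a bidirectional request the data-out buffer stays in cmd->sdb, while the data-in buffer is the extra scsi_data_buffer that scsi_init_io() above allocates and hangs off cmd->request->next_rq->special (and that scsi_release_buffers() / scsi_end_bidi_request() later free). Based purely on how scsi_in(), scsi_out() and scsi_bidi_cmnd() are used in these hunks, the helpers presumably resolve along the following lines; the _sk types and names are stand-ins, not the real header definitions:

    struct sdb_sk     { unsigned length; int resid; };
    struct request_sk { struct request_sk *next_rq; void *special; };
    struct cmnd_sk    { struct sdb_sk sdb; struct request_sk *request; };

    /* A command is bidi when its request has a paired next_rq carrying
     * a second data buffer. */
    static inline int scsi_bidi_cmnd_sk(struct cmnd_sk *cmd)
    {
            return cmd->request->next_rq &&
                   cmd->request->next_rq->special;
    }

    /* Data-out side always lives in the command itself... */
    static inline struct sdb_sk *scsi_out_sk(struct cmnd_sk *cmd)
    {
            return &cmd->sdb;
    }

    /* ...while the data-in side of a bidi command rides on next_rq. */
    static inline struct sdb_sk *scsi_in_sk(struct cmnd_sk *cmd)
    {
            return scsi_bidi_cmnd_sk(cmd) ?
                    (struct sdb_sk *)cmd->request->next_rq->special :
                    &cmd->sdb;
    }

    int main(void)
    {
            struct sdb_sk in = { 0, 0 }, out = { 0, 0 };
            struct request_sk next = { 0, &in }, rq = { &next, 0 };
            struct cmnd_sk cmd = { out, &rq };
            return (scsi_in_sk(&cmd) == &in &&
                    scsi_out_sk(&cmd) == &cmd.sdb) ? 0 : 1;
    }
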
 static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev,
                struct request *req)
 {
@@ -1081,16 +1119,14 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 
                BUG_ON(!req->nr_phys_segments);
 
-               ret = scsi_init_io(cmd);
+               ret = scsi_init_io(cmd, GFP_ATOMIC);
                if (unlikely(ret))
                        return ret;
        } else {
                BUG_ON(req->data_len);
                BUG_ON(req->data);
 
-               cmd->request_bufflen = 0;
-               cmd->request_buffer = NULL;
-               cmd->use_sg = 0;
+               memset(&cmd->sdb, 0, sizeof(cmd->sdb));
                req->buffer = NULL;
        }
 
@@ -1132,7 +1168,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
        if (unlikely(!cmd))
                return BLKPREP_DEFER;
 
-       return scsi_init_io(cmd);
+       return scsi_init_io(cmd, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(scsi_setup_fs_cmnd);
 
@@ -1542,20 +1578,7 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
         * this limit is imposed by hardware restrictions
         */
        blk_queue_max_hw_segments(q, shost->sg_tablesize);
-
-       /*
-        * In the future, sg chaining support will be mandatory and this
-        * ifdef can then go away. Right now we don't have all archs
-        * converted, so better keep it safe.
-        */
-#ifdef ARCH_HAS_SG_CHAIN
-       if (shost->use_sg_chaining)
-               blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
-       else
-               blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#else
-       blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
-#endif
+       blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
 
        blk_queue_max_sectors(q, shost->max_sectors);
        blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
@@ -1654,6 +1677,14 @@ int __init scsi_init_queue(void)
                return -ENOMEM;
        }
 
+       scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb",
+                                       sizeof(struct scsi_data_buffer),
+                                       0, 0, NULL);
+       if (!scsi_bidi_sdb_cache) {
+               printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n");
+               goto cleanup_io_context;
+       }
+
        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
                int size = sgp->size * sizeof(struct scatterlist);
@@ -1663,6 +1694,7 @@ int __init scsi_init_queue(void)
                if (!sgp->slab) {
                        printk(KERN_ERR "SCSI: can't init sg slab %s\n",
                                        sgp->name);
+                       goto cleanup_bidi_sdb;
                }
 
                sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
@@ -1670,10 +1702,25 @@ int __init scsi_init_queue(void)
                if (!sgp->pool) {
                        printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
                                        sgp->name);
+                       goto cleanup_bidi_sdb;
                }
        }
 
        return 0;
+
+cleanup_bidi_sdb:
+       for (i = 0; i < SG_MEMPOOL_NR; i++) {
+               struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
+               if (sgp->pool)
+                       mempool_destroy(sgp->pool);
+               if (sgp->slab)
+                       kmem_cache_destroy(sgp->slab);
+       }
+       kmem_cache_destroy(scsi_bidi_sdb_cache);
+cleanup_io_context:
+       kmem_cache_destroy(scsi_io_context_cache);
+
+       return -ENOMEM;
 }
 
 void scsi_exit_queue(void)
@@ -1681,6 +1728,7 @@ void scsi_exit_queue(void)
        int i;
 
        kmem_cache_destroy(scsi_io_context_cache);
+       kmem_cache_destroy(scsi_bidi_sdb_cache);
 
        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                struct scsi_host_sg_pool *sgp = scsi_sg_pools + i;
index 01e03f3f6ffa39ea7240dfe355ddd7edf5a5981c..91630baea532e0656883ee2c427b21467a9bf629 100644 (file)
@@ -331,8 +331,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
 
        scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag);
 
-       if (scsi_sglist(cmd))
-               scsi_free_sgtable(cmd);
+       scsi_release_buffers(cmd);
 
        queue_work(scsi_tgtd, &tcmd->work);
 }
@@ -353,25 +352,6 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd)
        return 0;
 }
 
-static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
-{
-       struct request *rq = cmd->request;
-       int count;
-
-       cmd->use_sg = rq->nr_phys_segments;
-       if (scsi_alloc_sgtable(cmd, gfp_mask))
-               return -ENOMEM;
-
-       cmd->request_bufflen = rq->data_len;
-
-       dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd),
-               rq_data_dir(rq));
-       count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd));
-       BUG_ON(count > cmd->use_sg);
-       cmd->use_sg = count;
-       return 0;
-}
-
 /* TODO: test this crap and replace bio_map_user with new interface maybe */
 static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
                               unsigned long uaddr, unsigned int len, int rw)
@@ -397,9 +377,11 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
        }
 
        tcmd->bio = rq->bio;
-       err = scsi_tgt_init_cmd(cmd, GFP_KERNEL);
-       if (err)
+       err = scsi_init_io(cmd, GFP_KERNEL);
+       if (err) {
+               scsi_release_buffers(cmd);
                goto unmap_rq;
+       }
 
        return 0;
 
index 24eba3118b5a415e9b76c0d5a668a72fab58d303..51a5557f42dda472fd67bb0b7352ddb7ff1d3c7a 100644 (file)
@@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
                SCpnt->cmnd[4] = (unsigned char) this_count;
                SCpnt->cmnd[5] = 0;
        }
-       SCpnt->request_bufflen = this_count * sdp->sector_size;
+       SCpnt->sdb.length = this_count * sdp->sector_size;
 
        /*
         * We shouldn't disconnect in the middle of a sector, so with a dumb
@@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = {
 static int sd_done(struct scsi_cmnd *SCpnt)
 {
        int result = SCpnt->result;
-       unsigned int xfer_size = SCpnt->request_bufflen;
+       unsigned int xfer_size = scsi_bufflen(SCpnt);
        unsigned int good_bytes = result ? 0 : xfer_size;
        u64 start_lba = SCpnt->request->sector;
        u64 bad_lba;
index d4ebe8c67ba92b901298f5d4dda08c4bd69507b0..26cfc56c7091e6d5cf388eba98c456cc1a6ab8c8 100644 (file)
 
 struct ip22_hostdata {
        struct WD33C93_hostdata wh;
-       struct hpc_data {
-               dma_addr_t      dma;
-               void            *cpu;
-       } hd;
+       dma_addr_t dma;
+       void *cpu;
+       struct device *dev;
 };
 
 #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata))
@@ -46,6 +45,11 @@ struct hpc_chunk {
        u32 _padding;   /* align to quadword boundary */
 };
 
+/* space for hpc dma descriptors */
+#define HPC_DMA_SIZE   PAGE_SIZE
+
+#define DMA_DIR(d)   ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
+
 static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
 {
        struct Scsi_Host * host = dev_id;
@@ -59,15 +63,17 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id)
 }
 
 static inline
-void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
+void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din)
 {
        unsigned long len = cmd->SCp.this_residual;
        void *addr = cmd->SCp.ptr;
        dma_addr_t physaddr;
        unsigned long count;
+       struct hpc_chunk *hcp;
 
-       physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction);
+       physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din));
        cmd->SCp.dma_handle = physaddr;
+       hcp = hd->cpu;
 
        while (len) {
                /*
@@ -89,6 +95,9 @@ void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp)
         */
        hcp->desc.pbuf = 0;
        hcp->desc.cntinfo = HPCDMA_EOX;
+       dma_cache_sync(hd->dev, hd->cpu,
+                      (unsigned long)(hcp + 1) - (unsigned long)hd->cpu,
+                      DMA_TO_DEVICE);
 }
 
 static int dma_setup(struct scsi_cmnd *cmd, int datainp)
@@ -96,9 +105,8 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
        struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host);
        struct hpc3_scsiregs *hregs =
                (struct hpc3_scsiregs *) cmd->device->host->base;
-       struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu;
 
-       pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp);
+       pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu);
 
        hdata->wh.dma_dir = datainp;
 
@@ -111,12 +119,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp)
        if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0)
                return 1;
 
-       fill_hpc_entries(hcp, cmd, datainp);
+       fill_hpc_entries(hdata, cmd, datainp);
 
        pr_debug(" HPCGO\n");
 
        /* Start up the HPC. */
-       hregs->ndptr = hdata->hd.dma;
+       hregs->ndptr = hdata->dma;
        if (datainp)
                hregs->ctrl = HPC3_SCTRL_ACTIVE;
        else
@@ -134,6 +142,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
        if (!SCpnt)
                return;
 
+       if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0)
+               return;
+
        hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base;
 
        pr_debug("dma_stop: status<%d> ", status);
@@ -145,8 +156,9 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt,
                        barrier();
        }
        hregs->ctrl = 0;
-       dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual,
-                        SCpnt->sc_data_direction);
+       dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle,
+                        SCpnt->SCp.this_residual,
+                        DMA_DIR(hdata->wh.dma_dir));
 
        pr_debug("\n");
 }
@@ -161,22 +173,23 @@ void sgiwd93_reset(unsigned long base)
 }
 EXPORT_SYMBOL_GPL(sgiwd93_reset);
 
-static inline void init_hpc_chain(struct hpc_data *hd)
+static inline void init_hpc_chain(struct ip22_hostdata *hdata)
 {
-       struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu;
-       struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma;
+       struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu;
+       dma_addr_t dma = hdata->dma;
        unsigned long start, end;
 
        start = (unsigned long) hcp;
-       end = start + PAGE_SIZE;
+       end = start + HPC_DMA_SIZE;
        while (start < end) {
-               hcp->desc.pnext = (u32) (dma + 1);
+               hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk));
                hcp->desc.cntinfo = HPCDMA_EOX;
-               hcp++; dma++;
+               hcp++;
+               dma += sizeof(struct hpc_chunk);
                start += sizeof(struct hpc_chunk);
        };
        hcp--;
-       hcp->desc.pnext = hd->dma;
+       hcp->desc.pnext = hdata->dma;
 }
 
 static int sgiwd93_bus_reset(struct scsi_cmnd *cmd)
@@ -235,16 +248,17 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
        host->irq = irq;
 
        hdata = host_to_hostdata(host);
-       hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
-                                          &hdata->hd.dma, GFP_KERNEL);
-       if (!hdata->hd.cpu) {
+       hdata->dev = &pdev->dev;
+       hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE,
+                                          &hdata->dma, GFP_KERNEL);
+       if (!hdata->cpu) {
                printk(KERN_WARNING "sgiwd93: Could not allocate memory for "
                       "host %d buffer.\n", unit);
                err = -ENOMEM;
                goto out_put;
        }
 
-       init_hpc_chain(&hdata->hd);
+       init_hpc_chain(hdata);
 
        regs.SASR = wdregs + 3;
        regs.SCMD = wdregs + 7;
@@ -274,7 +288,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev)
 out_irq:
        free_irq(irq, host);
 out_free:
-       dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+       dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
 out_put:
        scsi_host_put(host);
 out:
@@ -290,7 +304,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev)
 
        scsi_remove_host(host);
        free_irq(pd->irq, host);
-       dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma);
+       dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma);
        scsi_host_put(host);
 }
 
index 1fcee16fa36dbd47246b659a2c2dfb1d23daecb9..50ba492502035afe85d0bd62899b02c132022c63 100644 (file)
@@ -231,7 +231,7 @@ out:
 static int sr_done(struct scsi_cmnd *SCpnt)
 {
        int result = SCpnt->result;
-       int this_count = SCpnt->request_bufflen;
+       int this_count = scsi_bufflen(SCpnt);
        int good_bytes = (result == 0 ? this_count : 0);
        int block_sectors = 0;
        long error_sector;
@@ -379,17 +379,18 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
        }
 
        {
-               struct scatterlist *sg = SCpnt->request_buffer;
-               int i, size = 0;
-               for (i = 0; i < SCpnt->use_sg; i++)
-                       size += sg[i].length;
+               struct scatterlist *sg;
+               int i, size = 0, sg_count = scsi_sg_count(SCpnt);
 
-               if (size != SCpnt->request_bufflen && SCpnt->use_sg) {
+               scsi_for_each_sg(SCpnt, sg, sg_count, i)
+                       size += sg->length;
+
+               if (size != scsi_bufflen(SCpnt)) {
                        scmd_printk(KERN_ERR, SCpnt,
                                "mismatch count %d, bytes %d\n",
-                               size, SCpnt->request_bufflen);
-                       if (SCpnt->request_bufflen > size)
-                               SCpnt->request_bufflen = size;
+                               size, scsi_bufflen(SCpnt));
+                       if (scsi_bufflen(SCpnt) > size)
+                               SCpnt->sdb.length = size;
                }
        }
 
@@ -397,12 +398,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
         * request doesn't start on hw block boundary, add scatter pads
         */
        if (((unsigned int)rq->sector % (s_size >> 9)) ||
-           (SCpnt->request_bufflen % s_size)) {
+           (scsi_bufflen(SCpnt) % s_size)) {
                scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
                goto out;
        }
 
-       this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9);
+       this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
 
 
        SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
@@ -416,7 +417,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 
        if (this_count > 0xffff) {
                this_count = 0xffff;
-               SCpnt->request_bufflen = this_count * s_size;
+               SCpnt->sdb.length = this_count * s_size;
        }
 
        SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
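
The sd_prep_fn()/sd_done() and sr_prep_fn()/sr_done() hunks above convert direct struct scsi_cmnd field accesses (request_buffer, request_bufflen, use_sg) to the scsi_bufflen()/scsi_sg_count() accessors and the scsi_for_each_sg() iterator. A minimal sketch of the accessor-based pattern these conversions target; the helper name is illustrative and not part of the patch.

#include <scsi/scsi_cmnd.h>

/*
 * Sum the scatter-gather segment lengths of a command through the
 * accessor API instead of touching struct scsi_cmnd fields directly.
 */
static unsigned int example_sg_total(struct scsi_cmnd *cmd)
{
        struct scatterlist *sg;
        unsigned int total = 0;
        int i;

        scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
                total += sg->length;

        /*
         * scsi_bufflen() reports the transfer length the midlayer set up;
         * sr_prep_fn() above treats a mismatch as a malformed request.
         */
        if (total != scsi_bufflen(cmd))
                return 0;

        return total;
}
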
index e3fab3a6aed79f843e44de0720ff3dea485b18b2..72f6d8015358d6f52c967f51876d53cac2597c29 100644 (file)
@@ -1123,7 +1123,6 @@ static struct scsi_host_template driver_template = {
        .this_id                        = -1,
        .sg_tablesize                   = ST_MAX_SG,
        .cmd_per_lun                    = ST_CMD_PER_LUN,
-       .use_sg_chaining                = ENABLE_SG_CHAINING,
 };
 
 static int stex_set_dma_mask(struct pci_dev * pdev)
index 1f6fd16803351f7d67940d864cfdca2148eec3c7..6325901e509342c8e95aae2ce81f35c8bdd11900 100644 (file)
@@ -840,6 +840,5 @@ static struct scsi_host_template driver_template = {
        .cmd_per_lun =          1,
        .unchecked_isa_dma =    1,
        .use_clustering =       ENABLE_CLUSTERING,
-       .use_sg_chaining =      ENABLE_SG_CHAINING,
 };
 #include "scsi_module.c"
index 21e926dcdab0bfc73d6bc89d85f7c4d177d5c0ea..d39107b7669bfb22e3fa1e278166dd2c2766616f 100644 (file)
@@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid)
                        /*
                         *  Bounce back the sense data to user.
                         */
-                       memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
+                       memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
                        memcpy(cmd->sense_buffer, cp->sns_bbuf,
                               min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN));
 #if 0
@@ -1681,7 +1681,6 @@ static struct scsi_host_template sym2_template = {
        .eh_host_reset_handler  = sym53c8xx_eh_host_reset_handler,
        .this_id                = 7,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
        .max_sectors            = 0xFFFF,
 #ifdef SYM_LINUX_PROC_INFO_SUPPORT
        .proc_info              = sym53c8xx_proc_info,
index 4bc5407f96958598f3a44ca56c8d387a6a86100e..662c00451be443556e32f1bbd930a4c6dcd811bc 100644 (file)
@@ -451,7 +451,6 @@ static struct scsi_host_template driver_template = {
                 .this_id                 = 7,
                 .unchecked_isa_dma       = 1,
                 .use_clustering          = ENABLE_CLUSTERING,
-                .use_sg_chaining         = ENABLE_SG_CHAINING,
                 };
 
 #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD)
index 75eca6b22db561e7508d52dd2cb1be70cafdb98f..f385dce8dfbeb16d809a04eaf149edfb25c9cbf6 100644 (file)
@@ -1204,6 +1204,5 @@ static struct scsi_host_template driver_template = {
        .cmd_per_lun       = ULTRASTOR_MAX_CMDS_PER_LUN,
        .unchecked_isa_dma = 1,
        .use_clustering    = ENABLE_CLUSTERING,
-       .use_sg_chaining   = ENABLE_SG_CHAINING,
 };
 #include "scsi_module.c"
index b4304ae78527e409bae6ad6334ec68b7199e22b1..c975c01b3a02431000f071116f3b40caffd4f3f8 100644 (file)
@@ -1671,7 +1671,6 @@ static struct scsi_host_template driver_template = {
        .cmd_per_lun            = 1,
        .unchecked_isa_dma      = 1,
        .use_clustering         = ENABLE_CLUSTERING,
-       .use_sg_chaining        = ENABLE_SG_CHAINING,
 };
 
 #include "scsi_module.c"
index 178e8c2a8a2feb41f03fd7d16117347647f6247b..0db488624ab1c4983cf481ec637e29a9650bc586 100644 (file)
@@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info,
                sg_init_one(&info->sg, buff, bufflen);
 
        srb->sc_data_direction = dir;
-       srb->request_buffer = buff ? &info->sg : NULL;
-       srb->request_bufflen = bufflen;
-       srb->use_sg = buff ? 1 : 0;
+       srb->sdb.table.sgl = buff ? &info->sg : NULL;
+       srb->sdb.length = bufflen;
+       srb->sdb.table.nents = buff ? 1 : 0;
 }
 
 static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen)
 {
-       srb->request_bufflen = bufflen;
+       srb->sdb.length = bufflen;
 }
 
 
index c31f549ebea0b88750118cd7f5a6c72305182895..1c656667b937d5ba1d4ce1fbc6d6e6de6cd5a320 100644 (file)
@@ -88,9 +88,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
 {
        gfp_t flags;
        unsigned long i;
-       pgprot_t wc_pageprot;
 
-       wc_pageprot = PAGE_KERNEL_NOCACHE;
        max_order++;
        do {
                /*
@@ -126,14 +124,8 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
        /*
         * Change caching policy of the linear kernel map to avoid
         * mapping type conflicts with user-space mappings.
-        * The first global_flush_tlb() is really only there to do a global
-        * wbinvd().
         */
-
-       global_flush_tlb();
-       change_page_attr(virt_to_page(va->logical), va->size >> PAGE_SHIFT,
-                        wc_pageprot);
-       global_flush_tlb();
+       set_pages_uc(virt_to_page(va->logical), va->size >> PAGE_SHIFT);
 
        printk(KERN_DEBUG MODULE_NAME
               ": Allocated %ld bytes vram area at 0x%08lx\n",
@@ -157,9 +149,8 @@ static void vmlfb_free_vram_area(struct vram_area *va)
                 * Reset the linear kernel map caching policy.
                 */
 
-               change_page_attr(virt_to_page(va->logical),
-                                va->size >> PAGE_SHIFT, PAGE_KERNEL);
-               global_flush_tlb();
+               set_pages_wb(virt_to_page(va->logical),
+                                va->size >> PAGE_SHIFT);
 
                /*
                 * Decrease the usage count on the pages we've used
index d4fc6095466daa2b20337d66ffaaaeb12cc4d208..7c3d5f923da1c4f0d4d2f06392e3151649625cd3 100644 (file)
@@ -23,6 +23,10 @@ config BINFMT_ELF
          ld.so (check the file <file:Documentation/Changes> for location and
          latest version).
 
+config COMPAT_BINFMT_ELF
+       bool
+       depends on COMPAT && MMU
+
 config BINFMT_ELF_FDPIC
        bool "Kernel support for FDPIC ELF binaries"
        default y
index 500cf15cdb4ba94f5811ae53178fc685fe9da79e..1e7a11bd4da116f888a4589925dea958fa6ec5f9 100644 (file)
@@ -39,6 +39,7 @@ obj-$(CONFIG_BINFMT_MISC)     += binfmt_misc.o
 obj-y                          += binfmt_script.o
 
 obj-$(CONFIG_BINFMT_ELF)       += binfmt_elf.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF)        += compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_SOM)       += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)      += binfmt_flat.o
index 9dec7d2d546e0e12e200898993f1b272111f5cee..8a37dbbf3437cb78021c22c608dc8beb4cbe2788 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -397,7 +397,7 @@ void fastcall __put_ioctx(struct kioctx *ctx)
  * This prevents races between the aio code path referencing the
  * req (after submitting it) and aio_complete() freeing the req.
  */
-static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
+static struct kiocb *__aio_get_req(struct kioctx *ctx);
 static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 {
        struct kiocb *req = NULL;
index f0b3171842f22e75796d2aa68a7713f30af2f111..18ed6dd906c150208a4beb9230676a5a95de06d6 100644 (file)
@@ -45,7 +45,8 @@
 
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
-static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
+static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
+                               int, int, unsigned long);
 
 /*
  * If we don't support core dumping, then supply a NULL so we
@@ -298,33 +299,70 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-               struct elf_phdr *eppnt, int prot, int type)
+               struct elf_phdr *eppnt, int prot, int type,
+               unsigned long total_size)
 {
        unsigned long map_addr;
-       unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
+       unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
+       unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
+       addr = ELF_PAGESTART(addr);
+       size = ELF_PAGEALIGN(size);
 
-       down_write(&current->mm->mmap_sem);
        /* mmap() will return -EINVAL if given a zero size, but a
         * segment with zero filesize is perfectly valid */
-       if (eppnt->p_filesz + pageoffset)
-               map_addr = do_mmap(filep, ELF_PAGESTART(addr),
-                                  eppnt->p_filesz + pageoffset, prot, type,
-                                  eppnt->p_offset - pageoffset);
-       else
-               map_addr = ELF_PAGESTART(addr);
+       if (!size)
+               return addr;
+
+       down_write(&current->mm->mmap_sem);
+       /*
+       * total_size is the size of the ELF (interpreter) image.
+       * The _first_ mmap needs to know the full size, otherwise
+       * randomization might put this image into an overlapping
+       * position with the ELF binary image. (since size < total_size)
+       * So we first map the 'big' image - and unmap the remainder at
+       * the end. (which unmap is needed for ELF images with holes.)
+       */
+       if (total_size) {
+               total_size = ELF_PAGEALIGN(total_size);
+               map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+               if (!BAD_ADDR(map_addr))
+                       do_munmap(current->mm, map_addr+size, total_size-size);
+       } else
+               map_addr = do_mmap(filep, addr, size, prot, type, off);
+
        up_write(&current->mm->mmap_sem);
        return(map_addr);
 }
 
 #endif /* !elf_map */
 
+static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+{
+       int i, first_idx = -1, last_idx = -1;
+
+       for (i = 0; i < nr; i++) {
+               if (cmds[i].p_type == PT_LOAD) {
+                       last_idx = i;
+                       if (first_idx == -1)
+                               first_idx = i;
+               }
+       }
+       if (first_idx == -1)
+               return 0;
+
+       return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
+                               ELF_PAGESTART(cmds[first_idx].p_vaddr);
+}
+
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-               struct file *interpreter, unsigned long *interp_load_addr)
+               struct file *interpreter, unsigned long *interp_map_addr,
+               unsigned long no_base)
 {
        struct elf_phdr *elf_phdata;
        struct elf_phdr *eppnt;
@@ -332,6 +370,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
        int load_addr_set = 0;
        unsigned long last_bss = 0, elf_bss = 0;
        unsigned long error = ~0UL;
+       unsigned long total_size;
        int retval, i, size;
 
        /* First of all, some simple consistency checks */
@@ -370,6 +409,12 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                goto out_close;
        }
 
+       total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum);
+       if (!total_size) {
+               error = -EINVAL;
+               goto out_close;
+       }
+
        eppnt = elf_phdata;
        for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
                if (eppnt->p_type == PT_LOAD) {
@@ -387,9 +432,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                        vaddr = eppnt->p_vaddr;
                        if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
                                elf_type |= MAP_FIXED;
+                       else if (no_base && interp_elf_ex->e_type == ET_DYN)
+                               load_addr = -vaddr;
 
                        map_addr = elf_map(interpreter, load_addr + vaddr,
-                                          eppnt, elf_prot, elf_type);
+                                       eppnt, elf_prot, elf_type, total_size);
+                       total_size = 0;
+                       if (!*interp_map_addr)
+                               *interp_map_addr = map_addr;
                        error = map_addr;
                        if (BAD_ADDR(map_addr))
                                goto out_close;
@@ -455,8 +505,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
                        goto out_close;
        }
 
-       *interp_load_addr = load_addr;
-       error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
+       error = load_addr;
 
 out_close:
        kfree(elf_phdata);
@@ -546,14 +595,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        int load_addr_set = 0;
        char * elf_interpreter = NULL;
        unsigned int interpreter_type = INTERPRETER_NONE;
-       unsigned char ibcs2_interpreter = 0;
        unsigned long error;
        struct elf_phdr *elf_ppnt, *elf_phdata;
        unsigned long elf_bss, elf_brk;
        int elf_exec_fileno;
        int retval, i;
        unsigned int size;
-       unsigned long elf_entry, interp_load_addr = 0;
+       unsigned long elf_entry;
+       unsigned long interp_load_addr = 0;
        unsigned long start_code, end_code, start_data, end_data;
        unsigned long reloc_func_desc = 0;
        char passed_fileno[6];
@@ -663,14 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                                goto out_free_interp;
 
-                       /* If the program interpreter is one of these two,
-                        * then assume an iBCS2 image. Otherwise assume
-                        * a native linux image.
-                        */
-                       if (strcmp(elf_interpreter,"/usr/lib/libc.so.1") == 0 ||
-                           strcmp(elf_interpreter,"/usr/lib/ld.so.1") == 0)
-                               ibcs2_interpreter = 1;
-
                        /*
                         * The early SET_PERSONALITY here is so that the lookup
                         * for the interpreter happens in the namespace of the 
@@ -690,7 +731,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * switch really is going to happen - do this in
                         * flush_thread().      - akpm
                         */
-                       SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+                       SET_PERSONALITY(loc->elf_ex, 0);
 
                        interpreter = open_exec(elf_interpreter);
                        retval = PTR_ERR(interpreter);
@@ -769,7 +810,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                        goto out_free_dentry;
        } else {
                /* Executables without an interpreter also need a personality  */
-               SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+               SET_PERSONALITY(loc->elf_ex, 0);
        }
 
        /* OK, we are done with that, now set up the arg stuff,
@@ -803,7 +844,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
        /* Do this immediately, since STACK_TOP as used in setup_arg_pages
           may depend on the personality.  */
-       SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+       SET_PERSONALITY(loc->elf_ex, 0);
        if (elf_read_implies_exec(loc->elf_ex, executable_stack))
                current->personality |= READ_IMPLIES_EXEC;
 
@@ -825,9 +866,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        current->mm->start_stack = bprm->p;
 
        /* Now we do a little grungy work by mmaping the ELF image into
-          the correct location in memory.  At this point, we assume that
-          the image should be loaded at fixed address, not at a variable
-          address. */
+          the correct location in memory. */
        for(i = 0, elf_ppnt = elf_phdata;
            i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
                int elf_prot = 0, elf_flags;
@@ -881,11 +920,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * default mmap base, as well as whatever program they
                         * might try to exec.  This is because the brk will
                         * follow the loader, and is not movable.  */
+#ifdef CONFIG_X86
+                       load_bias = 0;
+#else
                        load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
+#endif
                }
 
                error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-                               elf_prot, elf_flags);
+                               elf_prot, elf_flags, 0);
                if (BAD_ADDR(error)) {
                        send_sig(SIGKILL, current, 0);
                        retval = IS_ERR((void *)error) ?
@@ -961,13 +1004,25 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        }
 
        if (elf_interpreter) {
-               if (interpreter_type == INTERPRETER_AOUT)
+               if (interpreter_type == INTERPRETER_AOUT) {
                        elf_entry = load_aout_interp(&loc->interp_ex,
                                                     interpreter);
-               else
+               } else {
+                       unsigned long uninitialized_var(interp_map_addr);
+
                        elf_entry = load_elf_interp(&loc->interp_elf_ex,
                                                    interpreter,
-                                                   &interp_load_addr);
+                                                   &interp_map_addr,
+                                                   load_bias);
+                       if (!IS_ERR((void *)elf_entry)) {
+                               /*
+                                * load_elf_interp() returns relocation
+                                * adjustment
+                                */
+                               interp_load_addr = elf_entry;
+                               elf_entry += loc->interp_elf_ex.e_entry;
+                       }
+               }
                if (BAD_ADDR(elf_entry)) {
                        force_sig(SIGSEGV, current);
                        retval = IS_ERR((void *)elf_entry) ?
@@ -1021,6 +1076,12 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
        current->mm->end_data = end_data;
        current->mm->start_stack = bprm->p;
 
+#ifdef arch_randomize_brk
+       if (current->flags & PF_RANDOMIZE)
+               current->mm->brk = current->mm->start_brk =
+                       arch_randomize_brk(current->mm);
+#endif
+
        if (current->personality & MMAP_PAGE_ZERO) {
                /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
                   and some applications "depend" upon this behavior.
@@ -1325,7 +1386,8 @@ static int writenote(struct memelfnote *men, struct file *file,
        if (!dump_seek(file, (off))) \
                goto end_coredump;
 
-static void fill_elf_header(struct elfhdr *elf, int segs)
+static void fill_elf_header(struct elfhdr *elf, int segs,
+                           u16 machine, u32 flags, u8 osabi)
 {
        memcpy(elf->e_ident, ELFMAG, SELFMAG);
        elf->e_ident[EI_CLASS] = ELF_CLASS;
@@ -1335,12 +1397,12 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
        memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
 
        elf->e_type = ET_CORE;
-       elf->e_machine = ELF_ARCH;
+       elf->e_machine = machine;
        elf->e_version = EV_CURRENT;
        elf->e_entry = 0;
        elf->e_phoff = sizeof(struct elfhdr);
        elf->e_shoff = 0;
-       elf->e_flags = ELF_CORE_EFLAGS;
+       elf->e_flags = flags;
        elf->e_ehsize = sizeof(struct elfhdr);
        elf->e_phentsize = sizeof(struct elf_phdr);
        elf->e_phnum = segs;
@@ -1447,6 +1509,238 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
        return 0;
 }
 
+static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
+{
+       elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
+       int i = 0;
+       do
+               i += 2;
+       while (auxv[i - 2] != AT_NULL);
+       fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+}
+
+#ifdef CORE_DUMP_USE_REGSET
+#include <linux/regset.h>
+
+struct elf_thread_core_info {
+       struct elf_thread_core_info *next;
+       struct task_struct *task;
+       struct elf_prstatus prstatus;
+       struct memelfnote notes[0];
+};
+
+struct elf_note_info {
+       struct elf_thread_core_info *thread;
+       struct memelfnote psinfo;
+       struct memelfnote auxv;
+       size_t size;
+       int thread_notes;
+};
+
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+                                const struct user_regset_view *view,
+                                long signr, size_t *total)
+{
+       unsigned int i;
+
+       /*
+        * NT_PRSTATUS is the one special case, because the regset data
+        * goes into the pr_reg field inside the note contents, rather
+        * than being the whole note contents.  We fill the rest in here.
+        * We assume that regset 0 is NT_PRSTATUS.
+        */
+       fill_prstatus(&t->prstatus, t->task, signr);
+       (void) view->regsets[0].get(t->task, &view->regsets[0],
+                                   0, sizeof(t->prstatus.pr_reg),
+                                   &t->prstatus.pr_reg, NULL);
+
+       fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+                 sizeof(t->prstatus), &t->prstatus);
+       *total += notesize(&t->notes[0]);
+
+       /*
+        * Each other regset might generate a note too.  For each regset
+        * that has no core_note_type or is inactive, we leave t->notes[i]
+        * all zero and we'll know to skip writing it later.
+        */
+       for (i = 1; i < view->n; ++i) {
+               const struct user_regset *regset = &view->regsets[i];
+               if (regset->core_note_type &&
+                   (!regset->active || regset->active(t->task, regset))) {
+                       int ret;
+                       size_t size = regset->n * regset->size;
+                       void *data = kmalloc(size, GFP_KERNEL);
+                       if (unlikely(!data))
+                               return 0;
+                       ret = regset->get(t->task, regset,
+                                         0, size, data, NULL);
+                       if (unlikely(ret))
+                               kfree(data);
+                       else {
+                               if (regset->core_note_type != NT_PRFPREG)
+                                       fill_note(&t->notes[i], "LINUX",
+                                                 regset->core_note_type,
+                                                 size, data);
+                               else {
+                                       t->prstatus.pr_fpvalid = 1;
+                                       fill_note(&t->notes[i], "CORE",
+                                                 NT_PRFPREG, size, data);
+                               }
+                               *total += notesize(&t->notes[i]);
+                       }
+               }
+       }
+
+       return 1;
+}
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+                         struct elf_note_info *info,
+                         long signr, struct pt_regs *regs)
+{
+       struct task_struct *dump_task = current;
+       const struct user_regset_view *view = task_user_regset_view(dump_task);
+       struct elf_thread_core_info *t;
+       struct elf_prpsinfo *psinfo;
+       struct task_struct *g, *p;
+       unsigned int i;
+
+       info->size = 0;
+       info->thread = NULL;
+
+       psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+       fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+
+       if (psinfo == NULL)
+               return 0;
+
+       /*
+        * Figure out how many notes we're going to need for each thread.
+        */
+       info->thread_notes = 0;
+       for (i = 0; i < view->n; ++i)
+               if (view->regsets[i].core_note_type != 0)
+                       ++info->thread_notes;
+
+       /*
+        * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
+        * since it is our one special case.
+        */
+       if (unlikely(info->thread_notes == 0) ||
+           unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
+               WARN_ON(1);
+               return 0;
+       }
+
+       /*
+        * Initialize the ELF file header.
+        */
+       fill_elf_header(elf, phdrs,
+                       view->e_machine, view->e_flags, view->ei_osabi);
+
+       /*
+        * Allocate a structure for each thread.
+        */
+       rcu_read_lock();
+       do_each_thread(g, p)
+               if (p->mm == dump_task->mm) {
+                       t = kzalloc(offsetof(struct elf_thread_core_info,
+                                            notes[info->thread_notes]),
+                                   GFP_ATOMIC);
+                       if (unlikely(!t)) {
+                               rcu_read_unlock();
+                               return 0;
+                       }
+                       t->task = p;
+                       if (p == dump_task || !info->thread) {
+                               t->next = info->thread;
+                               info->thread = t;
+                       } else {
+                               /*
+                                * Make sure to keep the original task at
+                                * the head of the list.
+                                */
+                               t->next = info->thread->next;
+                               info->thread->next = t;
+                       }
+               }
+       while_each_thread(g, p);
+       rcu_read_unlock();
+
+       /*
+        * Now fill in each thread's information.
+        */
+       for (t = info->thread; t != NULL; t = t->next)
+               if (!fill_thread_core_info(t, view, signr, &info->size))
+                       return 0;
+
+       /*
+        * Fill in the two process-wide notes.
+        */
+       fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
+       info->size += notesize(&info->psinfo);
+
+       fill_auxv_note(&info->auxv, current->mm);
+       info->size += notesize(&info->auxv);
+
+       return 1;
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+       return info->size;
+}
+
+/*
+ * Write all the notes for each thread.  When writing the first thread, the
+ * process-wide notes are interleaved after the first thread-specific note.
+ */
+static int write_note_info(struct elf_note_info *info,
+                          struct file *file, loff_t *foffset)
+{
+       bool first = 1;
+       struct elf_thread_core_info *t = info->thread;
+
+       do {
+               int i;
+
+               if (!writenote(&t->notes[0], file, foffset))
+                       return 0;
+
+               if (first && !writenote(&info->psinfo, file, foffset))
+                       return 0;
+               if (first && !writenote(&info->auxv, file, foffset))
+                       return 0;
+
+               for (i = 1; i < info->thread_notes; ++i)
+                       if (t->notes[i].data &&
+                           !writenote(&t->notes[i], file, foffset))
+                               return 0;
+
+               first = 0;
+               t = t->next;
+       } while (t);
+
+       return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+       struct elf_thread_core_info *threads = info->thread;
+       while (threads) {
+               unsigned int i;
+               struct elf_thread_core_info *t = threads;
+               threads = t->next;
+               WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
+               for (i = 1; i < info->thread_notes; ++i)
+                       kfree(t->notes[i].data);
+               kfree(t);
+       }
+       kfree(info->psinfo.data);
+}
+
+#else
+
 /* Here is the structure in which status of each thread is captured. */
 struct elf_thread_status
 {
@@ -1499,6 +1793,176 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
        return sz;
 }
 
+struct elf_note_info {
+       struct memelfnote *notes;
+       struct elf_prstatus *prstatus;  /* NT_PRSTATUS */
+       struct elf_prpsinfo *psinfo;    /* NT_PRPSINFO */
+       struct list_head thread_list;
+       elf_fpregset_t *fpu;
+#ifdef ELF_CORE_COPY_XFPREGS
+       elf_fpxregset_t *xfpu;
+#endif
+       int thread_status_size;
+       int numnote;
+};
+
+static int fill_note_info(struct elfhdr *elf, int phdrs,
+                         struct elf_note_info *info,
+                         long signr, struct pt_regs *regs)
+{
+#define        NUM_NOTES       6
+       struct list_head *t;
+       struct task_struct *g, *p;
+
+       info->notes = NULL;
+       info->prstatus = NULL;
+       info->psinfo = NULL;
+       info->fpu = NULL;
+#ifdef ELF_CORE_COPY_XFPREGS
+       info->xfpu = NULL;
+#endif
+       INIT_LIST_HEAD(&info->thread_list);
+
+       info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote),
+                             GFP_KERNEL);
+       if (!info->notes)
+               return 0;
+       info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
+       if (!info->psinfo)
+               return 0;
+       info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
+       if (!info->prstatus)
+               return 0;
+       info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
+       if (!info->fpu)
+               return 0;
+#ifdef ELF_CORE_COPY_XFPREGS
+       info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
+       if (!info->xfpu)
+               return 0;
+#endif
+
+       info->thread_status_size = 0;
+       if (signr) {
+               struct elf_thread_status *tmp;
+               rcu_read_lock();
+               do_each_thread(g, p)
+                       if (current->mm == p->mm && current != p) {
+                               tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
+                               if (!tmp) {
+                                       rcu_read_unlock();
+                                       return 0;
+                               }
+                               tmp->thread = p;
+                               list_add(&tmp->list, &info->thread_list);
+                       }
+               while_each_thread(g, p);
+               rcu_read_unlock();
+               list_for_each(t, &info->thread_list) {
+                       struct elf_thread_status *tmp;
+                       int sz;
+
+                       tmp = list_entry(t, struct elf_thread_status, list);
+                       sz = elf_dump_thread_status(signr, tmp);
+                       info->thread_status_size += sz;
+               }
+       }
+       /* now collect the dump for the current */
+       memset(info->prstatus, 0, sizeof(*info->prstatus));
+       fill_prstatus(info->prstatus, current, signr);
+       elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+
+       /* Set up header */
+       fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+
+       /*
+        * Set up the notes in similar form to SVR4 core dumps made
+        * with info from their /proc.
+        */
+
+       fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
+                 sizeof(*info->prstatus), info->prstatus);
+       fill_psinfo(info->psinfo, current->group_leader, current->mm);
+       fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
+                 sizeof(*info->psinfo), info->psinfo);
+
+       info->numnote = 2;
+
+       fill_auxv_note(&info->notes[info->numnote++], current->mm);
+
+       /* Try to dump the FPU. */
+       info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
+                                                              info->fpu);
+       if (info->prstatus->pr_fpvalid)
+               fill_note(info->notes + info->numnote++,
+                         "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+       if (elf_core_copy_task_xfpregs(current, info->xfpu))
+               fill_note(info->notes + info->numnote++,
+                         "LINUX", ELF_CORE_XFPREG_TYPE,
+                         sizeof(*info->xfpu), info->xfpu);
+#endif
+
+       return 1;
+
+#undef NUM_NOTES
+}
+
+static size_t get_note_info_size(struct elf_note_info *info)
+{
+       int sz = 0;
+       int i;
+
+       for (i = 0; i < info->numnote; i++)
+               sz += notesize(info->notes + i);
+
+       sz += info->thread_status_size;
+
+       return sz;
+}
+
+static int write_note_info(struct elf_note_info *info,
+                          struct file *file, loff_t *foffset)
+{
+       int i;
+       struct list_head *t;
+
+       for (i = 0; i < info->numnote; i++)
+               if (!writenote(info->notes + i, file, foffset))
+                       return 0;
+
+       /* write out the thread status notes section */
+       list_for_each(t, &info->thread_list) {
+               struct elf_thread_status *tmp =
+                               list_entry(t, struct elf_thread_status, list);
+
+               for (i = 0; i < tmp->num_notes; i++)
+                       if (!writenote(&tmp->notes[i], file, foffset))
+                               return 0;
+       }
+
+       return 1;
+}
+
+static void free_note_info(struct elf_note_info *info)
+{
+       while (!list_empty(&info->thread_list)) {
+               struct list_head *tmp = info->thread_list.next;
+               list_del(tmp);
+               kfree(list_entry(tmp, struct elf_thread_status, list));
+       }
+
+       kfree(info->prstatus);
+       kfree(info->psinfo);
+       kfree(info->notes);
+       kfree(info->fpu);
+#ifdef ELF_CORE_COPY_XFPREGS
+       kfree(info->xfpu);
+#endif
+}
+
+#endif
+
 static struct vm_area_struct *first_vma(struct task_struct *tsk,
                                        struct vm_area_struct *gate_vma)
 {
@@ -1534,29 +1998,15 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
  */
 static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
 {
-#define        NUM_NOTES       6
        int has_dumped = 0;
        mm_segment_t fs;
        int segs;
        size_t size = 0;
-       int i;
        struct vm_area_struct *vma, *gate_vma;
        struct elfhdr *elf = NULL;
        loff_t offset = 0, dataoff, foffset;
-       int numnote;
-       struct memelfnote *notes = NULL;
-       struct elf_prstatus *prstatus = NULL;   /* NT_PRSTATUS */
-       struct elf_prpsinfo *psinfo = NULL;     /* NT_PRPSINFO */
-       struct task_struct *g, *p;
-       LIST_HEAD(thread_list);
-       struct list_head *t;
-       elf_fpregset_t *fpu = NULL;
-#ifdef ELF_CORE_COPY_XFPREGS
-       elf_fpxregset_t *xfpu = NULL;
-#endif
-       int thread_status_size = 0;
-       elf_addr_t *auxv;
        unsigned long mm_flags;
+       struct elf_note_info info;
 
        /*
         * We no longer stop all VM operations.
@@ -1574,52 +2024,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        elf = kmalloc(sizeof(*elf), GFP_KERNEL);
        if (!elf)
                goto cleanup;
-       prstatus = kmalloc(sizeof(*prstatus), GFP_KERNEL);
-       if (!prstatus)
-               goto cleanup;
-       psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-       if (!psinfo)
-               goto cleanup;
-       notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), GFP_KERNEL);
-       if (!notes)
-               goto cleanup;
-       fpu = kmalloc(sizeof(*fpu), GFP_KERNEL);
-       if (!fpu)
-               goto cleanup;
-#ifdef ELF_CORE_COPY_XFPREGS
-       xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL);
-       if (!xfpu)
-               goto cleanup;
-#endif
-
-       if (signr) {
-               struct elf_thread_status *tmp;
-               rcu_read_lock();
-               do_each_thread(g,p)
-                       if (current->mm == p->mm && current != p) {
-                               tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
-                               if (!tmp) {
-                                       rcu_read_unlock();
-                                       goto cleanup;
-                               }
-                               tmp->thread = p;
-                               list_add(&tmp->list, &thread_list);
-                       }
-               while_each_thread(g,p);
-               rcu_read_unlock();
-               list_for_each(t, &thread_list) {
-                       struct elf_thread_status *tmp;
-                       int sz;
-
-                       tmp = list_entry(t, struct elf_thread_status, list);
-                       sz = elf_dump_thread_status(signr, tmp);
-                       thread_status_size += sz;
-               }
-       }
-       /* now collect the dump for the current */
-       memset(prstatus, 0, sizeof(*prstatus));
-       fill_prstatus(prstatus, current, signr);
-       elf_core_copy_regs(&prstatus->pr_reg, regs);
        
        segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
@@ -1630,42 +2034,16 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        if (gate_vma != NULL)
                segs++;
 
-       /* Set up header */
-       fill_elf_header(elf, segs + 1); /* including notes section */
-
-       has_dumped = 1;
-       current->flags |= PF_DUMPCORE;
-
        /*
-        * Set up the notes in similar form to SVR4 core dumps made
-        * with info from their /proc.
+        * Collect all the non-memory information about the process for the
+        * notes.  This also sets up the file header.
         */
+       if (!fill_note_info(elf, segs + 1, /* including notes section */
+                           &info, signr, regs))
+               goto cleanup;
 
-       fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
-       fill_psinfo(psinfo, current->group_leader, current->mm);
-       fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
-       
-       numnote = 2;
-
-       auxv = (elf_addr_t *)current->mm->saved_auxv;
-
-       i = 0;
-       do
-               i += 2;
-       while (auxv[i - 2] != AT_NULL);
-       fill_note(&notes[numnote++], "CORE", NT_AUXV,
-                 i * sizeof(elf_addr_t), auxv);
-
-       /* Try to dump the FPU. */
-       if ((prstatus->pr_fpvalid =
-            elf_core_copy_task_fpregs(current, regs, fpu)))
-               fill_note(notes + numnote++,
-                         "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-       if (elf_core_copy_task_xfpregs(current, xfpu))
-               fill_note(notes + numnote++,
-                         "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
-#endif 
+       has_dumped = 1;
+       current->flags |= PF_DUMPCORE;
   
        fs = get_fs();
        set_fs(KERNEL_DS);
@@ -1678,12 +2056,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
        /* Write notes phdr entry */
        {
                struct elf_phdr phdr;
-               int sz = 0;
-
-               for (i = 0; i < numnote; i++)
-                       sz += notesize(notes + i);
-               
-               sz += thread_status_size;
+               size_t sz = get_note_info_size(&info);
 
                sz += elf_coredump_extra_notes_size();
 
@@ -1728,23 +2101,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 #endif
 
        /* write out the notes section */
-       for (i = 0; i < numnote; i++)
-               if (!writenote(notes + i, file, &foffset))
-                       goto end_coredump;
+       if (!write_note_info(&info, file, &foffset))
+               goto end_coredump;
 
        if (elf_coredump_extra_notes_write(file, &foffset))
                goto end_coredump;
 
-       /* write out the thread status notes section */
-       list_for_each(t, &thread_list) {
-               struct elf_thread_status *tmp =
-                               list_entry(t, struct elf_thread_status, list);
-
-               for (i = 0; i < tmp->num_notes; i++)
-                       if (!writenote(&tmp->notes[i], file, &foffset))
-                               goto end_coredump;
-       }
-
        /* Align to page */
        DUMP_SEEK(dataoff - foffset);
 
@@ -1795,22 +2157,9 @@ end_coredump:
        set_fs(fs);
 
 cleanup:
-       while (!list_empty(&thread_list)) {
-               struct list_head *tmp = thread_list.next;
-               list_del(tmp);
-               kfree(list_entry(tmp, struct elf_thread_status, list));
-       }
-
        kfree(elf);
-       kfree(prstatus);
-       kfree(psinfo);
-       kfree(notes);
-       kfree(fpu);
-#ifdef ELF_CORE_COPY_XFPREGS
-       kfree(xfpu);
-#endif
+       free_note_info(&info);
        return has_dumped;
-#undef NUM_NOTES
 }
 
 #endif         /* USE_ELF_CORE_DUMP */
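
The load_elf_binary() hunks above randomize the brk base through an arch_randomize_brk() hook that is compiled in only when the architecture defines the macro. A hypothetical fragment of such arch glue, assuming a randomize_range()-based policy; the 32 MB window is a placeholder, not taken from this merge.

#include <linux/random.h>
#include <linux/sched.h>

/* Hypothetical arch opt-in: define the macro so binfmt_elf.c uses the hook. */
#define arch_randomize_brk arch_randomize_brk

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        /* Nudge the brk base upward by a random, page-aligned offset. */
        unsigned long range_end = mm->brk + 0x02000000;

        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
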
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
new file mode 100644 (file)
index 0000000..0adced2
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * 32-bit compatibility support for ELF format executables and core dumps.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ *
+ * This file is used in a 64-bit kernel that wants to support 32-bit ELF.
+ * asm/elf.h is responsible for defining the compat_* and COMPAT_* macros
+ * used below, with definitions appropriate for 32-bit ABI compatibility.
+ *
+ * We use macros to rename the ABI types and machine-dependent
+ * functions used in binfmt_elf.c to compat versions.
+ */
+
+#include <linux/elfcore-compat.h>
+#include <linux/time.h>
+
+/*
+ * Rename the basic ELF layout types to refer to the 32-bit class of files.
+ */
+#undef ELF_CLASS
+#define ELF_CLASS      ELFCLASS32
+
+#undef elfhdr
+#undef elf_phdr
+#undef elf_note
+#undef elf_addr_t
+#define elfhdr         elf32_hdr
+#define elf_phdr       elf32_phdr
+#define elf_note       elf32_note
+#define elf_addr_t     Elf32_Addr
+
+/*
+ * The machine-dependent core note format types are defined in elfcore-compat.h,
+ * which requires asm/elf.h to define compat_elf_gregset_t et al.
+ */
+#define elf_prstatus   compat_elf_prstatus
+#define elf_prpsinfo   compat_elf_prpsinfo
+
+/*
+ * Compat version of cputime_to_timeval, perhaps this
+ * should be an inline in <linux/compat.h>.
+ */
+static void cputime_to_compat_timeval(const cputime_t cputime,
+                                     struct compat_timeval *value)
+{
+       struct timeval tv;
+       cputime_to_timeval(cputime, &tv);
+       value->tv_sec = tv.tv_sec;
+       value->tv_usec = tv.tv_usec;
+}
+
+#undef cputime_to_timeval
+#define cputime_to_timeval cputime_to_compat_timeval
+
+
+/*
+ * To use this file, asm/elf.h must define compat_elf_check_arch.
+ * The other following macros can be defined if the compat versions
+ * differ from the native ones, or omitted when they match.
+ */
+
+#undef ELF_ARCH
+#undef elf_check_arch
+#define        elf_check_arch  compat_elf_check_arch
+
+#ifdef COMPAT_ELF_PLATFORM
+#undef ELF_PLATFORM
+#define        ELF_PLATFORM            COMPAT_ELF_PLATFORM
+#endif
+
+#ifdef COMPAT_ELF_HWCAP
+#undef ELF_HWCAP
+#define        ELF_HWCAP               COMPAT_ELF_HWCAP
+#endif
+
+#ifdef COMPAT_ARCH_DLINFO
+#undef ARCH_DLINFO
+#define        ARCH_DLINFO             COMPAT_ARCH_DLINFO
+#endif
+
+#ifdef COMPAT_ELF_ET_DYN_BASE
+#undef ELF_ET_DYN_BASE
+#define        ELF_ET_DYN_BASE         COMPAT_ELF_ET_DYN_BASE
+#endif
+
+#ifdef COMPAT_ELF_EXEC_PAGESIZE
+#undef ELF_EXEC_PAGESIZE
+#define        ELF_EXEC_PAGESIZE       COMPAT_ELF_EXEC_PAGESIZE
+#endif
+
+#ifdef COMPAT_ELF_PLAT_INIT
+#undef ELF_PLAT_INIT
+#define        ELF_PLAT_INIT           COMPAT_ELF_PLAT_INIT
+#endif
+
+#ifdef COMPAT_SET_PERSONALITY
+#undef SET_PERSONALITY
+#define        SET_PERSONALITY         COMPAT_SET_PERSONALITY
+#endif
+
+#ifdef compat_start_thread
+#undef start_thread
+#define        start_thread            compat_start_thread
+#endif
+
+#ifdef compat_arch_setup_additional_pages
+#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+#undef arch_setup_additional_pages
+#define        arch_setup_additional_pages compat_arch_setup_additional_pages
+#endif
+
+/*
+ * Rename a few of the symbols that binfmt_elf.c will define.
+ * These are all local so the names don't really matter, but it
+ * might make some debugging less confusing not to duplicate them.
+ */
+#define elf_format             compat_elf_format
+#define init_elf_binfmt                init_compat_elf_binfmt
+#define exit_elf_binfmt                exit_compat_elf_binfmt
+
+/*
+ * We share all the actual code with the native (64-bit) version.
+ */
+#include "binfmt_elf.c"
index 46754553fdcc4ae06799d7f0a1c9a102c4a6900f..ff97ba924333fccfc7cbb9699753f194d47eda7b 100644 (file)
@@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
        spin_unlock(&ls->ls_recover_list_lock);
 
        if (!found)
-               de = allocate_direntry(ls, len);
+               de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
        return de;
 }
 
@@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls)
                de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
                                list);
                list_del(&de->list);
-               free_direntry(de);
+               kfree(de);
        }
        spin_unlock(&ls->ls_recover_list_lock);
 }
@@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
        }
 
        list_del(&de->list);
-       free_direntry(de);
+       kfree(de);
  out:
        write_unlock(&ls->ls_dirtbl[bucket].lock);
 }
@@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 
        write_unlock(&ls->ls_dirtbl[bucket].lock);
 
-       de = allocate_direntry(ls, namelen);
+       de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
        if (!de)
                return -ENOMEM;
 
@@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
        write_lock(&ls->ls_dirtbl[bucket].lock);
        tmp = search_bucket(ls, name, namelen, bucket);
        if (tmp) {
-               free_direntry(de);
+               kfree(de);
                de = tmp;
        } else {
                list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
@@ -329,49 +329,47 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
        return get_entry(ls, nodeid, name, namelen, r_nodeid);
 }
 
-/* Copy the names of master rsb's into the buffer provided.
-   Only select names whose dir node is the given nodeid. */
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
+{
+       struct dlm_rsb *r;
+
+       down_read(&ls->ls_root_sem);
+       list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+               if (len == r->res_length && !memcmp(name, r->res_name, len)) {
+                       up_read(&ls->ls_root_sem);
+                       return r;
+               }
+       }
+       up_read(&ls->ls_root_sem);
+       return NULL;
+}
+
+/* Find the rsb where we left off (or start again), then send rsb names
+   for rsb's we're master of and whose directory node matches the requesting
+   node.  inbuf is the rsb name last sent, inlen is the name's length */
 
 void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
                           char *outbuf, int outlen, int nodeid)
 {
        struct list_head *list;
-       struct dlm_rsb *start_r = NULL, *r = NULL;
-       int offset = 0, start_namelen, error, dir_nodeid;
-       char *start_name;
+       struct dlm_rsb *r;
+       int offset = 0, dir_nodeid;
        uint16_t be_namelen;
 
-       /*
-        * Find the rsb where we left off (or start again)
-        */
-
-       start_namelen = inlen;
-       start_name = inbuf;
-
-       if (start_namelen > 1) {
-               /*
-                * We could also use a find_rsb_root() function here that
-                * searched the ls_root_list.
-                */
-               error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
-                                    &start_r);
-               DLM_ASSERT(!error && start_r,
-                          printk("error %d\n", error););
-               DLM_ASSERT(!list_empty(&start_r->res_root_list),
-                          dlm_print_rsb(start_r););
-               dlm_put_rsb(start_r);
-       }
-
-       /*
-        * Send rsb names for rsb's we're master of and whose directory node
-        * matches the requesting node.
-        */
-
        down_read(&ls->ls_root_sem);
-       if (start_r)
-               list = start_r->res_root_list.next;
-       else
+
+       if (inlen > 1) {
+               r = find_rsb_root(ls, inbuf, inlen);
+               if (!r) {
+                       inbuf[inlen - 1] = '\0';
+                       log_error(ls, "copy_master_names from %d start %d %s",
+                                 nodeid, inlen, inbuf);
+                       goto out;
+               }
+               list = r->res_root_list.next;
+       } else {
                list = ls->ls_root_list.next;
+       }
 
        for (offset = 0; list != &ls->ls_root_list; list = list->next) {
                r = list_entry(list, struct dlm_rsb, res_root_list);
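The get_entry() change above open-codes the old allocate_direntry() helper: the directory entry and its variable-length name are obtained with a single kzalloc() of sizeof(struct dlm_direntry) plus the name length. A rough sketch of that trailing-name allocation idiom, assuming a hypothetical struct (the field names here are not dlm_direntry's):

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Sketch only: keep a variable-length name in the same allocation,
     * as the dir.c hunk does for dlm_direntry. */
    struct example_entry {
            int len;
            char name[];                    /* flexible array member */
    };

    static struct example_entry *example_alloc(const char *name, int len)
    {
            struct example_entry *e;

            e = kzalloc(sizeof(*e) + len, GFP_KERNEL);
            if (!e)
                    return NULL;
            e->len = len;
            memcpy(e->name, name, len);
            return e;                       /* released with a single kfree(e) */
    }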
index d2fc2384c3bed34e32b83ed98804d01961eb7d5a..ec61bbaf25dfdae4cc6d56ec7acff1b50e71c4ff 100644 (file)
@@ -570,5 +570,21 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
        return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
 }
 
+int dlm_netlink_init(void);
+void dlm_netlink_exit(void);
+void dlm_timeout_warn(struct dlm_lkb *lkb);
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
 #endif                         /* __DLM_INTERNAL_DOT_H__ */
 
index 3915b8e1414623db82bce20d0d4b4c41fc6b9fa3..ff4a198fa6776a325d730748bd3d8b5782cd189a 100644 (file)
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -88,7 +88,6 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
 static int receive_extralen(struct dlm_message *ms);
 static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
 static void del_timeout(struct dlm_lkb *lkb);
-void dlm_timeout_warn(struct dlm_lkb *lkb);
 
 /*
 * Lock compatibility matrix - thanks Steve
@@ -335,7 +334,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
 {
        struct dlm_rsb *r;
 
-       r = allocate_rsb(ls, len);
+       r = dlm_allocate_rsb(ls, len);
        if (!r)
                return NULL;
 
@@ -478,7 +477,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
        error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
        if (!error) {
                write_unlock(&ls->ls_rsbtbl[bucket].lock);
-               free_rsb(r);
+               dlm_free_rsb(r);
                r = tmp;
                goto out;
        }
@@ -490,12 +489,6 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
        return error;
 }
 
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-                unsigned int flags, struct dlm_rsb **r_ret)
-{
-       return find_rsb(ls, name, namelen, flags, r_ret);
-}
-
 /* This is only called to add a reference when the code already holds
    a valid reference to the rsb, so there's no need for locking. */
 
@@ -519,7 +512,7 @@ static void toss_rsb(struct kref *kref)
        list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
        r->res_toss_time = jiffies;
        if (r->res_lvbptr) {
-               free_lvb(r->res_lvbptr);
+               dlm_free_lvb(r->res_lvbptr);
                r->res_lvbptr = NULL;
        }
 }
@@ -589,7 +582,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
        uint32_t lkid = 0;
        uint16_t bucket;
 
-       lkb = allocate_lkb(ls);
+       lkb = dlm_allocate_lkb(ls);
        if (!lkb)
                return -ENOMEM;
 
@@ -683,8 +676,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
 
                /* for local/process lkbs, lvbptr points to caller's lksb */
                if (lkb->lkb_lvbptr && is_master_copy(lkb))
-                       free_lvb(lkb->lkb_lvbptr);
-               free_lkb(lkb);
+                       dlm_free_lvb(lkb->lkb_lvbptr);
+               dlm_free_lkb(lkb);
                return 1;
        } else {
                write_unlock(&ls->ls_lkbtbl[bucket].lock);
@@ -988,7 +981,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 
                        if (is_master(r))
                                dir_remove(r);
-                       free_rsb(r);
+                       dlm_free_rsb(r);
                        count++;
                } else {
                        write_unlock(&ls->ls_rsbtbl[b].lock);
@@ -1171,7 +1164,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
                        return;
 
                if (!r->res_lvbptr)
-                       r->res_lvbptr = allocate_lvb(r->res_ls);
+                       r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 
                if (!r->res_lvbptr)
                        return;
@@ -1203,7 +1196,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
                return;
 
        if (!r->res_lvbptr)
-               r->res_lvbptr = allocate_lvb(r->res_ls);
+               r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
 
        if (!r->res_lvbptr)
                return;
@@ -1852,7 +1845,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
 static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
        struct dlm_ls *ls = r->res_ls;
-       int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+       int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
 
        if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
                rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
@@ -1886,7 +1879,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
                return 1;
        }
 
-       for (;;) {
+       for (i = 0; i < 2; i++) {
                /* It's possible for dlm_scand to remove an old rsb for
                   this same resource from the toss list, us to create
                   a new one, look up the master locally, and find it
@@ -1900,6 +1893,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
                log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
                schedule();
        }
+       if (error && error != -EEXIST)
+               return error;
 
        if (ret_nodeid == our_nodeid) {
                r->res_first_lkid = 0;
@@ -1941,8 +1936,11 @@ static void confirm_master(struct dlm_rsb *r, int error)
                break;
 
        case -EAGAIN:
-               /* the remote master didn't queue our NOQUEUE request;
-                  make a waiting lkb the first_lkid */
+       case -EBADR:
+       case -ENOTBLK:
+               /* the remote request failed and won't be retried (it was
+                  a NOQUEUE, or has been canceled/unlocked); make a waiting
+                  lkb the first_lkid */
 
                r->res_first_lkid = 0;
 
@@ -2108,17 +2106,18 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
        /* an lkb may be waiting for an rsb lookup to complete where the
           lookup was initiated by another lock */
 
-       if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
-               if (!list_empty(&lkb->lkb_rsb_lookup)) {
+       if (!list_empty(&lkb->lkb_rsb_lookup)) {
+               if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
                        log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
                        list_del_init(&lkb->lkb_rsb_lookup);
                        queue_cast(lkb->lkb_resource, lkb,
                                   args->flags & DLM_LKF_CANCEL ?
                                   -DLM_ECANCEL : -DLM_EUNLOCK);
                        unhold_lkb(lkb); /* undoes create_lkb() */
-                       rv = -EBUSY;
-                       goto out;
                }
+               /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
+               rv = -EBUSY;
+               goto out;
        }
 
        /* cancel not allowed with another cancel/unlock in progress */
@@ -2986,7 +2985,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
                if (!lkb->lkb_lvbptr)
-                       lkb->lkb_lvbptr = allocate_lvb(ls);
+                       lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
                len = receive_extralen(ms);
@@ -3006,11 +3005,9 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
        lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
        lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
 
-       DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
-
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
                /* lkb was just created so there won't be an lvb yet */
-               lkb->lkb_lvbptr = allocate_lvb(ls);
+               lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
        }
@@ -3021,16 +3018,6 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
                                struct dlm_message *ms)
 {
-       if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
-               log_error(ls, "convert_args nodeid %d %d lkid %x %x",
-                         lkb->lkb_nodeid, ms->m_header.h_nodeid,
-                         lkb->lkb_id, lkb->lkb_remid);
-               return -EINVAL;
-       }
-
-       if (!is_master_copy(lkb))
-               return -EINVAL;
-
        if (lkb->lkb_status != DLM_LKSTS_GRANTED)
                return -EBUSY;
 
@@ -3046,8 +3033,6 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
                               struct dlm_message *ms)
 {
-       if (!is_master_copy(lkb))
-               return -EINVAL;
        if (receive_lvb(ls, lkb, ms))
                return -ENOMEM;
        return 0;
@@ -3063,6 +3048,50 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
        lkb->lkb_remid = ms->m_lkid;
 }
 
+/* This is called after the rsb is locked so that we can safely inspect
+   fields in the lkb. */
+
+static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+       int from = ms->m_header.h_nodeid;
+       int error = 0;
+
+       switch (ms->m_type) {
+       case DLM_MSG_CONVERT:
+       case DLM_MSG_UNLOCK:
+       case DLM_MSG_CANCEL:
+               if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
+                       error = -EINVAL;
+               break;
+
+       case DLM_MSG_CONVERT_REPLY:
+       case DLM_MSG_UNLOCK_REPLY:
+       case DLM_MSG_CANCEL_REPLY:
+       case DLM_MSG_GRANT:
+       case DLM_MSG_BAST:
+               if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
+                       error = -EINVAL;
+               break;
+
+       case DLM_MSG_REQUEST_REPLY:
+               if (!is_process_copy(lkb))
+                       error = -EINVAL;
+               else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
+                       error = -EINVAL;
+               break;
+
+       default:
+               error = -EINVAL;
+       }
+
+       if (error)
+               log_error(lkb->lkb_resource->res_ls,
+                         "ignore invalid message %d from %d %x %x %x %d",
+                         ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
+                         lkb->lkb_flags, lkb->lkb_nodeid);
+       return error;
+}
+
 static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 {
        struct dlm_lkb *lkb;
@@ -3124,17 +3153,21 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        receive_flags(lkb, ms);
        error = receive_convert_args(ls, lkb, ms);
        if (error)
-               goto out;
+               goto out_reply;
        reply = !down_conversion(lkb);
 
        error = do_convert(r, lkb);
- out:
+ out_reply:
        if (reply)
                send_convert_reply(r, lkb, error);
-
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3160,15 +3193,19 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        receive_flags(lkb, ms);
        error = receive_unlock_args(ls, lkb, ms);
        if (error)
-               goto out;
+               goto out_reply;
 
        error = do_unlock(r, lkb);
- out:
+ out_reply:
        send_unlock_reply(r, lkb, error);
-
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3196,9 +3233,13 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        error = do_cancel(r, lkb);
        send_cancel_reply(r, lkb, error);
-
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3217,22 +3258,26 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_grant no lkb");
+               log_debug(ls, "receive_grant from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        r = lkb->lkb_resource;
 
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        receive_flags_reply(lkb, ms);
        if (is_altmode(lkb))
                munge_altmode(lkb, ms);
        grant_lock_pc(r, lkb, ms);
        queue_cast(r, lkb, 0);
-
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3246,18 +3291,22 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_bast no lkb");
+               log_debug(ls, "receive_bast from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        r = lkb->lkb_resource;
 
        hold_rsb(r);
        lock_rsb(r);
 
-       queue_bast(r, lkb, ms->m_bastmode);
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
 
+       queue_bast(r, lkb, ms->m_bastmode);
+ out:
        unlock_rsb(r);
        put_rsb(r);
        dlm_put_lkb(lkb);
@@ -3323,15 +3372,19 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_request_reply no lkb");
+               log_debug(ls, "receive_request_reply from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        r = lkb->lkb_resource;
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        mstype = lkb->lkb_wait_type;
        error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
        if (error)
@@ -3383,6 +3436,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
                if (is_overlap(lkb)) {
                        /* we'll ignore error in cancel/unlock reply */
                        queue_cast_overlap(r, lkb);
+                       confirm_master(r, result);
                        unhold_lkb(lkb); /* undoes create_lkb() */
                } else
                        _request_lock(r, lkb);
@@ -3463,6 +3517,10 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3481,10 +3539,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_convert_reply no lkb");
+               log_debug(ls, "receive_convert_reply from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        _receive_convert_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3498,6 +3556,10 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3529,10 +3591,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_unlock_reply no lkb");
+               log_debug(ls, "receive_unlock_reply from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        _receive_unlock_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3546,6 +3608,10 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
        hold_rsb(r);
        lock_rsb(r);
 
+       error = validate_message(lkb, ms);
+       if (error)
+               goto out;
+
        /* stub reply can happen with waiters_mutex held */
        error = remove_from_waiters_ms(lkb, ms);
        if (error)
@@ -3577,10 +3643,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
        error = find_lkb(ls, ms->m_remid, &lkb);
        if (error) {
-               log_error(ls, "receive_cancel_reply no lkb");
+               log_debug(ls, "receive_cancel_reply from %d no lkb %x",
+                         ms->m_header.h_nodeid, ms->m_remid);
                return;
        }
-       DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
 
        _receive_cancel_reply(lkb, ms);
        dlm_put_lkb(lkb);
@@ -3640,6 +3706,13 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 
 static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
 {
+       if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+               log_debug(ls, "ignore non-member message %d from %d %x %x %d",
+                         ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
+                         ms->m_remid, ms->m_result);
+               return;
+       }
+
        switch (ms->m_type) {
 
        /* messages sent to a master node */
@@ -3778,8 +3851,9 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
 
        ls = dlm_find_lockspace_global(hd->h_lockspace);
        if (!ls) {
-               log_print("invalid h_lockspace %x from %d cmd %d type %d",
-                         hd->h_lockspace, nodeid, hd->h_cmd, type);
+               if (dlm_config.ci_log_debug)
+                       log_print("invalid lockspace %x from %d cmd %d type %d",
+                                 hd->h_lockspace, nodeid, hd->h_cmd, type);
 
                if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
                        dlm_send_ls_not_ready(nodeid, rc);
@@ -3806,6 +3880,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
                ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
                ls->ls_stub_ms.m_result = -EINPROGRESS;
                ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+               ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                _receive_convert_reply(lkb, &ls->ls_stub_ms);
 
                /* Same special case as in receive_rcom_lock_args() */
@@ -3847,6 +3922,7 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
 void dlm_recover_waiters_pre(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb, *safe;
+       int wait_type, stub_unlock_result, stub_cancel_result;
 
        mutex_lock(&ls->ls_waiters_mutex);
 
@@ -3865,7 +3941,33 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                if (!waiter_needs_recovery(ls, lkb))
                        continue;
 
-               switch (lkb->lkb_wait_type) {
+               wait_type = lkb->lkb_wait_type;
+               stub_unlock_result = -DLM_EUNLOCK;
+               stub_cancel_result = -DLM_ECANCEL;
+
+               /* Main reply may have been received leaving a zero wait_type,
+                  but a reply for the overlapping op may not have been
+                  received.  In that case we need to fake the appropriate
+                  reply for the overlap op. */
+
+               if (!wait_type) {
+                       if (is_overlap_cancel(lkb)) {
+                               wait_type = DLM_MSG_CANCEL;
+                               if (lkb->lkb_grmode == DLM_LOCK_IV)
+                                       stub_cancel_result = 0;
+                       }
+                       if (is_overlap_unlock(lkb)) {
+                               wait_type = DLM_MSG_UNLOCK;
+                               if (lkb->lkb_grmode == DLM_LOCK_IV)
+                                       stub_unlock_result = -ENOENT;
+                       }
+
+                       log_debug(ls, "rwpre overlap %x %x %d %d %d",
+                                 lkb->lkb_id, lkb->lkb_flags, wait_type,
+                                 stub_cancel_result, stub_unlock_result);
+               }
+
+               switch (wait_type) {
 
                case DLM_MSG_REQUEST:
                        lkb->lkb_flags |= DLM_IFL_RESEND;
@@ -3878,8 +3980,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_UNLOCK:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
-                       ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+                       ls->ls_stub_ms.m_result = stub_unlock_result;
                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                       ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                        _receive_unlock_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
                        break;
@@ -3887,15 +3990,16 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
                case DLM_MSG_CANCEL:
                        hold_lkb(lkb);
                        ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
-                       ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+                       ls->ls_stub_ms.m_result = stub_cancel_result;
                        ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+                       ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
                        _receive_cancel_reply(lkb, &ls->ls_stub_ms);
                        dlm_put_lkb(lkb);
                        break;
 
                default:
-                       log_error(ls, "invalid lkb wait_type %d",
-                                 lkb->lkb_wait_type);
+                       log_error(ls, "invalid lkb wait_type %d %d",
+                                 lkb->lkb_wait_type, wait_type);
                }
                schedule();
        }
@@ -4184,7 +4288,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
        lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
 
        if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
-               lkb->lkb_lvbptr = allocate_lvb(ls);
+               lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
                if (!lkb->lkb_lvbptr)
                        return -ENOMEM;
                lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4259,7 +4363,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
        put_rsb(r);
  out:
        if (error)
-               log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+               log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid);
        rl->rl_result = error;
        return error;
 }
@@ -4342,7 +4446,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
                }
        }
 
-       /* After ua is attached to lkb it will be freed by free_lkb().
+       /* After ua is attached to lkb it will be freed by dlm_free_lkb().
           When DLM_IFL_USER is set, the dlm knows that this is a userspace
           lock and that lkb_astparam is the dlm_user_args structure. */
 
@@ -4679,6 +4783,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
        }
 
        list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
+               lkb->lkb_ast_type = 0;
                list_del(&lkb->lkb_astqueue);
                dlm_put_lkb(lkb);
        }
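Several of the receive paths above gain a second exit label so that a message rejected by validate_message() is dropped without any reply, while a later argument failure still sends an error reply back. A stripped-down sketch of that control flow, modeled on receive_unlock() (the handler name is hypothetical; the helpers are the ones used in the hunks above):

    /* Sketch of the two-label exit pattern; not a literal copy of the patch. */
    static void receive_example(struct dlm_ls *ls, struct dlm_message *ms)
    {
            struct dlm_lkb *lkb;
            struct dlm_rsb *r;
            int error;

            if (find_lkb(ls, ms->m_remid, &lkb))
                    return;                 /* unknown lkb: nothing to act on */

            r = lkb->lkb_resource;
            hold_rsb(r);
            lock_rsb(r);

            error = validate_message(lkb, ms);
            if (error)
                    goto out;               /* invalid sender: drop silently */

            error = receive_unlock_args(ls, lkb, ms);
            if (error)
                    goto out_reply;         /* valid sender, bad args: reply with error */

            error = do_unlock(r, lkb);
     out_reply:
            send_unlock_reply(r, lkb, error);
     out:
            unlock_rsb(r);
            put_rsb(r);
            dlm_put_lkb(lkb);
    }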
index ada04680a1e5d9ff994aa82a35c899b6a975692b..27b6ed3029115d194a809373b739f841162dea92 100644 (file)
@@ -19,8 +19,6 @@ void dlm_print_lkb(struct dlm_lkb *lkb);
 void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
 void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
 int dlm_modes_compat(int mode1, int mode2);
-int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
-       unsigned int flags, struct dlm_rsb **r_ret);
 void dlm_put_rsb(struct dlm_rsb *r);
 void dlm_hold_rsb(struct dlm_rsb *r);
 int dlm_put_lkb(struct dlm_lkb *lkb);
index 5c108c49cb8cd30128d533630cdd3ac72810839d..b180fdc51085adba0be6ace10703efdfab5507b9 100644 (file)
 #include "recover.h"
 #include "requestqueue.h"
 
-#ifdef CONFIG_DLM_DEBUG
-int dlm_create_debug_file(struct dlm_ls *ls);
-void dlm_delete_debug_file(struct dlm_ls *ls);
-#else
-static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
-static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
-#endif
-
 static int                     ls_count;
 static struct mutex            ls_lock;
 static struct list_head                lslist;
@@ -684,9 +676,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                        dlm_del_ast(lkb);
 
                        if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
-                               free_lvb(lkb->lkb_lvbptr);
+                               dlm_free_lvb(lkb->lkb_lvbptr);
 
-                       free_lkb(lkb);
+                       dlm_free_lkb(lkb);
                }
        }
        dlm_astd_resume();
@@ -704,7 +696,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                                         res_hashchain);
 
                        list_del(&rsb->res_hashchain);
-                       free_rsb(rsb);
+                       dlm_free_rsb(rsb);
                }
 
                head = &ls->ls_rsbtbl[i].toss;
@@ -712,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
                        rsb = list_entry(head->next, struct dlm_rsb,
                                         res_hashchain);
                        list_del(&rsb->res_hashchain);
-                       free_rsb(rsb);
+                       dlm_free_rsb(rsb);
                }
        }
 
index e9923ca9c2d9ee4c700eb6f18ea1bfb150fee52f..7c1e5e5cccd8ae97a87a618ad9818d5b35721553 100644 (file)
@@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con)
 static void tcp_connect_to_sock(struct connection *con)
 {
        int result = -EHOSTUNREACH;
-       struct sockaddr_storage saddr;
+       struct sockaddr_storage saddr, src_addr;
        int addr_len;
        struct socket *sock;
 
@@ -898,6 +898,17 @@ static void tcp_connect_to_sock(struct connection *con)
        con->connect_action = tcp_connect_to_sock;
        add_sock(sock, con);
 
+       /* Bind to our cluster-known address before connecting, to avoid
+          routing problems */
+       memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+       make_sockaddr(&src_addr, 0, &addr_len);
+       result = sock->ops->bind(sock, (struct sockaddr *) &src_addr,
+                                addr_len);
+       if (result < 0) {
+               log_print("could not bind for connect: %d", result);
+               /* This *may* not indicate a critical error */
+       }
+
        make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
        log_print("connecting to %d", con->nodeid);
@@ -1426,6 +1437,8 @@ void dlm_lowcomms_stop(void)
                con = __nodeid2con(i, 0);
                if (con) {
                        close_connection(con, true);
+                       if (con->othercon)
+                               kmem_cache_free(con_cache, con->othercon);
                        kmem_cache_free(con_cache, con);
                }
        }
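The tcp_connect_to_sock() hunk binds the outgoing socket to this node's cluster-known address before calling connect, so that traffic to the peer originates from the address other members expect. The same idea in a small user-space sketch (the local and peer addresses are placeholders, not values from the patch):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Bind-before-connect: pin the source address of an outgoing TCP
     * connection.  local_ip/peer_ip/port are caller-supplied placeholders. */
    int connect_from(const char *local_ip, const char *peer_ip, int port)
    {
            struct sockaddr_in src, dst;
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0)
                    return -1;

            memset(&src, 0, sizeof(src));
            src.sin_family = AF_INET;
            src.sin_port = 0;                       /* any source port */
            inet_pton(AF_INET, local_ip, &src.sin_addr);
            if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0)
                    perror("bind");                 /* may not be fatal, as in the patch */

            memset(&dst, 0, sizeof(dst));
            dst.sin_family = AF_INET;
            dst.sin_port = htons(port);
            inet_pton(AF_INET, peer_ip, &dst.sin_addr);
            if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                    perror("connect");
                    close(fd);
                    return -1;
            }
            return fd;
    }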
index eca2907f2386da93396d19d5195bd59101614b54..58487fb95a4c258b6e33f38a167460cadfa171be 100644 (file)
 #include "memory.h"
 #include "config.h"
 
-#ifdef CONFIG_DLM_DEBUG
-int dlm_register_debugfs(void);
-void dlm_unregister_debugfs(void);
-#else
-static inline int dlm_register_debugfs(void) { return 0; }
-static inline void dlm_unregister_debugfs(void) { }
-#endif
-int dlm_netlink_init(void);
-void dlm_netlink_exit(void);
-
 static int __init init_dlm(void)
 {
        int error;
index e9cdcab306e2a06037c4e0c32ac1c0e7be3097dc..fa17f5a278831fa3acf3977930fae7fcc63af2df 100644 (file)
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
        ls->ls_num_nodes--;
 }
 
-static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+int dlm_is_member(struct dlm_ls *ls, int nodeid)
 {
        struct dlm_member *memb;
 
index 927c08c192148ef07c165b29d1715a4ed35c2fa9..7a26fca1e0b5386e6b5906655b57f6fd80e1fe3c 100644 (file)
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_clear_members(struct dlm_ls *ls);
 void dlm_clear_members_gone(struct dlm_ls *ls);
 int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
 int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+int dlm_is_member(struct dlm_ls *ls, int nodeid);
 
 #endif                          /* __MEMBER_DOT_H__ */
 
index ecf0e5cb2035e885e90bf8acbc992c3c9b0c397d..f7783867491aedf9678f6c506d49bf636ddb68e0 100644 (file)
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,7 @@ void dlm_memory_exit(void)
                kmem_cache_destroy(lkb_cache);
 }
 
-char *allocate_lvb(struct dlm_ls *ls)
+char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
        char *p;
 
@@ -43,7 +43,7 @@ char *allocate_lvb(struct dlm_ls *ls)
        return p;
 }
 
-void free_lvb(char *p)
+void dlm_free_lvb(char *p)
 {
        kfree(p);
 }
@@ -51,7 +51,7 @@ void free_lvb(char *p)
 /* FIXME: have some minimal space built-in to rsb for the name and
    kmalloc a separate name if needed, like dentries are done */
 
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 {
        struct dlm_rsb *r;
 
@@ -61,14 +61,14 @@ struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
        return r;
 }
 
-void free_rsb(struct dlm_rsb *r)
+void dlm_free_rsb(struct dlm_rsb *r)
 {
        if (r->res_lvbptr)
-               free_lvb(r->res_lvbptr);
+               dlm_free_lvb(r->res_lvbptr);
        kfree(r);
 }
 
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb;
 
@@ -76,7 +76,7 @@ struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
        return lkb;
 }
 
-void free_lkb(struct dlm_lkb *lkb)
+void dlm_free_lkb(struct dlm_lkb *lkb)
 {
        if (lkb->lkb_flags & DLM_IFL_USER) {
                struct dlm_user_args *ua;
@@ -90,19 +90,3 @@ void free_lkb(struct dlm_lkb *lkb)
        kmem_cache_free(lkb_cache, lkb);
 }
 
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
-{
-       struct dlm_direntry *de;
-
-       DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
-                  printk("namelen = %d\n", namelen););
-
-       de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL);
-       return de;
-}
-
-void free_direntry(struct dlm_direntry *de)
-{
-       kfree(de);
-}
-
index 6ead158ccc5c661cb4138bfd756bb05d69382989..485fb29143bdd0d86af973523ea8ab4a584780ab 100644 (file)
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 
 int dlm_memory_init(void);
 void dlm_memory_exit(void);
-struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
-void free_rsb(struct dlm_rsb *r);
-struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
-void free_lkb(struct dlm_lkb *l);
-struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
-void free_direntry(struct dlm_direntry *de);
-char *allocate_lvb(struct dlm_ls *ls);
-void free_lvb(char *l);
+struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen);
+void dlm_free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
+void dlm_free_lkb(struct dlm_lkb *l);
+char *dlm_allocate_lvb(struct dlm_ls *ls);
+void dlm_free_lvb(char *l);
 
 #endif         /* __MEMORY_DOT_H__ */
 
index f8c69dda16a080af8e960f2111772bed46d2c8c2..e69926e984db3f07565aa052335e49151bd489d5 100644 (file)
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -58,8 +58,12 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset,
 int dlm_process_incoming_buffer(int nodeid, const void *base,
                                unsigned offset, unsigned len, unsigned limit)
 {
-       unsigned char __tmp[DLM_INBUF_LEN];
-       struct dlm_header *msg = (struct dlm_header *) __tmp;
+       union {
+               unsigned char __buf[DLM_INBUF_LEN];
+               /* this is to force proper alignment on some arches */
+               struct dlm_header dlm;
+       } __tmp;
+       struct dlm_header *msg = &__tmp.dlm;
        int ret = 0;
        int err = 0;
        uint16_t msglen;
@@ -100,8 +104,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                   in the buffer on the stack (which should work for most
                   ordinary messages). */
 
-               if (msglen > sizeof(__tmp) &&
-                   msg == (struct dlm_header *) __tmp) {
+               if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) {
                        msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
                        if (msg == NULL)
                                return ret;
@@ -119,7 +122,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                dlm_receive_buffer(msg, nodeid);
        }
 
-       if (msg != (struct dlm_header *) __tmp)
+       if (msg != &__tmp.dlm)
                kfree(msg);
 
        return err ? err : ret;
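The union introduced in dlm_process_incoming_buffer() exists only to give the on-stack receive buffer the alignment of struct dlm_header; a bare char array carries no such guarantee, and misaligned multi-byte loads fault or trap on strict-alignment architectures. A minimal sketch of the idiom with a hypothetical header type and buffer size:

    #include <stdint.h>
    #include <string.h>

    #define BUF_LEN 256                     /* hypothetical, like DLM_INBUF_LEN */

    struct hdr {                            /* hypothetical wire header */
            uint32_t length;
            uint32_t type;
    };

    void handle_bytes(const void *data, size_t len)
    {
            union {
                    unsigned char buf[BUF_LEN];
                    struct hdr h;           /* forces the union to hdr alignment */
            } tmp;
            struct hdr *msg = &tmp.h;

            if (len > sizeof(tmp.buf))
                    return;                 /* a real caller would fall back to kmalloc */
            memcpy(tmp.buf, data, len);
            /* msg->length and msg->type are now properly aligned reads */
            (void)msg;
    }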
index ae2fd97fa4adc8392b720f1e455e4a53968c8d0c..026824cd3acbdd5198d9a4a299b4b6babaf7b02c 100644 (file)
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -197,11 +197,6 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        spin_unlock(&ls->ls_rcom_spin);
 }
 
-static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-       receive_sync_reply(ls, rc_in);
-}
-
 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
 {
        struct dlm_rcom *rc;
@@ -254,11 +249,6 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        send_rcom(ls, mh, rc);
 }
 
-static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-       receive_sync_reply(ls, rc_in);
-}
-
 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
 {
        struct dlm_rcom *rc;
@@ -381,11 +371,6 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
        send_rcom(ls, mh, rc);
 }
 
-static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
-{
-       dlm_recover_process_copy(ls, rc_in);
-}
-
 /* If the lockspace doesn't exist then still send a status message
    back; it's possible that it just doesn't have its global_id yet. */
 
@@ -481,11 +466,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
                break;
 
        case DLM_RCOM_STATUS_REPLY:
-               receive_rcom_status_reply(ls, rc);
+               receive_sync_reply(ls, rc);
                break;
 
        case DLM_RCOM_NAMES_REPLY:
-               receive_rcom_names_reply(ls, rc);
+               receive_sync_reply(ls, rc);
                break;
 
        case DLM_RCOM_LOOKUP_REPLY:
@@ -493,11 +478,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
                break;
 
        case DLM_RCOM_LOCK_REPLY:
-               receive_rcom_lock_reply(ls, rc);
+               dlm_recover_process_copy(ls, rc);
                break;
 
        default:
-               DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+               log_error(ls, "receive_rcom bad type %d", rc->rc_type);
        }
  out:
        return;
index c2cc7694cd164b6847f35c12257b6a783bf82525..df075dc300fa4ad7bbe8364478b297795d562a61 100644 (file)
@@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r)
                goto out;
 
        if (!r->res_lvbptr) {
-               r->res_lvbptr = allocate_lvb(r->res_ls);
+               r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
                if (!r->res_lvbptr)
                        goto out;
        }
@@ -731,6 +731,20 @@ int dlm_create_root_list(struct dlm_ls *ls)
                        list_add(&r->res_root_list, &ls->ls_root_list);
                        dlm_hold_rsb(r);
                }
+
+               /* If we're using a directory, add tossed rsbs to the root
+                  list; they'll have entries created in the new directory,
+                  but no other recovery steps should do anything with them. */
+
+               if (dlm_no_directory(ls)) {
+                       read_unlock(&ls->ls_rsbtbl[i].lock);
+                       continue;
+               }
+
+               list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) {
+                       list_add(&r->res_root_list, &ls->ls_root_list);
+                       dlm_hold_rsb(r);
+               }
                read_unlock(&ls->ls_rsbtbl[i].lock);
        }
  out:
@@ -750,6 +764,11 @@ void dlm_release_root_list(struct dlm_ls *ls)
        up_write(&ls->ls_root_sem);
 }
 
+/* If not using a directory, clear the entire toss list; there's no benefit to
+   caching the master value since it's fixed.  If we are using a dir, keep the
+   rsb's we're the master of.  Recovery will add them to the root list and from
+   there they'll be entered in the rebuilt directory. */
+
 void dlm_clear_toss_list(struct dlm_ls *ls)
 {
        struct dlm_rsb *r, *safe;
@@ -759,8 +778,10 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
                write_lock(&ls->ls_rsbtbl[i].lock);
                list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
                                         res_hashchain) {
-                       list_del(&r->res_hashchain);
-                       free_rsb(r);
+                       if (dlm_no_directory(ls) || !is_master(r)) {
+                               list_del(&r->res_hashchain);
+                               dlm_free_rsb(r);
+                       }
                }
                write_unlock(&ls->ls_rsbtbl[i].lock);
        }
index 4b89e20eebe702f32a9214d2f377e1efc2768001..997f9531d59482ed1d3fecee535d8aada119b302 100644 (file)
@@ -67,17 +67,18 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
        dlm_astd_resume();
 
        /*
-        * This list of root rsb's will be the basis of most of the recovery
-        * routines.
+        * Free non-master tossed rsb's.  Master rsb's are kept on toss
+        * list and put on root list to be included in resdir recovery.
         */
 
-       dlm_create_root_list(ls);
+       dlm_clear_toss_list(ls);
 
        /*
-        * Free all the tossed rsb's so we don't have to recover them.
+        * This list of root rsb's will be the basis of most of the recovery
+        * routines.
         */
 
-       dlm_clear_toss_list(ls);
+       dlm_create_root_list(ls);
 
        /*
         * Add or remove nodes from the lockspace's ls_nodes list.
index 4f741546f4bbce873079bee9060eac437934d8b6..7cbc6826239b10b5b17b70c0d4f737f7e77152fa 100644 (file)
@@ -24,8 +24,7 @@
 #include "lvb_table.h"
 #include "user.h"
 
-static const char *name_prefix="dlm";
-static struct miscdevice ctl_device;
+static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
 
 #ifdef CONFIG_COMPAT
@@ -82,7 +81,8 @@ struct dlm_lock_result32 {
 };
 
 static void compat_input(struct dlm_write_request *kb,
-                        struct dlm_write_request32 *kb32)
+                        struct dlm_write_request32 *kb32,
+                        int max_namelen)
 {
        kb->version[0] = kb32->version[0];
        kb->version[1] = kb32->version[1];
@@ -112,7 +112,11 @@ static void compat_input(struct dlm_write_request *kb,
                kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
                kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
                memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
-               memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+               if (kb->i.lock.namelen <= max_namelen)
+                       memcpy(kb->i.lock.name, kb32->i.lock.name,
+                              kb->i.lock.namelen);
+               else
+                       kb->i.lock.namelen = max_namelen;
        }
 }
 
@@ -236,12 +240,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
        spin_unlock(&proc->asts_spin);
 
        if (eol) {
-               spin_lock(&ua->proc->locks_spin);
+               spin_lock(&proc->locks_spin);
                if (!list_empty(&lkb->lkb_ownqueue)) {
                        list_del_init(&lkb->lkb_ownqueue);
                        dlm_put_lkb(lkb);
                }
-               spin_unlock(&ua->proc->locks_spin);
+               spin_unlock(&proc->locks_spin);
        }
  out:
        mutex_unlock(&ls->ls_clear_proc_locks);
@@ -529,7 +533,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
                if (proc)
                        set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
-               compat_input(kbuf, k32buf);
+               compat_input(kbuf, k32buf,
+                            count - sizeof(struct dlm_write_request32));
                kfree(k32buf);
        }
 #endif
@@ -896,14 +901,16 @@ static const struct file_operations ctl_device_fops = {
        .owner   = THIS_MODULE,
 };
 
+static struct miscdevice ctl_device = {
+       .name  = "dlm-control",
+       .fops  = &ctl_device_fops,
+       .minor = MISC_DYNAMIC_MINOR,
+};
+
 int dlm_user_init(void)
 {
        int error;
 
-       ctl_device.name = "dlm-control";
-       ctl_device.fops = &ctl_device_fops;
-       ctl_device.minor = MISC_DYNAMIC_MINOR;
-
        error = misc_register(&ctl_device);
        if (error)
                log_print("misc_register failed for control device");
index 963889cf674063fb5591cc0ab7b933afb06b24b5..4d9c1f4e1bd10af144a0ae813badaf528c042532 100644 (file)
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
 #include "rcom.h"
 #include "util.h"
 
+#define DLM_ERRNO_EDEADLK              35
+#define DLM_ERRNO_EBADR                        53
+#define DLM_ERRNO_EBADSLT              57
+#define DLM_ERRNO_EPROTO               71
+#define DLM_ERRNO_EOPNOTSUPP           95
+#define DLM_ERRNO_ETIMEDOUT           110
+#define DLM_ERRNO_EINPROGRESS         115
+
 static void header_out(struct dlm_header *hd)
 {
        hd->h_version           = cpu_to_le32(hd->h_version);
@@ -30,11 +38,54 @@ static void header_in(struct dlm_header *hd)
        hd->h_length            = le16_to_cpu(hd->h_length);
 }
 
-void dlm_message_out(struct dlm_message *ms)
+/* higher errno values are inconsistent across architectures, so select
+   one set of values for use on the wire */
+
+static int to_dlm_errno(int err)
+{
+       switch (err) {
+       case -EDEADLK:
+               return -DLM_ERRNO_EDEADLK;
+       case -EBADR:
+               return -DLM_ERRNO_EBADR;
+       case -EBADSLT:
+               return -DLM_ERRNO_EBADSLT;
+       case -EPROTO:
+               return -DLM_ERRNO_EPROTO;
+       case -EOPNOTSUPP:
+               return -DLM_ERRNO_EOPNOTSUPP;
+       case -ETIMEDOUT:
+               return -DLM_ERRNO_ETIMEDOUT;
+       case -EINPROGRESS:
+               return -DLM_ERRNO_EINPROGRESS;
+       }
+       return err;
+}
+
+static int from_dlm_errno(int err)
 {
-       struct dlm_header *hd = (struct dlm_header *) ms;
+       switch (err) {
+       case -DLM_ERRNO_EDEADLK:
+               return -EDEADLK;
+       case -DLM_ERRNO_EBADR:
+               return -EBADR;
+       case -DLM_ERRNO_EBADSLT:
+               return -EBADSLT;
+       case -DLM_ERRNO_EPROTO:
+               return -EPROTO;
+       case -DLM_ERRNO_EOPNOTSUPP:
+               return -EOPNOTSUPP;
+       case -DLM_ERRNO_ETIMEDOUT:
+               return -ETIMEDOUT;
+       case -DLM_ERRNO_EINPROGRESS:
+               return -EINPROGRESS;
+       }
+       return err;
+}
 
-       header_out(hd);
+void dlm_message_out(struct dlm_message *ms)
+{
+       header_out(&ms->m_header);
 
        ms->m_type              = cpu_to_le32(ms->m_type);
        ms->m_nodeid            = cpu_to_le32(ms->m_nodeid);
@@ -53,14 +104,12 @@ void dlm_message_out(struct dlm_message *ms)
        ms->m_rqmode            = cpu_to_le32(ms->m_rqmode);
        ms->m_bastmode          = cpu_to_le32(ms->m_bastmode);
        ms->m_asts              = cpu_to_le32(ms->m_asts);
-       ms->m_result            = cpu_to_le32(ms->m_result);
+       ms->m_result            = cpu_to_le32(to_dlm_errno(ms->m_result));
 }
 
 void dlm_message_in(struct dlm_message *ms)
 {
-       struct dlm_header *hd = (struct dlm_header *) ms;
-
-       header_in(hd);
+       header_in(&ms->m_header);
 
        ms->m_type              = le32_to_cpu(ms->m_type);
        ms->m_nodeid            = le32_to_cpu(ms->m_nodeid);
@@ -79,7 +128,7 @@ void dlm_message_in(struct dlm_message *ms)
        ms->m_rqmode            = le32_to_cpu(ms->m_rqmode);
        ms->m_bastmode          = le32_to_cpu(ms->m_bastmode);
        ms->m_asts              = le32_to_cpu(ms->m_asts);
-       ms->m_result            = le32_to_cpu(ms->m_result);
+       ms->m_result            = from_dlm_errno(le32_to_cpu(ms->m_result));
 }
 
 static void rcom_lock_out(struct rcom_lock *rl)
@@ -126,10 +175,9 @@ static void rcom_config_in(struct rcom_config *rf)
 
 void dlm_rcom_out(struct dlm_rcom *rc)
 {
-       struct dlm_header *hd = (struct dlm_header *) rc;
        int type = rc->rc_type;
 
-       header_out(hd);
+       header_out(&rc->rc_header);
 
        rc->rc_type             = cpu_to_le32(rc->rc_type);
        rc->rc_result           = cpu_to_le32(rc->rc_result);
@@ -137,7 +185,7 @@ void dlm_rcom_out(struct dlm_rcom *rc)
        rc->rc_seq              = cpu_to_le64(rc->rc_seq);
        rc->rc_seq_reply        = cpu_to_le64(rc->rc_seq_reply);
 
-       if (type == DLM_RCOM_LOCK)
+       if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
                rcom_lock_out((struct rcom_lock *) rc->rc_buf);
 
        else if (type == DLM_RCOM_STATUS_REPLY)
@@ -146,9 +194,9 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 
 void dlm_rcom_in(struct dlm_rcom *rc)
 {
-       struct dlm_header *hd = (struct dlm_header *) rc;
+       int type;
 
-       header_in(hd);
+       header_in(&rc->rc_header);
 
        rc->rc_type             = le32_to_cpu(rc->rc_type);
        rc->rc_result           = le32_to_cpu(rc->rc_result);
@@ -156,10 +204,12 @@ void dlm_rcom_in(struct dlm_rcom *rc)
        rc->rc_seq              = le64_to_cpu(rc->rc_seq);
        rc->rc_seq_reply        = le64_to_cpu(rc->rc_seq_reply);
 
-       if (rc->rc_type == DLM_RCOM_LOCK)
+       type = rc->rc_type;
+
+       if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY))
                rcom_lock_in((struct rcom_lock *) rc->rc_buf);
 
-       else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+       else if (type == DLM_RCOM_STATUS_REPLY)
                rcom_config_in((struct rcom_config *) rc->rc_buf);
 }
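to_dlm_errno()/from_dlm_errno() pin a single set of values for the wire because the numeric values of the higher errnos differ between architectures, so sending a raw host errno could be misread by the peer. A hypothetical self-check, not part of the patch, verifying the mapping round-trips for every value it covers; since both helpers are static it would have to sit in util.c alongside them:

    /* Hypothetical round-trip check for the wire errno mapping. */
    static void dlm_errno_selftest(void)
    {
            static const int errs[] = {
                    -EDEADLK, -EBADR, -EBADSLT, -EPROTO,
                    -EOPNOTSUPP, -ETIMEDOUT, -EINPROGRESS,
            };
            int i;

            for (i = 0; i < ARRAY_SIZE(errs); i++)
                    WARN_ON(from_dlm_errno(to_dlm_errno(errs[i])) != errs[i]);
    }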
 
index 0f69c416eebc6adef7d05eeaf3283083adb9e700..a5432bbbfb88ab678a63b8cfb55b4d04cf8eae7d 100644 (file)
@@ -347,7 +347,8 @@ restart:
                                break;
                        }
                        retry = __process_buffer(journal, jh, bhs,&batch_count);
-                       if (!retry && lock_need_resched(&journal->j_list_lock)){
+                       if (!retry && (need_resched() ||
+                               spin_needbreak(&journal->j_list_lock))) {
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
index 610264b99a8e8bb357ed46dc9a08735443ab2659..31853eb65b4cb0f2bf4f365e21a0456e7362d1b1 100644 (file)
@@ -265,7 +265,7 @@ write_out_data:
                        put_bh(bh);
                }
 
-               if (lock_need_resched(&journal->j_list_lock)) {
+               if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
index 1b7f282c1ae9e9940ca57009e3d3c6f73054433b..6914598022ce836e10a13aa8be50aab1ec3bdc74 100644 (file)
@@ -353,7 +353,8 @@ restart:
                        }
                        retry = __process_buffer(journal, jh, bhs, &batch_count,
                                                 transaction);
-                       if (!retry && lock_need_resched(&journal->j_list_lock)){
+                       if (!retry && (need_resched() ||
+                               spin_needbreak(&journal->j_list_lock))) {
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                                break;
index da8d0eb3b7b9c8933091561e7e44852fd814de66..4f302d2792794008351326bf9d6e128cf8b18a33 100644 (file)
@@ -341,7 +341,7 @@ write_out_data:
                        put_bh(bh);
                }
 
-               if (lock_need_resched(&journal->j_list_lock)) {
+               if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
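All four jbd/jbd2 hunks above replace the old lock_need_resched() helper with an explicit need_resched() || spin_needbreak() test: the loop drops j_list_lock and restarts whenever a task needs the CPU or, with spinlock preemption enabled, another CPU is spinning on the lock. The general lock-break loop shape, reduced to a sketch with hypothetical names and a toy work counter:

    #include <linux/sched.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(mylock);
    static int pending = 16;                        /* hypothetical amount of work */

    static bool more_work(void)  { return pending > 0; }
    static void process_one(void) { pending--; }

    /* Do bounded work under a spinlock, backing off when someone
     * needs the CPU or the lock. */
    static void process_all(void)
    {
    restart:
            spin_lock(&mylock);
            while (more_work()) {
                    process_one();
                    if (need_resched() || spin_needbreak(&mylock)) {
                            spin_unlock(&mylock);
                            cond_resched();
                            goto restart;
                    }
            }
            spin_unlock(&mylock);
    }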
index d070b18e539dac18a8e452bff19524e74b1d793e..0b45fd3a4bfd6dfdb9e6e8d426a46489954822bb 100644 (file)
@@ -41,6 +41,48 @@ struct nlm_wait {
 
 static LIST_HEAD(nlm_blocked);
 
+/**
+ * nlmclnt_init - Set up per-NFS mount point lockd data structures
+ * @nlm_init: pointer to arguments structure
+ *
+ * Returns pointer to an appropriate nlm_host struct,
+ * or an ERR_PTR value.
+ */
+struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
+{
+       struct nlm_host *host;
+       u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
+       int status;
+
+       status = lockd_up(nlm_init->protocol);
+       if (status < 0)
+               return ERR_PTR(status);
+
+       host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+                                  nlm_init->protocol, nlm_version,
+                                  nlm_init->hostname,
+                                  strlen(nlm_init->hostname));
+       if (host == NULL) {
+               lockd_down();
+               return ERR_PTR(-ENOLCK);
+       }
+
+       return host;
+}
+EXPORT_SYMBOL_GPL(nlmclnt_init);
+
+/**
+ * nlmclnt_done - Release resources allocated by nlmclnt_init()
+ * @host: nlm_host structure reserved by nlmclnt_init()
+ *
+ */
+void nlmclnt_done(struct nlm_host *host)
+{
+       nlm_release_host(host);
+       lockd_down();
+}
+EXPORT_SYMBOL_GPL(nlmclnt_done);
+
 /*
  * Queue up a lock for blocking so that the GRANTED request can see it
  */
index a10343bed1607252c24c2ffcad872a0042c6d758..b6b74a60e1ebb8cb18cfb4935e23c5ba9a6bfc02 100644 (file)
@@ -145,34 +145,21 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
        BUG_ON(req->a_args.lock.fl.fl_ops != NULL);
 }
 
-/*
- * This is the main entry point for the NLM client.
+/**
+ * nlmclnt_proc - Perform a single client-side lock request
+ * @host: address of a valid nlm_host context representing the NLM server
+ * @cmd: fcntl-style file lock operation to perform
+ * @fl: address of arguments for the lock operation
+ *
  */
-int
-nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
 {
-       struct rpc_clnt         *client = NFS_CLIENT(inode);
-       struct sockaddr_in      addr;
-       struct nfs_server       *nfssrv = NFS_SERVER(inode);
-       struct nlm_host         *host;
        struct nlm_rqst         *call;
        sigset_t                oldset;
        unsigned long           flags;
-       int                     status, vers;
-
-       vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1;
-       if (NFS_PROTO(inode)->version > 3) {
-               printk(KERN_NOTICE "NFSv4 file locking not implemented!\n");
-               return -ENOLCK;
-       }
-
-       rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-       host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
-                                  nfssrv->nfs_client->cl_hostname,
-                                  strlen(nfssrv->nfs_client->cl_hostname));
-       if (host == NULL)
-               return -ENOLCK;
+       int                     status;
 
+       nlm_get_host(host);
        call = nlm_alloc_call(host);
        if (call == NULL)
                return -ENOMEM;
@@ -219,7 +206,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
        dprintk("lockd: clnt proc returns %d\n", status);
        return status;
 }
-EXPORT_SYMBOL(nlmclnt_proc);
+EXPORT_SYMBOL_GPL(nlmclnt_proc);
 
 /*
  * Allocate an NLM RPC call struct
@@ -257,7 +244,7 @@ void nlm_release_call(struct nlm_rqst *call)
 
 static void nlmclnt_rpc_release(void *data)
 {
-       return nlm_release_call(data);
+       nlm_release_call(data);
 }
 
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
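
With the inode-based entry point gone, a caller is expected to keep the nlm_host obtained from nlmclnt_init() and pass it to nlmclnt_proc() for each fcntl-style request; the function now takes its own reference via nlm_get_host(), so the caller's reference is untouched. A minimal hedged sketch, where my_file_lock() is an illustrative wrapper rather than an actual NFS call site:

	static int my_file_lock(struct nlm_host *host, int cmd, struct file_lock *fl)
	{
		/* host came from nlmclnt_init(); cmd is F_GETLK/F_SETLK/F_SETLKW */
		return nlmclnt_proc(host, cmd, fl);
	}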
index 633653bff9440632417dc8aaba3491e62265416c..3e459e18cc31ba2efeaca67dc9b40ae506604247 100644 (file)
@@ -612,8 +612,7 @@ const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
         * called with BKL held.
         */
        static char buf[2*NLM_MAXCOOKIELEN+1];
-       int i;
-       int len = sizeof(buf);
+       unsigned int i, len = sizeof(buf);
        char *p = buf;
 
        len--;  /* allow for trailing \0 */
index a796be5051bfa4886254f7bd40a9cb4f7ff4f7de..9b6bbf1b978795b8ba29668db9313f1e86bb0d36 100644 (file)
@@ -73,8 +73,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
        complete(&nfs_callback_info.started);
 
        for(;;) {
-               char buf[RPC_MAX_ADDRBUFLEN];
-
                if (signalled()) {
                        if (nfs_callback_info.users == 0)
                                break;
@@ -92,8 +90,6 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
                                        __FUNCTION__, -err);
                        break;
                }
-               dprintk("%s: request from %s\n", __FUNCTION__,
-                               svc_print_addr(rqstp, buf, sizeof(buf)));
                svc_process(rqstp);
        }
 
@@ -168,12 +164,11 @@ void nfs_callback_down(void)
 
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-       struct sockaddr_in *addr = svc_addr_in(rqstp);
        struct nfs_client *clp;
        char buf[RPC_MAX_ADDRBUFLEN];
 
        /* Don't talk to strangers */
-       clp = nfs_find_client(addr, 4);
+       clp = nfs_find_client(svc_addr(rqstp), 4);
        if (clp == NULL)
                return SVC_DROP;
 
index c2bb14e053e1cb445494e402b9f8c93d7df4b7c7..bb25d2135ff1e7e15ead85466b5c36a0de5737cf 100644 (file)
@@ -38,7 +38,7 @@ struct cb_compound_hdr_res {
 };
 
 struct cb_getattrargs {
-       struct sockaddr_in *addr;
+       struct sockaddr *addr;
        struct nfs_fh fh;
        uint32_t bitmap[2];
 };
@@ -53,7 +53,7 @@ struct cb_getattrres {
 };
 
 struct cb_recallargs {
-       struct sockaddr_in *addr;
+       struct sockaddr *addr;
        struct nfs_fh fh;
        nfs4_stateid stateid;
        uint32_t truncate;
index 72e55d83756d9b6d6b5e82a28040b58b0375cf9c..15f7785048d3eca72defd3ddfc64452301226f5f 100644 (file)
@@ -12,7 +12,9 @@
 #include "delegation.h"
 #include "internal.h"
 
+#ifdef NFS_DEBUG
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
+#endif
  
 __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
 {
@@ -20,12 +22,16 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
        struct nfs_delegation *delegation;
        struct nfs_inode *nfsi;
        struct inode *inode;
-       
+
        res->bitmap[0] = res->bitmap[1] = 0;
        res->status = htonl(NFS4ERR_BADHANDLE);
        clp = nfs_find_client(args->addr, 4);
        if (clp == NULL)
                goto out;
+
+       dprintk("NFS: GETATTR callback request from %s\n",
+               rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
        inode = nfs_delegation_find_inode(clp, &args->fh);
        if (inode == NULL)
                goto out_putclient;
@@ -65,23 +71,32 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
        clp = nfs_find_client(args->addr, 4);
        if (clp == NULL)
                goto out;
-       inode = nfs_delegation_find_inode(clp, &args->fh);
-       if (inode == NULL)
-               goto out_putclient;
-       /* Set up a helper thread to actually return the delegation */
-       switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
-               case 0:
-                       res = 0;
-                       break;
-               case -ENOENT:
-                       res = htonl(NFS4ERR_BAD_STATEID);
-                       break;
-               default:
-                       res = htonl(NFS4ERR_RESOURCE);
-       }
-       iput(inode);
-out_putclient:
-       nfs_put_client(clp);
+
+       dprintk("NFS: RECALL callback request from %s\n",
+               rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+       do {
+               struct nfs_client *prev = clp;
+
+               inode = nfs_delegation_find_inode(clp, &args->fh);
+               if (inode != NULL) {
+                       /* Set up a helper thread to actually return the delegation */
+                       switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
+                               case 0:
+                                       res = 0;
+                                       break;
+                               case -ENOENT:
+                                       if (res != 0)
+                                               res = htonl(NFS4ERR_BAD_STATEID);
+                                       break;
+                               default:
+                                       res = htonl(NFS4ERR_RESOURCE);
+                       }
+                       iput(inode);
+               }
+               clp = nfs_find_client_next(prev);
+               nfs_put_client(prev);
+       } while (clp != NULL);
 out:
        dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
        return res;
index 058ade7efe79407847b1af74a6d73bcbf90e0aec..c63eb720b68bed5d0eca4854e2150a0465353920 100644 (file)
@@ -139,7 +139,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
        if (unlikely(status != 0))
                return status;
        /* We do not like overly long tags! */
-       if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) {
+       if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
                printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
                                __FUNCTION__, hdr->taglen);
                return htonl(NFS4ERR_RESOURCE);
@@ -176,7 +176,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
        status = decode_fh(xdr, &args->fh);
        if (unlikely(status != 0))
                goto out;
-       args->addr = svc_addr_in(rqstp);
+       args->addr = svc_addr(rqstp);
        status = decode_bitmap(xdr, args->bitmap);
 out:
        dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
@@ -188,7 +188,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
        __be32 *p;
        __be32 status;
 
-       args->addr = svc_addr_in(rqstp);
+       args->addr = svc_addr(rqstp);
        status = decode_stateid(xdr, &args->stateid);
        if (unlikely(status != 0))
                goto out;
index a6f625497612048dcf2ef5ef9048187e8e1dd63e..685c43f810c10a476ada529d4e0a92075674ca9a 100644 (file)
@@ -34,6 +34,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
@@ -93,22 +95,30 @@ struct rpc_program          nfsacl_program = {
 };
 #endif  /* CONFIG_NFS_V3_ACL */
 
+struct nfs_client_initdata {
+       const char *hostname;
+       const struct sockaddr *addr;
+       size_t addrlen;
+       const struct nfs_rpc_ops *rpc_ops;
+       int proto;
+};
+
 /*
  * Allocate a shared client record
  *
  * Since these are allocated/deallocated very rarely, we don't
  * bother putting them in a slab cache...
  */
-static struct nfs_client *nfs_alloc_client(const char *hostname,
-                                          const struct sockaddr_in *addr,
-                                          int nfsversion)
+static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 {
        struct nfs_client *clp;
 
        if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
                goto error_0;
 
-       if (nfsversion == 4) {
+       clp->rpc_ops = cl_init->rpc_ops;
+
+       if (cl_init->rpc_ops->version == 4) {
                if (nfs_callback_up() < 0)
                        goto error_2;
                __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
@@ -117,11 +127,11 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
        atomic_set(&clp->cl_count, 1);
        clp->cl_cons_state = NFS_CS_INITING;
 
-       clp->cl_nfsversion = nfsversion;
-       memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+       memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
+       clp->cl_addrlen = cl_init->addrlen;
 
-       if (hostname) {
-               clp->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+       if (cl_init->hostname) {
+               clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL);
                if (!clp->cl_hostname)
                        goto error_3;
        }
@@ -129,6 +139,8 @@ static struct nfs_client *nfs_alloc_client(const char *hostname,
        INIT_LIST_HEAD(&clp->cl_superblocks);
        clp->cl_rpcclient = ERR_PTR(-EINVAL);
 
+       clp->cl_proto = cl_init->proto;
+
 #ifdef CONFIG_NFS_V4
        init_rwsem(&clp->cl_sem);
        INIT_LIST_HEAD(&clp->cl_delegations);
@@ -166,7 +178,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
  */
 static void nfs_free_client(struct nfs_client *clp)
 {
-       dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
+       dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
 
        nfs4_shutdown_client(clp);
 
@@ -203,76 +215,148 @@ void nfs_put_client(struct nfs_client *clp)
        }
 }
 
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+                                const struct sockaddr_in *sa2)
+{
+       return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1,
+                                const struct sockaddr_in6 *sa2)
+{
+       return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr);
+}
+
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+                                const struct sockaddr *sa2)
+{
+       switch (sa1->sa_family) {
+       case AF_INET:
+               return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+                               (const struct sockaddr_in *)sa2);
+       case AF_INET6:
+               return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
+                               (const struct sockaddr_in6 *)sa2);
+       }
+       BUG();
+}
+
 /*
- * Find a client by address
- * - caller must hold nfs_client_lock
+ * Find a client by IP address and protocol version
+ * - returns NULL if no such client
  */
-static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
+struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
 {
        struct nfs_client *clp;
 
+       spin_lock(&nfs_client_lock);
        list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+               struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
                /* Don't match clients that failed to initialise properly */
-               if (clp->cl_cons_state < 0)
+               if (clp->cl_cons_state != NFS_CS_READY)
                        continue;
 
                /* Different NFS versions cannot share the same nfs_client */
-               if (clp->cl_nfsversion != nfsversion)
+               if (clp->rpc_ops->version != nfsversion)
                        continue;
 
-               if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr,
-                          sizeof(clp->cl_addr.sin_addr)) != 0)
+               if (addr->sa_family != clap->sa_family)
+                       continue;
+               /* Match only the IP address, not the port number */
+               if (!nfs_sockaddr_match_ipaddr(addr, clap))
                        continue;
 
-               if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
-                       goto found;
+               atomic_inc(&clp->cl_count);
+               spin_unlock(&nfs_client_lock);
+               return clp;
        }
-
+       spin_unlock(&nfs_client_lock);
        return NULL;
-
-found:
-       atomic_inc(&clp->cl_count);
-       return clp;
 }
 
 /*
  * Find a client by IP address and protocol version
  * - returns NULL if no such client
  */
-struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 {
-       struct nfs_client *clp;
+       struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
+       u32 nfsvers = clp->rpc_ops->version;
 
        spin_lock(&nfs_client_lock);
-       clp = __nfs_find_client(addr, nfsversion, 0);
+       list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
+               struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
+
+               /* Don't match clients that failed to initialise properly */
+               if (clp->cl_cons_state != NFS_CS_READY)
+                       continue;
+
+               /* Different NFS versions cannot share the same nfs_client */
+               if (clp->rpc_ops->version != nfsvers)
+                       continue;
+
+               if (sap->sa_family != clap->sa_family)
+                       continue;
+               /* Match only the IP address, not the port number */
+               if (!nfs_sockaddr_match_ipaddr(sap, clap))
+                       continue;
+
+               atomic_inc(&clp->cl_count);
+               spin_unlock(&nfs_client_lock);
+               return clp;
+       }
        spin_unlock(&nfs_client_lock);
-       if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
-               nfs_put_client(clp);
-               clp = NULL;
+       return NULL;
+}
+
+/*
+ * Find an nfs_client on the list that matches the initialisation data
+ * that is supplied.
+ */
+static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
+{
+       struct nfs_client *clp;
+
+       list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+               /* Don't match clients that failed to initialise properly */
+               if (clp->cl_cons_state < 0)
+                       continue;
+
+               /* Different NFS versions cannot share the same nfs_client */
+               if (clp->rpc_ops != data->rpc_ops)
+                       continue;
+
+               if (clp->cl_proto != data->proto)
+                       continue;
+
+               /* Match the full socket address */
+               if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
+                       continue;
+
+               atomic_inc(&clp->cl_count);
+               return clp;
        }
-       return clp;
+       return NULL;
 }
 
 /*
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  */
-static struct nfs_client *nfs_get_client(const char *hostname,
-                                        const struct sockaddr_in *addr,
-                                        int nfsversion)
+static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
 {
        struct nfs_client *clp, *new = NULL;
        int error;
 
-       dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
-               hostname ?: "", NIPQUAD(addr->sin_addr),
-               addr->sin_port, nfsversion);
+       dprintk("--> nfs_get_client(%s,v%u)\n",
+               cl_init->hostname ?: "", cl_init->rpc_ops->version);
 
        /* see if the client already exists */
        do {
                spin_lock(&nfs_client_lock);
 
-               clp = __nfs_find_client(addr, nfsversion, 1);
+               clp = nfs_match_client(cl_init);
                if (clp)
                        goto found_client;
                if (new)
@@ -280,7 +364,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
 
                spin_unlock(&nfs_client_lock);
 
-               new = nfs_alloc_client(hostname, addr, nfsversion);
+               new = nfs_alloc_client(cl_init);
        } while (new);
 
        return ERR_PTR(-ENOMEM);
@@ -344,12 +428,16 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
        switch (proto) {
        case XPRT_TRANSPORT_TCP:
        case XPRT_TRANSPORT_RDMA:
-               if (!to->to_initval)
+               if (to->to_initval == 0)
                        to->to_initval = 60 * HZ;
                if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
                        to->to_initval = NFS_MAX_TCP_TIMEOUT;
                to->to_increment = to->to_initval;
                to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+               if (to->to_maxval > NFS_MAX_TCP_TIMEOUT)
+                       to->to_maxval = NFS_MAX_TCP_TIMEOUT;
+               if (to->to_maxval < to->to_initval)
+                       to->to_maxval = to->to_initval;
                to->to_exponential = 0;
                break;
        case XPRT_TRANSPORT_UDP:
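
The two added tests bound the computed retransmit ceiling. As a worked example of the formula in this hunk: with to_initval left at its 60*HZ default and to_retries at 16, to_maxval would come out at 60*HZ + 16 * 60*HZ = 1020*HZ; the first new test caps that at NFS_MAX_TCP_TIMEOUT, and the second covers the degenerate corner (for instance arithmetic wrap with an absurdly large retry count) where the computed maximum could otherwise end up below to_initval.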
@@ -367,19 +455,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 /*
  * Create an RPC client handle
  */
-static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
-                                               unsigned int timeo,
-                                               unsigned int retrans,
-                                               rpc_authflavor_t flavor,
-                                               int flags)
+static int nfs_create_rpc_client(struct nfs_client *clp,
+                                const struct rpc_timeout *timeparms,
+                                rpc_authflavor_t flavor,
+                                int flags)
 {
-       struct rpc_timeout      timeparms;
        struct rpc_clnt         *clnt = NULL;
        struct rpc_create_args args = {
-               .protocol       = proto,
+               .protocol       = clp->cl_proto,
                .address        = (struct sockaddr *)&clp->cl_addr,
-               .addrsize       = sizeof(clp->cl_addr),
-               .timeout        = &timeparms,
+               .addrsize       = clp->cl_addrlen,
+               .timeout        = timeparms,
                .servername     = clp->cl_hostname,
                .program        = &nfs_program,
                .version        = clp->rpc_ops->version,
@@ -390,10 +476,6 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
        if (!IS_ERR(clp->cl_rpcclient))
                return 0;
 
-       nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
-       clp->retrans_timeo = timeparms.to_initval;
-       clp->retrans_count = timeparms.to_retries;
-
        clnt = rpc_create(&args);
        if (IS_ERR(clnt)) {
                dprintk("%s: cannot create RPC client. Error = %ld\n",
@@ -411,7 +493,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
 static void nfs_destroy_server(struct nfs_server *server)
 {
        if (!(server->flags & NFS_MOUNT_NONLM))
-               lockd_down();   /* release rpc.lockd */
+               nlmclnt_done(server->nlm_host);
 }
 
 /*
@@ -419,20 +501,29 @@ static void nfs_destroy_server(struct nfs_server *server)
  */
 static int nfs_start_lockd(struct nfs_server *server)
 {
-       int error = 0;
+       struct nlm_host *host;
+       struct nfs_client *clp = server->nfs_client;
+       struct nlmclnt_initdata nlm_init = {
+               .hostname       = clp->cl_hostname,
+               .address        = (struct sockaddr *)&clp->cl_addr,
+               .addrlen        = clp->cl_addrlen,
+               .protocol       = server->flags & NFS_MOUNT_TCP ?
+                                               IPPROTO_TCP : IPPROTO_UDP,
+               .nfs_version    = clp->rpc_ops->version,
+       };
 
-       if (server->nfs_client->cl_nfsversion > 3)
-               goto out;
+       if (nlm_init.nfs_version > 3)
+               return 0;
        if (server->flags & NFS_MOUNT_NONLM)
-               goto out;
-       error = lockd_up((server->flags & NFS_MOUNT_TCP) ?
-                       IPPROTO_TCP : IPPROTO_UDP);
-       if (error < 0)
-               server->flags |= NFS_MOUNT_NONLM;
-       else
-               server->destroy = nfs_destroy_server;
-out:
-       return error;
+               return 0;
+
+       host = nlmclnt_init(&nlm_init);
+       if (IS_ERR(host))
+               return PTR_ERR(host);
+
+       server->nlm_host = host;
+       server->destroy = nfs_destroy_server;
+       return 0;
 }
 
 /*
@@ -441,7 +532,7 @@ out:
 #ifdef CONFIG_NFS_V3_ACL
 static void nfs_init_server_aclclient(struct nfs_server *server)
 {
-       if (server->nfs_client->cl_nfsversion != 3)
+       if (server->nfs_client->rpc_ops->version != 3)
                goto out_noacl;
        if (server->flags & NFS_MOUNT_NOACL)
                goto out_noacl;
@@ -468,7 +559,9 @@ static inline void nfs_init_server_aclclient(struct nfs_server *server)
 /*
  * Create a general RPC client
  */
-static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour)
+static int nfs_init_server_rpcclient(struct nfs_server *server,
+               const struct rpc_timeout *timeo,
+               rpc_authflavor_t pseudoflavour)
 {
        struct nfs_client *clp = server->nfs_client;
 
@@ -478,6 +571,11 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
                return PTR_ERR(server->client);
        }
 
+       memcpy(&server->client->cl_timeout_default,
+                       timeo,
+                       sizeof(server->client->cl_timeout_default));
+       server->client->cl_timeout = &server->client->cl_timeout_default;
+
        if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
                struct rpc_auth *auth;
 
@@ -502,6 +600,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t
  * Initialise an NFS2 or NFS3 client
  */
 static int nfs_init_client(struct nfs_client *clp,
+                          const struct rpc_timeout *timeparms,
                           const struct nfs_parsed_mount_data *data)
 {
        int error;
@@ -512,18 +611,11 @@ static int nfs_init_client(struct nfs_client *clp,
                return 0;
        }
 
-       /* Check NFS protocol revision and initialize RPC op vector */
-       clp->rpc_ops = &nfs_v2_clientops;
-#ifdef CONFIG_NFS_V3
-       if (clp->cl_nfsversion == 3)
-               clp->rpc_ops = &nfs_v3_clientops;
-#endif
        /*
         * Create a client RPC handle for doing FSSTAT with UNIX auth only
         * - RFC 2623, sec 2.3.2
         */
-       error = nfs_create_rpc_client(clp, data->nfs_server.protocol,
-                               data->timeo, data->retrans, RPC_AUTH_UNIX, 0);
+       error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0);
        if (error < 0)
                goto error;
        nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -541,25 +633,34 @@ error:
 static int nfs_init_server(struct nfs_server *server,
                           const struct nfs_parsed_mount_data *data)
 {
+       struct nfs_client_initdata cl_init = {
+               .hostname = data->nfs_server.hostname,
+               .addr = (const struct sockaddr *)&data->nfs_server.address,
+               .addrlen = data->nfs_server.addrlen,
+               .rpc_ops = &nfs_v2_clientops,
+               .proto = data->nfs_server.protocol,
+       };
+       struct rpc_timeout timeparms;
        struct nfs_client *clp;
-       int error, nfsvers = 2;
+       int error;
 
        dprintk("--> nfs_init_server()\n");
 
 #ifdef CONFIG_NFS_V3
        if (data->flags & NFS_MOUNT_VER3)
-               nfsvers = 3;
+               cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 
        /* Allocate or find a client reference we can use */
-       clp = nfs_get_client(data->nfs_server.hostname,
-                               &data->nfs_server.address, nfsvers);
+       clp = nfs_get_client(&cl_init);
        if (IS_ERR(clp)) {
                dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
                return PTR_ERR(clp);
        }
 
-       error = nfs_init_client(clp, data);
+       nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+                       data->timeo, data->retrans);
+       error = nfs_init_client(clp, &timeparms, data);
        if (error < 0)
                goto error;
 
@@ -583,7 +684,7 @@ static int nfs_init_server(struct nfs_server *server,
        if (error < 0)
                goto error;
 
-       error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+       error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
        if (error < 0)
                goto error;
 
@@ -729,6 +830,9 @@ static struct nfs_server *nfs_alloc_server(void)
        INIT_LIST_HEAD(&server->client_link);
        INIT_LIST_HEAD(&server->master_link);
 
+       init_waitqueue_head(&server->active_wq);
+       atomic_set(&server->active, 0);
+
        server->io_stats = nfs_alloc_iostats();
        if (!server->io_stats) {
                kfree(server);
@@ -840,7 +944,7 @@ error:
  * Initialise an NFS4 client record
  */
 static int nfs4_init_client(struct nfs_client *clp,
-               int proto, int timeo, int retrans,
+               const struct rpc_timeout *timeparms,
                const char *ip_addr,
                rpc_authflavor_t authflavour)
 {
@@ -855,7 +959,7 @@ static int nfs4_init_client(struct nfs_client *clp,
        /* Check NFS protocol revision and initialize RPC op vector */
        clp->rpc_ops = &nfs_v4_clientops;
 
-       error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour,
+       error = nfs_create_rpc_client(clp, timeparms, authflavour,
                                        RPC_CLNT_CREATE_DISCRTRY);
        if (error < 0)
                goto error;
@@ -882,23 +986,32 @@ error:
  * Set up an NFS4 client
  */
 static int nfs4_set_client(struct nfs_server *server,
-               const char *hostname, const struct sockaddr_in *addr,
+               const char *hostname,
+               const struct sockaddr *addr,
+               const size_t addrlen,
                const char *ip_addr,
                rpc_authflavor_t authflavour,
-               int proto, int timeo, int retrans)
+               int proto, const struct rpc_timeout *timeparms)
 {
+       struct nfs_client_initdata cl_init = {
+               .hostname = hostname,
+               .addr = addr,
+               .addrlen = addrlen,
+               .rpc_ops = &nfs_v4_clientops,
+               .proto = proto,
+       };
        struct nfs_client *clp;
        int error;
 
        dprintk("--> nfs4_set_client()\n");
 
        /* Allocate or find a client reference we can use */
-       clp = nfs_get_client(hostname, addr, 4);
+       clp = nfs_get_client(&cl_init);
        if (IS_ERR(clp)) {
                error = PTR_ERR(clp);
                goto error;
        }
-       error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
+       error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
        if (error < 0)
                goto error_put;
 
@@ -919,10 +1032,26 @@ error:
 static int nfs4_init_server(struct nfs_server *server,
                const struct nfs_parsed_mount_data *data)
 {
+       struct rpc_timeout timeparms;
        int error;
 
        dprintk("--> nfs4_init_server()\n");
 
+       nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+                       data->timeo, data->retrans);
+
+       /* Get a client record */
+       error = nfs4_set_client(server,
+                       data->nfs_server.hostname,
+                       (const struct sockaddr *)&data->nfs_server.address,
+                       data->nfs_server.addrlen,
+                       data->client_address,
+                       data->auth_flavors[0],
+                       data->nfs_server.protocol,
+                       &timeparms);
+       if (error < 0)
+               goto error;
+
        /* Initialise the client representation from the mount data */
        server->flags = data->flags & NFS_MOUNT_FLAGMASK;
        server->caps |= NFS_CAP_ATOMIC_OPEN;
@@ -937,8 +1066,9 @@ static int nfs4_init_server(struct nfs_server *server,
        server->acdirmin = data->acdirmin * HZ;
        server->acdirmax = data->acdirmax * HZ;
 
-       error = nfs_init_server_rpcclient(server, data->auth_flavors[0]);
+       error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
 
+error:
        /* Done */
        dprintk("<-- nfs4_init_server() = %d\n", error);
        return error;
@@ -961,17 +1091,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
        if (!server)
                return ERR_PTR(-ENOMEM);
 
-       /* Get a client record */
-       error = nfs4_set_client(server,
-                       data->nfs_server.hostname,
-                       &data->nfs_server.address,
-                       data->client_address,
-                       data->auth_flavors[0],
-                       data->nfs_server.protocol,
-                       data->timeo, data->retrans);
-       if (error < 0)
-               goto error;
-
        /* set up the general RPC client */
        error = nfs4_init_server(server, data);
        if (error < 0)
@@ -1039,12 +1158,13 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 
        /* Get a client representation.
         * Note: NFSv4 always uses TCP, */
-       error = nfs4_set_client(server, data->hostname, data->addr,
-                       parent_client->cl_ipaddr,
-                       data->authflavor,
-                       parent_server->client->cl_xprt->prot,
-                       parent_client->retrans_timeo,
-                       parent_client->retrans_count);
+       error = nfs4_set_client(server, data->hostname,
+                               data->addr,
+                               data->addrlen,
+                               parent_client->cl_ipaddr,
+                               data->authflavor,
+                               parent_server->client->cl_xprt->prot,
+                               parent_server->client->cl_timeout);
        if (error < 0)
                goto error;
 
@@ -1052,7 +1172,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
        nfs_server_copy_userdata(server, parent_server);
        server->caps |= NFS_CAP_ATOMIC_OPEN;
 
-       error = nfs_init_server_rpcclient(server, data->authflavor);
+       error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
        if (error < 0)
                goto error;
 
@@ -1121,7 +1241,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 
        server->fsid = fattr->fsid;
 
-       error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor);
+       error = nfs_init_server_rpcclient(server,
+                       source->client->cl_timeout,
+                       source->client->cl_auth->au_flavor);
        if (error < 0)
                goto out_free_server;
        if (!IS_ERR(source->client_acl))
@@ -1263,10 +1385,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
        /* display one transport per line on subsequent lines */
        clp = list_entry(v, struct nfs_client, cl_share_link);
 
-       seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n",
-                  clp->cl_nfsversion,
-                  NIPQUAD(clp->cl_addr.sin_addr),
-                  ntohs(clp->cl_addr.sin_port),
+       seq_printf(m, "v%u %s %s %3d %s\n",
+                  clp->rpc_ops->version,
+                  rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+                  rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
                   atomic_read(&clp->cl_count),
                   clp->cl_hostname);
 
@@ -1342,10 +1464,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
                 (unsigned long long) server->fsid.major,
                 (unsigned long long) server->fsid.minor);
 
-       seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n",
-                  clp->cl_nfsversion,
-                  NIPQUAD(clp->cl_addr.sin_addr),
-                  ntohs(clp->cl_addr.sin_port),
+       seq_printf(m, "v%u %s %s %-7s %-17s\n",
+                  clp->rpc_ops->version,
+                  rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
+                  rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
                   dev,
                   fsid);
 
index 11833f4caeaa9a2ea5549675636330a689c0e80b..b9eadd18ba7070a796497a7601fed53f3700ddb4 100644 (file)
@@ -125,6 +125,32 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
        put_rpccred(oldcred);
 }
 
+static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+{
+       int res = 0;
+
+       res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+       nfs_free_delegation(delegation);
+       return res;
+}
+
+static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+{
+       struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+       if (delegation == NULL)
+               goto nomatch;
+       if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
+                               sizeof(delegation->stateid.data)) != 0)
+               goto nomatch;
+       list_del_rcu(&delegation->super_list);
+       nfsi->delegation_state = 0;
+       rcu_assign_pointer(nfsi->delegation, NULL);
+       return delegation;
+nomatch:
+       return NULL;
+}
+
 /*
  * Set up a delegation on an inode
  */
@@ -133,6 +159,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
+       struct nfs_delegation *freeme = NULL;
        int status = 0;
 
        delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
@@ -147,41 +174,45 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
        delegation->inode = inode;
 
        spin_lock(&clp->cl_lock);
-       if (rcu_dereference(nfsi->delegation) == NULL) {
-               list_add_rcu(&delegation->super_list, &clp->cl_delegations);
-               nfsi->delegation_state = delegation->type;
-               rcu_assign_pointer(nfsi->delegation, delegation);
-               delegation = NULL;
-       } else {
+       if (rcu_dereference(nfsi->delegation) != NULL) {
                if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
-                                       sizeof(delegation->stateid)) != 0 ||
-                               delegation->type != nfsi->delegation->type) {
-                       printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
-                                       __FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr));
-                       status = -EIO;
+                                       sizeof(delegation->stateid)) == 0 &&
+                               delegation->type == nfsi->delegation->type) {
+                       goto out;
+               }
+               /*
+                * Deal with broken servers that hand out two
+                * delegations for the same file.
+                */
+               dfprintk(FILE, "%s: server %s handed out "
+                               "a duplicate delegation!\n",
+                               __FUNCTION__, clp->cl_hostname);
+               if (delegation->type <= nfsi->delegation->type) {
+                       freeme = delegation;
+                       delegation = NULL;
+                       goto out;
                }
+               freeme = nfs_detach_delegation_locked(nfsi, NULL);
        }
+       list_add_rcu(&delegation->super_list, &clp->cl_delegations);
+       nfsi->delegation_state = delegation->type;
+       rcu_assign_pointer(nfsi->delegation, delegation);
+       delegation = NULL;
 
        /* Ensure we revalidate the attributes and page cache! */
        spin_lock(&inode->i_lock);
        nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
        spin_unlock(&inode->i_lock);
 
+out:
        spin_unlock(&clp->cl_lock);
        if (delegation != NULL)
                nfs_free_delegation(delegation);
+       if (freeme != NULL)
+               nfs_do_return_delegation(inode, freeme, 0);
        return status;
 }
 
-static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
-{
-       int res = 0;
-
-       res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
-       nfs_free_delegation(delegation);
-       return res;
-}
-
 /* Sync all data to disk upon delegation return */
 static void nfs_msync_inode(struct inode *inode)
 {
@@ -207,24 +238,28 @@ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegat
        up_read(&clp->cl_sem);
        nfs_msync_inode(inode);
 
-       return nfs_do_return_delegation(inode, delegation);
+       return nfs_do_return_delegation(inode, delegation, 1);
 }
 
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+/*
+ * This function returns the delegation without reclaiming opens
+ * or protecting against delegation reclaims.
+ * It is therefore really only safe to be called from
+ * nfs4_clear_inode()
+ */
+void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 {
-       struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+       struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_delegation *delegation;
 
-       if (delegation == NULL)
-               goto nomatch;
-       if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
-                               sizeof(delegation->stateid.data)) != 0)
-               goto nomatch;
-       list_del_rcu(&delegation->super_list);
-       nfsi->delegation_state = 0;
-       rcu_assign_pointer(nfsi->delegation, NULL);
-       return delegation;
-nomatch:
-       return NULL;
+       if (rcu_dereference(nfsi->delegation) != NULL) {
+               spin_lock(&clp->cl_lock);
+               delegation = nfs_detach_delegation_locked(nfsi, NULL);
+               spin_unlock(&clp->cl_lock);
+               if (delegation != NULL)
+                       nfs_do_return_delegation(inode, delegation, 0);
+       }
 }
 
 int nfs_inode_return_delegation(struct inode *inode)
@@ -314,8 +349,9 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
        __module_get(THIS_MODULE);
        atomic_inc(&clp->cl_count);
        task = kthread_run(nfs_do_expire_all_delegations, clp,
-                       "%u.%u.%u.%u-delegreturn",
-                       NIPQUAD(clp->cl_addr.sin_addr));
+                               "%s-delegreturn",
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                                       RPC_DISPLAY_ADDR));
        if (!IS_ERR(task))
                return;
        nfs_put_client(clp);
@@ -386,7 +422,7 @@ static int recall_thread(void *data)
        nfs_msync_inode(inode);
 
        if (delegation != NULL)
-               nfs_do_return_delegation(inode, delegation);
+               nfs_do_return_delegation(inode, delegation, 1);
        iput(inode);
        module_put_and_exit(0);
 }
index 5874ce7fdbaee12339a0d6910af6fcdc6f578848..f1c5e2a5d88e35ace148060ea33c4492dd2f3f4d 100644 (file)
@@ -29,6 +29,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
+void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_return_all_delegations(struct super_block *sb);
@@ -39,7 +40,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
 /* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
 int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
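
nfs4_proc_delegreturn() grows an issync flag here. Judging from the call sites in this diff, 1 is used on the ordinary return path and in the recall thread, while 0 is used where the caller cannot or should not wait, such as nfs_inode_return_delegation_noreclaim() (called from inode teardown) and the duplicate-delegation cleanup. A hedged illustration of the two variants, with the arguments taken from the nfs_do_return_delegation() wrapper shown above:

	/* ordinary return: caller apparently waits for the DELEGRETURN to complete */
	nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, 1);

	/* fire-and-forget return, e.g. from inode teardown, no reclaim expected */
	nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, 0);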
index f697b5c74b7c38f5627ea71c9d2ec9ac95bc3eba..476cb0f837fd191d08898c87674d7a1fe51e84ee 100644 (file)
@@ -192,7 +192,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
                /* We requested READDIRPLUS, but the server doesn't grok it */
                if (error == -ENOTSUPP && desc->plus) {
                        NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        desc->plus = 0;
                        goto again;
                }
@@ -537,12 +537,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
        lock_kernel();
 
-       res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
-       if (res < 0) {
-               unlock_kernel();
-               return res;
-       }
-
        /*
         * filp->f_pos points to the dirent entry number.
         * *desc->dir_cookie has the cookie for the next entry. We have
@@ -564,6 +558,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        desc->entry = &my_entry;
 
        nfs_block_sillyrename(dentry);
+       res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+       if (res < 0)
+               goto out;
+
        while(!desc->entry->eof) {
                res = readdir_search_pagecache(desc);
 
@@ -579,7 +577,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                }
                if (res == -ETOOSMALL && desc->plus) {
-                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                       clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        nfs_zap_caches(inode);
                        desc->plus = 0;
                        desc->entry->eof = 0;
@@ -594,6 +592,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        break;
                }
        }
+out:
        nfs_unblock_sillyrename(dentry);
        unlock_kernel();
        if (res > 0)
@@ -639,6 +638,21 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
        return 0;
 }
 
+/**
+ * nfs_force_lookup_revalidate - Mark the directory as having changed
+ * @dir - pointer to directory inode
+ *
+ * This forces the revalidation code in nfs_lookup_revalidate() to do a
+ * full lookup on all child dentries of 'dir' whenever a change occurs
+ * on the server that might have invalidated our dcache.
+ *
+ * The caller should be holding dir->i_lock
+ */
+void nfs_force_lookup_revalidate(struct inode *dir)
+{
+       NFS_I(dir)->cache_change_attribute = jiffies;
+}
+
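
A caller of the new helper is expected to hold dir->i_lock, as the comment says; the effect is simply to bump the directory's cached change attribute so the next nfs_lookup_revalidate() re-checks its children. A minimal hedged usage sketch:

	spin_lock(&dir->i_lock);
	nfs_force_lookup_revalidate(dir);	/* invalidate cached lookups under 'dir' */
	spin_unlock(&dir->i_lock);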
 /*
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
@@ -827,6 +841,10 @@ static int nfs_dentry_delete(struct dentry *dentry)
                dentry->d_parent->d_name.name, dentry->d_name.name,
                dentry->d_flags);
 
+       /* Unhash any dentry with a stale inode */
+       if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
+               return 1;
+
        if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                /* Unhash it, so that ->d_iput() would be called */
                return 1;
@@ -846,7 +864,6 @@ static int nfs_dentry_delete(struct dentry *dentry)
  */
 static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 {
-       nfs_inode_return_delegation(inode);
        if (S_ISDIR(inode->i_mode))
                /* drop any readdir cache as it could easily be old */
                NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
@@ -1268,6 +1285,12 @@ out_err:
        return error;
 }
 
+static void nfs_dentry_handle_enoent(struct dentry *dentry)
+{
+       if (dentry->d_inode != NULL && !d_unhashed(dentry))
+               d_delete(dentry);
+}
+
 static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        int error;
@@ -1280,6 +1303,8 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
        /* Ensure the VFS deletes this inode */
        if (error == 0 && dentry->d_inode != NULL)
                clear_nlink(dentry->d_inode);
+       else if (error == -ENOENT)
+               nfs_dentry_handle_enoent(dentry);
        unlock_kernel();
 
        return error;
@@ -1386,6 +1411,8 @@ static int nfs_safe_remove(struct dentry *dentry)
                nfs_mark_for_revalidate(inode);
        } else
                error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
+       if (error == -ENOENT)
+               nfs_dentry_handle_enoent(dentry);
 out:
        return error;
 }
@@ -1422,7 +1449,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
        error = nfs_safe_remove(dentry);
-       if (!error) {
+       if (!error || error == -ENOENT) {
                nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
        } else if (need_rehash)
                d_rehash(dentry);
@@ -1635,7 +1662,8 @@ out:
                d_move(old_dentry, new_dentry);
                nfs_set_verifier(new_dentry,
                                        nfs_save_change_attribute(new_dir));
-       }
+       } else if (error == -ENOENT)
+               nfs_dentry_handle_enoent(old_dentry);
 
        /* new dentry created? */
        if (dentry)
@@ -1666,13 +1694,19 @@ int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
 restart:
        spin_lock(&nfs_access_lru_lock);
        list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
+               struct rw_semaphore *s_umount;
                struct inode *inode;
 
                if (nr_to_scan-- == 0)
                        break;
+               s_umount = &nfsi->vfs_inode.i_sb->s_umount;
+               if (!down_read_trylock(s_umount))
+                       continue;
                inode = igrab(&nfsi->vfs_inode);
-               if (inode == NULL)
+               if (inode == NULL) {
+                       up_read(s_umount);
                        continue;
+               }
                spin_lock(&inode->i_lock);
                if (list_empty(&nfsi->access_cache_entry_lru))
                        goto remove_lru_entry;
@@ -1691,6 +1725,7 @@ remove_lru_entry:
                spin_unlock(&inode->i_lock);
                spin_unlock(&nfs_access_lru_lock);
                iput(inode);
+               up_read(s_umount);
                goto restart;
        }
        spin_unlock(&nfs_access_lru_lock);
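
The shrinker change above wraps the igrab()/iput() pair in a down_read_trylock() of the superblock's s_umount, presumably so the final iput() of an access-cache inode cannot race with an unmount of that superblock; if the trylock fails the entry is simply skipped and revisited on a later pass. The generic shape of that pattern, as a sketch (sb and candidate_inode are placeholders):

	/* sb and candidate_inode are illustrative, not NFS-specific names */
	if (!down_read_trylock(&sb->s_umount))
		return;				/* unmount may be in progress: skip */
	inode = igrab(candidate_inode);
	if (inode != NULL) {
		/* ... prune or inspect per-inode state ... */
		iput(inode);
	}
	up_read(&sb->s_umount);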
@@ -1731,7 +1766,7 @@ static void __nfs_access_zap_cache(struct inode *inode)
 void nfs_access_zap_cache(struct inode *inode)
 {
        /* Remove from global LRU init */
-       if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+       if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
                spin_lock(&nfs_access_lru_lock);
                list_del_init(&NFS_I(inode)->access_cache_inode_lru);
                spin_unlock(&nfs_access_lru_lock);
@@ -1845,7 +1880,7 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
        smp_mb__after_atomic_inc();
 
        /* Add inode to global LRU list */
-       if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+       if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
                spin_lock(&nfs_access_lru_lock);
                list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
                spin_unlock(&nfs_access_lru_lock);
index 3c9d16b4f80c2d2f869ad7541b650389d4cf8130..f8e165c7d5a637de762e619e1ddd1a7db4436fb7 100644 (file)
@@ -188,12 +188,17 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
 {
        ssize_t result = -EIOCBQUEUED;
+       struct rpc_clnt *clnt;
+       sigset_t oldset;
 
        /* Async requests don't wait here */
        if (dreq->iocb)
                goto out;
 
+       clnt = NFS_CLIENT(dreq->inode);
+       rpc_clnt_sigmask(clnt, &oldset);
        result = wait_for_completion_interruptible(&dreq->completion);
+       rpc_clnt_sigunmask(clnt, &oldset);
 
        if (!result)
                result = dreq->error;
@@ -272,6 +277,16 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
        unsigned long user_addr = (unsigned long)iov->iov_base;
        size_t count = iov->iov_len;
        size_t rsize = NFS_SERVER(inode)->rsize;
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_cred = ctx->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs_read_direct_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
@@ -311,7 +326,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 
                data->req = (struct nfs_page *) dreq;
                data->inode = inode;
-               data->cred = ctx->cred;
+               data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
                data->args.offset = pos;
@@ -321,14 +336,16 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                data->res.fattr = &data->fattr;
                data->res.eof = 0;
                data->res.count = bytes;
+               msg.rpc_argp = &data->args;
+               msg.rpc_resp = &data->res;
 
-               rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-                               &nfs_read_direct_ops, data);
-               NFS_PROTO(inode)->read_setup(data);
-
-               data->task.tk_cookie = (unsigned long) inode;
+               task_setup_data.task = &data->task;
+               task_setup_data.callback_data = data;
+               NFS_PROTO(inode)->read_setup(data, &msg);
 
-               rpc_execute(&data->task);
+               task = rpc_run_task(&task_setup_data);
+               if (!IS_ERR(task))
+                       rpc_put_task(task);
 
                dprintk("NFS: %5u initiated direct read call "
                        "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -391,9 +408,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos)
 {
        ssize_t result = 0;
-       sigset_t oldset;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
-       struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_direct_req *dreq;
 
        dreq = nfs_direct_req_alloc();
@@ -405,11 +420,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
-       rpc_clnt_sigmask(clnt, &oldset);
        result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
        if (!result)
                result = nfs_direct_wait(dreq);
-       rpc_clnt_sigunmask(clnt, &oldset);
        nfs_direct_req_release(dreq);
 
        return result;
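
Every submission path in fs/nfs/direct.c is converted here from rpc_init_task()/rpc_execute() to the rpc_task_setup/rpc_run_task() interface, with the credential, argument and result pointers carried in a struct rpc_message that the ->read_setup/->write_setup/->commit_setup ops complete. The recurring shape, sketched with an illustrative my_direct_ops callback table standing in for the real nfs_*_direct_ops:

	struct rpc_message msg = {
		.rpc_argp	= &data->args,
		.rpc_resp	= &data->res,
		.rpc_cred	= ctx->cred,
	};
	struct rpc_task_setup task_setup_data = {
		.task		= &data->task,
		.rpc_client	= NFS_CLIENT(inode),
		.rpc_message	= &msg,
		.callback_ops	= &my_direct_ops,	/* illustrative completion ops */
		.callback_data	= data,
		.flags		= RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	NFS_PROTO(inode)->read_setup(data, &msg);	/* fill in the RPC procedure */
	task = rpc_run_task(&task_setup_data);
	if (!IS_ERR(task))
		rpc_put_task(task);	/* the running task holds its own reference */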
@@ -431,6 +444,15 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
        struct inode *inode = dreq->inode;
        struct list_head *p;
        struct nfs_write_data *data;
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_cred = dreq->ctx->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(inode),
+               .callback_ops = &nfs_write_direct_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
 
        dreq->count = 0;
        get_dreq(dreq);
@@ -440,6 +462,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
                get_dreq(dreq);
 
+               /* Use stable writes */
+               data->args.stable = NFS_FILE_SYNC;
+
                /*
                 * Reset data->res.
                 */
@@ -451,17 +476,18 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
                 * Reuse data->task; data->args should not have changed
                 * since the original request was sent.
                 */
-               rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-                               &nfs_write_direct_ops, data);
-               NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
-
-               data->task.tk_priority = RPC_PRIORITY_NORMAL;
-               data->task.tk_cookie = (unsigned long) inode;
+               task_setup_data.task = &data->task;
+               task_setup_data.callback_data = data;
+               msg.rpc_argp = &data->args;
+               msg.rpc_resp = &data->res;
+               NFS_PROTO(inode)->write_setup(data, &msg);
 
                /*
                 * We're called via an RPC callback, so BKL is already held.
                 */
-               rpc_execute(&data->task);
+               task = rpc_run_task(&task_setup_data);
+               if (!IS_ERR(task))
+                       rpc_put_task(task);
 
                dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
                                data->task.tk_pid,
@@ -504,9 +530,23 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
        struct nfs_write_data *data = dreq->commit_data;
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = dreq->ctx->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .task = &data->task,
+               .rpc_client = NFS_CLIENT(dreq->inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs_commit_direct_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
 
        data->inode = dreq->inode;
-       data->cred = dreq->ctx->cred;
+       data->cred = msg.rpc_cred;
 
        data->args.fh = NFS_FH(data->inode);
        data->args.offset = 0;
@@ -515,18 +555,16 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
        data->res.fattr = &data->fattr;
        data->res.verf = &data->verf;
 
-       rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
-                               &nfs_commit_direct_ops, data);
-       NFS_PROTO(data->inode)->commit_setup(data, 0);
+       NFS_PROTO(data->inode)->commit_setup(data, &msg);
 
-       data->task.tk_priority = RPC_PRIORITY_NORMAL;
-       data->task.tk_cookie = (unsigned long)data->inode;
        /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
        dreq->commit_data = NULL;
 
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
-       rpc_execute(&data->task);
+       task = rpc_run_task(&task_setup_data);
+       if (!IS_ERR(task))
+               rpc_put_task(task);
 }
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -641,6 +679,16 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
        struct inode *inode = ctx->path.dentry->d_inode;
        unsigned long user_addr = (unsigned long)iov->iov_base;
        size_t count = iov->iov_len;
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_cred = ctx->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs_write_direct_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
        size_t wsize = NFS_SERVER(inode)->wsize;
        unsigned int pgbase;
        int result;
@@ -683,25 +731,27 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 
                data->req = (struct nfs_page *) dreq;
                data->inode = inode;
-               data->cred = ctx->cred;
+               data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
                data->args.count = bytes;
+               data->args.stable = sync;
                data->res.fattr = &data->fattr;
                data->res.count = bytes;
                data->res.verf = &data->verf;
 
-               rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-                               &nfs_write_direct_ops, data);
-               NFS_PROTO(inode)->write_setup(data, sync);
-
-               data->task.tk_priority = RPC_PRIORITY_NORMAL;
-               data->task.tk_cookie = (unsigned long) inode;
+               task_setup_data.task = &data->task;
+               task_setup_data.callback_data = data;
+               msg.rpc_argp = &data->args;
+               msg.rpc_resp = &data->res;
+               NFS_PROTO(inode)->write_setup(data, &msg);
 
-               rpc_execute(&data->task);
+               task = rpc_run_task(&task_setup_data);
+               if (!IS_ERR(task))
+                       rpc_put_task(task);
 
                dprintk("NFS: %5u initiated direct write call "
                        "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
@@ -767,12 +817,10 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
                                size_t count)
 {
        ssize_t result = 0;
-       sigset_t oldset;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
-       struct rpc_clnt *clnt = NFS_CLIENT(inode);
        struct nfs_direct_req *dreq;
        size_t wsize = NFS_SERVER(inode)->wsize;
-       int sync = 0;
+       int sync = NFS_UNSTABLE;
 
        dreq = nfs_direct_req_alloc();
        if (!dreq)
@@ -780,18 +828,16 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
        nfs_alloc_commit_data(dreq);
 
        if (dreq->commit_data == NULL || count < wsize)
-               sync = FLUSH_STABLE;
+               sync = NFS_FILE_SYNC;
 
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
-       rpc_clnt_sigmask(clnt, &oldset);
        result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
        if (!result)
                result = nfs_direct_wait(dreq);
-       rpc_clnt_sigunmask(clnt, &oldset);
        nfs_direct_req_release(dreq);
 
        return result;
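
The direct-I/O hunks above all make the same conversion: instead of rpc_init_task() + NFS_PROTO(inode)->write_setup(data, how) + rpc_execute(), the caller now fills in a struct rpc_message and a struct rpc_task_setup, lets the per-protocol setup hook supply msg.rpc_proc, and hands the whole thing to rpc_run_task(). A minimal sketch of that shape follows; only the wrapper name example_schedule_write is invented, every other identifier is taken from the hunks themselves.

static void example_schedule_write(struct inode *inode, struct nfs_write_data *data)
{
	struct rpc_task *task;
	struct rpc_message msg = {
		.rpc_argp = &data->args,
		.rpc_resp = &data->res,
		.rpc_cred = data->cred,
	};
	struct rpc_task_setup task_setup_data = {
		.task          = &data->task,
		.rpc_client    = NFS_CLIENT(inode),
		.rpc_message   = &msg,
		.callback_ops  = &nfs_write_direct_ops,
		.callback_data = data,
		.flags         = RPC_TASK_ASYNC,
	};

	/* the per-protocol hook now only fills in msg.rpc_proc (and args) */
	NFS_PROTO(inode)->write_setup(data, &msg);

	task = rpc_run_task(&task_setup_data);
	if (!IS_ERR(task))
		rpc_put_task(task);	/* drop our reference; the task runs on */
}
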
index b3bb89f7d5d2856719a705e37ddb320b7a285127..ef57a5ae5904663d4ab20c974e920e711f921436 100644 (file)
@@ -349,7 +349,9 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
        unlock_page(page);
        page_cache_release(page);
 
-       return status < 0 ? status : copied;
+       if (status < 0)
+               return status;
+       return copied;
 }
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
@@ -392,35 +394,27 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        struct file *filp = vma->vm_file;
        unsigned pagelen;
        int ret = -EINVAL;
-       void *fsdata;
        struct address_space *mapping;
-       loff_t offset;
 
        lock_page(page);
        mapping = page->mapping;
-       if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) {
-               unlock_page(page);
-               return -EINVAL;
-       }
+       if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+               goto out_unlock;
+
+       ret = 0;
        pagelen = nfs_page_length(page);
-       offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-       unlock_page(page);
+       if (pagelen == 0)
+               goto out_unlock;
 
-       /*
-        * we can use mapping after releasing the page lock, because:
-        * we hold mmap_sem on the fault path, which should pin the vma
-        * which should pin the file, which pins the dentry which should
-        * hold a reference on inode.
-        */
+       ret = nfs_flush_incompatible(filp, page);
+       if (ret != 0)
+               goto out_unlock;
 
-       if (pagelen) {
-               struct page *page2 = NULL;
-               ret = nfs_write_begin(filp, mapping, offset, pagelen,
-                               0, &page2, &fsdata);
-               if (!ret)
-                       ret = nfs_write_end(filp, mapping, offset, pagelen,
-                                       pagelen, page2, fsdata);
-       }
+       ret = nfs_updatepage(filp, page, 0, pagelen);
+       if (ret == 0)
+               ret = pagelen;
+out_unlock:
+       unlock_page(page);
        return ret;
 }
 
index d11eb055265cbd80dbb5f280d0f9d118c8fe64d5..8ae5dba2d4e5903a163eb4b050aa6da3ea31dc01 100644 (file)
@@ -72,39 +72,39 @@ module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
                 &nfs_idmap_cache_timeout, 0644);
 
 struct idmap_hashent {
-       unsigned long ih_expires;
-       __u32 ih_id;
-       int ih_namelen;
-       char ih_name[IDMAP_NAMESZ];
+       unsigned long           ih_expires;
+       __u32                   ih_id;
+       size_t                  ih_namelen;
+       char                    ih_name[IDMAP_NAMESZ];
 };
 
 struct idmap_hashtable {
-       __u8 h_type;
-       struct idmap_hashent h_entries[IDMAP_HASH_SZ];
+       __u8                    h_type;
+       struct idmap_hashent    h_entries[IDMAP_HASH_SZ];
 };
 
 struct idmap {
-       struct dentry        *idmap_dentry;
-       wait_queue_head_t     idmap_wq;
-       struct idmap_msg      idmap_im;
-       struct mutex          idmap_lock;    /* Serializes upcalls */
-       struct mutex          idmap_im_lock; /* Protects the hashtable */
-       struct idmap_hashtable idmap_user_hash;
-       struct idmap_hashtable idmap_group_hash;
+       struct dentry           *idmap_dentry;
+       wait_queue_head_t       idmap_wq;
+       struct idmap_msg        idmap_im;
+       struct mutex            idmap_lock;     /* Serializes upcalls */
+       struct mutex            idmap_im_lock;  /* Protects the hashtable */
+       struct idmap_hashtable  idmap_user_hash;
+       struct idmap_hashtable  idmap_group_hash;
 };
 
-static ssize_t   idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
-                    char __user *, size_t);
-static ssize_t   idmap_pipe_downcall(struct file *, const char __user *,
-                    size_t);
-static void      idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
+static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+                                char __user *, size_t);
+static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
+                                  size_t);
+static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 
 static unsigned int fnvhash32(const void *, size_t);
 
 static struct rpc_pipe_ops idmap_upcall_ops = {
-        .upcall         = idmap_pipe_upcall,
-        .downcall       = idmap_pipe_downcall,
-        .destroy_msg    = idmap_pipe_destroy_msg,
+       .upcall         = idmap_pipe_upcall,
+       .downcall       = idmap_pipe_downcall,
+       .destroy_msg    = idmap_pipe_destroy_msg,
 };
 
 int
@@ -115,19 +115,20 @@ nfs_idmap_new(struct nfs_client *clp)
 
        BUG_ON(clp->cl_idmap != NULL);
 
-        if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
-                return -ENOMEM;
+       idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);
+       if (idmap == NULL)
+               return -ENOMEM;
 
-        idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
-           idmap, &idmap_upcall_ops, 0);
-        if (IS_ERR(idmap->idmap_dentry)) {
+       idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
+                                        idmap, &idmap_upcall_ops, 0);
+       if (IS_ERR(idmap->idmap_dentry)) {
                error = PTR_ERR(idmap->idmap_dentry);
                kfree(idmap);
                return error;
        }
 
-        mutex_init(&idmap->idmap_lock);
-        mutex_init(&idmap->idmap_im_lock);
+       mutex_init(&idmap->idmap_lock);
+       mutex_init(&idmap->idmap_im_lock);
        init_waitqueue_head(&idmap->idmap_wq);
        idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER;
        idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
@@ -192,7 +193,7 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id)
  * pretty trivial.
  */
 static inline struct idmap_hashent *
-idmap_alloc_name(struct idmap_hashtable *h, char *name, unsigned len)
+idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
 {
        return idmap_name_hash(h, name, len);
 }
@@ -285,7 +286,7 @@ nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
        memset(im, 0, sizeof(*im));
        mutex_unlock(&idmap->idmap_im_lock);
        mutex_unlock(&idmap->idmap_lock);
-       return (ret);
+       return ret;
 }
 
 /*
@@ -354,42 +355,40 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
 /* RPC pipefs upcall/downcall routines */
 static ssize_t
 idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
-    char __user *dst, size_t buflen)
+                 char __user *dst, size_t buflen)
 {
-        char *data = (char *)msg->data + msg->copied;
-        ssize_t mlen = msg->len - msg->copied;
-        ssize_t left;
-
-        if (mlen > buflen)
-                mlen = buflen;
-
-        left = copy_to_user(dst, data, mlen);
-       if (left < 0) {
-               msg->errno = left;
-               return left;
+       char *data = (char *)msg->data + msg->copied;
+       size_t mlen = min(msg->len, buflen);
+       unsigned long left;
+
+       left = copy_to_user(dst, data, mlen);
+       if (left == mlen) {
+               msg->errno = -EFAULT;
+               return -EFAULT;
        }
+
        mlen -= left;
        msg->copied += mlen;
        msg->errno = 0;
-        return mlen;
+       return mlen;
 }
 
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
-        struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+       struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
        struct idmap *idmap = (struct idmap *)rpci->private;
        struct idmap_msg im_in, *im = &idmap->idmap_im;
        struct idmap_hashtable *h;
        struct idmap_hashent *he = NULL;
-       int namelen_in;
+       size_t namelen_in;
        int ret;
 
-        if (mlen != sizeof(im_in))
-                return (-ENOSPC);
+       if (mlen != sizeof(im_in))
+               return -ENOSPC;
 
-        if (copy_from_user(&im_in, src, mlen) != 0)
-               return (-EFAULT);
+       if (copy_from_user(&im_in, src, mlen) != 0)
+               return -EFAULT;
 
        mutex_lock(&idmap->idmap_im_lock);
 
@@ -487,7 +486,7 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
                hash ^= (unsigned int)*p;
        }
 
-       return (hash);
+       return hash;
 }
 
 int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
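
The idmap_pipe_upcall() hunk above also corrects how copy_to_user() is used: it returns the number of bytes that could not be copied (an unsigned count, never a negative errno), so the removed "if (left < 0)" test could never fire. A minimal sketch of the corrected pattern, with example_copy_reply() as a made-up name standing in for the upcall body:

static ssize_t example_copy_reply(char __user *dst, const char *src,
				  size_t len, size_t buflen)
{
	size_t mlen = min(len, buflen);
	unsigned long left;

	left = copy_to_user(dst, src, mlen);
	if (left == mlen)		/* nothing reached userspace at all */
		return -EFAULT;
	return mlen - left;		/* a partial copy reports what was copied */
}
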
index db5d96dc6107d5cb8b21235963fd4a950728401f..3f332e54e760ad2056154c01491c8db9521b8319 100644 (file)
@@ -192,7 +192,7 @@ void nfs_invalidate_atime(struct inode *inode)
  */
 static void nfs_invalidate_inode(struct inode *inode)
 {
-       set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+       set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
        nfs_zap_caches_locked(inode);
 }
 
@@ -229,7 +229,7 @@ nfs_init_locked(struct inode *inode, void *opaque)
        struct nfs_find_desc    *desc = (struct nfs_find_desc *)opaque;
        struct nfs_fattr        *fattr = desc->fattr;
 
-       NFS_FILEID(inode) = fattr->fileid;
+       set_nfs_fileid(inode, fattr->fileid);
        nfs_copy_fh(NFS_FH(inode), desc->fh);
        return 0;
 }
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                        inode->i_fop = &nfs_dir_operations;
                        if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
-                               set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+                               set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
                        if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
                                if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
@@ -461,9 +461,18 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
        int err;
 
-       /* Flush out writes to the server in order to update c/mtime */
-       if (S_ISREG(inode->i_mode))
+       /*
+        * Flush out writes to the server in order to update c/mtime.
+        *
+        * Hold the i_mutex to suspend application writes temporarily;
+        * this prevents long-running writing applications from blocking
+        * nfs_wb_nocommit.
+        */
+       if (S_ISREG(inode->i_mode)) {
+               mutex_lock(&inode->i_mutex);
                nfs_wb_nocommit(inode);
+               mutex_unlock(&inode->i_mutex);
+       }
 
        /*
         * We may force a getattr if the user cares about atime.
@@ -659,7 +668,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                if (status == -ESTALE) {
                        nfs_zap_caches(inode);
                        if (!S_ISDIR(inode->i_mode))
-                               set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
+                               set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
                }
                goto out;
        }
@@ -814,8 +823,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        if (S_ISDIR(inode->i_mode))
                                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
                }
-               if (inode->i_size == fattr->pre_size && nfsi->npages == 0)
-                       inode->i_size = fattr->size;
+               if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+                   nfsi->npages == 0)
+                       inode->i_size = nfs_size_to_loff_t(fattr->size);
        }
 }
 
@@ -1019,7 +1029,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        dprintk("NFS: mtime change on server for file %s/%ld\n",
                                        inode->i_sb->s_id, inode->i_ino);
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
-                       nfsi->cache_change_attribute = now;
+                       if (S_ISDIR(inode->i_mode))
+                               nfs_force_lookup_revalidate(inode);
                }
                /* If ctime has changed we should definitely clear access+acl caches */
                if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
@@ -1028,7 +1039,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                dprintk("NFS: change_attr change on server for file %s/%ld\n",
                                inode->i_sb->s_id, inode->i_ino);
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-               nfsi->cache_change_attribute = now;
+               if (S_ISDIR(inode->i_mode))
+                       nfs_force_lookup_revalidate(inode);
        }
 
        /* Check if our cached file size is stale */
@@ -1133,7 +1145,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 void nfs4_clear_inode(struct inode *inode)
 {
        /* If we are holding a delegation, return it! */
-       nfs_inode_return_delegation(inode);
+       nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
        nfs_clear_inode(inode);
 }
index f3acf48412be78dda1b26ce06e348e2b4ed38e87..0f5619611b8dc5d313accd81658fd6dd32e6a647 100644 (file)
@@ -21,7 +21,8 @@ struct nfs_clone_mount {
        struct nfs_fattr *fattr;
        char *hostname;
        char *mnt_path;
-       struct sockaddr_in *addr;
+       struct sockaddr *addr;
+       size_t addrlen;
        rpc_authflavor_t authflavor;
 };
 
@@ -41,19 +42,19 @@ struct nfs_parsed_mount_data {
        char                    *client_address;
 
        struct {
-               struct sockaddr_in      address;
+               struct sockaddr_storage address;
+               size_t                  addrlen;
                char                    *hostname;
-               unsigned int            program;
                unsigned int            version;
                unsigned short          port;
                int                     protocol;
        } mount_server;
 
        struct {
-               struct sockaddr_in      address;
+               struct sockaddr_storage address;
+               size_t                  addrlen;
                char                    *hostname;
                char                    *export_path;
-               unsigned int            program;
                int                     protocol;
        } nfs_server;
 };
@@ -62,7 +63,8 @@ struct nfs_parsed_mount_data {
 extern struct rpc_program nfs_program;
 
 extern void nfs_put_client(struct nfs_client *);
-extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
+extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32);
+extern struct nfs_client *nfs_find_client_next(struct nfs_client *);
 extern struct nfs_server *nfs_create_server(
                                        const struct nfs_parsed_mount_data *,
                                        struct nfs_fh *);
@@ -160,6 +162,8 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
+extern void nfs_sb_active(struct nfs_server *server);
+extern void nfs_sb_deactive(struct nfs_server *server);
 
 /* namespace.c */
 extern char *nfs_path(const char *base,
index acfc56f9edc04c53b25efaffeb80eb0ba9d887c5..be4ce1c3a3d8d30e19f9a1e5921253d6e6542589 100644 (file)
@@ -188,7 +188,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 {
 #ifdef CONFIG_NFS_V4
        struct vfsmount *mnt = NULL;
-       switch (server->nfs_client->cl_nfsversion) {
+       switch (server->nfs_client->rpc_ops->version) {
                case 2:
                case 3:
                        mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
index 668ab96c7b596f19c2550c034cbea48f0706ae93..1f7ea675e0c5e9489d8316b60969d159f386d8ab 100644 (file)
@@ -262,7 +262,9 @@ static int
 nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
        struct kvec *iov = req->rq_rcv_buf.head;
-       int     status, count, recvd, hdrlen;
+       size_t hdrlen;
+       u32 count, recvd;
+       int status;
 
        if ((status = ntohl(*p++)))
                return -nfs_stat_to_errno(status);
@@ -273,7 +275,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READ reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -283,11 +285,11 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
-                       "count %d > recvd %d\n", count, recvd);
+                       "count %u > recvd %u\n", count, recvd);
                count = recvd;
        }
 
-       dprintk("RPC:      readres OK count %d\n", count);
+       dprintk("RPC:      readres OK count %u\n", count);
        if (count < res->count)
                res->count = count;
 
@@ -423,9 +425,10 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
        struct page **page;
-       int hdrlen, recvd;
+       size_t hdrlen;
+       unsigned int pglen, recvd;
+       u32 len;
        int status, nr;
-       unsigned int len, pglen;
        __be32 *end, *entry, *kaddr;
 
        if ((status = ntohl(*p++)))
@@ -434,7 +437,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READDIR reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -576,7 +579,8 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-       int hdrlen, len, recvd;
+       size_t hdrlen;
+       u32 len, recvd;
        char    *kaddr;
        int     status;
 
@@ -584,14 +588,14 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
                return -nfs_stat_to_errno(status);
        /* Convert length of symlink */
        len = ntohl(*p++);
-       if (len >= rcvbuf->page_len || len <= 0) {
+       if (len >= rcvbuf->page_len) {
                dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READLINK reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READLINK header is short. iovec will be shifted.\n");
index 4cdc2361a669bed2347d599f027cc10e57c9d7dc..b353c1a05bfda77c7074fcc31d4dbac00b4c09c9 100644 (file)
@@ -732,16 +732,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
 
-static void nfs3_proc_read_setup(struct nfs_read_data *data)
+static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-       struct rpc_message      msg = {
-               .rpc_proc       = &nfs3_procedures[NFS3PROC_READ],
-               .rpc_argp       = &data->args,
-               .rpc_resp       = &data->res,
-               .rpc_cred       = data->cred,
-       };
-
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
 }
 
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -753,24 +746,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
 
-static void nfs3_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-       struct rpc_message      msg = {
-               .rpc_proc       = &nfs3_procedures[NFS3PROC_WRITE],
-               .rpc_argp       = &data->args,
-               .rpc_resp       = &data->res,
-               .rpc_cred       = data->cred,
-       };
-
-       data->args.stable = NFS_UNSTABLE;
-       if (how & FLUSH_STABLE) {
-               data->args.stable = NFS_FILE_SYNC;
-               if (NFS_I(data->inode)->ncommit)
-                       data->args.stable = NFS_DATA_SYNC;
-       }
-
-       /* Finalize the task. */
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
 }
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -781,22 +759,17 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
 
-static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-       struct rpc_message      msg = {
-               .rpc_proc       = &nfs3_procedures[NFS3PROC_COMMIT],
-               .rpc_argp       = &data->args,
-               .rpc_resp       = &data->res,
-               .rpc_cred       = data->cred,
-       };
-
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
 }
 
 static int
 nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-       return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+       struct inode *inode = filp->f_path.dentry->d_inode;
+
+       return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
 
 const struct nfs_rpc_ops nfs_v3_clientops = {
index 616d3267b7e7b545d02d0bb88c02a48e2cbe6e80..3917e2fa4e40f3e1a18c12bee149c576847d4b5f 100644 (file)
@@ -506,9 +506,9 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
        struct page **page;
-       int hdrlen, recvd;
+       size_t hdrlen;
+       u32 len, recvd, pglen;
        int status, nr;
-       unsigned int len, pglen;
        __be32 *entry, *end, *kaddr;
 
        status = ntohl(*p++);
@@ -527,7 +527,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READDIR reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READDIR header is short. iovec will be shifted.\n");
@@ -549,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                len = ntohl(*p++);              /* string length */
                p += XDR_QUADLEN(len) + 2;      /* name + cookie */
                if (len > NFS3_MAXNAMLEN) {
-                       dprintk("NFS: giant filename in readdir (len %x)!\n",
+                       dprintk("NFS: giant filename in readdir (len 0x%x)!\n",
                                                len);
                        goto err_unmap;
                }
@@ -570,7 +570,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
                                len = ntohl(*p++);
                                if (len > NFS3_FHSIZE) {
                                        dprintk("NFS: giant filehandle in "
-                                               "readdir (len %x)!\n", len);
+                                               "readdir (len 0x%x)!\n", len);
                                        goto err_unmap;
                                }
                                p += XDR_QUADLEN(len);
@@ -815,7 +815,8 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-       int hdrlen, len, recvd;
+       size_t hdrlen;
+       u32 len, recvd;
        char    *kaddr;
        int     status;
 
@@ -827,7 +828,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 
        /* Convert length of symlink */
        len = ntohl(*p++);
-       if (len >= rcvbuf->page_len || len <= 0) {
+       if (len >= rcvbuf->page_len) {
                dprintk("nfs: server returned giant symlink!\n");
                return -ENAMETOOLONG;
        }
@@ -835,7 +836,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READLINK reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READLINK header is short. "
@@ -863,7 +864,9 @@ static int
 nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
        struct kvec *iov = req->rq_rcv_buf.head;
-       int     status, count, ocount, recvd, hdrlen;
+       size_t hdrlen;
+       u32 count, ocount, recvd;
+       int status;
 
        status = ntohl(*p++);
        p = xdr_decode_post_op_attr(p, res->fattr);
@@ -871,7 +874,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        if (status != 0)
                return -nfs_stat_to_errno(status);
 
-       /* Decode reply could and EOF flag. NFSv3 is somewhat redundant
+       /* Decode reply count and EOF flag. NFSv3 is somewhat redundant
         * in that it puts the count both in the res struct and in the
         * opaque data count. */
        count    = ntohl(*p++);
@@ -886,7 +889,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        hdrlen = (u8 *) p - (u8 *) iov->iov_base;
        if (iov->iov_len < hdrlen) {
                dprintk("NFS: READ reply header overflowed:"
-                               "length %d > %Zu\n", hdrlen, iov->iov_len);
+                               "length %Zu > %Zu\n", hdrlen, iov->iov_len);
                        return -errno_NFSERR_IO;
        } else if (iov->iov_len != hdrlen) {
                dprintk("NFS: READ header is short. iovec will be shifted.\n");
@@ -896,7 +899,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
        recvd = req->rq_rcv_buf.len - hdrlen;
        if (count > recvd) {
                dprintk("NFS: server cheating in read reply: "
-                       "count %d > recvd %d\n", count, recvd);
+                       "count %u > recvd %u\n", count, recvd);
                count = recvd;
                res->eof = 0;
        }
index dd5fef20c702785859dc514e4bd3bc1e6f067f82..5f9ba41ed5bfa6b175ef38f1f54bcb4986b26b29 100644 (file)
@@ -114,10 +114,7 @@ static inline int valid_ipaddr4(const char *buf)
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
  * @mnt_parent - mountpoint of parent directory
  * @dentry - parent directory
- * @fspath - fs path returned in fs_locations
- * @mntpath - mount path to new server
- * @hostname - hostname of new server
- * @addr - host addr of new server
+ * @locations - array of NFSv4 server location information
  *
  */
 static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
@@ -131,7 +128,8 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
        };
        char *page = NULL, *page2 = NULL;
-       int loc, s, error;
+       unsigned int s;
+       int loc, error;
 
        if (locations == NULL || locations->nlocations <= 0)
                goto out;
@@ -174,7 +172,10 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 
                s = 0;
                while (s < location->nservers) {
-                       struct sockaddr_in addr = {};
+                       struct sockaddr_in addr = {
+                               .sin_family     = AF_INET,
+                               .sin_port       = htons(NFS_PORT),
+                       };
 
                        if (location->servers[s].len <= 0 ||
                            valid_ipaddr4(location->servers[s].data) < 0) {
@@ -183,10 +184,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
                        }
 
                        mountdata.hostname = location->servers[s].data;
-                       addr.sin_addr.s_addr = in_aton(mountdata.hostname);
-                       addr.sin_family = AF_INET;
-                       addr.sin_port = htons(NFS_PORT);
-                       mountdata.addr = &addr;
+                       addr.sin_addr.s_addr = in_aton(mountdata.hostname),
+                       mountdata.addr = (struct sockaddr *)&addr;
+                       mountdata.addrlen = sizeof(addr);
 
                        snprintf(page, PAGE_SIZE, "%s:%s",
                                        mountdata.hostname,
index 9e2e1c7291dbfd5959ba53389351f72c233629b8..5c189bd57eb2b6ea88bcc837bb8f821f04089fe4 100644 (file)
@@ -210,7 +210,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
        spin_lock(&dir->i_lock);
        nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
        if (!cinfo->atomic || cinfo->before != nfsi->change_attr)
-               nfsi->cache_change_attribute = jiffies;
+               nfs_force_lookup_revalidate(dir);
        nfsi->change_attr = cinfo->after;
        spin_unlock(&dir->i_lock);
 }
@@ -718,19 +718,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
        return err;
 }
 
-static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
-{
-       struct nfs4_opendata *data = calldata;
-       struct  rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
-               .rpc_argp = &data->c_arg,
-               .rpc_resp = &data->c_res,
-               .rpc_cred = data->owner->so_cred,
-       };
-       data->timestamp = jiffies;
-       rpc_call_setup(task, &msg, 0);
-}
-
 static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_opendata *data = calldata;
@@ -767,7 +754,6 @@ out_free:
 }
 
 static const struct rpc_call_ops nfs4_open_confirm_ops = {
-       .rpc_call_prepare = nfs4_open_confirm_prepare,
        .rpc_call_done = nfs4_open_confirm_done,
        .rpc_release = nfs4_open_confirm_release,
 };
@@ -779,12 +765,26 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
 {
        struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
        struct rpc_task *task;
+       struct  rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+               .rpc_argp = &data->c_arg,
+               .rpc_resp = &data->c_res,
+               .rpc_cred = data->owner->so_cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_open_confirm_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
        int status;
 
        kref_get(&data->kref);
        data->rpc_done = 0;
        data->rpc_status = 0;
-       task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data);
+       data->timestamp = jiffies;
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
@@ -801,13 +801,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_opendata *data = calldata;
        struct nfs4_state_owner *sp = data->owner;
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
-               .rpc_argp = &data->o_arg,
-               .rpc_resp = &data->o_res,
-               .rpc_cred = sp->so_cred,
-       };
-       
+
        if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
                return;
        /*
@@ -832,11 +826,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        data->o_arg.id = sp->so_owner_id.id;
        data->o_arg.clientid = sp->so_client->cl_clientid;
        if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
-               msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+               task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
                nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
        }
        data->timestamp = jiffies;
-       rpc_call_setup(task, &msg, 0);
+       rpc_call_start(task);
        return;
 out_no_action:
        task->tk_action = NULL;
@@ -908,13 +902,26 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
        struct nfs_openargs *o_arg = &data->o_arg;
        struct nfs_openres *o_res = &data->o_res;
        struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN],
+               .rpc_argp = o_arg,
+               .rpc_resp = o_res,
+               .rpc_cred = data->owner->so_cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_open_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
        int status;
 
        kref_get(&data->kref);
        data->rpc_done = 0;
        data->rpc_status = 0;
        data->cancelled = 0;
-       task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data);
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = nfs4_wait_for_completion_rpc_task(task);
@@ -1244,12 +1251,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 {
        struct nfs4_closedata *calldata = data;
        struct nfs4_state *state = calldata->state;
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
-               .rpc_argp = &calldata->arg,
-               .rpc_resp = &calldata->res,
-               .rpc_cred = state->owner->so_cred,
-       };
        int clear_rd, clear_wr, clear_rdwr;
 
        if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
@@ -1276,14 +1277,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        }
        nfs_fattr_init(calldata->res.fattr);
        if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
-               msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+               task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
                calldata->arg.open_flags = FMODE_READ;
        } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
-               msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+               task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
                calldata->arg.open_flags = FMODE_WRITE;
        }
        calldata->timestamp = jiffies;
-       rpc_call_setup(task, &msg, 0);
+       rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_close_ops = {
@@ -1309,6 +1310,16 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        struct nfs4_closedata *calldata;
        struct nfs4_state_owner *sp = state->owner;
        struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
+               .rpc_cred = state->owner->so_cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_close_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
        int status = -ENOMEM;
 
        calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
@@ -1328,7 +1339,10 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        calldata->path.mnt = mntget(path->mnt);
        calldata->path.dentry = dget(path->dentry);
 
-       task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_close_ops, calldata);
+       msg.rpc_argp = &calldata->arg,
+       msg.rpc_resp = &calldata->res,
+       task_setup_data.callback_data = calldata;
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = 0;
@@ -2414,18 +2428,10 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
 
-static void nfs4_proc_read_setup(struct nfs_read_data *data)
+static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->cred,
-       };
-
        data->timestamp   = jiffies;
-
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2443,33 +2449,15 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
 
-static void nfs4_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->cred,
-       };
-       struct inode *inode = data->inode;
-       struct nfs_server *server = NFS_SERVER(inode);
-       int stable;
-       
-       if (how & FLUSH_STABLE) {
-               if (!NFS_I(inode)->ncommit)
-                       stable = NFS_FILE_SYNC;
-               else
-                       stable = NFS_DATA_SYNC;
-       } else
-               stable = NFS_UNSTABLE;
-       data->args.stable = stable;
+       struct nfs_server *server = NFS_SERVER(data->inode);
+
        data->args.bitmask = server->attr_bitmask;
        data->res.server = server;
-
        data->timestamp   = jiffies;
 
-       /* Finalize the task. */
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
 }
 
 static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -2484,20 +2472,13 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
 
-static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->cred,
-       };      
        struct nfs_server *server = NFS_SERVER(data->inode);
        
        data->args.bitmask = server->attr_bitmask;
        data->res.server = server;
-
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
 
 /*
@@ -2910,14 +2891,20 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 
        for(;;) {
                setclientid.sc_name_len = scnprintf(setclientid.sc_name,
-                               sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
-                               clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr),
+                               sizeof(setclientid.sc_name), "%s/%s %s %s %u",
+                               clp->cl_ipaddr,
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                                       RPC_DISPLAY_ADDR),
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                                       RPC_DISPLAY_PROTO),
                                cred->cr_ops->cr_name,
                                clp->cl_id_uniquifier);
                setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-                               sizeof(setclientid.sc_netid), "tcp");
+                               sizeof(setclientid.sc_netid),
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                                       RPC_DISPLAY_NETID));
                setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
-                               sizeof(setclientid.sc_uaddr), "%s.%d.%d",
+                               sizeof(setclientid.sc_uaddr), "%s.%u.%u",
                                clp->cl_ipaddr, port >> 8, port & 255);
 
                status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
@@ -2981,25 +2968,11 @@ struct nfs4_delegreturndata {
        struct nfs4_delegreturnres res;
        struct nfs_fh fh;
        nfs4_stateid stateid;
-       struct rpc_cred *cred;
        unsigned long timestamp;
        struct nfs_fattr fattr;
        int rpc_status;
 };
 
-static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata)
-{
-       struct nfs4_delegreturndata *data = calldata;
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->cred,
-       };
-       nfs_fattr_init(data->res.fattr);
-       rpc_call_setup(task, &msg, 0);
-}
-
 static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_delegreturndata *data = calldata;
@@ -3010,24 +2983,30 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 static void nfs4_delegreturn_release(void *calldata)
 {
-       struct nfs4_delegreturndata *data = calldata;
-
-       put_rpccred(data->cred);
        kfree(calldata);
 }
 
 static const struct rpc_call_ops nfs4_delegreturn_ops = {
-       .rpc_call_prepare = nfs4_delegreturn_prepare,
        .rpc_call_done = nfs4_delegreturn_done,
        .rpc_release = nfs4_delegreturn_release,
 };
 
-static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
        struct nfs4_delegreturndata *data;
        struct nfs_server *server = NFS_SERVER(inode);
        struct rpc_task *task;
-       int status;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
+               .rpc_cred = cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_delegreturn_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
+       int status = 0;
 
        data = kmalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
@@ -3039,30 +3018,37 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
        memcpy(&data->stateid, stateid, sizeof(data->stateid));
        data->res.fattr = &data->fattr;
        data->res.server = server;
-       data->cred = get_rpccred(cred);
+       nfs_fattr_init(data->res.fattr);
        data->timestamp = jiffies;
        data->rpc_status = 0;
 
-       task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data);
+       task_setup_data.callback_data = data;
+       msg.rpc_argp = &data->args,
+       msg.rpc_resp = &data->res,
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
+       if (!issync)
+               goto out;
        status = nfs4_wait_for_completion_rpc_task(task);
-       if (status == 0) {
-               status = data->rpc_status;
-               if (status == 0)
-                       nfs_refresh_inode(inode, &data->fattr);
-       }
+       if (status != 0)
+               goto out;
+       status = data->rpc_status;
+       if (status != 0)
+               goto out;
+       nfs_refresh_inode(inode, &data->fattr);
+out:
        rpc_put_task(task);
        return status;
 }
 
-int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
+int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync)
 {
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs4_exception exception = { };
        int err;
        do {
-               err = _nfs4_proc_delegreturn(inode, cred, stateid);
+               err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
                switch (err) {
                        case -NFS4ERR_STALE_STATEID:
                        case -NFS4ERR_EXPIRED:
@@ -3230,12 +3216,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 {
        struct nfs4_unlockdata *calldata = data;
-       struct rpc_message msg = {
-               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
-               .rpc_argp       = &calldata->arg,
-               .rpc_resp       = &calldata->res,
-               .rpc_cred       = calldata->lsp->ls_state->owner->so_cred,
-       };
 
        if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
                return;
@@ -3245,7 +3225,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
                return;
        }
        calldata->timestamp = jiffies;
-       rpc_call_setup(task, &msg, 0);
+       rpc_call_start(task);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -3260,6 +3240,16 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                struct nfs_seqid *seqid)
 {
        struct nfs4_unlockdata *data;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
+               .rpc_cred = ctx->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(lsp->ls_state->inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_locku_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
 
        /* Ensure this is an unlock - when canceling a lock, the
         * canceled lock is passed in, and it won't be an unlock.
@@ -3272,7 +3262,10 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
                return ERR_PTR(-ENOMEM);
        }
 
-       return rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data);
+       msg.rpc_argp = &data->arg,
+       msg.rpc_resp = &data->res,
+       task_setup_data.callback_data = data;
+       return rpc_run_task(&task_setup_data);
 }
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
@@ -3331,15 +3324,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 
        p->arg.fh = NFS_FH(inode);
        p->arg.fl = &p->fl;
-       if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
-               p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
-               if (p->arg.open_seqid == NULL)
-                       goto out_free;
-
-       }
+       p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+       if (p->arg.open_seqid == NULL)
+               goto out_free;
        p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
        if (p->arg.lock_seqid == NULL)
-               goto out_free;
+               goto out_free_seqid;
        p->arg.lock_stateid = &lsp->ls_stateid;
        p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
        p->arg.lock_owner.id = lsp->ls_id.id;
@@ -3348,9 +3338,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
        p->ctx = get_nfs_open_context(ctx);
        memcpy(&p->fl, fl, sizeof(p->fl));
        return p;
+out_free_seqid:
+       nfs_free_seqid(p->arg.open_seqid);
 out_free:
-       if (p->arg.open_seqid != NULL)
-               nfs_free_seqid(p->arg.open_seqid);
        kfree(p);
        return NULL;
 }
@@ -3359,31 +3349,20 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs4_lockdata *data = calldata;
        struct nfs4_state *state = data->lsp->ls_state;
-       struct nfs4_state_owner *sp = state->owner;
-       struct rpc_message msg = {
-               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
-               .rpc_argp = &data->arg,
-               .rpc_resp = &data->res,
-               .rpc_cred = sp->so_cred,
-       };
 
        dprintk("%s: begin!\n", __FUNCTION__);
+       if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
+               return;
        /* Do we need to do an open_to_lock_owner? */
        if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
                if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
                        return;
                data->arg.open_stateid = &state->stateid;
                data->arg.new_lock_owner = 1;
-               /* Retest in case we raced... */
-               if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED))
-                       goto do_rpc;
-       }
-       if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
-               return;
-       data->arg.new_lock_owner = 0;
-do_rpc:        
+       } else
+               data->arg.new_lock_owner = 0;
        data->timestamp = jiffies;
-       rpc_call_setup(task, &msg, 0);
+       rpc_call_start(task);
        dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
 }
 
@@ -3419,6 +3398,7 @@ static void nfs4_lock_release(void *calldata)
        struct nfs4_lockdata *data = calldata;
 
        dprintk("%s: begin!\n", __FUNCTION__);
+       nfs_free_seqid(data->arg.open_seqid);
        if (data->cancelled != 0) {
                struct rpc_task *task;
                task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
@@ -3428,8 +3408,6 @@ static void nfs4_lock_release(void *calldata)
                dprintk("%s: cancelling lock!\n", __FUNCTION__);
        } else
                nfs_free_seqid(data->arg.lock_seqid);
-       if (data->arg.open_seqid != NULL)
-               nfs_free_seqid(data->arg.open_seqid);
        nfs4_put_lock_state(data->lsp);
        put_nfs_open_context(data->ctx);
        kfree(data);
@@ -3446,6 +3424,16 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 {
        struct nfs4_lockdata *data;
        struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK],
+               .rpc_cred = state->owner->so_cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(state->inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_lock_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
        int ret;
 
        dprintk("%s: begin!\n", __FUNCTION__);
@@ -3457,8 +3445,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                data->arg.block = 1;
        if (reclaim != 0)
                data->arg.reclaim = 1;
-       task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC,
-                       &nfs4_lock_ops, data);
+       msg.rpc_argp = &data->arg,
+       msg.rpc_resp = &data->res,
+       task_setup_data.callback_data = data;
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        ret = nfs4_wait_for_completion_rpc_task(task);
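
The hunks above move the NFS lock paths from the old multi-argument rpc_run_task()/rpc_call_setup() calls to a single struct rpc_task_setup that describes the task and is handed to rpc_run_task(). Below is a minimal, runnable userspace sketch of the same "collect the parameters in one setup struct" refactor; every name in it (task_setup, run_task, lock_done) is a hypothetical stand-in for illustration, not the kernel API.

#include <stdio.h>

struct task_setup {
        const char *name;               /* descriptive label */
        int flags;                      /* e.g. an ASYNC bit */
        void (*callback)(void *);       /* completion callback */
        void *callback_data;            /* handed back to the callback */
};

static void run_task(const struct task_setup *setup)
{
        printf("running %s (flags=%#x)\n", setup->name, setup->flags);
        if (setup->callback)
                setup->callback(setup->callback_data);
}

static void lock_done(void *data)
{
        printf("done: %s\n", (const char *)data);
}

int main(void)
{
        /* Callers fill in only the fields they care about; adding a new
         * option later touches the struct, not every call site. */
        struct task_setup setup = {
                .name = "LOCK",
                .flags = 0x1,
                .callback = lock_done,
                .callback_data = "lock reply",
        };

        run_task(&setup);
        return 0;
}
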
@@ -3631,10 +3621,6 @@ int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
        if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
                return -EOPNOTSUPP;
 
-       if (!S_ISREG(inode->i_mode) &&
-           (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-               return -EPERM;
-
        return nfs4_proc_set_acl(inode, buf, buflen);
 }
 
index 5a39c6f78acf9394960885a1b774392d84019629..f9c7432471dcf772b92cf29662740e9168c08510 100644
@@ -644,27 +644,26 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 
 struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
 {
-       struct rpc_sequence *sequence = counter->sequence;
        struct nfs_seqid *new;
 
        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (new != NULL) {
                new->sequence = counter;
-               spin_lock(&sequence->lock);
-               list_add_tail(&new->list, &sequence->list);
-               spin_unlock(&sequence->lock);
+               INIT_LIST_HEAD(&new->list);
        }
        return new;
 }
 
 void nfs_free_seqid(struct nfs_seqid *seqid)
 {
-       struct rpc_sequence *sequence = seqid->sequence->sequence;
+       if (!list_empty(&seqid->list)) {
+               struct rpc_sequence *sequence = seqid->sequence->sequence;
 
-       spin_lock(&sequence->lock);
-       list_del(&seqid->list);
-       spin_unlock(&sequence->lock);
-       rpc_wake_up(&sequence->wait);
+               spin_lock(&sequence->lock);
+               list_del(&seqid->list);
+               spin_unlock(&sequence->lock);
+               rpc_wake_up(&sequence->wait);
+       }
        kfree(seqid);
 }
 
@@ -675,6 +674,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
  */
 static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 {
+       BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);
        switch (status) {
                case 0:
                        break;
@@ -726,15 +726,15 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
        struct rpc_sequence *sequence = seqid->sequence->sequence;
        int status = 0;
 
-       if (sequence->list.next == &seqid->list)
-               goto out;
        spin_lock(&sequence->lock);
-       if (sequence->list.next != &seqid->list) {
-               rpc_sleep_on(&sequence->wait, task, NULL, NULL);
-               status = -EAGAIN;
-       }
+       if (list_empty(&seqid->list))
+               list_add_tail(&seqid->list, &sequence->list);
+       if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
+               goto unlock;
+       rpc_sleep_on(&sequence->wait, task, NULL, NULL);
+       status = -EAGAIN;
+unlock:
        spin_unlock(&sequence->lock);
-out:
        return status;
 }
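
The seqid changes above make nfs_alloc_seqid() return an unlinked entry, have nfs_wait_on_sequence() queue it only when it first has to wait for its turn, and have nfs_free_seqid() unlink it only if it was ever queued. The following is a small, runnable userspace sketch of that lazy-queueing idiom; the miniature list and all names (alloc_seqid, wait_on_sequence, free_seqid) are hypothetical stand-ins, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}
static void list_del(struct list_head *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
        INIT_LIST_HEAD(n);
}

struct seqid { struct list_head list; int id; };

static struct list_head queue = { &queue, &queue };

static struct seqid *alloc_seqid(int id)
{
        struct seqid *s = malloc(sizeof(*s));
        s->id = id;
        INIT_LIST_HEAD(&s->list);       /* not queued yet */
        return s;
}

/* Returns 0 if this seqid is at the head of the queue and may proceed,
 * -1 if the caller would have to sleep and retry. */
static int wait_on_sequence(struct seqid *s)
{
        if (list_empty(&s->list))
                list_add_tail(&s->list, &queue);
        return queue.next == &s->list ? 0 : -1;
}

static void free_seqid(struct seqid *s)
{
        if (!list_empty(&s->list))
                list_del(&s->list);     /* only if it was ever queued */
        free(s);
}

int main(void)
{
        struct seqid *a = alloc_seqid(1), *b = alloc_seqid(2);

        printf("a may run: %d\n", wait_on_sequence(a) == 0);
        printf("b may run: %d\n", wait_on_sequence(b) == 0);
        free_seqid(a);                  /* unlinks a, b becomes head */
        printf("b may run: %d\n", wait_on_sequence(b) == 0);
        free_seqid(b);
        return 0;
}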
 
@@ -758,8 +758,9 @@ static void nfs4_recover_state(struct nfs_client *clp)
 
        __module_get(THIS_MODULE);
        atomic_inc(&clp->cl_count);
-       task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
-                       NIPQUAD(clp->cl_addr.sin_addr));
+       task = kthread_run(reclaimer, clp, "%s-reclaim",
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                                       RPC_DISPLAY_ADDR));
        if (!IS_ERR(task))
                return;
        nfs4_clear_recover_bit(clp);
@@ -970,8 +971,8 @@ out:
        module_put_and_exit(0);
        return 0;
 out_error:
-       printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
-                               NIPQUAD(clp->cl_addr.sin_addr), -status);
+       printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
+                       " with error %d\n", clp->cl_hostname, -status);
        set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
        goto out;
 }
index 51dd3804866f1575adc3ceeda8a47162796e1bbe..db1ed9c46ede9f4d9c899e45c63289ccb5296546 100644
@@ -116,10 +116,12 @@ static int nfs4_stat_to_errno(int);
 #define decode_renew_maxsz     (op_decode_hdr_maxsz)
 #define encode_setclientid_maxsz \
                                (op_encode_hdr_maxsz + \
-                               4 /*server->ip_addr*/ + \
-                               1 /*Netid*/ + \
-                               6 /*uaddr*/ + \
-                               6 + (NFS4_VERIFIER_SIZE >> 2))
+                               XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
+                               XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+                               1 /* sc_prog */ + \
+                               XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
+                               XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
+                               1) /* sc_cb_ident */
 #define decode_setclientid_maxsz \
                                (op_decode_hdr_maxsz + \
                                2 + \
@@ -2515,14 +2517,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 {
-       int n;
+       u32 n;
        __be32 *p;
        int status = 0;
 
        READ_BUF(4);
        READ32(n);
-       if (n < 0)
-               goto out_eio;
        if (n == 0)
                goto root_path;
        dprintk("path ");
@@ -2579,13 +2579,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
                goto out_eio;
        res->nlocations = 0;
        while (res->nlocations < n) {
-               int m;
+               u32 m;
                struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
                READ_BUF(4);
                READ32(m);
-               if (m <= 0)
-                       goto out_eio;
 
                loc->nservers = 0;
                dprintk("%s: servers ", __FUNCTION__);
@@ -2598,8 +2596,12 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
                        if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
                                loc->nservers++;
                        else {
-                               int i;
-                               dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+                               unsigned int i;
+                               dprintk("%s: using first %u of %u servers "
+                                       "returned for location %u\n",
+                                               __FUNCTION__,
+                                               NFS4_FS_LOCATION_MAXSERVERS,
+                                               m, res->nlocations);
                                for (i = loc->nservers; i < m; i++) {
                                        unsigned int len;
                                        char *data;
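
The decode_pathname() and decode_attr_fs_locations() changes above switch the on-the-wire counts from int to u32, which removes the now-impossible "negative count" checks while leaving the upper-bound checks in place. A standalone demonstration of the difference follows; the byte values are made up for the example.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Four XDR bytes carrying a large (made-up) count, big-endian. */
        uint8_t wire[4] = { 0xff, 0xff, 0xff, 0xfe };
        uint32_t n = ((uint32_t)wire[0] << 24) | ((uint32_t)wire[1] << 16) |
                     ((uint32_t)wire[2] << 8)  |  (uint32_t)wire[3];

        /* Kept unsigned, the value can never be "negative"; only an
         * upper-bound sanity check against the buffer is still needed. */
        printf("as u32: %u\n", n);

        /* Read into a signed int, the same bits look like a negative
         * count, which is what the removed "< 0" checks guarded against. */
        printf("as int: %d\n", (int32_t)n);
        return 0;
}
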
@@ -3476,10 +3478,11 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        struct xdr_buf  *rcvbuf = &req->rq_rcv_buf;
        struct page     *page = *rcvbuf->pages;
        struct kvec     *iov = rcvbuf->head;
-       unsigned int    nr, pglen = rcvbuf->page_len;
+       size_t          hdrlen;
+       u32             recvd, pglen = rcvbuf->page_len;
        __be32          *end, *entry, *p, *kaddr;
-       uint32_t        len, attrlen, xlen;
-       int             hdrlen, recvd, status;
+       unsigned int    nr;
+       int             status;
 
        status = decode_op_hdr(xdr, OP_READDIR);
        if (status)
@@ -3503,6 +3506,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
        end = p + ((pglen + readdir->pgbase) >> 2);
        entry = p;
        for (nr = 0; *p++; nr++) {
+               u32 len, attrlen, xlen;
                if (end - p < 3)
                        goto short_pkt;
                dprintk("cookie = %Lu, ", *((unsigned long long *)p));
@@ -3551,7 +3555,8 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 {
        struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
        struct kvec *iov = rcvbuf->head;
-       int hdrlen, len, recvd;
+       size_t hdrlen;
+       u32 len, recvd;
        __be32 *p;
        char *kaddr;
        int status;
@@ -3646,7 +3651,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
        if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
                return -EIO;
        if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
-               int hdrlen, recvd;
+               size_t hdrlen;
+               u32 recvd;
 
                /* We ignore &savep and don't do consistency checks on
                 * the attr length.  Let userspace figure it out.... */
index 345bb9b4765b6fee52fd8a0c76558c34e2fc1f1c..3b3dbb94393de116c4bd8923aba8efa58bb3debb 100644
@@ -111,13 +111,14 @@ void nfs_unlock_request(struct nfs_page *req)
  * nfs_set_page_tag_locked - Tag a request as locked
  * @req:
  */
-static int nfs_set_page_tag_locked(struct nfs_page *req)
+int nfs_set_page_tag_locked(struct nfs_page *req)
 {
        struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
 
-       if (!nfs_lock_request(req))
+       if (!nfs_lock_request_dontget(req))
                return 0;
-       radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+       if (req->wb_page != NULL)
+               radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 1;
 }
 
@@ -132,9 +133,10 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
        if (req->wb_page != NULL) {
                spin_lock(&inode->i_lock);
                radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+               nfs_unlock_request(req);
                spin_unlock(&inode->i_lock);
-       }
-       nfs_unlock_request(req);
+       } else
+               nfs_unlock_request(req);
 }
 
 /**
@@ -421,6 +423,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
                                goto out;
                        idx_start = req->wb_index + 1;
                        if (nfs_set_page_tag_locked(req)) {
+                               kref_get(&req->wb_kref);
                                nfs_list_remove_request(req);
                                radix_tree_tag_clear(&nfsi->nfs_page_tree,
                                                req->wb_index, tag);
index 4f80d88e9fee005d69bd2d279ddeb5dc8bb4fd2f..5ccf7faee19cd17cc63a3506f91990e6c8b5b17f 100644
@@ -565,16 +565,9 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
        return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_read_data *data)
+static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
-       struct rpc_message      msg = {
-               .rpc_proc       = &nfs_procedures[NFSPROC_READ],
-               .rpc_argp       = &data->args,
-               .rpc_resp       = &data->res,
-               .rpc_cred       = data->cred,
-       };
-
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -584,24 +577,15 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
        return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
-       struct rpc_message      msg = {
-               .rpc_proc       = &nfs_procedures[NFSPROC_WRITE],
-               .rpc_argp       = &data->args,
-               .rpc_resp       = &data->res,
-               .rpc_cred       = data->cred,
-       };
-
        /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
        data->args.stable = NFS_FILE_SYNC;
-
-       /* Finalize the task. */
-       rpc_call_setup(&data->task, &msg, 0);
+       msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
 static void
-nfs_proc_commit_setup(struct nfs_write_data *data, int how)
+nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
        BUG();
 }
@@ -609,7 +593,9 @@ nfs_proc_commit_setup(struct nfs_write_data *data, int how)
 static int
 nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-       return nlmclnt_proc(filp->f_path.dentry->d_inode, cmd, fl);
+       struct inode *inode = filp->f_path.dentry->d_inode;
+
+       return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
 
 
index 4587a86adaac8d5cc0762f36fbddaa3b96dc71f8..8fd6dfbe1bc3b8c248595a85fc5d3f3b86779ee3 100644
@@ -160,12 +160,26 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
                const struct rpc_call_ops *call_ops,
                unsigned int count, unsigned int offset)
 {
-       struct inode            *inode;
-       int flags;
+       struct inode *inode = req->wb_context->path.dentry->d_inode;
+       int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = req->wb_context->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .task = &data->task,
+               .rpc_client = NFS_CLIENT(inode),
+               .rpc_message = &msg,
+               .callback_ops = call_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC | swap_flags,
+       };
 
        data->req         = req;
-       data->inode       = inode = req->wb_context->path.dentry->d_inode;
-       data->cred        = req->wb_context->cred;
+       data->inode       = inode;
+       data->cred        = msg.rpc_cred;
 
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
@@ -180,11 +194,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
        nfs_fattr_init(&data->fattr);
 
        /* Set up the initial task struct. */
-       flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
-       rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
-       NFS_PROTO(inode)->read_setup(data);
-
-       data->task.tk_cookie = (unsigned long)inode;
+       NFS_PROTO(inode)->read_setup(data, &msg);
 
        dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
                        data->task.tk_pid,
@@ -192,6 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
                        (long long)NFS_FILEID(inode),
                        count,
                        (unsigned long long)data->args.offset);
+
+       task = rpc_run_task(&task_setup_data);
+       if (!IS_ERR(task))
+               rpc_put_task(task);
 }
 
 static void
@@ -207,19 +221,6 @@ nfs_async_read_error(struct list_head *head)
        }
 }
 
-/*
- * Start an async read operation
- */
-static void nfs_execute_read(struct nfs_read_data *data)
-{
-       struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-       sigset_t oldset;
-
-       rpc_clnt_sigmask(clnt, &oldset);
-       rpc_execute(&data->task);
-       rpc_clnt_sigunmask(clnt, &oldset);
-}
-
 /*
  * Generate multiple requests to fill a single page.
  *
@@ -274,7 +275,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
                                  rsize, offset);
                offset += rsize;
                nbytes -= rsize;
-               nfs_execute_read(data);
        } while (nbytes != 0);
 
        return 0;
@@ -312,8 +312,6 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
        req = nfs_list_entry(data->pages.next);
 
        nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-
-       nfs_execute_read(data);
        return 0;
 out_bad:
        nfs_async_read_error(head);
@@ -338,7 +336,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
        nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count);
 
        if (task->tk_status == -ESTALE) {
-               set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
+               set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags);
                nfs_mark_for_revalidate(data->inode);
        }
        return 0;
index 0b0c72a072ffd5c5c772a518d01db5a75fe4392a..22c49c02897d3244c8f53ffd9bb9537e90064238 100644
@@ -45,6 +45,8 @@
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
@@ -83,11 +85,11 @@ enum {
        Opt_actimeo,
        Opt_namelen,
        Opt_mountport,
-       Opt_mountprog, Opt_mountvers,
-       Opt_nfsprog, Opt_nfsvers,
+       Opt_mountvers,
+       Opt_nfsvers,
 
        /* Mount options that take string arguments */
-       Opt_sec, Opt_proto, Opt_mountproto,
+       Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
 
        /* Mount options that are ignored */
@@ -137,9 +139,7 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_userspace, "retry=%u" },
        { Opt_namelen, "namlen=%u" },
        { Opt_mountport, "mountport=%u" },
-       { Opt_mountprog, "mountprog=%u" },
        { Opt_mountvers, "mountvers=%u" },
-       { Opt_nfsprog, "nfsprog=%u" },
        { Opt_nfsvers, "nfsvers=%u" },
        { Opt_nfsvers, "vers=%u" },
 
@@ -148,7 +148,7 @@ static match_table_t nfs_mount_option_tokens = {
        { Opt_mountproto, "mountproto=%s" },
        { Opt_addr, "addr=%s" },
        { Opt_clientaddr, "clientaddr=%s" },
-       { Opt_userspace, "mounthost=%s" },
+       { Opt_mounthost, "mounthost=%s" },
        { Opt_mountaddr, "mountaddr=%s" },
 
        { Opt_err, NULL }
@@ -202,6 +202,7 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
+static void nfs_put_super(struct super_block *);
 
 static struct file_system_type nfs_fs_type = {
        .owner          = THIS_MODULE,
@@ -223,6 +224,7 @@ static const struct super_operations nfs_sops = {
        .alloc_inode    = nfs_alloc_inode,
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs_write_inode,
+       .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .clear_inode    = nfs_clear_inode,
        .umount_begin   = nfs_umount_begin,
@@ -325,6 +327,28 @@ void __exit unregister_nfs_fs(void)
        unregister_filesystem(&nfs_fs_type);
 }
 
+void nfs_sb_active(struct nfs_server *server)
+{
+       atomic_inc(&server->active);
+}
+
+void nfs_sb_deactive(struct nfs_server *server)
+{
+       if (atomic_dec_and_test(&server->active))
+               wake_up(&server->active_wq);
+}
+
+static void nfs_put_super(struct super_block *sb)
+{
+       struct nfs_server *server = NFS_SB(sb);
+       /*
+        * Make sure there are no outstanding ops to this server.
+        * If so, wait for them to finish before allowing the
+        * unmount to continue.
+        */
+       wait_event(server->active_wq, atomic_read(&server->active) == 0);
+}
+
 /*
  * Deliver file system statistics to userspace
  */
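
nfs_sb_active(), nfs_sb_deactive() and nfs_put_super() above keep a count of outstanding asynchronous operations against a superblock and make unmount wait until that count drains to zero. Here is a hedged userspace sketch of the same idea, using pthreads where the kernel uses an atomic_t plus a wait queue; all names are hypothetical.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int active;              /* outstanding async operations */

static void sb_active(void)
{
        pthread_mutex_lock(&lock);
        active++;
        pthread_mutex_unlock(&lock);
}

static void sb_deactive(void)
{
        pthread_mutex_lock(&lock);
        if (--active == 0)
                pthread_cond_broadcast(&drained);   /* wake the unmounter */
        pthread_mutex_unlock(&lock);
}

static void put_super(void)
{
        pthread_mutex_lock(&lock);
        while (active != 0)                         /* wait_event() analogue */
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        printf("all outstanding ops finished, unmount may proceed\n");
}

static void *async_op(void *arg)
{
        (void)arg;
        sb_deactive();          /* the operation completes and drops its ref */
        return NULL;
}

int main(void)
{
        pthread_t t;

        sb_active();                            /* op starts before unmount */
        pthread_create(&t, NULL, async_op, NULL);
        put_super();                            /* blocks until the count drains */
        pthread_join(t, NULL);
        return 0;
}
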
@@ -455,8 +479,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        }
        seq_printf(m, ",proto=%s",
                   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO));
-       seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
-       seq_printf(m, ",retrans=%u", clp->retrans_count);
+       seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ);
+       seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries);
        seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
 }
 
@@ -469,8 +493,9 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 
        nfs_show_mount_options(m, nfss, 0);
 
-       seq_printf(m, ",addr="NIPQUAD_FMT,
-               NIPQUAD(nfss->nfs_client->cl_addr.sin_addr));
+       seq_printf(m, ",addr=%s",
+                       rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
+                                                       RPC_DISPLAY_ADDR));
 
        return 0;
 }
@@ -507,7 +532,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
        seq_printf(m, ",namelen=%d", nfss->namelen);
 
 #ifdef CONFIG_NFS_V4
-       if (nfss->nfs_client->cl_nfsversion == 4) {
+       if (nfss->nfs_client->rpc_ops->version == 4) {
                seq_printf(m, "\n\tnfsv4:\t");
                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
@@ -575,22 +600,80 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 }
 
 /*
- * Sanity-check a server address provided by the mount command
+ * Set the port number in an address.  Be agnostic about the address family.
+ */
+static void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+       switch (sap->sa_family) {
+       case AF_INET: {
+               struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+               ap->sin_port = htons(port);
+               break;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+               ap->sin6_port = htons(port);
+               break;
+       }
+       }
+}
+
+/*
+ * Sanity-check a server address provided by the mount command.
+ *
+ * Address family must be initialized, and address must not be
+ * the ANY address for that family.
  */
 static int nfs_verify_server_address(struct sockaddr *addr)
 {
        switch (addr->sa_family) {
        case AF_INET: {
-               struct sockaddr_in *sa = (struct sockaddr_in *) addr;
-               if (sa->sin_addr.s_addr != INADDR_ANY)
-                       return 1;
-               break;
+               struct sockaddr_in *sa = (struct sockaddr_in *)addr;
+               return sa->sin_addr.s_addr != INADDR_ANY;
+       }
+       case AF_INET6: {
+               struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr;
+               return !ipv6_addr_any(sa);
        }
        }
 
        return 0;
 }
 
+/*
+ * Parse string addresses passed in via a mount option,
+ * and construct a sockaddr based on the result.
+ *
+ * If address parsing fails, set the sockaddr's address
+ * family to AF_UNSPEC to force nfs_verify_server_address()
+ * to punt the mount.
+ */
+static void nfs_parse_server_address(char *value,
+                                    struct sockaddr *sap,
+                                    size_t *len)
+{
+       if (strchr(value, ':')) {
+               struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+               u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
+
+               ap->sin6_family = AF_INET6;
+               *len = sizeof(*ap);
+               if (in6_pton(value, -1, addr, '\0', NULL))
+                       return;
+       } else {
+               struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+               u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+
+               ap->sin_family = AF_INET;
+               *len = sizeof(*ap);
+               if (in4_pton(value, -1, addr, '\0', NULL))
+                       return;
+       }
+
+       sap->sa_family = AF_UNSPEC;
+       *len = 0;
+}
+
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure
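
The helpers added above (nfs_set_port, nfs_verify_server_address, nfs_parse_server_address) all switch on sa_family so the mount path can handle IPv4 and IPv6 addresses uniformly. Below is a runnable userspace analogue of that approach; it uses inet_pton() where the kernel uses in4_pton()/in6_pton(), and the helper names are assumptions made for the sketch.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void set_port(struct sockaddr *sap, unsigned short port)
{
        switch (sap->sa_family) {
        case AF_INET:
                ((struct sockaddr_in *)sap)->sin_port = htons(port);
                break;
        case AF_INET6:
                ((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
                break;
        }
}

/* Parse a presentation-format address; a ':' means IPv6.  On failure the
 * family is left as AF_UNSPEC so a later sanity check rejects the mount. */
static void parse_address(const char *value, struct sockaddr_storage *ss)
{
        memset(ss, 0, sizeof(*ss));
        if (strchr(value, ':')) {
                struct sockaddr_in6 *a6 = (struct sockaddr_in6 *)ss;
                if (inet_pton(AF_INET6, value, &a6->sin6_addr) == 1)
                        a6->sin6_family = AF_INET6;
        } else {
                struct sockaddr_in *a4 = (struct sockaddr_in *)ss;
                if (inet_pton(AF_INET, value, &a4->sin_addr) == 1)
                        a4->sin_family = AF_INET;
        }
}

int main(void)
{
        struct sockaddr_storage ss;

        parse_address("192.0.2.1", &ss);
        set_port((struct sockaddr *)&ss, 2049);
        printf("family=%d port=%d\n", ss.ss_family,
               ntohs(((struct sockaddr_in *)&ss)->sin_port));
        return 0;
}
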
@@ -599,6 +682,7 @@ static int nfs_parse_mount_options(char *raw,
                                   struct nfs_parsed_mount_data *mnt)
 {
        char *p, *string;
+       unsigned short port = 0;
 
        if (!raw) {
                dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -701,7 +785,7 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        if (option < 0 || option > 65535)
                                return 0;
-                       mnt->nfs_server.address.sin_port = htons(option);
+                       port = option;
                        break;
                case Opt_rsize:
                        if (match_int(args, &mnt->rsize))
@@ -763,13 +847,6 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        mnt->mount_server.port = option;
                        break;
-               case Opt_mountprog:
-                       if (match_int(args, &option))
-                               return 0;
-                       if (option < 0)
-                               return 0;
-                       mnt->mount_server.program = option;
-                       break;
                case Opt_mountvers:
                        if (match_int(args, &option))
                                return 0;
@@ -777,13 +854,6 @@ static int nfs_parse_mount_options(char *raw,
                                return 0;
                        mnt->mount_server.version = option;
                        break;
-               case Opt_nfsprog:
-                       if (match_int(args, &option))
-                               return 0;
-                       if (option < 0)
-                               return 0;
-                       mnt->nfs_server.program = option;
-                       break;
                case Opt_nfsvers:
                        if (match_int(args, &option))
                                return 0;
@@ -927,24 +997,32 @@ static int nfs_parse_mount_options(char *raw,
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                       mnt->nfs_server.address.sin_family = AF_INET;
-                       mnt->nfs_server.address.sin_addr.s_addr =
-                                                       in_aton(string);
+                       nfs_parse_server_address(string, (struct sockaddr *)
+                                                &mnt->nfs_server.address,
+                                                &mnt->nfs_server.addrlen);
                        kfree(string);
                        break;
                case Opt_clientaddr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
+                       kfree(mnt->client_address);
                        mnt->client_address = string;
                        break;
+               case Opt_mounthost:
+                       string = match_strdup(args);
+                       if (string == NULL)
+                               goto out_nomem;
+                       kfree(mnt->mount_server.hostname);
+                       mnt->mount_server.hostname = string;
+                       break;
                case Opt_mountaddr:
                        string = match_strdup(args);
                        if (string == NULL)
                                goto out_nomem;
-                       mnt->mount_server.address.sin_family = AF_INET;
-                       mnt->mount_server.address.sin_addr.s_addr =
-                                                       in_aton(string);
+                       nfs_parse_server_address(string, (struct sockaddr *)
+                                                &mnt->mount_server.address,
+                                                &mnt->mount_server.addrlen);
                        kfree(string);
                        break;
 
@@ -957,6 +1035,8 @@ static int nfs_parse_mount_options(char *raw,
                }
        }
 
+       nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port);
+
        return 1;
 
 out_nomem:
@@ -987,7 +1067,8 @@ out_unknown:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                         struct nfs_fh *root_fh)
 {
-       struct sockaddr_in sin;
+       struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address;
+       char *hostname;
        int status;
 
        if (args->mount_server.version == 0) {
@@ -997,25 +1078,32 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
                        args->mount_server.version = NFS_MNT_VERSION;
        }
 
+       if (args->mount_server.hostname)
+               hostname = args->mount_server.hostname;
+       else
+               hostname = args->nfs_server.hostname;
+
        /*
         * Construct the mount server's address.
         */
-       if (args->mount_server.address.sin_addr.s_addr != INADDR_ANY)
-               sin = args->mount_server.address;
-       else
-               sin = args->nfs_server.address;
+       if (args->mount_server.address.ss_family == AF_UNSPEC) {
+               memcpy(sap, &args->nfs_server.address,
+                      args->nfs_server.addrlen);
+               args->mount_server.addrlen = args->nfs_server.addrlen;
+       }
+
        /*
         * autobind will be used if mount_server.port == 0
         */
-       sin.sin_port = htons(args->mount_server.port);
+       nfs_set_port(sap, args->mount_server.port);
 
        /*
         * Now ask the mount server to map our export path
         * to a file handle.
         */
-       status = nfs_mount((struct sockaddr *) &sin,
-                          sizeof(sin),
-                          args->nfs_server.hostname,
+       status = nfs_mount(sap,
+                          args->mount_server.addrlen,
+                          hostname,
                           args->nfs_server.export_path,
                           args->mount_server.version,
                           args->mount_server.protocol,
@@ -1023,8 +1111,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
        if (status == 0)
                return 0;
 
-       dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT
-                       ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status);
+       dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+                       hostname, status);
        return status;
 }
 
@@ -1043,9 +1131,6 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
  *
  * + breaking back: trying proto=udp after proto=tcp, v2 after v3,
  *   mountproto=tcp after mountproto=udp, and so on
- *
- * XXX: as far as I can tell, changing the NFS program number is not
- *      supported in the NFS client.
  */
 static int nfs_validate_mount_data(void *options,
                                   struct nfs_parsed_mount_data *args,
@@ -1069,9 +1154,7 @@ static int nfs_validate_mount_data(void *options,
        args->acdirmin          = 30;
        args->acdirmax          = 60;
        args->mount_server.protocol = XPRT_TRANSPORT_UDP;
-       args->mount_server.program = NFS_MNT_PROGRAM;
        args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-       args->nfs_server.program = NFS_PROGRAM;
 
        switch (data->version) {
        case 1:
@@ -1102,9 +1185,6 @@ static int nfs_validate_mount_data(void *options,
                        memset(mntfh->data + mntfh->size, 0,
                               sizeof(mntfh->data) - mntfh->size);
 
-               if (!nfs_verify_server_address((struct sockaddr *) &data->addr))
-                       goto out_no_address;
-
                /*
                 * Translate to nfs_parsed_mount_data, which nfs_fill_super
                 * can deal with.
@@ -1119,7 +1199,14 @@ static int nfs_validate_mount_data(void *options,
                args->acregmax          = data->acregmax;
                args->acdirmin          = data->acdirmin;
                args->acdirmax          = data->acdirmax;
-               args->nfs_server.address = data->addr;
+
+               memcpy(&args->nfs_server.address, &data->addr,
+                      sizeof(data->addr));
+               args->nfs_server.addrlen = sizeof(data->addr);
+               if (!nfs_verify_server_address((struct sockaddr *)
+                                               &args->nfs_server.address))
+                       goto out_no_address;
+
                if (!(data->flags & NFS_MOUNT_TCP))
                        args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
                /* N.B. caller will free nfs_server.hostname in all cases */
@@ -1322,15 +1409,50 @@ static int nfs_set_super(struct super_block *s, void *data)
        return ret;
 }
 
+static int nfs_compare_super_address(struct nfs_server *server1,
+                                    struct nfs_server *server2)
+{
+       struct sockaddr *sap1, *sap2;
+
+       sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr;
+       sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr;
+
+       if (sap1->sa_family != sap2->sa_family)
+               return 0;
+
+       switch (sap1->sa_family) {
+       case AF_INET: {
+               struct sockaddr_in *sin1 = (struct sockaddr_in *)sap1;
+               struct sockaddr_in *sin2 = (struct sockaddr_in *)sap2;
+               if (sin1->sin_addr.s_addr != sin2->sin_addr.s_addr)
+                       return 0;
+               if (sin1->sin_port != sin2->sin_port)
+                       return 0;
+               break;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *sin1 = (struct sockaddr_in6 *)sap1;
+               struct sockaddr_in6 *sin2 = (struct sockaddr_in6 *)sap2;
+               if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
+                       return 0;
+               if (sin1->sin6_port != sin2->sin6_port)
+                       return 0;
+               break;
+       }
+       default:
+               return 0;
+       }
+
+       return 1;
+}
+
 static int nfs_compare_super(struct super_block *sb, void *data)
 {
        struct nfs_sb_mountdata *sb_mntdata = data;
        struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
        int mntflags = sb_mntdata->mntflags;
 
-       if (memcmp(&old->nfs_client->cl_addr,
-                               &server->nfs_client->cl_addr,
-                               sizeof(old->nfs_client->cl_addr)) != 0)
+       if (!nfs_compare_super_address(old, server))
                return 0;
        /* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
        if (old->flags & NFS_MOUNT_UNSHARED)
@@ -1400,6 +1522,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
 out:
        kfree(data.nfs_server.hostname);
+       kfree(data.mount_server.hostname);
        return error;
 
 out_err_nosb:
@@ -1527,6 +1650,28 @@ static void nfs4_fill_super(struct super_block *sb)
        nfs_initialise_sb(sb);
 }
 
+/*
+ * If the user didn't specify a port, set the port number to
+ * the NFS version 4 default port.
+ */
+static void nfs4_default_port(struct sockaddr *sap)
+{
+       switch (sap->sa_family) {
+       case AF_INET: {
+               struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+               if (ap->sin_port == 0)
+                       ap->sin_port = htons(NFS_PORT);
+               break;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
+               if (ap->sin6_port == 0)
+                       ap->sin6_port = htons(NFS_PORT);
+               break;
+       }
+       }
+}
+
 /*
  * Validate NFSv4 mount options
  */
@@ -1534,6 +1679,7 @@ static int nfs4_validate_mount_data(void *options,
                                    struct nfs_parsed_mount_data *args,
                                    const char *dev_name)
 {
+       struct sockaddr_in *ap;
        struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
        char *c;
 
@@ -1554,18 +1700,21 @@ static int nfs4_validate_mount_data(void *options,
 
        switch (data->version) {
        case 1:
-               if (data->host_addrlen != sizeof(args->nfs_server.address))
+               ap = (struct sockaddr_in *)&args->nfs_server.address;
+               if (data->host_addrlen > sizeof(args->nfs_server.address))
                        goto out_no_address;
-               if (copy_from_user(&args->nfs_server.address,
-                                  data->host_addr,
-                                  sizeof(args->nfs_server.address)))
+               if (data->host_addrlen == 0)
+                       goto out_no_address;
+               args->nfs_server.addrlen = data->host_addrlen;
+               if (copy_from_user(ap, data->host_addr, data->host_addrlen))
                        return -EFAULT;
-               if (args->nfs_server.address.sin_port == 0)
-                       args->nfs_server.address.sin_port = htons(NFS_PORT);
                if (!nfs_verify_server_address((struct sockaddr *)
                                                &args->nfs_server.address))
                        goto out_no_address;
 
+               nfs4_default_port((struct sockaddr *)
+                                 &args->nfs_server.address);
+
                switch (data->auth_flavourlen) {
                case 0:
                        args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1623,6 +1772,9 @@ static int nfs4_validate_mount_data(void *options,
                                                &args->nfs_server.address))
                        return -EINVAL;
 
+               nfs4_default_port((struct sockaddr *)
+                                 &args->nfs_server.address);
+
                switch (args->auth_flavor_len) {
                case 0:
                        args->auth_flavors[0] = RPC_AUTH_UNIX;
@@ -1643,21 +1795,16 @@ static int nfs4_validate_mount_data(void *options,
                len = c - dev_name;
                if (len > NFS4_MAXNAMLEN)
                        return -ENAMETOOLONG;
-               args->nfs_server.hostname = kzalloc(len, GFP_KERNEL);
-               if (args->nfs_server.hostname == NULL)
-                       return -ENOMEM;
-               strncpy(args->nfs_server.hostname, dev_name, len - 1);
+               /* N.B. caller will free nfs_server.hostname in all cases */
+               args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
 
                c++;                    /* step over the ':' */
                len = strlen(c);
                if (len > NFS4_MAXPATHLEN)
                        return -ENAMETOOLONG;
-               args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL);
-               if (args->nfs_server.export_path == NULL)
-                       return -ENOMEM;
-               strncpy(args->nfs_server.export_path, c, len);
+               args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
 
-               dprintk("MNTPATH: %s\n", args->nfs_server.export_path);
+               dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
 
                if (args->client_address == NULL)
                        goto out_no_client_address;
index 233ad38161f924095e12e26ed4955337b16cdec6..757415363422da022f2c7197779233d93b86e653 100644
@@ -14,6 +14,8 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 
+#include "internal.h"
+
 struct nfs_unlinkdata {
        struct hlist_node list;
        struct nfs_removeargs args;
@@ -68,24 +70,6 @@ static void nfs_dec_sillycount(struct inode *dir)
                wake_up(&nfsi->waitqueue);
 }
 
-/**
- * nfs_async_unlink_init - Initialize the RPC info
- * task: rpc_task of the sillydelete
- */
-static void nfs_async_unlink_init(struct rpc_task *task, void *calldata)
-{
-       struct nfs_unlinkdata *data = calldata;
-       struct inode *dir = data->dir;
-       struct rpc_message msg = {
-               .rpc_argp = &data->args,
-               .rpc_resp = &data->res,
-               .rpc_cred = data->cred,
-       };
-
-       NFS_PROTO(dir)->unlink_setup(&msg, dir);
-       rpc_call_setup(task, &msg, 0);
-}
-
 /**
  * nfs_async_unlink_done - Sillydelete post-processing
  * @task: rpc_task of the sillydelete
@@ -113,32 +97,45 @@ static void nfs_async_unlink_release(void *calldata)
        struct nfs_unlinkdata   *data = calldata;
 
        nfs_dec_sillycount(data->dir);
+       nfs_sb_deactive(NFS_SERVER(data->dir));
        nfs_free_unlinkdata(data);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
-       .rpc_call_prepare = nfs_async_unlink_init,
        .rpc_call_done = nfs_async_unlink_done,
        .rpc_release = nfs_async_unlink_release,
 };
 
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
 {
+       struct rpc_message msg = {
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = data->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_message = &msg,
+               .callback_ops = &nfs_unlink_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
        struct rpc_task *task;
        struct dentry *alias;
 
        alias = d_lookup(parent, &data->args.name);
        if (alias != NULL) {
                int ret = 0;
+
                /*
                 * Hey, we raced with lookup... See if we need to transfer
                 * the sillyrename information to the aliased dentry.
                 */
                nfs_free_dname(data);
                spin_lock(&alias->d_lock);
-               if (!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+               if (alias->d_inode != NULL &&
+                   !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
                        alias->d_fsdata = data;
-                       alias->d_flags ^= DCACHE_NFSFS_RENAMED;
+                       alias->d_flags |= DCACHE_NFSFS_RENAMED;
                        ret = 1;
                }
                spin_unlock(&alias->d_lock);
@@ -151,10 +148,14 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
                nfs_dec_sillycount(dir);
                return 0;
        }
+       nfs_sb_active(NFS_SERVER(dir));
        data->args.fh = NFS_FH(dir);
        nfs_fattr_init(&data->res.dir_attr);
 
-       task = rpc_run_task(NFS_CLIENT(dir), RPC_TASK_ASYNC, &nfs_unlink_ops, data);
+       NFS_PROTO(dir)->unlink_setup(&msg, dir);
+
+       task_setup_data.rpc_client = NFS_CLIENT(dir);
+       task = rpc_run_task(&task_setup_data);
        if (!IS_ERR(task))
                rpc_put_task(task);
        return 1;
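
Besides checking that the aliased dentry actually has an inode, the hunk above sets DCACHE_NFSFS_RENAMED with |= instead of ^=. Setting a flag with OR is idempotent, while XOR toggles it and would clear the bit if it were ever already set. A tiny standalone demo of the difference follows.

#include <stdio.h>

#define RENAMED 0x01u

int main(void)
{
        unsigned int flags = 0;

        flags |= RENAMED;               /* set   -> 0x01 */
        flags |= RENAMED;               /* still -> 0x01 */
        printf("with |=: %#x\n", flags);

        flags = 0;
        flags ^= RENAMED;               /* set    -> 0x01 */
        flags ^= RENAMED;               /* toggle -> 0x00, flag lost */
        printf("with ^=: %#x\n", flags);
        return 0;
}
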
index 51cc1bd6a116643067b29f4cf8549207e2088c65..5ac5b27b639a85decb94b5d49c72aeb46361a9ff 100644
@@ -196,7 +196,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
        }
        /* Update file length */
        nfs_grow_file(page, offset, count);
-       nfs_unlock_request(req);
+       nfs_clear_page_tag_locked(req);
        return 0;
 }
 
@@ -252,7 +252,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                                struct page *page)
 {
        struct inode *inode = page->mapping->host;
-       struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_page *req;
        int ret;
 
@@ -263,10 +262,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                        spin_unlock(&inode->i_lock);
                        return 0;
                }
-               if (nfs_lock_request_dontget(req))
+               if (nfs_set_page_tag_locked(req))
                        break;
                /* Note: If we hold the page lock, as is the case in nfs_writepage,
-                *       then the call to nfs_lock_request_dontget() will always
+                *       then the call to nfs_set_page_tag_locked() will always
                 *       succeed provided that someone hasn't already marked the
                 *       request as dirty (in which case we don't care).
                 */
@@ -280,7 +279,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
        if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
                /* This request is marked for commit */
                spin_unlock(&inode->i_lock);
-               nfs_unlock_request(req);
+               nfs_clear_page_tag_locked(req);
                nfs_pageio_complete(pgio);
                return 0;
        }
@@ -288,8 +287,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
                spin_unlock(&inode->i_lock);
                BUG();
        }
-       radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-                       NFS_PAGE_TAG_LOCKED);
        spin_unlock(&inode->i_lock);
        nfs_pageio_add_request(pgio, req);
        return 0;
@@ -381,6 +378,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
        set_page_private(req->wb_page, (unsigned long)req);
        nfsi->npages++;
        kref_get(&req->wb_kref);
+       radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
        return 0;
 }
 
@@ -596,7 +594,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
                spin_lock(&inode->i_lock);
                req = nfs_page_find_request_locked(page);
                if (req) {
-                       if (!nfs_lock_request_dontget(req)) {
+                       if (!nfs_set_page_tag_locked(req)) {
                                int error;
 
                                spin_unlock(&inode->i_lock);
@@ -646,7 +644,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
            || req->wb_page != page
            || !nfs_dirty_request(req)
            || offset > rqend || end < req->wb_offset) {
-               nfs_unlock_request(req);
+               nfs_clear_page_tag_locked(req);
                return ERR_PTR(-EBUSY);
        }
 
@@ -755,7 +753,7 @@ static void nfs_writepage_release(struct nfs_page *req)
        nfs_clear_page_tag_locked(req);
 }
 
-static inline int flush_task_priority(int how)
+static int flush_task_priority(int how)
 {
        switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) {
                case FLUSH_HIGHPRI:
@@ -775,15 +773,31 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
                unsigned int count, unsigned int offset,
                int how)
 {
-       struct inode            *inode;
-       int flags;
+       struct inode *inode = req->wb_context->path.dentry->d_inode;
+       int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+       int priority = flush_task_priority(how);
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = req->wb_context->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = NFS_CLIENT(inode),
+               .task = &data->task,
+               .rpc_message = &msg,
+               .callback_ops = call_ops,
+               .callback_data = data,
+               .flags = flags,
+               .priority = priority,
+       };
 
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
 
        data->req = req;
        data->inode = inode = req->wb_context->path.dentry->d_inode;
-       data->cred = req->wb_context->cred;
+       data->cred = msg.rpc_cred;
 
        data->args.fh     = NFS_FH(inode);
        data->args.offset = req_offset(req) + offset;
@@ -791,6 +805,12 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
        data->args.pages  = data->pagevec;
        data->args.count  = count;
        data->args.context = req->wb_context;
+       data->args.stable  = NFS_UNSTABLE;
+       if (how & FLUSH_STABLE) {
+               data->args.stable = NFS_DATA_SYNC;
+               if (!NFS_I(inode)->ncommit)
+                       data->args.stable = NFS_FILE_SYNC;
+       }
 
        data->res.fattr   = &data->fattr;
        data->res.count   = count;
@@ -798,12 +818,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
        nfs_fattr_init(&data->fattr);
 
        /* Set up the initial task struct.  */
-       flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-       rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
-       NFS_PROTO(inode)->write_setup(data, how);
-
-       data->task.tk_priority = flush_task_priority(how);
-       data->task.tk_cookie = (unsigned long)inode;
+       NFS_PROTO(inode)->write_setup(data, &msg);
 
        dprintk("NFS: %5u initiated write call "
                "(req %s/%Ld, %u bytes @ offset %Lu)\n",
@@ -812,16 +827,10 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
                (long long)NFS_FILEID(inode),
                count,
                (unsigned long long)data->args.offset);
-}
-
-static void nfs_execute_write(struct nfs_write_data *data)
-{
-       struct rpc_clnt *clnt = NFS_CLIENT(data->inode);
-       sigset_t oldset;
 
-       rpc_clnt_sigmask(clnt, &oldset);
-       rpc_execute(&data->task);
-       rpc_clnt_sigunmask(clnt, &oldset);
+       task = rpc_run_task(&task_setup_data);
+       if (!IS_ERR(task))
+               rpc_put_task(task);
 }
 
 /*
@@ -868,7 +877,6 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
                                   wsize, offset, how);
                offset += wsize;
                nbytes -= wsize;
-               nfs_execute_write(data);
        } while (nbytes != 0);
 
        return 0;
@@ -916,7 +924,6 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
        /* Set up the argument struct */
        nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
 
-       nfs_execute_write(data);
        return 0;
  out_bad:
        while (!list_empty(head)) {
@@ -932,7 +939,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
                                  struct inode *inode, int ioflags)
 {
-       int wsize = NFS_SERVER(inode)->wsize;
+       size_t wsize = NFS_SERVER(inode)->wsize;
 
        if (wsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
@@ -1146,19 +1153,33 @@ static void nfs_commit_rpcsetup(struct list_head *head,
                struct nfs_write_data *data,
                int how)
 {
-       struct nfs_page         *first;
-       struct inode            *inode;
-       int flags;
+       struct nfs_page *first = nfs_list_entry(head->next);
+       struct inode *inode = first->wb_context->path.dentry->d_inode;
+       int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+       int priority = flush_task_priority(how);
+       struct rpc_task *task;
+       struct rpc_message msg = {
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+               .rpc_cred = first->wb_context->cred,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .task = &data->task,
+               .rpc_client = NFS_CLIENT(inode),
+               .rpc_message = &msg,
+               .callback_ops = &nfs_commit_ops,
+               .callback_data = data,
+               .flags = flags,
+               .priority = priority,
+       };
 
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
 
        list_splice_init(head, &data->pages);
-       first = nfs_list_entry(data->pages.next);
-       inode = first->wb_context->path.dentry->d_inode;
 
        data->inode       = inode;
-       data->cred        = first->wb_context->cred;
+       data->cred        = msg.rpc_cred;
 
        data->args.fh     = NFS_FH(data->inode);
        /* Note: we always request a commit of the entire inode */
@@ -1170,14 +1191,13 @@ static void nfs_commit_rpcsetup(struct list_head *head,
        nfs_fattr_init(&data->fattr);
 
        /* Set up the initial task struct.  */
-       flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-       rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data);
-       NFS_PROTO(inode)->commit_setup(data, how);
+       NFS_PROTO(inode)->commit_setup(data, &msg);
 
-       data->task.tk_priority = flush_task_priority(how);
-       data->task.tk_cookie = (unsigned long)inode;
-       
        dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+       task = rpc_run_task(&task_setup_data);
+       if (!IS_ERR(task))
+               rpc_put_task(task);
 }
 
 /*
@@ -1197,7 +1217,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
        /* Set up the argument struct */
        nfs_commit_rpcsetup(head, data, how);
 
-       nfs_execute_write(data);
        return 0;
  out_bad:
        while (!list_empty(head)) {
index 0a0b79b01d059a1c71b632de1807100578be8269..1577a7391d23fa47cd74fb5089278dba81daee55 100644
@@ -1031,7 +1031,11 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                        goto out_release;
        }
 
+done:
        pipe->nrbufs = pipe->curbuf = 0;
+       if (bytes > 0)
+               file_accessed(in);
+
        return bytes;
 
 out_release:
@@ -1047,16 +1051,11 @@ out_release:
                        buf->ops = NULL;
                }
        }
-       pipe->nrbufs = pipe->curbuf = 0;
-
-       /*
-        * If we transferred some data, return the number of bytes:
-        */
-       if (bytes > 0)
-               return bytes;
 
-       return ret;
+       if (!bytes)
+               bytes = ret;
 
+       goto done;
 }
 EXPORT_SYMBOL(splice_direct_to_actor);
 
diff --git a/include/acpi/reboot.h b/include/acpi/reboot.h
new file mode 100644
index 0000000..8857f57
--- /dev/null
@@ -0,0 +1,9 @@
+
+/*
+ * Dummy placeholder to make the EFI patches apply to the x86 tree.
+ * Andrew/Len, please just kill this file if you encounter it.
+ */
+#ifndef acpi_reboot
+# define acpi_reboot() do { } while (0)
+#endif
+
index ef855a3bc0f54eed5048af5cf636d4c365bf29e7..26c17913529366ef0a5930b073c0e1f4648b6b70 100644
@@ -7,7 +7,6 @@
 
 #define map_page_into_agp(page) 
 #define unmap_page_from_agp(page) 
-#define flush_agp_mappings() 
 #define flush_agp_cache() mb()
 
 /* Convert a physical address to an address suitable for the GART. */
index d56fedbb457ab5aa5c5bd7a63affbd10afae6161..2632328d8646840aab41c8e401a1088e922754f2 100644
@@ -31,14 +31,19 @@ struct bug_entry {
 #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0)
 #endif
 
-#ifndef HAVE_ARCH_WARN_ON
+#ifndef __WARN
+#ifndef __ASSEMBLY__
+extern void warn_on_slowpath(const char *file, const int line);
+#define WANT_WARN_ON_SLOWPATH
+#endif
+#define __WARN() warn_on_slowpath(__FILE__, __LINE__)
+#endif
+
+#ifndef WARN_ON
 #define WARN_ON(condition) ({                                          \
        int __ret_warn_on = !!(condition);                              \
-       if (unlikely(__ret_warn_on)) {                                  \
-               printk("WARNING: at %s:%d %s()\n", __FILE__,            \
-                       __LINE__, __FUNCTION__);                        \
-               dump_stack();                                           \
-       }                                                               \
+       if (unlikely(__ret_warn_on))                                    \
+               __WARN();                                               \
        unlikely(__ret_warn_on);                                        \
 })
 #endif
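
The rework routes the generic WARN_ON() through a __WARN() hook backed by warn_on_slowpath(), so an architecture can supply its own __WARN before including asm-generic/bug.h. A minimal sketch of both sides, with hypothetical names for anything not shown in the hunk:

    /* Hypothetical arch override, defined before <asm-generic/bug.h>: */
    #define __WARN()  my_arch_warn(__FILE__, __LINE__)

    /* Callers are unchanged and still get the condition's value back: */
    static inline int clamp_len(int len)
    {
            if (WARN_ON(len < 0))   /* evaluates to unlikely(!!(len < 0)) */
                    return 0;
            return len;
    }
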
index d85172e9ed4580afa55c333e49fcb2ee925403db..4b8d31cda1a01b1c9bee8e5c4b1810c89fdd0119 100644 (file)
@@ -3,54 +3,79 @@
 #include <linux/compiler.h>
 #include <linux/threads.h>
 
-#define __GENERIC_PER_CPU
+/*
+ * Determine the real variable name from the name visible in the
+ * kernel sources.
+ */
+#define per_cpu_var(var) per_cpu__##var
+
 #ifdef CONFIG_SMP
 
+/*
+ * per_cpu_offset() is the offset that has to be added to a
+ * percpu variable to get to the instance for a certain processor.
+ *
+ * Most arches use the __per_cpu_offset array for those offsets but
+ * some arches have their own ways of determining the offset (x86_64, s390).
+ */
+#ifndef __per_cpu_offset
 extern unsigned long __per_cpu_offset[NR_CPUS];
 
 #define per_cpu_offset(x) (__per_cpu_offset[x])
+#endif
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
-#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id())
-
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)                     \
-do {                                                           \
-       unsigned int __i;                                       \
-       for_each_possible_cpu(__i)                              \
-               memcpy((pcpudst)+__per_cpu_offset[__i],         \
-                      (src), (size));                          \
-} while (0)
-#else /* ! SMP */
+/*
+ * Determine the offset for the currently active processor.
+ * An arch may define __my_cpu_offset to provide a more effective
+ * means of obtaining the offset to the per cpu variables of the
+ * current processor.
+ */
+#ifndef __my_cpu_offset
+#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
+#define my_cpu_offset per_cpu_offset(smp_processor_id())
+#else
+#define my_cpu_offset __my_cpu_offset
+#endif
+
+/*
+ * Add an offset to a pointer but keep the pointer as is.
+ *
+ * Only S390 provides its own means of moving the pointer.
+ */
+#ifndef SHIFT_PERCPU_PTR
+#define SHIFT_PERCPU_PTR(__p, __offset)        RELOC_HIDE((__p), (__offset))
+#endif
 
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
+/*
+ * A percpu variable may point to a discarded region. The following are
+ * established ways to produce a usable pointer from the percpu variable
+ * offset.
+ */
+#define per_cpu(var, cpu) \
+       (*SHIFT_PERCPU_PTR(&per_cpu_var(var), per_cpu_offset(cpu)))
+#define __get_cpu_var(var) \
+       (*SHIFT_PERCPU_PTR(&per_cpu_var(var), my_cpu_offset))
+#define __raw_get_cpu_var(var) \
+       (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset))
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
 
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
+extern void setup_per_cpu_areas(void);
+#endif
+
+#else /* ! SMP */
+
+#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu_var(var)))
+#define __get_cpu_var(var)                     per_cpu_var(var)
+#define __raw_get_cpu_var(var)                 per_cpu_var(var)
 
 #endif /* SMP */
 
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
+#ifndef PER_CPU_ATTRIBUTES
+#define PER_CPU_ATTRIBUTES
+#endif
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
+                                       __typeof__(type) per_cpu_var(name)
 
 #endif /* _ASM_GENERIC_PERCPU_H_ */
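
The rewritten header builds every accessor out of per_cpu_var(), per_cpu_offset()/my_cpu_offset and SHIFT_PERCPU_PTR(), so an architecture only overrides the pieces it needs (for example __my_cpu_offset). A small usage sketch, assuming DEFINE_PER_CPU is now provided by linux/percpu.h and using a hypothetical variable name:

    DEFINE_PER_CPU(int, demo_counter);              /* hypothetical variable */

    static void touch_counters(int cpu)
    {
            per_cpu(demo_counter, cpu)++;            /* another CPU's instance */
            __get_cpu_var(demo_counter)++;           /* this CPU's; caller must
                                                        prevent migration */
    }
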
index 75f2bfab614f40639090702a2a6268f34864df75..6ce9f3ab928da00ba2f233b9439fdcb8b3cfce7c 100644 (file)
@@ -15,7 +15,6 @@
 
 #include <linux/swap.h>
 #include <linux/quicklist.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
 /*
index 76df771be58566171cd5b74ab16489ae1c5f2cea..f784d2f341496f1ba095ef662720d33916208b52 100644 (file)
                VMLINUX_SYMBOL(__start___param) = .;                    \
                *(__param)                                              \
                VMLINUX_SYMBOL(__stop___param) = .;                     \
+               . = ALIGN((align));                                     \
                VMLINUX_SYMBOL(__end_rodata) = .;                       \
        }                                                               \
        . = ALIGN((align));
index 81bcd5e517898af73ac5b096c8e1eed682968f6d..cd1cc39b5599b639441bbb00b050d40675889a2e 100644 (file)
@@ -127,6 +127,8 @@ extern int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
 extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
 #endif
 
+#define acpi_unlazy_tlb(x)
+
 #endif /*__KERNEL__*/
 
 #endif /*_ASM_ACPI_H*/
index 4e517f0e6afa34e81e9aecdeee51bf0eaec2b967..c11fdd8ab4d7a88665c757a81c6e8c1a471bdf90 100644 (file)
@@ -15,7 +15,6 @@
  */
 #define map_page_into_agp(page)                /* nothing */
 #define unmap_page_from_agp(page)      /* nothing */
-#define flush_agp_mappings()           /* nothing */
 #define flush_agp_cache()              mb()
 
 /* Convert a physical address to an address suitable for the GART. */
index c4f1e328a5ba845f0635e42f0669b137369a604d..0095bcf798484d0988ff086bc0c5d0a167cd08ab 100644 (file)
 #include <linux/threads.h>
 
 #ifdef HAVE_MODEL_SMALL_ATTRIBUTE
-# define __SMALL_ADDR_AREA     __attribute__((__model__ (__small__)))
-#else
-# define __SMALL_ADDR_AREA
+# define PER_CPU_ATTRIBUTES    __attribute__((__model__ (__small__)))
 #endif
 
 #define DECLARE_PER_CPU(type, name)                            \
-       extern __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name)                             \
-       __attribute__((__section__(".data.percpu")))            \
-       __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name
-
-#ifdef CONFIG_SMP
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
-       __attribute__((__section__(".data.percpu.shared_aligned")))     \
-       __SMALL_ADDR_AREA __typeof__(type) per_cpu__##name              \
-       ____cacheline_aligned_in_smp
-#else
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-       DEFINE_PER_CPU(type, name)
-#endif
+       extern PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
 
 /*
  * Pretty much a literal copy of asm-generic/percpu.h, except that percpu_modcopy() is an
@@ -68,9 +51,6 @@ extern void *per_cpu_init(void);
 
 #endif /* SMP */
 
-#define EXPORT_PER_CPU_SYMBOL(var)             EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var)         EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 /*
  * Be extremely careful when taking the address of this variable!  Due to virtual
  * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
index 937258686ba54982ebb7b94149a513ba21090241..1a607066bc645ce0a6767702640438930402ebd3 100644 (file)
@@ -157,7 +157,7 @@ typedef struct sigaltstack {
 #undef __HAVE_ARCH_SIG_BITOPS
 
 struct pt_regs;
-extern int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset));
+extern int do_signal(struct pt_regs *regs, sigset_t *oldset);
 
 #define ptrace_signal_deliver(regs, cookie)    do { } while (0)
 
index 9f61d4eb6c01fd1fd61f25e0c7fbc16ddf54a891..9651660da639a32b6c0d20cc11aaafff2fce549b 100644 (file)
@@ -9,7 +9,6 @@
 
 #define map_page_into_agp(page)                /* nothing */
 #define unmap_page_from_agp(page)      /* nothing */
-#define flush_agp_mappings()           /* nothing */
 #define flush_agp_cache()              mb()
 
 /* Convert a physical address to an address suitable for the GART. */
index e5ccaca2f5a42a0d379281536601766594ccec49..86455c4c31eea851f84b73ddbdc126581ff49c92 100644 (file)
@@ -6,7 +6,6 @@
 
 #define map_page_into_agp(page)
 #define unmap_page_from_agp(page)
-#define flush_agp_mappings()
 #define flush_agp_cache() mb()
 
 /* Convert a physical address to an address suitable for the GART. */
index 6b229626d3ff8a483a7875068ffcdf3dd29fff08..cc1cbf656b02f343e60199c79ec50fb26cb53665 100644 (file)
 #define __my_cpu_offset() get_paca()->data_offset
 #define per_cpu_offset(x) (__per_cpu_offset(x))
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
@@ -43,11 +34,6 @@ extern void setup_per_cpu_areas(void);
 
 #else /* ! SMP */
 
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
 #define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)                     per_cpu__##var
 #define __raw_get_cpu_var(var)                 per_cpu__##var
@@ -56,9 +42,6 @@ extern void setup_per_cpu_areas(void);
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 #else
 #include <asm-generic/percpu.h>
 #endif
index 13fccc5a41197e96f1e1f8e4163df90583d2b0b9..3063363f6799e3b655a59bf4d040de966e607888 100644 (file)
@@ -119,6 +119,13 @@ do {                                                                             \
 } while (0)
 #endif /* __powerpc64__ */
 
+/*
+ * These are defined as per linux/ptrace.h, which see.
+ */
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
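
arch_has_single_step() and the user_{enable,disable}_single_step() helpers are the hooks generic ptrace code uses once an architecture opts in, roughly as sketched below (a simplified picture, not a quote of kernel/ptrace.c):

    static int resume_traced_child(struct task_struct *child, long request)
    {
            if (request == PTRACE_SINGLESTEP) {
                    if (!arch_has_single_step())
                            return -EIO;
                    user_enable_single_step(child);
            } else {
                    user_disable_single_step(child);
            }
            wake_up_process(child);
            return 0;
    }
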
index 545857e6444376352344db9c3f328d5125847105..2d676a873858cf599fbc1b679e6ceccd6a112fc3 100644 (file)
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/lowcore.h>
 
-#define __GENERIC_PER_CPU
-
 /*
  * s390 uses its own implementation for per cpu data, the offset of
  * the cpu local data area is cached in the cpu's lowcore memory.
 
 extern unsigned long __per_cpu_offset[NR_CPUS];
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) \
-    __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
 #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset)
 #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu])
@@ -62,11 +50,6 @@ do {                                                         \
 
 #else /* ! SMP */
 
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
 #define __get_cpu_var(var) __reloc_hide(var,0)
 #define __raw_get_cpu_var(var) __reloc_hide(var,0)
 #define per_cpu(var,cpu) __reloc_hide(var,0)
@@ -75,7 +58,4 @@ do {                                                          \
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 #endif /* __ARCH_S390_PERCPU__ */
index 58f8cb6ae767d536769a8a43ab46564a5665b96b..e9fcf0e781ea11cd1644dc9dc3e96aa6519fa0e6 100644 (file)
@@ -5,7 +5,6 @@
 
 #define map_page_into_agp(page) 
 #define unmap_page_from_agp(page) 
-#define flush_agp_mappings() 
 #define flush_agp_cache() mb()
 
 /* Convert a physical address to an address suitable for the GART. */
index a1f53a4da405dd7496bee581e88a609a57455756..c7e52decba9863e5e821872d293f6f79b4fc7ec2 100644 (file)
@@ -16,15 +16,6 @@ extern unsigned long __per_cpu_shift;
        (__per_cpu_base + ((unsigned long)(__cpu) << __per_cpu_shift))
 #define per_cpu_offset(x) (__per_cpu_offset(x))
 
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset))
@@ -41,10 +32,6 @@ do {                                                         \
 #else /* ! SMP */
 
 #define real_setup_per_cpu_areas()             do { } while (0)
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
 
 #define per_cpu(var, cpu)                      (*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)                     per_cpu__##var
@@ -54,7 +41,4 @@ do {                                                          \
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
 #endif /* __ARCH_SPARC64_PERCPU__ */
diff --git a/include/asm-um/asm.h b/include/asm-um/asm.h
new file mode 100644 (file)
index 0000000..af1269a
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __UM_ASM_H
+#define __UM_ASM_H
+
+#include "asm/arch/asm.h"
+
+#endif
index 78b862472b363e9bde4c01e43b3f69d81697bd7d..cdb3024a699a6d83df5f49ccd8e63a3e3adb362b 100644 (file)
@@ -6,7 +6,6 @@
 
 /* <linux/linkage.h> will pick sane defaults */
 #ifdef CONFIG_GPROF
-#undef FASTCALL
 #undef fastcall
 #endif
 
diff --git a/include/asm-um/nops.h b/include/asm-um/nops.h
new file mode 100644 (file)
index 0000000..814e9bf
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef __UM_NOPS_H
+#define __UM_NOPS_H
+
+#include "asm/arch/nops.h"
+
+#endif
index 12db5a1cdd7431686403f664716e2997267db0cb..3c6f0f80e827cdd96986236ed98880561185ad74 100644 (file)
@@ -3,21 +3,20 @@ include include/asm-generic/Kbuild.asm
 header-y += boot.h
 header-y += bootparam.h
 header-y += debugreg.h
+header-y += kvm.h
 header-y += ldt.h
 header-y += msr-index.h
 header-y += prctl.h
 header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
-header-y += vsyscall32.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
 unifdef-y += mce.h
 unifdef-y += msr.h
 unifdef-y += mtrr.h
-unifdef-y += page_32.h
-unifdef-y += page_64.h
+unifdef-y += page.h
 unifdef-y += posix_types_32.h
 unifdef-y += posix_types_64.h
 unifdef-y += ptrace.h
index f8a89793ac8cbb865dbedb2b70181236d2a4288d..98a9ca266531e5c823f125d72f5445f7474d4a5d 100644 (file)
 #ifndef _ASM_X86_ACPI_H
 #define _ASM_X86_ACPI_H
 
-#ifdef CONFIG_X86_32
-# include "acpi_32.h"
-#else
-# include "acpi_64.h"
-#endif
+/*
+ *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <acpi/pdc_intel.h>
 
+#include <asm/numa.h>
 #include <asm/processor.h>
+#include <asm/mmu.h>
+
+#define COMPILER_DEPENDENT_INT64   long long
+#define COMPILER_DEPENDENT_UINT64  unsigned long long
+
+/*
+ * Calling conventions:
+ *
+ * ACPI_SYSTEM_XFACE        - Interfaces to host OS (handlers, threads)
+ * ACPI_EXTERNAL_XFACE      - External ACPI interfaces
+ * ACPI_INTERNAL_XFACE      - Internal ACPI interfaces
+ * ACPI_INTERNAL_VAR_XFACE  - Internal variable-parameter list interfaces
+ */
+#define ACPI_SYSTEM_XFACE
+#define ACPI_EXTERNAL_XFACE
+#define ACPI_INTERNAL_XFACE
+#define ACPI_INTERNAL_VAR_XFACE
+
+/* Asm macros */
+
+#define ACPI_ASM_MACROS
+#define BREAKPOINT3
+#define ACPI_DISABLE_IRQS() local_irq_disable()
+#define ACPI_ENABLE_IRQS()  local_irq_enable()
+#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+       ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+       ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+       asm("divl %2;"                               \
+           :"=a"(q32), "=d"(r32)                    \
+           :"r"(d32),                               \
+            "0"(n_lo), "1"(n_hi))
+
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+       asm("shrl   $1,%2       ;"      \
+           "rcrl   $1,%3;"             \
+           :"=r"(n_hi), "=r"(n_lo)     \
+           :"0"(n_hi), "1"(n_lo))
+
+#ifdef CONFIG_ACPI
+extern int acpi_lapic;
+extern int acpi_ioapic;
+extern int acpi_noirq;
+extern int acpi_strict;
+extern int acpi_disabled;
+extern int acpi_ht;
+extern int acpi_pci_disabled;
+extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
+
+static inline void disable_acpi(void)
+{
+       acpi_disabled = 1;
+       acpi_ht = 0;
+       acpi_pci_disabled = 1;
+       acpi_noirq = 1;
+}
+
+/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
+#define FIX_ACPI_PAGES 4
+
+extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+
+static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
+static inline void acpi_disable_pci(void)
+{
+       acpi_pci_disabled = 1;
+       acpi_noirq_set();
+}
+extern int acpi_irq_balance_set(char *str);
+
+/* routines for saving/restoring kernel state */
+extern int acpi_save_state_mem(void);
+extern void acpi_restore_state_mem(void);
+
+extern unsigned long acpi_wakeup_address;
+
+/* early initialization routine */
+extern void acpi_reserve_bootmem(void);
 
 /*
  * Check if the CPU can handle C2 and deeper
@@ -29,4 +139,35 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
                return max_cstate;
 }
 
+#else /* !CONFIG_ACPI */
+
+#define acpi_lapic 0
+#define acpi_ioapic 0
+static inline void acpi_noirq_set(void) { }
+static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }
+
+#endif /* !CONFIG_ACPI */
+
+#define ARCH_HAS_POWER_INIT    1
+
+struct bootnode;
+
+#ifdef CONFIG_ACPI_NUMA
+extern int acpi_numa;
+extern int acpi_scan_nodes(unsigned long start, unsigned long end);
+#ifdef CONFIG_X86_64
+# define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+#endif
+extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
+                                  int num_nodes);
+#else
+static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
+                                  int num_nodes)
+{
+}
 #endif
+
+#define acpi_unlazy_tlb(x)     leave_mm(x)
+
+#endif /*__X86_ASM_ACPI_H*/
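
Because the !CONFIG_ACPI branch provides empty stubs for disable_acpi(), acpi_noirq_set() and friends, and acpi_lapic/acpi_ioapic collapse to 0, callers can stay unconditional. A hedged illustration; the function and message are invented for the example:

    static void __init maybe_disable_acpi(int bios_is_broken)
    {
            if (bios_is_broken)
                    disable_acpi();          /* empty stub when !CONFIG_ACPI */

            if (!acpi_lapic)
                    printk(KERN_INFO "ACPI: no local APIC reported\n");
    }
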
diff --git a/include/asm-x86/acpi_32.h b/include/asm-x86/acpi_32.h
deleted file mode 100644 (file)
index 723493e..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- *  asm-i386/acpi.h
- *
- *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
-  *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#ifndef _ASM_ACPI_H
-#define _ASM_ACPI_H
-
-#ifdef __KERNEL__
-
-#include <acpi/pdc_intel.h>
-
-#include <asm/system.h>                /* defines cmpxchg */
-
-#define COMPILER_DEPENDENT_INT64   long long
-#define COMPILER_DEPENDENT_UINT64  unsigned long long
-
-/*
- * Calling conventions:
- *
- * ACPI_SYSTEM_XFACE        - Interfaces to host OS (handlers, threads)
- * ACPI_EXTERNAL_XFACE      - External ACPI interfaces
- * ACPI_INTERNAL_XFACE      - Internal ACPI interfaces
- * ACPI_INTERNAL_VAR_XFACE  - Internal variable-parameter list interfaces
- */
-#define ACPI_SYSTEM_XFACE
-#define ACPI_EXTERNAL_XFACE
-#define ACPI_INTERNAL_XFACE
-#define ACPI_INTERNAL_VAR_XFACE
-
-/* Asm macros */
-
-#define ACPI_ASM_MACROS
-#define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS()  local_irq_enable()
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-
-int __acpi_acquire_global_lock(unsigned int *lock);
-int __acpi_release_global_lock(unsigned int *lock);
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
-       ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
-       ((Acq) = __acpi_release_global_lock(&facs->global_lock))
-
-/*
- * Math helper asm macros
- */
-#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
-        asm("divl %2;"        \
-        :"=a"(q32), "=d"(r32) \
-        :"r"(d32),            \
-        "0"(n_lo), "1"(n_hi))
-
-
-#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
-    asm("shrl   $1,%2;"             \
-        "rcrl   $1,%3;"             \
-        :"=r"(n_hi), "=r"(n_lo)     \
-        :"0"(n_hi), "1"(n_lo))
-
-extern void early_quirks(void);
-
-#ifdef CONFIG_ACPI
-extern int acpi_lapic;
-extern int acpi_ioapic;
-extern int acpi_noirq;
-extern int acpi_strict;
-extern int acpi_disabled;
-extern int acpi_ht;
-extern int acpi_pci_disabled;
-static inline void disable_acpi(void)
-{
-       acpi_disabled = 1;
-       acpi_ht = 0;
-       acpi_pci_disabled = 1;
-       acpi_noirq = 1;
-}
-
-/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
-#define FIX_ACPI_PAGES 4
-
-extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
-
-#ifdef CONFIG_X86_IO_APIC
-extern int acpi_skip_timer_override;
-extern int acpi_use_timer_override;
-#endif
-
-static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
-static inline void acpi_disable_pci(void)
-{
-       acpi_pci_disabled = 1;
-       acpi_noirq_set();
-}
-extern int acpi_irq_balance_set(char *str);
-
-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern void acpi_restore_state_mem(void);
-
-extern unsigned long acpi_wakeup_address;
-
-/* early initialization routine */
-extern void acpi_reserve_bootmem(void);
-
-#else  /* !CONFIG_ACPI */
-
-#define acpi_lapic 0
-#define acpi_ioapic 0
-static inline void acpi_noirq_set(void) { }
-static inline void acpi_disable_pci(void) { }
-static inline void disable_acpi(void) { }
-
-#endif /* !CONFIG_ACPI */
-
-#define ARCH_HAS_POWER_INIT    1
-
-#endif /*__KERNEL__*/
-
-#endif /*_ASM_ACPI_H*/
diff --git a/include/asm-x86/acpi_64.h b/include/asm-x86/acpi_64.h
deleted file mode 100644 (file)
index 9817335..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- *  asm-x86_64/acpi.h
- *
- *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
-  *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#ifndef _ASM_ACPI_H
-#define _ASM_ACPI_H
-
-#ifdef __KERNEL__
-
-#include <acpi/pdc_intel.h>
-#include <asm/numa.h>
-
-#define COMPILER_DEPENDENT_INT64   long long
-#define COMPILER_DEPENDENT_UINT64  unsigned long long
-
-/*
- * Calling conventions:
- *
- * ACPI_SYSTEM_XFACE        - Interfaces to host OS (handlers, threads)
- * ACPI_EXTERNAL_XFACE      - External ACPI interfaces
- * ACPI_INTERNAL_XFACE      - Internal ACPI interfaces
- * ACPI_INTERNAL_VAR_XFACE  - Internal variable-parameter list interfaces
- */
-#define ACPI_SYSTEM_XFACE
-#define ACPI_EXTERNAL_XFACE
-#define ACPI_INTERNAL_XFACE
-#define ACPI_INTERNAL_VAR_XFACE
-
-/* Asm macros */
-
-#define ACPI_ASM_MACROS
-#define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS()  local_irq_enable()
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-
-int __acpi_acquire_global_lock(unsigned int *lock);
-int __acpi_release_global_lock(unsigned int *lock);
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
-       ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
-       ((Acq) = __acpi_release_global_lock(&facs->global_lock))
-
-/*
- * Math helper asm macros
- */
-#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
-        asm("divl %2;"        \
-        :"=a"(q32), "=d"(r32) \
-        :"r"(d32),            \
-        "0"(n_lo), "1"(n_hi))
-
-
-#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
-    asm("shrl   $1,%2;"             \
-        "rcrl   $1,%3;"             \
-        :"=r"(n_hi), "=r"(n_lo)     \
-        :"0"(n_hi), "1"(n_lo))
-
-#ifdef CONFIG_ACPI
-extern int acpi_lapic;
-extern int acpi_ioapic;
-extern int acpi_noirq;
-extern int acpi_strict;
-extern int acpi_disabled;
-extern int acpi_pci_disabled;
-extern int acpi_ht;
-static inline void disable_acpi(void)
-{
-       acpi_disabled = 1;
-       acpi_ht = 0;
-       acpi_pci_disabled = 1;
-       acpi_noirq = 1;
-}
-
-/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
-#define FIX_ACPI_PAGES 4
-
-extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
-static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
-static inline void acpi_disable_pci(void)
-{
-       acpi_pci_disabled = 1;
-       acpi_noirq_set();
-}
-extern int acpi_irq_balance_set(char *str);
-
-/* routines for saving/restoring kernel state */
-extern int acpi_save_state_mem(void);
-extern void acpi_restore_state_mem(void);
-
-extern unsigned long acpi_wakeup_address;
-
-/* early initialization routine */
-extern void acpi_reserve_bootmem(void);
-
-#else  /* !CONFIG_ACPI */
-
-#define acpi_lapic 0
-#define acpi_ioapic 0
-static inline void acpi_noirq_set(void) { }
-static inline void acpi_disable_pci(void) { }
-
-#endif /* !CONFIG_ACPI */
-
-extern int acpi_numa;
-extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-
-extern int acpi_disabled;
-extern int acpi_pci_disabled;
-
-#define ARCH_HAS_POWER_INIT 1
-
-extern int acpi_skip_timer_override;
-extern int acpi_use_timer_override;
-
-#ifdef CONFIG_ACPI_NUMA
-extern void __init acpi_fake_nodes(const struct bootnode *fake_nodes,
-                                  int num_nodes);
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
-                                  int num_nodes)
-{
-}
-#endif
-
-#endif /*__KERNEL__*/
-
-#endif /*_ASM_ACPI_H*/
index 62df2a9e7130939886667618ebbab022febab820..e4004a9f6a9a53fa03c7b5ce6db80809180dc703 100644 (file)
  * page. This avoids data corruption on some CPUs.
  */
 
-/*
- * Caller's responsibility to call global_flush_tlb() for performance
- * reasons
- */
-#define map_page_into_agp(page) change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)
-#define unmap_page_from_agp(page) change_page_attr(page, 1, PAGE_KERNEL)
-#define flush_agp_mappings() global_flush_tlb()
+#define map_page_into_agp(page) set_pages_uc(page, 1)
+#define unmap_page_from_agp(page) set_pages_wb(page, 1)
 
 /*
  * Could use CLFLUSH here if the cpu supports it. But then it would
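
With map_page_into_agp()/unmap_page_from_agp() now built on set_pages_uc()/set_pages_wb(), the separate flush_agp_mappings()/global_flush_tlb() step disappears, which is why the other agp.h variants above simply drop that macro. A hypothetical driver-side sequence, for illustration only:

    static void hypothetical_agp_cycle(void)
    {
            struct page *page = alloc_page(GFP_KERNEL);

            if (!page)
                    return;
            map_page_into_agp(page);        /* set_pages_uc(page, 1) on x86 */
            /* ... GART references the uncached page here ... */
            unmap_page_from_agp(page);      /* set_pages_wb(page, 1) */
            __free_page(page);
    }
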
index 9eef6a32a130490368c726a6587f53a7858e586f..d8bacf3c4b08765f674791b39603d900f5983f80 100644 (file)
@@ -1,5 +1,161 @@
-#ifdef CONFIG_X86_32
-# include "alternative_32.h"
+#ifndef _ASM_X86_ALTERNATIVE_H
+#define _ASM_X86_ALTERNATIVE_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/asm.h>
+
+/*
+ * Alternative inline assembly for SMP.
+ *
+ * The LOCK_PREFIX macro defined here replaces the LOCK and
+ * LOCK_PREFIX macros used everywhere in the source tree.
+ *
+ * SMP alternatives use the same data structures as the other
+ * alternatives and the X86_FEATURE_UP flag to indicate the case of a
+ * UP system running a SMP kernel.  The existing apply_alternatives()
+ * works fine for patching a SMP kernel for UP.
+ *
+ * The SMP alternative tables can be kept after boot and contain both
+ * UP and SMP versions of the instructions to allow switching back to
+ * SMP at runtime, when hotplugging in a new CPU, which is especially
+ * useful in virtualized environments.
+ *
+ * The very common lock prefix is handled as special case in a
+ * separate table which is a pure address list without replacement ptr
+ * and size information.  That keeps the table sizes small.
+ */
+
+#ifdef CONFIG_SMP
+#define LOCK_PREFIX \
+               ".section .smp_locks,\"a\"\n"   \
+               _ASM_ALIGN "\n"                 \
+               _ASM_PTR "661f\n" /* address */ \
+               ".previous\n"                   \
+               "661:\n\tlock; "
+
+#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX ""
+#endif
+
+/* This must be included *after* the definition of LOCK_PREFIX */
+#include <asm/cpufeature.h>
+
+struct alt_instr {
+       u8 *instr;              /* original instruction */
+       u8 *replacement;
+       u8  cpuid;              /* cpuid bit set for replacement */
+       u8  instrlen;           /* length of original instruction */
+       u8  replacementlen;     /* length of new instruction, <= instrlen */
+       u8  pad1;
+#ifdef CONFIG_X86_64
+       u32 pad2;
+#endif
+};
+
+extern void alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+struct module;
+
+#ifdef CONFIG_SMP
+extern void alternatives_smp_module_add(struct module *mod, char *name,
+                                       void *locks, void *locks_end,
+                                       void *text, void *text_end);
+extern void alternatives_smp_module_del(struct module *mod);
+extern void alternatives_smp_switch(int smp);
+#else
+static inline void alternatives_smp_module_add(struct module *mod, char *name,
+                                       void *locks, void *locks_end,
+                                       void *text, void *text_end) {}
+static inline void alternatives_smp_module_del(struct module *mod) {}
+static inline void alternatives_smp_switch(int smp) {}
+#endif /* CONFIG_SMP */
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * length of oldinstr must be longer than or equal to the length of newinstr
+ * It can be padded with nops as needed.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature)                       \
+       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
+                     ".section .altinstructions,\"a\"\n"               \
+                     _ASM_ALIGN "\n"                                   \
+                     _ASM_PTR "661b\n"         /* label */             \
+                     _ASM_PTR "663f\n"         /* new instruction */   \
+                     "  .byte %c0\n"           /* feature bit */       \
+                     "  .byte 662b-661b\n"     /* sourcelen */         \
+                     "  .byte 664f-663f\n"     /* replacementlen */    \
+                     ".previous\n"                                     \
+                     ".section .altinstr_replacement,\"ax\"\n"         \
+                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+                     ".previous" :: "i" (feature) : "memory")
+
+/*
+ * Alternative inline assembly with input.
+ *
+ * Peculiarities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...)       \
+       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
+                     ".section .altinstructions,\"a\"\n"               \
+                     _ASM_ALIGN "\n"                                   \
+                     _ASM_PTR "661b\n"         /* label */             \
+                     _ASM_PTR "663f\n"         /* new instruction */   \
+                     "  .byte %c0\n"           /* feature bit */       \
+                     "  .byte 662b-661b\n"     /* sourcelen */         \
+                     "  .byte 664f-663f\n"     /* replacementlen */    \
+                     ".previous\n"                                     \
+                     ".section .altinstr_replacement,\"ax\"\n"         \
+                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+                     ".previous" :: "i" (feature), ##input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...)  \
+       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
+                     ".section .altinstructions,\"a\"\n"               \
+                     _ASM_ALIGN "\n"                                   \
+                     _ASM_PTR "661b\n"         /* label */             \
+                     _ASM_PTR "663f\n"         /* new instruction */   \
+                     "  .byte %c[feat]\n"      /* feature bit */       \
+                     "  .byte 662b-661b\n"     /* sourcelen */         \
+                     "  .byte 664f-663f\n"     /* replacementlen */    \
+                     ".previous\n"                                     \
+                     ".section .altinstr_replacement,\"ax\"\n"         \
+                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+                     ".previous" : output : [feat] "i" (feature), ##input)
+
+/*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
+struct paravirt_patch_site;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+                   struct paravirt_patch_site *end);
 #else
-# include "alternative_64.h"
+static inline void
+apply_paravirt(struct paravirt_patch_site *start,
+              struct paravirt_patch_site *end)
+{}
+#define __parainstructions     NULL
+#define __parainstructions_end NULL
 #endif
+
+extern void text_poke(void *addr, unsigned char *opcode, int len);
+
+#endif /* _ASM_X86_ALTERNATIVE_H */
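
The alternative() family patches oldinstr over with newinstr at boot when the named CPU feature is present. One long-standing user is the i386 memory barrier, which reads approximately as follows (macro name changed here to mark it as an example):

    /* Pick "mfence" on SSE2-capable CPUs, a locked add on everything else. */
    #define my_mb() alternative("lock; addl $0,0(%%esp)", "mfence", \
                                X86_FEATURE_XMM2)
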
diff --git a/include/asm-x86/alternative_32.h b/include/asm-x86/alternative_32.h
deleted file mode 100644 (file)
index bda6c81..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef _I386_ALTERNATIVE_H
-#define _I386_ALTERNATIVE_H
-
-#include <asm/types.h>
-#include <linux/stddef.h>
-#include <linux/types.h>
-
-struct alt_instr {
-       u8 *instr;              /* original instruction */
-       u8 *replacement;
-       u8  cpuid;              /* cpuid bit set for replacement */
-       u8  instrlen;           /* length of original instruction */
-       u8  replacementlen;     /* length of new instruction, <= instrlen */
-       u8  pad;
-};
-
-extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-
-struct module;
-#ifdef CONFIG_SMP
-extern void alternatives_smp_module_add(struct module *mod, char *name,
-                                       void *locks, void *locks_end,
-                                       void *text, void *text_end);
-extern void alternatives_smp_module_del(struct module *mod);
-extern void alternatives_smp_switch(int smp);
-#else
-static inline void alternatives_smp_module_add(struct module *mod, char *name,
-                                       void *locks, void *locks_end,
-                                       void *text, void *text_end) {}
-static inline void alternatives_smp_module_del(struct module *mod) {}
-static inline void alternatives_smp_switch(int smp) {}
-#endif /* CONFIG_SMP */
-
-/*
- * Alternative instructions for different CPU types or capabilities.
- *
- * This allows to use optimized instructions even on generic binary
- * kernels.
- *
- * length of oldinstr must be longer or equal the length of newinstr
- * It can be padded with nops as needed.
- *
- * For non barrier like inlines please define new variants
- * without volatile and memory clobber.
- */
-#define alternative(oldinstr, newinstr, feature)                       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     "  .align 4\n"                                    \
-                     "  .long 661b\n"            /* label */           \
-                     "  .long 663f\n"            /* new instruction */ \
-                     "  .byte %c0\n"             /* feature bit */     \
-                     "  .byte 662b-661b\n"       /* sourcelen */       \
-                     "  .byte 664f-663f\n"       /* replacementlen */  \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */\
-                     ".previous" :: "i" (feature) : "memory")
-
-/*
- * Alternative inline assembly with input.
- *
- * Pecularities:
- * No memory clobber here.
- * Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the
- * replacement maake sure to pad to the worst case length.
- */
-#define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     "  .align 4\n"                                    \
-                     "  .long 661b\n"            /* label */           \
-                     "  .long 663f\n"            /* new instruction */ \
-                     "  .byte %c0\n"             /* feature bit */     \
-                     "  .byte 662b-661b\n"       /* sourcelen */       \
-                     "  .byte 664f-663f\n"       /* replacementlen */  \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */\
-                     ".previous" :: "i" (feature), ##input)
-
-/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     "  .align 4\n"                                    \
-                     "  .long 661b\n"            /* label */           \
-                     "  .long 663f\n"            /* new instruction */ \
-                     "  .byte %c[feat]\n"        /* feature bit */     \
-                     "  .byte 662b-661b\n"       /* sourcelen */       \
-                     "  .byte 664f-663f\n"       /* replacementlen */  \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-                     ".previous" : output : [feat] "i" (feature), ##input)
-
-/*
- * use this macro(s) if you need more than one output parameter
- * in alternative_io
- */
-#define ASM_OUTPUT2(a, b) a, b
-
-/*
- * Alternative inline assembly for SMP.
- *
- * The LOCK_PREFIX macro defined here replaces the LOCK and
- * LOCK_PREFIX macros used everywhere in the source tree.
- *
- * SMP alternatives use the same data structures as the other
- * alternatives and the X86_FEATURE_UP flag to indicate the case of a
- * UP system running a SMP kernel.  The existing apply_alternatives()
- * works fine for patching a SMP kernel for UP.
- *
- * The SMP alternative tables can be kept after boot and contain both
- * UP and SMP versions of the instructions to allow switching back to
- * SMP at runtime, when hotplugging in a new CPU, which is especially
- * useful in virtualized environments.
- *
- * The very common lock prefix is handled as special case in a
- * separate table which is a pure address list without replacement ptr
- * and size information.  That keeps the table sizes small.
- */
-
-#ifdef CONFIG_SMP
-#define LOCK_PREFIX \
-               ".section .smp_locks,\"a\"\n"   \
-               "  .align 4\n"                  \
-               "  .long 661f\n" /* address */  \
-               ".previous\n"                   \
-               "661:\n\tlock; "
-
-#else /* ! CONFIG_SMP */
-#define LOCK_PREFIX ""
-#endif
-
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
-                   struct paravirt_patch_site *end);
-#else
-static inline void
-apply_paravirt(struct paravirt_patch_site *start,
-              struct paravirt_patch_site *end)
-{}
-#define __parainstructions     NULL
-#define __parainstructions_end NULL
-#endif
-
-extern void text_poke(void *addr, unsigned char *opcode, int len);
-
-#endif /* _I386_ALTERNATIVE_H */
diff --git a/include/asm-x86/alternative_64.h b/include/asm-x86/alternative_64.h
deleted file mode 100644 (file)
index ab161e8..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef _X86_64_ALTERNATIVE_H
-#define _X86_64_ALTERNATIVE_H
-
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/stddef.h>
-
-/*
- * Alternative inline assembly for SMP.
- *
- * The LOCK_PREFIX macro defined here replaces the LOCK and
- * LOCK_PREFIX macros used everywhere in the source tree.
- *
- * SMP alternatives use the same data structures as the other
- * alternatives and the X86_FEATURE_UP flag to indicate the case of a
- * UP system running a SMP kernel.  The existing apply_alternatives()
- * works fine for patching a SMP kernel for UP.
- *
- * The SMP alternative tables can be kept after boot and contain both
- * UP and SMP versions of the instructions to allow switching back to
- * SMP at runtime, when hotplugging in a new CPU, which is especially
- * useful in virtualized environments.
- *
- * The very common lock prefix is handled as special case in a
- * separate table which is a pure address list without replacement ptr
- * and size information.  That keeps the table sizes small.
- */
-
-#ifdef CONFIG_SMP
-#define LOCK_PREFIX \
-               ".section .smp_locks,\"a\"\n"   \
-               "  .align 8\n"                  \
-               "  .quad 661f\n" /* address */  \
-               ".previous\n"                   \
-               "661:\n\tlock; "
-
-#else /* ! CONFIG_SMP */
-#define LOCK_PREFIX ""
-#endif
-
-/* This must be included *after* the definition of LOCK_PREFIX */
-#include <asm/cpufeature.h>
-
-struct alt_instr {
-       u8 *instr;              /* original instruction */
-       u8 *replacement;
-       u8  cpuid;              /* cpuid bit set for replacement */
-       u8  instrlen;           /* length of original instruction */
-       u8  replacementlen;     /* length of new instruction, <= instrlen */
-       u8  pad[5];
-};
-
-extern void alternative_instructions(void);
-extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
-
-struct module;
-
-#ifdef CONFIG_SMP
-extern void alternatives_smp_module_add(struct module *mod, char *name,
-                                       void *locks, void *locks_end,
-                                       void *text, void *text_end);
-extern void alternatives_smp_module_del(struct module *mod);
-extern void alternatives_smp_switch(int smp);
-#else
-static inline void alternatives_smp_module_add(struct module *mod, char *name,
-                                       void *locks, void *locks_end,
-                                       void *text, void *text_end) {}
-static inline void alternatives_smp_module_del(struct module *mod) {}
-static inline void alternatives_smp_switch(int smp) {}
-#endif
-
-#endif
-
-/*
- * Alternative instructions for different CPU types or capabilities.
- *
- * This allows to use optimized instructions even on generic binary
- * kernels.
- *
- * length of oldinstr must be longer or equal the length of newinstr
- * It can be padded with nops as needed.
- *
- * For non barrier like inlines please define new variants
- * without volatile and memory clobber.
- */
-#define alternative(oldinstr, newinstr, feature)       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
-                     ".section .altinstructions,\"a\"\n"            \
-                     "  .align 8\n"                                   \
-                     "  .quad 661b\n"            /* label */          \
-                     "  .quad 663f\n"            /* new instruction */ \
-                     "  .byte %c0\n"             /* feature bit */    \
-                     "  .byte 662b-661b\n"       /* sourcelen */      \
-                     "  .byte 664f-663f\n"       /* replacementlen */ \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-                     ".previous" :: "i" (feature) : "memory")
-
-/*
- * Alternative inline assembly with input.
- *
- * Pecularities:
- * No memory clobber here.
- * Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the
- * replacement make sure to pad to the worst case length.
- */
-#define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     "  .align 8\n"                                    \
-                     "  .quad 661b\n"            /* label */           \
-                     "  .quad 663f\n"            /* new instruction */ \
-                     "  .byte %c0\n"             /* feature bit */     \
-                     "  .byte 662b-661b\n"       /* sourcelen */       \
-                     "  .byte 664f-663f\n"       /* replacementlen */  \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-                     ".previous" :: "i" (feature), ##input)
-
-/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     "  .align 8\n"                                    \
-                     "  .quad 661b\n"            /* label */           \
-                     "  .quad 663f\n"            /* new instruction */ \
-                     "  .byte %c[feat]\n"        /* feature bit */     \
-                     "  .byte 662b-661b\n"       /* sourcelen */       \
-                     "  .byte 664f-663f\n"       /* replacementlen */  \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
-                     ".previous" : output : [feat] "i" (feature), ##input)
-
-/*
- * use this macro(s) if you need more than one output parameter
- * in alternative_io
- */
-#define ASM_OUTPUT2(a, b) a, b
-
-struct paravirt_patch;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
-#else
-static inline void
-apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
-{}
-#define __parainstructions NULL
-#define __parainstructions_end NULL
-#endif
-
-extern void text_poke(void *addr, unsigned char *opcode, int len);
-
-#endif /* _X86_64_ALTERNATIVE_H */
index 9fbcc0bd2ac49f9c21f0a4edb608e01fef9d02c3..bcfc07fd3661c4e513445b0e4e2305e6b28185ef 100644 (file)
@@ -1,5 +1,140 @@
-#ifdef CONFIG_X86_32
-# include "apic_32.h"
+#ifndef _ASM_X86_APIC_H
+#define _ASM_X86_APIC_H
+
+#include <linux/pm.h>
+#include <linux/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#define ARCH_APICTIMER_STOPS_ON_C3     1
+
+#define Dprintk(x...)
+
+/*
+ * Debugging macros
+ */
+#define APIC_QUIET   0
+#define APIC_VERBOSE 1
+#define APIC_DEBUG   2
+
+/*
+ * Define the default level of output to be very little
+ * This can be turned up by using apic=verbose for more
+ * information and apic=debug for _lots_ of information.
+ * apic_verbosity is defined in apic.c
+ */
+#define apic_printk(v, s, a...) do {       \
+               if ((v) <= apic_verbosity) \
+                       printk(s, ##a);    \
+       } while (0)
+
+
+extern void generic_apic_probe(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int apic_verbosity;
+extern int timer_over_8254;
+extern int local_apic_timer_c2_ok;
+extern int local_apic_timer_disabled;
+
+extern int apic_runs_main_timer;
+extern int ioapic_force;
+extern int disable_apic;
+extern int disable_apic_timer;
+extern unsigned boot_cpu_id;
+
+/*
+ * Basic functions accessing APICs.
+ */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
 #else
-# include "apic_64.h"
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#define setup_boot_clock setup_boot_APIC_clock
+#define setup_secondary_clock setup_secondary_APIC_clock
 #endif
+
+static inline void native_apic_write(unsigned long reg, u32 v)
+{
+       *((volatile u32 *)(APIC_BASE + reg)) = v;
+}
+
+static inline void native_apic_write_atomic(unsigned long reg, u32 v)
+{
+       (void) xchg((u32*)(APIC_BASE + reg), v);
+}
+
+static inline u32 native_apic_read(unsigned long reg)
+{
+       return *((volatile u32 *)(APIC_BASE + reg));
+}
+
+extern void apic_wait_icr_idle(void);
+extern u32 safe_apic_wait_icr_idle(void);
+extern int get_physical_broadcast(void);
+
+#ifdef CONFIG_X86_GOOD_APIC
+# define FORCE_READ_AROUND_WRITE 0
+# define apic_read_around(x)
+# define apic_write_around(x, y) apic_write((x), (y))
+#else
+# define FORCE_READ_AROUND_WRITE 1
+# define apic_read_around(x) apic_read(x)
+# define apic_write_around(x, y) apic_write_atomic((x), (y))
+#endif
+
+static inline void ack_APIC_irq(void)
+{
+       /*
+        * ack_APIC_irq() actually gets compiled as a single instruction:
+        * - a single rmw on Pentium/82489DX
+        * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
+        * ... yummie.
+        */
+
+       /* Docs say use 0 for future compatibility */
+       apic_write_around(APIC_EOI, 0);
+}
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void connect_bsp_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern int verify_local_APIC(void);
+extern void cache_APIC_registers(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void setup_local_APIC(void);
+extern void end_local_APIC_setup(void);
+extern void init_apic_mappings(void);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern int APIC_init_uniprocessor(void);
+extern void enable_NMI_through_LVT0(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern void setup_apic_routing(void);
+#endif
+
+extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
+extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
+
+extern int apic_is_clustered_box(void);
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok         1
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* __ASM_APIC_H */
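
apic_printk() gates its output on apic_verbosity, so callers tag each message with APIC_QUIET/APIC_VERBOSE/APIC_DEBUG instead of open-coding the check. A small hedged example; the function and messages are invented, and APIC_LVR comes from <asm/apicdef.h>:

    static void report_apic_version(void)
    {
            unsigned int ver = apic_read(APIC_LVR);   /* local APIC version reg */

            apic_printk(APIC_VERBOSE, "local APIC version: %#x\n", ver);
            apic_printk(APIC_DEBUG, "raw LVR register: %#010x\n", ver);
    }
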
diff --git a/include/asm-x86/apic_32.h b/include/asm-x86/apic_32.h
deleted file mode 100644 (file)
index be158b2..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef __ASM_APIC_H
-#define __ASM_APIC_H
-
-#include <linux/pm.h>
-#include <linux/delay.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/processor.h>
-#include <asm/system.h>
-
-#define Dprintk(x...)
-
-/*
- * Debugging macros
- */
-#define APIC_QUIET   0
-#define APIC_VERBOSE 1
-#define APIC_DEBUG   2
-
-extern int apic_verbosity;
-
-/*
- * Define the default level of output to be very little
- * This can be turned up by using apic=verbose for more
- * information and apic=debug for _lots_ of information.
- * apic_verbosity is defined in apic.c
- */
-#define apic_printk(v, s, a...) do {       \
-               if ((v) <= apic_verbosity) \
-                       printk(s, ##a);    \
-       } while (0)
-
-
-extern void generic_apic_probe(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-/*
- * Basic functions accessing APICs.
- */
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define apic_write native_apic_write
-#define apic_write_atomic native_apic_write_atomic
-#define apic_read native_apic_read
-#define setup_boot_clock setup_boot_APIC_clock
-#define setup_secondary_clock setup_secondary_APIC_clock
-#endif
-
-static __inline fastcall void native_apic_write(unsigned long reg,
-                                               unsigned long v)
-{
-       *((volatile unsigned long *)(APIC_BASE+reg)) = v;
-}
-
-static __inline fastcall void native_apic_write_atomic(unsigned long reg,
-                                                      unsigned long v)
-{
-       xchg((volatile unsigned long *)(APIC_BASE+reg), v);
-}
-
-static __inline fastcall unsigned long native_apic_read(unsigned long reg)
-{
-       return *((volatile unsigned long *)(APIC_BASE+reg));
-}
-
-void apic_wait_icr_idle(void);
-unsigned long safe_apic_wait_icr_idle(void);
-int get_physical_broadcast(void);
-
-#ifdef CONFIG_X86_GOOD_APIC
-# define FORCE_READ_AROUND_WRITE 0
-# define apic_read_around(x)
-# define apic_write_around(x,y) apic_write((x),(y))
-#else
-# define FORCE_READ_AROUND_WRITE 1
-# define apic_read_around(x) apic_read(x)
-# define apic_write_around(x,y) apic_write_atomic((x),(y))
-#endif
-
-static inline void ack_APIC_irq(void)
-{
-       /*
-        * ack_APIC_irq() actually gets compiled as a single instruction:
-        * - a single rmw on Pentium/82489DX
-        * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
-        * ... yummie.
-        */
-
-       /* Docs say use 0 for future compatibility */
-       apic_write_around(APIC_EOI, 0);
-}
-
-extern int lapic_get_maxlvt(void);
-extern void clear_local_APIC(void);
-extern void connect_bsp_APIC (void);
-extern void disconnect_bsp_APIC (int virt_wire_setup);
-extern void disable_local_APIC (void);
-extern void lapic_shutdown (void);
-extern int verify_local_APIC (void);
-extern void cache_APIC_registers (void);
-extern void sync_Arb_IDs (void);
-extern void init_bsp_APIC (void);
-extern void setup_local_APIC (void);
-extern void init_apic_mappings (void);
-extern void smp_local_timer_interrupt (void);
-extern void setup_boot_APIC_clock (void);
-extern void setup_secondary_APIC_clock (void);
-extern int APIC_init_uniprocessor (void);
-
-extern void enable_NMI_through_LVT0 (void * dummy);
-
-#define ARCH_APICTIMER_STOPS_ON_C3     1
-
-extern int timer_over_8254;
-extern int local_apic_timer_c2_ok;
-
-extern int local_apic_timer_disabled;
-
-#else /* !CONFIG_X86_LOCAL_APIC */
-static inline void lapic_shutdown(void) { }
-#define local_apic_timer_c2_ok         1
-
-#endif /* !CONFIG_X86_LOCAL_APIC */
-
-#endif /* __ASM_APIC_H */
diff --git a/include/asm-x86/apic_64.h b/include/asm-x86/apic_64.h
deleted file mode 100644 (file)
index 2747a11..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef __ASM_APIC_H
-#define __ASM_APIC_H
-
-#include <linux/pm.h>
-#include <linux/delay.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/system.h>
-
-#define Dprintk(x...)
-
-/*
- * Debugging macros
- */
-#define APIC_QUIET   0
-#define APIC_VERBOSE 1
-#define APIC_DEBUG   2
-
-extern int apic_verbosity;
-extern int apic_runs_main_timer;
-extern int ioapic_force;
-extern int disable_apic_timer;
-
-/*
- * Define the default level of output to be very little
- * This can be turned up by using apic=verbose for more
- * information and apic=debug for _lots_ of information.
- * apic_verbosity is defined in apic.c
- */
-#define apic_printk(v, s, a...) do {       \
-               if ((v) <= apic_verbosity) \
-                       printk(s, ##a);    \
-       } while (0)
-
-struct pt_regs;
-
-/*
- * Basic functions accessing APICs.
- */
-
-static __inline void apic_write(unsigned long reg, unsigned int v)
-{
-       *((volatile unsigned int *)(APIC_BASE+reg)) = v;
-}
-
-static __inline unsigned int apic_read(unsigned long reg)
-{
-       return *((volatile unsigned int *)(APIC_BASE+reg));
-}
-
-extern void apic_wait_icr_idle(void);
-extern unsigned int safe_apic_wait_icr_idle(void);
-
-static inline void ack_APIC_irq(void)
-{
-       /*
-        * ack_APIC_irq() actually gets compiled as a single instruction:
-        * - a single rmw on Pentium/82489DX
-        * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC)
-        * ... yummie.
-        */
-
-       /* Docs say use 0 for future compatibility */
-       apic_write(APIC_EOI, 0);
-}
-
-extern int get_maxlvt (void);
-extern void clear_local_APIC (void);
-extern void connect_bsp_APIC (void);
-extern void disconnect_bsp_APIC (int virt_wire_setup);
-extern void disable_local_APIC (void);
-extern void lapic_shutdown (void);
-extern int verify_local_APIC (void);
-extern void cache_APIC_registers (void);
-extern void sync_Arb_IDs (void);
-extern void init_bsp_APIC (void);
-extern void setup_local_APIC (void);
-extern void init_apic_mappings (void);
-extern void smp_local_timer_interrupt (void);
-extern void setup_boot_APIC_clock (void);
-extern void setup_secondary_APIC_clock (void);
-extern int APIC_init_uniprocessor (void);
-extern void setup_apic_routing(void);
-
-extern void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
-                                   unsigned char msg_type, unsigned char mask);
-
-extern int apic_is_clustered_box(void);
-
-#define K8_APIC_EXT_LVT_BASE    0x500
-#define K8_APIC_EXT_INT_MSG_FIX 0x0
-#define K8_APIC_EXT_INT_MSG_SMI 0x2
-#define K8_APIC_EXT_INT_MSG_NMI 0x4
-#define K8_APIC_EXT_INT_MSG_EXT 0x7
-#define K8_APIC_EXT_LVT_ENTRY_THRESHOLD    0
-
-#define ARCH_APICTIMER_STOPS_ON_C3     1
-
-extern unsigned boot_cpu_id;
-extern int local_apic_timer_c2_ok;
-
-#endif /* __ASM_APIC_H */
index 4542c220bf4d79550b4ba50cb5fb8917ad56848c..550af7a6f88e7e935f7553a4005ffd9f2599ca86 100644 (file)
@@ -1,5 +1,413 @@
+#ifndef _ASM_X86_APICDEF_H
+#define _ASM_X86_APICDEF_H
+
+/*
+ * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
+ *
+ * Alan Cox <Alan.Cox@linux.org>, 1995.
+ * Ingo Molnar <mingo@redhat.com>, 1999, 2000
+ */
+
+#define        APIC_DEFAULT_PHYS_BASE  0xfee00000
+
+#define        APIC_ID         0x20
+
+#ifdef CONFIG_X86_64
+# define       APIC_ID_MASK            (0xFFu<<24)
+# define       GET_APIC_ID(x)          (((x)>>24)&0xFFu)
+# define       SET_APIC_ID(x)          (((x)<<24))
+#endif
+
+#define        APIC_LVR        0x30
+#define                APIC_LVR_MASK           0xFF00FF
+#define                GET_APIC_VERSION(x)     ((x)&0xFFu)
+#define                GET_APIC_MAXLVT(x)      (((x)>>16)&0xFFu)
+#define                APIC_INTEGRATED(x)      ((x)&0xF0u)
+#define                APIC_XAPIC(x)           ((x) >= 0x14)
+#define        APIC_TASKPRI    0x80
+#define                APIC_TPRI_MASK          0xFFu
+#define        APIC_ARBPRI     0x90
+#define                APIC_ARBPRI_MASK        0xFFu
+#define        APIC_PROCPRI    0xA0
+#define        APIC_EOI        0xB0
+#define                APIC_EIO_ACK            0x0
+#define        APIC_RRR        0xC0
+#define        APIC_LDR        0xD0
+#define                APIC_LDR_MASK           (0xFFu<<24)
+#define                GET_APIC_LOGICAL_ID(x)  (((x)>>24)&0xFFu)
+#define                SET_APIC_LOGICAL_ID(x)  (((x)<<24))
+#define                APIC_ALL_CPUS           0xFFu
+#define        APIC_DFR        0xE0
+#define                APIC_DFR_CLUSTER                0x0FFFFFFFul
+#define                APIC_DFR_FLAT                   0xFFFFFFFFul
+#define        APIC_SPIV       0xF0
+#define                APIC_SPIV_FOCUS_DISABLED        (1<<9)
+#define                APIC_SPIV_APIC_ENABLED          (1<<8)
+#define        APIC_ISR        0x100
+#define        APIC_ISR_NR     0x8     /* Number of 32 bit ISR registers. */
+#define        APIC_TMR        0x180
+#define        APIC_IRR        0x200
+#define        APIC_ESR        0x280
+#define                APIC_ESR_SEND_CS        0x00001
+#define                APIC_ESR_RECV_CS        0x00002
+#define                APIC_ESR_SEND_ACC       0x00004
+#define                APIC_ESR_RECV_ACC       0x00008
+#define                APIC_ESR_SENDILL        0x00020
+#define                APIC_ESR_RECVILL        0x00040
+#define                APIC_ESR_ILLREGA        0x00080
+#define        APIC_ICR        0x300
+#define                APIC_DEST_SELF          0x40000
+#define                APIC_DEST_ALLINC        0x80000
+#define                APIC_DEST_ALLBUT        0xC0000
+#define                APIC_ICR_RR_MASK        0x30000
+#define                APIC_ICR_RR_INVALID     0x00000
+#define                APIC_ICR_RR_INPROG      0x10000
+#define                APIC_ICR_RR_VALID       0x20000
+#define                APIC_INT_LEVELTRIG      0x08000
+#define                APIC_INT_ASSERT         0x04000
+#define                APIC_ICR_BUSY           0x01000
+#define                APIC_DEST_LOGICAL       0x00800
+#define                APIC_DEST_PHYSICAL      0x00000
+#define                APIC_DM_FIXED           0x00000
+#define                APIC_DM_LOWEST          0x00100
+#define                APIC_DM_SMI             0x00200
+#define                APIC_DM_REMRD           0x00300
+#define                APIC_DM_NMI             0x00400
+#define                APIC_DM_INIT            0x00500
+#define                APIC_DM_STARTUP         0x00600
+#define                APIC_DM_EXTINT          0x00700
+#define                APIC_VECTOR_MASK        0x000FF
+#define        APIC_ICR2       0x310
+#define                GET_APIC_DEST_FIELD(x)  (((x)>>24)&0xFF)
+#define                SET_APIC_DEST_FIELD(x)  ((x)<<24)
+#define        APIC_LVTT       0x320
+#define        APIC_LVTTHMR    0x330
+#define        APIC_LVTPC      0x340
+#define        APIC_LVT0       0x350
+#define                APIC_LVT_TIMER_BASE_MASK        (0x3<<18)
+#define                GET_APIC_TIMER_BASE(x)          (((x)>>18)&0x3)
+#define                SET_APIC_TIMER_BASE(x)          (((x)<<18))
+#define                APIC_TIMER_BASE_CLKIN           0x0
+#define                APIC_TIMER_BASE_TMBASE          0x1
+#define                APIC_TIMER_BASE_DIV             0x2
+#define                APIC_LVT_TIMER_PERIODIC         (1<<17)
+#define                APIC_LVT_MASKED                 (1<<16)
+#define                APIC_LVT_LEVEL_TRIGGER          (1<<15)
+#define                APIC_LVT_REMOTE_IRR             (1<<14)
+#define                APIC_INPUT_POLARITY             (1<<13)
+#define                APIC_SEND_PENDING               (1<<12)
+#define                APIC_MODE_MASK                  0x700
+#define                GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
+#define                SET_APIC_DELIVERY_MODE(x, y)    (((x)&~0x700)|((y)<<8))
+#define                        APIC_MODE_FIXED         0x0
+#define                        APIC_MODE_NMI           0x4
+#define                        APIC_MODE_EXTINT        0x7
+#define        APIC_LVT1       0x360
+#define        APIC_LVTERR     0x370
+#define        APIC_TMICT      0x380
+#define        APIC_TMCCT      0x390
+#define        APIC_TDCR       0x3E0
+#define                APIC_TDR_DIV_TMBASE     (1<<2)
+#define                APIC_TDR_DIV_1          0xB
+#define                APIC_TDR_DIV_2          0x0
+#define                APIC_TDR_DIV_4          0x1
+#define                APIC_TDR_DIV_8          0x2
+#define                APIC_TDR_DIV_16         0x3
+#define                APIC_TDR_DIV_32         0x8
+#define                APIC_TDR_DIV_64         0x9
+#define                APIC_TDR_DIV_128        0xA
+#define        APIC_EILVT0     0x500
+#define                APIC_EILVT_NR_AMD_K8    1       /* Number of extended interrupts */
+#define                APIC_EILVT_NR_AMD_10H   4
+#define                APIC_EILVT_LVTOFF(x)    (((x)>>4)&0xF)
+#define                APIC_EILVT_MSG_FIX      0x0
+#define                APIC_EILVT_MSG_SMI      0x2
+#define                APIC_EILVT_MSG_NMI      0x4
+#define                APIC_EILVT_MSG_EXT      0x7
+#define                APIC_EILVT_MASKED       (1<<16)
+#define        APIC_EILVT1     0x510
+#define        APIC_EILVT2     0x520
+#define        APIC_EILVT3     0x530
+
+#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+
 #ifdef CONFIG_X86_32
-# include "apicdef_32.h"
+# define MAX_IO_APICS 64
 #else
-# include "apicdef_64.h"
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 256
+#endif
+
+/*
+ * All x86-64 systems are xAPIC compatible.
+ * In the following, "apicid" is a physical APIC ID.
+ */
+#define XAPIC_DEST_CPUS_SHIFT  4
+#define XAPIC_DEST_CPUS_MASK   ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK        (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CLUSTER(apicid)   ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CPUID(apicid)     ((apicid) & XAPIC_DEST_CPUS_MASK)
+#define NUM_APIC_CLUSTERS      ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
+
+/*
+ * the local APIC register structure, memory mapped. Not terribly well
+ * tested, but we might eventually use this one in the future - the
+ * problem why we cannot use it right now is the P5 APIC, it has an
+ * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
+ */
+#define u32 unsigned int
+
+struct local_apic {
+
+/*000*/        struct { u32 __reserved[4]; } __reserved_01;
+
+/*010*/        struct { u32 __reserved[4]; } __reserved_02;
+
+/*020*/        struct { /* APIC ID Register */
+               u32   __reserved_1      : 24,
+                       phys_apic_id    :  4,
+                       __reserved_2    :  4;
+               u32 __reserved[3];
+       } id;
+
+/*030*/        const
+       struct { /* APIC Version Register */
+               u32   version           :  8,
+                       __reserved_1    :  8,
+                       max_lvt         :  8,
+                       __reserved_2    :  8;
+               u32 __reserved[3];
+       } version;
+
+/*040*/        struct { u32 __reserved[4]; } __reserved_03;
+
+/*050*/        struct { u32 __reserved[4]; } __reserved_04;
+
+/*060*/        struct { u32 __reserved[4]; } __reserved_05;
+
+/*070*/        struct { u32 __reserved[4]; } __reserved_06;
+
+/*080*/        struct { /* Task Priority Register */
+               u32   priority  :  8,
+                       __reserved_1    : 24;
+               u32 __reserved_2[3];
+       } tpr;
+
+/*090*/        const
+       struct { /* Arbitration Priority Register */
+               u32   priority  :  8,
+                       __reserved_1    : 24;
+               u32 __reserved_2[3];
+       } apr;
+
+/*0A0*/        const
+       struct { /* Processor Priority Register */
+               u32   priority  :  8,
+                       __reserved_1    : 24;
+               u32 __reserved_2[3];
+       } ppr;
+
+/*0B0*/        struct { /* End Of Interrupt Register */
+               u32   eoi;
+               u32 __reserved[3];
+       } eoi;
+
+/*0C0*/        struct { u32 __reserved[4]; } __reserved_07;
+
+/*0D0*/        struct { /* Logical Destination Register */
+               u32   __reserved_1      : 24,
+                       logical_dest    :  8;
+               u32 __reserved_2[3];
+       } ldr;
+
+/*0E0*/        struct { /* Destination Format Register */
+               u32   __reserved_1      : 28,
+                       model           :  4;
+               u32 __reserved_2[3];
+       } dfr;
+
+/*0F0*/        struct { /* Spurious Interrupt Vector Register */
+               u32     spurious_vector :  8,
+                       apic_enabled    :  1,
+                       focus_cpu       :  1,
+                       __reserved_2    : 22;
+               u32 __reserved_3[3];
+       } svr;
+
+/*100*/        struct { /* In Service Register */
+/*170*/                u32 bitfield;
+               u32 __reserved[3];
+       } isr [8];
+
+/*180*/        struct { /* Trigger Mode Register */
+/*1F0*/                u32 bitfield;
+               u32 __reserved[3];
+       } tmr [8];
+
+/*200*/        struct { /* Interrupt Request Register */
+/*270*/                u32 bitfield;
+               u32 __reserved[3];
+       } irr [8];
+
+/*280*/        union { /* Error Status Register */
+               struct {
+                       u32   send_cs_error                     :  1,
+                               receive_cs_error                :  1,
+                               send_accept_error               :  1,
+                               receive_accept_error            :  1,
+                               __reserved_1                    :  1,
+                               send_illegal_vector             :  1,
+                               receive_illegal_vector          :  1,
+                               illegal_register_address        :  1,
+                               __reserved_2                    : 24;
+                       u32 __reserved_3[3];
+               } error_bits;
+               struct {
+                       u32 errors;
+                       u32 __reserved_3[3];
+               } all_errors;
+       } esr;
+
+/*290*/        struct { u32 __reserved[4]; } __reserved_08;
+
+/*2A0*/        struct { u32 __reserved[4]; } __reserved_09;
+
+/*2B0*/        struct { u32 __reserved[4]; } __reserved_10;
+
+/*2C0*/        struct { u32 __reserved[4]; } __reserved_11;
+
+/*2D0*/        struct { u32 __reserved[4]; } __reserved_12;
+
+/*2E0*/        struct { u32 __reserved[4]; } __reserved_13;
+
+/*2F0*/        struct { u32 __reserved[4]; } __reserved_14;
+
+/*300*/        struct { /* Interrupt Command Register 1 */
+               u32   vector                    :  8,
+                       delivery_mode           :  3,
+                       destination_mode        :  1,
+                       delivery_status         :  1,
+                       __reserved_1            :  1,
+                       level                   :  1,
+                       trigger                 :  1,
+                       __reserved_2            :  2,
+                       shorthand               :  2,
+                       __reserved_3            :  12;
+               u32 __reserved_4[3];
+       } icr1;
+
+/*310*/        struct { /* Interrupt Command Register 2 */
+               union {
+                       u32   __reserved_1      : 24,
+                               phys_dest       :  4,
+                               __reserved_2    :  4;
+                       u32   __reserved_3      : 24,
+                               logical_dest    :  8;
+               } dest;
+               u32 __reserved_4[3];
+       } icr2;
+
+/*320*/        struct { /* LVT - Timer */
+               u32   vector            :  8,
+                       __reserved_1    :  4,
+                       delivery_status :  1,
+                       __reserved_2    :  3,
+                       mask            :  1,
+                       timer_mode      :  1,
+                       __reserved_3    : 14;
+               u32 __reserved_4[3];
+       } lvt_timer;
+
+/*330*/        struct { /* LVT - Thermal Sensor */
+               u32  vector             :  8,
+                       delivery_mode   :  3,
+                       __reserved_1    :  1,
+                       delivery_status :  1,
+                       __reserved_2    :  3,
+                       mask            :  1,
+                       __reserved_3    : 15;
+               u32 __reserved_4[3];
+       } lvt_thermal;
+
+/*340*/        struct { /* LVT - Performance Counter */
+               u32   vector            :  8,
+                       delivery_mode   :  3,
+                       __reserved_1    :  1,
+                       delivery_status :  1,
+                       __reserved_2    :  3,
+                       mask            :  1,
+                       __reserved_3    : 15;
+               u32 __reserved_4[3];
+       } lvt_pc;
+
+/*350*/        struct { /* LVT - LINT0 */
+               u32   vector            :  8,
+                       delivery_mode   :  3,
+                       __reserved_1    :  1,
+                       delivery_status :  1,
+                       polarity        :  1,
+                       remote_irr      :  1,
+                       trigger         :  1,
+                       mask            :  1,
+                       __reserved_2    : 15;
+               u32 __reserved_3[3];
+       } lvt_lint0;
+
+/*360*/        struct { /* LVT - LINT1 */
+               u32   vector            :  8,
+                       delivery_mode   :  3,
+                       __reserved_1    :  1,
+                       delivery_status :  1,
+                       polarity        :  1,
+                       remote_irr      :  1,
+                       trigger         :  1,
+                       mask            :  1,
+                       __reserved_2    : 15;
+               u32 __reserved_3[3];
+       } lvt_lint1;
+
+/*370*/        struct { /* LVT - Error */
+               u32   vector            :  8,
+                       __reserved_1    :  4,
+                       delivery_status :  1,
+                       __reserved_2    :  3,
+                       mask            :  1,
+                       __reserved_3    : 15;
+               u32 __reserved_4[3];
+       } lvt_error;
+
+/*380*/        struct { /* Timer Initial Count Register */
+               u32   initial_count;
+               u32 __reserved_2[3];
+       } timer_icr;
+
+/*390*/        const
+       struct { /* Timer Current Count Register */
+               u32   curr_count;
+               u32 __reserved_2[3];
+       } timer_ccr;
+
+/*3A0*/        struct { u32 __reserved[4]; } __reserved_16;
+
+/*3B0*/        struct { u32 __reserved[4]; } __reserved_17;
+
+/*3C0*/        struct { u32 __reserved[4]; } __reserved_18;
+
+/*3D0*/        struct { u32 __reserved[4]; } __reserved_19;
+
+/*3E0*/        struct { /* Timer Divide Configuration Register */
+               u32   divisor           :  4,
+                       __reserved_1    : 28;
+               u32 __reserved_2[3];
+       } timer_dcr;
+
+/*3F0*/        struct { u32 __reserved[4]; } __reserved_20;
+
+} __attribute__ ((packed));
+
+#undef u32
+
+#define BAD_APICID 0xFFu
+
 #endif
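A minimal sketch (not part of the patch) of how these register offsets and decode macros combine with native_apic_read() from the unified <asm/apic.h> earlier in this series; the function name is hypothetical and the usual <linux/kernel.h> and <linux/types.h> includes are assumed:

    static void example_report_apic_version(void)
    {
            u32 lvr = native_apic_read(APIC_LVR);

            printk(KERN_INFO "local APIC version %#x, %u LVT entries, %s\n",
                   GET_APIC_VERSION(lvr),
                   GET_APIC_MAXLVT(lvr) + 1,      /* field holds max index, not count */
                   APIC_INTEGRATED(lvr) ? "integrated" : "82489DX-style");
    }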
diff --git a/include/asm-x86/apicdef_32.h b/include/asm-x86/apicdef_32.h
deleted file mode 100644 (file)
index 9f69953..0000000
+++ /dev/null
@@ -1,375 +0,0 @@
-#ifndef __ASM_APICDEF_H
-#define __ASM_APICDEF_H
-
-/*
- * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
- *
- * Alan Cox <Alan.Cox@linux.org>, 1995.
- * Ingo Molnar <mingo@redhat.com>, 1999, 2000
- */
-
-#define                APIC_DEFAULT_PHYS_BASE  0xfee00000
-#define                APIC_ID         0x20
-#define                APIC_LVR        0x30
-#define                        APIC_LVR_MASK           0xFF00FF
-#define                        GET_APIC_VERSION(x)     ((x)&0xFF)
-#define                        GET_APIC_MAXLVT(x)      (((x)>>16)&0xFF)
-#define                        APIC_INTEGRATED(x)      ((x)&0xF0)
-#define                        APIC_XAPIC(x)           ((x) >= 0x14)
-#define                APIC_TASKPRI    0x80
-#define                        APIC_TPRI_MASK          0xFF
-#define                APIC_ARBPRI     0x90
-#define                        APIC_ARBPRI_MASK        0xFF
-#define                APIC_PROCPRI    0xA0
-#define                APIC_EOI        0xB0
-#define                        APIC_EIO_ACK            0x0             /* Write this to the EOI register */
-#define                APIC_RRR        0xC0
-#define                APIC_LDR        0xD0
-#define                        APIC_LDR_MASK           (0xFF<<24)
-#define                        GET_APIC_LOGICAL_ID(x)  (((x)>>24)&0xFF)
-#define                        SET_APIC_LOGICAL_ID(x)  (((x)<<24))
-#define                        APIC_ALL_CPUS           0xFF
-#define                APIC_DFR        0xE0
-#define                        APIC_DFR_CLUSTER                0x0FFFFFFFul
-#define                        APIC_DFR_FLAT                   0xFFFFFFFFul
-#define                APIC_SPIV       0xF0
-#define                        APIC_SPIV_FOCUS_DISABLED        (1<<9)
-#define                        APIC_SPIV_APIC_ENABLED          (1<<8)
-#define                APIC_ISR        0x100
-#define         APIC_ISR_NR     0x8     /* Number of 32 bit ISR registers. */
-#define                APIC_TMR        0x180
-#define        APIC_IRR        0x200
-#define        APIC_ESR        0x280
-#define                        APIC_ESR_SEND_CS        0x00001
-#define                        APIC_ESR_RECV_CS        0x00002
-#define                        APIC_ESR_SEND_ACC       0x00004
-#define                        APIC_ESR_RECV_ACC       0x00008
-#define                        APIC_ESR_SENDILL        0x00020
-#define                        APIC_ESR_RECVILL        0x00040
-#define                        APIC_ESR_ILLREGA        0x00080
-#define                APIC_ICR        0x300
-#define                        APIC_DEST_SELF          0x40000
-#define                        APIC_DEST_ALLINC        0x80000
-#define                        APIC_DEST_ALLBUT        0xC0000
-#define                        APIC_ICR_RR_MASK        0x30000
-#define                        APIC_ICR_RR_INVALID     0x00000
-#define                        APIC_ICR_RR_INPROG      0x10000
-#define                        APIC_ICR_RR_VALID       0x20000
-#define                        APIC_INT_LEVELTRIG      0x08000
-#define                        APIC_INT_ASSERT         0x04000
-#define                        APIC_ICR_BUSY           0x01000
-#define                        APIC_DEST_LOGICAL       0x00800
-#define                        APIC_DM_FIXED           0x00000
-#define                        APIC_DM_LOWEST          0x00100
-#define                        APIC_DM_SMI             0x00200
-#define                        APIC_DM_REMRD           0x00300
-#define                        APIC_DM_NMI             0x00400
-#define                        APIC_DM_INIT            0x00500
-#define                        APIC_DM_STARTUP         0x00600
-#define                        APIC_DM_EXTINT          0x00700
-#define                        APIC_VECTOR_MASK        0x000FF
-#define                APIC_ICR2       0x310
-#define                        GET_APIC_DEST_FIELD(x)  (((x)>>24)&0xFF)
-#define                        SET_APIC_DEST_FIELD(x)  ((x)<<24)
-#define                APIC_LVTT       0x320
-#define                APIC_LVTTHMR    0x330
-#define                APIC_LVTPC      0x340
-#define                APIC_LVT0       0x350
-#define                        APIC_LVT_TIMER_BASE_MASK        (0x3<<18)
-#define                        GET_APIC_TIMER_BASE(x)          (((x)>>18)&0x3)
-#define                        SET_APIC_TIMER_BASE(x)          (((x)<<18))
-#define                        APIC_TIMER_BASE_CLKIN           0x0
-#define                        APIC_TIMER_BASE_TMBASE          0x1
-#define                        APIC_TIMER_BASE_DIV             0x2
-#define                        APIC_LVT_TIMER_PERIODIC         (1<<17)
-#define                        APIC_LVT_MASKED                 (1<<16)
-#define                        APIC_LVT_LEVEL_TRIGGER          (1<<15)
-#define                        APIC_LVT_REMOTE_IRR             (1<<14)
-#define                        APIC_INPUT_POLARITY             (1<<13)
-#define                        APIC_SEND_PENDING               (1<<12)
-#define                        APIC_MODE_MASK                  0x700
-#define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
-#define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
-#define                                APIC_MODE_FIXED         0x0
-#define                                APIC_MODE_NMI           0x4
-#define                                APIC_MODE_EXTINT        0x7
-#define        APIC_LVT1       0x360
-#define                APIC_LVTERR     0x370
-#define                APIC_TMICT      0x380
-#define                APIC_TMCCT      0x390
-#define                APIC_TDCR       0x3E0
-#define                        APIC_TDR_DIV_TMBASE     (1<<2)
-#define                        APIC_TDR_DIV_1          0xB
-#define                        APIC_TDR_DIV_2          0x0
-#define                        APIC_TDR_DIV_4          0x1
-#define                        APIC_TDR_DIV_8          0x2
-#define                        APIC_TDR_DIV_16         0x3
-#define                        APIC_TDR_DIV_32         0x8
-#define                        APIC_TDR_DIV_64         0x9
-#define                        APIC_TDR_DIV_128        0xA
-
-#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
-
-#define MAX_IO_APICS 64
-
-/*
- * the local APIC register structure, memory mapped. Not terribly well
- * tested, but we might eventually use this one in the future - the
- * problem why we cannot use it right now is the P5 APIC, it has an
- * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
- */
-#define u32 unsigned int
-
-
-struct local_apic {
-
-/*000*/        struct { u32 __reserved[4]; } __reserved_01;
-
-/*010*/        struct { u32 __reserved[4]; } __reserved_02;
-
-/*020*/        struct { /* APIC ID Register */
-               u32   __reserved_1      : 24,
-                       phys_apic_id    :  4,
-                       __reserved_2    :  4;
-               u32 __reserved[3];
-       } id;
-
-/*030*/        const
-       struct { /* APIC Version Register */
-               u32   version           :  8,
-                       __reserved_1    :  8,
-                       max_lvt         :  8,
-                       __reserved_2    :  8;
-               u32 __reserved[3];
-       } version;
-
-/*040*/        struct { u32 __reserved[4]; } __reserved_03;
-
-/*050*/        struct { u32 __reserved[4]; } __reserved_04;
-
-/*060*/        struct { u32 __reserved[4]; } __reserved_05;
-
-/*070*/        struct { u32 __reserved[4]; } __reserved_06;
-
-/*080*/        struct { /* Task Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } tpr;
-
-/*090*/        const
-       struct { /* Arbitration Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } apr;
-
-/*0A0*/        const
-       struct { /* Processor Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } ppr;
-
-/*0B0*/        struct { /* End Of Interrupt Register */
-               u32   eoi;
-               u32 __reserved[3];
-       } eoi;
-
-/*0C0*/        struct { u32 __reserved[4]; } __reserved_07;
-
-/*0D0*/        struct { /* Logical Destination Register */
-               u32   __reserved_1      : 24,
-                       logical_dest    :  8;
-               u32 __reserved_2[3];
-       } ldr;
-
-/*0E0*/        struct { /* Destination Format Register */
-               u32   __reserved_1      : 28,
-                       model           :  4;
-               u32 __reserved_2[3];
-       } dfr;
-
-/*0F0*/        struct { /* Spurious Interrupt Vector Register */
-               u32     spurious_vector :  8,
-                       apic_enabled    :  1,
-                       focus_cpu       :  1,
-                       __reserved_2    : 22;
-               u32 __reserved_3[3];
-       } svr;
-
-/*100*/        struct { /* In Service Register */
-/*170*/                u32 bitfield;
-               u32 __reserved[3];
-       } isr [8];
-
-/*180*/        struct { /* Trigger Mode Register */
-/*1F0*/                u32 bitfield;
-               u32 __reserved[3];
-       } tmr [8];
-
-/*200*/        struct { /* Interrupt Request Register */
-/*270*/                u32 bitfield;
-               u32 __reserved[3];
-       } irr [8];
-
-/*280*/        union { /* Error Status Register */
-               struct {
-                       u32   send_cs_error                     :  1,
-                               receive_cs_error                :  1,
-                               send_accept_error               :  1,
-                               receive_accept_error            :  1,
-                               __reserved_1                    :  1,
-                               send_illegal_vector             :  1,
-                               receive_illegal_vector          :  1,
-                               illegal_register_address        :  1,
-                               __reserved_2                    : 24;
-                       u32 __reserved_3[3];
-               } error_bits;
-               struct {
-                       u32 errors;
-                       u32 __reserved_3[3];
-               } all_errors;
-       } esr;
-
-/*290*/        struct { u32 __reserved[4]; } __reserved_08;
-
-/*2A0*/        struct { u32 __reserved[4]; } __reserved_09;
-
-/*2B0*/        struct { u32 __reserved[4]; } __reserved_10;
-
-/*2C0*/        struct { u32 __reserved[4]; } __reserved_11;
-
-/*2D0*/        struct { u32 __reserved[4]; } __reserved_12;
-
-/*2E0*/        struct { u32 __reserved[4]; } __reserved_13;
-
-/*2F0*/        struct { u32 __reserved[4]; } __reserved_14;
-
-/*300*/        struct { /* Interrupt Command Register 1 */
-               u32   vector                    :  8,
-                       delivery_mode           :  3,
-                       destination_mode        :  1,
-                       delivery_status         :  1,
-                       __reserved_1            :  1,
-                       level                   :  1,
-                       trigger                 :  1,
-                       __reserved_2            :  2,
-                       shorthand               :  2,
-                       __reserved_3            :  12;
-               u32 __reserved_4[3];
-       } icr1;
-
-/*310*/        struct { /* Interrupt Command Register 2 */
-               union {
-                       u32   __reserved_1      : 24,
-                               phys_dest       :  4,
-                               __reserved_2    :  4;
-                       u32   __reserved_3      : 24,
-                               logical_dest    :  8;
-               } dest;
-               u32 __reserved_4[3];
-       } icr2;
-
-/*320*/        struct { /* LVT - Timer */
-               u32   vector            :  8,
-                       __reserved_1    :  4,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       timer_mode      :  1,
-                       __reserved_3    : 14;
-               u32 __reserved_4[3];
-       } lvt_timer;
-
-/*330*/        struct { /* LVT - Thermal Sensor */
-               u32  vector             :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_thermal;
-
-/*340*/        struct { /* LVT - Performance Counter */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_pc;
-
-/*350*/        struct { /* LVT - LINT0 */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       polarity        :  1,
-                       remote_irr      :  1,
-                       trigger         :  1,
-                       mask            :  1,
-                       __reserved_2    : 15;
-               u32 __reserved_3[3];
-       } lvt_lint0;
-
-/*360*/        struct { /* LVT - LINT1 */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       polarity        :  1,
-                       remote_irr      :  1,
-                       trigger         :  1,
-                       mask            :  1,
-                       __reserved_2    : 15;
-               u32 __reserved_3[3];
-       } lvt_lint1;
-
-/*370*/        struct { /* LVT - Error */
-               u32   vector            :  8,
-                       __reserved_1    :  4,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_error;
-
-/*380*/        struct { /* Timer Initial Count Register */
-               u32   initial_count;
-               u32 __reserved_2[3];
-       } timer_icr;
-
-/*390*/        const
-       struct { /* Timer Current Count Register */
-               u32   curr_count;
-               u32 __reserved_2[3];
-       } timer_ccr;
-
-/*3A0*/        struct { u32 __reserved[4]; } __reserved_16;
-
-/*3B0*/        struct { u32 __reserved[4]; } __reserved_17;
-
-/*3C0*/        struct { u32 __reserved[4]; } __reserved_18;
-
-/*3D0*/        struct { u32 __reserved[4]; } __reserved_19;
-
-/*3E0*/        struct { /* Timer Divide Configuration Register */
-               u32   divisor           :  4,
-                       __reserved_1    : 28;
-               u32 __reserved_2[3];
-       } timer_dcr;
-
-/*3F0*/        struct { u32 __reserved[4]; } __reserved_20;
-
-} __attribute__ ((packed));
-
-#undef u32
-
-#endif
diff --git a/include/asm-x86/apicdef_64.h b/include/asm-x86/apicdef_64.h
deleted file mode 100644 (file)
index 1dd4006..0000000
+++ /dev/null
@@ -1,392 +0,0 @@
-#ifndef __ASM_APICDEF_H
-#define __ASM_APICDEF_H
-
-/*
- * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
- *
- * Alan Cox <Alan.Cox@linux.org>, 1995.
- * Ingo Molnar <mingo@redhat.com>, 1999, 2000
- */
-
-#define                APIC_DEFAULT_PHYS_BASE  0xfee00000
-#define                APIC_ID         0x20
-#define                        APIC_ID_MASK            (0xFFu<<24)
-#define                        GET_APIC_ID(x)          (((x)>>24)&0xFFu)
-#define                        SET_APIC_ID(x)          (((x)<<24))
-#define                APIC_LVR        0x30
-#define                        APIC_LVR_MASK           0xFF00FF
-#define                        GET_APIC_VERSION(x)     ((x)&0xFFu)
-#define                        GET_APIC_MAXLVT(x)      (((x)>>16)&0xFFu)
-#define                        APIC_INTEGRATED(x)      ((x)&0xF0u)
-#define                APIC_TASKPRI    0x80
-#define                        APIC_TPRI_MASK          0xFFu
-#define                APIC_ARBPRI     0x90
-#define                        APIC_ARBPRI_MASK        0xFFu
-#define                APIC_PROCPRI    0xA0
-#define                APIC_EOI        0xB0
-#define                        APIC_EIO_ACK            0x0             /* Write this to the EOI register */
-#define                APIC_RRR        0xC0
-#define                APIC_LDR        0xD0
-#define                        APIC_LDR_MASK           (0xFFu<<24)
-#define                        GET_APIC_LOGICAL_ID(x)  (((x)>>24)&0xFFu)
-#define                        SET_APIC_LOGICAL_ID(x)  (((x)<<24))
-#define                        APIC_ALL_CPUS           0xFFu
-#define                APIC_DFR        0xE0
-#define                        APIC_DFR_CLUSTER                0x0FFFFFFFul
-#define                        APIC_DFR_FLAT                   0xFFFFFFFFul
-#define                APIC_SPIV       0xF0
-#define                        APIC_SPIV_FOCUS_DISABLED        (1<<9)
-#define                        APIC_SPIV_APIC_ENABLED          (1<<8)
-#define                APIC_ISR        0x100
-#define                APIC_ISR_NR     0x8     /* Number of 32 bit ISR registers. */
-#define                APIC_TMR        0x180
-#define        APIC_IRR        0x200
-#define        APIC_ESR        0x280
-#define                        APIC_ESR_SEND_CS        0x00001
-#define                        APIC_ESR_RECV_CS        0x00002
-#define                        APIC_ESR_SEND_ACC       0x00004
-#define                        APIC_ESR_RECV_ACC       0x00008
-#define                        APIC_ESR_SENDILL        0x00020
-#define                        APIC_ESR_RECVILL        0x00040
-#define                        APIC_ESR_ILLREGA        0x00080
-#define                APIC_ICR        0x300
-#define                        APIC_DEST_SELF          0x40000
-#define                        APIC_DEST_ALLINC        0x80000
-#define                        APIC_DEST_ALLBUT        0xC0000
-#define                        APIC_ICR_RR_MASK        0x30000
-#define                        APIC_ICR_RR_INVALID     0x00000
-#define                        APIC_ICR_RR_INPROG      0x10000
-#define                        APIC_ICR_RR_VALID       0x20000
-#define                        APIC_INT_LEVELTRIG      0x08000
-#define                        APIC_INT_ASSERT         0x04000
-#define                        APIC_ICR_BUSY           0x01000
-#define                        APIC_DEST_LOGICAL       0x00800
-#define                        APIC_DEST_PHYSICAL      0x00000
-#define                        APIC_DM_FIXED           0x00000
-#define                        APIC_DM_LOWEST          0x00100
-#define                        APIC_DM_SMI             0x00200
-#define                        APIC_DM_REMRD           0x00300
-#define                        APIC_DM_NMI             0x00400
-#define                        APIC_DM_INIT            0x00500
-#define                        APIC_DM_STARTUP         0x00600
-#define                        APIC_DM_EXTINT          0x00700
-#define                        APIC_VECTOR_MASK        0x000FF
-#define                APIC_ICR2       0x310
-#define                        GET_APIC_DEST_FIELD(x)  (((x)>>24)&0xFF)
-#define                        SET_APIC_DEST_FIELD(x)  ((x)<<24)
-#define                APIC_LVTT       0x320
-#define                APIC_LVTTHMR    0x330
-#define                APIC_LVTPC      0x340
-#define                APIC_LVT0       0x350
-#define                        APIC_LVT_TIMER_BASE_MASK        (0x3<<18)
-#define                        GET_APIC_TIMER_BASE(x)          (((x)>>18)&0x3)
-#define                        SET_APIC_TIMER_BASE(x)          (((x)<<18))
-#define                        APIC_TIMER_BASE_CLKIN           0x0
-#define                        APIC_TIMER_BASE_TMBASE          0x1
-#define                        APIC_TIMER_BASE_DIV             0x2
-#define                        APIC_LVT_TIMER_PERIODIC         (1<<17)
-#define                        APIC_LVT_MASKED                 (1<<16)
-#define                        APIC_LVT_LEVEL_TRIGGER          (1<<15)
-#define                        APIC_LVT_REMOTE_IRR             (1<<14)
-#define                        APIC_INPUT_POLARITY             (1<<13)
-#define                        APIC_SEND_PENDING               (1<<12)
-#define                        APIC_MODE_MASK                  0x700
-#define                        GET_APIC_DELIVERY_MODE(x)       (((x)>>8)&0x7)
-#define                        SET_APIC_DELIVERY_MODE(x,y)     (((x)&~0x700)|((y)<<8))
-#define                                APIC_MODE_FIXED         0x0
-#define                                APIC_MODE_NMI           0x4
-#define                                APIC_MODE_EXTINT        0x7
-#define        APIC_LVT1       0x360
-#define                APIC_LVTERR     0x370
-#define                APIC_TMICT      0x380
-#define                APIC_TMCCT      0x390
-#define                APIC_TDCR       0x3E0
-#define                        APIC_TDR_DIV_TMBASE     (1<<2)
-#define                        APIC_TDR_DIV_1          0xB
-#define                        APIC_TDR_DIV_2          0x0
-#define                        APIC_TDR_DIV_4          0x1
-#define                        APIC_TDR_DIV_8          0x2
-#define                        APIC_TDR_DIV_16         0x3
-#define                        APIC_TDR_DIV_32         0x8
-#define                        APIC_TDR_DIV_64         0x9
-#define                        APIC_TDR_DIV_128        0xA
-
-#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
-
-#define MAX_IO_APICS 128
-#define MAX_LOCAL_APIC 256
-
-/*
- * All x86-64 systems are xAPIC compatible.
- * In the following, "apicid" is a physical APIC ID.
- */
-#define XAPIC_DEST_CPUS_SHIFT  4
-#define XAPIC_DEST_CPUS_MASK   ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK        (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-#define APIC_CLUSTER(apicid)   ((apicid) & XAPIC_DEST_CLUSTER_MASK)
-#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
-#define APIC_CPUID(apicid)     ((apicid) & XAPIC_DEST_CPUS_MASK)
-#define NUM_APIC_CLUSTERS      ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
-
-/*
- * the local APIC register structure, memory mapped. Not terribly well
- * tested, but we might eventually use this one in the future - the
- * problem why we cannot use it right now is the P5 APIC, it has an
- * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
- */
-#define u32 unsigned int
-
-struct local_apic {
-
-/*000*/        struct { u32 __reserved[4]; } __reserved_01;
-
-/*010*/        struct { u32 __reserved[4]; } __reserved_02;
-
-/*020*/        struct { /* APIC ID Register */
-               u32   __reserved_1      : 24,
-                       phys_apic_id    :  4,
-                       __reserved_2    :  4;
-               u32 __reserved[3];
-       } id;
-
-/*030*/        const
-       struct { /* APIC Version Register */
-               u32   version           :  8,
-                       __reserved_1    :  8,
-                       max_lvt         :  8,
-                       __reserved_2    :  8;
-               u32 __reserved[3];
-       } version;
-
-/*040*/        struct { u32 __reserved[4]; } __reserved_03;
-
-/*050*/        struct { u32 __reserved[4]; } __reserved_04;
-
-/*060*/        struct { u32 __reserved[4]; } __reserved_05;
-
-/*070*/        struct { u32 __reserved[4]; } __reserved_06;
-
-/*080*/        struct { /* Task Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } tpr;
-
-/*090*/        const
-       struct { /* Arbitration Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } apr;
-
-/*0A0*/        const
-       struct { /* Processor Priority Register */
-               u32   priority  :  8,
-                       __reserved_1    : 24;
-               u32 __reserved_2[3];
-       } ppr;
-
-/*0B0*/        struct { /* End Of Interrupt Register */
-               u32   eoi;
-               u32 __reserved[3];
-       } eoi;
-
-/*0C0*/        struct { u32 __reserved[4]; } __reserved_07;
-
-/*0D0*/        struct { /* Logical Destination Register */
-               u32   __reserved_1      : 24,
-                       logical_dest    :  8;
-               u32 __reserved_2[3];
-       } ldr;
-
-/*0E0*/        struct { /* Destination Format Register */
-               u32   __reserved_1      : 28,
-                       model           :  4;
-               u32 __reserved_2[3];
-       } dfr;
-
-/*0F0*/        struct { /* Spurious Interrupt Vector Register */
-               u32     spurious_vector :  8,
-                       apic_enabled    :  1,
-                       focus_cpu       :  1,
-                       __reserved_2    : 22;
-               u32 __reserved_3[3];
-       } svr;
-
-/*100*/        struct { /* In Service Register */
-/*170*/                u32 bitfield;
-               u32 __reserved[3];
-       } isr [8];
-
-/*180*/        struct { /* Trigger Mode Register */
-/*1F0*/                u32 bitfield;
-               u32 __reserved[3];
-       } tmr [8];
-
-/*200*/        struct { /* Interrupt Request Register */
-/*270*/                u32 bitfield;
-               u32 __reserved[3];
-       } irr [8];
-
-/*280*/        union { /* Error Status Register */
-               struct {
-                       u32   send_cs_error                     :  1,
-                               receive_cs_error                :  1,
-                               send_accept_error               :  1,
-                               receive_accept_error            :  1,
-                               __reserved_1                    :  1,
-                               send_illegal_vector             :  1,
-                               receive_illegal_vector          :  1,
-                               illegal_register_address        :  1,
-                               __reserved_2                    : 24;
-                       u32 __reserved_3[3];
-               } error_bits;
-               struct {
-                       u32 errors;
-                       u32 __reserved_3[3];
-               } all_errors;
-       } esr;
-
-/*290*/        struct { u32 __reserved[4]; } __reserved_08;
-
-/*2A0*/        struct { u32 __reserved[4]; } __reserved_09;
-
-/*2B0*/        struct { u32 __reserved[4]; } __reserved_10;
-
-/*2C0*/        struct { u32 __reserved[4]; } __reserved_11;
-
-/*2D0*/        struct { u32 __reserved[4]; } __reserved_12;
-
-/*2E0*/        struct { u32 __reserved[4]; } __reserved_13;
-
-/*2F0*/        struct { u32 __reserved[4]; } __reserved_14;
-
-/*300*/        struct { /* Interrupt Command Register 1 */
-               u32   vector                    :  8,
-                       delivery_mode           :  3,
-                       destination_mode        :  1,
-                       delivery_status         :  1,
-                       __reserved_1            :  1,
-                       level                   :  1,
-                       trigger                 :  1,
-                       __reserved_2            :  2,
-                       shorthand               :  2,
-                       __reserved_3            :  12;
-               u32 __reserved_4[3];
-       } icr1;
-
-/*310*/        struct { /* Interrupt Command Register 2 */
-               union {
-                       u32   __reserved_1      : 24,
-                               phys_dest       :  4,
-                               __reserved_2    :  4;
-                       u32   __reserved_3      : 24,
-                               logical_dest    :  8;
-               } dest;
-               u32 __reserved_4[3];
-       } icr2;
-
-/*320*/        struct { /* LVT - Timer */
-               u32   vector            :  8,
-                       __reserved_1    :  4,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       timer_mode      :  1,
-                       __reserved_3    : 14;
-               u32 __reserved_4[3];
-       } lvt_timer;
-
-/*330*/        struct { /* LVT - Thermal Sensor */
-               u32  vector             :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_thermal;
-
-/*340*/        struct { /* LVT - Performance Counter */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_pc;
-
-/*350*/        struct { /* LVT - LINT0 */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       polarity        :  1,
-                       remote_irr      :  1,
-                       trigger         :  1,
-                       mask            :  1,
-                       __reserved_2    : 15;
-               u32 __reserved_3[3];
-       } lvt_lint0;
-
-/*360*/        struct { /* LVT - LINT1 */
-               u32   vector            :  8,
-                       delivery_mode   :  3,
-                       __reserved_1    :  1,
-                       delivery_status :  1,
-                       polarity        :  1,
-                       remote_irr      :  1,
-                       trigger         :  1,
-                       mask            :  1,
-                       __reserved_2    : 15;
-               u32 __reserved_3[3];
-       } lvt_lint1;
-
-/*370*/        struct { /* LVT - Error */
-               u32   vector            :  8,
-                       __reserved_1    :  4,
-                       delivery_status :  1,
-                       __reserved_2    :  3,
-                       mask            :  1,
-                       __reserved_3    : 15;
-               u32 __reserved_4[3];
-       } lvt_error;
-
-/*380*/        struct { /* Timer Initial Count Register */
-               u32   initial_count;
-               u32 __reserved_2[3];
-       } timer_icr;
-
-/*390*/        const
-       struct { /* Timer Current Count Register */
-               u32   curr_count;
-               u32 __reserved_2[3];
-       } timer_ccr;
-
-/*3A0*/        struct { u32 __reserved[4]; } __reserved_16;
-
-/*3B0*/        struct { u32 __reserved[4]; } __reserved_17;
-
-/*3C0*/        struct { u32 __reserved[4]; } __reserved_18;
-
-/*3D0*/        struct { u32 __reserved[4]; } __reserved_19;
-
-/*3E0*/        struct { /* Timer Divide Configuration Register */
-               u32   divisor           :  4,
-                       __reserved_1    : 28;
-               u32 __reserved_2[3];
-       } timer_dcr;
-
-/*3F0*/        struct { u32 __reserved[4]; } __reserved_20;
-
-} __attribute__ ((packed));
-
-#undef u32
-
-#define BAD_APICID 0xFFu
-
-#endif
index a8c1fca9726dbcb47c69b96fc203701f63c66589..768aee8a04ef85c5cf1c6ee1b22fa0ebcf3dc60d 100644 (file)
@@ -6,7 +6,7 @@
 /*
  *     linux/include/asm/arch_hooks.h
  *
- *     define the architecture specific hooks 
+ *     define the architecture specific hooks
  */
 
 /* these aren't arch hooks, they are generic routines
@@ -24,7 +24,4 @@ extern void trap_init_hook(void);
 extern void time_init_hook(void);
 extern void mca_nmi_hook(void);
 
-extern int setup_early_printk(char *);
-extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
-
 #endif
diff --git a/include/asm-x86/asm.h b/include/asm-x86/asm.h
new file mode 100644 (file)
index 0000000..1a6980a
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef _ASM_X86_ASM_H
+#define _ASM_X86_ASM_H
+
+#ifdef CONFIG_X86_32
+/* 32 bits */
+
+# define _ASM_PTR      " .long "
+# define _ASM_ALIGN    " .balign 4 "
+# define _ASM_MOV_UL   " movl "
+
+# define _ASM_INC      " incl "
+# define _ASM_DEC      " decl "
+# define _ASM_ADD      " addl "
+# define _ASM_SUB      " subl "
+# define _ASM_XADD     " xaddl "
+
+#else
+/* 64 bits */
+
+# define _ASM_PTR      " .quad "
+# define _ASM_ALIGN    " .balign 8 "
+# define _ASM_MOV_UL   " movq "
+
+# define _ASM_INC      " incq "
+# define _ASM_DEC      " decq "
+# define _ASM_ADD      " addq "
+# define _ASM_SUB      " subq "
+# define _ASM_XADD     " xaddq "
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_ASM_H */
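
Editorial sketch (not part of the patch): the point of the width-neutral _ASM_* strings above is that a single inline-asm template can emit either the 32-bit or the 64-bit form of an instruction. A minimal user-space illustration of the same idea, redefining only the one macro it needs, is shown below; the authoritative definitions are the ones in the header above.

#include <stdio.h>

#ifdef __i386__                         /* 32 bits */
# define _ASM_MOV_UL    " movl "
#else                                   /* 64 bits */
# define _ASM_MOV_UL    " movq "
#endif

/* Copy an unsigned long through a register with the width-neutral
 * mnemonic; the same template assembles correctly for both word sizes. */
static unsigned long copy_ul(unsigned long x)
{
        unsigned long y;

        asm(_ASM_MOV_UL "%1, %0" : "=r" (y) : "r" (x));
        return y;
}

int main(void)
{
        printf("%lu\n", copy_ul(42UL));
        return 0;
}
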
index 07e3f6d4fe47993b5bbfb833584ebd286498e949..1a23ce1a5697e03097be4365f44e422388e2d12c 100644 (file)
@@ -1,5 +1,321 @@
+#ifndef _ASM_X86_BITOPS_H
+#define _ASM_X86_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <linux/compiler.h>
+#include <asm/alternative.h>
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+   versions. */
+#define ADDR "=m" (*(volatile long *) addr)
+#else
+#define ADDR "+m" (*(volatile long *) addr)
+#endif
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void set_bit(int nr, volatile void *addr)
+{
+       asm volatile(LOCK_PREFIX "bts %1,%0"
+                    : ADDR
+                    : "Ir" (nr) : "memory");
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile void *addr)
+{
+       asm volatile("bts %1,%0"
+                    : ADDR
+                    : "Ir" (nr) : "memory");
+}
+
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered.  However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void clear_bit(int nr, volatile void *addr)
+{
+       asm volatile(LOCK_PREFIX "btr %1,%0"
+                    : ADDR
+                    : "Ir" (nr));
+}
+
+/*
+ * clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+static inline void clear_bit_unlock(unsigned nr, volatile void *addr)
+{
+       barrier();
+       clear_bit(nr, addr);
+}
+
+static inline void __clear_bit(int nr, volatile void *addr)
+{
+       asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+}
+
+/*
+ * __clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit() is non-atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock if no other CPUs can concurrently
+ * modify other bits in the word.
+ *
+ * No memory barrier is required here, because x86 cannot reorder stores past
+ * older loads. Same principle as spin_unlock.
+ */
+static inline void __clear_bit_unlock(unsigned nr, volatile void *addr)
+{
+       barrier();
+       __clear_bit(nr, addr);
+}
+
+#define smp_mb__before_clear_bit()     barrier()
+#define smp_mb__after_clear_bit()      barrier()
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile void *addr)
+{
+       asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile void *addr)
+{
+       asm volatile(LOCK_PREFIX "btc %1,%0"
+                    : ADDR : "Ir" (nr));
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit), ADDR
+                    : "Ir" (nr) : "memory");
+
+       return oldbit;
+}
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on x86.
+ */
+static inline int test_and_set_bit_lock(int nr, volatile void *addr)
+{
+       return test_and_set_bit(nr, addr);
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm("bts %2,%1\n\t"
+           "sbb %0,%0"
+           : "=r" (oldbit), ADDR
+           : "Ir" (nr));
+       return oldbit;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_clear_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit), ADDR
+                    : "Ir" (nr) : "memory");
+
+       return oldbit;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail.  You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm volatile("btr %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit), ADDR
+                    : "Ir" (nr));
+       return oldbit;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm volatile("btc %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit), ADDR
+                    : "Ir" (nr) : "memory");
+
+       return oldbit;
+}
+
+/**
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_change_bit(int nr, volatile void *addr)
+{
+       int oldbit;
+
+       asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit), ADDR
+                    : "Ir" (nr) : "memory");
+
+       return oldbit;
+}
+
+static inline int constant_test_bit(int nr, const volatile void *addr)
+{
+       return ((1UL << (nr % BITS_PER_LONG)) &
+               (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline int variable_test_bit(int nr, volatile const void *addr)
+{
+       int oldbit;
+
+       asm volatile("bt %2,%1\n\t"
+                    "sbb %0,%0"
+                    : "=r" (oldbit)
+                    : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+       return oldbit;
+}
+
+#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int test_bit(int nr, const volatile unsigned long *addr);
+#endif
+
+#define test_bit(nr,addr)                      \
+       (__builtin_constant_p(nr) ?             \
+        constant_test_bit((nr),(addr)) :       \
+        variable_test_bit((nr),(addr)))
+
+#undef ADDR
+
 #ifdef CONFIG_X86_32
 # include "bitops_32.h"
 #else
 # include "bitops_64.h"
 #endif
+
+#endif /* _ASM_X86_BITOPS_H */
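
Editorial sketch (not part of the patch): the comments above define the contract of the unified bitops, namely that bit numbers are long-relative and may run past the first word, and that the test_and_* forms return the previous value of the bit. The user-space stand-in below reproduces those semantics with GCC atomic builtins instead of the lock-prefixed bts assembly; it is only an illustration, not the kernel implementation.

#include <stdio.h>

#define EX_BITS_PER_LONG        (8 * sizeof(unsigned long))

/* Stand-in for set_bit(): atomically OR in the chosen bit. */
static void sketch_set_bit(unsigned long nr, unsigned long *addr)
{
        __atomic_fetch_or(&addr[nr / EX_BITS_PER_LONG],
                          1UL << (nr % EX_BITS_PER_LONG), __ATOMIC_SEQ_CST);
}

/* Stand-in for test_and_set_bit(): set the bit, report its old value. */
static int sketch_test_and_set_bit(unsigned long nr, unsigned long *addr)
{
        unsigned long mask = 1UL << (nr % EX_BITS_PER_LONG);
        unsigned long old  = __atomic_fetch_or(&addr[nr / EX_BITS_PER_LONG],
                                               mask, __ATOMIC_SEQ_CST);

        return (old & mask) != 0;
}

int main(void)
{
        unsigned long bitmap[2] = { 0, 0 };

        sketch_set_bit(3, bitmap);
        sketch_set_bit(65, bitmap);             /* bit numbers may cross words */
        printf("%d %d\n",
               sketch_test_and_set_bit(3, bitmap),      /* 1: already set */
               sketch_test_and_set_bit(7, bitmap));     /* 0: newly set   */
        return 0;
}
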
index 0b40f6d20bea207081fd739f8eae5b9bb3ac12f4..e4d75fcf9c03000d6e4fcb1e92a35aef59f90884 100644 (file)
  * Copyright 1992, Linus Torvalds.
  */
 
-#ifndef _LINUX_BITOPS_H
-#error only <linux/bitops.h> can be included directly
-#endif
-
-#include <linux/compiler.h>
-#include <asm/alternative.h>
-
-/*
- * These have to be done with inline assembly: that way the bit-setting
- * is guaranteed to be atomic. All bit operations return 0 if the bit
- * was cleared before the operation and != 0 if it was not.
- *
- * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
- */
-
-#define ADDR (*(volatile long *) addr)
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note: there are no guarantees that this function will not be reordered
- * on non x86 architectures, so if you are writing portable code,
- * make sure not to rely on its reordering guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void set_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btsl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/**
- * __set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __set_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__(
-               "btsl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btrl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/*
- * clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and implies release semantics before the memory
- * operation. It can be used for an unlock.
- */
-static inline void clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
-{
-       barrier();
-       clear_bit(nr, addr);
-}
-
-static inline void __clear_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__ __volatile__(
-               "btrl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/*
- * __clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * __clear_bit() is non-atomic and implies release semantics before the memory
- * operation. It can be used for an unlock if no other CPUs can concurrently
- * modify other bits in the word.
- *
- * No memory barrier is required here, because x86 cannot reorder stores past
- * older loads. Same principle as spin_unlock.
- */
-static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
-{
-       barrier();
-       __clear_bit(nr, addr);
-}
-
-#define smp_mb__before_clear_bit()     barrier()
-#define smp_mb__after_clear_bit()      barrier()
-
-/**
- * __change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __change_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__ __volatile__(
-               "btcl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered. It may be
- * reordered on other architectures than x86.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile unsigned long * addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btcl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It may be reordered on other architectures than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile unsigned long * addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * test_and_set_bit_lock - Set a bit and return its old value for lock
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This is the same as test_and_set_bit on x86.
- */
-static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
-{
-       return test_and_set_bit(nr, addr);
-}
-
-/**
- * __test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_set_bit(int nr, volatile unsigned long * addr)
-{
-       int oldbit;
-
-       __asm__(
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr));
-       return oldbit;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It can be reordered on architectures other than x86.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile unsigned long * addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
-{
-       int oldbit;
-
-       __asm__(
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr));
-       return oldbit;
-}
-
-/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__(
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
- */
-static inline int test_and_change_bit(int nr, volatile unsigned long* addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-       return oldbit;
-}
-
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
-/**
- * test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int test_bit(int nr, const volatile void * addr);
-#endif
-
-static __always_inline int constant_test_bit(int nr, const volatile unsigned long *addr)
-{
-       return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
-}
-
-static inline int variable_test_bit(int nr, const volatile unsigned long * addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__(
-               "btl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit)
-               :"m" (ADDR),"Ir" (nr));
-       return oldbit;
-}
-
-#define test_bit(nr,addr) \
-(__builtin_constant_p(nr) ? \
- constant_test_bit((nr),(addr)) : \
- variable_test_bit((nr),(addr)))
-
-#undef ADDR
-
 /**
  * find_first_zero_bit - find the first zero bit in a memory region
  * @addr: The address to start the search at
  * @size: The maximum size to search
  *
- * Returns the bit-number of the first zero bit, not the number of the byte
+ * Returns the bit number of the first zero bit, not the number of the byte
  * containing a bit.
  */
 static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
@@ -348,7 +40,7 @@ static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
 /**
  * find_next_zero_bit - find the first zero bit in a memory region
  * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
+ * @offset: The bit number to start searching at
  * @size: The maximum size to search
  */
 int find_next_zero_bit(const unsigned long *addr, int size, int offset);
@@ -372,7 +64,7 @@ static inline unsigned long __ffs(unsigned long word)
  * @addr: The address to start the search at
  * @size: The maximum size to search
  *
- * Returns the bit-number of the first set bit, not the number of the byte
+ * Returns the bit number of the first set bit, not the number of the byte
  * containing a bit.
  */
 static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
@@ -391,7 +83,7 @@ static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
 /**
  * find_next_bit - find the first set bit in a memory region
  * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
+ * @offset: The bit number to start searching at
  * @size: The maximum size to search
  */
 int find_next_bit(const unsigned long *addr, int size, int offset);
@@ -460,10 +152,10 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/ext2-non-atomic.h>
 
-#define ext2_set_bit_atomic(lock,nr,addr) \
-        test_and_set_bit((nr),(unsigned long*)addr)
-#define ext2_clear_bit_atomic(lock,nr, addr) \
-               test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr, addr) \
+               test_and_set_bit((nr), (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+               test_and_clear_bit((nr), (unsigned long *)addr)
 
 #include <asm-generic/bitops/minix.h>
 
index 766bcc0470a612f7476909a7b4daba1d5951e0b3..48adbf56ca60c9f2d7da5954e3245188d6f8a2ea 100644 (file)
@@ -5,303 +5,6 @@
  * Copyright 1992, Linus Torvalds.
  */
 
-#ifndef _LINUX_BITOPS_H
-#error only <linux/bitops.h> can be included directly
-#endif
-
-#include <asm/alternative.h>
-
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
-/* Technically wrong, but this avoids compilation errors on some gcc
-   versions. */
-#define ADDR "=m" (*(volatile long *) addr)
-#else
-#define ADDR "+m" (*(volatile long *) addr)
-#endif
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void set_bit(int nr, volatile void *addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btsl %1,%0"
-               :ADDR
-               :"dIr" (nr) : "memory");
-}
-
-/**
- * __set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __set_bit(int nr, volatile void *addr)
-{
-       __asm__ volatile(
-               "btsl %1,%0"
-               :ADDR
-               :"dIr" (nr) : "memory");
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile void *addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btrl %1,%0"
-               :ADDR
-               :"dIr" (nr));
-}
-
-/*
- * clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and implies release semantics before the memory
- * operation. It can be used for an unlock.
- */
-static inline void clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
-{
-       barrier();
-       clear_bit(nr, addr);
-}
-
-static inline void __clear_bit(int nr, volatile void *addr)
-{
-       __asm__ __volatile__(
-               "btrl %1,%0"
-               :ADDR
-               :"dIr" (nr));
-}
-
-/*
- * __clear_bit_unlock - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * __clear_bit() is non-atomic and implies release semantics before the memory
- * operation. It can be used for an unlock if no other CPUs can concurrently
- * modify other bits in the word.
- *
- * No memory barrier is required here, because x86 cannot reorder stores past
- * older loads. Same principle as spin_unlock.
- */
-static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *addr)
-{
-       barrier();
-       __clear_bit(nr, addr);
-}
-
-#define smp_mb__before_clear_bit()     barrier()
-#define smp_mb__after_clear_bit()      barrier()
-
-/**
- * __change_bit - Toggle a bit in memory
- * @nr: the bit to change
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static inline void __change_bit(int nr, volatile void *addr)
-{
-       __asm__ __volatile__(
-               "btcl %1,%0"
-               :ADDR
-               :"dIr" (nr));
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile void *addr)
-{
-       __asm__ __volatile__( LOCK_PREFIX
-               "btcl %1,%0"
-               :ADDR
-               :"dIr" (nr));
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * test_and_set_bit_lock - Set a bit and return its old value for lock
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This is the same as test_and_set_bit on x86.
- */
-static inline int test_and_set_bit_lock(int nr, volatile void *addr)
-{
-       return test_and_set_bit(nr, addr);
-}
-
-/**
- * __test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_set_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__(
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr));
-       return oldbit;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * __test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static inline int __test_and_clear_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__(
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr));
-       return oldbit;
-}
-
-/* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__(
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr) : "memory");
-       return oldbit;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies a memory barrier.
- */
-static inline int test_and_change_bit(int nr, volatile void *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__( LOCK_PREFIX
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),ADDR
-               :"dIr" (nr) : "memory");
-       return oldbit;
-}
-
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
-/**
- * test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int test_bit(int nr, const volatile void *addr);
-#endif
-
-static inline int constant_test_bit(int nr, const volatile void *addr)
-{
-       return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
-}
-
-static inline int variable_test_bit(int nr, volatile const void *addr)
-{
-       int oldbit;
-
-       __asm__ __volatile__(
-               "btl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit)
-               :"m" (*(volatile long *)addr),"dIr" (nr));
-       return oldbit;
-}
-
-#define test_bit(nr,addr) \
-(__builtin_constant_p(nr) ? \
- constant_test_bit((nr),(addr)) : \
- variable_test_bit((nr),(addr)))
-
-#undef ADDR
-
 extern long find_first_zero_bit(const unsigned long *addr, unsigned long size);
 extern long find_next_zero_bit(const unsigned long *addr, long size, long offset);
 extern long find_first_bit(const unsigned long *addr, unsigned long size);
index 19f3ddf2df4be745426a0990770d30b591b3276e..51151356840fcd5e099c84bb4c7e1ab623d47e77 100644 (file)
@@ -54,13 +54,14 @@ struct sys_desc_table {
 };
 
 struct efi_info {
-       __u32 _pad1;
+       __u32 efi_loader_signature;
        __u32 efi_systab;
        __u32 efi_memdesc_size;
        __u32 efi_memdesc_version;
        __u32 efi_memmap;
        __u32 efi_memmap_size;
-       __u32 _pad2[2];
+       __u32 efi_systab_hi;
+       __u32 efi_memmap_hi;
 };
 
 /* The so-called "zeropage" */
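
Editorial note on the hunk above, not part of the patch: the old padding words now carry the upper halves of the EFI system-table and memory-map pointers, so a 64-bit consumer reassembles the full address from the two 32-bit fields. A self-contained sketch of that reassembly (struct name and values invented for the example, field names matching the diff):

#include <stdint.h>
#include <stdio.h>

struct efi_info_sketch {
        uint32_t efi_loader_signature;
        uint32_t efi_systab;
        uint32_t efi_memdesc_size;
        uint32_t efi_memdesc_version;
        uint32_t efi_memmap;
        uint32_t efi_memmap_size;
        uint32_t efi_systab_hi;
        uint32_t efi_memmap_hi;
};

int main(void)
{
        struct efi_info_sketch e = {
                .efi_systab    = 0x89abcdef,
                .efi_systab_hi = 0x1,
        };
        /* combine low and high halves into the 64-bit physical address */
        uint64_t systab = ((uint64_t)e.efi_systab_hi << 32) | e.efi_systab;

        printf("systab = %#llx\n", (unsigned long long)systab);
        return 0;
}
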
index fd8bdc639c48f44809497a9927e3479dae37b13e..8d477a201392ec7c7fed8b4bbd0418c8e3884d31 100644 (file)
@@ -33,9 +33,6 @@
        } while(0)
 #endif
 
-void out_of_line_bug(void);
-#else /* CONFIG_BUG */
-static inline void out_of_line_bug(void) { }
 #endif /* !CONFIG_BUG */
 
 #include <asm-generic/bug.h>
index aac8317420af989800fdf3097a797bc4c2ba6f59..3fcc30dc07314ce0fc7460dfb956d260bf602829 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_BUGS_H
 #define _ASM_X86_BUGS_H
 
-void check_bugs(void);
+extern void check_bugs(void);
+extern int ppro_with_ram_bug(void);
 
 #endif /* _ASM_X86_BUGS_H */
index 9411a2d3f19c2a77feed8d141ddc7c3bd9f13399..8dd8c5e3cc7fca8dabc802662d1acd0263315366 100644 (file)
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
        memcpy(dst, src, len)
 
-void global_flush_tlb(void);
-int change_page_attr(struct page *page, int numpages, pgprot_t prot);
-int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot);
-void clflush_cache_range(void *addr, int size);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-/* internal debugging function */
-void kernel_map_pages(struct page *page, int numpages, int enable);
-#endif
+int __deprecated_for_modules change_page_attr(struct page *page, int numpages,
+                                                               pgprot_t prot);
+
+int set_pages_uc(struct page *page, int numpages);
+int set_pages_wb(struct page *page, int numpages);
+int set_pages_x(struct page *page, int numpages);
+int set_pages_nx(struct page *page, int numpages);
+int set_pages_ro(struct page *page, int numpages);
+int set_pages_rw(struct page *page, int numpages);
+
+int set_memory_uc(unsigned long addr, int numpages);
+int set_memory_wb(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_np(unsigned long addr, int numpages);
+
+void clflush_cache_range(void *addr, unsigned int size);
 
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 #endif
+#ifdef CONFIG_DEBUG_RODATA_TEST
+void rodata_test(void);
+#else
+static inline void rodata_test(void)
+{
+}
+#endif
 
 #endif
index 6f4f63af96e1a661f215895e502155f402b40678..f13e62e2cb3e75160297459ef182921b170d0850 100644 (file)
-/* 
+/*
  * Some macros to handle stack frames in assembly.
- */ 
+ */
 
+#define R15              0
+#define R14              8
+#define R13             16
+#define R12             24
+#define RBP             32
+#define RBX             40
 
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
 /* arguments: interrupts/non-tracing syscalls only save up to here */
-#define R11 48
-#define R10 56 
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120       /* + error_code */ 
-/* end of arguments */         
+#define R11             48
+#define R10             56
+#define R9              64
+#define R8              72
+#define RAX             80
+#define RCX             88
+#define RDX             96
+#define RSI            104
+#define RDI            112
+#define ORIG_RAX       120       /* + error_code */
+/* end of arguments */
+
 /* cpu exception frame or undefined in case of fast syscall. */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-#define ARGOFFSET R11
-#define SWFRAME ORIG_RAX
+#define RIP            128
+#define CS             136
+#define EFLAGS         144
+#define RSP            152
+#define SS             160
+
+#define ARGOFFSET      R11
+#define SWFRAME                ORIG_RAX
 
-       .macro SAVE_ARGS addskip=0,norcx=0,nor891011=0
-       subq  $9*8+\addskip,%rsp
+       .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+       subq  $9*8+\addskip, %rsp
        CFI_ADJUST_CFA_OFFSET   9*8+\addskip
-       movq  %rdi,8*8(%rsp) 
-       CFI_REL_OFFSET  rdi,8*8
-       movq  %rsi,7*8(%rsp) 
-       CFI_REL_OFFSET  rsi,7*8
-       movq  %rdx,6*8(%rsp)
-       CFI_REL_OFFSET  rdx,6*8
+       movq  %rdi, 8*8(%rsp)
+       CFI_REL_OFFSET  rdi, 8*8
+       movq  %rsi, 7*8(%rsp)
+       CFI_REL_OFFSET  rsi, 7*8
+       movq  %rdx, 6*8(%rsp)
+       CFI_REL_OFFSET  rdx, 6*8
        .if \norcx
        .else
-       movq  %rcx,5*8(%rsp)
-       CFI_REL_OFFSET  rcx,5*8
+       movq  %rcx, 5*8(%rsp)
+       CFI_REL_OFFSET  rcx, 5*8
        .endif
-       movq  %rax,4*8(%rsp) 
-       CFI_REL_OFFSET  rax,4*8
+       movq  %rax, 4*8(%rsp)
+       CFI_REL_OFFSET  rax, 4*8
        .if \nor891011
        .else
-       movq  %r8,3*8(%rsp) 
-       CFI_REL_OFFSET  r8,3*8
-       movq  %r9,2*8(%rsp) 
-       CFI_REL_OFFSET  r9,2*8
-       movq  %r10,1*8(%rsp) 
-       CFI_REL_OFFSET  r10,1*8
-       movq  %r11,(%rsp) 
-       CFI_REL_OFFSET  r11,0*8
+       movq  %r8, 3*8(%rsp)
+       CFI_REL_OFFSET  r8,  3*8
+       movq  %r9, 2*8(%rsp)
+       CFI_REL_OFFSET  r9,  2*8
+       movq  %r10, 1*8(%rsp)
+       CFI_REL_OFFSET  r10, 1*8
+       movq  %r11, (%rsp)
+       CFI_REL_OFFSET  r11, 0*8
        .endif
        .endm
 
-#define ARG_SKIP 9*8
-       .macro RESTORE_ARGS skiprax=0,addskip=0,skiprcx=0,skipr11=0,skipr8910=0,skiprdx=0
+#define ARG_SKIP       9*8
+
+       .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
+                           skipr8910=0, skiprdx=0
        .if \skipr11
        .else
-       movq (%rsp),%r11
+       movq (%rsp), %r11
        CFI_RESTORE r11
        .endif
        .if \skipr8910
        .else
-       movq 1*8(%rsp),%r10
+       movq 1*8(%rsp), %r10
        CFI_RESTORE r10
-       movq 2*8(%rsp),%r9
+       movq 2*8(%rsp), %r9
        CFI_RESTORE r9
-       movq 3*8(%rsp),%r8
+       movq 3*8(%rsp), %r8
        CFI_RESTORE r8
        .endif
        .if \skiprax
        .else
-       movq 4*8(%rsp),%rax
+       movq 4*8(%rsp), %rax
        CFI_RESTORE rax
        .endif
        .if \skiprcx
        .else
-       movq 5*8(%rsp),%rcx
+       movq 5*8(%rsp), %rcx
        CFI_RESTORE rcx
        .endif
        .if \skiprdx
        .else
-       movq 6*8(%rsp),%rdx
+       movq 6*8(%rsp), %rdx
        CFI_RESTORE rdx
        .endif
-       movq 7*8(%rsp),%rsi
+       movq 7*8(%rsp), %rsi
        CFI_RESTORE rsi
-       movq 8*8(%rsp),%rdi
+       movq 8*8(%rsp), %rdi
        CFI_RESTORE rdi
        .if ARG_SKIP+\addskip > 0
-       addq $ARG_SKIP+\addskip,%rsp
+       addq $ARG_SKIP+\addskip, %rsp
        CFI_ADJUST_CFA_OFFSET   -(ARG_SKIP+\addskip)
        .endif
-       .endm   
+       .endm
 
        .macro LOAD_ARGS offset
-       movq \offset(%rsp),%r11
-       movq \offset+8(%rsp),%r10
-       movq \offset+16(%rsp),%r9
-       movq \offset+24(%rsp),%r8
-       movq \offset+40(%rsp),%rcx
-       movq \offset+48(%rsp),%rdx
-       movq \offset+56(%rsp),%rsi
-       movq \offset+64(%rsp),%rdi
-       movq \offset+72(%rsp),%rax
+       movq \offset(%rsp),    %r11
+       movq \offset+8(%rsp),  %r10
+       movq \offset+16(%rsp), %r9
+       movq \offset+24(%rsp), %r8
+       movq \offset+40(%rsp), %rcx
+       movq \offset+48(%rsp), %rdx
+       movq \offset+56(%rsp), %rsi
+       movq \offset+64(%rsp), %rdi
+       movq \offset+72(%rsp), %rax
        .endm
-                       
-#define REST_SKIP 6*8                  
+
+#define REST_SKIP      6*8
+
        .macro SAVE_REST
-       subq $REST_SKIP,%rsp
+       subq $REST_SKIP, %rsp
        CFI_ADJUST_CFA_OFFSET   REST_SKIP
-       movq %rbx,5*8(%rsp) 
-       CFI_REL_OFFSET  rbx,5*8
-       movq %rbp,4*8(%rsp) 
-       CFI_REL_OFFSET  rbp,4*8
-       movq %r12,3*8(%rsp) 
-       CFI_REL_OFFSET  r12,3*8
-       movq %r13,2*8(%rsp) 
-       CFI_REL_OFFSET  r13,2*8
-       movq %r14,1*8(%rsp) 
-       CFI_REL_OFFSET  r14,1*8
-       movq %r15,(%rsp) 
-       CFI_REL_OFFSET  r15,0*8
-       .endm           
+       movq %rbx, 5*8(%rsp)
+       CFI_REL_OFFSET  rbx, 5*8
+       movq %rbp, 4*8(%rsp)
+       CFI_REL_OFFSET  rbp, 4*8
+       movq %r12, 3*8(%rsp)
+       CFI_REL_OFFSET  r12, 3*8
+       movq %r13, 2*8(%rsp)
+       CFI_REL_OFFSET  r13, 2*8
+       movq %r14, 1*8(%rsp)
+       CFI_REL_OFFSET  r14, 1*8
+       movq %r15, (%rsp)
+       CFI_REL_OFFSET  r15, 0*8
+       .endm
 
        .macro RESTORE_REST
-       movq (%rsp),%r15
+       movq (%rsp),     %r15
        CFI_RESTORE r15
-       movq 1*8(%rsp),%r14
+       movq 1*8(%rsp),  %r14
        CFI_RESTORE r14
-       movq 2*8(%rsp),%r13
+       movq 2*8(%rsp),  %r13
        CFI_RESTORE r13
-       movq 3*8(%rsp),%r12
+       movq 3*8(%rsp),  %r12
        CFI_RESTORE r12
-       movq 4*8(%rsp),%rbp
+       movq 4*8(%rsp),  %rbp
        CFI_RESTORE rbp
-       movq 5*8(%rsp),%rbx
+       movq 5*8(%rsp),  %rbx
        CFI_RESTORE rbx
-       addq $REST_SKIP,%rsp
+       addq $REST_SKIP, %rsp
        CFI_ADJUST_CFA_OFFSET   -(REST_SKIP)
        .endm
-               
+
        .macro SAVE_ALL
        SAVE_ARGS
        SAVE_REST
        .endm
-               
+
        .macro RESTORE_ALL addskip=0
        RESTORE_REST
-       RESTORE_ARGS 0,\addskip
+       RESTORE_ARGS 0, \addskip
        .endm
 
        .macro icebp
        .byte 0xf1
        .endm
+
index 419fe88a0342d383962571165f2e2ec04f4d23d3..e5f79997decce1c2635b77301968594bebba0449 100644 (file)
@@ -4,7 +4,7 @@
 /* 
  * Checksums for x86-64 
  * Copyright 2002 by Andi Kleen, SuSE Labs 
- * with some code from asm-i386/checksum.h
+ * with some code from asm-x86/checksum.h
  */ 
 
 #include <linux/compiler.h>
index f86ede28f6dc84750ab60250cc7d122534d9c74a..cea1dae288a74b72e567d45ef963f3d5a271eb78 100644 (file)
@@ -105,15 +105,24 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 
 #ifdef CONFIG_X86_CMPXCHG
 #define __HAVE_ARCH_CMPXCHG 1
-#define cmpxchg(ptr,o,n)\
-       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
-                                       (unsigned long)(n),sizeof(*(ptr))))
-#define sync_cmpxchg(ptr,o,n)\
-       ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
-                                       (unsigned long)(n),sizeof(*(ptr))))
-#define cmpxchg_local(ptr,o,n)\
-       ((__typeof__(*(ptr)))__cmpxchg_local((ptr),(unsigned long)(o),\
-                                       (unsigned long)(n),sizeof(*(ptr))))
+#define cmpxchg(ptr, o, n)                                                  \
+       ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o),            \
+                                       (unsigned long)(n), sizeof(*(ptr))))
+#define sync_cmpxchg(ptr, o, n)                                                     \
+       ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o),       \
+                                       (unsigned long)(n), sizeof(*(ptr))))
+#define cmpxchg_local(ptr, o, n)                                            \
+       ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o),      \
+                                       (unsigned long)(n), sizeof(*(ptr))))
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr, o, n)                                                 \
+       ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o),      \
+                                       (unsigned long long)(n)))
+#define cmpxchg64_local(ptr, o, n)                                           \
+       ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o),\
+                                       (unsigned long long)(n)))
 #endif
 
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -203,6 +212,34 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
        return old;
 }
 
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+                       unsigned long long old, unsigned long long new)
+{
+       unsigned long long prev;
+       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
+                            : "=A"(prev)
+                            : "b"((unsigned long)new),
+                              "c"((unsigned long)(new >> 32)),
+                              "m"(*__xg(ptr)),
+                              "0"(old)
+                            : "memory");
+       return prev;
+}
+
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+                       unsigned long long old, unsigned long long new)
+{
+       unsigned long long prev;
+       __asm__ __volatile__("cmpxchg8b %3"
+                            : "=A"(prev)
+                            : "b"((unsigned long)new),
+                              "c"((unsigned long)(new >> 32)),
+                              "m"(*__xg(ptr)),
+                              "0"(old)
+                            : "memory");
+       return prev;
+}
+
 #ifndef CONFIG_X86_CMPXCHG
 /*
  * Building a kernel capable of running on 80386. It may be necessary to
@@ -228,7 +265,7 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
        return old;
 }
 
-#define cmpxchg(ptr,o,n)                                               \
+#define cmpxchg(ptr, o, n)                                             \
 ({                                                                     \
        __typeof__(*(ptr)) __ret;                                       \
        if (likely(boot_cpu_data.x86 > 3))                              \
@@ -239,7 +276,7 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
                                        (unsigned long)(n), sizeof(*(ptr))); \
        __ret;                                                          \
 })
-#define cmpxchg_local(ptr,o,n)                                         \
+#define cmpxchg_local(ptr, o, n)                                       \
 ({                                                                     \
        __typeof__(*(ptr)) __ret;                                       \
        if (likely(boot_cpu_data.x86 > 3))                              \
@@ -252,38 +289,37 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
 })
 #endif
 
-static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
-                                     unsigned long long new)
-{
-       unsigned long long prev;
-       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-                            : "=A"(prev)
-                            : "b"((unsigned long)new),
-                              "c"((unsigned long)(new >> 32)),
-                              "m"(*__xg(ptr)),
-                              "0"(old)
-                            : "memory");
-       return prev;
-}
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable of running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
 
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
-                       unsigned long long old, unsigned long long new)
-{
-       unsigned long long prev;
-       __asm__ __volatile__("cmpxchg8b %3"
-                            : "=A"(prev)
-                            : "b"((unsigned long)new),
-                              "c"((unsigned long)(new >> 32)),
-                              "m"(*__xg(ptr)),
-                              "0"(old)
-                            : "memory");
-       return prev;
-}
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
+
+#define cmpxchg64(ptr, o, n)                                           \
+({                                                                     \
+       __typeof__(*(ptr)) __ret;                                       \
+       if (likely(boot_cpu_data.x86 > 4))                              \
+               __ret = __cmpxchg64((ptr), (unsigned long long)(o),     \
+                               (unsigned long long)(n));               \
+       else                                                            \
+               __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
+                               (unsigned long long)(n));               \
+       __ret;                                                          \
+})
+#define cmpxchg64_local(ptr, o, n)                                     \
+({                                                                     \
+       __typeof__(*(ptr)) __ret;                                       \
+       if (likely(boot_cpu_data.x86 > 4))                              \
+               __ret = __cmpxchg64_local((ptr), (unsigned long long)(o), \
+                               (unsigned long long)(n));               \
+       else                                                            \
+               __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
+                               (unsigned long long)(n));               \
+       __ret;                                                          \
+})
+
+#endif
 
-#define cmpxchg64(ptr,o,n)\
-       ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
-                                       (unsigned long long)(n)))
-#define cmpxchg64_local(ptr,o,n)\
-       ((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
-                                       (unsigned long long)(n)))
 #endif
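
Editorial sketch, not part of the patch: whichever branch the macros above take (lock cmpxchg, lock cmpxchg8b, or the cmpxchg_486_u64() software fallback), callers use them in the same read-modify-retry pattern. The user-space sketch below shows that pattern with __sync_val_compare_and_swap standing in for cmpxchg64().

#include <stdint.h>
#include <stdio.h>

/* Classic compare-and-swap retry loop, as a cmpxchg() caller would write it. */
static void add_u64(uint64_t *p, uint64_t delta)
{
        uint64_t old, new;

        do {
                old = *p;               /* snapshot the current value          */
                new = old + delta;      /* compute the value we want to store  */
                /* the store succeeds only if *p still equals the snapshot */
        } while (__sync_val_compare_and_swap(p, old, new) != old);
}

int main(void)
{
        uint64_t counter = 0;

        add_u64(&counter, 5);
        add_u64(&counter, 7);
        printf("%llu\n", (unsigned long long)counter);
        return 0;
}
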
index 66ba7987184ae1dfae314010a321a4908b4b4e55..b270ee04959ea7ce3fa73cf4935f39d9b7e0aa8b 100644 (file)
@@ -207,7 +207,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 static __inline__ void __user *compat_alloc_user_space(long len)
 {
        struct pt_regs *regs = task_pt_regs(current);
-       return (void __user *)regs->rsp - len; 
+       return (void __user *)regs->sp - len;
 }
 
 static inline int is_compat_task(void)
index b1bc7b1b64b0e304d13f907e4d5cd607417bdfee..85ece5f10e9ee9b69807e32091114cedb32298fc 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
-struct i386_cpu {
+struct x86_cpu {
        struct cpu cpu;
 };
 extern int arch_register_cpu(int num);
index b7160a4598d74e4db6da761e08abff080d59c1e0..3fb7dfa7fc915b66f827c722ec31efc70cd63810 100644 (file)
@@ -1,5 +1,207 @@
-#ifdef CONFIG_X86_32
-# include "cpufeature_32.h"
+/*
+ * Defines x86 CPU feature bits
+ */
+#ifndef _ASM_X86_CPUFEATURE_H
+#define _ASM_X86_CPUFEATURE_H
+
+#ifndef __ASSEMBLY__
+#include <linux/bitops.h>
+#endif
+#include <asm/required-features.h>
+
+#define NCAPINTS       8       /* N 32-bit words worth of info */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU                (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME                (0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE         (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE                (0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
+#define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR       (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE                (0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA                (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV       (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
+#define X86_FEATURE_PAT                (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36      (0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN         (0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLSH     (0*32+19) /* Supports the CLFLUSH instruction */
+#define X86_FEATURE_DS         (0*32+21) /* Debug Store */
+#define X86_FEATURE_ACPI       (0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX                (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR       (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
+                                         /* of FPU context), and CR4.OSFXSR available */
+#define X86_FEATURE_XMM                (0*32+25) /* Streaming SIMD Extensions */
+#define X86_FEATURE_XMM2       (0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_SELFSNOOP  (0*32+27) /* CPU self snoop */
+#define X86_FEATURE_HT         (0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC                (0*32+29) /* Automatic clock control */
+#define X86_FEATURE_IA64       (0*32+30) /* IA-64 processor */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL    (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP         (1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX         (1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT     (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_RDTSCP     (1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM         (1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT   (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW      (1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY   (2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN    (2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI       (2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX      (3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR    (3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR  (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR        (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8         (3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_K7         (3*32+ 5) /* Athlon */
+#define X86_FEATURE_P3         (3*32+ 6) /* P3 */
+#define X86_FEATURE_P4         (3*32+ 7) /* P4 */
+#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP         (3*32+ 9) /* smp kernel running on up */
+#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS       (3*32+12)  /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS                (3*32+13)  /* Branch Trace Store */
+/* 14 free */
+/* 15 free */
+#define X86_FEATURE_REP_GOOD   (3*32+16) /* rep microcode works well on this CPU */
+#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
+#define X86_FEATURE_MWAIT      (4*32+ 3) /* Monitor/Mwait support */
+#define X86_FEATURE_DSCPL      (4*32+ 4) /* CPL Qualified Debug Store */
+#define X86_FEATURE_EST                (4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2                (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_CID                (4*32+10) /* Context ID */
+#define X86_FEATURE_CX16       (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR       (4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_DCA                (4*32+18) /* Direct Cache Access */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE     (5*32+ 2) /* on-CPU RNG present (xstore insn) */
+#define X86_FEATURE_XSTORE_EN  (5*32+ 3) /* on-CPU RNG enabled */
+#define X86_FEATURE_XCRYPT     (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
+#define X86_FEATURE_XCRYPT_EN  (5*32+ 7) /* on-CPU crypto enabled */
+#define X86_FEATURE_ACE2       (5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN    (5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE                (5*32+ 10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN     (5*32+ 11) /* PHE enabled */
+#define X86_FEATURE_PMM                (5*32+ 12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN     (5*32+ 13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM    (6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc
+ */
+#define X86_FEATURE_IDA                (7*32+ 0) /* Intel Dynamic Acceleration */
+
+#define cpu_has(c, bit)                                                        \
+       (__builtin_constant_p(bit) &&                                   \
+        ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||     \
+          (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||     \
+          (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||     \
+          (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) ||     \
+          (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||     \
+          (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||     \
+          (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||     \
+          (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) )      \
+         ? 1 :                                                         \
+        test_bit(bit, (unsigned long *)((c)->x86_capability)))
+#define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
+
+#define set_cpu_cap(c, bit)    set_bit(bit, (unsigned long *)((c)->x86_capability))
+#define clear_cpu_cap(c, bit)  clear_bit(bit, (unsigned long *)((c)->x86_capability))
+#define setup_clear_cpu_cap(bit) do { \
+       clear_cpu_cap(&boot_cpu_data, bit);     \
+       set_bit(bit, cleared_cpu_caps);         \
+} while (0)
+#define setup_force_cpu_cap(bit) do { \
+       set_cpu_cap(&boot_cpu_data, bit);       \
+       clear_bit(bit, cleared_cpu_caps);       \
+} while (0)
+
+#define cpu_has_fpu            boot_cpu_has(X86_FEATURE_FPU)
+#define cpu_has_vme            boot_cpu_has(X86_FEATURE_VME)
+#define cpu_has_de             boot_cpu_has(X86_FEATURE_DE)
+#define cpu_has_pse            boot_cpu_has(X86_FEATURE_PSE)
+#define cpu_has_tsc            boot_cpu_has(X86_FEATURE_TSC)
+#define cpu_has_pae            boot_cpu_has(X86_FEATURE_PAE)
+#define cpu_has_pge            boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_apic           boot_cpu_has(X86_FEATURE_APIC)
+#define cpu_has_sep            boot_cpu_has(X86_FEATURE_SEP)
+#define cpu_has_mtrr           boot_cpu_has(X86_FEATURE_MTRR)
+#define cpu_has_mmx            boot_cpu_has(X86_FEATURE_MMX)
+#define cpu_has_fxsr           boot_cpu_has(X86_FEATURE_FXSR)
+#define cpu_has_xmm            boot_cpu_has(X86_FEATURE_XMM)
+#define cpu_has_xmm2           boot_cpu_has(X86_FEATURE_XMM2)
+#define cpu_has_xmm3           boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_ht             boot_cpu_has(X86_FEATURE_HT)
+#define cpu_has_mp             boot_cpu_has(X86_FEATURE_MP)
+#define cpu_has_nx             boot_cpu_has(X86_FEATURE_NX)
+#define cpu_has_k6_mtrr                boot_cpu_has(X86_FEATURE_K6_MTRR)
+#define cpu_has_cyrix_arr      boot_cpu_has(X86_FEATURE_CYRIX_ARR)
+#define cpu_has_centaur_mcr    boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
+#define cpu_has_xstore         boot_cpu_has(X86_FEATURE_XSTORE)
+#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
+#define cpu_has_xcrypt         boot_cpu_has(X86_FEATURE_XCRYPT)
+#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
+#define cpu_has_ace2           boot_cpu_has(X86_FEATURE_ACE2)
+#define cpu_has_ace2_enabled   boot_cpu_has(X86_FEATURE_ACE2_EN)
+#define cpu_has_phe            boot_cpu_has(X86_FEATURE_PHE)
+#define cpu_has_phe_enabled    boot_cpu_has(X86_FEATURE_PHE_EN)
+#define cpu_has_pmm            boot_cpu_has(X86_FEATURE_PMM)
+#define cpu_has_pmm_enabled    boot_cpu_has(X86_FEATURE_PMM_EN)
+#define cpu_has_ds             boot_cpu_has(X86_FEATURE_DS)
+#define cpu_has_pebs           boot_cpu_has(X86_FEATURE_PEBS)
+#define cpu_has_clflush                boot_cpu_has(X86_FEATURE_CLFLSH)
+#define cpu_has_bts            boot_cpu_has(X86_FEATURE_BTS)
+
+#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
+# define cpu_has_invlpg                1
 #else
-# include "cpufeature_64.h"
+# define cpu_has_invlpg                (boot_cpu_data.x86 > 3)
 #endif
+
+#ifdef CONFIG_X86_64
+
+#undef  cpu_has_vme
+#define cpu_has_vme            0
+
+#undef  cpu_has_pae
+#define cpu_has_pae            ___BUG___
+
+#undef  cpu_has_mp
+#define cpu_has_mp             1
+
+#undef  cpu_has_k6_mtrr
+#define cpu_has_k6_mtrr                0
+
+#undef  cpu_has_cyrix_arr
+#define cpu_has_cyrix_arr      0
+
+#undef  cpu_has_centaur_mcr
+#define cpu_has_centaur_mcr    0
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CPUFEATURE_H */
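For orientation, a minimal usage sketch of the feature-test macros defined above; it is not part of this commit, and the init/quirk functions and message strings are made up:

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/cpufeature.h>     /* boot_cpu_has(), setup_clear_cpu_cap() */

/* Hypothetical init hook: pick an SSE2 path when the CPU supports it. */
static int __init example_simd_init(void)
{
        /*
         * boot_cpu_has() collapses to a constant 1 when the bit is covered
         * by REQUIRED_MASKn (see cpu_has() above); otherwise it falls back
         * to test_bit() on boot_cpu_data.x86_capability at run time.
         */
        if (boot_cpu_has(X86_FEATURE_XMM2))
                printk(KERN_INFO "example: using the SSE2 code path\n");
        else
                printk(KERN_INFO "example: using the generic code path\n");
        return 0;
}

/* Hypothetical quirk: mask CLFLUSH early, much as a "noclflush" option would. */
static void __init example_noclflush_quirk(void)
{
        setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
}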
diff --git a/include/asm-x86/cpufeature_32.h b/include/asm-x86/cpufeature_32.h
deleted file mode 100644 (file)
index f17e688..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * cpufeature.h
- *
- * Defines x86 CPU feature bits
- */
-
-#ifndef __ASM_I386_CPUFEATURE_H
-#define __ASM_I386_CPUFEATURE_H
-
-#ifndef __ASSEMBLY__
-#include <linux/bitops.h>
-#endif
-#include <asm/required-features.h>
-
-#define NCAPINTS       8       /* N 32-bit words worth of info */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU                (0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME                (0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE         (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE        (0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
-#define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Architecture */
-#define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR       (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE                (0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA                (0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV       (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
-#define X86_FEATURE_PAT                (0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36      (0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN         (0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLSH     (0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DS         (0*32+21) /* Debug Store */
-#define X86_FEATURE_ACPI       (0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX                (0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR       (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
-                                         /* of FPU context), and CR4.OSFXSR available */
-#define X86_FEATURE_XMM                (0*32+25) /* Streaming SIMD Extensions */
-#define X86_FEATURE_XMM2       (0*32+26) /* Streaming SIMD Extensions-2 */
-#define X86_FEATURE_SELFSNOOP  (0*32+27) /* CPU self snoop */
-#define X86_FEATURE_HT         (0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC                (0*32+29) /* Automatic clock control */
-#define X86_FEATURE_IA64       (0*32+30) /* IA-64 processor */
-
-/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL    (1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP         (1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX         (1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT     (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_RDTSCP     (1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM         (1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT   (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW      (1*32+31) /* 3DNow! */
-
-/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY   (2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN    (2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI       (2*32+ 3) /* LongRun table interface */
-
-/* Other features, Linux-defined mapping, word 3 */
-/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX      (3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR    (3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR  (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR        (3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8         (3*32+ 4) /* Opteron, Athlon64 */
-#define X86_FEATURE_K7         (3*32+ 5) /* Athlon */
-#define X86_FEATURE_P3         (3*32+ 6) /* P3 */
-#define X86_FEATURE_P4         (3*32+ 7) /* P4 */
-#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP         (3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS       (3*32+12)  /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS                (3*32+13)  /* Branch Trace Store */
-/* 14 free */
-#define X86_FEATURE_SYNC_RDTSC (3*32+15)  /* RDTSC synchronizes the CPU */
-#define X86_FEATURE_REP_GOOD   (3*32+16) /* rep microcode works well on this CPU */
-
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
-#define X86_FEATURE_MWAIT      (4*32+ 3) /* Monitor/Mwait support */
-#define X86_FEATURE_DSCPL      (4*32+ 4) /* CPL Qualified Debug Store */
-#define X86_FEATURE_EST                (4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2                (4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_CID                (4*32+10) /* Context ID */
-#define X86_FEATURE_CX16        (4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR       (4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_DCA                (4*32+18) /* Direct Cache Access */
-
-/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE     (5*32+ 2) /* on-CPU RNG present (xstore insn) */
-#define X86_FEATURE_XSTORE_EN  (5*32+ 3) /* on-CPU RNG enabled */
-#define X86_FEATURE_XCRYPT     (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
-#define X86_FEATURE_XCRYPT_EN  (5*32+ 7) /* on-CPU crypto enabled */
-#define X86_FEATURE_ACE2       (5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN    (5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE                (5*32+ 10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN     (5*32+ 11) /* PHE enabled */
-#define X86_FEATURE_PMM                (5*32+ 12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN     (5*32+ 13) /* PMM enabled */
-
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM    (6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
-
-/*
- * Auxiliary flags: Linux defined - For features scattered in various
- * CPUID levels like 0x6, 0xA etc
- */
-#define X86_FEATURE_IDA                (7*32+ 0) /* Intel Dynamic Acceleration */
-
-#define cpu_has(c, bit)                                                        \
-       (__builtin_constant_p(bit) &&                                   \
-        ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) ||     \
-          (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) ||     \
-          (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) ||     \
-          (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) ||     \
-          (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) ||     \
-          (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) ||     \
-          (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) ||     \
-          (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) )      \
-         ? 1 :                                                         \
-         test_bit(bit, (c)->x86_capability))
-#define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
-
-#define cpu_has_fpu            boot_cpu_has(X86_FEATURE_FPU)
-#define cpu_has_vme            boot_cpu_has(X86_FEATURE_VME)
-#define cpu_has_de             boot_cpu_has(X86_FEATURE_DE)
-#define cpu_has_pse            boot_cpu_has(X86_FEATURE_PSE)
-#define cpu_has_tsc            boot_cpu_has(X86_FEATURE_TSC)
-#define cpu_has_pae            boot_cpu_has(X86_FEATURE_PAE)
-#define cpu_has_pge            boot_cpu_has(X86_FEATURE_PGE)
-#define cpu_has_apic           boot_cpu_has(X86_FEATURE_APIC)
-#define cpu_has_sep            boot_cpu_has(X86_FEATURE_SEP)
-#define cpu_has_mtrr           boot_cpu_has(X86_FEATURE_MTRR)
-#define cpu_has_mmx            boot_cpu_has(X86_FEATURE_MMX)
-#define cpu_has_fxsr           boot_cpu_has(X86_FEATURE_FXSR)
-#define cpu_has_xmm            boot_cpu_has(X86_FEATURE_XMM)
-#define cpu_has_xmm2           boot_cpu_has(X86_FEATURE_XMM2)
-#define cpu_has_xmm3           boot_cpu_has(X86_FEATURE_XMM3)
-#define cpu_has_ht             boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_mp             boot_cpu_has(X86_FEATURE_MP)
-#define cpu_has_nx             boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_k6_mtrr                boot_cpu_has(X86_FEATURE_K6_MTRR)
-#define cpu_has_cyrix_arr      boot_cpu_has(X86_FEATURE_CYRIX_ARR)
-#define cpu_has_centaur_mcr    boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
-#define cpu_has_xstore         boot_cpu_has(X86_FEATURE_XSTORE)
-#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
-#define cpu_has_xcrypt         boot_cpu_has(X86_FEATURE_XCRYPT)
-#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
-#define cpu_has_ace2           boot_cpu_has(X86_FEATURE_ACE2)
-#define cpu_has_ace2_enabled   boot_cpu_has(X86_FEATURE_ACE2_EN)
-#define cpu_has_phe            boot_cpu_has(X86_FEATURE_PHE)
-#define cpu_has_phe_enabled    boot_cpu_has(X86_FEATURE_PHE_EN)
-#define cpu_has_pmm            boot_cpu_has(X86_FEATURE_PMM)
-#define cpu_has_pmm_enabled    boot_cpu_has(X86_FEATURE_PMM_EN)
-#define cpu_has_ds             boot_cpu_has(X86_FEATURE_DS)
-#define cpu_has_pebs           boot_cpu_has(X86_FEATURE_PEBS)
-#define cpu_has_clflush                boot_cpu_has(X86_FEATURE_CLFLSH)
-#define cpu_has_bts            boot_cpu_has(X86_FEATURE_BTS)
-
-#endif /* __ASM_I386_CPUFEATURE_H */
-
-/* 
- * Local Variables:
- * mode:c
- * comment-column:42
- * End:
- */
diff --git a/include/asm-x86/cpufeature_64.h b/include/asm-x86/cpufeature_64.h
deleted file mode 100644 (file)
index e18496b..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * cpufeature_32.h
- *
- * Defines x86 CPU feature bits
- */
-
-#ifndef __ASM_X8664_CPUFEATURE_H
-#define __ASM_X8664_CPUFEATURE_H
-
-#include "cpufeature_32.h"
-
-#undef  cpu_has_vme
-#define cpu_has_vme            0
-
-#undef  cpu_has_pae
-#define cpu_has_pae            ___BUG___
-
-#undef  cpu_has_mp
-#define cpu_has_mp             1 /* XXX */
-
-#undef  cpu_has_k6_mtrr
-#define cpu_has_k6_mtrr        0
-
-#undef  cpu_has_cyrix_arr
-#define cpu_has_cyrix_arr      0
-
-#undef  cpu_has_centaur_mcr
-#define cpu_has_centaur_mcr    0
-
-#endif /* __ASM_X8664_CPUFEATURE_H */
index 6065c5092265b6d42602e4f431fffa2b931e0e8b..5b6a05d3a7712aa7b29b26a37aff2c843ba78819 100644 (file)
@@ -1,5 +1,381 @@
+#ifndef _ASM_DESC_H_
+#define _ASM_DESC_H_
+
+#ifndef __ASSEMBLY__
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc,
+                           const struct user_desc *info)
+{
+       desc->limit0 = info->limit & 0x0ffff;
+       desc->base0 = info->base_addr & 0x0000ffff;
+
+       desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+       desc->type = (info->read_exec_only ^ 1) << 1;
+       desc->type |= info->contents << 2;
+       desc->s = 1;
+       desc->dpl = 0x3;
+       desc->p = info->seg_not_present ^ 1;
+       desc->limit = (info->limit & 0xf0000) >> 16;
+       desc->avl = info->useable;
+       desc->d = info->seg_32bit;
+       desc->g = info->limit_in_pages;
+       desc->base2 = (info->base_addr & 0xff000000) >> 24;
+}
+
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+
+#ifdef CONFIG_X86_64
+extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
+extern struct desc_ptr cpu_gdt_descr[];
+/* the cpu gdt accessor */
+#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address)
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+                            unsigned dpl, unsigned ist, unsigned seg)
+{
+       gate->offset_low = PTR_LOW(func);
+       gate->segment = __KERNEL_CS;
+       gate->ist = ist;
+       gate->p = 1;
+       gate->dpl = dpl;
+       gate->zero0 = 0;
+       gate->zero1 = 0;
+       gate->type = type;
+       gate->offset_middle = PTR_MIDDLE(func);
+       gate->offset_high = PTR_HIGH(func);
+}
+
+#else
+struct gdt_page {
+       struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+       return per_cpu(gdt_page, cpu).gdt;
+}
+
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+       unsigned long base, unsigned dpl, unsigned flags, unsigned short seg)
+
+{
+       gate->a = (seg << 16) | (base & 0xffff);
+       gate->b = (base & 0xffff0000) |
+                 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+       const u32 *desc = ptr;
+       return !(desc[0] | desc[1]);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) \
+                               native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) \
+                               native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
+#endif
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry,
+                                         const gate_desc *gate)
+{
+       memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
+                                         const void *desc)
+{
+       memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
+                                         const void *desc, int type)
+{
+       unsigned int size;
+       switch (type) {
+       case DESC_TSS:
+               size = sizeof(tss_desc);
+               break;
+       case DESC_LDT:
+               size = sizeof(ldt_desc);
+               break;
+       default:
+               size = sizeof(struct desc_struct);
+               break;
+       }
+       memcpy(&gdt[entry], desc, size);
+}
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+                                  unsigned long limit, unsigned char type,
+                                  unsigned char flags)
+{
+       desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+       desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+                 (limit & 0x000f0000) | ((type & 0xff) << 8) |
+                 ((flags & 0xf) << 20);
+       desc->p = 1;
+}
+
+
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+                                        unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+       struct ldttss_desc64 *desc = d;
+       memset(desc, 0, sizeof(*desc));
+       desc->limit0 = size & 0xFFFF;
+       desc->base0 = PTR_LOW(addr);
+       desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+       desc->type = type;
+       desc->p = 1;
+       desc->limit1 = (size >> 16) & 0xF;
+       desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+       desc->base3 = PTR_HIGH(addr);
+#else
+
+       pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+       struct desc_struct *d = get_cpu_gdt_table(cpu);
+       tss_desc tss;
+
+       /*
+        * sizeof(unsigned long) coming from an extra "long" at the end
+        * of the iobitmap. See tss_struct definition in processor.h
+        *
+        * -1? seg base+limit should be pointing to the address of the
+        * last valid byte
+        */
+       set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
+       write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+       if (likely(entries == 0))
+               __asm__ __volatile__("lldt %w0"::"q" (0));
+       else {
+               unsigned cpu = smp_processor_id();
+               ldt_desc ldt;
+
+               set_tssldt_descriptor(&ldt, (unsigned long)addr,
+                                     DESC_LDT, entries * sizeof(ldt) - 1);
+               write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+                               &ldt, DESC_LDT);
+               __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+       }
+}
+
+static inline void native_load_tr_desc(void)
+{
+       asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+       asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+       asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+       asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+       asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+       unsigned long tr;
+       asm volatile("str %0":"=r" (tr));
+       return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       unsigned int i;
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+               gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+
+#define _LDT_empty(info) (\
+       (info)->base_addr       == 0    && \
+       (info)->limit           == 0    && \
+       (info)->contents        == 0    && \
+       (info)->read_exec_only  == 1    && \
+       (info)->seg_32bit       == 0    && \
+       (info)->limit_in_pages  == 0    && \
+       (info)->seg_not_present == 1    && \
+       (info)->useable         == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+       set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+       set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+       preempt_disable();
+       load_LDT_nolock(pc);
+       preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+       return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+       return desc->limit0 | (desc->limit << 16);
+}
+
+static inline void _set_gate(int gate, unsigned type, void *addr,
+                             unsigned dpl, unsigned ist, unsigned seg)
+{
+       gate_desc s;
+       pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+       /*
+        * does not need to be atomic because it is only done once at
+        * setup time
+        */
+       write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at directory privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_system_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
 #ifdef CONFIG_X86_32
-# include "desc_32.h"
+       _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+#else
+       _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+#endif
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_gate_ist(int n, void *addr, unsigned ist)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+
 #else
-# include "desc_64.h"
+/*
+ * GET_DESC_BASE reads the descriptor base of the specified segment.
+ *
+ * Args:
+ *    idx - descriptor index
+ *    gdt - GDT pointer
+ *    base - 32bit register to which the base will be written
+ *    lo_w - lo word of the "base" register
+ *    lo_b - lo byte of the "base" register
+ *    hi_b - hi byte of the low word of the "base" register
+ *
+ * Example:
+ *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
+ *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
+ */
+#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
+       movb idx*8+4(gdt), lo_b; \
+       movb idx*8+7(gdt), hi_b; \
+       shll $16, base; \
+       movw idx*8+2(gdt), lo_w;
+
+
+#endif /* __ASSEMBLY__ */
+
 #endif
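As a rough illustration of how the unified gate helpers above are meant to be called, here is a sketch that is not taken from the patch; the entry-point symbols are hypothetical:

#include <linux/init.h>
#include <asm/desc.h>           /* set_intr_gate(), set_system_gate() */

/* Hypothetical assembly entry points, normally provided by entry code. */
extern void example_divide_error(void);
extern void example_syscall_entry(void);

static void __init example_trap_init(void)
{
        /* DPL-0 interrupt gate: only the kernel may vector here. */
        set_intr_gate(0, example_divide_error);

        /*
         * DPL-3 gate so user space can raise "int $0x80"; per
         * set_system_gate() above this is a trap gate on 32-bit and an
         * interrupt gate on 64-bit.
         */
        set_system_gate(0x80, example_syscall_entry);
}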
diff --git a/include/asm-x86/desc_32.h b/include/asm-x86/desc_32.h
deleted file mode 100644 (file)
index c547403..0000000
+++ /dev/null
@@ -1,244 +0,0 @@
-#ifndef __ARCH_DESC_H
-#define __ARCH_DESC_H
-
-#include <asm/ldt.h>
-#include <asm/segment.h>
-
-#ifndef __ASSEMBLY__
-
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <linux/percpu.h>
-
-#include <asm/mmu.h>
-
-struct Xgt_desc_struct {
-       unsigned short size;
-       unsigned long address __attribute__((packed));
-       unsigned short pad;
-} __attribute__ ((packed));
-
-struct gdt_page
-{
-       struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU(struct gdt_page, gdt_page);
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
-       return per_cpu(gdt_page, cpu).gdt;
-}
-
-extern struct Xgt_desc_struct idt_descr;
-extern struct desc_struct idt_table[];
-extern void set_intr_gate(unsigned int irq, void * addr);
-
-static inline void pack_descriptor(__u32 *a, __u32 *b,
-       unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
-{
-       *a = ((base & 0xffff) << 16) | (limit & 0xffff);
-       *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
-               (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
-}
-
-static inline void pack_gate(__u32 *a, __u32 *b,
-       unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
-{
-       *a = (seg << 16) | (base & 0xffff);
-       *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
-}
-
-#define DESCTYPE_LDT   0x82    /* present, system, DPL-0, LDT */
-#define DESCTYPE_TSS   0x89    /* present, system, DPL-0, 32-bit TSS */
-#define DESCTYPE_TASK  0x85    /* present, system, DPL-0, task gate */
-#define DESCTYPE_INT   0x8e    /* present, system, DPL-0, interrupt gate */
-#define DESCTYPE_TRAP  0x8f    /* present, system, DPL-0, trap gate */
-#define DESCTYPE_DPL3  0x60    /* DPL-3 */
-#define DESCTYPE_S     0x10    /* !system */
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
-#endif
-
-static inline void write_dt_entry(struct desc_struct *dt,
-                                 int entry, u32 entry_low, u32 entry_high)
-{
-       dt[entry].a = entry_low;
-       dt[entry].b = entry_high;
-}
-
-static inline void native_set_ldt(const void *addr, unsigned int entries)
-{
-       if (likely(entries == 0))
-               __asm__ __volatile__("lldt %w0"::"q" (0));
-       else {
-               unsigned cpu = smp_processor_id();
-               __u32 a, b;
-
-               pack_descriptor(&a, &b, (unsigned long)addr,
-                               entries * sizeof(struct desc_struct) - 1,
-                               DESCTYPE_LDT, 0);
-               write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
-               __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
-       }
-}
-
-
-static inline void native_load_tr_desc(void)
-{
-       asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
-}
-
-static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
-{
-       asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
-{
-       asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
-{
-       asm ("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct Xgt_desc_struct *dtr)
-{
-       asm ("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
-       unsigned long tr;
-       asm ("str %0":"=r" (tr));
-       return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-       unsigned int i;
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-
-       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-               gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-
-static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
-{
-       __u32 a, b;
-       pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
-       write_idt_entry(idt_table, gate, a, b);
-}
-
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
-{
-       __u32 a, b;
-       pack_descriptor(&a, &b, (unsigned long)addr,
-                       offsetof(struct tss_struct, __cacheline_filler) - 1,
-                       DESCTYPE_TSS, 0);
-       write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
-}
-
-
-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
-
-#define LDT_entry_a(info) \
-       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
-
-#define LDT_entry_b(info) \
-       (((info)->base_addr & 0xff000000) | \
-       (((info)->base_addr & 0x00ff0000) >> 16) | \
-       ((info)->limit & 0xf0000) | \
-       (((info)->read_exec_only ^ 1) << 9) | \
-       ((info)->contents << 10) | \
-       (((info)->seg_not_present ^ 1) << 15) | \
-       ((info)->seg_32bit << 22) | \
-       ((info)->limit_in_pages << 23) | \
-       ((info)->useable << 20) | \
-       0x7000)
-
-#define LDT_empty(info) (\
-       (info)->base_addr       == 0    && \
-       (info)->limit           == 0    && \
-       (info)->contents        == 0    && \
-       (info)->read_exec_only  == 1    && \
-       (info)->seg_32bit       == 0    && \
-       (info)->limit_in_pages  == 0    && \
-       (info)->seg_not_present == 1    && \
-       (info)->useable         == 0    )
-
-static inline void clear_LDT(void)
-{
-       set_ldt(NULL, 0);
-}
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
-       set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-       preempt_disable();
-       load_LDT_nolock(pc);
-       preempt_enable();
-}
-
-static inline unsigned long get_desc_base(unsigned long *desc)
-{
-       unsigned long base;
-       base = ((desc[0] >> 16)  & 0x0000ffff) |
-               ((desc[1] << 16) & 0x00ff0000) |
-               (desc[1] & 0xff000000);
-       return base;
-}
-
-#else /* __ASSEMBLY__ */
-
-/*
- * GET_DESC_BASE reads the descriptor base of the specified segment.
- *
- * Args:
- *    idx - descriptor index
- *    gdt - GDT pointer
- *    base - 32bit register to which the base will be written
- *    lo_w - lo word of the "base" register
- *    lo_b - lo byte of the "base" register
- *    hi_b - hi byte of the low word of the "base" register
- *
- * Example:
- *    GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- *    Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
-       movb idx*8+4(gdt), lo_b; \
-       movb idx*8+7(gdt), hi_b; \
-       shll $16, base; \
-       movw idx*8+2(gdt), lo_w;
-
-#endif /* !__ASSEMBLY__ */
-
-#endif
index 7d9c938e69fd296d1e493565fc2e4dca0265b291..8b137891791fe96927ad78e64b0aad7bded08bdc 100644 (file)
@@ -1,204 +1 @@
-/* Written 2000 by Andi Kleen */ 
-#ifndef __ARCH_DESC_H
-#define __ARCH_DESC_H
 
-#include <linux/threads.h>
-#include <asm/ldt.h>
-
-#ifndef __ASSEMBLY__
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <asm/desc_defs.h>
-
-#include <asm/segment.h>
-#include <asm/mmu.h>
-
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-
-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
-#define clear_LDT()  asm volatile("lldt %w0"::"r" (0))
-
-static inline unsigned long __store_tr(void)
-{
-       unsigned long tr;
-
-       asm volatile ("str %w0":"=r" (tr));
-       return tr;
-}
-
-#define store_tr(tr) (tr) = __store_tr()
-
-/*
- * This is the ldt that every process will get unless we need
- * something other than this.
- */
-extern struct desc_struct default_ldt[];
-extern struct gate_struct idt_table[]; 
-extern struct desc_ptr cpu_gdt_descr[];
-
-/* the cpu gdt accessor */
-#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
-
-static inline void load_gdt(const struct desc_ptr *ptr)
-{
-       asm volatile("lgdt %w0"::"m" (*ptr));
-}
-
-static inline void store_gdt(struct desc_ptr *ptr)
-{
-       asm("sgdt %w0":"=m" (*ptr));
-}
-
-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)  
-{
-       struct gate_struct s;   
-       s.offset_low = PTR_LOW(func); 
-       s.segment = __KERNEL_CS;
-       s.ist = ist; 
-       s.p = 1;
-       s.dpl = dpl; 
-       s.zero0 = 0;
-       s.zero1 = 0; 
-       s.type = type; 
-       s.offset_middle = PTR_MIDDLE(func); 
-       s.offset_high = PTR_HIGH(func); 
-       /* does not need to be atomic because it is only done once at setup time */ 
-       memcpy(adr, &s, 16); 
-} 
-
-static inline void set_intr_gate(int nr, void *func) 
-{ 
-       BUG_ON((unsigned)nr > 0xFF);
-       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
-} 
-
-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
-{ 
-       BUG_ON((unsigned)nr > 0xFF);
-       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
-} 
-
-static inline void set_system_gate(int nr, void *func) 
-{ 
-       BUG_ON((unsigned)nr > 0xFF);
-       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
-} 
-
-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
-{
-       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
-}
-
-static inline void load_idt(const struct desc_ptr *ptr)
-{
-       asm volatile("lidt %w0"::"m" (*ptr));
-}
-
-static inline void store_idt(struct desc_ptr *dtr)
-{
-       asm("sidt %w0":"=m" (*dtr));
-}
-
-static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, 
-                                        unsigned size) 
-{ 
-       struct ldttss_desc d;
-       memset(&d,0,sizeof(d)); 
-       d.limit0 = size & 0xFFFF;
-       d.base0 = PTR_LOW(tss); 
-       d.base1 = PTR_MIDDLE(tss) & 0xFF; 
-       d.type = type;
-       d.p = 1; 
-       d.limit1 = (size >> 16) & 0xF;
-       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; 
-       d.base3 = PTR_HIGH(tss); 
-       memcpy(ptr, &d, 16); 
-}
-
-static inline void set_tss_desc(unsigned cpu, void *addr)
-{ 
-       /*
-        * sizeof(unsigned long) coming from an extra "long" at the end
-        * of the iobitmap. See tss_struct definition in processor.h
-        *
-        * -1? seg base+limit should be pointing to the address of the
-        * last valid byte
-        */
-       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS],
-               (unsigned long)addr, DESC_TSS,
-               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
-} 
-
-static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
-{ 
-       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
-                             DESC_LDT, size * 8 - 1);
-}
-
-#define LDT_entry_a(info) \
-       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
-/* Don't allow setting of the lm bit. It is useless anyways because 
-   64bit system calls require __USER_CS. */ 
-#define LDT_entry_b(info) \
-       (((info)->base_addr & 0xff000000) | \
-       (((info)->base_addr & 0x00ff0000) >> 16) | \
-       ((info)->limit & 0xf0000) | \
-       (((info)->read_exec_only ^ 1) << 9) | \
-       ((info)->contents << 10) | \
-       (((info)->seg_not_present ^ 1) << 15) | \
-       ((info)->seg_32bit << 22) | \
-       ((info)->limit_in_pages << 23) | \
-       ((info)->useable << 20) | \
-       /* ((info)->lm << 21) | */ \
-       0x7000)
-
-#define LDT_empty(info) (\
-       (info)->base_addr       == 0    && \
-       (info)->limit           == 0    && \
-       (info)->contents        == 0    && \
-       (info)->read_exec_only  == 1    && \
-       (info)->seg_32bit       == 0    && \
-       (info)->limit_in_pages  == 0    && \
-       (info)->seg_not_present == 1    && \
-       (info)->useable         == 0    && \
-       (info)->lm              == 0)
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-       unsigned int i;
-       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
-
-       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
-               gdt[i] = t->tls_array[i];
-} 
-
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
-{
-       int count = pc->size;
-
-       if (likely(!count)) {
-               clear_LDT();
-               return;
-       }
-               
-       set_ldt_desc(cpu, pc->ldt, count);
-       load_LDT_desc();
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
-       int cpu = get_cpu();
-       load_LDT_nolock(pc, cpu);
-       put_cpu();
-}
-
-extern struct desc_ptr idt_descr;
-
-#endif /* !__ASSEMBLY__ */
-
-#endif
index 089004070099200c53ce9550ebd945b7ea9f9931..e33f078b3e54201d4b4b801e0519b4995a2dac5a 100644 (file)
 
 #include <linux/types.h>
 
+/*
+ * FIXME: Accessing the desc_struct through its fields is more elegant,
+ * and should be the one valid thing to do. However, a lot of open code
+ * still touches the a and b accessors, and doing this allows us to do it
+ * incrementally. We keep the signature as a struct, rather than a union,
+ * so we can get rid of it transparently in the future -- glommer
+ */
 // 8 byte segment descriptor
 struct desc_struct {
-       u16 limit0;
-       u16 base0;
-       unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
-       unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
-} __attribute__((packed));
+       union {
+               struct { unsigned int a, b; };
+               struct {
+                       u16 limit0;
+                       u16 base0;
+                       unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+                       unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+               };
 
-struct n_desc_struct {
-       unsigned int a,b;
-};
+       };
+} __attribute__((packed));
 
 enum {
        GATE_INTERRUPT = 0xE,
        GATE_TRAP = 0xF,
        GATE_CALL = 0xC,
+       GATE_TASK = 0x5,
 };
 
 // 16byte gate
-struct gate_struct {
+struct gate_struct64 {
        u16 offset_low;
        u16 segment;
        unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
@@ -39,17 +49,18 @@ struct gate_struct {
        u32 zero1;
 } __attribute__((packed));
 
-#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
-#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
-#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
+#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
+#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
+#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
 
 enum {
        DESC_TSS = 0x9,
        DESC_LDT = 0x2,
+       DESCTYPE_S =    0x10,   /* !system */
 };
 
 // LDT or TSS descriptor in the GDT. 16 bytes.
-struct ldttss_desc {
+struct ldttss_desc64 {
        u16 limit0;
        u16 base0;
        unsigned base1 : 8, type : 5, dpl : 2, p : 1;
@@ -58,6 +69,16 @@ struct ldttss_desc {
        u32 zero1;
 } __attribute__((packed));
 
+#ifdef CONFIG_X86_64
+typedef struct gate_struct64 gate_desc;
+typedef struct ldttss_desc64 ldt_desc;
+typedef struct ldttss_desc64 tss_desc;
+#else
+typedef struct desc_struct gate_desc;
+typedef struct desc_struct ldt_desc;
+typedef struct desc_struct tss_desc;
+#endif
+
 struct desc_ptr {
        unsigned short size;
        unsigned long address;
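To show why desc_struct now wraps the legacy a/b words and the named bitfields in one union, here is a small sketch that is not part of the commit; the dump helper is hypothetical, and both views address the same 8 bytes:

#include <linux/kernel.h>
#include <asm/desc_defs.h>

/* Hypothetical debug helper: print a descriptor through both views. */
static void example_dump_desc(const struct desc_struct *d)
{
        /* Legacy open-coded view still used by older call sites. */
        unsigned int lo = d->a, hi = d->b;

        /* Structured view made possible by the union above. */
        unsigned long base = d->base0 | (d->base1 << 16) | (d->base2 << 24);

        printk(KERN_DEBUG "desc: raw %08x:%08x base %lx\n", lo, hi, base);
}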
index 9f936c61a4e50dbcf9d2dce8d12d3cb1cbc58660..e9733ce8988084688a1724a4bd0b47fd76072518 100644 (file)
@@ -1,5 +1,319 @@
+/*
+ * linux/include/asm/dma.h: Defines for using and allocating dma channels.
+ * Written by Hennus Bergman, 1992.
+ * High DMA channel support & info by Hannu Savolainen
+ * and John Boyd, Nov. 1992.
+ */
+
+#ifndef _ASM_X86_DMA_H
+#define _ASM_X86_DMA_H
+
+#include <linux/spinlock.h>    /* And spinlocks */
+#include <asm/io.h>            /* need byte IO */
+#include <linux/delay.h>
+
+
+#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
+#define dma_outb       outb_p
+#else
+#define dma_outb       outb
+#endif
+
+#define dma_inb                inb
+
+/*
+ * NOTES about DMA transfers:
+ *
+ *  controller 1: channels 0-3, byte operations, ports 00-1F
+ *  controller 2: channels 4-7, word operations, ports C0-DF
+ *
+ *  - ALL registers are 8 bits only, regardless of transfer size
+ *  - channel 4 is not used - cascades 1 into 2.
+ *  - channels 0-3 are byte - addresses/counts are for physical bytes
+ *  - channels 5-7 are word - addresses/counts are for physical words
+ *  - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
+ *  - transfer count loaded to registers is 1 less than actual count
+ *  - controller 2 offsets are all even (2x offsets for controller 1)
+ *  - page registers for 5-7 don't use data bit 0, represent 128K pages
+ *  - page registers for 0-3 use bit 0, represent 64K pages
+ *
+ * DMA transfers are limited to the lower 16MB of _physical_ memory.
+ * Note that addresses loaded into registers must be _physical_ addresses,
+ * not logical addresses (which may differ if paging is active).
+ *
+ *  Address mapping for channels 0-3:
+ *
+ *   A23 ... A16 A15 ... A8  A7 ... A0    (Physical addresses)
+ *    |  ...  |   |  ... |   |  ... |
+ *    |  ...  |   |  ... |   |  ... |
+ *    |  ...  |   |  ... |   |  ... |
+ *   P7  ...  P0  A7 ... A0  A7 ... A0
+ * |    Page    | Addr MSB | Addr LSB |   (DMA registers)
+ *
+ *  Address mapping for channels 5-7:
+ *
+ *   A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0    (Physical addresses)
+ *    |  ...  |   \   \   ... \  \  \  ... \  \
+ *    |  ...  |    \   \   ... \  \  \  ... \  (not used)
+ *    |  ...  |     \   \   ... \  \  \  ... \
+ *   P7  ...  P1 (0) A7 A6  ... A0 A7 A6 ... A0
+ * |      Page      |  Addr MSB   |  Addr LSB  |   (DMA registers)
+ *
+ * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
+ * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
+ * the hardware level, so odd-byte transfers aren't possible).
+ *
+ * Transfer count (_not # bytes_) is limited to 64K, represented as actual
+ * count - 1 : 64K => 0xFFFF, 1 => 0x0000.  Thus, count is always 1 or more,
+ * and up to 128K bytes may be transferred on channels 5-7 in one operation.
+ *
+ */
+
+#define MAX_DMA_CHANNELS       8
+
 #ifdef CONFIG_X86_32
-# include "dma_32.h"
+
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS      (PAGE_OFFSET+0x1000000)
+
+#else
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN   ((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
+
+#endif
+
+/* 8237 DMA controllers */
+#define IO_DMA1_BASE   0x00    /* 8 bit slave DMA, channels 0..3 */
+#define IO_DMA2_BASE   0xC0    /* 16 bit master DMA, ch 4(=slave input)..7 */
+
+/* DMA controller registers */
+#define DMA1_CMD_REG           0x08    /* command register (w) */
+#define DMA1_STAT_REG          0x08    /* status register (r) */
+#define DMA1_REQ_REG           0x09    /* request register (w) */
+#define DMA1_MASK_REG          0x0A    /* single-channel mask (w) */
+#define DMA1_MODE_REG          0x0B    /* mode register (w) */
+#define DMA1_CLEAR_FF_REG      0x0C    /* clear pointer flip-flop (w) */
+#define DMA1_TEMP_REG          0x0D    /* Temporary Register (r) */
+#define DMA1_RESET_REG         0x0D    /* Master Clear (w) */
+#define DMA1_CLR_MASK_REG       0x0E    /* Clear Mask */
+#define DMA1_MASK_ALL_REG       0x0F    /* all-channels mask (w) */
+
+#define DMA2_CMD_REG           0xD0    /* command register (w) */
+#define DMA2_STAT_REG          0xD0    /* status register (r) */
+#define DMA2_REQ_REG           0xD2    /* request register (w) */
+#define DMA2_MASK_REG          0xD4    /* single-channel mask (w) */
+#define DMA2_MODE_REG          0xD6    /* mode register (w) */
+#define DMA2_CLEAR_FF_REG      0xD8    /* clear pointer flip-flop (w) */
+#define DMA2_TEMP_REG          0xDA    /* Temporary Register (r) */
+#define DMA2_RESET_REG         0xDA    /* Master Clear (w) */
+#define DMA2_CLR_MASK_REG       0xDC    /* Clear Mask */
+#define DMA2_MASK_ALL_REG       0xDE    /* all-channels mask (w) */
+
+#define DMA_ADDR_0             0x00    /* DMA address registers */
+#define DMA_ADDR_1             0x02
+#define DMA_ADDR_2             0x04
+#define DMA_ADDR_3             0x06
+#define DMA_ADDR_4             0xC0
+#define DMA_ADDR_5             0xC4
+#define DMA_ADDR_6             0xC8
+#define DMA_ADDR_7             0xCC
+
+#define DMA_CNT_0              0x01    /* DMA count registers */
+#define DMA_CNT_1              0x03
+#define DMA_CNT_2              0x05
+#define DMA_CNT_3              0x07
+#define DMA_CNT_4              0xC2
+#define DMA_CNT_5              0xC6
+#define DMA_CNT_6              0xCA
+#define DMA_CNT_7              0xCE
+
+#define DMA_PAGE_0             0x87    /* DMA page registers */
+#define DMA_PAGE_1             0x83
+#define DMA_PAGE_2             0x81
+#define DMA_PAGE_3             0x82
+#define DMA_PAGE_5             0x8B
+#define DMA_PAGE_6             0x89
+#define DMA_PAGE_7             0x8A
+
+/* I/O to memory, no autoinit, increment, single mode */
+#define DMA_MODE_READ          0x44
+/* memory to I/O, no autoinit, increment, single mode */
+#define DMA_MODE_WRITE         0x48
+/* pass thru DREQ->HRQ, DACK<-HLDA only */
+#define DMA_MODE_CASCADE       0xC0
+
+#define DMA_AUTOINIT           0x10
+
+
+extern spinlock_t  dma_spin_lock;
+
+static __inline__ unsigned long claim_dma_lock(void)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&dma_spin_lock, flags);
+       return flags;
+}
+
+static __inline__ void release_dma_lock(unsigned long flags)
+{
+       spin_unlock_irqrestore(&dma_spin_lock, flags);
+}
+
+/* enable/disable a specific DMA channel */
+static __inline__ void enable_dma(unsigned int dmanr)
+{
+       if (dmanr <= 3)
+               dma_outb(dmanr, DMA1_MASK_REG);
+       else
+               dma_outb(dmanr & 3, DMA2_MASK_REG);
+}
+
+static __inline__ void disable_dma(unsigned int dmanr)
+{
+       if (dmanr <= 3)
+               dma_outb(dmanr | 4, DMA1_MASK_REG);
+       else
+               dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
+}
+
+/* Clear the 'DMA Pointer Flip Flop'.
+ * Write 0 for LSB/MSB, 1 for MSB/LSB access.
+ * Use this once to initialize the FF to a known state.
+ * After that, keep track of it. :-)
+ * --- In order to do that, the DMA routines below should ---
+ * --- only be used while holding the DMA lock ! ---
+ */
+static __inline__ void clear_dma_ff(unsigned int dmanr)
+{
+       if (dmanr <= 3)
+               dma_outb(0, DMA1_CLEAR_FF_REG);
+       else
+               dma_outb(0, DMA2_CLEAR_FF_REG);
+}
+
+/* set mode (above) for a specific DMA channel */
+static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
+{
+       if (dmanr <= 3)
+               dma_outb(mode | dmanr, DMA1_MODE_REG);
+       else
+               dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
+}
+
+/* Set only the page register bits of the transfer address.
+ * This is used for successive transfers when we know the contents of
+ * the lower 16 bits of the DMA current address register, but a 64k boundary
+ * may have been crossed.
+ */
+static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
+{
+       switch (dmanr) {
+       case 0:
+               dma_outb(pagenr, DMA_PAGE_0);
+               break;
+       case 1:
+               dma_outb(pagenr, DMA_PAGE_1);
+               break;
+       case 2:
+               dma_outb(pagenr, DMA_PAGE_2);
+               break;
+       case 3:
+               dma_outb(pagenr, DMA_PAGE_3);
+               break;
+       case 5:
+               dma_outb(pagenr & 0xfe, DMA_PAGE_5);
+               break;
+       case 6:
+               dma_outb(pagenr & 0xfe, DMA_PAGE_6);
+               break;
+       case 7:
+               dma_outb(pagenr & 0xfe, DMA_PAGE_7);
+               break;
+       }
+}
+
+
+/* Set transfer address & page bits for specific DMA channel.
+ * Assumes dma flipflop is clear.
+ */
+static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+       set_dma_page(dmanr, a>>16);
+       if (dmanr <= 3)  {
+               dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+               dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+       }  else  {
+           dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+           dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+       }
+}
+
+
+/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
+ * a specific DMA channel.
+ * You must ensure the parameters are valid.
+ * NOTE: from a manual: "the number of transfers is one more
+ * than the initial word count"! This is taken into account.
+ * Assumes dma flip-flop is clear.
+ * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
+ */
+static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
+{
+       count--;
+       if (dmanr <= 3)  {
+           dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+           dma_outb((count >> 8) & 0xff,
+                    ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+       } else {
+           dma_outb((count >> 1) & 0xff,
+                    ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+           dma_outb((count >> 9) & 0xff,
+                    ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+       }
+}
+
+
+/* Get DMA residue count. After a DMA transfer, this
+ * should return zero. Reading this while a DMA transfer is
+ * still in progress will return unpredictable results.
+ * If called before the channel has been used, it may return 1.
+ * Otherwise, it returns the number of _bytes_ left to transfer.
+ *
+ * Assumes DMA flip-flop is clear.
+ */
+static __inline__ int get_dma_residue(unsigned int dmanr)
+{
+       unsigned int io_port;
+       /* using short to get 16-bit wrap around */
+       unsigned short count;
+
+       io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
+               : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
+
+       count = 1 + dma_inb(io_port);
+       count += dma_inb(io_port) << 8;
+
+       return (dmanr <= 3) ? count : (count << 1);
+}
+
+
+/* These are in kernel/dma.c: */
+extern int request_dma(unsigned int dmanr, const char *device_id);
+extern void free_dma(unsigned int dmanr);
+
+/* From PCI */
+
+#ifdef CONFIG_PCI
+extern int isa_dma_bridge_buggy;
 #else
-# include "dma_64.h"
+#define isa_dma_bridge_buggy   (0)
 #endif
+
+#endif /* _ASM_X86_DMA_H */
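For context, the helpers above are normally used in the canonical ISA DMA sequence sketched below; this is not from the commit, and the channel number, buffer and length are invented. A real driver would also call request_dma() first and keep the buffer below MAX_DMA_ADDRESS without crossing a 64K boundary:

#include <asm/dma.h>

/* Hypothetical example: program channel 1 for one device-to-memory transfer. */
static void example_start_isa_dma(unsigned int buf_phys, unsigned int len)
{
        unsigned long flags = claim_dma_lock();

        disable_dma(1);
        clear_dma_ff(1);                 /* reset the address/count flip-flop */
        set_dma_mode(1, DMA_MODE_READ);  /* I/O to memory, single, increment */
        set_dma_addr(1, buf_phys);       /* physical address, page + offset */
        set_dma_count(1, len);           /* bytes for channels 0-3 */
        enable_dma(1);

        release_dma_lock(flags);
}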
diff --git a/include/asm-x86/dma_32.h b/include/asm-x86/dma_32.h
deleted file mode 100644 (file)
index d23aac8..0000000
+++ /dev/null
@@ -1,297 +0,0 @@
-/* $Id: dma.h,v 1.7 1992/12/14 00:29:34 root Exp root $
- * linux/include/asm/dma.h: Defines for using and allocating dma channels.
- * Written by Hennus Bergman, 1992.
- * High DMA channel support & info by Hannu Savolainen
- * and John Boyd, Nov. 1992.
- */
-
-#ifndef _ASM_DMA_H
-#define _ASM_DMA_H
-
-#include <linux/spinlock.h>    /* And spinlocks */
-#include <asm/io.h>            /* need byte IO */
-#include <linux/delay.h>
-
-
-#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
-#define dma_outb       outb_p
-#else
-#define dma_outb       outb
-#endif
-
-#define dma_inb                inb
-
-/*
- * NOTES about DMA transfers:
- *
- *  controller 1: channels 0-3, byte operations, ports 00-1F
- *  controller 2: channels 4-7, word operations, ports C0-DF
- *
- *  - ALL registers are 8 bits only, regardless of transfer size
- *  - channel 4 is not used - cascades 1 into 2.
- *  - channels 0-3 are byte - addresses/counts are for physical bytes
- *  - channels 5-7 are word - addresses/counts are for physical words
- *  - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
- *  - transfer count loaded to registers is 1 less than actual count
- *  - controller 2 offsets are all even (2x offsets for controller 1)
- *  - page registers for 5-7 don't use data bit 0, represent 128K pages
- *  - page registers for 0-3 use bit 0, represent 64K pages
- *
- * DMA transfers are limited to the lower 16MB of _physical_ memory.  
- * Note that addresses loaded into registers must be _physical_ addresses,
- * not logical addresses (which may differ if paging is active).
- *
- *  Address mapping for channels 0-3:
- *
- *   A23 ... A16 A15 ... A8  A7 ... A0    (Physical addresses)
- *    |  ...  |   |  ... |   |  ... |
- *    |  ...  |   |  ... |   |  ... |
- *    |  ...  |   |  ... |   |  ... |
- *   P7  ...  P0  A7 ... A0  A7 ... A0   
- * |    Page    | Addr MSB | Addr LSB |   (DMA registers)
- *
- *  Address mapping for channels 5-7:
- *
- *   A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0    (Physical addresses)
- *    |  ...  |   \   \   ... \  \  \  ... \  \
- *    |  ...  |    \   \   ... \  \  \  ... \  (not used)
- *    |  ...  |     \   \   ... \  \  \  ... \
- *   P7  ...  P1 (0) A7 A6  ... A0 A7 A6 ... A0   
- * |      Page      |  Addr MSB   |  Addr LSB  |   (DMA registers)
- *
- * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
- * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
- * the hardware level, so odd-byte transfers aren't possible).
- *
- * Transfer count (_not # bytes_) is limited to 64K, represented as actual
- * count - 1 : 64K => 0xFFFF, 1 => 0x0000.  Thus, count is always 1 or more,
- * and up to 128K bytes may be transferred on channels 5-7 in one operation. 
- *
- */
-
-#define MAX_DMA_CHANNELS       8
-
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS      (PAGE_OFFSET+0x1000000)
-
-/* 8237 DMA controllers */
-#define IO_DMA1_BASE   0x00    /* 8 bit slave DMA, channels 0..3 */
-#define IO_DMA2_BASE   0xC0    /* 16 bit master DMA, ch 4(=slave input)..7 */
-
-/* DMA controller registers */
-#define DMA1_CMD_REG           0x08    /* command register (w) */
-#define DMA1_STAT_REG          0x08    /* status register (r) */
-#define DMA1_REQ_REG            0x09    /* request register (w) */
-#define DMA1_MASK_REG          0x0A    /* single-channel mask (w) */
-#define DMA1_MODE_REG          0x0B    /* mode register (w) */
-#define DMA1_CLEAR_FF_REG      0x0C    /* clear pointer flip-flop (w) */
-#define DMA1_TEMP_REG           0x0D    /* Temporary Register (r) */
-#define DMA1_RESET_REG         0x0D    /* Master Clear (w) */
-#define DMA1_CLR_MASK_REG       0x0E    /* Clear Mask */
-#define DMA1_MASK_ALL_REG       0x0F    /* all-channels mask (w) */
-
-#define DMA2_CMD_REG           0xD0    /* command register (w) */
-#define DMA2_STAT_REG          0xD0    /* status register (r) */
-#define DMA2_REQ_REG            0xD2    /* request register (w) */
-#define DMA2_MASK_REG          0xD4    /* single-channel mask (w) */
-#define DMA2_MODE_REG          0xD6    /* mode register (w) */
-#define DMA2_CLEAR_FF_REG      0xD8    /* clear pointer flip-flop (w) */
-#define DMA2_TEMP_REG           0xDA    /* Temporary Register (r) */
-#define DMA2_RESET_REG         0xDA    /* Master Clear (w) */
-#define DMA2_CLR_MASK_REG       0xDC    /* Clear Mask */
-#define DMA2_MASK_ALL_REG       0xDE    /* all-channels mask (w) */
-
-#define DMA_ADDR_0              0x00    /* DMA address registers */
-#define DMA_ADDR_1              0x02
-#define DMA_ADDR_2              0x04
-#define DMA_ADDR_3              0x06
-#define DMA_ADDR_4              0xC0
-#define DMA_ADDR_5              0xC4
-#define DMA_ADDR_6              0xC8
-#define DMA_ADDR_7              0xCC
-
-#define DMA_CNT_0               0x01    /* DMA count registers */
-#define DMA_CNT_1               0x03
-#define DMA_CNT_2               0x05
-#define DMA_CNT_3               0x07
-#define DMA_CNT_4               0xC2
-#define DMA_CNT_5               0xC6
-#define DMA_CNT_6               0xCA
-#define DMA_CNT_7               0xCE
-
-#define DMA_PAGE_0              0x87    /* DMA page registers */
-#define DMA_PAGE_1              0x83
-#define DMA_PAGE_2              0x81
-#define DMA_PAGE_3              0x82
-#define DMA_PAGE_5              0x8B
-#define DMA_PAGE_6              0x89
-#define DMA_PAGE_7              0x8A
-
-#define DMA_MODE_READ  0x44    /* I/O to memory, no autoinit, increment, single mode */
-#define DMA_MODE_WRITE 0x48    /* memory to I/O, no autoinit, increment, single mode */
-#define DMA_MODE_CASCADE 0xC0   /* pass thru DREQ->HRQ, DACK<-HLDA only */
-
-#define DMA_AUTOINIT   0x10
-
-
-extern spinlock_t  dma_spin_lock;
-
-static __inline__ unsigned long claim_dma_lock(void)
-{
-       unsigned long flags;
-       spin_lock_irqsave(&dma_spin_lock, flags);
-       return flags;
-}
-
-static __inline__ void release_dma_lock(unsigned long flags)
-{
-       spin_unlock_irqrestore(&dma_spin_lock, flags);
-}
-
-/* enable/disable a specific DMA channel */
-static __inline__ void enable_dma(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(dmanr,  DMA1_MASK_REG);
-       else
-               dma_outb(dmanr & 3,  DMA2_MASK_REG);
-}
-
-static __inline__ void disable_dma(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(dmanr | 4,  DMA1_MASK_REG);
-       else
-               dma_outb((dmanr & 3) | 4,  DMA2_MASK_REG);
-}
-
-/* Clear the 'DMA Pointer Flip Flop'.
- * Write 0 for LSB/MSB, 1 for MSB/LSB access.
- * Use this once to initialize the FF to a known state.
- * After that, keep track of it. :-)
- * --- In order to do that, the DMA routines below should ---
- * --- only be used while holding the DMA lock ! ---
- */
-static __inline__ void clear_dma_ff(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(0,  DMA1_CLEAR_FF_REG);
-       else
-               dma_outb(0,  DMA2_CLEAR_FF_REG);
-}
-
-/* set mode (above) for a specific DMA channel */
-static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
-{
-       if (dmanr<=3)
-               dma_outb(mode | dmanr,  DMA1_MODE_REG);
-       else
-               dma_outb(mode | (dmanr&3),  DMA2_MODE_REG);
-}
-
-/* Set only the page register bits of the transfer address.
- * This is used for successive transfers when we know the contents of
- * the lower 16 bits of the DMA current address register, but a 64k boundary
- * may have been crossed.
- */
-static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
-{
-       switch(dmanr) {
-               case 0:
-                       dma_outb(pagenr, DMA_PAGE_0);
-                       break;
-               case 1:
-                       dma_outb(pagenr, DMA_PAGE_1);
-                       break;
-               case 2:
-                       dma_outb(pagenr, DMA_PAGE_2);
-                       break;
-               case 3:
-                       dma_outb(pagenr, DMA_PAGE_3);
-                       break;
-               case 5:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_5);
-                       break;
-               case 6:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_6);
-                       break;
-               case 7:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_7);
-                       break;
-       }
-}
-
-
-/* Set transfer address & page bits for specific DMA channel.
- * Assumes dma flipflop is clear.
- */
-static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
-{
-       set_dma_page(dmanr, a>>16);
-       if (dmanr <= 3)  {
-           dma_outb( a & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
-            dma_outb( (a>>8) & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
-       }  else  {
-           dma_outb( (a>>1) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
-           dma_outb( (a>>9) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
-       }
-}
-
-
-/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
- * a specific DMA channel.
- * You must ensure the parameters are valid.
- * NOTE: from a manual: "the number of transfers is one more
- * than the initial word count"! This is taken into account.
- * Assumes dma flip-flop is clear.
- * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
- */
-static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
-{
-        count--;
-       if (dmanr <= 3)  {
-           dma_outb( count & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
-           dma_outb( (count>>8) & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
-        } else {
-           dma_outb( (count>>1) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
-           dma_outb( (count>>9) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
-        }
-}
-
-
-/* Get DMA residue count. After a DMA transfer, this
- * should return zero. Reading this while a DMA transfer is
- * still in progress will return unpredictable results.
- * If called before the channel has been used, it may return 1.
- * Otherwise, it returns the number of _bytes_ left to transfer.
- *
- * Assumes DMA flip-flop is clear.
- */
-static __inline__ int get_dma_residue(unsigned int dmanr)
-{
-       unsigned int io_port = (dmanr<=3)? ((dmanr&3)<<1) + 1 + IO_DMA1_BASE
-                                        : ((dmanr&3)<<2) + 2 + IO_DMA2_BASE;
-
-       /* using short to get 16-bit wrap around */
-       unsigned short count;
-
-       count = 1 + dma_inb(io_port);
-       count += dma_inb(io_port) << 8;
-       
-       return (dmanr<=3)? count : (count<<1);
-}
-
-
-/* These are in kernel/dma.c: */
-extern int request_dma(unsigned int dmanr, const char * device_id);    /* reserve a DMA channel */
-extern void free_dma(unsigned int dmanr);      /* release it again */
-
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy   (0)
-#endif
-
-#endif /* _ASM_DMA_H */
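(Illustrative note, not part of the commit.) The ISA DMA helpers deleted above survive unchanged in the unified dma.h; they are normally driven as one fixed sequence under the DMA spinlock. A minimal sketch, assuming a driver already owns channel "chan" via request_dma() and has a physical buffer "phys"/"len" below 16MB:

#include <asm/dma.h>

/* Sketch only: the mask/clear/mode/addr/count/unmask sequence described in
 * the header's notes.  "chan", "phys" and "len" are assumptions here. */
static void example_start_isa_dma(unsigned int chan,
                                  unsigned int phys, unsigned int len)
{
        unsigned long flags = claim_dma_lock();

        disable_dma(chan);
        clear_dma_ff(chan);                 /* put the flip-flop in a known state */
        set_dma_mode(chan, DMA_MODE_READ);  /* device -> memory, single mode */
        set_dma_addr(chan, phys);           /* physical address, low 16MB only */
        set_dma_count(chan, len);           /* bytes; hardware latches len - 1 */
        enable_dma(chan);

        release_dma_lock(flags);
}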
diff --git a/include/asm-x86/dma_64.h b/include/asm-x86/dma_64.h
deleted file mode 100644 (file)
index a37c16f..0000000
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * linux/include/asm/dma.h: Defines for using and allocating dma channels.
- * Written by Hennus Bergman, 1992.
- * High DMA channel support & info by Hannu Savolainen
- * and John Boyd, Nov. 1992.
- */
-
-#ifndef _ASM_DMA_H
-#define _ASM_DMA_H
-
-#include <linux/spinlock.h>    /* And spinlocks */
-#include <asm/io.h>            /* need byte IO */
-#include <linux/delay.h>
-
-
-#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
-#define dma_outb       outb_p
-#else
-#define dma_outb       outb
-#endif
-
-#define dma_inb                inb
-
-/*
- * NOTES about DMA transfers:
- *
- *  controller 1: channels 0-3, byte operations, ports 00-1F
- *  controller 2: channels 4-7, word operations, ports C0-DF
- *
- *  - ALL registers are 8 bits only, regardless of transfer size
- *  - channel 4 is not used - cascades 1 into 2.
- *  - channels 0-3 are byte - addresses/counts are for physical bytes
- *  - channels 5-7 are word - addresses/counts are for physical words
- *  - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
- *  - transfer count loaded to registers is 1 less than actual count
- *  - controller 2 offsets are all even (2x offsets for controller 1)
- *  - page registers for 5-7 don't use data bit 0, represent 128K pages
- *  - page registers for 0-3 use bit 0, represent 64K pages
- *
- * DMA transfers are limited to the lower 16MB of _physical_ memory.  
- * Note that addresses loaded into registers must be _physical_ addresses,
- * not logical addresses (which may differ if paging is active).
- *
- *  Address mapping for channels 0-3:
- *
- *   A23 ... A16 A15 ... A8  A7 ... A0    (Physical addresses)
- *    |  ...  |   |  ... |   |  ... |
- *    |  ...  |   |  ... |   |  ... |
- *    |  ...  |   |  ... |   |  ... |
- *   P7  ...  P0  A7 ... A0  A7 ... A0   
- * |    Page    | Addr MSB | Addr LSB |   (DMA registers)
- *
- *  Address mapping for channels 5-7:
- *
- *   A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0    (Physical addresses)
- *    |  ...  |   \   \   ... \  \  \  ... \  \
- *    |  ...  |    \   \   ... \  \  \  ... \  (not used)
- *    |  ...  |     \   \   ... \  \  \  ... \
- *   P7  ...  P1 (0) A7 A6  ... A0 A7 A6 ... A0   
- * |      Page      |  Addr MSB   |  Addr LSB  |   (DMA registers)
- *
- * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
- * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
- * the hardware level, so odd-byte transfers aren't possible).
- *
- * Transfer count (_not # bytes_) is limited to 64K, represented as actual
- * count - 1 : 64K => 0xFFFF, 1 => 0x0000.  Thus, count is always 1 or more,
- * and up to 128K bytes may be transferred on channels 5-7 in one operation. 
- *
- */
-
-#define MAX_DMA_CHANNELS       8
-
-
-/* 16MB ISA DMA zone */
-#define MAX_DMA_PFN   ((16*1024*1024) >> PAGE_SHIFT)
-
-/* 4GB broken PCI/AGP hardware bus master zone */
-#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
-
-/* Compat define for old dma zone */
-#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
-
-/* 8237 DMA controllers */
-#define IO_DMA1_BASE   0x00    /* 8 bit slave DMA, channels 0..3 */
-#define IO_DMA2_BASE   0xC0    /* 16 bit master DMA, ch 4(=slave input)..7 */
-
-/* DMA controller registers */
-#define DMA1_CMD_REG           0x08    /* command register (w) */
-#define DMA1_STAT_REG          0x08    /* status register (r) */
-#define DMA1_REQ_REG            0x09    /* request register (w) */
-#define DMA1_MASK_REG          0x0A    /* single-channel mask (w) */
-#define DMA1_MODE_REG          0x0B    /* mode register (w) */
-#define DMA1_CLEAR_FF_REG      0x0C    /* clear pointer flip-flop (w) */
-#define DMA1_TEMP_REG           0x0D    /* Temporary Register (r) */
-#define DMA1_RESET_REG         0x0D    /* Master Clear (w) */
-#define DMA1_CLR_MASK_REG       0x0E    /* Clear Mask */
-#define DMA1_MASK_ALL_REG       0x0F    /* all-channels mask (w) */
-
-#define DMA2_CMD_REG           0xD0    /* command register (w) */
-#define DMA2_STAT_REG          0xD0    /* status register (r) */
-#define DMA2_REQ_REG            0xD2    /* request register (w) */
-#define DMA2_MASK_REG          0xD4    /* single-channel mask (w) */
-#define DMA2_MODE_REG          0xD6    /* mode register (w) */
-#define DMA2_CLEAR_FF_REG      0xD8    /* clear pointer flip-flop (w) */
-#define DMA2_TEMP_REG           0xDA    /* Temporary Register (r) */
-#define DMA2_RESET_REG         0xDA    /* Master Clear (w) */
-#define DMA2_CLR_MASK_REG       0xDC    /* Clear Mask */
-#define DMA2_MASK_ALL_REG       0xDE    /* all-channels mask (w) */
-
-#define DMA_ADDR_0              0x00    /* DMA address registers */
-#define DMA_ADDR_1              0x02
-#define DMA_ADDR_2              0x04
-#define DMA_ADDR_3              0x06
-#define DMA_ADDR_4              0xC0
-#define DMA_ADDR_5              0xC4
-#define DMA_ADDR_6              0xC8
-#define DMA_ADDR_7              0xCC
-
-#define DMA_CNT_0               0x01    /* DMA count registers */
-#define DMA_CNT_1               0x03
-#define DMA_CNT_2               0x05
-#define DMA_CNT_3               0x07
-#define DMA_CNT_4               0xC2
-#define DMA_CNT_5               0xC6
-#define DMA_CNT_6               0xCA
-#define DMA_CNT_7               0xCE
-
-#define DMA_PAGE_0              0x87    /* DMA page registers */
-#define DMA_PAGE_1              0x83
-#define DMA_PAGE_2              0x81
-#define DMA_PAGE_3              0x82
-#define DMA_PAGE_5              0x8B
-#define DMA_PAGE_6              0x89
-#define DMA_PAGE_7              0x8A
-
-#define DMA_MODE_READ  0x44    /* I/O to memory, no autoinit, increment, single mode */
-#define DMA_MODE_WRITE 0x48    /* memory to I/O, no autoinit, increment, single mode */
-#define DMA_MODE_CASCADE 0xC0   /* pass thru DREQ->HRQ, DACK<-HLDA only */
-
-#define DMA_AUTOINIT   0x10
-
-
-extern spinlock_t  dma_spin_lock;
-
-static __inline__ unsigned long claim_dma_lock(void)
-{
-       unsigned long flags;
-       spin_lock_irqsave(&dma_spin_lock, flags);
-       return flags;
-}
-
-static __inline__ void release_dma_lock(unsigned long flags)
-{
-       spin_unlock_irqrestore(&dma_spin_lock, flags);
-}
-
-/* enable/disable a specific DMA channel */
-static __inline__ void enable_dma(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(dmanr,  DMA1_MASK_REG);
-       else
-               dma_outb(dmanr & 3,  DMA2_MASK_REG);
-}
-
-static __inline__ void disable_dma(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(dmanr | 4,  DMA1_MASK_REG);
-       else
-               dma_outb((dmanr & 3) | 4,  DMA2_MASK_REG);
-}
-
-/* Clear the 'DMA Pointer Flip Flop'.
- * Write 0 for LSB/MSB, 1 for MSB/LSB access.
- * Use this once to initialize the FF to a known state.
- * After that, keep track of it. :-)
- * --- In order to do that, the DMA routines below should ---
- * --- only be used while holding the DMA lock ! ---
- */
-static __inline__ void clear_dma_ff(unsigned int dmanr)
-{
-       if (dmanr<=3)
-               dma_outb(0,  DMA1_CLEAR_FF_REG);
-       else
-               dma_outb(0,  DMA2_CLEAR_FF_REG);
-}
-
-/* set mode (above) for a specific DMA channel */
-static __inline__ void set_dma_mode(unsigned int dmanr, char mode)
-{
-       if (dmanr<=3)
-               dma_outb(mode | dmanr,  DMA1_MODE_REG);
-       else
-               dma_outb(mode | (dmanr&3),  DMA2_MODE_REG);
-}
-
-/* Set only the page register bits of the transfer address.
- * This is used for successive transfers when we know the contents of
- * the lower 16 bits of the DMA current address register, but a 64k boundary
- * may have been crossed.
- */
-static __inline__ void set_dma_page(unsigned int dmanr, char pagenr)
-{
-       switch(dmanr) {
-               case 0:
-                       dma_outb(pagenr, DMA_PAGE_0);
-                       break;
-               case 1:
-                       dma_outb(pagenr, DMA_PAGE_1);
-                       break;
-               case 2:
-                       dma_outb(pagenr, DMA_PAGE_2);
-                       break;
-               case 3:
-                       dma_outb(pagenr, DMA_PAGE_3);
-                       break;
-               case 5:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_5);
-                       break;
-               case 6:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_6);
-                       break;
-               case 7:
-                       dma_outb(pagenr & 0xfe, DMA_PAGE_7);
-                       break;
-       }
-}
-
-
-/* Set transfer address & page bits for specific DMA channel.
- * Assumes dma flipflop is clear.
- */
-static __inline__ void set_dma_addr(unsigned int dmanr, unsigned int a)
-{
-       set_dma_page(dmanr, a>>16);
-       if (dmanr <= 3)  {
-           dma_outb( a & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
-            dma_outb( (a>>8) & 0xff, ((dmanr&3)<<1) + IO_DMA1_BASE );
-       }  else  {
-           dma_outb( (a>>1) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
-           dma_outb( (a>>9) & 0xff, ((dmanr&3)<<2) + IO_DMA2_BASE );
-       }
-}
-
-
-/* Set transfer size (max 64k for DMA1..3, 128k for DMA5..7) for
- * a specific DMA channel.
- * You must ensure the parameters are valid.
- * NOTE: from a manual: "the number of transfers is one more
- * than the initial word count"! This is taken into account.
- * Assumes dma flip-flop is clear.
- * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
- */
-static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
-{
-        count--;
-       if (dmanr <= 3)  {
-           dma_outb( count & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
-           dma_outb( (count>>8) & 0xff, ((dmanr&3)<<1) + 1 + IO_DMA1_BASE );
-        } else {
-           dma_outb( (count>>1) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
-           dma_outb( (count>>9) & 0xff, ((dmanr&3)<<2) + 2 + IO_DMA2_BASE );
-        }
-}
-
-
-/* Get DMA residue count. After a DMA transfer, this
- * should return zero. Reading this while a DMA transfer is
- * still in progress will return unpredictable results.
- * If called before the channel has been used, it may return 1.
- * Otherwise, it returns the number of _bytes_ left to transfer.
- *
- * Assumes DMA flip-flop is clear.
- */
-static __inline__ int get_dma_residue(unsigned int dmanr)
-{
-       unsigned int io_port = (dmanr<=3)? ((dmanr&3)<<1) + 1 + IO_DMA1_BASE
-                                        : ((dmanr&3)<<2) + 2 + IO_DMA2_BASE;
-
-       /* using short to get 16-bit wrap around */
-       unsigned short count;
-
-       count = 1 + dma_inb(io_port);
-       count += dma_inb(io_port) << 8;
-       
-       return (dmanr<=3)? count : (count<<1);
-}
-
-
-/* These are in kernel/dma.c: */
-extern int request_dma(unsigned int dmanr, const char * device_id);    /* reserve a DMA channel */
-extern void free_dma(unsigned int dmanr);      /* release it again */
-
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy   (0)
-#endif
-
-#endif /* _ASM_DMA_H */
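(Illustrative note, not part of the commit.) The only substantive difference from the 32-bit header is the pair of zone limits near the top. With 4K pages, a quick worked example of what they evaluate to:

/* Sketch only, assuming PAGE_SHIFT == 12 (4K pages):
 *
 *   MAX_DMA_PFN   = (16 * 1024 * 1024)      >> 12  =    4096 pfns  (16MB ISA DMA zone)
 *   MAX_DMA32_PFN = (4UL * 1024*1024*1024)  >> 12  = 1048576 pfns  (4GB ZONE_DMA32)
 *
 * i.e. the boundaries the page allocator uses for ZONE_DMA and ZONE_DMA32
 * on x86-64. */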
index 8e2b0e6aa8e7eb81687f4861758a141084c5402e..1241e6ad1935b99c6273554eb8501f05833b28cb 100644 (file)
@@ -5,9 +5,6 @@
 
 #ifdef CONFIG_X86_32
 
-/* Use early IO mappings for DMI because it's initialized early */
-#define dmi_ioremap bt_ioremap
-#define dmi_iounmap bt_iounmap
 #define dmi_alloc alloc_bootmem
 
 #else /* CONFIG_X86_32 */
@@ -22,14 +19,15 @@ extern char dmi_alloc_data[DMI_MAX_DATA];
 static inline void *dmi_alloc(unsigned len)
 {
        int idx = dmi_alloc_index;
-       if ((dmi_alloc_index += len) > DMI_MAX_DATA)
+       if ((dmi_alloc_index + len) > DMI_MAX_DATA)
                return NULL;
+       dmi_alloc_index += len;
        return dmi_alloc_data + idx;
 }
 
+#endif
+
 #define dmi_ioremap early_ioremap
 #define dmi_iounmap early_iounmap
 
 #endif
-
-#endif
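(Illustrative note, not part of the commit.) The dmi.h hunk above reorders the bounds check in dmi_alloc(); a small sketch of why, with a hypothetical oversized request:

/* Sketch only: with the old "(dmi_alloc_index += len) > DMI_MAX_DATA" test,
 * a failed request still advanced the index and poisoned the pool:
 *
 *   dmi_alloc(DMI_MAX_DATA + 1);   // returns NULL, but dmi_alloc_index is already bumped
 *   dmi_alloc(8);                  // would have fit, yet now also returns NULL
 *
 * The new version commits dmi_alloc_index += len only after the check passes. */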
diff --git a/include/asm-x86/ds.h b/include/asm-x86/ds.h
new file mode 100644 (file)
index 0000000..7881368
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Debug Store (DS) support
+ *
+ * This provides a low-level interface to the hardware's Debug Store
+ * feature that is used for last branch recording (LBR) and
+ * precise-event based sampling (PEBS).
+ *
+ * Different architectures use a different DS layout/pointer size.
+ * The below functions therefore work on a void*.
+ *
+ *
+ * Since there is no user for PEBS, yet, only LBR (or branch
+ * trace store, BTS) is supported.
+ *
+ *
+ * Copyright (C) 2007 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ */
+
+#ifndef _ASM_X86_DS_H
+#define _ASM_X86_DS_H
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+struct cpuinfo_x86;
+
+
+/* a branch trace record entry
+ *
+ * In order to unify the interface between various processor versions,
+ * we use the below data structure for all processors.
+ */
+enum bts_qualifier {
+       BTS_INVALID = 0,
+       BTS_BRANCH,
+       BTS_TASK_ARRIVES,
+       BTS_TASK_DEPARTS
+};
+
+struct bts_struct {
+       u64 qualifier;
+       union {
+               /* BTS_BRANCH */
+               struct {
+                       u64 from_ip;
+                       u64 to_ip;
+               } lbr;
+               /* BTS_TASK_ARRIVES or
+                  BTS_TASK_DEPARTS */
+               u64 jiffies;
+       } variant;
+};
+
+/* Overflow handling mechanisms */
+#define DS_O_SIGNAL    1 /* send overflow signal */
+#define DS_O_WRAP      2 /* wrap around */
+
+extern int ds_allocate(void **, size_t);
+extern int ds_free(void **);
+extern int ds_get_bts_size(void *);
+extern int ds_get_bts_end(void *);
+extern int ds_get_bts_index(void *);
+extern int ds_set_overflow(void *, int);
+extern int ds_get_overflow(void *);
+extern int ds_clear(void *);
+extern int ds_read_bts(void *, int, struct bts_struct *);
+extern int ds_write_bts(void *, const struct bts_struct *);
+extern unsigned long ds_debugctl_mask(void);
+extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *c);
+
+#endif /* _ASM_X86_DS_H */
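(Illustrative note, not part of the commit.) A minimal sketch of how a consumer might drain branch-trace records through the interface declared above; treating ds_get_bts_index() as "records written so far" and "< 0 on error" for ds_read_bts() are assumptions of the example, not guarantees of the header:

#include <linux/kernel.h>
#include <asm/ds.h>

/* Sketch only: print the BTS_BRANCH records currently held in DS area "ds". */
static void example_dump_bts(void *ds)
{
        struct bts_struct rec;
        int i, n = ds_get_bts_index(ds);        /* assumed: records written so far */

        for (i = 0; i < n; i++) {
                if (ds_read_bts(ds, i, &rec) < 0)   /* assumed: negative on error */
                        break;
                if (rec.qualifier == BTS_BRANCH)
                        printk(KERN_DEBUG "branch %llx -> %llx\n",
                               (unsigned long long)rec.variant.lbr.from_ip,
                               (unsigned long long)rec.variant.lbr.to_ip);
        }
}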
index 3e214f39fad3df2b9674d1b1f9cb1d8c9abd3338..7004251fc66bd7925a787bd09ccb675dc0714de8 100644 (file)
@@ -22,6 +22,12 @@ struct e820map {
 };
 #endif /* __ASSEMBLY__ */
 
+#define ISA_START_ADDRESS      0xa0000
+#define ISA_END_ADDRESS                0x100000
+
+#define BIOS_BEGIN             0x000a0000
+#define BIOS_END               0x00100000
+
 #ifdef __KERNEL__
 #ifdef CONFIG_X86_32
 # include "e820_32.h"
index 03f60c690c8a147cdf1c9296e759023d1b17f054..f1da7ebd19051dd040ce24f77db73ef02eee8ff3 100644 (file)
 #ifndef __E820_HEADER
 #define __E820_HEADER
 
+#include <linux/ioport.h>
+
 #define HIGH_MEMORY    (1024*1024)
 
 #ifndef __ASSEMBLY__
 
 extern struct e820map e820;
+extern void update_e820(void);
 
 extern int e820_all_mapped(unsigned long start, unsigned long end,
                           unsigned type);
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
 extern void find_max_pfn(void);
 extern void register_bootmem_low_pages(unsigned long max_low_pfn);
+extern void add_memory_region(unsigned long long start,
+                             unsigned long long size, int type);
 extern void e820_register_memory(void);
 extern void limit_regions(unsigned long long size);
 extern void print_memory_map(char *who);
+extern void init_iomem_resources(struct resource *code_resource,
+                           struct resource *data_resource,
+                           struct resource *bss_resource);
 
 #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
 extern void e820_mark_nosave_regions(void);
@@ -35,5 +43,6 @@ static inline void e820_mark_nosave_regions(void)
 }
 #endif
 
+
 #endif/*!__ASSEMBLY__*/
 #endif/*__E820_HEADER*/
index 0bd4787a5d575cf3675103ef114f3ce0ea0e9c8a..51e4170f9ca54e021f83b431977d0dbe7c6da3ba 100644 (file)
@@ -11,6 +11,8 @@
 #ifndef __E820_HEADER
 #define __E820_HEADER
 
+#include <linux/ioport.h>
+
 #ifndef __ASSEMBLY__
 extern unsigned long find_e820_area(unsigned long start, unsigned long end, 
                                    unsigned size);
@@ -19,11 +21,15 @@ extern void add_memory_region(unsigned long start, unsigned long size,
 extern void setup_memory_region(void);
 extern void contig_e820_setup(void); 
 extern unsigned long e820_end_of_ram(void);
-extern void e820_reserve_resources(void);
+extern void e820_reserve_resources(struct resource *code_resource,
+               struct resource *data_resource, struct resource *bss_resource);
 extern void e820_mark_nosave_regions(void);
-extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern int e820_any_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_any_valid(unsigned long start, unsigned long end);
+extern int e820_all_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_all_valid(unsigned long start, unsigned long end);
 extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
 extern void e820_setup_gap(void);
@@ -33,9 +39,11 @@ extern void e820_register_active_regions(int nid,
 extern void finish_e820_parsing(void);
 
 extern struct e820map e820;
+extern void update_e820(void);
+
+extern void reserve_early(unsigned long start, unsigned long end);
+extern void early_res_to_bootmem(void);
 
-extern unsigned ebda_addr, ebda_size;
-extern unsigned long nodemap_addr, nodemap_size;
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
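(Illustrative note, not part of the commit.) Both flavours of the header export the same mapping predicates; a short sketch of their typical use when validating a physical range, where "start"/"end" are hypothetical and E820_RAM / E820_RESERVED are the usual type constants from the e820 headers:

/* Sketch only. */
if (e820_all_mapped(start, end, E820_RAM)) {
        /* the whole [start, end) range is usable RAM */
} else if (e820_any_mapped(start, end, E820_RESERVED)) {
        /* at least part of it overlaps a reserved region */
}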
diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h
new file mode 100644 (file)
index 0000000..9c68a1f
--- /dev/null
@@ -0,0 +1,97 @@
+#ifndef _ASM_X86_EFI_H
+#define _ASM_X86_EFI_H
+
+#ifdef CONFIG_X86_32
+
+extern unsigned long asmlinkage efi_call_phys(void *, ...);
+
+#define efi_call_phys0(f)              efi_call_phys(f)
+#define efi_call_phys1(f, a1)          efi_call_phys(f, a1)
+#define efi_call_phys2(f, a1, a2)      efi_call_phys(f, a1, a2)
+#define efi_call_phys3(f, a1, a2, a3)  efi_call_phys(f, a1, a2, a3)
+#define efi_call_phys4(f, a1, a2, a3, a4)      \
+       efi_call_phys(f, a1, a2, a3, a4)
+#define efi_call_phys5(f, a1, a2, a3, a4, a5)  \
+       efi_call_phys(f, a1, a2, a3, a4, a5)
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6)      \
+       efi_call_phys(f, a1, a2, a3, a4, a5, a6)
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+
+#define efi_call_virt(f, args...) \
+     ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
+
+#define efi_call_virt0(f)              efi_call_virt(f)
+#define efi_call_virt1(f, a1)          efi_call_virt(f, a1)
+#define efi_call_virt2(f, a1, a2)      efi_call_virt(f, a1, a2)
+#define efi_call_virt3(f, a1, a2, a3)  efi_call_virt(f, a1, a2, a3)
+#define efi_call_virt4(f, a1, a2, a3, a4)      \
+       efi_call_virt(f, a1, a2, a3, a4)
+#define efi_call_virt5(f, a1, a2, a3, a4, a5)  \
+       efi_call_virt(f, a1, a2, a3, a4, a5)
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)      \
+       efi_call_virt(f, a1, a2, a3, a4, a5, a6)
+
+#define efi_ioremap(addr, size)                        ioremap(addr, size)
+
+#else /* !CONFIG_X86_32 */
+
+#define MAX_EFI_IO_PAGES       100
+
+extern u64 efi_call0(void *fp);
+extern u64 efi_call1(void *fp, u64 arg1);
+extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
+extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
+extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
+extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
+                    u64 arg4, u64 arg5);
+extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
+                    u64 arg4, u64 arg5, u64 arg6);
+
+#define efi_call_phys0(f)                      \
+       efi_call0((void *)(f))
+#define efi_call_phys1(f, a1)                  \
+       efi_call1((void *)(f), (u64)(a1))
+#define efi_call_phys2(f, a1, a2)                      \
+       efi_call2((void *)(f), (u64)(a1), (u64)(a2))
+#define efi_call_phys3(f, a1, a2, a3)                          \
+       efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_phys4(f, a1, a2, a3, a4)                              \
+       efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),         \
+                 (u64)(a4))
+#define efi_call_phys5(f, a1, a2, a3, a4, a5)                          \
+       efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),         \
+                 (u64)(a4), (u64)(a5))
+#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6)                      \
+       efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),         \
+                 (u64)(a4), (u64)(a5), (u64)(a6))
+
+#define efi_call_virt0(f)                              \
+       efi_call0((void *)(efi.systab->runtime->f))
+#define efi_call_virt1(f, a1)                                  \
+       efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
+#define efi_call_virt2(f, a1, a2)                                      \
+       efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3)                                  \
+       efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+                 (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4)                              \
+       efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+                 (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5)                          \
+       efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+                 (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)                      \
+       efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+                 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+
+extern void *efi_ioremap(unsigned long offset, unsigned long size);
+
+#endif /* CONFIG_X86_32 */
+
+extern void efi_reserve_bootmem(void);
+extern void efi_call_phys_prelog(void);
+extern void efi_call_phys_epilog(void);
+
+#endif
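(Illustrative note, not part of the commit.) A sketch of how the runtime-service wrappers above are meant to be used; efi_status_t, efi_time_t and the efi.systab layout come from <linux/efi.h> and are assumptions of the example rather than something this header defines:

#include <linux/efi.h>
#include <asm/efi.h>

/* Sketch only: call the firmware's GetTime() through the per-arch wrapper.
 * On 32-bit the macro forces the arguments onto the stack (regparm(0));
 * on 64-bit it goes through the efi_call2() assembly thunk. */
static efi_status_t example_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
        return efi_call_virt2(get_time, tm, tc);
}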
index ec42a4d2e83b1ab60ab3e24e8fe0732a8431ee4c..d9c94e7072894e9d1f584c5da1d9d4704313dcaa 100644 (file)
@@ -73,18 +73,23 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 #endif
 
 #ifdef __KERNEL__
+#include <asm/vdso.h>
 
-#ifdef CONFIG_X86_32
-#include <asm/processor.h>
-#include <asm/system.h>                /* for savesegment */
-#include <asm/desc.h>
+extern unsigned int vdso_enabled;
 
 /*
  * This is used to ensure we don't load something for the wrong architecture.
  */
-#define elf_check_arch(x) \
+#define elf_check_arch_ia32(x) \
        (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
 
+#ifdef CONFIG_X86_32
+#include <asm/processor.h>
+#include <asm/system.h>                /* for savesegment */
+#include <asm/desc.h>
+
+#define elf_check_arch(x)      elf_check_arch_ia32(x)
+
 /* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
    contains a pointer to a function which might be registered using `atexit'.
    This provides a mean for the dynamic linker to call DT_FINI functions for
@@ -96,36 +101,38 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
    just to make things more deterministic.
  */
 #define ELF_PLAT_INIT(_r, load_addr)   do { \
-       _r->ebx = 0; _r->ecx = 0; _r->edx = 0; \
-       _r->esi = 0; _r->edi = 0; _r->ebp = 0; \
-       _r->eax = 0; \
+       _r->bx = 0; _r->cx = 0; _r->dx = 0; \
+       _r->si = 0; _r->di = 0; _r->bp = 0; \
+       _r->ax = 0; \
 } while (0)
 
-/* regs is struct pt_regs, pr_reg is elf_gregset_t (which is
-   now struct_user_regs, they are different) */
-
-#define ELF_CORE_COPY_REGS(pr_reg, regs)               \
-       pr_reg[0] = regs->ebx;                          \
-       pr_reg[1] = regs->ecx;                          \
-       pr_reg[2] = regs->edx;                          \
-       pr_reg[3] = regs->esi;                          \
-       pr_reg[4] = regs->edi;                          \
-       pr_reg[5] = regs->ebp;                          \
-       pr_reg[6] = regs->eax;                          \
-       pr_reg[7] = regs->xds & 0xffff;                 \
-       pr_reg[8] = regs->xes & 0xffff;                 \
-       pr_reg[9] = regs->xfs & 0xffff;                 \
-       savesegment(gs,pr_reg[10]);                     \
-       pr_reg[11] = regs->orig_eax;                    \
-       pr_reg[12] = regs->eip;                         \
-       pr_reg[13] = regs->xcs & 0xffff;                \
-       pr_reg[14] = regs->eflags;                      \
-       pr_reg[15] = regs->esp;                         \
-       pr_reg[16] = regs->xss & 0xffff;
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different)
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do {          \
+       pr_reg[0] = regs->bx;                           \
+       pr_reg[1] = regs->cx;                           \
+       pr_reg[2] = regs->dx;                           \
+       pr_reg[3] = regs->si;                           \
+       pr_reg[4] = regs->di;                           \
+       pr_reg[5] = regs->bp;                           \
+       pr_reg[6] = regs->ax;                           \
+       pr_reg[7] = regs->ds & 0xffff;                  \
+       pr_reg[8] = regs->es & 0xffff;                  \
+       pr_reg[9] = regs->fs & 0xffff;                  \
+       savesegment(gs, pr_reg[10]);                    \
+       pr_reg[11] = regs->orig_ax;                     \
+       pr_reg[12] = regs->ip;                          \
+       pr_reg[13] = regs->cs & 0xffff;                 \
+       pr_reg[14] = regs->flags;                       \
+       pr_reg[15] = regs->sp;                          \
+       pr_reg[16] = regs->ss & 0xffff;                 \
+} while (0);
 
 #define ELF_PLATFORM   (utsname()->machine)
 #define set_personality_64bit()        do { } while (0)
-extern unsigned int vdso_enabled;
 
 #else /* CONFIG_X86_32 */
 
@@ -137,28 +144,57 @@ extern unsigned int vdso_enabled;
 #define elf_check_arch(x) \
        ((x)->e_machine == EM_X86_64)
 
+#define compat_elf_check_arch(x)       elf_check_arch_ia32(x)
+
+static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
+{
+       asm volatile("movl %0,%%fs" :: "r" (0));
+       asm volatile("movl %0,%%es; movl %0,%%ds" : : "r" (__USER32_DS));
+       load_gs_index(0);
+       regs->ip = ip;
+       regs->sp = sp;
+       regs->flags = X86_EFLAGS_IF;
+       regs->cs = __USER32_CS;
+       regs->ss = __USER32_DS;
+}
+
+static inline void elf_common_init(struct thread_struct *t,
+                                  struct pt_regs *regs, const u16 ds)
+{
+       regs->ax = regs->bx = regs->cx = regs->dx = 0;
+       regs->si = regs->di = regs->bp = 0;
+       regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
+       regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+       t->fs = t->gs = 0;
+       t->fsindex = t->gsindex = 0;
+       t->ds = t->es = ds;
+}
+
 #define ELF_PLAT_INIT(_r, load_addr)   do {              \
-       struct task_struct *cur = current;                \
-       (_r)->rbx = 0; (_r)->rcx = 0; (_r)->rdx = 0;      \
-       (_r)->rsi = 0; (_r)->rdi = 0; (_r)->rbp = 0;      \
-       (_r)->rax = 0;                                    \
-       (_r)->r8 = 0;                                     \
-       (_r)->r9 = 0;                                     \
-       (_r)->r10 = 0;                                    \
-       (_r)->r11 = 0;                                    \
-       (_r)->r12 = 0;                                    \
-       (_r)->r13 = 0;                                    \
-       (_r)->r14 = 0;                                    \
-       (_r)->r15 = 0;                                    \
-       cur->thread.fs = 0; cur->thread.gs = 0;           \
-       cur->thread.fsindex = 0; cur->thread.gsindex = 0; \
-       cur->thread.ds = 0; cur->thread.es = 0;           \
+       elf_common_init(&current->thread, _r, 0);         \
        clear_thread_flag(TIF_IA32);                      \
 } while (0)
 
-/* regs is struct pt_regs, pr_reg is elf_gregset_t (which is
-   now struct_user_regs, they are different). Assumes current is the process
-   getting dumped. */
+#define        COMPAT_ELF_PLAT_INIT(regs, load_addr)   \
+       elf_common_init(&current->thread, regs, __USER_DS)
+#define        compat_start_thread(regs, ip, sp)       do {            \
+               start_ia32_thread(regs, ip, sp);                \
+               set_fs(USER_DS);                                \
+       } while (0)
+#define COMPAT_SET_PERSONALITY(ex, ibcs2)      do {            \
+               if (test_thread_flag(TIF_IA32))                 \
+                       clear_thread_flag(TIF_ABI_PENDING);     \
+               else                                            \
+                       set_thread_flag(TIF_ABI_PENDING);       \
+               current->personality |= force_personality32;    \
+       } while (0)
+#define COMPAT_ELF_PLATFORM                    ("i686")
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different). Assumes current is the process
+ * getting dumped.
+ */
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)  do {                 \
        unsigned v;                                             \
@@ -166,22 +202,22 @@ extern unsigned int vdso_enabled;
        (pr_reg)[1] = (regs)->r14;                              \
        (pr_reg)[2] = (regs)->r13;                              \
        (pr_reg)[3] = (regs)->r12;                              \
-       (pr_reg)[4] = (regs)->rbp;                              \
-       (pr_reg)[5] = (regs)->rbx;                              \
+       (pr_reg)[4] = (regs)->bp;                               \
+       (pr_reg)[5] = (regs)->bx;                               \
        (pr_reg)[6] = (regs)->r11;                              \
        (pr_reg)[7] = (regs)->r10;                              \
        (pr_reg)[8] = (regs)->r9;                               \
        (pr_reg)[9] = (regs)->r8;                               \
-       (pr_reg)[10] = (regs)->rax;                             \
-       (pr_reg)[11] = (regs)->rcx;                             \
-       (pr_reg)[12] = (regs)->rdx;                             \
-       (pr_reg)[13] = (regs)->rsi;                             \
-       (pr_reg)[14] = (regs)->rdi;                             \
-       (pr_reg)[15] = (regs)->orig_rax;                        \
-       (pr_reg)[16] = (regs)->rip;                             \
+       (pr_reg)[10] = (regs)->ax;                              \
+       (pr_reg)[11] = (regs)->cx;                              \
+       (pr_reg)[12] = (regs)->dx;                              \
+       (pr_reg)[13] = (regs)->si;                              \
+       (pr_reg)[14] = (regs)->di;                              \
+       (pr_reg)[15] = (regs)->orig_ax;                         \
+       (pr_reg)[16] = (regs)->ip;                              \
        (pr_reg)[17] = (regs)->cs;                              \
-       (pr_reg)[18] = (regs)->eflags;                          \
-       (pr_reg)[19] = (regs)->rsp;                             \
+       (pr_reg)[18] = (regs)->flags;                           \
+       (pr_reg)[19] = (regs)->sp;                              \
        (pr_reg)[20] = (regs)->ss;                              \
        (pr_reg)[21] = current->thread.fs;                      \
        (pr_reg)[22] = current->thread.gs;                      \
@@ -189,15 +225,17 @@ extern unsigned int vdso_enabled;
        asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;       \
        asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;       \
        asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;       \
-} while(0);
+} while (0);
 
 /* I'm not sure if we can use '-' here */
 #define ELF_PLATFORM       ("x86_64")
 extern void set_personality_64bit(void);
-extern int vdso_enabled;
+extern unsigned int sysctl_vsyscall32;
+extern int force_personality32;
 
 #endif /* !CONFIG_X86_32 */
 
+#define CORE_DUMP_USE_REGSET
 #define USE_ELF_CORE_DUMP
 #define ELF_EXEC_PAGESIZE      4096
 
@@ -232,43 +270,24 @@ extern int vdso_enabled;
 
 struct task_struct;
 
-extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
-extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
-
-#define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs)
-#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
+#define        ARCH_DLINFO_IA32(vdso_enabled) \
+do if (vdso_enabled) {                                                 \
+               NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);                    \
+               NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);        \
+} while (0)
 
 #ifdef CONFIG_X86_32
-extern int dump_task_extended_fpu (struct task_struct *,
-                                  struct user_fxsr_struct *);
-#define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) \
-       dump_task_extended_fpu(tsk, elf_xfpregs)
-#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 
 #define VDSO_HIGH_BASE         (__fix_to_virt(FIX_VDSO))
-#define VDSO_CURRENT_BASE      ((unsigned long)current->mm->context.vdso)
-#define VDSO_PRELINK           0
-
-#define VDSO_SYM(x) \
-               (VDSO_CURRENT_BASE + (unsigned long)(x) - VDSO_PRELINK)
-
-#define VDSO_HIGH_EHDR         ((const struct elfhdr *) VDSO_HIGH_BASE)
-#define VDSO_EHDR              ((const struct elfhdr *) VDSO_CURRENT_BASE)
 
-extern void __kernel_vsyscall;
-
-#define VDSO_ENTRY             VDSO_SYM(&__kernel_vsyscall)
+#define ARCH_DLINFO            ARCH_DLINFO_IA32(vdso_enabled)
 
 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
 
-#define ARCH_DLINFO \
-do if (vdso_enabled) {                                                 \
-               NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);                    \
-               NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);        \
-} while (0)
-
 #else /* CONFIG_X86_32 */
 
+#define VDSO_HIGH_BASE         0xffffe000U /* CONFIG_COMPAT_VDSO address */
+
 /* 1GB for 64bit, 8MB for 32bit */
 #define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
 
@@ -277,14 +296,31 @@ do if (vdso_enabled) {                                            \
        NEW_AUX_ENT(AT_SYSINFO_EHDR,(unsigned long)current->mm->context.vdso);\
 } while (0)
 
+#define AT_SYSINFO             32
+
+#define COMPAT_ARCH_DLINFO     ARCH_DLINFO_IA32(sysctl_vsyscall32)
+
+#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+
 #endif /* !CONFIG_X86_32 */
 
+#define VDSO_CURRENT_BASE      ((unsigned long)current->mm->context.vdso)
+
+#define VDSO_ENTRY \
+       ((unsigned long) VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
+
 struct linux_binprm;
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
                                       int executable_stack);
 
+extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+#define compat_arch_setup_additional_pages     syscall32_setup_pages
+
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
 #endif /* __KERNEL__ */
 
 #endif
index 680c3956334542d75617e6d89b82298f906e250a..8e6aef19f8f02ce8be4edf1cf346ac9b257aaa77 100644 (file)
@@ -1,6 +1,18 @@
 #ifndef _ASM_EMERGENCY_RESTART_H
 #define _ASM_EMERGENCY_RESTART_H
 
+enum reboot_type {
+       BOOT_TRIPLE = 't',
+       BOOT_KBD = 'k',
+#ifdef CONFIG_X86_32
+       BOOT_BIOS = 'b',
+#endif
+       BOOT_ACPI = 'a',
+       BOOT_EFI = 'e'
+};
+
+extern enum reboot_type reboot_type;
+
 extern void machine_emergency_restart(void);
 
 #endif /* _ASM_EMERGENCY_RESTART_H */
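(Illustrative note, not part of the commit.) The enum characters mirror the single-letter "reboot=" boot options; a hedged sketch of the mapping a parser could apply (the real parsing lives in the arch reboot code, not in this header):

#include <linux/errno.h>
#include <asm/emergency-restart.h>

/* Sketch only: BOOT_BIOS ('b') additionally exists on 32-bit kernels. */
static int example_set_reboot_type(char c)
{
        switch (c) {
        case 't': reboot_type = BOOT_TRIPLE; break;
        case 'k': reboot_type = BOOT_KBD;    break;
        case 'a': reboot_type = BOOT_ACPI;   break;
        case 'e': reboot_type = BOOT_EFI;    break;
        default:  return -EINVAL;
        }
        return 0;
}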
index 249e753ac8053fd3c7b37ccf2a17eec20e2164aa..a7404d50686b8d7a28c01eb0dd369d69b03dd50f 100644 (file)
@@ -65,7 +65,7 @@ enum fixed_addresses {
 #endif
 #ifdef CONFIG_X86_VISWS_APIC
        FIX_CO_CPU,     /* Cobalt timer */
-       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
+       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
        FIX_LI_PCIA,    /* Lithium PCI Bridge A */
        FIX_LI_PCIB,    /* Lithium PCI Bridge B */
 #endif
@@ -74,7 +74,7 @@ enum fixed_addresses {
 #endif
 #ifdef CONFIG_X86_CYCLONE_TIMER
        FIX_CYCLONE_TIMER, /*cyclone timer register*/
-#endif 
+#endif
 #ifdef CONFIG_HIGHMEM
        FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
        FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -90,11 +90,23 @@ enum fixed_addresses {
        FIX_PARAVIRT_BOOTMAP,
 #endif
        __end_of_permanent_fixed_addresses,
-       /* temporary boot-time mappings, used before ioremap() is functional */
-#define NR_FIX_BTMAPS  16
-       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
-       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+       /*
+        * 256 temporary boot-time mappings, used by early_ioremap(),
+        * before ioremap() is functional.
+        *
+        * We round it up to the next 512 pages boundary so that we
+        * can have a single pgd entry and a single pte table:
+        */
+#define NR_FIX_BTMAPS          64
+#define FIX_BTMAPS_NESTING     4
+       FIX_BTMAP_END =
+               __end_of_permanent_fixed_addresses + 512 -
+                       (__end_of_permanent_fixed_addresses & 511),
+       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1,
        FIX_WP_TEST,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       FIX_OHCI1394_BASE,
+#endif
        __end_of_fixed_addresses
 };
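(Illustrative note, not part of the commit.) The 512-page rounding in the new FIX_BTMAP_END definition is easier to see with numbers; a worked example with a hypothetical value:

/* Sketch only, assuming __end_of_permanent_fixed_addresses == 433 (hypothetical):
 *
 *   433 & 511                          = 433
 *   FIX_BTMAP_END   = 433 + 512 - 433  = 512   (next 512-slot boundary)
 *   FIX_BTMAP_BEGIN = 512 + 64*4 - 1   = 767
 *
 * so the 256 early_ioremap() slots sit in one naturally aligned block of
 * 512 ptes and can be covered by a single pgd entry / pte table. */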
 
index cdfbe4a6ae6f1573f460e255d6309861f4aba6f3..70ddb21e6458238da00c249ea8529a7678f41a96 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/apicdef.h>
 #include <asm/page.h>
 #include <asm/vsyscall.h>
+#include <asm/efi.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -41,6 +42,11 @@ enum fixed_addresses {
        FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
        FIX_IO_APIC_BASE_0,
        FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+       FIX_EFI_IO_MAP_LAST_PAGE,
+       FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       FIX_OHCI1394_BASE,
+#endif
        __end_of_fixed_addresses
 };
 
diff --git a/include/asm-x86/fpu32.h b/include/asm-x86/fpu32.h
deleted file mode 100644 (file)
index 4153db5..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _FPU32_H
-#define _FPU32_H 1
-
-struct _fpstate_ia32;
-
-int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave);
-int save_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, 
-                  struct pt_regs *regs, int fsave);
-
-#endif
index 1f4610e0c613031b003ddda1afe903ce2a12b658..62828d63f1b1d4bc6d79281f3095ca546d8db9ac 100644 (file)
@@ -1,5 +1,135 @@
-#ifdef CONFIG_X86_32
-# include "futex_32.h"
-#else
-# include "futex_64.h"
+#ifndef _ASM_X86_FUTEX_H
+#define _ASM_X86_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg)    \
+  __asm__ __volatile(                                          \
+"1:    " insn "\n"                                             \
+"2:    .section .fixup,\"ax\"\n                                \
+3:     mov     %3, %1\n                                        \
+       jmp     2b\n                                            \
+       .previous\n                                             \
+       .section __ex_table,\"a\"\n                             \
+       .align  8\n"                                            \
+       _ASM_PTR "1b,3b\n                                       \
+       .previous"                                              \
+       : "=r" (oldval), "=r" (ret), "+m" (*uaddr)              \
+       : "i" (-EFAULT), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg)    \
+  __asm__ __volatile(                                          \
+"1:    movl    %2, %0\n                                        \
+       movl    %0, %3\n"                                       \
+       insn "\n"                                               \
+"2:    " LOCK_PREFIX "cmpxchgl %3, %2\n                        \
+       jnz     1b\n                                            \
+3:     .section .fixup,\"ax\"\n                                \
+4:     mov     %5, %1\n                                        \
+       jmp     3b\n                                            \
+       .previous\n                                             \
+       .section __ex_table,\"a\"\n                             \
+       .align  8\n"                                            \
+       _ASM_PTR "1b,4b,2b,4b\n                                 \
+       .previous"                                              \
+       : "=&a" (oldval), "=&r" (ret), "+m" (*uaddr),           \
+         "=&r" (tem)                                           \
+       : "r" (oparg), "i" (-EFAULT), "1" (0))
+
+static inline int
+futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+{
+       int op = (encoded_op >> 28) & 7;
+       int cmp = (encoded_op >> 24) & 15;
+       int oparg = (encoded_op << 8) >> 20;
+       int cmparg = (encoded_op << 20) >> 20;
+       int oldval = 0, ret, tem;
+
+       if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+               oparg = 1 << oparg;
+
+       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+               return -EFAULT;
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
+       /* Real i386 machines can only support FUTEX_OP_SET */
+       if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
+               return -ENOSYS;
+#endif
+
+       pagefault_disable();
+
+       switch (op) {
+       case FUTEX_OP_SET:
+               __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+               break;
+       case FUTEX_OP_ADD:
+               __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+                                  uaddr, oparg);
+               break;
+       case FUTEX_OP_OR:
+               __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+               break;
+       case FUTEX_OP_ANDN:
+               __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+               break;
+       case FUTEX_OP_XOR:
+               __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+               break;
+       default:
+               ret = -ENOSYS;
+       }
+
+       pagefault_enable();
+
+       if (!ret) {
+               switch (cmp) {
+               case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+               case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+               case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+               case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+               case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+               case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+               default: ret = -ENOSYS;
+               }
+       }
+       return ret;
+}
+
+static inline int
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
+{
+       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+               return -EFAULT;
+
+       __asm__ __volatile__(
+               "1:     " LOCK_PREFIX "cmpxchgl %3, %1          \n"
+
+               "2:     .section .fixup, \"ax\"                 \n"
+               "3:     mov     %2, %0                          \n"
+               "       jmp     2b                              \n"
+               "       .previous                               \n"
+
+               "       .section __ex_table, \"a\"              \n"
+               "       .align  8                               \n"
+                       _ASM_PTR " 1b,3b                        \n"
+               "       .previous                               \n"
+
+               : "=a" (oldval), "+m" (*uaddr)
+               : "i" (-EFAULT), "r" (newval), "0" (oldval)
+               : "memory"
+       );
+
+       return oldval;
+}
+
+#endif
 #endif
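(Illustrative note, not part of the commit.) futex_atomic_op_inuser() above unpacks the encoded_op word that userspace builds with FUTEX_OP() from <linux/futex.h>; a short sketch of the layout it is decoding:

/* Sketch only: op in bits 31-28, cmp in 27-24, oparg in 23-12, cmparg in 11-0
 * (oparg/cmparg are extracted with shift pairs so the 12-bit fields sign-extend). */
int encoded = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);
/* meaning: atomically *uaddr += 1, then report whether the old value was > 0. */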
diff --git a/include/asm-x86/futex_32.h b/include/asm-x86/futex_32.h
deleted file mode 100644 (file)
index 438ef0e..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-#ifndef _ASM_FUTEX_H
-#define _ASM_FUTEX_H
-
-#ifdef __KERNEL__
-
-#include <linux/futex.h>
-#include <asm/errno.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-
-#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
-  __asm__ __volatile (                                         \
-"1:    " insn "\n"                                             \
-"2:    .section .fixup,\"ax\"\n\
-3:     mov     %3, %1\n\
-       jmp     2b\n\
-       .previous\n\
-       .section __ex_table,\"a\"\n\
-       .align  8\n\
-       .long   1b,3b\n\
-       .previous"                                              \
-       : "=r" (oldval), "=r" (ret), "+m" (*uaddr)              \
-       : "i" (-EFAULT), "0" (oparg), "1" (0))
-
-#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
-  __asm__ __volatile (                                         \
-"1:    movl    %2, %0\n\
-       movl    %0, %3\n"                                       \
-       insn "\n"                                               \
-"2:    " LOCK_PREFIX "cmpxchgl %3, %2\n\
-       jnz     1b\n\
-3:     .section .fixup,\"ax\"\n\
-4:     mov     %5, %1\n\
-       jmp     3b\n\
-       .previous\n\
-       .section __ex_table,\"a\"\n\
-       .align  8\n\
-       .long   1b,4b,2b,4b\n\
-       .previous"                                              \
-       : "=&a" (oldval), "=&r" (ret), "+m" (*uaddr),           \
-         "=&r" (tem)                                           \
-       : "r" (oparg), "i" (-EFAULT), "1" (0))
-
-static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
-{
-       int op = (encoded_op >> 28) & 7;
-       int cmp = (encoded_op >> 24) & 15;
-       int oparg = (encoded_op << 8) >> 20;
-       int cmparg = (encoded_op << 20) >> 20;
-       int oldval = 0, ret, tem;
-       if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-               oparg = 1 << oparg;
-
-       if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
-               return -EFAULT;
-
-       pagefault_disable();
-
-       if (op == FUTEX_OP_SET)
-               __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
-       else {
-#ifndef CONFIG_X86_BSWAP
-               if (boot_cpu_data.x86 == 3)
-                       ret = -ENOSYS;
-               else
-#endif
-               switch (op) {
-               case FUTEX_OP_ADD:
-                       __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
-                                          oldval, uaddr, oparg);
-                       break;
-               case FUTEX_OP_OR:
-                       __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr,
-                                          oparg);
-                       break;
-               case FUTEX_OP_ANDN:
-                       __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr,
-                                          ~oparg);
-                       break;
-               case FUTEX_OP_XOR:
-                       __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr,
-                                          oparg);
-                       break;
-               default:
-                       ret = -ENOSYS;
-               }
-       }
-
-       pagefault_enable();
-
-       if (!ret) {
-               switch (cmp) {
-               case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-               case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-               case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-               case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-               case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-               case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-               default: ret = -ENOSYS;
-               }
-       }
-       return ret;
-}
-
-static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
-{
-       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-               return -EFAULT;
-
-       __asm__ __volatile__(
-               "1:     " LOCK_PREFIX "cmpxchgl %3, %1          \n"
-
-               "2:     .section .fixup, \"ax\"                 \n"
-               "3:     mov     %2, %0                          \n"
-               "       jmp     2b                              \n"
-               "       .previous                               \n"
-
-               "       .section __ex_table, \"a\"              \n"
-               "       .align  8                               \n"
-               "       .long   1b,3b                           \n"
-               "       .previous                               \n"
-
-               : "=a" (oldval), "+m" (*uaddr)
-               : "i" (-EFAULT), "r" (newval), "0" (oldval)
-               : "memory"
-       );
-
-       return oldval;
-}
-
-#endif
-#endif
diff --git a/include/asm-x86/futex_64.h b/include/asm-x86/futex_64.h
deleted file mode 100644 (file)
index 5cdfb08..0000000
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef _ASM_FUTEX_H
-#define _ASM_FUTEX_H
-
-#ifdef __KERNEL__
-
-#include <linux/futex.h>
-#include <asm/errno.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
-  __asm__ __volatile (                                         \
-"1:    " insn "\n"                                             \
-"2:    .section .fixup,\"ax\"\n\
-3:     mov     %3, %1\n\
-       jmp     2b\n\
-       .previous\n\
-       .section __ex_table,\"a\"\n\
-       .align  8\n\
-       .quad   1b,3b\n\
-       .previous"                                              \
-       : "=r" (oldval), "=r" (ret), "=m" (*uaddr)              \
-       : "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
-
-#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
-  __asm__ __volatile (                                         \
-"1:    movl    %2, %0\n\
-       movl    %0, %3\n"                                       \
-       insn "\n"                                               \
-"2:    " LOCK_PREFIX "cmpxchgl %3, %2\n\
-       jnz     1b\n\
-3:     .section .fixup,\"ax\"\n\
-4:     mov     %5, %1\n\
-       jmp     3b\n\
-       .previous\n\
-       .section __ex_table,\"a\"\n\
-       .align  8\n\
-       .quad   1b,4b,2b,4b\n\
-       .previous"                                              \
-       : "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),           \
-         "=&r" (tem)                                           \
-       : "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
-
-static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
-{
-       int op = (encoded_op >> 28) & 7;
-       int cmp = (encoded_op >> 24) & 15;
-       int oparg = (encoded_op << 8) >> 20;
-       int cmparg = (encoded_op << 20) >> 20;
-       int oldval = 0, ret, tem;
-       if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-               oparg = 1 << oparg;
-
-       if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
-               return -EFAULT;
-
-       pagefault_disable();
-
-       switch (op) {
-       case FUTEX_OP_SET:
-               __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
-               break;
-       case FUTEX_OP_ADD:
-               __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
-                                  uaddr, oparg);
-               break;
-       case FUTEX_OP_OR:
-               __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
-               break;
-       case FUTEX_OP_ANDN:
-               __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
-               break;
-       case FUTEX_OP_XOR:
-               __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
-               break;
-       default:
-               ret = -ENOSYS;
-       }
-
-       pagefault_enable();
-
-       if (!ret) {
-               switch (cmp) {
-               case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-               case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-               case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-               case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-               case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-               case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-               default: ret = -ENOSYS;
-               }
-       }
-       return ret;
-}
-
-static inline int
-futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
-{
-       if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-               return -EFAULT;
-
-       __asm__ __volatile__(
-               "1:     " LOCK_PREFIX "cmpxchgl %3, %1          \n"
-
-               "2:     .section .fixup, \"ax\"                 \n"
-               "3:     mov     %2, %0                          \n"
-               "       jmp     2b                              \n"
-               "       .previous                               \n"
-
-               "       .section __ex_table, \"a\"              \n"
-               "       .align  8                               \n"
-               "       .quad   1b,3b                           \n"
-               "       .previous                               \n"
-
-               : "=a" (oldval), "=m" (*uaddr)
-               : "i" (-EFAULT), "r" (newval), "0" (oldval)
-               : "memory"
-       );
-
-       return oldval;
-}
-
-#endif
-#endif
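
A small worked example (illustrative only, not part of the commit) of the packed encoded_op argument that the futex_atomic_op_inuser() helpers above decode; the FUTEX_OP() macro mirrors the one in <linux/futex.h>, and the decode statements mirror the helper verbatim.

#include <stdio.h>

/* Same packing as FUTEX_OP() in <linux/futex.h>. */
#define FUTEX_OP(op, oparg, cmp, cmparg) \
	((((op) & 0xf) << 28) | (((cmp) & 0xf) << 24) | \
	 (((oparg) & 0xfff) << 12) | ((cmparg) & 0xfff))

int main(void)
{
	/* FUTEX_OP_ADD = 1, FUTEX_OP_CMP_NE = 1 */
	int encoded_op = FUTEX_OP(1, 5, 1, 0);

	int op     = (encoded_op >> 28) & 7;	/* arithmetic op:  ADD    */
	int cmp    = (encoded_op >> 24) & 15;	/* comparison op:  CMP_NE */
	int oparg  = (encoded_op << 8) >> 20;	/* signed 12 bits: 5      */
	int cmparg = (encoded_op << 20) >> 20;	/* signed 12 bits: 0      */

	/* prints: op=1 cmp=1 oparg=5 cmparg=0 */
	printf("op=%d cmp=%d oparg=%d cmparg=%d\n", op, cmp, oparg, cmparg);
	return 0;
}
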
index f704c50519b899c07e49399131ca39f0e60bc784..90958ed993faa3d140c495911bb77a1c57037ee9 100644 (file)
@@ -9,6 +9,7 @@ extern int iommu_detected;
 extern void gart_iommu_init(void);
 extern void gart_iommu_shutdown(void);
 extern void __init gart_parse_options(char *);
+extern void early_gart_iommu_check(void);
 extern void gart_iommu_hole_init(void);
 extern int fallback_aper_order;
 extern int fallback_aper_force;
@@ -20,6 +21,10 @@ extern int fix_aperture;
 #define gart_iommu_aperture 0
 #define gart_iommu_aperture_allowed 0
 
+static inline void early_gart_iommu_check(void)
+{
+}
+
 static inline void gart_iommu_shutdown(void)
 {
 }
index 771af336734fc73aab39d03a8414f54cfadaad71..811fe14f70b26f1e835657ce0b9a95f3a065b103 100644 (file)
@@ -121,9 +121,15 @@ extern int geode_get_dev_base(unsigned int dev);
 #define GPIO_MAP_Z             0xE8
 #define GPIO_MAP_W             0xEC
 
-extern void geode_gpio_set(unsigned int, unsigned int);
-extern void geode_gpio_clear(unsigned int, unsigned int);
-extern int geode_gpio_isset(unsigned int, unsigned int);
+static inline u32 geode_gpio(unsigned int nr)
+{
+       BUG_ON(nr > 28);
+       return 1 << nr;
+}
+
+extern void geode_gpio_set(u32, unsigned int);
+extern void geode_gpio_clear(u32, unsigned int);
+extern int geode_gpio_isset(u32, unsigned int);
 extern void geode_gpio_setup_event(unsigned int, int, int);
 extern void geode_gpio_set_irq(unsigned int, unsigned int);
 
diff --git a/include/asm-x86/gpio.h b/include/asm-x86/gpio.h
new file mode 100644 (file)
index 0000000..ff87fca
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _ASM_I386_GPIO_H
+#define _ASM_I386_GPIO_H
+
+#include <gpio.h>
+
+#endif /* _ASM_I386_GPIO_H */
index ad8d6e758785ca283b33b802880945f4e8c1f4ee..6a9b4ac59bf717c321f6e10d8c74a9e2796e0c31 100644 (file)
@@ -69,6 +69,7 @@ extern void force_hpet_resume(void);
 
 #include <linux/interrupt.h>
 
+typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
 extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
 extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
 extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
@@ -77,13 +78,16 @@ extern int hpet_set_periodic_freq(unsigned long freq);
 extern int hpet_rtc_dropped_irq(void);
 extern int hpet_rtc_timer_init(void);
 extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+extern int hpet_register_irq_handler(rtc_irq_handler handler);
+extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
 
 #endif /* CONFIG_HPET_EMULATE_RTC */
 
-#else
+#else /* CONFIG_HPET_TIMER */
 
 static inline int hpet_enable(void) { return 0; }
 static inline unsigned long hpet_readl(unsigned long a) { return 0; }
+static inline int is_hpet_enabled(void) { return 0; }
 
-#endif /* CONFIG_HPET_TIMER */
+#endif
 #endif /* ASM_X86_HPET_H */
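
A hedged sketch of how a driver might use the new rtc_irq_handler registration interface declared above; the handler body and function names are hypothetical.

#include <linux/init.h>
#include <linux/interrupt.h>
#include <asm/hpet.h>

static irqreturn_t example_rtc_tick(int interrupt, void *cookie)
{
	/* handle one emulated RTC interrupt delivered via the HPET */
	return IRQ_HANDLED;
}

static int __init example_register(void)
{
	if (!is_hpet_enabled())
		return 0;
	return hpet_register_irq_handler(example_rtc_tick);
}

static void __exit example_unregister(void)
{
	hpet_unregister_irq_handler(example_rtc_tick);
}
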
index 0bedbdf5e9078af5d579f80832a057911067f6b8..6d65fbb6358b0d91692da95313fd09c5ee3c8690 100644 (file)
  * Interrupt entry/exit code at both C and assembly level
  */
 
-extern void (*interrupt[NR_IRQS])(void);
+extern void (*const interrupt[NR_IRQS])(void);
 
 #ifdef CONFIG_SMP
-fastcall void reschedule_interrupt(void);
-fastcall void invalidate_interrupt(void);
-fastcall void call_function_interrupt(void);
+void reschedule_interrupt(void);
+void invalidate_interrupt(void);
+void call_function_interrupt(void);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-fastcall void apic_timer_interrupt(void);
-fastcall void error_interrupt(void);
-fastcall void spurious_interrupt(void);
-fastcall void thermal_interrupt(void);
+void apic_timer_interrupt(void);
+void error_interrupt(void);
+void spurious_interrupt(void);
+void thermal_interrupt(void);
 #define platform_legacy_irq(irq)       ((irq) < 16)
 #endif
 
index a470d59da67832e0b8265b55f318705134435023..312a58d6dac65cc96e4b4e0fd519650143c6c58a 100644 (file)
@@ -135,11 +135,13 @@ extern void init_8259A(int aeoi);
 extern void send_IPI_self(int vector);
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
+extern void enable_IO_APIC(void);
 extern void disable_IO_APIC(void);
 extern void print_IO_APIC(void);
 extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
 extern void send_IPI(int dest, int vector);
 extern void setup_ioapic_dest(void);
+extern void native_init_IRQ(void);
 
 extern unsigned long io_apic_irqs;
 
index a8bbed3496648db50aa69491251a108a3646f10c..ba8105ca822b99840f6bcf91b23e9d7f0c34902f 100644 (file)
@@ -1,5 +1,360 @@
-#ifdef CONFIG_X86_32
-# include "i387_32.h"
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_I387_H
+#define _ASM_X86_I387_H
+
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/regset.h>
+#include <asm/processor.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/uaccess.h>
+
+extern void fpu_init(void);
+extern unsigned int mxcsr_feature_mask;
+extern void mxcsr_feature_mask_init(void);
+extern void init_fpu(struct task_struct *child);
+extern asmlinkage void math_state_restore(void);
+
+extern user_regset_active_fn fpregs_active, xfpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
+
+#ifdef CONFIG_IA32_EMULATION
+struct _fpstate_ia32;
+extern int save_i387_ia32(struct _fpstate_ia32 __user *buf);
+extern int restore_i387_ia32(struct _fpstate_ia32 __user *buf);
+#endif
+
+#ifdef CONFIG_X86_64
+
+/* Ignore delayed exceptions from user space */
+static inline void tolerant_fwait(void)
+{
+       asm volatile("1: fwait\n"
+                    "2:\n"
+                    "   .section __ex_table,\"a\"\n"
+                    "  .align 8\n"
+                    "  .quad 1b,2b\n"
+                    "  .previous\n");
+}
+
+static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
+{
+       int err;
+
+       asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    ".section __ex_table,\"a\"\n"
+                    "   .align 8\n"
+                    "   .quad  1b,3b\n"
+                    ".previous"
+                    : [err] "=r" (err)
+#if 0 /* See comment in __save_init_fpu() below. */
+                    : [fx] "r" (fx), "m" (*fx), "0" (0));
+#else
+                    : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
+#endif
+       if (unlikely(err))
+               init_fpu(current);
+       return err;
+}
+
+#define X87_FSW_ES (1 << 7)    /* Exception Summary */
+
+/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
+   is pending. Clear the x87 state here by setting it to fixed
+   values. The kernel data segment can be sometimes 0 and sometimes
+   new user value. Both should be ok.
+   Use the PDA as safe address because it should be already in L1. */
+static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
+{
+       if (unlikely(fx->swd & X87_FSW_ES))
+                asm volatile("fnclex");
+       alternative_input(ASM_NOP8 ASM_NOP2,
+                    "    emms\n"               /* clear stack tags */
+                    "    fildl %%gs:0",        /* load to clear state */
+                    X86_FEATURE_FXSAVE_LEAK);
+}
+
+static inline int save_i387_checking(struct i387_fxsave_struct __user *fx)
+{
+       int err;
+
+       asm volatile("1:  rex64/fxsave (%[fx])\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    ".section __ex_table,\"a\"\n"
+                    "   .align 8\n"
+                    "   .quad  1b,3b\n"
+                    ".previous"
+                    : [err] "=r" (err), "=m" (*fx)
+#if 0 /* See comment in __fxsave_clear() below. */
+                    : [fx] "r" (fx), "0" (0));
+#else
+                    : [fx] "cdaSDb" (fx), "0" (0));
+#endif
+       if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct)))
+               err = -EFAULT;
+       /* No need to clear here because the caller clears USED_MATH */
+       return err;
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+       /* Using "rex64; fxsave %0" is broken because, if the memory operand
+          uses any extended registers for addressing, a second REX prefix
+          will be generated (to the assembler, rex64 followed by semicolon
+          is a separate instruction), and hence the 64-bitness is lost. */
+#if 0
+       /* Using "fxsaveq %0" would be the ideal choice, but is only supported
+          starting with gas 2.16. */
+       __asm__ __volatile__("fxsaveq %0"
+                            : "=m" (tsk->thread.i387.fxsave));
+#elif 0
+       /* Using, as a workaround, the properly prefixed form below isn't
+          accepted by any binutils version so far released, complaining that
+          the same type of prefix is used twice if an extended register is
+          needed for addressing (fix submitted to mainline 2005-11-21). */
+       __asm__ __volatile__("rex64/fxsave %0"
+                            : "=m" (tsk->thread.i387.fxsave));
+#else
+       /* This, however, we can work around by forcing the compiler to select
+          an addressing mode that doesn't require extended registers. */
+       __asm__ __volatile__("rex64/fxsave %P2(%1)"
+                            : "=m" (tsk->thread.i387.fxsave)
+                            : "cdaSDb" (tsk),
+                               "i" (offsetof(__typeof__(*tsk),
+                                             thread.i387.fxsave)));
+#endif
+       clear_fpu_state(&tsk->thread.i387.fxsave);
+       task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+static inline int save_i387(struct _fpstate __user *buf)
+{
+       struct task_struct *tsk = current;
+       int err = 0;
+
+       BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
+                       sizeof(tsk->thread.i387.fxsave));
+
+       if ((unsigned long)buf % 16)
+               printk("save_i387: bad fpstate %p\n", buf);
+
+       if (!used_math())
+               return 0;
+       clear_used_math(); /* trigger finit */
+       if (task_thread_info(tsk)->status & TS_USEDFPU) {
+               err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
+               if (err) return err;
+               task_thread_info(tsk)->status &= ~TS_USEDFPU;
+               stts();
+       } else {
+               if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
+                                  sizeof(struct i387_fxsave_struct)))
+                       return -1;
+       }
+       return 1;
+}
+
+/*
+ * This restores directly out of user space. Exceptions are handled.
+ */
+static inline int restore_i387(struct _fpstate __user *buf)
+{
+       set_used_math();
+       if (!(task_thread_info(current)->status & TS_USEDFPU)) {
+               clts();
+               task_thread_info(current)->status |= TS_USEDFPU;
+       }
+       return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
+}
+
+#else  /* CONFIG_X86_32 */
+
+static inline void tolerant_fwait(void)
+{
+       asm volatile("fnclex ; fwait");
+}
+
+static inline void restore_fpu(struct task_struct *tsk)
+{
+       /*
+        * The "nop" is needed to make the instructions the same
+        * length.
+        */
+       alternative_input(
+               "nop ; frstor %1",
+               "fxrstor %1",
+               X86_FEATURE_FXSR,
+               "m" ((tsk)->thread.i387.fxsave));
+}
+
+/* We need a safe address that is cheap to find and that is already
+   in L1 during context switch. The best choices are unfortunately
+   different for UP and SMP */
+#ifdef CONFIG_SMP
+#define safe_address (__per_cpu_offset[0])
 #else
-# include "i387_64.h"
+#define safe_address (kstat_cpu(0).cpustat.user)
 #endif
+
+/*
+ * These must be called with preempt disabled
+ */
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+       /* Use more nops than strictly needed in case the compiler
+          varies code */
+       alternative_input(
+               "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
+               "fxsave %[fx]\n"
+               "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
+               X86_FEATURE_FXSR,
+               [fx] "m" (tsk->thread.i387.fxsave),
+               [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
+       /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
+          is pending.  Clear the x87 state here by setting it to fixed
+          values. safe_address is a random variable that should be in L1 */
+       alternative_input(
+               GENERIC_NOP8 GENERIC_NOP2,
+               "emms\n\t"              /* clear stack tags */
+               "fildl %[addr]",        /* set F?P to defined value */
+               X86_FEATURE_FXSAVE_LEAK,
+               [addr] "m" (safe_address));
+       task_thread_info(tsk)->status &= ~TS_USEDFPU;
+}
+
+/*
+ * Signal frame handlers...
+ */
+extern int save_i387(struct _fpstate __user *buf);
+extern int restore_i387(struct _fpstate __user *buf);
+
+#endif /* CONFIG_X86_64 */
+
+static inline void __unlazy_fpu(struct task_struct *tsk)
+{
+       if (task_thread_info(tsk)->status & TS_USEDFPU) {
+               __save_init_fpu(tsk);
+               stts();
+       } else
+               tsk->fpu_counter = 0;
+}
+
+static inline void __clear_fpu(struct task_struct *tsk)
+{
+       if (task_thread_info(tsk)->status & TS_USEDFPU) {
+               tolerant_fwait();
+               task_thread_info(tsk)->status &= ~TS_USEDFPU;
+               stts();
+       }
+}
+
+static inline void kernel_fpu_begin(void)
+{
+       struct thread_info *me = current_thread_info();
+       preempt_disable();
+       if (me->status & TS_USEDFPU)
+               __save_init_fpu(me->task);
+       else
+               clts();
+}
+
+static inline void kernel_fpu_end(void)
+{
+       stts();
+       preempt_enable();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+       __save_init_fpu(tsk);
+       stts();
+}
+
+#define unlazy_fpu     __unlazy_fpu
+#define clear_fpu      __clear_fpu
+
+#else  /* CONFIG_X86_32 */
+
+/*
+ * These disable preemption on their own and are safe
+ */
+static inline void save_init_fpu(struct task_struct *tsk)
+{
+       preempt_disable();
+       __save_init_fpu(tsk);
+       stts();
+       preempt_enable();
+}
+
+static inline void unlazy_fpu(struct task_struct *tsk)
+{
+       preempt_disable();
+       __unlazy_fpu(tsk);
+       preempt_enable();
+}
+
+static inline void clear_fpu(struct task_struct *tsk)
+{
+       preempt_disable();
+       __clear_fpu(tsk);
+       preempt_enable();
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * i387 state interaction
+ */
+static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
+{
+       if (cpu_has_fxsr) {
+               return tsk->thread.i387.fxsave.cwd;
+       } else {
+               return (unsigned short)tsk->thread.i387.fsave.cwd;
+       }
+}
+
+static inline unsigned short get_fpu_swd(struct task_struct *tsk)
+{
+       if (cpu_has_fxsr) {
+               return tsk->thread.i387.fxsave.swd;
+       } else {
+               return (unsigned short)tsk->thread.i387.fsave.swd;
+       }
+}
+
+static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
+{
+       if (cpu_has_xmm) {
+               return tsk->thread.i387.fxsave.mxcsr;
+       } else {
+               return MXCSR_DEFAULT;
+       }
+}
+
+#endif /* _ASM_X86_I387_H */
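
A minimal usage sketch (not part of the commit) of the kernel_fpu_begin()/kernel_fpu_end() pair in the unified header above: any in-kernel FPU/SSE use has to be bracketed this way so the lazily saved user state is preserved.

#include <asm/i387.h>

static void example_kernel_fpu_user(void)
{
	kernel_fpu_begin();	/* disables preemption and saves the task's FPU state if live */
	/* ... SSE/MMX instructions may be used here ... */
	kernel_fpu_end();	/* sets CR0.TS again and re-enables preemption */
}
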
diff --git a/include/asm-x86/i387_32.h b/include/asm-x86/i387_32.h
deleted file mode 100644 (file)
index cdd1e24..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * include/asm-i386/i387.h
- *
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-#ifndef __ASM_I386_I387_H
-#define __ASM_I386_I387_H
-
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-
-extern void mxcsr_feature_mask_init(void);
-extern void init_fpu(struct task_struct *);
-
-/*
- * FPU lazy state save handling...
- */
-
-/*
- * The "nop" is needed to make the instructions the same
- * length.
- */
-#define restore_fpu(tsk)                       \
-       alternative_input(                      \
-               "nop ; frstor %1",              \
-               "fxrstor %1",                   \
-               X86_FEATURE_FXSR,               \
-               "m" ((tsk)->thread.i387.fxsave))
-
-extern void kernel_fpu_begin(void);
-#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
-
-/* We need a safe address that is cheap to find and that is already
-   in L1 during context switch. The best choices are unfortunately
-   different for UP and SMP */
-#ifdef CONFIG_SMP
-#define safe_address (__per_cpu_offset[0])
-#else
-#define safe_address (kstat_cpu(0).cpustat.user)
-#endif
-
-/*
- * These must be called with preempt disabled
- */
-static inline void __save_init_fpu( struct task_struct *tsk )
-{
-       /* Use more nops than strictly needed in case the compiler
-          varies code */
-       alternative_input(
-               "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
-               "fxsave %[fx]\n"
-               "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
-               X86_FEATURE_FXSR,
-               [fx] "m" (tsk->thread.i387.fxsave),
-               [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
-       /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
-          is pending.  Clear the x87 state here by setting it to fixed
-          values. safe_address is a random variable that should be in L1 */
-       alternative_input(
-               GENERIC_NOP8 GENERIC_NOP2,
-               "emms\n\t"              /* clear stack tags */
-               "fildl %[addr]",        /* set F?P to defined value */
-               X86_FEATURE_FXSAVE_LEAK,
-               [addr] "m" (safe_address));
-       task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
-#define __unlazy_fpu( tsk ) do {                               \
-       if (task_thread_info(tsk)->status & TS_USEDFPU) {       \
-               __save_init_fpu(tsk);                           \
-               stts();                                         \
-       } else                                                  \
-               tsk->fpu_counter = 0;                           \
-} while (0)
-
-#define __clear_fpu( tsk )                                     \
-do {                                                           \
-       if (task_thread_info(tsk)->status & TS_USEDFPU) {       \
-               asm volatile("fnclex ; fwait");                 \
-               task_thread_info(tsk)->status &= ~TS_USEDFPU;   \
-               stts();                                         \
-       }                                                       \
-} while (0)
-
-
-/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu( struct task_struct *tsk )
-{
-       preempt_disable();
-       __save_init_fpu(tsk);
-       stts();
-       preempt_enable();
-}
-
-#define unlazy_fpu( tsk ) do { \
-       preempt_disable();      \
-       __unlazy_fpu(tsk);      \
-       preempt_enable();       \
-} while (0)
-
-#define clear_fpu( tsk ) do {  \
-       preempt_disable();      \
-       __clear_fpu( tsk );     \
-       preempt_enable();       \
-} while (0)
-
-/*
- * FPU state interaction...
- */
-extern unsigned short get_fpu_cwd( struct task_struct *tsk );
-extern unsigned short get_fpu_swd( struct task_struct *tsk );
-extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
-extern asmlinkage void math_state_restore(void);
-
-/*
- * Signal frame handlers...
- */
-extern int save_i387( struct _fpstate __user *buf );
-extern int restore_i387( struct _fpstate __user *buf );
-
-/*
- * ptrace request handers...
- */
-extern int get_fpregs( struct user_i387_struct __user *buf,
-                      struct task_struct *tsk );
-extern int set_fpregs( struct task_struct *tsk,
-                      struct user_i387_struct __user *buf );
-
-extern int get_fpxregs( struct user_fxsr_struct __user *buf,
-                       struct task_struct *tsk );
-extern int set_fpxregs( struct task_struct *tsk,
-                       struct user_fxsr_struct __user *buf );
-
-/*
- * FPU state for core dumps...
- */
-extern int dump_fpu( struct pt_regs *regs,
-                    struct user_i387_struct *fpu );
-
-#endif /* __ASM_I386_I387_H */
diff --git a/include/asm-x86/i387_64.h b/include/asm-x86/i387_64.h
deleted file mode 100644 (file)
index 3a4ffba..0000000
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * include/asm-x86_64/i387.h
- *
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef __ASM_X86_64_I387_H
-#define __ASM_X86_64_I387_H
-
-#include <linux/sched.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/thread_info.h>
-#include <asm/uaccess.h>
-
-extern void fpu_init(void);
-extern unsigned int mxcsr_feature_mask;
-extern void mxcsr_feature_mask_init(void);
-extern void init_fpu(struct task_struct *child);
-extern int save_i387(struct _fpstate __user *buf);
-extern asmlinkage void math_state_restore(void);
-
-/*
- * FPU lazy state save handling...
- */
-
-#define unlazy_fpu(tsk) do { \
-       if (task_thread_info(tsk)->status & TS_USEDFPU) \
-               save_init_fpu(tsk);                     \
-       else                                            \
-               tsk->fpu_counter = 0;                   \
-} while (0)
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
-       asm volatile("1: fwait\n"
-                    "2:\n"
-                    "   .section __ex_table,\"a\"\n"
-                    "  .align 8\n"
-                    "  .quad 1b,2b\n"
-                    "  .previous\n");
-}
-
-#define clear_fpu(tsk) do { \
-       if (task_thread_info(tsk)->status & TS_USEDFPU) {       \
-               tolerant_fwait();                               \
-               task_thread_info(tsk)->status &= ~TS_USEDFPU;   \
-               stts();                                         \
-       }                                                       \
-} while (0)
-
-/*
- * ptrace request handers...
- */
-extern int get_fpregs(struct user_i387_struct __user *buf,
-                     struct task_struct *tsk);
-extern int set_fpregs(struct task_struct *tsk,
-                     struct user_i387_struct __user *buf);
-
-/*
- * i387 state interaction
- */
-#define get_fpu_mxcsr(t) ((t)->thread.i387.fxsave.mxcsr)
-#define get_fpu_cwd(t) ((t)->thread.i387.fxsave.cwd)
-#define get_fpu_fxsr_twd(t) ((t)->thread.i387.fxsave.twd)
-#define get_fpu_swd(t) ((t)->thread.i387.fxsave.swd)
-#define set_fpu_cwd(t,val) ((t)->thread.i387.fxsave.cwd = (val))
-#define set_fpu_swd(t,val) ((t)->thread.i387.fxsave.swd = (val))
-#define set_fpu_fxsr_twd(t,val) ((t)->thread.i387.fxsave.twd = (val))
-
-#define X87_FSW_ES (1 << 7)    /* Exception Summary */
-
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
-   is pending. Clear the x87 state here by setting it to fixed
-   values. The kernel data segment can be sometimes 0 and sometimes
-   new user value. Both should be ok.
-   Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
-{
-       if (unlikely(fx->swd & X87_FSW_ES))
-                asm volatile("fnclex");
-       alternative_input(ASM_NOP8 ASM_NOP2,
-                    "    emms\n"               /* clear stack tags */
-                    "    fildl %%gs:0",        /* load to clear state */
-                    X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) 
-{ 
-       int err;
-
-       asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
-                    "2:\n"
-                    ".section .fixup,\"ax\"\n"
-                    "3:  movl $-1,%[err]\n"
-                    "    jmp  2b\n"
-                    ".previous\n"
-                    ".section __ex_table,\"a\"\n"
-                    "   .align 8\n"
-                    "   .quad  1b,3b\n"
-                    ".previous"
-                    : [err] "=r" (err)
-#if 0 /* See comment in __fxsave_clear() below. */
-                    : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
-                    : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
-       if (unlikely(err))
-               init_fpu(current);
-       return err;
-} 
-
-static inline int save_i387_checking(struct i387_fxsave_struct __user *fx) 
-{ 
-       int err;
-
-       asm volatile("1:  rex64/fxsave (%[fx])\n\t"
-                    "2:\n"
-                    ".section .fixup,\"ax\"\n"
-                    "3:  movl $-1,%[err]\n"
-                    "    jmp  2b\n"
-                    ".previous\n"
-                    ".section __ex_table,\"a\"\n"
-                    "   .align 8\n"
-                    "   .quad  1b,3b\n"
-                    ".previous"
-                    : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
-                    : [fx] "r" (fx), "0" (0));
-#else
-                    : [fx] "cdaSDb" (fx), "0" (0));
-#endif
-       if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct)))
-               err = -EFAULT;
-       /* No need to clear here because the caller clears USED_MATH */
-       return err;
-} 
-
-static inline void __fxsave_clear(struct task_struct *tsk)
-{
-       /* Using "rex64; fxsave %0" is broken because, if the memory operand
-          uses any extended registers for addressing, a second REX prefix
-          will be generated (to the assembler, rex64 followed by semicolon
-          is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
-       /* Using "fxsaveq %0" would be the ideal choice, but is only supported
-          starting with gas 2.16. */
-       __asm__ __volatile__("fxsaveq %0"
-                            : "=m" (tsk->thread.i387.fxsave));
-#elif 0
-       /* Using, as a workaround, the properly prefixed form below isn't
-          accepted by any binutils version so far released, complaining that
-          the same type of prefix is used twice if an extended register is
-          needed for addressing (fix submitted to mainline 2005-11-21). */
-       __asm__ __volatile__("rex64/fxsave %0"
-                            : "=m" (tsk->thread.i387.fxsave));
-#else
-       /* This, however, we can work around by forcing the compiler to select
-          an addressing mode that doesn't require extended registers. */
-       __asm__ __volatile__("rex64/fxsave %P2(%1)"
-                            : "=m" (tsk->thread.i387.fxsave)
-                            : "cdaSDb" (tsk),
-                               "i" (offsetof(__typeof__(*tsk),
-                                             thread.i387.fxsave)));
-#endif
-       clear_fpu_state(&tsk->thread.i387.fxsave);
-}
-
-static inline void kernel_fpu_begin(void)
-{
-       struct thread_info *me = current_thread_info();
-       preempt_disable();
-       if (me->status & TS_USEDFPU) {
-               __fxsave_clear(me->task);
-               me->status &= ~TS_USEDFPU;
-               return;
-       }
-       clts();
-}
-
-static inline void kernel_fpu_end(void)
-{
-       stts();
-       preempt_enable();
-}
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-       __fxsave_clear(tsk);
-       task_thread_info(tsk)->status &= ~TS_USEDFPU;
-       stts();
-}
-
-/* 
- * This restores directly out of user space. Exceptions are handled.
- */
-static inline int restore_i387(struct _fpstate __user *buf)
-{
-       set_used_math();
-       if (!(task_thread_info(current)->status & TS_USEDFPU)) {
-               clts();
-               task_thread_info(current)->status |= TS_USEDFPU;
-       }
-       return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
-}
-
-#endif /* __ASM_X86_64_I387_H */
index 747548ec5d1ddce5fb1574b77b47d119f84e0754..b51c0487fc41d6aec43c7db474e80abd0330d50d 100644 (file)
@@ -12,4 +12,7 @@ extern struct clock_event_device *global_clock_event;
 
 extern void setup_pit_timer(void);
 
+#define inb_pit                inb_p
+#define outb_pit       outb_p
+
 #endif /* __ASM_I8253_H__ */
index 29d8f9a6b3fcc60a85528ce03b2edd5990129e99..67c319e0efc79f4975e1983e8a9f75e31465e434 100644 (file)
@@ -3,10 +3,25 @@
 
 extern unsigned int cached_irq_mask;
 
-#define __byte(x,y)            (((unsigned char *) &(y))[x])
+#define __byte(x,y)            (((unsigned char *) &(y))[x])
 #define cached_master_mask     (__byte(0, cached_irq_mask))
 #define cached_slave_mask      (__byte(1, cached_irq_mask))
 
+/* i8259A PIC registers */
+#define PIC_MASTER_CMD         0x20
+#define PIC_MASTER_IMR         0x21
+#define PIC_MASTER_ISR         PIC_MASTER_CMD
+#define PIC_MASTER_POLL                PIC_MASTER_ISR
+#define PIC_MASTER_OCW3                PIC_MASTER_ISR
+#define PIC_SLAVE_CMD          0xa0
+#define PIC_SLAVE_IMR          0xa1
+
+/* i8259A PIC related value */
+#define PIC_CASCADE_IR         2
+#define MASTER_ICW4_DEFAULT    0x01
+#define SLAVE_ICW4_DEFAULT     0x01
+#define PIC_ICW4_AEOI          2
+
 extern spinlock_t i8259A_lock;
 
 extern void init_8259A(int auto_eoi);
@@ -14,4 +29,7 @@ extern void enable_8259A_irq(unsigned int irq);
 extern void disable_8259A_irq(unsigned int irq);
 extern unsigned int startup_8259A_irq(unsigned int irq);
 
+#define inb_pic                inb_p
+#define outb_pic       outb_p
+
 #endif /* __ASM_I8259_H__ */
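
An illustrative sketch (not from the commit) of the new PIC register constants and the inb_pic()/outb_pic() aliases above, masking IRQ line 3 on the master 8259A; real callers do this under i8259A_lock.

#include <asm/i8259.h>

static void example_mask_master_irq3(void)
{
	unsigned char imr;

	imr = inb_pic(PIC_MASTER_IMR);			/* read the interrupt mask register */
	outb_pic(imr | (1 << 3), PIC_MASTER_IMR);	/* a set bit masks that IRQ line */
}
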
index 0190b7c4e319cc1a153a23ba2030267ba8694b4e..aa9733206e29cdb6c46fba5a6d00d6db7ecceafe 100644 (file)
@@ -159,12 +159,6 @@ struct ustat32 {
 #define IA32_STACK_TOP IA32_PAGE_OFFSET
 
 #ifdef __KERNEL__
-struct user_desc;
-struct siginfo_t;
-int do_get_thread_area(struct thread_struct *t, struct user_desc __user *info);
-int do_set_thread_area(struct thread_struct *t, struct user_desc __user *info);
-int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs);
-
 struct linux_binprm;
 extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
                                unsigned long stack_top, int exec_stack);
index 5b52ce5073383606db7b14c08d90136975bf69cd..61cea9e7c5c1b499dc67395b3d61ea18d54fee3f 100644 (file)
@@ -5,7 +5,7 @@
  * This file contains the system call numbers of the ia32 port,
  * this is for the kernel only.
  * Only add syscalls here where some part of the kernel needs to know
- * the number. This should be otherwise in sync with asm-i386/unistd.h. -AK
+ * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
  */
 
 #define __NR_ia32_restart_syscall 0
index 42130adf9c7c36ce71a0e1d71da21b089974992a..c2552d8bebf7b78abadef71fdcc92836228f472b 100644 (file)
@@ -1,6 +1,4 @@
 /*
- *  linux/include/asm-i386/ide.h
- *
  *  Copyright (C) 1994-1996  Linus Torvalds & authors
  */
 
index 6bd47dcf2067ce70e8569da825fc55787420b9a1..d240e5b30a45d198acd26cf251c34930338d4e37 100644 (file)
@@ -6,7 +6,6 @@
 
 struct notifier_block;
 void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
 
 void enter_idle(void);
 void exit_idle(void);
index fe881cd1e6f435ce56d84c0009db072cd756dff4..586d7aa54cebcaa2f629c112bbc88a4d077e9c21 100644 (file)
@@ -100,8 +100,6 @@ static inline void * phys_to_virt(unsigned long address)
  */
 #define page_to_phys(page)    ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
 
-extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
-
 /**
  * ioremap     -   map bus memory into CPU space
  * @offset:    bus address of the memory
@@ -111,32 +109,39 @@ extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsign
  * make bus memory CPU accessible via the readb/readw/readl/writeb/
  * writew/writel functions and the other mmio helpers. The returned
  * address is not guaranteed to be usable directly as a virtual
- * address. 
+ * address.
  *
  * If the area you are trying to map is a PCI BAR you should have a
  * look at pci_iomap().
  */
+extern void __iomem *ioremap_nocache(unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_cache(unsigned long offset, unsigned long size);
 
-static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 {
-       return __ioremap(offset, size, 0);
+       return ioremap_nocache(offset, size);
 }
 
-extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
 extern void iounmap(volatile void __iomem *addr);
 
 /*
- * bt_ioremap() and bt_iounmap() are for temporary early boot-time
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
  * mappings, before the real ioremap() is functional.
  * A boot-time mapping is currently limited to at most 16 pages.
  */
-extern void *bt_ioremap(unsigned long offset, unsigned long size);
-extern void bt_iounmap(void *addr, unsigned long size);
+extern void early_ioremap_init(void);
+extern void early_ioremap_clear(void);
+extern void early_ioremap_reset(void);
+extern void *early_ioremap(unsigned long offset, unsigned long size);
+extern void early_iounmap(void *addr, unsigned long size);
 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_ioremap bt_ioremap
-#define dmi_iounmap bt_iounmap
+#define dmi_ioremap early_ioremap
+#define dmi_iounmap early_iounmap
 #define dmi_alloc alloc_bootmem
 
 /*
@@ -250,10 +255,10 @@ static inline void flush_write_buffers(void)
 
 #endif /* __KERNEL__ */
 
-static inline void native_io_delay(void)
-{
-       asm volatile("outb %%al,$0x80" : : : "memory");
-}
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
 
 #if defined(CONFIG_PARAVIRT)
 #include <asm/paravirt.h>
index a037b079433200b0633d845e54cae305d6941d53..f64a59cc396d5eaee13c7b0840df629f9c909733 100644 (file)
   *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
   */
 
-#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
+extern void native_io_delay(void);
 
-#ifdef REALLY_SLOW_IO
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
 #else
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
+
+static inline void slow_down_io(void)
+{
+       native_io_delay();
+#ifdef REALLY_SLOW_IO
+       native_io_delay();
+       native_io_delay();
+       native_io_delay();
+#endif
+}
 #endif
 
 /*
@@ -52,9 +64,15 @@ static inline void out##s(unsigned x value, unsigned short port) {
 #define __OUT2(s,s1,s2) \
 __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
 
+#ifndef REALLY_SLOW_IO
+#define REALLY_SLOW_IO
+#define UNSET_REALLY_SLOW_IO
+#endif
+
 #define __OUT(s,s1,x) \
 __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
-__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
+__OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
+               slow_down_io(); }
 
 #define __IN1(s) \
 static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
@@ -63,8 +81,13 @@ static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
 __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
 
 #define __IN(s,s1,i...) \
-__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
-__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
+__IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); return _v; } \
+__IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i);          \
+                               slow_down_io(); return _v; }
+
+#ifdef UNSET_REALLY_SLOW_IO
+#undef REALLY_SLOW_IO
+#endif
 
 #define __INS(s) \
 static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
@@ -127,13 +150,6 @@ static inline void * phys_to_virt(unsigned long address)
 
 #include <asm-generic/iomap.h>
 
-extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
-
-static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
-{
-       return __ioremap(offset, size, 0);
-}
-
 extern void *early_ioremap(unsigned long addr, unsigned long size);
 extern void early_iounmap(void *addr, unsigned long size);
 
@@ -142,8 +158,19 @@ extern void early_iounmap(void *addr, unsigned long size);
  * it's useful if some control registers are in such an area and write combining
  * or read caching is not desirable:
  */
-extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_nocache(unsigned long offset, unsigned long size);
+extern void __iomem *ioremap_cache(unsigned long offset, unsigned long size);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
+{
+       return ioremap_nocache(offset, size);
+}
+
 extern void iounmap(volatile void __iomem *addr);
+
 extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 /*
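
A rough, assumed expansion (for clarity only) of what the pausing _p port accessors amount to after the change above: the open-coded "outb %al,$0x80" delay is replaced by a call to slow_down_io(), which paravirt kernels can override and which repeats native_io_delay() when REALLY_SLOW_IO is defined.

#include <asm/io.h>

static inline void example_outb_p(unsigned char value, unsigned short port)
{
	asm volatile("outb %b0, %w1" : : "a" (value), "Nd" (port));
	slow_down_io();		/* one native_io_delay(), or four with REALLY_SLOW_IO */
}
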
index 88494966beeb520b21e6bd70d30292f628ee9c26..0f5b3fef0b0848ac9f77483bcb44051f15434a36 100644 (file)
@@ -1,5 +1,159 @@
+#ifndef __ASM_IO_APIC_H
+#define __ASM_IO_APIC_H
+
+#include <asm/types.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+
+/*
+ * Intel IO-APIC support for SMP and UP systems.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
+ */
+
+/*
+ * The structure of the IO-APIC:
+ */
+union IO_APIC_reg_00 {
+       u32     raw;
+       struct {
+               u32     __reserved_2    : 14,
+                       LTS             :  1,
+                       delivery_type   :  1,
+                       __reserved_1    :  8,
+                       ID              :  8;
+       } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_01 {
+       u32     raw;
+       struct {
+               u32     version         :  8,
+                       __reserved_2    :  7,
+                       PRQ             :  1,
+                       entries         :  8,
+                       __reserved_1    :  8;
+       } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_02 {
+       u32     raw;
+       struct {
+               u32     __reserved_2    : 24,
+                       arbitration     :  4,
+                       __reserved_1    :  4;
+       } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_03 {
+       u32     raw;
+       struct {
+               u32     boot_DT         :  1,
+                       __reserved_1    : 31;
+       } __attribute__ ((packed)) bits;
+};
+
+enum ioapic_irq_destination_types {
+       dest_Fixed = 0,
+       dest_LowestPrio = 1,
+       dest_SMI = 2,
+       dest__reserved_1 = 3,
+       dest_NMI = 4,
+       dest_INIT = 5,
+       dest__reserved_2 = 6,
+       dest_ExtINT = 7
+};
+
+struct IO_APIC_route_entry {
+       __u32   vector          :  8,
+               delivery_mode   :  3,   /* 000: FIXED
+                                        * 001: lowest prio
+                                        * 111: ExtINT
+                                        */
+               dest_mode       :  1,   /* 0: physical, 1: logical */
+               delivery_status :  1,
+               polarity        :  1,
+               irr             :  1,
+               trigger         :  1,   /* 0: edge, 1: level */
+               mask            :  1,   /* 0: enabled, 1: disabled */
+               __reserved_2    : 15;
+
 #ifdef CONFIG_X86_32
-# include "io_apic_32.h"
+       union {
+               struct {
+                       __u32   __reserved_1    : 24,
+                               physical_dest   :  4,
+                               __reserved_2    :  4;
+               } physical;
+
+               struct {
+                       __u32   __reserved_1    : 24,
+                               logical_dest    :  8;
+               } logical;
+       } dest;
 #else
-# include "io_apic_64.h"
+       __u32   __reserved_3    : 24,
+               dest            :  8;
+#endif
+
+} __attribute__ ((packed));
+
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * # of IO-APICs and # of IRQ routing registers
+ */
+extern int nr_ioapics;
+extern int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * MP-BIOS irq configuration table structures:
+ */
+
+/* I/O APIC entries */
+extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* # of MP IRQ source entries */
+extern int mp_irq_entries;
+
+/* MP IRQ source entries */
+extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* non-0 if default (table-less) MP configuration */
+extern int mpc_default_type;
+
+/* Older SiS APIC requires we rewrite the index register */
+extern int sis_apic_bug;
+
+/* 1 if "noapic" boot option passed */
+extern int skip_ioapic_setup;
+
+static inline void disable_ioapic_setup(void)
+{
+       skip_ioapic_setup = 1;
+}
+
+/*
+ * If we use the IO-APIC for IRQ routing, disable automatic
+ * assignment of PCI IRQ's.
+ */
+#define io_apic_assign_pci_irqs \
+       (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
+
+#ifdef CONFIG_ACPI
+extern int io_apic_get_unique_id(int ioapic, int apic_id);
+extern int io_apic_get_version(int ioapic);
+extern int io_apic_get_redir_entries(int ioapic);
+extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
+                                  int edge_level, int active_high_low);
+extern int timer_uses_ioapic_pin_0;
+#endif /* CONFIG_ACPI */
+
+extern int (*ioapic_renumber_irq)(int ioapic, int irq);
+extern void ioapic_init_mappings(void);
+
+#else  /* !CONFIG_X86_IO_APIC */
+#define io_apic_assign_pci_irqs 0
+#endif
+
 #endif
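
An illustrative sketch (not part of the commit) of filling in the now shared IO_APIC_route_entry above, for an enabled, edge-triggered, active-high interrupt with fixed delivery to physical APIC 0; the flat dest field used here is the x86-64 layout.

#include <linux/string.h>
#include <asm/io_apic.h>

static struct IO_APIC_route_entry example_route_entry(void)
{
	struct IO_APIC_route_entry entry;

	memset(&entry, 0, sizeof(entry));
	entry.vector        = 0x31;		/* hypothetical vector */
	entry.delivery_mode = dest_Fixed;	/* 000: fixed delivery */
	entry.dest_mode     = 0;		/* physical destination */
	entry.polarity      = 0;		/* active high */
	entry.trigger       = 0;		/* edge triggered */
	entry.mask          = 0;		/* 0: enabled */
	entry.dest          = 0;		/* target APIC ID (x86-64 field layout) */

	return entry;
}
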
diff --git a/include/asm-x86/io_apic_32.h b/include/asm-x86/io_apic_32.h
deleted file mode 100644 (file)
index 3f08788..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef __ASM_IO_APIC_H
-#define __ASM_IO_APIC_H
-
-#include <asm/types.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-
-/*
- * Intel IO-APIC support for SMP and UP systems.
- *
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
- */
-
-/*
- * The structure of the IO-APIC:
- */
-union IO_APIC_reg_00 {
-       u32     raw;
-       struct {
-               u32     __reserved_2    : 14,
-                       LTS             :  1,
-                       delivery_type   :  1,
-                       __reserved_1    :  8,
-                       ID              :  8;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_01 {
-       u32     raw;
-       struct {
-               u32     version         :  8,
-                       __reserved_2    :  7,
-                       PRQ             :  1,
-                       entries         :  8,
-                       __reserved_1    :  8;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_02 {
-       u32     raw;
-       struct {
-               u32     __reserved_2    : 24,
-                       arbitration     :  4,
-                       __reserved_1    :  4;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_03 {
-       u32     raw;
-       struct {
-               u32     boot_DT         :  1,
-                       __reserved_1    : 31;
-       } __attribute__ ((packed)) bits;
-};
-
-enum ioapic_irq_destination_types {
-       dest_Fixed = 0,
-       dest_LowestPrio = 1,
-       dest_SMI = 2,
-       dest__reserved_1 = 3,
-       dest_NMI = 4,
-       dest_INIT = 5,
-       dest__reserved_2 = 6,
-       dest_ExtINT = 7
-};
-
-struct IO_APIC_route_entry {
-       __u32   vector          :  8,
-               delivery_mode   :  3,   /* 000: FIXED
-                                        * 001: lowest prio
-                                        * 111: ExtINT
-                                        */
-               dest_mode       :  1,   /* 0: physical, 1: logical */
-               delivery_status :  1,
-               polarity        :  1,
-               irr             :  1,
-               trigger         :  1,   /* 0: edge, 1: level */
-               mask            :  1,   /* 0: enabled, 1: disabled */
-               __reserved_2    : 15;
-
-       union {         struct { __u32
-                                       __reserved_1    : 24,
-                                       physical_dest   :  4,
-                                       __reserved_2    :  4;
-                       } physical;
-
-                       struct { __u32
-                                       __reserved_1    : 24,
-                                       logical_dest    :  8;
-                       } logical;
-       } dest;
-
-} __attribute__ ((packed));
-
-#ifdef CONFIG_X86_IO_APIC
-
-/*
- * # of IO-APICs and # of IRQ routing registers
- */
-extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-
-/*
- * MP-BIOS irq configuration table structures:
- */
-
-/* I/O APIC entries */
-extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-extern int mp_irq_entries;
-
-/* MP IRQ source entries */
-extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
-/* Older SiS APIC requires we rewrite the index register */
-extern int sis_apic_bug;
-
-/* 1 if "noapic" boot option passed */
-extern int skip_ioapic_setup;
-
-static inline void disable_ioapic_setup(void)
-{
-       skip_ioapic_setup = 1;
-}
-
-static inline int ioapic_setup_disabled(void)
-{
-       return skip_ioapic_setup;
-}
-
-/*
- * If we use the IO-APIC for IRQ routing, disable automatic
- * assignment of PCI IRQ's.
- */
-#define io_apic_assign_pci_irqs (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
-
-#ifdef CONFIG_ACPI
-extern int io_apic_get_unique_id (int ioapic, int apic_id);
-extern int io_apic_get_version (int ioapic);
-extern int io_apic_get_redir_entries (int ioapic);
-extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low);
-extern int timer_uses_ioapic_pin_0;
-#endif /* CONFIG_ACPI */
-
-extern int (*ioapic_renumber_irq)(int ioapic, int irq);
-
-#else  /* !CONFIG_X86_IO_APIC */
-#define io_apic_assign_pci_irqs 0
-#endif
-
-#endif
diff --git a/include/asm-x86/io_apic_64.h b/include/asm-x86/io_apic_64.h
deleted file mode 100644 (file)
index e2c1367..0000000
+++ /dev/null
@@ -1,138 +0,0 @@
-#ifndef __ASM_IO_APIC_H
-#define __ASM_IO_APIC_H
-
-#include <asm/types.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-
-/*
- * Intel IO-APIC support for SMP and UP systems.
- *
- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
- */
-
-#define APIC_MISMATCH_DEBUG
-
-/*
- * The structure of the IO-APIC:
- */
-union IO_APIC_reg_00 {
-       u32     raw;
-       struct {
-               u32     __reserved_2    : 14,
-                       LTS             :  1,
-                       delivery_type   :  1,
-                       __reserved_1    :  8,
-                       ID              :  8;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_01 {
-       u32     raw;
-       struct {
-               u32     version         :  8,
-               __reserved_2    :  7,
-               PRQ             :  1,
-               entries         :  8,
-               __reserved_1    :  8;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_02 {
-       u32     raw;
-       struct {
-               u32     __reserved_2    : 24,
-               arbitration     :  4,
-               __reserved_1    :  4;
-       } __attribute__ ((packed)) bits;
-};
-
-union IO_APIC_reg_03 {
-       u32     raw;
-       struct {
-               u32     boot_DT         :  1,
-                       __reserved_1    : 31;
-       } __attribute__ ((packed)) bits;
-};
-
-/*
- * # of IO-APICs and # of IRQ routing registers
- */
-extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-
-enum ioapic_irq_destination_types {
-       dest_Fixed = 0,
-       dest_LowestPrio = 1,
-       dest_SMI = 2,
-       dest__reserved_1 = 3,
-       dest_NMI = 4,
-       dest_INIT = 5,
-       dest__reserved_2 = 6,
-       dest_ExtINT = 7
-};
-
-struct IO_APIC_route_entry {
-       __u32   vector          :  8,
-               delivery_mode   :  3,   /* 000: FIXED
-                                        * 001: lowest prio
-                                        * 111: ExtINT
-                                        */
-               dest_mode       :  1,   /* 0: physical, 1: logical */
-               delivery_status :  1,
-               polarity        :  1,
-               irr             :  1,
-               trigger         :  1,   /* 0: edge, 1: level */
-               mask            :  1,   /* 0: enabled, 1: disabled */
-               __reserved_2    : 15;
-
-       __u32   __reserved_3    : 24,
-               dest            :  8;
-} __attribute__ ((packed));
-
-/*
- * MP-BIOS irq configuration table structures:
- */
-
-/* I/O APIC entries */
-extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-extern int mp_irq_entries;
-
-/* MP IRQ source entries */
-extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
-/* 1 if "noapic" boot option passed */
-extern int skip_ioapic_setup;
-
-static inline void disable_ioapic_setup(void)
-{
-       skip_ioapic_setup = 1;
-}
-
-
-/*
- * If we use the IO-APIC for IRQ routing, disable automatic
- * assignment of PCI IRQ's.
- */
-#define io_apic_assign_pci_irqs (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
-
-#ifdef CONFIG_ACPI
-extern int io_apic_get_version (int ioapic);
-extern int io_apic_get_redir_entries (int ioapic);
-extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int, int);
-#endif
-
-extern int sis_apic_bug; /* dummy */ 
-
-void enable_NMI_through_LVT0 (void * dummy);
-
-extern spinlock_t i8259A_lock;
-
-extern int timer_over_8254;
-
-#endif
index 1b695ff52687f30eef250b5d065161b4c2be0c8f..92021c1ffa3ae8282f4a1dcfd685e31ba03c4f5c 100644 (file)
@@ -1,5 +1,245 @@
-#ifdef CONFIG_X86_32
-# include "irqflags_32.h"
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * Interrupt control:
+ */
+
+static inline unsigned long native_save_fl(void)
+{
+       unsigned long flags;
+
+       __asm__ __volatile__(
+               "# __raw_save_flags\n\t"
+               "pushf ; pop %0"
+               : "=g" (flags)
+               : /* no input */
+               : "memory"
+       );
+
+       return flags;
+}
+
+static inline void native_restore_fl(unsigned long flags)
+{
+       __asm__ __volatile__(
+               "push %0 ; popf"
+               : /* no output */
+               :"g" (flags)
+               :"memory", "cc"
+       );
+}
+
+static inline void native_irq_disable(void)
+{
+       asm volatile("cli": : :"memory");
+}
+
+static inline void native_irq_enable(void)
+{
+       asm volatile("sti": : :"memory");
+}
+
+static inline void native_safe_halt(void)
+{
+       asm volatile("sti; hlt": : :"memory");
+}
+
+static inline void native_halt(void)
+{
+       asm volatile("hlt": : :"memory");
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#ifndef __ASSEMBLY__
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+       return native_save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+       native_restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+       native_irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+       native_irq_enable();
+}
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+       native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline void halt(void)
+{
+       native_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+static inline unsigned long __raw_local_irq_save(void)
+{
+       unsigned long flags = __raw_local_save_flags();
+
+       raw_local_irq_disable();
+
+       return flags;
+}
+#else
+
+#define ENABLE_INTERRUPTS(x)   sti
+#define DISABLE_INTERRUPTS(x)  cli
+
+#ifdef CONFIG_X86_64
+#define INTERRUPT_RETURN       iretq
+#define ENABLE_INTERRUPTS_SYSCALL_RET                  \
+                       movq    %gs:pda_oldrsp, %rsp;   \
+                       swapgs;                         \
+                       sysretq;
+#else
+#define INTERRUPT_RETURN               iret
+#define ENABLE_INTERRUPTS_SYSCALL_RET  sti; sysexit
+#define GET_CR0_INTO_EAX               movl %cr0, %eax
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+               do { (flags) = __raw_local_save_flags(); } while (0)
+
+#define raw_local_irq_save(flags) \
+               do { (flags) = __raw_local_irq_save(); } while (0)
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+       return !(flags & X86_EFLAGS_IF);
+}
+
+static inline int raw_irqs_disabled(void)
+{
+       unsigned long flags = __raw_local_save_flags();
+
+       return raw_irqs_disabled_flags(flags);
+}
+
+/*
+ * makes the traced hardirq state match with the machine state
+ *
+ * should be a rarely used function, only in places where its
+ * otherwise impossible to know the irq state, like in traps.
+ */
+static inline void trace_hardirqs_fixup_flags(unsigned long flags)
+{
+       if (raw_irqs_disabled_flags(flags))
+               trace_hardirqs_off();
+       else
+               trace_hardirqs_on();
+}
+
+static inline void trace_hardirqs_fixup(void)
+{
+       unsigned long flags = __raw_local_save_flags();
+
+       trace_hardirqs_fixup_flags(flags);
+}
+
 #else
-# include "irqflags_64.h"
+
+#ifdef CONFIG_X86_64
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack).  So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK    swapgs
+#define ARCH_TRACE_IRQS_ON             call trace_hardirqs_on_thunk
+#define ARCH_TRACE_IRQS_OFF            call trace_hardirqs_off_thunk
+#define ARCH_LOCKDEP_SYS_EXIT          call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
+       TRACE_IRQS_ON; \
+       sti; \
+       SAVE_REST; \
+       LOCKDEP_SYS_EXIT; \
+       RESTORE_REST; \
+       cli; \
+       TRACE_IRQS_OFF;
+
+#else
+#define ARCH_TRACE_IRQS_ON                     \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call trace_hardirqs_on;                 \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF                    \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call trace_hardirqs_off;                \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT                  \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call lockdep_sys_exit;                  \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+#  define TRACE_IRQS_ON                ARCH_TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF       ARCH_TRACE_IRQS_OFF
+#else
+#  define TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#  define LOCKDEP_SYS_EXIT     ARCH_LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+#  define LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
 #endif
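For orientation, a minimal sketch of the pattern these primitives back, assuming native (non-paravirt) x86: kernel code normally goes through the generic local_irq_save()/local_irq_restore() wrappers, which expand to the raw_* helpers defined in this unified header.

static void example_critical_section(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* pushf/pop saves IF, then cli */
	/* ... touch state that must not race with local interrupts ... */
	raw_local_irq_restore(flags);	/* push/popf restores the saved IF */
}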
diff --git a/include/asm-x86/irqflags_32.h b/include/asm-x86/irqflags_32.h
deleted file mode 100644 (file)
index 4c77200..0000000
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * include/asm-i386/irqflags.h
- *
- * IRQ flags handling
- *
- * This file gets included from lowlevel asm headers too, to provide
- * wrapped versions of the local_irq_*() APIs, based on the
- * raw_local_irq_*() functions from the lowlevel headers.
- */
-#ifndef _ASM_IRQFLAGS_H
-#define _ASM_IRQFLAGS_H
-#include <asm/processor-flags.h>
-
-#ifndef __ASSEMBLY__
-static inline unsigned long native_save_fl(void)
-{
-       unsigned long f;
-       asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
-       return f;
-}
-
-static inline void native_restore_fl(unsigned long f)
-{
-       asm volatile("pushl %0 ; popfl": /* no output */
-                            :"g" (f)
-                            :"memory", "cc");
-}
-
-static inline void native_irq_disable(void)
-{
-       asm volatile("cli": : :"memory");
-}
-
-static inline void native_irq_enable(void)
-{
-       asm volatile("sti": : :"memory");
-}
-
-static inline void native_safe_halt(void)
-{
-       asm volatile("sti; hlt": : :"memory");
-}
-
-static inline void native_halt(void)
-{
-       asm volatile("hlt": : :"memory");
-}
-#endif /* __ASSEMBLY__ */
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#ifndef __ASSEMBLY__
-
-static inline unsigned long __raw_local_save_flags(void)
-{
-       return native_save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
-       native_restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
-       native_irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
-       native_irq_enable();
-}
-
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static inline void raw_safe_halt(void)
-{
-       native_safe_halt();
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static inline void halt(void)
-{
-       native_halt();
-}
-
-/*
- * For spinlocks, etc:
- */
-static inline unsigned long __raw_local_irq_save(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       raw_local_irq_disable();
-
-       return flags;
-}
-
-#else
-#define DISABLE_INTERRUPTS(clobbers)   cli
-#define ENABLE_INTERRUPTS(clobbers)    sti
-#define ENABLE_INTERRUPTS_SYSEXIT      sti; sysexit
-#define INTERRUPT_RETURN               iret
-#define GET_CR0_INTO_EAX               movl %cr0, %eax
-#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_PARAVIRT */
-
-#ifndef __ASSEMBLY__
-#define raw_local_save_flags(flags) \
-               do { (flags) = __raw_local_save_flags(); } while (0)
-
-#define raw_local_irq_save(flags) \
-               do { (flags) = __raw_local_irq_save(); } while (0)
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-       return !(flags & X86_EFLAGS_IF);
-}
-
-static inline int raw_irqs_disabled(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       return raw_irqs_disabled_flags(flags);
-}
-
-/*
- * makes the traced hardirq state match with the machine state
- *
- * should be a rarely used function, only in places where its
- * otherwise impossible to know the irq state, like in traps.
- */
-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
-{
-       if (raw_irqs_disabled_flags(flags))
-               trace_hardirqs_off();
-       else
-               trace_hardirqs_on();
-}
-
-static inline void trace_hardirqs_fixup(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       trace_hardirqs_fixup_flags(flags);
-}
-#endif /* __ASSEMBLY__ */
-
-/*
- * Do the CPU's IRQ-state tracing from assembly code. We call a
- * C function, so save all the C-clobbered registers:
- */
-#ifdef CONFIG_TRACE_IRQFLAGS
-
-# define TRACE_IRQS_ON                         \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_on;                 \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
-
-# define TRACE_IRQS_OFF                                \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_off;                \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
-
-#else
-# define TRACE_IRQS_ON
-# define TRACE_IRQS_OFF
-#endif
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define LOCKDEP_SYS_EXIT                      \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call lockdep_sys_exit;                  \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
-#else
-# define LOCKDEP_SYS_EXIT
-#endif
-
-#endif
diff --git a/include/asm-x86/irqflags_64.h b/include/asm-x86/irqflags_64.h
deleted file mode 100644 (file)
index bb9163b..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * include/asm-x86_64/irqflags.h
- *
- * IRQ flags handling
- *
- * This file gets included from lowlevel asm headers too, to provide
- * wrapped versions of the local_irq_*() APIs, based on the
- * raw_local_irq_*() functions from the lowlevel headers.
- */
-#ifndef _ASM_IRQFLAGS_H
-#define _ASM_IRQFLAGS_H
-#include <asm/processor-flags.h>
-
-#ifndef __ASSEMBLY__
-/*
- * Interrupt control:
- */
-
-static inline unsigned long __raw_local_save_flags(void)
-{
-       unsigned long flags;
-
-       __asm__ __volatile__(
-               "# __raw_save_flags\n\t"
-               "pushfq ; popq %q0"
-               : "=g" (flags)
-               : /* no input */
-               : "memory"
-       );
-
-       return flags;
-}
-
-#define raw_local_save_flags(flags) \
-               do { (flags) = __raw_local_save_flags(); } while (0)
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
-       __asm__ __volatile__(
-               "pushq %0 ; popfq"
-               : /* no output */
-               :"g" (flags)
-               :"memory", "cc"
-       );
-}
-
-#ifdef CONFIG_X86_VSMP
-
-/*
- * Interrupt control for the VSMP architecture:
- */
-
-static inline void raw_local_irq_disable(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
-}
-
-static inline void raw_local_irq_enable(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
-}
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-       return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
-}
-
-#else /* CONFIG_X86_VSMP */
-
-static inline void raw_local_irq_disable(void)
-{
-       __asm__ __volatile__("cli" : : : "memory");
-}
-
-static inline void raw_local_irq_enable(void)
-{
-       __asm__ __volatile__("sti" : : : "memory");
-}
-
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-       return !(flags & X86_EFLAGS_IF);
-}
-
-#endif
-
-/*
- * For spinlocks, etc.:
- */
-
-static inline unsigned long __raw_local_irq_save(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       raw_local_irq_disable();
-
-       return flags;
-}
-
-#define raw_local_irq_save(flags) \
-               do { (flags) = __raw_local_irq_save(); } while (0)
-
-static inline int raw_irqs_disabled(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       return raw_irqs_disabled_flags(flags);
-}
-
-/*
- * makes the traced hardirq state match with the machine state
- *
- * should be a rarely used function, only in places where its
- * otherwise impossible to know the irq state, like in traps.
- */
-static inline void trace_hardirqs_fixup_flags(unsigned long flags)
-{
-       if (raw_irqs_disabled_flags(flags))
-               trace_hardirqs_off();
-       else
-               trace_hardirqs_on();
-}
-
-static inline void trace_hardirqs_fixup(void)
-{
-       unsigned long flags = __raw_local_save_flags();
-
-       trace_hardirqs_fixup_flags(flags);
-}
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static inline void raw_safe_halt(void)
-{
-       __asm__ __volatile__("sti; hlt" : : : "memory");
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static inline void halt(void)
-{
-       __asm__ __volatile__("hlt": : :"memory");
-}
-
-#else /* __ASSEMBLY__: */
-# ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk
-#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk
-# else
-#  define TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF
-# endif
-# ifdef CONFIG_DEBUG_LOCK_ALLOC
-#  define LOCKDEP_SYS_EXIT     call lockdep_sys_exit_thunk
-#  define LOCKDEP_SYS_EXIT_IRQ \
-       TRACE_IRQS_ON; \
-       sti; \
-       SAVE_REST; \
-       LOCKDEP_SYS_EXIT; \
-       RESTORE_REST; \
-       cli; \
-       TRACE_IRQS_OFF;
-# else
-#  define LOCKDEP_SYS_EXIT
-#  define LOCKDEP_SYS_EXIT_IRQ
-# endif
-#endif
-
-#endif
index 699dd6961eda06ae0943d62916b1bedc1396bb1b..452e2b696ff48ef2cdf26175572dde1132a175cf 100644 (file)
@@ -10,5 +10,6 @@ extern struct pci_dev **k8_northbridges;
 extern int num_k8_northbridges;
 extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
+extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
 #endif
index e2f9b62e535efd6a3754bf39100e145c9f005a81..dd442a1632c00897800700bc6d01bc68c1f71c71 100644 (file)
@@ -22,12 +22,17 @@ enum die_val {
        DIE_PAGE_FAULT,
 };
 
-extern void printk_address(unsigned long address);
+extern void printk_address(unsigned long address, int reliable);
 extern void die(const char *,struct pt_regs *,long);
-extern void __die(const char *,struct pt_regs *,long);
+extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_registers(struct pt_regs *regs);
+extern void __show_registers(struct pt_regs *, int all);
+extern void show_trace(struct task_struct *t, struct pt_regs *regs,
+                       unsigned long *sp, unsigned long bp);
+extern void __show_regs(struct pt_regs *regs);
+extern void show_regs(struct pt_regs *regs);
 extern void dump_pagetable(unsigned long);
 extern unsigned long oops_begin(void);
-extern void oops_end(unsigned long);
+extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
 #endif
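A hedged sketch of how the widened printk_address() above is meant to be called: the added 'reliable' argument lets a stack walker distinguish frames it proved (for example via frame pointers) from addresses it merely scanned off the stack, which are conventionally printed with a leading '?'. The helper below is illustrative only.

static void example_report_frame(unsigned long addr, int proved_by_unwinder)
{
	/* the second argument is the new 'reliable' flag */
	printk_address(addr, proved_by_unwinder ? 1 : 0);
}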
index 718ddbfb95163f8d514709ffa980458c5d616892..c90d3c77afc26bc7f71f3d689bdcb0eb4f822319 100644 (file)
@@ -1,5 +1,170 @@
+#ifndef _KEXEC_H
+#define _KEXEC_H
+
 #ifdef CONFIG_X86_32
-# include "kexec_32.h"
+# define PA_CONTROL_PAGE       0
+# define VA_CONTROL_PAGE       1
+# define PA_PGD                        2
+# define VA_PGD                        3
+# define PA_PTE_0              4
+# define VA_PTE_0              5
+# define PA_PTE_1              6
+# define VA_PTE_1              7
+# ifdef CONFIG_X86_PAE
+#  define PA_PMD_0             8
+#  define VA_PMD_0             9
+#  define PA_PMD_1             10
+#  define VA_PMD_1             11
+#  define PAGES_NR             12
+# else
+#  define PAGES_NR             8
+# endif
 #else
-# include "kexec_64.h"
+# define PA_CONTROL_PAGE       0
+# define VA_CONTROL_PAGE       1
+# define PA_PGD                        2
+# define VA_PGD                        3
+# define PA_PUD_0              4
+# define VA_PUD_0              5
+# define PA_PMD_0              6
+# define VA_PMD_0              7
+# define PA_PTE_0              8
+# define VA_PTE_0              9
+# define PA_PUD_1              10
+# define VA_PUD_1              11
+# define PA_PMD_1              12
+# define VA_PMD_1              13
+# define PA_PTE_1              14
+# define VA_PTE_1              15
+# define PA_TABLE_PAGE         16
+# define PAGES_NR              17
 #endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+#ifdef CONFIG_X86_32
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+# define KEXEC_CONTROL_CODE_SIZE       4096
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_386
+
+/* We can also handle crash dumps from 64 bit kernel. */
+# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
+#else
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+# define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+# define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_X86_64
+#endif
+
+/*
+ * CPU does not save ss and sp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+                                     struct pt_regs *oldregs)
+{
+#ifdef CONFIG_X86_32
+       newregs->sp = (unsigned long)&(oldregs->sp);
+       __asm__ __volatile__(
+                       "xorl %%eax, %%eax\n\t"
+                       "movw %%ss, %%ax\n\t"
+                       :"=a"(newregs->ss));
+#endif
+}
+
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and sp if coming via kernel
+ * mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+                                   struct pt_regs *oldregs)
+{
+       if (oldregs) {
+               memcpy(newregs, oldregs, sizeof(*newregs));
+               crash_fixup_ss_esp(newregs, oldregs);
+       } else {
+#ifdef CONFIG_X86_32
+               __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->bx));
+               __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->cx));
+               __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->dx));
+               __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->si));
+               __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->di));
+               __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->bp));
+               __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->ax));
+               __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->sp));
+               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+               __asm__ __volatile__("movl %%ds, %%eax;" :"=a"(newregs->ds));
+               __asm__ __volatile__("movl %%es, %%eax;" :"=a"(newregs->es));
+               __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->flags));
+#else
+               __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->bx));
+               __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->cx));
+               __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->dx));
+               __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->si));
+               __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->di));
+               __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->bp));
+               __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->ax));
+               __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->sp));
+               __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+               __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+               __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+               __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+               __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+               __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+               __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+               __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+               __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->flags));
+#endif
+               newregs->ip = (unsigned long)current_text_addr();
+       }
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+               unsigned long control_page,
+               unsigned long start_address,
+               unsigned int has_pae) ATTRIB_NORET;
+#else
+NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+               unsigned long page_list,
+               unsigned long start_address) ATTRIB_NORET;
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _KEXEC_H */
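To place crash_setup_regs() in context, a rough sketch of the generic crash path that consumes it (modelled on kernel/kexec.c of this era; locking and error handling omitted):

void example_crash_kexec(struct pt_regs *regs)
{
	struct pt_regs fixed_regs;

	/* capture (or fix up) register state on the panicking CPU */
	crash_setup_regs(&fixed_regs, regs);
	machine_crash_shutdown(&fixed_regs);	/* quiesce the other CPUs */
	machine_kexec(kexec_crash_image);	/* jump into the crash kernel */
}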
diff --git a/include/asm-x86/kexec_32.h b/include/asm-x86/kexec_32.h
deleted file mode 100644 (file)
index 4b9dc9e..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef _I386_KEXEC_H
-#define _I386_KEXEC_H
-
-#define PA_CONTROL_PAGE  0
-#define VA_CONTROL_PAGE  1
-#define PA_PGD           2
-#define VA_PGD           3
-#define PA_PTE_0         4
-#define VA_PTE_0         5
-#define PA_PTE_1         6
-#define VA_PTE_1         7
-#ifdef CONFIG_X86_PAE
-#define PA_PMD_0         8
-#define VA_PMD_0         9
-#define PA_PMD_1         10
-#define VA_PMD_1         11
-#define PAGES_NR         12
-#else
-#define PAGES_NR         8
-#endif
-
-#ifndef __ASSEMBLY__
-
-#include <asm/ptrace.h>
-#include <asm/string.h>
-
-/*
- * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
- * I.e. Maximum page that is mapped directly into kernel memory,
- * and kmap is not required.
- */
-
-/* Maximum physical address we can use pages from */
-#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
-/* Maximum address we can reach in physical address mode */
-#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
-/* Maximum address we can use for the control code buffer */
-#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-
-#define KEXEC_CONTROL_CODE_SIZE        4096
-
-/* The native architecture */
-#define KEXEC_ARCH KEXEC_ARCH_386
-
-/* We can also handle crash dumps from 64 bit kernel. */
-#define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
-
-/* CPU does not save ss and esp on stack if execution is already
- * running in kernel mode at the time of NMI occurrence. This code
- * fixes it.
- */
-static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
-                                       struct pt_regs *oldregs)
-{
-       memcpy(newregs, oldregs, sizeof(*newregs));
-       newregs->esp = (unsigned long)&(oldregs->esp);
-       __asm__ __volatile__(
-                       "xorl %%eax, %%eax\n\t"
-                       "movw %%ss, %%ax\n\t"
-                       :"=a"(newregs->xss));
-}
-
-/*
- * This function is responsible for capturing register states if coming
- * via panic otherwise just fix up the ss and esp if coming via kernel
- * mode exception.
- */
-static inline void crash_setup_regs(struct pt_regs *newregs,
-                                       struct pt_regs *oldregs)
-{
-       if (oldregs)
-               crash_fixup_ss_esp(newregs, oldregs);
-       else {
-               __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
-               __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
-               __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
-               __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
-               __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
-               __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
-               __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
-               __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
-               __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss));
-               __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs));
-               __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds));
-               __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes));
-               __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
-
-               newregs->eip = (unsigned long)current_text_addr();
-       }
-}
-asmlinkage NORET_TYPE void
-relocate_kernel(unsigned long indirection_page,
-               unsigned long control_page,
-               unsigned long start_address,
-               unsigned int has_pae) ATTRIB_NORET;
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _I386_KEXEC_H */
diff --git a/include/asm-x86/kexec_64.h b/include/asm-x86/kexec_64.h
deleted file mode 100644 (file)
index 738e581..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef _X86_64_KEXEC_H
-#define _X86_64_KEXEC_H
-
-#define PA_CONTROL_PAGE  0
-#define VA_CONTROL_PAGE  1
-#define PA_PGD           2
-#define VA_PGD           3
-#define PA_PUD_0         4
-#define VA_PUD_0         5
-#define PA_PMD_0         6
-#define VA_PMD_0         7
-#define PA_PTE_0         8
-#define VA_PTE_0         9
-#define PA_PUD_1         10
-#define VA_PUD_1         11
-#define PA_PMD_1         12
-#define VA_PMD_1         13
-#define PA_PTE_1         14
-#define VA_PTE_1         15
-#define PA_TABLE_PAGE    16
-#define PAGES_NR         17
-
-#ifndef __ASSEMBLY__
-
-#include <linux/string.h>
-
-#include <asm/page.h>
-#include <asm/ptrace.h>
-
-/*
- * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
- * I.e. Maximum page that is mapped directly into kernel memory,
- * and kmap is not required.
- *
- * So far x86_64 is limited to 40 physical address bits.
- */
-
-/* Maximum physical address we can use pages from */
-#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
-/* Maximum address we can reach in physical address mode */
-#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
-/* Maximum address we can use for the control pages */
-#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
-
-/* Allocate one page for the pdp and the second for the code */
-#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
-
-/* The native architecture */
-#define KEXEC_ARCH KEXEC_ARCH_X86_64
-
-/*
- * Saving the registers of the cpu on which panic occured in
- * crash_kexec to save a valid sp. The registers of other cpus
- * will be saved in machine_crash_shutdown while shooting down them.
- */
-
-static inline void crash_setup_regs(struct pt_regs *newregs,
-                                               struct pt_regs *oldregs)
-{
-       if (oldregs)
-               memcpy(newregs, oldregs, sizeof(*newregs));
-       else {
-               __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
-               __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
-               __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
-               __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
-               __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
-               __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
-               __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
-               __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
-               __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
-               __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
-               __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
-               __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
-               __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
-               __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
-               __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
-               __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
-               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
-               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
-               __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
-
-               newregs->rip = (unsigned long)current_text_addr();
-       }
-}
-
-NORET_TYPE void
-relocate_kernel(unsigned long indirection_page,
-               unsigned long page_list,
-               unsigned long start_address) ATTRIB_NORET;
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _X86_64_KEXEC_H */
index b7bbd25ba2a674ad0b65a1c13d51cc2a59cda494..143476a3cb52c8328dc653c5b60b535ea5248f30 100644 (file)
@@ -1,5 +1,98 @@
-#ifdef CONFIG_X86_32
-# include "kprobes_32.h"
-#else
-# include "kprobes_64.h"
-#endif
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ *  Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * See arch/x86/kernel/kprobes.c for x86 kprobes history.
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#define  __ARCH_WANT_KPROBES_INSN_SLOT
+
+struct pt_regs;
+struct kprobe;
+
+typedef u8 kprobe_opcode_t;
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define RELATIVEJUMP_INSTRUCTION 0xe9
+#define MAX_INSN_SIZE 16
+#define MAX_STACK_SIZE 64
+#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
+       (((unsigned long)current_thread_info()) + THREAD_SIZE \
+        - (unsigned long)(ADDR))) \
+       ? (MAX_STACK_SIZE) \
+       : (((unsigned long)current_thread_info()) + THREAD_SIZE \
+          - (unsigned long)(ADDR)))
+
+#define ARCH_SUPPORTS_KRETPROBES
+#define flush_insn_slot(p)     do { } while (0)
+
+extern const int kretprobe_blacklist_size;
+
+void arch_remove_kprobe(struct kprobe *p);
+void kretprobe_trampoline(void);
+
+/* Architecture specific copy of original instruction*/
+struct arch_specific_insn {
+       /* copy of the original instruction */
+       kprobe_opcode_t *insn;
+       /*
+        * boostable = -1: This instruction type is not boostable.
+        * boostable = 0: This instruction type is boostable.
+        * boostable = 1: This instruction has been boosted: we have
+        * added a relative jump after the instruction copy in insn,
+        * so no single-step and fixup are needed (unless there's
+        * a post_handler or break_handler).
+        */
+       int boostable;
+};
+
+struct prev_kprobe {
+       struct kprobe *kp;
+       unsigned long status;
+       unsigned long old_flags;
+       unsigned long saved_flags;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+       unsigned long kprobe_status;
+       unsigned long kprobe_old_flags;
+       unsigned long kprobe_saved_flags;
+       unsigned long *jprobe_saved_sp;
+       struct pt_regs jprobe_saved_regs;
+       kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+       struct prev_kprobe prev_kprobe;
+};
+
+/* trap3/1 are intr gates for kprobes.  So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+       if (regs->flags & X86_EFLAGS_IF)
+               local_irq_enable();
+}
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+                                   unsigned long val, void *data);
+#endif                         /* _ASM_KPROBES_H */
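As a usage aside (this is the generic kprobes API, not something the header unification above introduces), a minimal probe module built on these definitions might look like the sketch below; the probed symbol and message are purely illustrative.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name = "do_fork",	/* illustrative target */
	.pre_handler = example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");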
diff --git a/include/asm-x86/kprobes_32.h b/include/asm-x86/kprobes_32.h
deleted file mode 100644 (file)
index 9fe8f3b..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef _ASM_KPROBES_H
-#define _ASM_KPROBES_H
-/*
- *  Kernel Probes (KProbes)
- *  include/asm-i386/kprobes.h
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- *             Probes initial implementation ( includes suggestions from
- *             Rusty Russell).
- */
-#include <linux/types.h>
-#include <linux/ptrace.h>
-
-#define  __ARCH_WANT_KPROBES_INSN_SLOT
-
-struct kprobe;
-struct pt_regs;
-
-typedef u8 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xcc
-#define RELATIVEJUMP_INSTRUCTION 0xe9
-#define MAX_INSN_SIZE 16
-#define MAX_STACK_SIZE 64
-#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
-       (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
-       ? (MAX_STACK_SIZE) \
-       : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
-
-#define ARCH_SUPPORTS_KRETPROBES
-#define flush_insn_slot(p)     do { } while (0)
-
-extern const int kretprobe_blacklist_size;
-
-void arch_remove_kprobe(struct kprobe *p);
-void kretprobe_trampoline(void);
-
-/* Architecture specific copy of original instruction*/
-struct arch_specific_insn {
-       /* copy of the original instruction */
-       kprobe_opcode_t *insn;
-       /*
-        * If this flag is not 0, this kprobe can be boost when its
-        * post_handler and break_handler is not set.
-        */
-       int boostable;
-};
-
-struct prev_kprobe {
-       struct kprobe *kp;
-       unsigned long status;
-       unsigned long old_eflags;
-       unsigned long saved_eflags;
-};
-
-/* per-cpu kprobe control block */
-struct kprobe_ctlblk {
-       unsigned long kprobe_status;
-       unsigned long kprobe_old_eflags;
-       unsigned long kprobe_saved_eflags;
-       unsigned long *jprobe_saved_esp;
-       struct pt_regs jprobe_saved_regs;
-       kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
-       struct prev_kprobe prev_kprobe;
-};
-
-/* trap3/1 are intr gates for kprobes.  So, restore the status of IF,
- * if necessary, before executing the original int3/1 (trap) handler.
- */
-static inline void restore_interrupts(struct pt_regs *regs)
-{
-       if (regs->eflags & IF_MASK)
-               local_irq_enable();
-}
-
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-                                   unsigned long val, void *data);
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-#endif                         /* _ASM_KPROBES_H */
diff --git a/include/asm-x86/kprobes_64.h b/include/asm-x86/kprobes_64.h
deleted file mode 100644 (file)
index 743d762..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef _ASM_KPROBES_H
-#define _ASM_KPROBES_H
-/*
- *  Kernel Probes (KProbes)
- *  include/asm-x86_64/kprobes.h
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2004-Oct    Prasanna S Panchamukhi <prasanna@in.ibm.com> and Jim Keniston
- *             kenistoj@us.ibm.com adopted from i386.
- */
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/percpu.h>
-
-#define  __ARCH_WANT_KPROBES_INSN_SLOT
-
-struct pt_regs;
-struct kprobe;
-
-typedef u8 kprobe_opcode_t;
-#define BREAKPOINT_INSTRUCTION 0xcc
-#define MAX_INSN_SIZE 15
-#define MAX_STACK_SIZE 64
-#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
-       (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
-       ? (MAX_STACK_SIZE) \
-       : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
-
-#define ARCH_SUPPORTS_KRETPROBES
-extern const int kretprobe_blacklist_size;
-
-void kretprobe_trampoline(void);
-extern void arch_remove_kprobe(struct kprobe *p);
-#define flush_insn_slot(p)     do { } while (0)
-
-/* Architecture specific copy of original instruction*/
-struct arch_specific_insn {
-       /* copy of the original instruction */
-       kprobe_opcode_t *insn;
-};
-
-struct prev_kprobe {
-       struct kprobe *kp;
-       unsigned long status;
-       unsigned long old_rflags;
-       unsigned long saved_rflags;
-};
-
-/* per-cpu kprobe control block */
-struct kprobe_ctlblk {
-       unsigned long kprobe_status;
-       unsigned long kprobe_old_rflags;
-       unsigned long kprobe_saved_rflags;
-       unsigned long *jprobe_saved_rsp;
-       struct pt_regs jprobe_saved_regs;
-       kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
-       struct prev_kprobe prev_kprobe;
-};
-
-/* trap3/1 are intr gates for kprobes.  So, restore the status of IF,
- * if necessary, before executing the original int3/1 (trap) handler.
- */
-static inline void restore_interrupts(struct pt_regs *regs)
-{
-       if (regs->eflags & IF_MASK)
-               local_irq_enable();
-}
-
-extern int post_kprobe_handler(struct pt_regs *regs);
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_handler(struct pt_regs *regs);
-
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-                                   unsigned long val, void *data);
-#endif                         /* _ASM_KPROBES_H */
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644 (file)
index 0000000..7a71120
--- /dev/null
@@ -0,0 +1,191 @@
+#ifndef __LINUX_KVM_X86_H
+#define __LINUX_KVM_X86_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+       __u32 slot;  /* this has a different namespace than memory slots */
+       __u32 flags;
+       __u64 guest_phys_addr;
+       __u64 memory_size;
+       __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+       __u8 last_irr;  /* edge detection */
+       __u8 irr;               /* interrupt request register */
+       __u8 imr;               /* interrupt mask register */
+       __u8 isr;               /* interrupt service register */
+       __u8 priority_add;      /* highest irq priority */
+       __u8 irq_base;
+       __u8 read_reg_select;
+       __u8 poll;
+       __u8 special_mask;
+       __u8 init_state;
+       __u8 auto_eoi;
+       __u8 rotate_on_auto_eoi;
+       __u8 special_fully_nested_mode;
+       __u8 init4;             /* true if 4 byte init */
+       __u8 elcr;              /* PIIX edge/trigger selection */
+       __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS  24
+struct kvm_ioapic_state {
+       __u64 base_address;
+       __u32 ioregsel;
+       __u32 id;
+       __u32 irr;
+       __u32 pad;
+       union {
+               __u64 bits;
+               struct {
+                       __u8 vector;
+                       __u8 delivery_mode:3;
+                       __u8 dest_mode:1;
+                       __u8 delivery_status:1;
+                       __u8 polarity:1;
+                       __u8 remote_irr:1;
+                       __u8 trig_mode:1;
+                       __u8 mask:1;
+                       __u8 reserve:7;
+                       __u8 reserved[4];
+                       __u8 dest_id;
+               } fields;
+       } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER   0
+#define KVM_IRQCHIP_PIC_SLAVE    1
+#define KVM_IRQCHIP_IOAPIC       2
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+       /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+       __u64 rax, rbx, rcx, rdx;
+       __u64 rsi, rdi, rsp, rbp;
+       __u64 r8,  r9,  r10, r11;
+       __u64 r12, r13, r14, r15;
+       __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+       char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+       __u64 base;
+       __u32 limit;
+       __u16 selector;
+       __u8  type;
+       __u8  present, dpl, db, s, l, g, avl;
+       __u8  unusable;
+       __u8  padding;
+};
+
+struct kvm_dtable {
+       __u64 base;
+       __u16 limit;
+       __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+       /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+       struct kvm_segment cs, ds, es, fs, gs, ss;
+       struct kvm_segment tr, ldt;
+       struct kvm_dtable gdt, idt;
+       __u64 cr0, cr2, cr3, cr4, cr8;
+       __u64 efer;
+       __u64 apic_base;
+       __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+       __u8  fpr[8][16];
+       __u16 fcw;
+       __u16 fsw;
+       __u8  ftwx;  /* in fxsave format */
+       __u8  pad1;
+       __u16 last_opcode;
+       __u64 last_ip;
+       __u64 last_dp;
+       __u8  xmm[16][16];
+       __u32 mxcsr;
+       __u32 pad2;
+};
+
+struct kvm_msr_entry {
+       __u32 index;
+       __u32 reserved;
+       __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+       __u32 nmsrs; /* number of msrs in entries */
+       __u32 pad;
+
+       struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+       __u32 nmsrs; /* number of msrs in entries */
+       __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+       __u32 function;
+       __u32 eax;
+       __u32 ebx;
+       __u32 ecx;
+       __u32 edx;
+       __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+       __u32 nent;
+       __u32 padding;
+       struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+       __u32 function;
+       __u32 index;
+       __u32 flags;
+       __u32 eax;
+       __u32 ebx;
+       __u32 ecx;
+       __u32 edx;
+       __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+       __u32 nent;
+       __u32 padding;
+       struct kvm_cpuid_entry2 entries[0];
+};
+
+#endif
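For context on how userspace consumes the structures defined above, a hedged sketch of reading a vcpu's register file through the KVM ioctl interface; VM/vcpu creation, capability checks and error handling are omitted, and KVM_GET_REGS comes from <linux/kvm.h>.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd is a file descriptor returned by the KVM_CREATE_VCPU ioctl */
int example_dump_regs(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		return -1;

	printf("rip=%#llx rsp=%#llx rflags=%#llx\n",
	       (unsigned long long)regs.rip,
	       (unsigned long long)regs.rsp,
	       (unsigned long long)regs.rflags);
	return 0;
}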
similarity index 64%
rename from drivers/kvm/kvm.h
rename to include/asm-x86/kvm_host.h
index 3b0bc4bda5f2372122b7ee21b0837c4b771722d7..4702b04b979a74e9dce84bbebe670a0fe4f71d9f 100644 (file)
@@ -1,23 +1,24 @@
-#ifndef __KVM_H
-#define __KVM_H
-
-/*
+#/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
+ *
  */
 
+#ifndef ASM_KVM_HOST_H
+#define ASM_KVM_HOST_H
+
 #include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
 #include <linux/mm.h>
-#include <linux/preempt.h>
-#include <asm/signal.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/desc.h>
 
 #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
 #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
-#define KVM_MAX_VCPUS 4
-#define KVM_ALIAS_SLOTS 4
-#define KVM_MEMORY_SLOTS 8
-#define KVM_NUM_MMU_PAGES 1024
-#define KVM_MIN_FREE_MMU_PAGES 5
-#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
-
 #define DE_VECTOR 0
+#define UD_VECTOR 6
 #define NM_VECTOR 7
 #define DF_VECTOR 8
 #define TS_VECTOR 10
 
 #define IOPL_SHIFT 12
 
-#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_ALIAS_SLOTS 4
 
-/*
- * vcpu->requests bit members
- */
-#define KVM_TLB_FLUSH 0
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_NUM_MMU_PAGES 1024
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 40
 
-/*
- * Address types:
- *
- *  gva - guest virtual address
- *  gpa - guest physical address
- *  gfn - guest frame number
- *  hva - host virtual address
- *  hpa - host physical address
- *  hfn - host frame number
- */
+extern spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
+struct kvm_vcpu;
+struct kvm;
+
+enum {
+       VCPU_REGS_RAX = 0,
+       VCPU_REGS_RCX = 1,
+       VCPU_REGS_RDX = 2,
+       VCPU_REGS_RBX = 3,
+       VCPU_REGS_RSP = 4,
+       VCPU_REGS_RBP = 5,
+       VCPU_REGS_RSI = 6,
+       VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+       VCPU_REGS_R8 = 8,
+       VCPU_REGS_R9 = 9,
+       VCPU_REGS_R10 = 10,
+       VCPU_REGS_R11 = 11,
+       VCPU_REGS_R12 = 12,
+       VCPU_REGS_R13 = 13,
+       VCPU_REGS_R14 = 14,
+       VCPU_REGS_R15 = 15,
+#endif
+       NR_VCPU_REGS
+};
+
+enum {
+       VCPU_SREG_CS,
+       VCPU_SREG_DS,
+       VCPU_SREG_ES,
+       VCPU_SREG_FS,
+       VCPU_SREG_GS,
+       VCPU_SREG_SS,
+       VCPU_SREG_TR,
+       VCPU_SREG_LDTR,
+};
 
-typedef unsigned long  gva_t;
-typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
+#include <asm/kvm_x86_emulate.h>
 
-typedef unsigned long  hva_t;
-typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
+#define KVM_NR_MEM_OBJS 40
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+       int nobjs;
+       void *objects[KVM_NR_MEM_OBJS];
+};
 
 #define NR_PTE_CHAIN_ENTRIES 5
 
@@ -99,7 +128,7 @@ struct kvm_pte_chain {
  *   bits 4:7 - page table level for this shadow (1-4)
  *   bits 8:9 - page table quadrant for 2-level guests
  *   bit   16 - "metaphysical" - gfn is not a real page (huge page/real mode)
- *   bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
+ *   bits 17:19 - common access permissions for all ptes in this shadow page
  */
 union kvm_mmu_page_role {
        unsigned word;
@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
                unsigned quadrant : 2;
                unsigned pad_for_nice_hex_output : 6;
                unsigned metaphysical : 1;
-               unsigned hugepage_access : 3;
+               unsigned access : 3;
        };
 };
 
@@ -125,6 +154,8 @@ struct kvm_mmu_page {
        union kvm_mmu_page_role role;
 
        u64 *spt;
+       /* hold the gfn of each spte inside spt */
+       gfn_t *gfns;
        unsigned long slot_bitmap; /* One bit set per slot which has memory
                                    * in this shadow page.
                                    */
@@ -136,9 +167,6 @@ struct kvm_mmu_page {
        };
 };
 
-struct kvm_vcpu;
-extern struct kmem_cache *kvm_vcpu_cache;
-
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -149,6 +177,8 @@ struct kvm_mmu {
        int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
        void (*free)(struct kvm_vcpu *vcpu);
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+       void (*prefetch_page)(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *page);
        hpa_t root_hpa;
        int root_level;
        int shadow_root_level;
@@ -156,159 +186,9 @@ struct kvm_mmu {
        u64 *pae_root;
 };
 
-#define KVM_NR_MEM_OBJS 20
-
-struct kvm_mmu_memory_cache {
-       int nobjs;
-       void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_guest_debug {
-       int enabled;
-       unsigned long bp[4];
-       int singlestep;
-};
-
-enum {
-       VCPU_REGS_RAX = 0,
-       VCPU_REGS_RCX = 1,
-       VCPU_REGS_RDX = 2,
-       VCPU_REGS_RBX = 3,
-       VCPU_REGS_RSP = 4,
-       VCPU_REGS_RBP = 5,
-       VCPU_REGS_RSI = 6,
-       VCPU_REGS_RDI = 7,
-#ifdef CONFIG_X86_64
-       VCPU_REGS_R8 = 8,
-       VCPU_REGS_R9 = 9,
-       VCPU_REGS_R10 = 10,
-       VCPU_REGS_R11 = 11,
-       VCPU_REGS_R12 = 12,
-       VCPU_REGS_R13 = 13,
-       VCPU_REGS_R14 = 14,
-       VCPU_REGS_R15 = 15,
-#endif
-       NR_VCPU_REGS
-};
-
-enum {
-       VCPU_SREG_CS,
-       VCPU_SREG_DS,
-       VCPU_SREG_ES,
-       VCPU_SREG_FS,
-       VCPU_SREG_GS,
-       VCPU_SREG_SS,
-       VCPU_SREG_TR,
-       VCPU_SREG_LDTR,
-};
-
-struct kvm_pio_request {
-       unsigned long count;
-       int cur_count;
-       struct page *guest_pages[2];
-       unsigned guest_page_offset;
-       int in;
-       int port;
-       int size;
-       int string;
-       int down;
-       int rep;
-};
-
-struct kvm_stat {
-       u32 pf_fixed;
-       u32 pf_guest;
-       u32 tlb_flush;
-       u32 invlpg;
-
-       u32 exits;
-       u32 io_exits;
-       u32 mmio_exits;
-       u32 signal_exits;
-       u32 irq_window_exits;
-       u32 halt_exits;
-       u32 halt_wakeup;
-       u32 request_irq_exits;
-       u32 irq_exits;
-       u32 light_exits;
-       u32 efer_reload;
-};
-
-struct kvm_io_device {
-       void (*read)(struct kvm_io_device *this,
-                    gpa_t addr,
-                    int len,
-                    void *val);
-       void (*write)(struct kvm_io_device *this,
-                     gpa_t addr,
-                     int len,
-                     const void *val);
-       int (*in_range)(struct kvm_io_device *this, gpa_t addr);
-       void (*destructor)(struct kvm_io_device *this);
-
-       void             *private;
-};
-
-static inline void kvm_iodevice_read(struct kvm_io_device *dev,
-                                    gpa_t addr,
-                                    int len,
-                                    void *val)
-{
-       dev->read(dev, addr, len, val);
-}
-
-static inline void kvm_iodevice_write(struct kvm_io_device *dev,
-                                     gpa_t addr,
-                                     int len,
-                                     const void *val)
-{
-       dev->write(dev, addr, len, val);
-}
-
-static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
-{
-       return dev->in_range(dev, addr);
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
-       if (dev->destructor)
-               dev->destructor(dev);
-}
-
-/*
- * It would be nice to use something smarter than a linear search, TBD...
- * Thankfully we dont expect many devices to register (famous last words :),
- * so until then it will suffice.  At least its abstracted so we can change
- * in one place.
- */
-struct kvm_io_bus {
-       int                   dev_count;
-#define NR_IOBUS_DEVS 6
-       struct kvm_io_device *devs[NR_IOBUS_DEVS];
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus);
-void kvm_io_bus_destroy(struct kvm_io_bus *bus);
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
-                            struct kvm_io_device *dev);
-
-struct kvm_vcpu {
-       struct kvm *kvm;
-       struct preempt_notifier preempt_notifier;
-       int vcpu_id;
-       struct mutex mutex;
-       int   cpu;
+struct kvm_vcpu_arch {
        u64 host_tsc;
-       struct kvm_run *run;
        int interrupt_window_open;
-       int guest_mode;
-       unsigned long requests;
        unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
        DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
        unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@ struct kvm_vcpu {
        unsigned long cr0;
        unsigned long cr2;
        unsigned long cr3;
-       gpa_t para_state_gpa;
-       struct page *para_state_page;
-       gpa_t hypercall_gpa;
        unsigned long cr4;
        unsigned long cr8;
        u64 pdptrs[4]; /* pae */
@@ -334,6 +211,7 @@ struct kvm_vcpu {
        int mp_state;
        int sipi_vector;
        u64 ia32_misc_enable_msr;
+       bool tpr_access_reporting;
 
        struct kvm_mmu mmu;
 
@@ -344,29 +222,26 @@ struct kvm_vcpu {
 
        gfn_t last_pt_write_gfn;
        int   last_pt_write_count;
+       u64  *last_pte_updated;
 
-       struct kvm_guest_debug guest_debug;
+       struct {
+               gfn_t gfn;          /* presumed gfn during guest pte update */
+               struct page *page;  /* page corresponding to that gfn */
+       } update_pte;
 
        struct i387_fxsave_struct host_fx_image;
        struct i387_fxsave_struct guest_fx_image;
-       int fpu_active;
-       int guest_fpu_loaded;
-
-       int mmio_needed;
-       int mmio_read_completed;
-       int mmio_is_write;
-       int mmio_size;
-       unsigned char mmio_data[8];
-       gpa_t mmio_phys_addr;
+
        gva_t mmio_fault_cr2;
        struct kvm_pio_request pio;
        void *pio_data;
-       wait_queue_head_t wq;
 
-       int sigset_active;
-       sigset_t sigset;
-
-       struct kvm_stat stat;
+       struct kvm_queued_exception {
+               bool pending;
+               bool has_error_code;
+               u8 nr;
+               u32 error_code;
+       } exception;
 
        struct {
                int active;
@@ -381,7 +256,10 @@ struct kvm_vcpu {
        int halt_request; /* real mode on Intel only */
 
        int cpuid_nent;
-       struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+       struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+       /* emulate context */
+
+       struct x86_emulate_ctxt emulate_ctxt;
 };
 
 struct kvm_mem_alias {
@@ -390,51 +268,58 @@ struct kvm_mem_alias {
        gfn_t target_gfn;
 };
 
-struct kvm_memory_slot {
-       gfn_t base_gfn;
-       unsigned long npages;
-       unsigned long flags;
-       struct page **phys_mem;
-       unsigned long *dirty_bitmap;
-};
-
-struct kvm {
-       struct mutex lock; /* protects everything except vcpus */
+struct kvm_arch {
        int naliases;
        struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-       int nmemslots;
-       struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+
+       unsigned int n_free_mmu_pages;
+       unsigned int n_requested_mmu_pages;
+       unsigned int n_alloc_mmu_pages;
+       struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
        /*
         * Hash table of struct kvm_mmu_page.
         */
        struct list_head active_mmu_pages;
-       int n_free_mmu_pages;
-       struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-       struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
-       unsigned long rmap_overflow;
-       struct list_head vm_list;
-       struct file *filp;
-       struct kvm_io_bus mmio_bus;
-       struct kvm_io_bus pio_bus;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
+
        int round_robin_prev_vcpu;
+       unsigned int tss_addr;
+       struct page *apic_access_page;
 };
 
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
-       return kvm->vpic;
-}
+struct kvm_vm_stat {
+       u32 mmu_shadow_zapped;
+       u32 mmu_pte_write;
+       u32 mmu_pte_updated;
+       u32 mmu_pde_zapped;
+       u32 mmu_flooded;
+       u32 mmu_recycled;
+       u32 mmu_cache_miss;
+       u32 remote_tlb_flush;
+};
 
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
-       return kvm->vioapic;
-}
+struct kvm_vcpu_stat {
+       u32 pf_fixed;
+       u32 pf_guest;
+       u32 tlb_flush;
+       u32 invlpg;
 
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-       return pic_irqchip(kvm) != 0;
-}
+       u32 exits;
+       u32 io_exits;
+       u32 mmio_exits;
+       u32 signal_exits;
+       u32 irq_window_exits;
+       u32 halt_exits;
+       u32 halt_wakeup;
+       u32 request_irq_exits;
+       u32 irq_exits;
+       u32 host_state_reload;
+       u32 efer_reload;
+       u32 fpu_reload;
+       u32 insn_emulation;
+       u32 insn_emulation_fail;
+};
 
 struct descriptor_table {
        u16 limit;
@@ -449,11 +334,12 @@ struct kvm_x86_ops {
        void (*check_processor_compatibility)(void *rtn);
        int (*hardware_setup)(void);               /* __init */
        void (*hardware_unsetup)(void);            /* __exit */
+       bool (*cpu_has_accelerated_tpr)(void);
 
        /* Create, but do not attach this VCPU */
        struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
        void (*vcpu_free)(struct kvm_vcpu *vcpu);
-       void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+       int (*vcpu_reset)(struct kvm_vcpu *vcpu);
 
        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +375,6 @@ struct kvm_x86_ops {
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
        void (*tlb_flush)(struct kvm_vcpu *vcpu);
-       void (*inject_page_fault)(struct kvm_vcpu *vcpu,
-                                 unsigned long addr, u32 err_code);
-
-       void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
 
        void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
        int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +383,31 @@ struct kvm_x86_ops {
                                unsigned char *hypercall_addr);
        int (*get_irq)(struct kvm_vcpu *vcpu);
        void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+       void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
+                               bool has_error_code, u32 error_code);
+       bool (*exception_injected)(struct kvm_vcpu *vcpu);
        void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
        void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run);
+
+       int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
 
-/* The guest did something we don't support. */
-#define pr_unimpl(vcpu, fmt, ...)                                      \
- do {                                                                  \
-       if (printk_ratelimit())                                         \
-               printk(KERN_ERR "kvm: %i: cpu%i " fmt,                  \
-                      current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
- } while(0)
-
-#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
-#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
-
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
-
-int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
-                 struct module *module);
-void kvm_exit_x86(void);
-
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
-
-hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
-
-extern hpa_t bad_page_address;
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
 enum emulation_result {
        EMULATE_DONE,       /* no further processing */
@@ -556,8 +415,10 @@ enum emulation_result {
        EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
+#define EMULTYPE_NO_DECODE         (1 << 0)
+#define EMULTYPE_TRAP_UD           (1 << 1)
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
-                       unsigned long cr2, u16 error_code);
+                       unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
+int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                     int size, unsigned port);
 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                           int size, unsigned long count, int down,
@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxtctxt, int dr,
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
                    unsigned long *dest);
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
                    unsigned long value);
@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
-void fx_init(struct kvm_vcpu *vcpu);
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
+                          u32 error_code);
 
-void kvm_resched(struct kvm_vcpu *vcpu);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_flush_remote_tlbs(struct kvm *kvm);
+void fx_init(struct kvm_vcpu *vcpu);
 
 int emulator_read_std(unsigned long addr,
-                      void *val,
+                     void *val,
                      unsigned int bytes,
                      struct kvm_vcpu *vcpu);
 int emulator_write_emulated(unsigned long addr,
@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr,
 
 unsigned long segment_base(u16 selector);
 
+void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 
-int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
-static inline void kvm_guest_enter(void)
-{
-       current->flags |= PF_VCPU;
-}
+int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 
-static inline void kvm_guest_exit(void)
-{
-       current->flags &= ~PF_VCPU;
-}
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
 
-static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                                    u32 error_code)
-{
-       return vcpu->mmu.page_fault(vcpu, gva, error_code);
-}
-
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
-               __kvm_mmu_free_some_pages(vcpu);
-}
-
-static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
-{
-       if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
-               return 0;
-
-       return kvm_mmu_load(vcpu);
-}
-
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
-       return vcpu->shadow_efer & EFER_LME;
-#else
-       return 0;
-#endif
-}
-
-static inline int is_pae(struct kvm_vcpu *vcpu)
-{
-       return vcpu->cr4 & X86_CR4_PAE;
-}
-
-static inline int is_pse(struct kvm_vcpu *vcpu)
-{
-       return vcpu->cr4 & X86_CR4_PSE;
-}
-
-static inline int is_paging(struct kvm_vcpu *vcpu)
-{
-       return vcpu->cr0 & X86_CR0_PG;
-}
-
-static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-       return slot - kvm->memslots;
-}
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+int complete_pio(struct kvm_vcpu *vcpu);
 
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 static inline u16 read_fs(void)
 {
        u16 seg;
-       asm ("mov %%fs, %0" : "=g"(seg));
+       asm("mov %%fs, %0" : "=g"(seg));
        return seg;
 }
 
 static inline u16 read_gs(void)
 {
        u16 seg;
-       asm ("mov %%gs, %0" : "=g"(seg));
+       asm("mov %%gs, %0" : "=g"(seg));
        return seg;
 }
 
 static inline u16 read_ldt(void)
 {
        u16 ldt;
-       asm ("sldt %0" : "=g"(ldt));
+       asm("sldt %0" : "=g"(ldt));
        return ldt;
 }
 
 static inline void load_fs(u16 sel)
 {
-       asm ("mov %0, %%fs" : : "rm"(sel));
+       asm("mov %0, %%fs" : : "rm"(sel));
 }
 
 static inline void load_gs(u16 sel)
 {
-       asm ("mov %0, %%gs" : : "rm"(sel));
+       asm("mov %0, %%gs" : : "rm"(sel));
 }
 
 #ifndef load_ldt
 static inline void load_ldt(u16 sel)
 {
-       asm ("lldt %0" : : "rm"(sel));
+       asm("lldt %0" : : "rm"(sel));
 }
 #endif
 
 static inline void get_idt(struct descriptor_table *table)
 {
-       asm ("sidt %0" : "=m"(*table));
+       asm("sidt %0" : "=m"(*table));
 }
 
 static inline void get_gdt(struct descriptor_table *table)
 {
-       asm ("sgdt %0" : "=m"(*table));
+       asm("sgdt %0" : "=m"(*table));
 }
 
 static inline unsigned long read_tr_base(void)
 {
        u16 tr;
-       asm ("str %0" : "=g"(tr));
+       asm("str %0" : "=g"(tr));
        return segment_base(tr);
 }
 
@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr)
 
 static inline void fx_save(struct i387_fxsave_struct *image)
 {
-       asm ("fxsave (%0)":: "r" (image));
+       asm("fxsave (%0)":: "r" (image));
 }
 
 static inline void fx_restore(struct i387_fxsave_struct *image)
 {
-       asm ("fxrstor (%0)":: "r" (image));
+       asm("fxrstor (%0)":: "r" (image));
 }
 
 static inline void fpu_init(void)
 {
-       asm ("finit");
+       asm("finit");
 }
 
 static inline u32 get_rdx_init_val(void)
@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void)
        return 0x600; /* P6 family */
 }
 
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+       kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
 #define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
 #define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
 #define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
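A minimal sketch (not a call site from this patch) of how emulation code reports faults with the helpers declared above, now that the ->inject_gp()/->inject_page_fault() callbacks have been dropped from kvm_x86_ops; the function name and the error code 0 are illustrative assumptions.

static void reject_guest_access(struct kvm_vcpu *vcpu, unsigned long addr,
				bool translation_failed)
{
	if (translation_failed)
		kvm_inject_page_fault(vcpu, addr, 0);	/* queue #PF for addr */
	else
		kvm_inject_gp(vcpu, 0);			/* queue #GP(0) */
}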
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644 (file)
index 0000000..c6f3fd8
--- /dev/null
@@ -0,0 +1,105 @@
+#ifndef __X86_KVM_PARA_H
+#define __X86_KVM_PARA_H
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE    0x40000000
+
+/* This CPUID returns a feature bitmap in eax.  Before enabling a particular
+ * paravirtualization, the appropriate feature bit should be checked.
+ */
+#define KVM_CPUID_FEATURES     0x40000001
+
+#ifdef __KERNEL__
+#include <asm/processor.h>
+
+/* This instruction is vmcall.  On non-VT architectures, it will generate a
+ * trap that we will then rewrite to the appropriate instruction.
+ */
+#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
+ * instruction.  The hypervisor may replace it with something else but only these
+ * instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax.  No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+       long ret;
+       asm volatile(KVM_HYPERCALL
+                    : "=a"(ret)
+                    : "a"(nr));
+       return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+       long ret;
+       asm volatile(KVM_HYPERCALL
+                    : "=a"(ret)
+                    : "a"(nr), "b"(p1));
+       return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+                                 unsigned long p2)
+{
+       long ret;
+       asm volatile(KVM_HYPERCALL
+                    : "=a"(ret)
+                    : "a"(nr), "b"(p1), "c"(p2));
+       return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+                                 unsigned long p2, unsigned long p3)
+{
+       long ret;
+       asm volatile(KVM_HYPERCALL
+                    : "=a"(ret)
+                    : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
+       return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+                                 unsigned long p2, unsigned long p3,
+                                 unsigned long p4)
+{
+       long ret;
+       asm volatile(KVM_HYPERCALL
+                    : "=a"(ret)
+                    : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
+       return ret;
+}
+
+static inline int kvm_para_available(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       char signature[13];
+
+       cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
+       memcpy(signature + 0, &ebx, 4);
+       memcpy(signature + 4, &ecx, 4);
+       memcpy(signature + 8, &edx, 4);
+       signature[12] = 0;
+
+       if (strcmp(signature, "KVMKVMKVM") == 0)
+               return 1;
+
+       return 0;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+       return cpuid_eax(KVM_CPUID_FEATURES);
+}
+
+#endif
+
+#endif
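For reference, a minimal guest-side sketch of how the helpers in this new header are meant to be used, assuming a guest kernel context; the probe function name and the hypercall number 42 are illustrative only and not defined by KVM.

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/kvm_para.h>

static int __init kvm_probe_example(void)
{
	if (!kvm_para_available())
		return 0;			/* not running under KVM */

	/* Feature bitmap returned in eax by CPUID leaf 0x40000001. */
	printk(KERN_INFO "KVM para features: %#x\n", kvm_arch_para_features());

	/* Illustrative only: 42 is not a defined KVM hypercall number. */
	printk(KERN_INFO "hypercall returned %ld\n", kvm_hypercall0(42));
	return 0;
}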
similarity index 83%
rename from drivers/kvm/x86_emulate.h
rename to include/asm-x86/kvm_x86_emulate.h
index 92c73aa7f9ac4f81ed863177e40c551385477ee4..7db91b9bdcd4c59d950485497fef97b4dc53c43b 100644 (file)
@@ -62,17 +62,6 @@ struct x86_emulate_ops {
        int (*read_std)(unsigned long addr, void *val,
                        unsigned int bytes, struct kvm_vcpu *vcpu);
 
-       /*
-        * write_std: Write bytes of standard (non-emulated/special) memory.
-        *            Used for stack operations, and others.
-        *  @addr:  [IN ] Linear address to which to write.
-        *  @val:   [IN ] Value to write to memory (low-order bytes used as
-        *                required).
-        *  @bytes: [IN ] Number of bytes to write to memory.
-        */
-       int (*write_std)(unsigned long addr, const void *val,
-                        unsigned int bytes, struct kvm_vcpu *vcpu);
-
        /*
         * read_emulated: Read bytes from emulated/special memory area.
         *  @addr:  [IN ] Linear address from which to read.
@@ -112,13 +101,50 @@ struct x86_emulate_ops {
 
 };
 
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+       enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+       unsigned int bytes;
+       unsigned long val, orig_val, *ptr;
+};
+
+struct fetch_cache {
+       u8 data[15];
+       unsigned long start;
+       unsigned long end;
+};
+
+struct decode_cache {
+       u8 twobyte;
+       u8 b;
+       u8 lock_prefix;
+       u8 rep_prefix;
+       u8 op_bytes;
+       u8 ad_bytes;
+       u8 rex_prefix;
+       struct operand src;
+       struct operand dst;
+       unsigned long *override_base;
+       unsigned int d;
+       unsigned long regs[NR_VCPU_REGS];
+       unsigned long eip;
+       /* modrm */
+       u8 modrm;
+       u8 modrm_mod;
+       u8 modrm_reg;
+       u8 modrm_rm;
+       u8 use_modrm_ea;
+       unsigned long modrm_ea;
+       unsigned long modrm_val;
+       struct fetch_cache fetch;
+};
+
 struct x86_emulate_ctxt {
        /* Register state before/after emulation. */
        struct kvm_vcpu *vcpu;
 
        /* Linear faulting address (if emulating a page-faulting instruction). */
        unsigned long eflags;
-       unsigned long cr2;
 
        /* Emulated execution mode, represented by an X86EMUL_MODE value. */
        int mode;
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
        unsigned long ss_base;
        unsigned long gs_base;
        unsigned long fs_base;
+
+       /* decode cache */
+
+       struct decode_cache decode;
 };
 
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX  1
+#define REPNE_PREFIX    2
+
 /* Execution mode, passed to the emulator. */
 #define X86EMUL_MODE_REAL     0        /* Real mode.             */
 #define X86EMUL_MODE_PROT16   2        /* 16-bit protected mode. */
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
 #endif
 
-/*
- * x86_emulate_memop: Emulate an instruction that faulted attempting to
- *                    read/write a 'special' memory area.
- * Returns -1 on failure, 0 on success.
- */
-int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
-                     struct x86_emulate_ops *ops);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
+                   struct x86_emulate_ops *ops);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
+                    struct x86_emulate_ops *ops);
 
 #endif                         /* __X86_EMULATE_H__ */
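A hedged sketch of how a caller drives the emulator after this split into separate decode and execute entry points; the wrapper name is hypothetical and error handling is reduced to passing the return code through.

static int run_emulation(struct x86_emulate_ctxt *ctxt,
			 struct x86_emulate_ops *ops)
{
	int rc;

	rc = x86_decode_insn(ctxt, ops);	/* fills ctxt->decode */
	if (rc)
		return rc;

	return x86_emulate_insn(ctxt, ops);	/* acts on the decode cache */
}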
index ccd3384608111ba91dd10bb2c3a62cdabda7e6a3..4d9367b72976a1601897af8ce3f448ad58245c11 100644 (file)
@@ -44,19 +44,19 @@ struct lguest_ro_state
 {
        /* Host information we need to restore when we switch back. */
        u32 host_cr3;
-       struct Xgt_desc_struct host_idt_desc;
-       struct Xgt_desc_struct host_gdt_desc;
+       struct desc_ptr host_idt_desc;
+       struct desc_ptr host_gdt_desc;
        u32 host_sp;
 
        /* Fields which are used when guest is running. */
-       struct Xgt_desc_struct guest_idt_desc;
-       struct Xgt_desc_struct guest_gdt_desc;
-       struct i386_hw_tss guest_tss;
+       struct desc_ptr guest_idt_desc;
+       struct desc_ptr guest_gdt_desc;
+       struct x86_hw_tss guest_tss;
        struct desc_struct guest_idt[IDT_ENTRIES];
        struct desc_struct guest_gdt[GDT_ENTRIES];
 };
 
-struct lguest_arch
+struct lg_cpu_arch
 {
        /* The GDT entries copied into lguest_ro_state when running. */
        struct desc_struct gdt[GDT_ENTRIES];
@@ -78,8 +78,8 @@ static inline void lguest_set_ts(void)
 }
 
 /* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
-#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
+#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
 
 #endif /* __ASSEMBLY__ */
 
index 2091779e91fbe4f06b659235dc9e8abe2c09922c..758b9a5d4539e583dd29efced39701e7ae1b64d0 100644 (file)
@@ -4,7 +4,7 @@
 
 #define LHCALL_FLUSH_ASYNC     0
 #define LHCALL_LGUEST_INIT     1
-#define LHCALL_CRASH           2
+#define LHCALL_SHUTDOWN                2
 #define LHCALL_LOAD_GDT                3
 #define LHCALL_NEW_PGTABLE     4
 #define LHCALL_FLUSH_TLB       5
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
+/* Argument number 3 to LHCALL_SHUTDOWN */
+#define LGUEST_SHUTDOWN_POWEROFF       1
+#define LGUEST_SHUTDOWN_RESTART                2
+
 #ifndef __ASSEMBLY__
 #include <asm/hw_irq.h>
 
index 94b257fa8701904284ebe0f996a405daf18d16f6..31739c7d66a94930868ed1037bed18df47d3c1e1 100644 (file)
@@ -1,5 +1,25 @@
+#ifndef __ASM_LINKAGE_H
+#define __ASM_LINKAGE_H
+
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
 #ifdef CONFIG_X86_32
-# include "linkage_32.h"
-#else
-# include "linkage_64.h"
+#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+#define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))
+/*
+ * For 32-bit UML - mark functions implemented in assembly that use
+ * regparm input parameters:
+ */
+#define asmregparm __attribute__((regparm(3)))
+#endif
+
+#ifdef CONFIG_X86_ALIGNMENT_16
+#define __ALIGN .align 16,0x90
+#define __ALIGN_STR ".align 16,0x90"
+#endif
+
 #endif
+
diff --git a/include/asm-x86/linkage_32.h b/include/asm-x86/linkage_32.h
deleted file mode 100644 (file)
index f4a6eba..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
-#define FASTCALL(x)    x __attribute__((regparm(3)))
-#define fastcall       __attribute__((regparm(3)))
-
-#define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))
-
-#ifdef CONFIG_X86_ALIGNMENT_16
-#define __ALIGN .align 16,0x90
-#define __ALIGN_STR ".align 16,0x90"
-#endif
-
-#endif
diff --git a/include/asm-x86/linkage_64.h b/include/asm-x86/linkage_64.h
deleted file mode 100644 (file)
index b5f39d0..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-#define __ALIGN .p2align 4,,15
-
-#endif
index c7a1b1c66c9630f34409b7c44176a5cce0ab4e1e..f852c62b3319b514c9d9eff64994d4cf3a280ec8 100644 (file)
@@ -1,5 +1,240 @@
-#ifdef CONFIG_X86_32
-# include "local_32.h"
-#else
-# include "local_64.h"
+#ifndef _ARCH_LOCAL_H
+#define _ARCH_LOCAL_H
+
+#include <linux/percpu.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+#include <asm/asm.h>
+
+typedef struct {
+       atomic_long_t a;
+} local_t;
+
+#define LOCAL_INIT(i)  { ATOMIC_LONG_INIT(i) }
+
+#define local_read(l)  atomic_long_read(&(l)->a)
+#define local_set(l, i)        atomic_long_set(&(l)->a, (i))
+
+static inline void local_inc(local_t *l)
+{
+       __asm__ __volatile__(
+               _ASM_INC "%0"
+               :"+m" (l->a.counter));
+}
+
+static inline void local_dec(local_t *l)
+{
+       __asm__ __volatile__(
+               _ASM_DEC "%0"
+               :"+m" (l->a.counter));
+}
+
+static inline void local_add(long i, local_t *l)
+{
+       __asm__ __volatile__(
+               _ASM_ADD "%1,%0"
+               :"+m" (l->a.counter)
+               :"ir" (i));
+}
+
+static inline void local_sub(long i, local_t *l)
+{
+       __asm__ __volatile__(
+               _ASM_SUB "%1,%0"
+               :"+m" (l->a.counter)
+               :"ir" (i));
+}
+
+/**
+ * local_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @l: pointer to type local_t
+ *
+ * Atomically subtracts @i from @l and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_sub_and_test(long i, local_t *l)
+{
+       unsigned char c;
+
+       __asm__ __volatile__(
+               _ASM_SUB "%2,%0; sete %1"
+               :"+m" (l->a.counter), "=qm" (c)
+               :"ir" (i) : "memory");
+       return c;
+}
+
+/**
+ * local_dec_and_test - decrement and test
+ * @l: pointer to type local_t
+ *
+ * Atomically decrements @l by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int local_dec_and_test(local_t *l)
+{
+       unsigned char c;
+
+       __asm__ __volatile__(
+               _ASM_DEC "%0; sete %1"
+               :"+m" (l->a.counter), "=qm" (c)
+               : : "memory");
+       return c != 0;
+}
+
+/**
+ * local_inc_and_test - increment and test
+ * @l: pointer to type local_t
+ *
+ * Atomically increments @l by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int local_inc_and_test(local_t *l)
+{
+       unsigned char c;
+
+       __asm__ __volatile__(
+               _ASM_INC "%0; sete %1"
+               :"+m" (l->a.counter), "=qm" (c)
+               : : "memory");
+       return c != 0;
+}
+
+/**
+ * local_add_negative - add and test if negative
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int local_add_negative(long i, local_t *l)
+{
+       unsigned char c;
+
+       __asm__ __volatile__(
+               _ASM_ADD "%2,%0; sets %1"
+               :"+m" (l->a.counter), "=qm" (c)
+               :"ir" (i) : "memory");
+       return c;
+}
+
+/**
+ * local_add_return - add and return
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns @i + @l
+ */
+static inline long local_add_return(long i, local_t *l)
+{
+       long __i;
+#ifdef CONFIG_M386
+       unsigned long flags;
+       if (unlikely(boot_cpu_data.x86 <= 3))
+               goto no_xadd;
 #endif
+       /* Modern 486+ processor */
+       __i = i;
+       __asm__ __volatile__(
+               _ASM_XADD "%0, %1;"
+               :"+r" (i), "+m" (l->a.counter)
+               : : "memory");
+       return i + __i;
+
+#ifdef CONFIG_M386
+no_xadd: /* Legacy 386 processor */
+       local_irq_save(flags);
+       __i = local_read(l);
+       local_set(l, i + __i);
+       local_irq_restore(flags);
+       return i + __i;
+#endif
+}
+
+static inline long local_sub_return(long i, local_t *l)
+{
+       return local_add_return(-i, l);
+}
+
+#define local_inc_return(l)  (local_add_return(1, l))
+#define local_dec_return(l)  (local_sub_return(1, l))
+
+#define local_cmpxchg(l, o, n) \
+       (cmpxchg_local(&((l)->a.counter), (o), (n)))
+/* Always has a lock prefix */
+#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+
+/**
+ * local_add_unless - add unless the number is a given value
+ * @l: pointer of type local_t
+ * @a: the amount to add to l...
+ * @u: ...unless l is equal to u.
+ *
+ * Atomically adds @a to @l, so long as it was not @u.
+ * Returns non-zero if @l was not @u, and zero otherwise.
+ */
+#define local_add_unless(l, a, u)                              \
+({                                                             \
+       long c, old;                                            \
+       c = local_read(l);                                      \
+       for (;;) {                                              \
+               if (unlikely(c == (u)))                         \
+                       break;                                  \
+               old = local_cmpxchg((l), c, c + (a));   \
+               if (likely(old == c))                           \
+                       break;                                  \
+               c = old;                                        \
+       }                                                       \
+       c != (u);                                               \
+})
+#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+
+/* On x86_32, these are no better than the atomic variants.
+ * On x86-64 these are better than the atomic variants on SMP kernels
+ * because they don't use a lock prefix.
+ */
+#define __local_inc(l)         local_inc(l)
+#define __local_dec(l)         local_dec(l)
+#define __local_add(i, l)      local_add((i), (l))
+#define __local_sub(i, l)      local_sub((i), (l))
+
+/* Use these for per-cpu local_t variables: on some archs they are
+ * much more efficient than these naive implementations.  Note they take
+ * a variable, not an address.
+ *
+ * X86_64: This could be done better if we moved the per cpu data directly
+ * after GS.
+ */
+
+/* Need to disable preemption for the cpu local counters otherwise we could
+   still access a variable of a previous CPU in a non atomic way. */
+#define cpu_local_wrap_v(l)            \
+       ({ local_t res__;               \
+          preempt_disable();           \
+          res__ = (l);                 \
+          preempt_enable();            \
+          res__; })
+#define cpu_local_wrap(l)              \
+       ({ preempt_disable();           \
+          l;                           \
+          preempt_enable(); })         \
+
+#define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
+#define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
+#define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var(l)))
+#define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var(l)))
+#define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
+#define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
+
+#define __cpu_local_inc(l)     cpu_local_inc(l)
+#define __cpu_local_dec(l)     cpu_local_dec(l)
+#define __cpu_local_add(i, l)  cpu_local_add((i), (l))
+#define __cpu_local_sub(i, l)  cpu_local_sub((i), (l))
+
+#endif /* _ARCH_LOCAL_H */
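A short sketch of the local_t API consolidated above, used here as a simple interrupt-safe counter; the variable and function names are illustrative, and a real per-CPU user would normally pair this with DEFINE_PER_CPU and the cpu_local_* wrappers.

#include <asm/local.h>

static local_t sample_events = LOCAL_INIT(0);

static void note_sample_event(void)
{
	local_inc(&sample_events);		/* no LOCK prefix needed */
}

static long drain_sample_events(void)
{
	return local_xchg(&sample_events, 0);	/* xchg always implies LOCK */
}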
diff --git a/include/asm-x86/local_32.h b/include/asm-x86/local_32.h
deleted file mode 100644 (file)
index 6e85975..0000000
+++ /dev/null
@@ -1,233 +0,0 @@
-#ifndef _ARCH_I386_LOCAL_H
-#define _ARCH_I386_LOCAL_H
-
-#include <linux/percpu.h>
-#include <asm/system.h>
-#include <asm/atomic.h>
-
-typedef struct
-{
-       atomic_long_t a;
-} local_t;
-
-#define LOCAL_INIT(i)  { ATOMIC_LONG_INIT(i) }
-
-#define local_read(l)  atomic_long_read(&(l)->a)
-#define local_set(l,i) atomic_long_set(&(l)->a, (i))
-
-static __inline__ void local_inc(local_t *l)
-{
-       __asm__ __volatile__(
-               "incl %0"
-               :"+m" (l->a.counter));
-}
-
-static __inline__ void local_dec(local_t *l)
-{
-       __asm__ __volatile__(
-               "decl %0"
-               :"+m" (l->a.counter));
-}
-
-static __inline__ void local_add(long i, local_t *l)
-{
-       __asm__ __volatile__(
-               "addl %1,%0"
-               :"+m" (l->a.counter)
-               :"ir" (i));
-}
-
-static __inline__ void local_sub(long i, local_t *l)
-{
-       __asm__ __volatile__(
-               "subl %1,%0"
-               :"+m" (l->a.counter)
-               :"ir" (i));
-}
-
-/**
- * local_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @l: pointer of type local_t
- *
- * Atomically subtracts @i from @l and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static __inline__ int local_sub_and_test(long i, local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "subl %2,%0; sete %1"
-               :"+m" (l->a.counter), "=qm" (c)
-               :"ir" (i) : "memory");
-       return c;
-}
-
-/**
- * local_dec_and_test - decrement and test
- * @l: pointer of type local_t
- *
- * Atomically decrements @l by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static __inline__ int local_dec_and_test(local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "decl %0; sete %1"
-               :"+m" (l->a.counter), "=qm" (c)
-               : : "memory");
-       return c != 0;
-}
-
-/**
- * local_inc_and_test - increment and test
- * @l: pointer of type local_t
- *
- * Atomically increments @l by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static __inline__ int local_inc_and_test(local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "incl %0; sete %1"
-               :"+m" (l->a.counter), "=qm" (c)
-               : : "memory");
-       return c != 0;
-}
-
-/**
- * local_add_negative - add and test if negative
- * @l: pointer of type local_t
- * @i: integer value to add
- *
- * Atomically adds @i to @l and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static __inline__ int local_add_negative(long i, local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "addl %2,%0; sets %1"
-               :"+m" (l->a.counter), "=qm" (c)
-               :"ir" (i) : "memory");
-       return c;
-}
-
-/**
- * local_add_return - add and return
- * @l: pointer of type local_t
- * @i: integer value to add
- *
- * Atomically adds @i to @l and returns @i + @l
- */
-static __inline__ long local_add_return(long i, local_t *l)
-{
-       long __i;
-#ifdef CONFIG_M386
-       unsigned long flags;
-       if(unlikely(boot_cpu_data.x86 <= 3))
-               goto no_xadd;
-#endif
-       /* Modern 486+ processor */
-       __i = i;
-       __asm__ __volatile__(
-               "xaddl %0, %1;"
-               :"+r" (i), "+m" (l->a.counter)
-               : : "memory");
-       return i + __i;
-
-#ifdef CONFIG_M386
-no_xadd: /* Legacy 386 processor */
-       local_irq_save(flags);
-       __i = local_read(l);
-       local_set(l, i + __i);
-       local_irq_restore(flags);
-       return i + __i;
-#endif
-}
-
-static __inline__ long local_sub_return(long i, local_t *l)
-{
-       return local_add_return(-i,l);
-}
-
-#define local_inc_return(l)  (local_add_return(1,l))
-#define local_dec_return(l)  (local_sub_return(1,l))
-
-#define local_cmpxchg(l, o, n) \
-       (cmpxchg_local(&((l)->a.counter), (o), (n)))
-/* Always has a lock prefix */
-#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
-
-/**
- * local_add_unless - add unless the number is a given value
- * @l: pointer of type local_t
- * @a: the amount to add to l...
- * @u: ...unless l is equal to u.
- *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
- */
-#define local_add_unless(l, a, u)                              \
-({                                                             \
-       long c, old;                                            \
-       c = local_read(l);                                      \
-       for (;;) {                                              \
-               if (unlikely(c == (u)))                         \
-                       break;                                  \
-               old = local_cmpxchg((l), c, c + (a));   \
-               if (likely(old == c))                           \
-                       break;                                  \
-               c = old;                                        \
-       }                                                       \
-       c != (u);                                               \
-})
-#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
-
-/* On x86, these are no better than the atomic variants. */
-#define __local_inc(l)         local_inc(l)
-#define __local_dec(l)         local_dec(l)
-#define __local_add(i,l)       local_add((i),(l))
-#define __local_sub(i,l)       local_sub((i),(l))
-
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations.  Note they take
- * a variable, not an address.
- */
-
-/* Need to disable preemption for the cpu local counters otherwise we could
-   still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l)            \
-       ({ local_t res__;               \
-          preempt_disable();           \
-          res__ = (l);                 \
-          preempt_enable();            \
-          res__; })
-#define cpu_local_wrap(l)              \
-       ({ preempt_disable();           \
-          l;                           \
-          preempt_enable(); })         \
-
-#define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
-#define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
-#define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var(l)))
-#define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var(l)))
-#define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
-#define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-
-#define __cpu_local_inc(l)     cpu_local_inc(l)
-#define __cpu_local_dec(l)     cpu_local_dec(l)
-#define __cpu_local_add(i, l)  cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l)  cpu_local_sub((i), (l))
-
-#endif /* _ARCH_I386_LOCAL_H */
diff --git a/include/asm-x86/local_64.h b/include/asm-x86/local_64.h
deleted file mode 100644 (file)
index e87492b..0000000
+++ /dev/null
@@ -1,222 +0,0 @@
-#ifndef _ARCH_X8664_LOCAL_H
-#define _ARCH_X8664_LOCAL_H
-
-#include <linux/percpu.h>
-#include <asm/atomic.h>
-
-typedef struct
-{
-       atomic_long_t a;
-} local_t;
-
-#define LOCAL_INIT(i)  { ATOMIC_LONG_INIT(i) }
-
-#define local_read(l)  atomic_long_read(&(l)->a)
-#define local_set(l,i) atomic_long_set(&(l)->a, (i))
-
-static inline void local_inc(local_t *l)
-{
-       __asm__ __volatile__(
-               "incq %0"
-               :"=m" (l->a.counter)
-               :"m" (l->a.counter));
-}
-
-static inline void local_dec(local_t *l)
-{
-       __asm__ __volatile__(
-               "decq %0"
-               :"=m" (l->a.counter)
-               :"m" (l->a.counter));
-}
-
-static inline void local_add(long i, local_t *l)
-{
-       __asm__ __volatile__(
-               "addq %1,%0"
-               :"=m" (l->a.counter)
-               :"ir" (i), "m" (l->a.counter));
-}
-
-static inline void local_sub(long i, local_t *l)
-{
-       __asm__ __volatile__(
-               "subq %1,%0"
-               :"=m" (l->a.counter)
-               :"ir" (i), "m" (l->a.counter));
-}
-
-/**
- * local_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @l: pointer to type local_t
- *
- * Atomically subtracts @i from @l and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static __inline__ int local_sub_and_test(long i, local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "subq %2,%0; sete %1"
-               :"=m" (l->a.counter), "=qm" (c)
-               :"ir" (i), "m" (l->a.counter) : "memory");
-       return c;
-}
-
-/**
- * local_dec_and_test - decrement and test
- * @l: pointer to type local_t
- *
- * Atomically decrements @l by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static __inline__ int local_dec_and_test(local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "decq %0; sete %1"
-               :"=m" (l->a.counter), "=qm" (c)
-               :"m" (l->a.counter) : "memory");
-       return c != 0;
-}
-
-/**
- * local_inc_and_test - increment and test
- * @l: pointer to type local_t
- *
- * Atomically increments @l by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static __inline__ int local_inc_and_test(local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "incq %0; sete %1"
-               :"=m" (l->a.counter), "=qm" (c)
-               :"m" (l->a.counter) : "memory");
-       return c != 0;
-}
-
-/**
- * local_add_negative - add and test if negative
- * @i: integer value to add
- * @l: pointer to type local_t
- *
- * Atomically adds @i to @l and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static __inline__ int local_add_negative(long i, local_t *l)
-{
-       unsigned char c;
-
-       __asm__ __volatile__(
-               "addq %2,%0; sets %1"
-               :"=m" (l->a.counter), "=qm" (c)
-               :"ir" (i), "m" (l->a.counter) : "memory");
-       return c;
-}
-
-/**
- * local_add_return - add and return
- * @i: integer value to add
- * @l: pointer to type local_t
- *
- * Atomically adds @i to @l and returns @i + @l
- */
-static __inline__ long local_add_return(long i, local_t *l)
-{
-       long __i = i;
-       __asm__ __volatile__(
-               "xaddq %0, %1;"
-               :"+r" (i), "+m" (l->a.counter)
-               : : "memory");
-       return i + __i;
-}
-
-static __inline__ long local_sub_return(long i, local_t *l)
-{
-       return local_add_return(-i,l);
-}
-
-#define local_inc_return(l)  (local_add_return(1,l))
-#define local_dec_return(l)  (local_sub_return(1,l))
-
-#define local_cmpxchg(l, o, n) \
-       (cmpxchg_local(&((l)->a.counter), (o), (n)))
-/* Always has a lock prefix */
-#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
-
-/**
- * atomic_up_add_unless - add unless the number is a given value
- * @l: pointer of type local_t
- * @a: the amount to add to l...
- * @u: ...unless l is equal to u.
- *
- * Atomically adds @a to @l, so long as it was not @u.
- * Returns non-zero if @l was not @u, and zero otherwise.
- */
-#define local_add_unless(l, a, u)                              \
-({                                                             \
-       long c, old;                                            \
-       c = local_read(l);                                      \
-       for (;;) {                                              \
-               if (unlikely(c == (u)))                         \
-                       break;                                  \
-               old = local_cmpxchg((l), c, c + (a));   \
-               if (likely(old == c))                           \
-                       break;                                  \
-               c = old;                                        \
-       }                                                       \
-       c != (u);                                               \
-})
-#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
-
-/* On x86-64 these are better than the atomic variants on SMP kernels
-   because they dont use a lock prefix. */
-#define __local_inc(l)         local_inc(l)
-#define __local_dec(l)         local_dec(l)
-#define __local_add(i,l)       local_add((i),(l))
-#define __local_sub(i,l)       local_sub((i),(l))
-
-/* Use these for per-cpu local_t variables: on some archs they are
- * much more efficient than these naive implementations.  Note they take
- * a variable, not an address.
- *
- * This could be done better if we moved the per cpu data directly
- * after GS.
- */
-
-/* Need to disable preemption for the cpu local counters otherwise we could
-   still access a variable of a previous CPU in a non atomic way. */
-#define cpu_local_wrap_v(l)            \
-       ({ local_t res__;               \
-          preempt_disable();           \
-          res__ = (l);                 \
-          preempt_enable();            \
-          res__; })
-#define cpu_local_wrap(l)              \
-       ({ preempt_disable();           \
-          l;                           \
-          preempt_enable(); })         \
-
-#define cpu_local_read(l)    cpu_local_wrap_v(local_read(&__get_cpu_var(l)))
-#define cpu_local_set(l, i)  cpu_local_wrap(local_set(&__get_cpu_var(l), (i)))
-#define cpu_local_inc(l)     cpu_local_wrap(local_inc(&__get_cpu_var(l)))
-#define cpu_local_dec(l)     cpu_local_wrap(local_dec(&__get_cpu_var(l)))
-#define cpu_local_add(i, l)  cpu_local_wrap(local_add((i), &__get_cpu_var(l)))
-#define cpu_local_sub(i, l)  cpu_local_wrap(local_sub((i), &__get_cpu_var(l)))
-
-#define __cpu_local_inc(l)     cpu_local_inc(l)
-#define __cpu_local_dec(l)     cpu_local_dec(l)
-#define __cpu_local_add(i, l)  cpu_local_add((i), (l))
-#define __cpu_local_sub(i, l)  cpu_local_sub((i), (l))
-
-#endif /* _ARCH_X8664_LOCAL_H */
index ebd319f838ab5644392759e863fb284274993e82..6df235e8ea91b580bbaeb95b595a878c1b4c703a 100644 (file)
@@ -110,13 +110,13 @@ static inline int cpu_to_logical_apicid(int cpu)
 }
 
 static inline int mpc_apic_id(struct mpc_config_processor *m,
-                       struct mpc_config_translation *translation_record)
+                             struct mpc_config_translation *translation_record)
 {
-       printk("Processor #%d %ld:%ld APIC version %d\n",
-               m->mpc_apicid,
-               (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-               (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-               m->mpc_apicver);
+       printk("Processor #%d %u:%u APIC version %d\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver);
        return m->mpc_apicid;
 }
 
index 1f730b8bd1fd463af3d02d714812b7933703e261..989f34c37d321249f838efe86d4e9c2086b42233 100644 (file)
@@ -1,6 +1,4 @@
 /*
- *  include/asm-i386/mach-default/apm.h
- *
  *  Machine specific APM BIOS functions for generic.
  *  Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
  */
diff --git a/include/asm-x86/mach-default/io_ports.h b/include/asm-x86/mach-default/io_ports.h
deleted file mode 100644 (file)
index 48540ba..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- *  arch/i386/mach-generic/io_ports.h
- *
- *  Machine specific IO port address definition for generic.
- *  Written by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef _MACH_IO_PORTS_H
-#define _MACH_IO_PORTS_H
-
-/* i8259A PIC registers */
-#define PIC_MASTER_CMD         0x20
-#define PIC_MASTER_IMR         0x21
-#define PIC_MASTER_ISR         PIC_MASTER_CMD
-#define PIC_MASTER_POLL                PIC_MASTER_ISR
-#define PIC_MASTER_OCW3                PIC_MASTER_ISR
-#define PIC_SLAVE_CMD          0xa0
-#define PIC_SLAVE_IMR          0xa1
-
-/* i8259A PIC related value */
-#define PIC_CASCADE_IR         2
-#define MASTER_ICW4_DEFAULT    0x01
-#define SLAVE_ICW4_DEFAULT     0x01
-#define PIC_ICW4_AEOI          2
-
-#endif /* !_MACH_IO_PORTS_H */
index 6db1c3babe9abe7612c2f3b034def7b1a33a558f..e3c2c1012c1cd458324a6f25e4e0fb115d1920ab 100644 (file)
@@ -89,15 +89,15 @@ static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
        return physid_mask_of_physid(phys_apicid);
 }
 
-static inline int mpc_apic_id(struct mpc_config_processor *m, 
-                       struct mpc_config_translation *translation_record)
-{
-       printk("Processor #%d %ld:%ld APIC version %d\n",
-                       m->mpc_apicid,
-                       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-                       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-                       m->mpc_apicver);
-       return (m->mpc_apicid);
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+                             struct mpc_config_translation *translation_record)
+{
+       printk("Processor #%d %u:%u APIC version %d\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver);
+       return m->mpc_apicid;
 }
 
 static inline void setup_portio_remap(void)
diff --git a/include/asm-x86/mach-default/mach_time.h b/include/asm-x86/mach-default/mach_time.h
deleted file mode 100644 (file)
index 31eb5de..0000000
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- *  include/asm-i386/mach-default/mach_time.h
- *
- *  Machine specific set RTC function for generic.
- *  Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef _MACH_TIME_H
-#define _MACH_TIME_H
-
-#include <linux/mc146818rtc.h>
-
-/* for check timing call set_rtc_mmss() 500ms     */
-/* used in arch/i386/time.c::do_timer_interrupt() */
-#define USEC_AFTER     500000
-#define USEC_BEFORE    500000
-
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
- * called 500 ms after the second nowtime has started, because when
- * nowtime is written into the registers of the CMOS clock, it will
- * jump to the next second precisely 500 ms later. Check the Motorola
- * MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- *      sets the minutes. Usually you'll only notice that after reboot!
- */
-static inline int mach_set_rtc_mmss(unsigned long nowtime)
-{
-       int retval = 0;
-       int real_seconds, real_minutes, cmos_minutes;
-       unsigned char save_control, save_freq_select;
-
-       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
-       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
-       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
-       cmos_minutes = CMOS_READ(RTC_MINUTES);
-       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-               BCD_TO_BIN(cmos_minutes);
-
-       /*
-        * since we're only adjusting minutes and seconds,
-        * don't interfere with hour overflow. This avoids
-        * messing with unknown time zones but requires your
-        * RTC not to be off by more than 15 minutes
-        */
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
-               real_minutes += 30;             /* correct for half hour time zone */
-       real_minutes %= 60;
-
-       if (abs(real_minutes - cmos_minutes) < 30) {
-               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-                       BIN_TO_BCD(real_seconds);
-                       BIN_TO_BCD(real_minutes);
-               }
-               CMOS_WRITE(real_seconds,RTC_SECONDS);
-               CMOS_WRITE(real_minutes,RTC_MINUTES);
-       } else {
-               printk(KERN_WARNING
-                      "set_rtc_mmss: can't update from %d to %d\n",
-                      cmos_minutes, real_minutes);
-               retval = -1;
-       }
-
-       /* The following flags have to be released exactly in this order,
-        * otherwise the DS12887 (popular MC146818A clone with integrated
-        * battery and quartz) will not reset the oscillator and will not
-        * update precisely 500 ms later. You won't find this mentioned in
-        * the Dallas Semiconductor data sheets, but who believes data
-        * sheets anyway ...                           -- Markus Kuhn
-        */
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
-       return retval;
-}
-
-static inline unsigned long mach_get_cmos_time(void)
-{
-       unsigned int year, mon, day, hour, min, sec;
-
-       do {
-               sec = CMOS_READ(RTC_SECONDS);
-               min = CMOS_READ(RTC_MINUTES);
-               hour = CMOS_READ(RTC_HOURS);
-               day = CMOS_READ(RTC_DAY_OF_MONTH);
-               mon = CMOS_READ(RTC_MONTH);
-               year = CMOS_READ(RTC_YEAR);
-       } while (sec != CMOS_READ(RTC_SECONDS));
-
-       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-               BCD_TO_BIN(sec);
-               BCD_TO_BIN(min);
-               BCD_TO_BIN(hour);
-               BCD_TO_BIN(day);
-               BCD_TO_BIN(mon);
-               BCD_TO_BIN(year);
-       }
-
-       year += 1900;
-       if (year < 1970)
-               year += 100;
-
-       return mktime(year, mon, day, hour, min, sec);
-}
-
-#endif /* !_MACH_TIME_H */
index 807992fd4171b78fe9d86287706e590f9d11deb2..4b76e536cd986307b071c8c11d2bff4c30b4112f 100644 (file)
@@ -1,6 +1,4 @@
 /*
- *  include/asm-i386/mach-default/mach_timer.h
- *
  *  Machine specific calibrate_tsc() for generic.
  *  Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
  */
index 625438b8a6eb485a52e7df429e315a653b33d8b7..2fe7705c0484ed074a5d0b2f9e2bf7acc88cd7cf 100644 (file)
@@ -1,6 +1,4 @@
 /*
- *  include/asm-i386/mach-default/mach_traps.h
- *
  *  Machine specific NMI handling for generic.
  *  Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
  */
index caec64be516d4c47f713b3f5d086d1ad7cf9e88a..d23011fdf454fa75143cf903788db5191d86e54c 100644 (file)
@@ -131,11 +131,11 @@ static inline int cpu_to_logical_apicid(int cpu)
 
 static inline int mpc_apic_id(struct mpc_config_processor *m, struct mpc_config_translation *unused)
 {
-       printk("Processor #%d %ld:%ld APIC version %d\n",
-               m->mpc_apicid,
-               (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-               (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-               m->mpc_apicver);
+       printk("Processor #%d %u:%u APIC version %d\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver);
        return (m->mpc_apicid);
 }
 
diff --git a/include/asm-x86/mach-generic/gpio.h b/include/asm-x86/mach-generic/gpio.h
new file mode 100644 (file)
index 0000000..5305dcb
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef __ASM_MACH_GENERIC_GPIO_H
+#define __ASM_MACH_GENERIC_GPIO_H
+
+int gpio_request(unsigned gpio, const char *label);
+void gpio_free(unsigned gpio);
+int gpio_direction_input(unsigned gpio);
+int gpio_direction_output(unsigned gpio, int value);
+int gpio_get_value(unsigned gpio);
+void gpio_set_value(unsigned gpio, int value);
+int gpio_to_irq(unsigned gpio);
+int irq_to_gpio(unsigned irq);
+
+#include <asm-generic/gpio.h>           /* cansleep wrappers */
+
+#endif /* __ASM_MACH_GENERIC_GPIO_H */
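A brief sketch of a driver using the arch-neutral GPIO API declared above; the GPIO number and label are illustrative assumptions.

static int example_drive_led(void)
{
	int err;

	err = gpio_request(5, "example-led");
	if (err)
		return err;

	gpio_direction_output(5, 0);	/* configure as output, start low */
	gpio_set_value(5, 1);		/* drive the line high */
	gpio_free(5);
	return 0;
}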
index 5e5e7dd2692ef91a888ee4c3c7e586627219f656..17e183bd39c1395d044489f58f17a09bbefe0e89 100644 (file)
@@ -101,11 +101,11 @@ static inline int mpc_apic_id(struct mpc_config_processor *m,
        int quad = translation_record->trans_quad;
        int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
 
-       printk("Processor #%d %ld:%ld APIC version %d (quad %d, apic %d)\n",
-                       m->mpc_apicid,
-                       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-                       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-                       m->mpc_apicver, quad, logical_apicid);
+       printk("Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver, quad, logical_apicid);
        return logical_apicid;
 }
 
diff --git a/include/asm-x86/mach-rdc321x/gpio.h b/include/asm-x86/mach-rdc321x/gpio.h
new file mode 100644 (file)
index 0000000..db31b92
--- /dev/null
@@ -0,0 +1,56 @@
+#ifndef _RDC321X_GPIO_H
+#define _RDC321X_GPIO_H
+
+extern int rdc_gpio_get_value(unsigned gpio);
+extern void rdc_gpio_set_value(unsigned gpio, int value);
+extern int rdc_gpio_direction_input(unsigned gpio);
+extern int rdc_gpio_direction_output(unsigned gpio, int value);
+
+
+/* Wrappers for the arch-neutral GPIO API */
+
+static inline int gpio_request(unsigned gpio, const char *label)
+{
+       /* Not yet implemented */
+       return 0;
+}
+
+static inline void gpio_free(unsigned gpio)
+{
+       /* Not yet implemented */
+}
+
+static inline int gpio_direction_input(unsigned gpio)
+{
+       return rdc_gpio_direction_input(gpio);
+}
+
+static inline int gpio_direction_output(unsigned gpio, int value)
+{
+       return rdc_gpio_direction_output(gpio, value);
+}
+
+static inline int gpio_get_value(unsigned gpio)
+{
+       return rdc_gpio_get_value(gpio);
+}
+
+static inline void gpio_set_value(unsigned gpio, int value)
+{
+       rdc_gpio_set_value(gpio, value);
+}
+
+static inline int gpio_to_irq(unsigned gpio)
+{
+       return gpio;
+}
+
+static inline int irq_to_gpio(unsigned irq)
+{
+       return irq;
+}
+
+/* For cansleep */
+#include <asm-generic/gpio.h>
+
+#endif /* _RDC321X_GPIO_H_ */
diff --git a/include/asm-x86/mach-rdc321x/rdc321x_defs.h b/include/asm-x86/mach-rdc321x/rdc321x_defs.h
new file mode 100644 (file)
index 0000000..838ba8f
--- /dev/null
@@ -0,0 +1,6 @@
+#define PFX    "rdc321x: "
+
+/* General purpose configuration and data registers */
+#define RDC3210_CFGREG_ADDR     0x0CF8
+#define RDC3210_CFGREG_DATA     0x0CFC
+#define RDC_MAX_GPIO           0x3A
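
RDC3210_CFGREG_ADDR/RDC3210_CFGREG_DATA above are the standard PCI configuration mechanism #1 address and data ports, so the RDC321x code reaches its GPIO registers through config space. A small illustration of how such a config-space dword address is typically composed; the bus/device/function/register numbers are made up:

#include <stdio.h>
#include <stdint.h>

/* Configuration mechanism #1: write this word to 0x0CF8, then access the
 * selected dword through 0x0CFC. */
static uint32_t pci_conf1_addr(unsigned bus, unsigned dev, unsigned fn, unsigned reg)
{
        return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8) | (reg & 0xfc);
}

int main(void)
{
        /* Hypothetical register 0x48 on bus 0, device 7, function 0. */
        printf("0x%08x\n", (unsigned)pci_conf1_addr(0, 7, 0, 0x48));
        return 0;
}
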
index 732f776aab8e68ca8501e314cfdc8cd39e14250b..062c97f6100b06e1259b0fcb2fcc746ca969bee0 100644 (file)
@@ -126,15 +126,15 @@ static inline physid_mask_t apicid_to_cpu_present(int apicid)
        return physid_mask_of_physid(0);
 }
 
-static inline int mpc_apic_id(struct mpc_config_processor *m, 
-                       struct mpc_config_translation *translation_record)
-{
-       printk("Processor #%d %ld:%ld APIC version %d\n",
-                       m->mpc_apicid,
-                       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-                       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-                       m->mpc_apicver);
-       return (m->mpc_apicid);
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+                             struct mpc_config_translation *translation_record)
+{
+       printk("Processor #%d %u:%u APIC version %d\n",
+              m->mpc_apicid,
+              (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+              (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+              m->mpc_apicver);
+       return m->mpc_apicid;
 }
 
 static inline void setup_portio_remap(void)
index a4b0aa3320e681f802cb7b297766ec58f26d7f9c..9bf4ae93ab10b3a549efaac717fcdce0f81a158d 100644 (file)
@@ -1,11 +1,6 @@
 #ifndef _I386_MATH_EMU_H
 #define _I386_MATH_EMU_H
 
-#include <asm/sigcontext.h>
-
-int restore_i387_soft(void *s387, struct _fpstate __user *buf);
-int save_i387_soft(void *s387, struct _fpstate __user *buf);
-
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
    automatically by the 80386/80486.
index 5c2bb66caf177b751dfa5c47f3805ffdd326b98f..cdd9f965835a66d35b3a1b8d56e43f2dd715eb83 100644 (file)
@@ -1,5 +1,100 @@
-#ifdef CONFIG_X86_32
-# include "mc146818rtc_32.h"
+/*
+ * Machine dependent access functions for RTC registers.
+ */
+#ifndef _ASM_MC146818RTC_H
+#define _ASM_MC146818RTC_H
+
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <linux/mc146818rtc.h>
+
+#ifndef RTC_PORT
+#define RTC_PORT(x)    (0x70 + (x))
+#define RTC_ALWAYS_BCD 1       /* RTC operates in binary mode */
+#endif
+
+#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+/*
+ * This lock provides nmi access to the CMOS/RTC registers.  It has some
+ * special properties.  It is owned by a CPU and stores the index register
+ * currently being accessed (if owned).  The idea here is that it works
+ * like a normal lock (normally).  However, in an NMI, the NMI code will
+ * first check to see if its CPU owns the lock, meaning that the NMI
+ * interrupted during the read/write of the device.  If it does, it goes ahead
+ * and performs the access and then restores the index register.  If it does
+ * not, it locks normally.
+ *
+ * Note that since we are working with NMIs, we need this lock even in
+ * a non-SMP machine just to mark that the lock is owned.
+ *
+ * This only works with compare-and-swap.  There is no other way to
+ * atomically claim the lock and set the owner.
+ */
+#include <linux/smp.h>
+extern volatile unsigned long cmos_lock;
+
+/*
+ * All of these below must be called with interrupts off, preempt
+ * disabled, etc.
+ */
+
+static inline void lock_cmos(unsigned char reg)
+{
+       unsigned long new;
+       new = ((smp_processor_id()+1) << 8) | reg;
+       for (;;) {
+               if (cmos_lock) {
+                       cpu_relax();
+                       continue;
+               }
+               if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
+                       return;
+       }
+}
+
+static inline void unlock_cmos(void)
+{
+       cmos_lock = 0;
+}
+static inline int do_i_have_lock_cmos(void)
+{
+       return (cmos_lock >> 8) == (smp_processor_id()+1);
+}
+static inline unsigned char current_lock_cmos_reg(void)
+{
+       return cmos_lock & 0xff;
+}
+#define lock_cmos_prefix(reg) \
+       do {                                    \
+               unsigned long cmos_flags;       \
+               local_irq_save(cmos_flags);     \
+               lock_cmos(reg)
+#define lock_cmos_suffix(reg) \
+               unlock_cmos();                  \
+               local_irq_restore(cmos_flags);  \
+       } while (0)
 #else
-# include "mc146818rtc_64.h"
+#define lock_cmos_prefix(reg) do {} while (0)
+#define lock_cmos_suffix(reg) do {} while (0)
+#define lock_cmos(reg)
+#define unlock_cmos()
+#define do_i_have_lock_cmos() 0
+#define current_lock_cmos_reg() 0
 #endif
+
+/*
+ * The yet supported machines all access the RTC index register via
+ * an ISA port access but the way to access the date register differs ...
+ */
+#define CMOS_READ(addr) rtc_cmos_read(addr)
+#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
+unsigned char rtc_cmos_read(unsigned char addr);
+void rtc_cmos_write(unsigned char val, unsigned char addr);
+
+extern int mach_set_rtc_mmss(unsigned long nowtime);
+extern unsigned long mach_get_cmos_time(void);
+
+#define RTC_IRQ 8
+
+#endif /* _ASM_MC146818RTC_H */
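
The lock_cmos_prefix()/lock_cmos_suffix() pair above is deliberately unbalanced: the prefix opens a do { } block, saves interrupts and takes cmos_lock, and the suffix releases both and closes the block, so the caller's index/data port access sits inside the critical section. A sketch of the intended call pattern, roughly what an implementation of the rtc_cmos_read() declared above would do:

/* Sketch only: assumes the macros and RTC_PORT() above are in scope. */
static unsigned char example_cmos_read(unsigned char addr)
{
        unsigned char val;

        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));      /* select the index register */
        val = inb_p(RTC_PORT(1));       /* read the data register */
        lock_cmos_suffix(addr);

        return val;
}
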
diff --git a/include/asm-x86/mc146818rtc_32.h b/include/asm-x86/mc146818rtc_32.h
deleted file mode 100644 (file)
index 1613b42..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-#include <asm/io.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <linux/mc146818rtc.h>
-
-#ifndef RTC_PORT
-#define RTC_PORT(x)    (0x70 + (x))
-#define RTC_ALWAYS_BCD 1       /* RTC operates in binary mode */
-#endif
-
-#ifdef __HAVE_ARCH_CMPXCHG
-/*
- * This lock provides nmi access to the CMOS/RTC registers.  It has some
- * special properties.  It is owned by a CPU and stores the index register
- * currently being accessed (if owned).  The idea here is that it works
- * like a normal lock (normally).  However, in an NMI, the NMI code will
- * first check to see if its CPU owns the lock, meaning that the NMI
- * interrupted during the read/write of the device.  If it does, it goes ahead
- * and performs the access and then restores the index register.  If it does
- * not, it locks normally.
- *
- * Note that since we are working with NMIs, we need this lock even in
- * a non-SMP machine just to mark that the lock is owned.
- *
- * This only works with compare-and-swap.  There is no other way to
- * atomically claim the lock and set the owner.
- */
-#include <linux/smp.h>
-extern volatile unsigned long cmos_lock;
-
-/*
- * All of these below must be called with interrupts off, preempt
- * disabled, etc.
- */
-
-static inline void lock_cmos(unsigned char reg)
-{
-       unsigned long new;
-       new = ((smp_processor_id()+1) << 8) | reg;
-       for (;;) {
-               if (cmos_lock) {
-                       cpu_relax();
-                       continue;
-               }
-               if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
-                       return;
-       }
-}
-
-static inline void unlock_cmos(void)
-{
-       cmos_lock = 0;
-}
-static inline int do_i_have_lock_cmos(void)
-{
-       return (cmos_lock >> 8) == (smp_processor_id()+1);
-}
-static inline unsigned char current_lock_cmos_reg(void)
-{
-       return cmos_lock & 0xff;
-}
-#define lock_cmos_prefix(reg) \
-       do {                                    \
-               unsigned long cmos_flags;       \
-               local_irq_save(cmos_flags);     \
-               lock_cmos(reg)
-#define lock_cmos_suffix(reg) \
-               unlock_cmos();                  \
-               local_irq_restore(cmos_flags);  \
-       } while (0)
-#else
-#define lock_cmos_prefix(reg) do {} while (0)
-#define lock_cmos_suffix(reg) do {} while (0)
-#define lock_cmos(reg)
-#define unlock_cmos()
-#define do_i_have_lock_cmos() 0
-#define current_lock_cmos_reg() 0
-#endif
-
-/*
- * The yet supported machines all access the RTC index register via
- * an ISA port access but the way to access the date register differs ...
- */
-#define CMOS_READ(addr) rtc_cmos_read(addr)
-#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
-unsigned char rtc_cmos_read(unsigned char addr);
-void rtc_cmos_write(unsigned char val, unsigned char addr);
-
-#define RTC_IRQ 8
-
-#endif /* _ASM_MC146818RTC_H */
diff --git a/include/asm-x86/mc146818rtc_64.h b/include/asm-x86/mc146818rtc_64.h
deleted file mode 100644 (file)
index d6e3009..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-#include <asm/io.h>
-
-#ifndef RTC_PORT
-#define RTC_PORT(x)    (0x70 + (x))
-#define RTC_ALWAYS_BCD 1       /* RTC operates in binary mode */
-#endif
-
-/*
- * The yet supported machines all access the RTC index register via
- * an ISA port access but the way to access the date register differs ...
- */
-#define CMOS_READ(addr) ({ \
-outb_p((addr),RTC_PORT(0)); \
-inb_p(RTC_PORT(1)); \
-})
-#define CMOS_WRITE(val, addr) ({ \
-outb_p((addr),RTC_PORT(0)); \
-outb_p((val),RTC_PORT(1)); \
-})
-
-#define RTC_IRQ 8
-
-#endif /* _ASM_MC146818RTC_H */
index df304fd89c27860e9c709d78a5261da6c2af77a7..94f1fd79e22a9512511b9660363290b535e7424e 100644 (file)
@@ -13,7 +13,7 @@
 #define MCG_CTL_P       (1UL<<8)   /* MCG_CAP register available */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
-#define MCG_STATUS_EIPV  (1UL<<1)   /* eip points to correct instruction */
+#define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP  (1UL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL   (1UL<<63)  /* valid error */
@@ -30,7 +30,7 @@ struct mce {
        __u64 misc;
        __u64 addr;
        __u64 mcgstatus;
-       __u64 rip;
+       __u64 ip;
        __u64 tsc;      /* cpu time stamp counter */
        __u64 res1;     /* for future extension */
        __u64 res2;     /* dito. */
@@ -85,14 +85,7 @@ struct mce_log {
 #ifdef __KERNEL__
 
 #ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_MCE
-extern void mcheck_init(struct cpuinfo_x86 *c);
-#else
-#define mcheck_init(c) do {} while(0)
-#endif
-
 extern int mce_disabled;
-
 #else /* CONFIG_X86_32 */
 
 #include <asm/atomic.h>
@@ -121,6 +114,13 @@ extern int mce_notify_user(void);
 
 #endif /* !CONFIG_X86_32 */
 
+
+
+#ifdef CONFIG_X86_MCE
+extern void mcheck_init(struct cpuinfo_x86 *c);
+#else
+#define mcheck_init(c) do { } while (0)
+#endif
 extern void stop_mce(void);
 extern void restart_mce(void);
 
diff --git a/include/asm-x86/mmsegment.h b/include/asm-x86/mmsegment.h
deleted file mode 100644 (file)
index d3f80c9..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_MMSEGMENT_H
-#define _ASM_MMSEGMENT_H 1
-
-typedef struct {
-       unsigned long seg;
-} mm_segment_t;
-
-#endif
index 3f922c8e1c881fcba9f779c75648d54225f15481..efa962c388975c916e3dec785e4b09669e764d42 100644 (file)
@@ -20,4 +20,12 @@ typedef struct {
        void *vdso;
 } mm_context_t;
 
+#ifdef CONFIG_SMP
+void leave_mm(int cpu);
+#else
+static inline void leave_mm(int cpu)
+{
+}
+#endif
+
 #endif /* _ASM_X86_MMU_H */
index 7eb0b0b1fb3c3e24899eaa3ba0fa378857f77e62..8198d1cca1f31264dc6b0ee4da90cdd0c13cd323 100644 (file)
@@ -32,8 +32,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
-void leave_mm(unsigned long cpu);
-
 static inline void switch_mm(struct mm_struct *prev,
                             struct mm_struct *next,
                             struct task_struct *tsk)
index 0cce83a78378ec20a12a348c8b907d4b2386a6cf..ad6dc821ef9ecc8a9e139f9fae15205f6ab52ebf 100644 (file)
@@ -7,7 +7,9 @@
 #include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#ifndef CONFIG_PARAVIRT
 #include <asm-generic/mm_hooks.h>
+#endif
 
 /*
  * possibly do the LDT unload here?
@@ -23,11 +25,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 #endif
 }
 
-static inline void load_cr3(pgd_t *pgd)
-{
-       asm volatile("movq %0,%%cr3" :: "r" (__pa(pgd)) : "memory");
-}
-
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
                             struct task_struct *tsk)
 {
@@ -43,20 +40,20 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                load_cr3(next->pgd);
 
                if (unlikely(next->context.ldt != prev->context.ldt)) 
-                       load_LDT_nolock(&next->context, cpu);
+                       load_LDT_nolock(&next->context);
        }
 #ifdef CONFIG_SMP
        else {
                write_pda(mmu_state, TLBSTATE_OK);
                if (read_pda(active_mm) != next)
-                       out_of_line_bug();
+                       BUG();
                if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
                        /* We were in lazy tlb mode and leave_mm disabled 
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         */
                        load_cr3(next->pgd);
-                       load_LDT_nolock(&next->context, cpu);
+                       load_LDT_nolock(&next->context);
                }
        }
 #endif
index 118e9812778fd857f832632d2cbaa38a888cf9b0..5d6f4ce6e6d64b9fe7dce596f67e4b1165147e77 100644 (file)
@@ -87,9 +87,6 @@ static inline int pfn_to_nid(unsigned long pfn)
        __pgdat->node_start_pfn + __pgdat->node_spanned_pages;          \
 })
 
-/* XXX: FIXME -- wli */
-#define kern_addr_valid(kaddr) (0)
-
 #ifdef CONFIG_X86_NUMAQ            /* we have contiguous memory on NUMA-Q */
 #define pfn_valid(pfn)          ((pfn) < num_physpages)
 #else
index 19a89377b123b125d64fe50c24327bc0eac9c202..ebaf9663aa8aa9e42cd1297bdc123989020d2d1e 100644 (file)
@@ -15,9 +15,9 @@
 struct memnode {
        int shift;
        unsigned int mapsize;
-       u8 *map;
-       u8 embedded_map[64-16];
-} ____cacheline_aligned; /* total size = 64 bytes */
+       s16 *map;
+       s16 embedded_map[64-8];
+} ____cacheline_aligned; /* total size = 128 bytes */
 extern struct memnode memnode;
 #define memnode_shift memnode.shift
 #define memnodemap memnode.map
@@ -41,11 +41,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
                                 NODE_DATA(nid)->node_spanned_pages)
 
-#ifdef CONFIG_DISCONTIGMEM
-#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
-
-extern int pfn_valid(unsigned long pfn);
-#endif
+extern int early_pfn_to_nid(unsigned long pfn);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE     (64*1024*1024)
index 2b2f18d8a5317e1755fe16c220bedf76ce4c417b..bfedb247871cfbc39ecb2823f428ac62f57eb9f4 100644 (file)
@@ -1,5 +1,82 @@
+#ifndef _ASM_MODULE_H
+#define _ASM_MODULE_H
+
+/* x86_32/64 are simple */
+struct mod_arch_specific {};
+
 #ifdef CONFIG_X86_32
-# include "module_32.h"
+# define Elf_Shdr Elf32_Shdr
+# define Elf_Sym Elf32_Sym
+# define Elf_Ehdr Elf32_Ehdr
 #else
-# include "module_64.h"
+# define Elf_Shdr Elf64_Shdr
+# define Elf_Sym Elf64_Sym
+# define Elf_Ehdr Elf64_Ehdr
 #endif
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M386
+#define MODULE_PROC_FAMILY "386 "
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MK8
+#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_X86_ELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP2
+#define MODULE_PROC_FAMILY "WINCHIP2 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_4KSTACKS
+#  define MODULE_STACKSIZE "4KSTACKS "
+# else
+#  define MODULE_STACKSIZE ""
+# endif
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
+#endif
+
+#endif /* _ASM_MODULE_H */
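
On 32-bit, MODULE_ARCH_VERMAGIC is simply the processor-family string concatenated with the stack-size string, which is how vermagic checks reject modules built for the wrong CPU family or stack size. A tiny sketch of the resulting fragment, assuming CONFIG_M686 and CONFIG_4KSTACKS had been selected:

#include <stdio.h>

/* Assumed configuration: CONFIG_M686 and CONFIG_4KSTACKS. */
#define MODULE_PROC_FAMILY "686 "
#define MODULE_STACKSIZE   "4KSTACKS "
#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE

int main(void)
{
        printf("\"%s\"\n", MODULE_ARCH_VERMAGIC);       /* prints "686 4KSTACKS " */
        return 0;
}
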
diff --git a/include/asm-x86/module_32.h b/include/asm-x86/module_32.h
deleted file mode 100644 (file)
index 7e5fda6..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _ASM_I386_MODULE_H
-#define _ASM_I386_MODULE_H
-
-/* x86 is simple */
-struct mod_arch_specific
-{
-};
-
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Ehdr Elf32_Ehdr
-
-#ifdef CONFIG_M386
-#define MODULE_PROC_FAMILY "386 "
-#elif defined CONFIG_M486
-#define MODULE_PROC_FAMILY "486 "
-#elif defined CONFIG_M586
-#define MODULE_PROC_FAMILY "586 "
-#elif defined CONFIG_M586TSC
-#define MODULE_PROC_FAMILY "586TSC "
-#elif defined CONFIG_M586MMX
-#define MODULE_PROC_FAMILY "586MMX "
-#elif defined CONFIG_MCORE2
-#define MODULE_PROC_FAMILY "CORE2 "
-#elif defined CONFIG_M686
-#define MODULE_PROC_FAMILY "686 "
-#elif defined CONFIG_MPENTIUMII
-#define MODULE_PROC_FAMILY "PENTIUMII "
-#elif defined CONFIG_MPENTIUMIII
-#define MODULE_PROC_FAMILY "PENTIUMIII "
-#elif defined CONFIG_MPENTIUMM
-#define MODULE_PROC_FAMILY "PENTIUMM "
-#elif defined CONFIG_MPENTIUM4
-#define MODULE_PROC_FAMILY "PENTIUM4 "
-#elif defined CONFIG_MK6
-#define MODULE_PROC_FAMILY "K6 "
-#elif defined CONFIG_MK7
-#define MODULE_PROC_FAMILY "K7 "
-#elif defined CONFIG_MK8
-#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_X86_ELAN
-#define MODULE_PROC_FAMILY "ELAN "
-#elif defined CONFIG_MCRUSOE
-#define MODULE_PROC_FAMILY "CRUSOE "
-#elif defined CONFIG_MEFFICEON
-#define MODULE_PROC_FAMILY "EFFICEON "
-#elif defined CONFIG_MWINCHIPC6
-#define MODULE_PROC_FAMILY "WINCHIPC6 "
-#elif defined CONFIG_MWINCHIP2
-#define MODULE_PROC_FAMILY "WINCHIP2 "
-#elif defined CONFIG_MWINCHIP3D
-#define MODULE_PROC_FAMILY "WINCHIP3D "
-#elif defined CONFIG_MCYRIXIII
-#define MODULE_PROC_FAMILY "CYRIXIII "
-#elif defined CONFIG_MVIAC3_2
-#define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif defined CONFIG_MVIAC7
-#define MODULE_PROC_FAMILY "VIAC7 "
-#elif defined CONFIG_MGEODEGX1
-#define MODULE_PROC_FAMILY "GEODEGX1 "
-#elif defined CONFIG_MGEODE_LX
-#define MODULE_PROC_FAMILY "GEODE "
-#else
-#error unknown processor family
-#endif
-
-#ifdef CONFIG_4KSTACKS
-#define MODULE_STACKSIZE "4KSTACKS "
-#else
-#define MODULE_STACKSIZE ""
-#endif
-
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
-
-#endif /* _ASM_I386_MODULE_H */
diff --git a/include/asm-x86/module_64.h b/include/asm-x86/module_64.h
deleted file mode 100644 (file)
index 67f8f69..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X8664_MODULE_H
-#define _ASM_X8664_MODULE_H
-
-struct mod_arch_specific {}; 
-
-#define Elf_Shdr Elf64_Shdr
-#define Elf_Sym Elf64_Sym
-#define Elf_Ehdr Elf64_Ehdr
-
-#endif 
index 8f268e8fd2e9d4ad06c6751c0f6e087af4d1912a..781ad74ab9e9e1a519fd27a7fb42c347bff52743 100644 (file)
@@ -1,5 +1,117 @@
+#ifndef _AM_X86_MPSPEC_H
+#define _AM_X86_MPSPEC_H
+
+#include <asm/mpspec_def.h>
+
 #ifdef CONFIG_X86_32
-# include "mpspec_32.h"
+#include <mach_mpspec.h>
+
+extern int mp_bus_id_to_type[MAX_MP_BUSSES];
+extern int mp_bus_id_to_node[MAX_MP_BUSSES];
+extern int mp_bus_id_to_local[MAX_MP_BUSSES];
+extern int quad_local_to_mp_bus_id[NR_CPUS/4][4];
+
+extern unsigned int def_to_bigsmp;
+extern int apic_version[MAX_APICS];
+extern u8 apicid_2_node[];
+extern int pic_mode;
+
+#define MAX_APICID 256
+
 #else
-# include "mpspec_64.h"
+
+#define MAX_MP_BUSSES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
+
+extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+#endif
+
+extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES];
+
+extern unsigned int boot_cpu_physical_apicid;
+extern int smp_found_config;
+extern int nr_ioapics;
+extern int mp_irq_entries;
+extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+extern int mpc_default_type;
+extern unsigned long mp_lapic_addr;
+
+extern void find_smp_config(void);
+extern void get_smp_config(void);
+
+#ifdef CONFIG_ACPI
+extern void mp_register_lapic(u8 id, u8 enabled);
+extern void mp_register_lapic_address(u64 address);
+extern void mp_register_ioapic(u8 id, u32 address, u32 gsi_base);
+extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+                                  u32 gsi);
+extern void mp_config_acpi_legacy_irqs(void);
+extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+#endif /* CONFIG_ACPI */
+
+#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_APICS)
+
+struct physid_mask
+{
+       unsigned long mask[PHYSID_ARRAY_SIZE];
+};
+
+typedef struct physid_mask physid_mask_t;
+
+#define physid_set(physid, map)                        set_bit(physid, (map).mask)
+#define physid_clear(physid, map)              clear_bit(physid, (map).mask)
+#define physid_isset(physid, map)              test_bit(physid, (map).mask)
+#define physid_test_and_set(physid, map) \
+       test_and_set_bit(physid, (map).mask)
+
+#define physids_and(dst, src1, src2) \
+       bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_or(dst, src1, src2) \
+       bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+
+#define physids_clear(map) \
+       bitmap_zero((map).mask, MAX_APICS)
+
+#define physids_complement(dst, src) \
+       bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+
+#define physids_empty(map) \
+       bitmap_empty((map).mask, MAX_APICS)
+
+#define physids_equal(map1, map2) \
+       bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+
+#define physids_weight(map) \
+       bitmap_weight((map).mask, MAX_APICS)
+
+#define physids_shift_right(d, s, n) \
+       bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_shift_left(d, s, n) \
+       bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+
+#define physids_coerce(map)                    ((map).mask[0])
+
+#define physids_promote(physids)                                       \
+       ({                                                              \
+               physid_mask_t __physid_mask = PHYSID_MASK_NONE;         \
+               __physid_mask.mask[0] = physids;                        \
+               __physid_mask;                                          \
+       })
+
+#define physid_mask_of_physid(physid)                                  \
+       ({                                                              \
+               physid_mask_t __physid_mask = PHYSID_MASK_NONE;         \
+               physid_set(physid, __physid_mask);                      \
+               __physid_mask;                                          \
+       })
+
+#define PHYSID_MASK_ALL                { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
+#define PHYSID_MASK_NONE       { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
+
+extern physid_mask_t phys_cpu_present_map;
+
 #endif
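
physid_mask_t above is just a bitmap of APIC IDs, and the physid_*/physids_* macros wrap the generic bitmap helpers. A short sketch of typical use, assuming the declarations above are visible; the APIC ID 3 is illustrative:

/* Mark a hypothetical APIC ID as present and query the global map. */
static int example_mark_and_test_apicid(void)
{
        physid_mask_t map;

        map = physid_mask_of_physid(3);         /* mask with only APIC ID 3 set */
        physids_or(phys_cpu_present_map, phys_cpu_present_map, map);

        return physid_isset(3, phys_cpu_present_map);   /* now non-zero */
}
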
diff --git a/include/asm-x86/mpspec_32.h b/include/asm-x86/mpspec_32.h
deleted file mode 100644 (file)
index f213493..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef __ASM_MPSPEC_H
-#define __ASM_MPSPEC_H
-
-#include <linux/cpumask.h>
-#include <asm/mpspec_def.h>
-#include <mach_mpspec.h>
-
-extern int mp_bus_id_to_type [MAX_MP_BUSSES];
-extern int mp_bus_id_to_node [MAX_MP_BUSSES];
-extern int mp_bus_id_to_local [MAX_MP_BUSSES];
-extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
-
-extern unsigned int def_to_bigsmp;
-extern unsigned int boot_cpu_physical_apicid;
-extern int smp_found_config;
-extern void find_smp_config (void);
-extern void get_smp_config (void);
-extern int nr_ioapics;
-extern int apic_version [MAX_APICS];
-extern int mp_irq_entries;
-extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
-extern int mpc_default_type;
-extern unsigned long mp_lapic_addr;
-extern int pic_mode;
-
-#ifdef CONFIG_ACPI
-extern void mp_register_lapic (u8 id, u8 enabled);
-extern void mp_register_lapic_address (u64 address);
-extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi);
-extern void mp_config_acpi_legacy_irqs (void);
-extern int mp_register_gsi (u32 gsi, int edge_level, int active_high_low);
-#endif /* CONFIG_ACPI */
-
-#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_APICS)
-
-struct physid_mask
-{
-       unsigned long mask[PHYSID_ARRAY_SIZE];
-};
-
-typedef struct physid_mask physid_mask_t;
-
-#define physid_set(physid, map)                        set_bit(physid, (map).mask)
-#define physid_clear(physid, map)              clear_bit(physid, (map).mask)
-#define physid_isset(physid, map)              test_bit(physid, (map).mask)
-#define physid_test_and_set(physid, map)       test_and_set_bit(physid, (map).mask)
-
-#define physids_and(dst, src1, src2)           bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_or(dst, src1, src2)            bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)                     bitmap_zero((map).mask, MAX_APICS)
-#define physids_complement(dst, src)           bitmap_complement((dst).mask,(src).mask, MAX_APICS)
-#define physids_empty(map)                     bitmap_empty((map).mask, MAX_APICS)
-#define physids_equal(map1, map2)              bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
-#define physids_weight(map)                    bitmap_weight((map).mask, MAX_APICS)
-#define physids_shift_right(d, s, n)           bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
-#define physids_shift_left(d, s, n)            bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
-#define physids_coerce(map)                    ((map).mask[0])
-
-#define physids_promote(physids)                                               \
-       ({                                                                      \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;                 \
-               __physid_mask.mask[0] = physids;                                \
-               __physid_mask;                                                  \
-       })
-
-#define physid_mask_of_physid(physid)                                          \
-       ({                                                                      \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;                 \
-               physid_set(physid, __physid_mask);                              \
-               __physid_mask;                                                  \
-       })
-
-#define PHYSID_MASK_ALL                { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
-#define PHYSID_MASK_NONE       { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
-
-extern physid_mask_t phys_cpu_present_map;
-
-#endif
-
diff --git a/include/asm-x86/mpspec_64.h b/include/asm-x86/mpspec_64.h
deleted file mode 100644 (file)
index 017fddb..0000000
+++ /dev/null
@@ -1,233 +0,0 @@
-#ifndef __ASM_MPSPEC_H
-#define __ASM_MPSPEC_H
-
-/*
- * Structure definitions for SMP machines following the
- * Intel Multiprocessing Specification 1.1 and 1.4.
- */
-
-/*
- * This tag identifies where the SMP configuration
- * information is. 
- */
-#define SMP_MAGIC_IDENT        (('_'<<24)|('P'<<16)|('M'<<8)|'_')
-
-/*
- * A maximum of 255 APICs with the current APIC ID architecture.
- */
-#define MAX_APICS 255
-
-struct intel_mp_floating
-{
-       char mpf_signature[4];          /* "_MP_"                       */
-       unsigned int mpf_physptr;       /* Configuration table address  */
-       unsigned char mpf_length;       /* Our length (paragraphs)      */
-       unsigned char mpf_specification;/* Specification version        */
-       unsigned char mpf_checksum;     /* Checksum (makes sum 0)       */
-       unsigned char mpf_feature1;     /* Standard or configuration ?  */
-       unsigned char mpf_feature2;     /* Bit7 set for IMCR|PIC        */
-       unsigned char mpf_feature3;     /* Unused (0)                   */
-       unsigned char mpf_feature4;     /* Unused (0)                   */
-       unsigned char mpf_feature5;     /* Unused (0)                   */
-};
-
-struct mp_config_table
-{
-       char mpc_signature[4];
-#define MPC_SIGNATURE "PCMP"
-       unsigned short mpc_length;      /* Size of table */
-       char  mpc_spec;                 /* 0x01 */
-       char  mpc_checksum;
-       char  mpc_oem[8];
-       char  mpc_productid[12];
-       unsigned int mpc_oemptr;        /* 0 if not present */
-       unsigned short mpc_oemsize;     /* 0 if not present */
-       unsigned short mpc_oemcount;
-       unsigned int mpc_lapic; /* APIC address */
-       unsigned int reserved;
-};
-
-/* Followed by entries */
-
-#define        MP_PROCESSOR    0
-#define        MP_BUS          1
-#define        MP_IOAPIC       2
-#define        MP_INTSRC       3
-#define        MP_LINTSRC      4
-
-struct mpc_config_processor
-{
-       unsigned char mpc_type;
-       unsigned char mpc_apicid;       /* Local APIC number */
-       unsigned char mpc_apicver;      /* Its versions */
-       unsigned char mpc_cpuflag;
-#define CPU_ENABLED            1       /* Processor is available */
-#define CPU_BOOTPROCESSOR      2       /* Processor is the BP */
-       unsigned int mpc_cpufeature;            
-#define CPU_STEPPING_MASK 0x0F
-#define CPU_MODEL_MASK 0xF0
-#define CPU_FAMILY_MASK        0xF00
-       unsigned int mpc_featureflag;   /* CPUID feature value */
-       unsigned int mpc_reserved[2];
-};
-
-struct mpc_config_bus
-{
-       unsigned char mpc_type;
-       unsigned char mpc_busid;
-       unsigned char mpc_bustype[6];
-};
-
-/* List of Bus Type string values, Intel MP Spec. */
-#define BUSTYPE_EISA   "EISA"
-#define BUSTYPE_ISA    "ISA"
-#define BUSTYPE_INTERN "INTERN"        /* Internal BUS */
-#define BUSTYPE_MCA    "MCA"
-#define BUSTYPE_VL     "VL"            /* Local bus */
-#define BUSTYPE_PCI    "PCI"
-#define BUSTYPE_PCMCIA "PCMCIA"
-#define BUSTYPE_CBUS   "CBUS"
-#define BUSTYPE_CBUSII "CBUSII"
-#define BUSTYPE_FUTURE "FUTURE"
-#define BUSTYPE_MBI    "MBI"
-#define BUSTYPE_MBII   "MBII"
-#define BUSTYPE_MPI    "MPI"
-#define BUSTYPE_MPSA   "MPSA"
-#define BUSTYPE_NUBUS  "NUBUS"
-#define BUSTYPE_TC     "TC"
-#define BUSTYPE_VME    "VME"
-#define BUSTYPE_XPRESS "XPRESS"
-
-struct mpc_config_ioapic
-{
-       unsigned char mpc_type;
-       unsigned char mpc_apicid;
-       unsigned char mpc_apicver;
-       unsigned char mpc_flags;
-#define MPC_APIC_USABLE                0x01
-       unsigned int mpc_apicaddr;
-};
-
-struct mpc_config_intsrc
-{
-       unsigned char mpc_type;
-       unsigned char mpc_irqtype;
-       unsigned short mpc_irqflag;
-       unsigned char mpc_srcbus;
-       unsigned char mpc_srcbusirq;
-       unsigned char mpc_dstapic;
-       unsigned char mpc_dstirq;
-};
-
-enum mp_irq_source_types {
-       mp_INT = 0,
-       mp_NMI = 1,
-       mp_SMI = 2,
-       mp_ExtINT = 3
-};
-
-#define MP_IRQDIR_DEFAULT      0
-#define MP_IRQDIR_HIGH         1
-#define MP_IRQDIR_LOW          3
-
-
-struct mpc_config_lintsrc
-{
-       unsigned char mpc_type;
-       unsigned char mpc_irqtype;
-       unsigned short mpc_irqflag;
-       unsigned char mpc_srcbusid;
-       unsigned char mpc_srcbusirq;
-       unsigned char mpc_destapic;     
-#define MP_APIC_ALL    0xFF
-       unsigned char mpc_destapiclint;
-};
-
-/*
- *     Default configurations
- *
- *     1       2 CPU ISA 82489DX
- *     2       2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
- *     3       2 CPU EISA 82489DX
- *     4       2 CPU MCA 82489DX
- *     5       2 CPU ISA+PCI
- *     6       2 CPU EISA+PCI
- *     7       2 CPU MCA+PCI
- */
-
-#define MAX_MP_BUSSES 256
-/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
-#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
-extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
-
-extern unsigned int boot_cpu_physical_apicid;
-extern int smp_found_config;
-extern void find_smp_config (void);
-extern void get_smp_config (void);
-extern int nr_ioapics;
-extern unsigned char apic_version [MAX_APICS];
-extern int mp_irq_entries;
-extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES];
-extern int mpc_default_type;
-extern unsigned long mp_lapic_addr;
-
-#ifdef CONFIG_ACPI
-extern void mp_register_lapic (u8 id, u8 enabled);
-extern void mp_register_lapic_address (u64 address);
-
-extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi);
-extern void mp_config_acpi_legacy_irqs (void);
-extern int mp_register_gsi (u32 gsi, int triggering, int polarity);
-#endif
-
-extern int using_apic_timer;
-
-#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_APICS)
-
-struct physid_mask
-{
-       unsigned long mask[PHYSID_ARRAY_SIZE];
-};
-
-typedef struct physid_mask physid_mask_t;
-
-#define physid_set(physid, map)                        set_bit(physid, (map).mask)
-#define physid_clear(physid, map)              clear_bit(physid, (map).mask)
-#define physid_isset(physid, map)              test_bit(physid, (map).mask)
-#define physid_test_and_set(physid, map)       test_and_set_bit(physid, (map).mask)
-
-#define physids_and(dst, src1, src2)           bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_or(dst, src1, src2)            bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)                     bitmap_zero((map).mask, MAX_APICS)
-#define physids_complement(dst, src)           bitmap_complement((dst).mask, (src).mask, MAX_APICS)
-#define physids_empty(map)                     bitmap_empty((map).mask, MAX_APICS)
-#define physids_equal(map1, map2)              bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
-#define physids_weight(map)                    bitmap_weight((map).mask, MAX_APICS)
-#define physids_shift_right(d, s, n)           bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
-#define physids_shift_left(d, s, n)            bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
-#define physids_coerce(map)                    ((map).mask[0])
-
-#define physids_promote(physids)                                               \
-       ({                                                                      \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;                 \
-               __physid_mask.mask[0] = physids;                                \
-               __physid_mask;                                                  \
-       })
-
-#define physid_mask_of_physid(physid)                                          \
-       ({                                                                      \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;                 \
-               physid_set(physid, __physid_mask);                              \
-               __physid_mask;                                                  \
-       })
-
-#define PHYSID_MASK_ALL                { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
-#define PHYSID_MASK_NONE       { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
-
-extern physid_mask_t phys_cpu_present_map;
-
-#endif
-
index 13bafb16e7afd516d7efc47cd164b5461ecc2eaa..3504617fe648267e5967e57ff47b5d5e33704b1e 100644 (file)
@@ -8,52 +8,68 @@
 
 /*
  * This tag identifies where the SMP configuration
- * information is. 
+ * information is.
  */
+
 #define SMP_MAGIC_IDENT        (('_'<<24)|('P'<<16)|('M'<<8)|'_')
 
-#define MAX_MPC_ENTRY 1024
-#define MAX_APICS      256
+#ifdef CONFIG_X86_32
+# define MAX_MPC_ENTRY 1024
+# define MAX_APICS      256
+#else
+/*
+ * A maximum of 255 APICs with the current APIC ID architecture.
+ */
+# define MAX_APICS 255
+#endif
 
 struct intel_mp_floating
 {
-       char mpf_signature[4];          /* "_MP_"                       */
-       unsigned long mpf_physptr;      /* Configuration table address  */
+       char mpf_signature[4];          /* "_MP_"                       */
+       unsigned int mpf_physptr;       /* Configuration table address  */
        unsigned char mpf_length;       /* Our length (paragraphs)      */
        unsigned char mpf_specification;/* Specification version        */
        unsigned char mpf_checksum;     /* Checksum (makes sum 0)       */
-       unsigned char mpf_feature1;     /* Standard or configuration ?  */
+       unsigned char mpf_feature1;     /* Standard or configuration ?  */
        unsigned char mpf_feature2;     /* Bit7 set for IMCR|PIC        */
        unsigned char mpf_feature3;     /* Unused (0)                   */
        unsigned char mpf_feature4;     /* Unused (0)                   */
        unsigned char mpf_feature5;     /* Unused (0)                   */
 };
 
+#define MPC_SIGNATURE "PCMP"
+
 struct mp_config_table
 {
        char mpc_signature[4];
-#define MPC_SIGNATURE "PCMP"
        unsigned short mpc_length;      /* Size of table */
        char  mpc_spec;                 /* 0x01 */
        char  mpc_checksum;
        char  mpc_oem[8];
        char  mpc_productid[12];
-       unsigned long mpc_oemptr;       /* 0 if not present */
+       unsigned int mpc_oemptr;        /* 0 if not present */
        unsigned short mpc_oemsize;     /* 0 if not present */
        unsigned short mpc_oemcount;
-       unsigned long mpc_lapic;        /* APIC address */
-       unsigned long reserved;
+       unsigned int mpc_lapic; /* APIC address */
+       unsigned int reserved;
 };
 
 /* Followed by entries */
 
-#define        MP_PROCESSOR    0
-#define        MP_BUS          1
-#define        MP_IOAPIC       2
-#define        MP_INTSRC       3
-#define        MP_LINTSRC      4
-#define        MP_TRANSLATION  192  /* Used by IBM NUMA-Q to describe node locality */
+#define        MP_PROCESSOR            0
+#define        MP_BUS                  1
+#define        MP_IOAPIC               2
+#define        MP_INTSRC               3
+#define        MP_LINTSRC              4
+/* Used by IBM NUMA-Q to describe node locality */
+#define        MP_TRANSLATION          192
+
+#define CPU_ENABLED            1       /* Processor is available */
+#define CPU_BOOTPROCESSOR      2       /* Processor is the BP */
+
+#define CPU_STEPPING_MASK      0x000F
+#define CPU_MODEL_MASK         0x00F0
+#define CPU_FAMILY_MASK                0x0F00
 
 struct mpc_config_processor
 {
@@ -61,14 +77,9 @@ struct mpc_config_processor
        unsigned char mpc_apicid;       /* Local APIC number */
        unsigned char mpc_apicver;      /* Its versions */
        unsigned char mpc_cpuflag;
-#define CPU_ENABLED            1       /* Processor is available */
-#define CPU_BOOTPROCESSOR      2       /* Processor is the BP */
-       unsigned long mpc_cpufeature;           
-#define CPU_STEPPING_MASK 0x0F
-#define CPU_MODEL_MASK 0xF0
-#define CPU_FAMILY_MASK        0xF00
-       unsigned long mpc_featureflag;  /* CPUID feature value */
-       unsigned long mpc_reserved[2];
+       unsigned int mpc_cpufeature;
+       unsigned int mpc_featureflag;   /* CPUID feature value */
+       unsigned int mpc_reserved[2];
 };
 
 struct mpc_config_bus
@@ -98,14 +109,15 @@ struct mpc_config_bus
 #define BUSTYPE_VME    "VME"
 #define BUSTYPE_XPRESS "XPRESS"
 
+#define MPC_APIC_USABLE                0x01
+
 struct mpc_config_ioapic
 {
        unsigned char mpc_type;
        unsigned char mpc_apicid;
        unsigned char mpc_apicver;
        unsigned char mpc_flags;
-#define MPC_APIC_USABLE                0x01
-       unsigned long mpc_apicaddr;
+       unsigned int mpc_apicaddr;
 };
 
 struct mpc_config_intsrc
@@ -130,6 +142,7 @@ enum mp_irq_source_types {
 #define MP_IRQDIR_HIGH         1
 #define MP_IRQDIR_LOW          3
 
+#define MP_APIC_ALL    0xFF
 
 struct mpc_config_lintsrc
 {
@@ -138,15 +151,15 @@ struct mpc_config_lintsrc
        unsigned short mpc_irqflag;
        unsigned char mpc_srcbusid;
        unsigned char mpc_srcbusirq;
-       unsigned char mpc_destapic;     
-#define MP_APIC_ALL    0xFF
+       unsigned char mpc_destapic;
        unsigned char mpc_destapiclint;
 };
 
+#define MPC_OEM_SIGNATURE "_OEM"
+
 struct mp_config_oemtable
 {
        char oem_signature[4];
-#define MPC_OEM_SIGNATURE "_OEM"
        unsigned short oem_length;      /* Size of table */
        char  oem_rev;                  /* 0x01 */
        char  oem_checksum;
@@ -155,13 +168,13 @@ struct mp_config_oemtable
 
 struct mpc_config_translation
 {
-        unsigned char mpc_type;
-        unsigned char trans_len;
-        unsigned char trans_type;
-        unsigned char trans_quad;
-        unsigned char trans_global;
-        unsigned char trans_local;
-        unsigned short trans_reserved;
+       unsigned char mpc_type;
+       unsigned char trans_len;
+       unsigned char trans_type;
+       unsigned char trans_quad;
+       unsigned char trans_global;
+       unsigned char trans_local;
+       unsigned short trans_reserved;
 };
 
 /*
index a4944732be04d9fa1a75d66ab5799f8f96be6eda..fae118a252782918992a3e7906aeca1c4d618630 100644 (file)
 #define MSR_IA32_LASTINTFROMIP         0x000001dd
 #define MSR_IA32_LASTINTTOIP           0x000001de
 
+/* DEBUGCTLMSR bits (others vary by model): */
+#define _DEBUGCTLMSR_LBR       0 /* last branch recording */
+#define _DEBUGCTLMSR_BTF       1 /* single-step on branches */
+
+#define DEBUGCTLMSR_LBR                (1UL << _DEBUGCTLMSR_LBR)
+#define DEBUGCTLMSR_BTF                (1UL << _DEBUGCTLMSR_BTF)
+
 #define MSR_IA32_MC0_CTL               0x00000400
 #define MSR_IA32_MC0_STATUS            0x00000401
 #define MSR_IA32_MC0_ADDR              0x00000402
 #define MSR_AMD64_IBSDCPHYSAD          0xc0011039
 #define MSR_AMD64_IBSCTL               0xc001103a
 
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE      0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE                (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK     0xfffffff
+#define FAM10H_MMIO_CONF_BASE_SHIFT    20
+
 /* K8 MSRs */
 #define MSR_K8_TOP_MEM1                        0xc001001a
 #define MSR_K8_TOP_MEM2                        0xc001001d
index 80b027081b3ce54ffc009e104250283cec5efdda..204a8a30fecf8e9669aae32b6f6db83852cb694b 100644 (file)
 # include <linux/types.h>
 #endif
 
-#ifdef __i386__
-
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
+#include <asm/asm.h>
 #include <asm/errno.h>
 
+static inline unsigned long long native_read_tscp(unsigned int *aux)
+{
+       unsigned long low, high;
+       asm volatile (".byte 0x0f,0x01,0xf9"
+                     : "=a" (low), "=d" (high), "=c" (*aux));
+       return low | ((u64)high << 32);
+}
+
+/*
+ * i386 calling convention returns 64-bit value in edx:eax, while
+ * x86_64 returns at rax. Also, the "A" constraint does not really
+ * mean rdx:rax in x86_64, so we need specialized behaviour for each
+ * architecture
+ */
+#ifdef CONFIG_X86_64
+#define DECLARE_ARGS(val, low, high)   unsigned low, high
+#define EAX_EDX_VAL(val, low, high)    (low | ((u64)(high) << 32))
+#define EAX_EDX_ARGS(val, low, high)   "a" (low), "d" (high)
+#define EAX_EDX_RET(val, low, high)    "=a" (low), "=d" (high)
+#else
+#define DECLARE_ARGS(val, low, high)   unsigned long long val
+#define EAX_EDX_VAL(val, low, high)    (val)
+#define EAX_EDX_ARGS(val, low, high)   "A" (val)
+#define EAX_EDX_RET(val, low, high)    "=A" (val)
+#endif
+
 static inline unsigned long long native_read_msr(unsigned int msr)
 {
-       unsigned long long val;
+       DECLARE_ARGS(val, low, high);
 
-       asm volatile("rdmsr" : "=A" (val) : "c" (msr));
-       return val;
+       asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+       return EAX_EDX_VAL(val, low, high);
 }
 
 static inline unsigned long long native_read_msr_safe(unsigned int msr,
                                                      int *err)
 {
-       unsigned long long val;
+       DECLARE_ARGS(val, low, high);
 
-       asm volatile("2: rdmsr ; xorl %0,%0\n"
+       asm volatile("2: rdmsr ; xor %0,%0\n"
                     "1:\n\t"
                     ".section .fixup,\"ax\"\n\t"
-                    "3:  movl %3,%0 ; jmp 1b\n\t"
+                    "3:  mov %3,%0 ; jmp 1b\n\t"
                     ".previous\n\t"
                     ".section __ex_table,\"a\"\n"
-                    "   .align 4\n\t"
-                    "   .long  2b,3b\n\t"
+                    _ASM_ALIGN "\n\t"
+                    _ASM_PTR " 2b,3b\n\t"
                     ".previous"
-                    : "=r" (*err), "=A" (val)
+                    : "=r" (*err), EAX_EDX_RET(val, low, high)
                     : "c" (msr), "i" (-EFAULT));
-
-       return val;
+       return EAX_EDX_VAL(val, low, high);
 }
 
-static inline void native_write_msr(unsigned int msr, unsigned long long val)
+static inline void native_write_msr(unsigned int msr,
+                                   unsigned low, unsigned high)
 {
-       asm volatile("wrmsr" : : "c" (msr), "A"(val));
+       asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high));
 }
 
 static inline int native_write_msr_safe(unsigned int msr,
-                                       unsigned long long val)
+                                       unsigned low, unsigned high)
 {
        int err;
-       asm volatile("2: wrmsr ; xorl %0,%0\n"
+       asm volatile("2: wrmsr ; xor %0,%0\n"
                     "1:\n\t"
                     ".section .fixup,\"ax\"\n\t"
-                    "3:  movl %4,%0 ; jmp 1b\n\t"
+                    "3:  mov %4,%0 ; jmp 1b\n\t"
                     ".previous\n\t"
                     ".section __ex_table,\"a\"\n"
-                    "   .align 4\n\t"
-                    "   .long  2b,3b\n\t"
+                    _ASM_ALIGN "\n\t"
+                    _ASM_PTR " 2b,3b\n\t"
                     ".previous"
                     : "=a" (err)
-                    : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+                    : "c" (msr), "0" (low), "d" (high),
                       "i" (-EFAULT));
        return err;
 }
 
-static inline unsigned long long native_read_tsc(void)
+extern unsigned long long native_read_tsc(void);
+
+static __always_inline unsigned long long __native_read_tsc(void)
 {
-       unsigned long long val;
-       asm volatile("rdtsc" : "=A" (val));
-       return val;
+       DECLARE_ARGS(val, low, high);
+
+       rdtsc_barrier();
+       asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+       rdtsc_barrier();
+
+       return EAX_EDX_VAL(val, low, high);
 }
 
-static inline unsigned long long native_read_pmc(void)
+static inline unsigned long long native_read_pmc(int counter)
 {
-       unsigned long long val;
-       asm volatile("rdpmc" : "=A" (val));
-       return val;
+       DECLARE_ARGS(val, low, high);
+
+       asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+       return EAX_EDX_VAL(val, low, high);
 }
 
 #ifdef CONFIG_PARAVIRT
@@ -97,20 +129,21 @@ static inline unsigned long long native_read_pmc(void)
                (val2) = (u32)(__val >> 32);                            \
        } while(0)
 
-static inline void wrmsr(u32 __msr, u32 __low, u32 __high)
+static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
 {
-       native_write_msr(__msr, ((u64)__high << 32) | __low);
+       native_write_msr(msr, low, high);
 }
 
 #define rdmsrl(msr,val)                                                        \
        ((val) = native_read_msr(msr))
 
-#define wrmsrl(msr,val)        native_write_msr(msr, val)
+#define wrmsrl(msr, val)                                               \
+       native_write_msr(msr, (u32)((u64)(val)), (u32)((u64)(val) >> 32))
 
 /* wrmsr with exception handling */
-static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high)
+static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
 {
-       return native_write_msr_safe(__msr, ((u64)__high << 32) | __low);
+       return native_write_msr_safe(msr, low, high);
 }
 
 /* rdmsr with exception handling */
@@ -129,204 +162,31 @@ static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high)
 #define rdtscll(val)                                           \
        ((val) = native_read_tsc())
 
-#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
-
 #define rdpmc(counter,low,high)                                        \
        do {                                                    \
-               u64 _l = native_read_pmc();                     \
+               u64 _l = native_read_pmc(counter);              \
                (low)  = (u32)_l;                               \
                (high) = (u32)(_l >> 32);                       \
        } while(0)
-#endif /* !CONFIG_PARAVIRT */
-
-#ifdef CONFIG_SMP
-void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
-void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
-int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
-#else  /*  CONFIG_SMP  */
-static inline void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-       rdmsr(msr_no, *l, *h);
-}
-static inline void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
-       wrmsr(msr_no, l, h);
-}
-static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-       return rdmsr_safe(msr_no, l, h);
-}
-static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
-       return wrmsr_safe(msr_no, l, h);
-}
-#endif  /*  CONFIG_SMP  */
-#endif  /* ! __ASSEMBLY__ */
-#endif  /* __KERNEL__ */
-
-#else   /* __i386__ */
-
-#ifndef __ASSEMBLY__
-#include <linux/errno.h>
-/*
- * Access to machine-specific registers (available on 586 and better only)
- * Note: the rd* operations modify the parameters directly (without using
- * pointer indirection), this allows gcc to optimize better
- */
-
-#define rdmsr(msr,val1,val2) \
-       __asm__ __volatile__("rdmsr" \
-                           : "=a" (val1), "=d" (val2) \
-                           : "c" (msr))
-
-
-#define rdmsrl(msr,val) do { unsigned long a__,b__; \
-       __asm__ __volatile__("rdmsr" \
-                           : "=a" (a__), "=d" (b__) \
-                           : "c" (msr)); \
-       val = a__ | (b__<<32); \
-} while(0)
-
-#define wrmsr(msr,val1,val2) \
-     __asm__ __volatile__("wrmsr" \
-                         : /* no outputs */ \
-                         : "c" (msr), "a" (val1), "d" (val2))
-
-#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32)
 
-#define rdtsc(low,high) \
-     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
+#define rdtscp(low, high, aux)                                         \
+       do {                                                            \
+               unsigned long long _val = native_read_tscp(&(aux));     \
+               (low) = (u32)_val;                                      \
+               (high) = (u32)(_val >> 32);                             \
+       } while (0)
 
-#define rdtscl(low) \
-     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
+#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
 
-#define rdtscp(low,high,aux) \
-     __asm__ __volatile__ (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))
+#endif /* !CONFIG_PARAVIRT */
 
-#define rdtscll(val) do { \
-     unsigned int __a,__d; \
-     __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
-     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
-} while(0)
 
-#define rdtscpll(val, aux) do { \
-     unsigned long __a, __d; \
-     __asm__ __volatile__ (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
-     (val) = (__d << 32) | __a; \
-} while (0)
+#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
 
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
 
 #define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0)
 
-#define rdpmc(counter,low,high) \
-     __asm__ __volatile__("rdpmc" \
-                         : "=a" (low), "=d" (high) \
-                         : "c" (counter))
-
-
-static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
-                        unsigned int *ecx, unsigned int *edx)
-{
-       __asm__("cpuid"
-               : "=a" (*eax),
-                 "=b" (*ebx),
-                 "=c" (*ecx),
-                 "=d" (*edx)
-               : "0" (op));
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-                              int *edx)
-{
-       __asm__("cpuid"
-               : "=a" (*eax),
-                 "=b" (*ebx),
-                 "=c" (*ecx),
-                 "=d" (*edx)
-               : "0" (op), "c" (count));
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-       unsigned int eax;
-
-       __asm__("cpuid"
-               : "=a" (eax)
-               : "0" (op)
-               : "bx", "cx", "dx");
-       return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-       unsigned int eax, ebx;
-
-       __asm__("cpuid"
-               : "=a" (eax), "=b" (ebx)
-               : "0" (op)
-               : "cx", "dx" );
-       return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-       unsigned int eax, ecx;
-
-       __asm__("cpuid"
-               : "=a" (eax), "=c" (ecx)
-               : "0" (op)
-               : "bx", "dx" );
-       return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-       unsigned int eax, edx;
-
-       __asm__("cpuid"
-               : "=a" (eax), "=d" (edx)
-               : "0" (op)
-               : "bx", "cx");
-       return edx;
-}
-
-#ifdef __KERNEL__
-
-/* wrmsr with exception handling */
-#define wrmsr_safe(msr,a,b) ({ int ret__;                      \
-       asm volatile("2: wrmsr ; xorl %0,%0\n"                  \
-                    "1:\n\t"                                   \
-                    ".section .fixup,\"ax\"\n\t"               \
-                    "3:  movl %4,%0 ; jmp 1b\n\t"              \
-                    ".previous\n\t"                            \
-                    ".section __ex_table,\"a\"\n"              \
-                    "   .align 8\n\t"                          \
-                    "   .quad  2b,3b\n\t"                      \
-                    ".previous"                                \
-                    : "=a" (ret__)                             \
-                    : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
-       ret__; })
-
-#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
-
-#define rdmsr_safe(msr,a,b) \
-       ({ int ret__;                                           \
-         asm volatile ("1:       rdmsr\n"                      \
-                       "2:\n"                                  \
-                       ".section .fixup,\"ax\"\n"              \
-                       "3:       movl %4,%0\n"                 \
-                       " jmp 2b\n"                             \
-                       ".previous\n"                           \
-                       ".section __ex_table,\"a\"\n"           \
-                       " .align 8\n"                           \
-                       " .quad 1b,3b\n"                                \
-                       ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b)) \
-                       :"c"(msr), "i"(-EIO), "0"(0));                  \
-         ret__; })
-
 #ifdef CONFIG_SMP
 void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
@@ -350,9 +210,8 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
        return wrmsr_safe(msr_no, l, h);
 }
 #endif  /* CONFIG_SMP */
-#endif  /* __KERNEL__ */
-#endif  /* __ASSEMBLY__ */
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
 
-#endif  /* !__i386__ */
 
 #endif
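A minimal usage sketch (ours, not part of the patch) of the rewritten rdtscp()/rdtscpll() wrappers above; the variable names are illustrative and <asm/msr.h> is assumed to provide the macros exactly as defined in this hunk.

#include <asm/msr.h>

static void sample_tsc(void)
{
	unsigned int lo, hi, aux;
	unsigned long long tsc;

	/* split form: low/high 32-bit halves plus the IA32_TSC_AUX word */
	rdtscp(lo, hi, aux);

	/* combined form: the full 64-bit counter in one variable */
	rdtscpll(tsc, aux);
}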
index e8320e4e6ca2d5f5d367975f694ebf4e6f78a571..319d065800bec4df1b2b4dfb56357ea5b2ff0f03 100644 (file)
@@ -89,24 +89,25 @@ struct mtrr_gentry
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
 extern int mtrr_add (unsigned long base, unsigned long size,
-                    unsigned int type, char increment);
+                    unsigned int type, bool increment);
 extern int mtrr_add_page (unsigned long base, unsigned long size,
-                    unsigned int type, char increment);
+                    unsigned int type, bool increment);
 extern int mtrr_del (int reg, unsigned long base, unsigned long size);
 extern int mtrr_del_page (int reg, unsigned long base, unsigned long size);
 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
 extern void mtrr_ap_init(void);
 extern void mtrr_bp_init(void);
+extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 #  else
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
 #define mtrr_save_state() do {} while (0)
 static __inline__ int mtrr_add (unsigned long base, unsigned long size,
-                               unsigned int type, char increment)
+                               unsigned int type, bool increment)
 {
     return -ENODEV;
 }
 static __inline__ int mtrr_add_page (unsigned long base, unsigned long size,
-                               unsigned int type, char increment)
+                               unsigned int type, bool increment)
 {
     return -ENODEV;
 }
@@ -120,7 +121,10 @@ static __inline__ int mtrr_del_page (int reg, unsigned long base,
 {
     return -ENODEV;
 }
-
+static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+       return 0;
+}
 static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;}
 
 #define mtrr_ap_init() do {} while (0)
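To illustrate the char-to-bool change in the mtrr_add()/mtrr_add_page() prototypes above, here is how a typical caller now reads; the framebuffer scenario and helper name are made up, only the prototype and MTRR_TYPE_WRCOMB come from the header.

#include <asm/mtrr.h>

static int map_fb_write_combined(unsigned long fb_base, unsigned long fb_size)
{
	/* 'true' asks the MTRR code to do usage counting on the region */
	int reg = mtrr_add(fb_base, fb_size, MTRR_TYPE_WRCOMB, true);

	if (reg < 0)
		return reg;	/* -ENODEV without CONFIG_MTRR, or another error */
	return 0;
}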
index 7a17d9e58ad6586140e84a91a0a850b1b527e77a..bbeefb96ddfd6062657334b15915598ed4b26595 100644 (file)
@@ -26,7 +26,7 @@ do {                                                                  \
        unsigned int dummy;                                             \
                                                                        \
        typecheck(atomic_t *, count);                                   \
-       typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);           \
+       typecheck_fn(void (*)(atomic_t *), fail_fn);            \
                                                                        \
        __asm__ __volatile__(                                           \
                LOCK_PREFIX "   decl (%%eax)    \n"                     \
@@ -51,8 +51,7 @@ do {                                                                  \
  * or anything the slow path function returns
  */
 static inline int
-__mutex_fastpath_lock_retval(atomic_t *count,
-                            int fastcall (*fail_fn)(atomic_t *))
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
        if (unlikely(atomic_dec_return(count) < 0))
                return fail_fn(count);
@@ -78,7 +77,7 @@ do {                                                                  \
        unsigned int dummy;                                             \
                                                                        \
        typecheck(atomic_t *, count);                                   \
-       typecheck_fn(fastcall void (*)(atomic_t *), fail_fn);           \
+       typecheck_fn(void (*)(atomic_t *), fail_fn);            \
                                                                        \
        __asm__ __volatile__(                                           \
                LOCK_PREFIX "   incl (%%eax)    \n"                     \
index 70a958a8e3816f738558ca33b00a49d3e14cd3a0..7206c7e8a38813a858540742ee57e54d1a54b3e5 100644 (file)
@@ -1,6 +1,3 @@
-/*
- *  linux/include/asm-i386/nmi.h
- */
 #ifndef ASM_NMI_H
 #define ASM_NMI_H
 
index 65b6acf3bb593af5edd4460d9a8bd3cdb32ff924..2eeb74e5f3ff6d13c2635f50a8b9f3edf1c7a514 100644 (file)
@@ -1,6 +1,3 @@
-/*
- *  linux/include/asm-i386/nmi.h
- */
 #ifndef ASM_NMI_H
 #define ASM_NMI_H
 
@@ -41,7 +38,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
 
 #define get_nmi_reason() inb(0x61)
 
-extern int panic_on_timeout;
 extern int unknown_nmi_panic;
 extern int nmi_watchdog_enabled;
 
@@ -60,7 +56,6 @@ extern void enable_timer_nmi_watchdog(void);
 extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
 
 extern void nmi_watchdog_default(void);
-extern int setup_nmi_watchdog(char *);
 
 extern atomic_t nmi_active;
 extern unsigned int nmi_watchdog;
diff --git a/include/asm-x86/nops.h b/include/asm-x86/nops.h
new file mode 100644 (file)
index 0000000..fec025c
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef _ASM_NOPS_H
+#define _ASM_NOPS_H 1
+
+/* Define nops for use with alternative() */
+
+/* generic versions from gas */
+#define GENERIC_NOP1   ".byte 0x90\n"
+#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
+#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
+#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
+#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
+#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
+#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
+
+/* Opteron 64bit nops */
+#define K8_NOP1 GENERIC_NOP1
+#define K8_NOP2        ".byte 0x66,0x90\n"
+#define K8_NOP3        ".byte 0x66,0x66,0x90\n"
+#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n"
+#define K8_NOP5        K8_NOP3 K8_NOP2
+#define K8_NOP6        K8_NOP3 K8_NOP3
+#define K8_NOP7        K8_NOP4 K8_NOP3
+#define K8_NOP8        K8_NOP4 K8_NOP4
+
+/* K7 nops */
+/* uses eax dependencies (arbitrary choice) */
+#define K7_NOP1  GENERIC_NOP1
+#define K7_NOP2        ".byte 0x8b,0xc0\n"
+#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
+#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
+#define K7_NOP5        K7_NOP4 ASM_NOP1
+#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
+#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
+#define K7_NOP8        K7_NOP7 ASM_NOP1
+
+/* P6 nops */
+/* uses eax dependencies (Intel-recommended choice) */
+#define P6_NOP1        GENERIC_NOP1
+#define P6_NOP2        ".byte 0x66,0x90\n"
+#define P6_NOP3        ".byte 0x0f,0x1f,0x00\n"
+#define P6_NOP4        ".byte 0x0f,0x1f,0x40,0\n"
+#define P6_NOP5        ".byte 0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP6        ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
+#define P6_NOP7        ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
+#define P6_NOP8        ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+
+#if defined(CONFIG_MK8)
+#define ASM_NOP1 K8_NOP1
+#define ASM_NOP2 K8_NOP2
+#define ASM_NOP3 K8_NOP3
+#define ASM_NOP4 K8_NOP4
+#define ASM_NOP5 K8_NOP5
+#define ASM_NOP6 K8_NOP6
+#define ASM_NOP7 K8_NOP7
+#define ASM_NOP8 K8_NOP8
+#elif defined(CONFIG_MK7)
+#define ASM_NOP1 K7_NOP1
+#define ASM_NOP2 K7_NOP2
+#define ASM_NOP3 K7_NOP3
+#define ASM_NOP4 K7_NOP4
+#define ASM_NOP5 K7_NOP5
+#define ASM_NOP6 K7_NOP6
+#define ASM_NOP7 K7_NOP7
+#define ASM_NOP8 K7_NOP8
+#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
+      defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
+      defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
+#define ASM_NOP1 P6_NOP1
+#define ASM_NOP2 P6_NOP2
+#define ASM_NOP3 P6_NOP3
+#define ASM_NOP4 P6_NOP4
+#define ASM_NOP5 P6_NOP5
+#define ASM_NOP6 P6_NOP6
+#define ASM_NOP7 P6_NOP7
+#define ASM_NOP8 P6_NOP8
+#else
+#define ASM_NOP1 GENERIC_NOP1
+#define ASM_NOP2 GENERIC_NOP2
+#define ASM_NOP3 GENERIC_NOP3
+#define ASM_NOP4 GENERIC_NOP4
+#define ASM_NOP5 GENERIC_NOP5
+#define ASM_NOP6 GENERIC_NOP6
+#define ASM_NOP7 GENERIC_NOP7
+#define ASM_NOP8 GENERIC_NOP8
+#endif
+
+#define ASM_NOP_MAX 8
+
+#endif
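To make the "for use with alternative()" remark concrete, a hedged sketch of the kind of site these NOPs pad at boot; alternative() is assumed from <asm/alternative.h>, and the mfence/SSE2 pairing mirrors the classic i386 mb() pattern rather than anything added by this patch.

#include <asm/alternative.h>
#include <asm/nops.h>

static inline void example_memory_barrier(void)
{
	/* default to a locked add (works on every CPU); boxes advertising
	 * SSE2 get this site patched to mfence at boot, and the alternatives
	 * code pads any leftover bytes with the ASM_NOPx sequences above */
	alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2);
}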
index 96fcb157db1d9cadc70650d16e73c03ff3176834..03d0f7a9bf0249b9e649605625530ccac9d6f836 100644 (file)
@@ -1,3 +1,15 @@
+#ifndef _ASM_X86_32_NUMA_H
+#define _ASM_X86_32_NUMA_H 1
 
-int pxm_to_nid(int pxm);
+extern int pxm_to_nid(int pxm);
 
+#ifdef CONFIG_NUMA
+extern void __init remap_numa_kva(void);
+extern void set_highmem_pages_init(int);
+#else
+static inline void remap_numa_kva(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_32_NUMA_H */
index 0cc5c97a7fc91f1555c5082d9bc28cbbc3922fbb..15fe07cde5861e3ee0d1144e933166ce03befd8f 100644 (file)
@@ -20,13 +20,19 @@ extern void numa_set_node(int cpu, int node);
 extern void srat_reserve_add_area(int nodeid);
 extern int hotadd_percent;
 
-extern unsigned char apicid_to_node[MAX_LOCAL_APIC];
+extern s16 apicid_to_node[MAX_LOCAL_APIC];
+
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern unsigned long numa_free_all_bootmem(void);
+extern void setup_node_bootmem(int nodeid, unsigned long start,
+                              unsigned long end);
+
 #ifdef CONFIG_NUMA
 extern void __init init_cpu_to_node(void);
 
 static inline void clear_node_cpumask(int cpu)
 {
-       clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+       clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]);
 }
 
 #else
@@ -34,6 +40,4 @@ static inline void clear_node_cpumask(int cpu)
 #define clear_node_cpumask(cpu) do {} while (0)
 #endif
 
-#define NUMA_NO_NODE 0xff
-
 #endif
index a757eb26141d63c440de713ec62a9fbed9770107..c8b30efeed85c6354d229a18d1d14bae2c206a87 100644 (file)
+#ifndef _ASM_X86_PAGE_H
+#define _ASM_X86_PAGE_H
+
+#include <linux/const.h>
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT     12
+#define PAGE_SIZE      (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK      (~(PAGE_SIZE-1))
+
 #ifdef __KERNEL__
-# ifdef CONFIG_X86_32
-#  include "page_32.h"
-# else
-#  include "page_64.h"
-# endif
+
+#define PHYSICAL_PAGE_MASK     (PAGE_MASK & __PHYSICAL_MASK)
+#define PTE_MASK               (_AT(long, PHYSICAL_PAGE_MASK))
+
+#define LARGE_PAGE_SIZE                (_AC(1,UL) << PMD_SHIFT)
+#define LARGE_PAGE_MASK                (~(LARGE_PAGE_SIZE-1))
+
+#define HPAGE_SHIFT            PMD_SHIFT
+#define HPAGE_SIZE             (_AC(1,UL) << HPAGE_SHIFT)
+#define HPAGE_MASK             (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+
+#define __PHYSICAL_MASK                _AT(phys_addr_t, (_AC(1,ULL) << __PHYSICAL_MASK_SHIFT) - 1)
+#define __VIRTUAL_MASK         ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64.h>
+#define max_pfn_mapped         end_pfn_map
 #else
-# ifdef __i386__
-#  include "page_32.h"
-# else
-#  include "page_64.h"
-# endif
+#include <asm/page_32.h>
+#define max_pfn_mapped         max_low_pfn
+#endif /* CONFIG_X86_64 */
+
+#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
+
+#define VM_DATA_DEFAULT_FLAGS \
+       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+
+#ifndef __ASSEMBLY__
+
+extern int page_is_ram(unsigned long pagenr);
+
+struct page;
+
+static void inline clear_user_page(void *page, unsigned long vaddr,
+                               struct page *pg)
+{
+       clear_page(page);
+}
+
+static void inline copy_user_page(void *to, void *from, unsigned long vaddr,
+                               struct page *topage)
+{
+       copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+       alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+typedef struct { pgdval_t pgd; } pgd_t;
+typedef struct { pgprotval_t pgprot; } pgprot_t;
+
+static inline pgd_t native_make_pgd(pgdval_t val)
+{
+       return (pgd_t) { val };
+}
+
+static inline pgdval_t native_pgd_val(pgd_t pgd)
+{
+       return pgd.pgd;
+}
+
+#if PAGETABLE_LEVELS >= 3
+#if PAGETABLE_LEVELS == 4
+typedef struct { pudval_t pud; } pud_t;
+
+static inline pud_t native_make_pud(pmdval_t val)
+{
+       return (pud_t) { val };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+       return pud.pud;
+}
+#else  /* PAGETABLE_LEVELS == 3 */
+#include <asm-generic/pgtable-nopud.h>
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+       return native_pgd_val(pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
+typedef struct { pmdval_t pmd; } pmd_t;
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+       return (pmd_t) { val };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+       return pmd.pmd;
+}
+#else  /* PAGETABLE_LEVELS == 2 */
+#include <asm-generic/pgtable-nopmd.h>
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+       return native_pgd_val(pmd.pud.pgd);
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static inline pte_t native_make_pte(pteval_t val)
+{
+       return (pte_t) { .pte = val };
+}
+
+static inline pteval_t native_pte_val(pte_t pte)
+{
+       return pte.pte;
+}
+
+#define pgprot_val(x)  ((x).pgprot)
+#define __pgprot(x)    ((pgprot_t) { (x) } )
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else  /* !CONFIG_PARAVIRT */
+
+#define pgd_val(x)     native_pgd_val(x)
+#define __pgd(x)       native_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x)     native_pud_val(x)
+#define __pud(x)       native_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x)     native_pmd_val(x)
+#define __pmd(x)       native_make_pmd(x)
 #endif
+
+#define pte_val(x)     native_pte_val(x)
+#define __pte(x)       native_make_pte(x)
+
+#endif /* CONFIG_PARAVIRT */
+
+#define __pa(x)                __phys_addr((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+   This seems to be the official gcc blessed way to do such arithmetic. */
+#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
+
+#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
+
+#define __boot_va(x)           __va(x)
+#define __boot_pa(x)           __pa(x)
+
+#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#endif /* __ASSEMBLY__ */
+
+#include <asm-generic/memory_model.h>
+#include <asm-generic/page.h>
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_PAGE_H */
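A small sketch (ours) of the address-translation helpers the unified header now provides; kaddr is assumed to point into the kernel direct mapping, and the helper name is hypothetical.

#include <linux/mm.h>
#include <asm/page.h>

static void show_translations(void *kaddr)
{
	unsigned long phys = __pa(kaddr);		/* kernel virtual -> physical */
	struct page *pg = virt_to_page(kaddr);		/* -> its struct page */
	void *back = __va(phys);			/* physical -> kernel virtual */
	unsigned long next = PAGE_ALIGN(phys + 1);	/* round up to a page boundary */

	(void)pg;
	(void)back;
	(void)next;
}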
index 80ecc66b6d8647c901c5d7038bae1267b9219c26..a6fd10f230d2e628660e252bda6623bc3840f67f 100644 (file)
-#ifndef _I386_PAGE_H
-#define _I386_PAGE_H
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT     12
-#define PAGE_SIZE      (1UL << PAGE_SHIFT)
-#define PAGE_MASK      (~(PAGE_SIZE-1))
-
-#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
-#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-#define clear_page(page)       mmx_clear_page((void *)(page))
-#define copy_page(to,from)     mmx_copy_page(to,from)
-
-#else
+#ifndef _ASM_X86_PAGE_32_H
+#define _ASM_X86_PAGE_32_H
 
 /*
- *     On older X86 processors it's not a win to use MMX here it seems.
- *     Maybe the K6-III ?
- */
-#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
-#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
-
-#endif
-
-#define clear_user_page(page, vaddr, pg)       clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
-
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-       alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
-
-/*
- * These are used to make use of C type-checking..
+ * This handles the memory map.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB.
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
  */
-extern int nx_enabled;
+#define __PAGE_OFFSET          _AC(CONFIG_PAGE_OFFSET, UL)
 
 #ifdef CONFIG_X86_PAE
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
-typedef struct { unsigned long long pmd; } pmd_t;
-typedef struct { unsigned long long pgd; } pgd_t;
-typedef struct { unsigned long long pgprot; } pgprot_t;
+#define __PHYSICAL_MASK_SHIFT  36
+#define __VIRTUAL_MASK_SHIFT   32
+#define PAGETABLE_LEVELS       3
 
-static inline unsigned long long native_pgd_val(pgd_t pgd)
-{
-       return pgd.pgd;
-}
-
-static inline unsigned long long native_pmd_val(pmd_t pmd)
-{
-       return pmd.pmd;
-}
-
-static inline unsigned long long native_pte_val(pte_t pte)
-{
-       return pte.pte_low | ((unsigned long long)pte.pte_high << 32);
-}
-
-static inline pgd_t native_make_pgd(unsigned long long val)
-{
-       return (pgd_t) { val };
-}
-
-static inline pmd_t native_make_pmd(unsigned long long val)
-{
-       return (pmd_t) { val };
-}
-
-static inline pte_t native_make_pte(unsigned long long val)
-{
-       return (pte_t) { .pte_low = val, .pte_high = (val >> 32) } ;
-}
-
-#ifndef CONFIG_PARAVIRT
-#define pmd_val(x)     native_pmd_val(x)
-#define __pmd(x)       native_make_pmd(x)
-#endif
-
-#define HPAGE_SHIFT    21
-#include <asm-generic/pgtable-nopud.h>
+#ifndef __ASSEMBLY__
+typedef u64    pteval_t;
+typedef u64    pmdval_t;
+typedef u64    pudval_t;
+typedef u64    pgdval_t;
+typedef u64    pgprotval_t;
+typedef u64    phys_addr_t;
+
+typedef union {
+       struct {
+               unsigned long pte_low, pte_high;
+       };
+       pteval_t pte;
+} pte_t;
+#endif /* __ASSEMBLY__ */
 #else  /* !CONFIG_X86_PAE */
-typedef struct { unsigned long pte_low; } pte_t;
-typedef struct { unsigned long pgd; } pgd_t;
-typedef struct { unsigned long pgprot; } pgprot_t;
-#define boot_pte_t pte_t /* or would you rather have a typedef */
-
-static inline unsigned long native_pgd_val(pgd_t pgd)
-{
-       return pgd.pgd;
-}
+#define __PHYSICAL_MASK_SHIFT  32
+#define __VIRTUAL_MASK_SHIFT   32
+#define PAGETABLE_LEVELS       2
 
-static inline unsigned long native_pte_val(pte_t pte)
-{
-       return pte.pte_low;
-}
-
-static inline pgd_t native_make_pgd(unsigned long val)
-{
-       return (pgd_t) { val };
-}
+#ifndef __ASSEMBLY__
+typedef unsigned long  pteval_t;
+typedef unsigned long  pmdval_t;
+typedef unsigned long  pudval_t;
+typedef unsigned long  pgdval_t;
+typedef unsigned long  pgprotval_t;
+typedef unsigned long  phys_addr_t;
 
-static inline pte_t native_make_pte(unsigned long val)
-{
-       return (pte_t) { .pte_low = val };
-}
+typedef union { pteval_t pte, pte_low; } pte_t;
+typedef pte_t boot_pte_t;
 
-#define HPAGE_SHIFT    22
-#include <asm-generic/pgtable-nopmd.h>
+#endif /* __ASSEMBLY__ */
 #endif /* CONFIG_X86_PAE */
 
-#define PTE_MASK       PAGE_MASK
-
 #ifdef CONFIG_HUGETLB_PAGE
-#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
-#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
-#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
-#define pgprot_val(x)  ((x).pgprot)
-#define __pgprot(x)    ((pgprot_t) { (x) } )
-
-#ifndef CONFIG_PARAVIRT
-#define pgd_val(x)     native_pgd_val(x)
-#define __pgd(x)       native_make_pgd(x)
-#define pte_val(x)     native_pte_val(x)
-#define __pte(x)       native_make_pte(x)
-#endif
-
-#endif /* !__ASSEMBLY__ */
-
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-
-/*
- * This handles the memory map.. We could make this a config
- * option, but too many people screw it up, and too few need
- * it.
- *
- * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
- * a virtual address space of one gigabyte, which limits the
- * amount of physical memory you can use to about 950MB. 
- *
- * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
- * and CONFIG_HIGHMEM64G options in the kernel configuration.
- */
-
 #ifndef __ASSEMBLY__
+#define __phys_addr(x)         ((x)-PAGE_OFFSET)
+#define __phys_reloc_hide(x)   RELOC_HIDE((x), 0)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn)         ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
 
-struct vm_area_struct;
+extern int nx_enabled;
 
 /*
  * This much address space is reserved for vmalloc() and iomap()
  * as well as fixmap mappings.
  */
 extern unsigned int __VMALLOC_RESERVE;
-
 extern int sysctl_legacy_va_layout;
 
-extern int page_is_ram(unsigned long pagenr);
-
-#endif /* __ASSEMBLY__ */
-
-#ifdef __ASSEMBLY__
-#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
-#else
-#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
-#endif
-
-
-#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
 #define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM                 (-__PAGE_OFFSET-__VMALLOC_RESERVE)
-#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
-/* __pa_symbol should be used for C visible symbols.
-   This seems to be the official gcc blessed way to do such arithmetic. */
-#define __pa_symbol(x)          __pa(RELOC_HIDE((unsigned long)(x),0))
-#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
-#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn)         ((pfn) < max_mapnr)
-#endif /* CONFIG_FLATMEM */
-#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#ifdef CONFIG_X86_USE_3DNOW
+#include <asm/mmx.h>
+
+static inline void clear_page(void *page)
+{
+       mmx_clear_page(page);
+}
 
-#define VM_DATA_DEFAULT_FLAGS \
-       (VM_READ | VM_WRITE | \
-       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
-                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+static inline void copy_page(void *to, void *from)
+{
+       mmx_copy_page(to, from);
+}
+#else  /* !CONFIG_X86_USE_3DNOW */
+#include <linux/string.h>
 
-#include <asm-generic/memory_model.h>
-#include <asm-generic/page.h>
+static inline void clear_page(void *page)
+{
+       memset(page, 0, PAGE_SIZE);
+}
 
-#define __HAVE_ARCH_GATE_AREA 1
-#endif /* __KERNEL__ */
+static inline void copy_page(void *to, void *from)
+{
+       memcpy(to, from, PAGE_SIZE);
+}
+#endif /* CONFIG_X86_USE_3DNOW */
+#endif /* !__ASSEMBLY__ */
 
-#endif /* _I386_PAGE_H */
+#endif /* _ASM_X86_PAGE_32_H */
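Regarding the __PAGE_OFFSET comment near the top of this file: the back-of-the-envelope arithmetic below (ours) shows where the "about 950MB" lowmem figure comes from, assuming the default CONFIG_PAGE_OFFSET of 0xC0000000 and the default 128MB __VMALLOC_RESERVE; both are configurable, so treat the numbers as illustrative.

/* 32-bit address space above the kernel mapping:
 *   2^32 - 0xC0000000                 = 0x40000000 (1GB of kernel virtual space)
 * minus the vmalloc/fixmap reservation:
 *   0x40000000 - 0x08000000 (128MB)   = 0x38000000 (896MB)
 * which matches MAXMEM = -__PAGE_OFFSET - __VMALLOC_RESERVE as defined above.
 */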
index c3b52bcb171ef86852d819198d3b3b13f80bd94f..c1ac42d8707f34fe374c6063cc37503b7e4647f4 100644 (file)
@@ -1,15 +1,9 @@
 #ifndef _X86_64_PAGE_H
 #define _X86_64_PAGE_H
 
-#include <linux/const.h>
+#define PAGETABLE_LEVELS       4
 
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT     12
-#define PAGE_SIZE      (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK      (~(PAGE_SIZE-1))
-#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
-
-#define THREAD_ORDER 1 
+#define THREAD_ORDER   1
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE-1))
 
 #define MCE_STACK 5
 #define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
 
-#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
-#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
-
-#define HPAGE_SHIFT PMD_SHIFT
-#define HPAGE_SIZE     (_AC(1,UL) << HPAGE_SHIFT)
-#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
-#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
-extern unsigned long end_pfn;
-
-void clear_page(void *);
-void copy_page(void *, void *);
-
-#define clear_user_page(page, vaddr, pg)       clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
-
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
-       alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned long pmd; } pmd_t;
-typedef struct { unsigned long pud; } pud_t;
-typedef struct { unsigned long pgd; } pgd_t;
-#define PTE_MASK       PHYSICAL_PAGE_MASK
-
-typedef struct { unsigned long pgprot; } pgprot_t;
-
-extern unsigned long phys_base;
-
-#define pte_val(x)     ((x).pte)
-#define pmd_val(x)     ((x).pmd)
-#define pud_val(x)     ((x).pud)
-#define pgd_val(x)     ((x).pgd)
-#define pgprot_val(x)  ((x).pgprot)
-
-#define __pte(x) ((pte_t) { (x) } )
-#define __pmd(x) ((pmd_t) { (x) } )
-#define __pud(x) ((pud_t) { (x) } )
-#define __pgd(x) ((pgd_t) { (x) } )
-#define __pgprot(x)    ((pgprot_t) { (x) } )
-
-#endif /* !__ASSEMBLY__ */
+#define __PAGE_OFFSET           _AC(0xffff810000000000, UL)
 
 #define __PHYSICAL_START       CONFIG_PHYSICAL_START
 #define __KERNEL_ALIGN         0x200000
@@ -92,53 +39,44 @@ extern unsigned long phys_base;
 
 #define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map     _AC(0xffffffff80000000, UL)
-#define __PAGE_OFFSET           _AC(0xffff810000000000, UL)
-
-/* to align the pointer to the (next) page boundary */
-#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT  46
-#define __PHYSICAL_MASK                ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
 #define __VIRTUAL_MASK_SHIFT   48
-#define __VIRTUAL_MASK         ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
 
 #define KERNEL_TEXT_SIZE  (40*1024*1024)
 #define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
-#define PAGE_OFFSET            __PAGE_OFFSET
 
 #ifndef __ASSEMBLY__
+void clear_page(void *page);
+void copy_page(void *to, void *from);
 
-#include <asm/bug.h>
+extern unsigned long end_pfn;
+extern unsigned long end_pfn_map;
+extern unsigned long phys_base;
 
 extern unsigned long __phys_addr(unsigned long);
+#define __phys_reloc_hide(x)   (x)
 
-#endif /* __ASSEMBLY__ */
-
-#define __pa(x)                __phys_addr((unsigned long)(x))
-#define __pa_symbol(x) __phys_addr((unsigned long)(x))
-
-#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
-#define __boot_va(x)           __va(x)
-#define __boot_pa(x)           __pa(x)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn)         ((pfn) < end_pfn)
-#endif
-
-#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long  pteval_t;
+typedef unsigned long  pmdval_t;
+typedef unsigned long  pudval_t;
+typedef unsigned long  pgdval_t;
+typedef unsigned long  pgprotval_t;
+typedef unsigned long  phys_addr_t;
 
-#define VM_DATA_DEFAULT_FLAGS \
-       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
-        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+typedef struct { pteval_t pte; } pte_t;
 
-#define __HAVE_ARCH_GATE_AREA 1        
 #define vmemmap ((struct page *)VMEMMAP_START)
 
-#include <asm-generic/memory_model.h>
-#include <asm-generic/page.h>
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn)          ((pfn) < end_pfn)
+#endif
 
-#endif /* __KERNEL__ */
 
 #endif /* _X86_64_PAGE_H */
index f59d370c5df4f69300121daaae150e50c9b475cd..d6236eb46466c19ed4ebab33e64b3290ca127f53 100644 (file)
@@ -5,22 +5,37 @@
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/page.h>
+#include <asm/asm.h>
 
 /* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_NONE 0x0
-#define CLBR_EAX 0x1
-#define CLBR_ECX 0x2
-#define CLBR_EDX 0x4
-#define CLBR_ANY 0x7
+#define CLBR_NONE 0
+#define CLBR_EAX  (1 << 0)
+#define CLBR_ECX  (1 << 1)
+#define CLBR_EDX  (1 << 2)
+
+#ifdef CONFIG_X86_64
+#define CLBR_RSI  (1 << 3)
+#define CLBR_RDI  (1 << 4)
+#define CLBR_R8   (1 << 5)
+#define CLBR_R9   (1 << 6)
+#define CLBR_R10  (1 << 7)
+#define CLBR_R11  (1 << 8)
+#define CLBR_ANY  ((1 << 9) - 1)
+#include <asm/desc_defs.h>
+#else
+/* CLBR_ANY should match all the regs the platform has. For i386, that's all of them */
+#define CLBR_ANY  ((1 << 3) - 1)
+#endif /* X86_64 */
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/cpumask.h>
 #include <asm/kmap_types.h>
+#include <asm/desc_defs.h>
 
 struct page;
 struct thread_struct;
-struct Xgt_desc_struct;
+struct desc_ptr;
 struct tss_struct;
 struct mm_struct;
 struct desc_struct;
@@ -86,22 +101,27 @@ struct pv_cpu_ops {
        unsigned long (*read_cr4)(void);
        void (*write_cr4)(unsigned long);
 
+#ifdef CONFIG_X86_64
+       unsigned long (*read_cr8)(void);
+       void (*write_cr8)(unsigned long);
+#endif
+
        /* Segment descriptor handling */
        void (*load_tr_desc)(void);
-       void (*load_gdt)(const struct Xgt_desc_struct *);
-       void (*load_idt)(const struct Xgt_desc_struct *);
-       void (*store_gdt)(struct Xgt_desc_struct *);
-       void (*store_idt)(struct Xgt_desc_struct *);
+       void (*load_gdt)(const struct desc_ptr *);
+       void (*load_idt)(const struct desc_ptr *);
+       void (*store_gdt)(struct desc_ptr *);
+       void (*store_idt)(struct desc_ptr *);
        void (*set_ldt)(const void *desc, unsigned entries);
        unsigned long (*store_tr)(void);
        void (*load_tls)(struct thread_struct *t, unsigned int cpu);
-       void (*write_ldt_entry)(struct desc_struct *,
-                               int entrynum, u32 low, u32 high);
+       void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+                               const void *desc);
        void (*write_gdt_entry)(struct desc_struct *,
-                               int entrynum, u32 low, u32 high);
-       void (*write_idt_entry)(struct desc_struct *,
-                               int entrynum, u32 low, u32 high);
-       void (*load_esp0)(struct tss_struct *tss, struct thread_struct *t);
+                               int entrynum, const void *desc, int size);
+       void (*write_idt_entry)(gate_desc *,
+                               int entrynum, const gate_desc *gate);
+       void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
 
        void (*set_iopl_mask)(unsigned mask);
 
@@ -115,15 +135,18 @@ struct pv_cpu_ops {
        /* MSR, PMC and TSR operations.
           err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
        u64 (*read_msr)(unsigned int msr, int *err);
-       int (*write_msr)(unsigned int msr, u64 val);
+       int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 
        u64 (*read_tsc)(void);
-       u64 (*read_pmc)(void);
+       u64 (*read_pmc)(int counter);
+       unsigned long long (*read_tscp)(unsigned int *aux);
 
        /* These two are jmp to, not actually called. */
-       void (*irq_enable_sysexit)(void);
+       void (*irq_enable_syscall_ret)(void);
        void (*iret)(void);
 
+       void (*swapgs)(void);
+
        struct pv_lazy_ops lazy_mode;
 };
 
@@ -150,9 +173,9 @@ struct pv_apic_ops {
         * Direct APIC operations, principally for VMI.  Ideally
         * these shouldn't be in this interface.
         */
-       void (*apic_write)(unsigned long reg, unsigned long v);
-       void (*apic_write_atomic)(unsigned long reg, unsigned long v);
-       unsigned long (*apic_read)(unsigned long reg);
+       void (*apic_write)(unsigned long reg, u32 v);
+       void (*apic_write_atomic)(unsigned long reg, u32 v);
+       u32 (*apic_read)(unsigned long reg);
        void (*setup_boot_clock)(void);
        void (*setup_secondary_clock)(void);
 
@@ -198,7 +221,7 @@ struct pv_mmu_ops {
 
        /* Hooks for allocating/releasing pagetable pages */
        void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
-       void (*alloc_pd)(u32 pfn);
+       void (*alloc_pd)(struct mm_struct *mm, u32 pfn);
        void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
        void (*release_pt)(u32 pfn);
        void (*release_pd)(u32 pfn);
@@ -212,28 +235,34 @@ struct pv_mmu_ops {
        void (*pte_update_defer)(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep);
 
+       pteval_t (*pte_val)(pte_t);
+       pte_t (*make_pte)(pteval_t pte);
+
+       pgdval_t (*pgd_val)(pgd_t);
+       pgd_t (*make_pgd)(pgdval_t pgd);
+
+#if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
        void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
        void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte);
-       void (*set_pud)(pud_t *pudp, pud_t pudval);
        void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
        void (*pmd_clear)(pmd_t *pmdp);
 
-       unsigned long long (*pte_val)(pte_t);
-       unsigned long long (*pmd_val)(pmd_t);
-       unsigned long long (*pgd_val)(pgd_t);
+#endif /* CONFIG_X86_PAE */
 
-       pte_t (*make_pte)(unsigned long long pte);
-       pmd_t (*make_pmd)(unsigned long long pmd);
-       pgd_t (*make_pgd)(unsigned long long pgd);
-#else
-       unsigned long (*pte_val)(pte_t);
-       unsigned long (*pgd_val)(pgd_t);
+       void (*set_pud)(pud_t *pudp, pud_t pudval);
 
-       pte_t (*make_pte)(unsigned long pte);
-       pgd_t (*make_pgd)(unsigned long pgd);
-#endif
+       pmdval_t (*pmd_val)(pmd_t);
+       pmd_t (*make_pmd)(pmdval_t pmd);
+
+#if PAGETABLE_LEVELS == 4
+       pudval_t (*pud_val)(pud_t);
+       pud_t (*make_pud)(pudval_t pud);
+
+       void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
+#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* PAGETABLE_LEVELS >= 3 */
 
 #ifdef CONFIG_HIGHPTE
        void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
@@ -279,7 +308,8 @@ extern struct pv_mmu_ops pv_mmu_ops;
 #define _paravirt_alt(insn_string, type, clobber)      \
        "771:\n\t" insn_string "\n" "772:\n"            \
        ".pushsection .parainstructions,\"a\"\n"        \
-       "  .long 771b\n"                                \
+       _ASM_ALIGN "\n"                                 \
+       _ASM_PTR " 771b\n"                              \
        "  .byte " type "\n"                            \
        "  .byte 772b-771b\n"                           \
        "  .short " clobber "\n"                        \
@@ -289,6 +319,11 @@ extern struct pv_mmu_ops pv_mmu_ops;
 #define paravirt_alt(insn_string)                                      \
        _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
 
+/* Simple instruction patching code. */
+#define DEF_NATIVE(ops, name, code)                                    \
+       extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+       asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
 unsigned paravirt_patch_nop(void);
 unsigned paravirt_patch_ignore(unsigned len);
 unsigned paravirt_patch_call(void *insnbuf,
@@ -303,6 +338,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
                              const char *start, const char *end);
 
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+                     unsigned long addr, unsigned len);
+
 int paravirt_disable_iospace(void);
 
 /*
@@ -319,7 +357,7 @@ int paravirt_disable_iospace(void);
  * runtime.
  *
  * Normally, a call to a pv_op function is a simple indirect call:
- * (paravirt_ops.operations)(args...).
+ * (pv_op_struct.operations)(args...).
  *
  * Unfortunately, this is a relatively slow operation for modern CPUs,
  * because it cannot necessarily determine what the destination
@@ -329,11 +367,17 @@ int paravirt_disable_iospace(void);
  * calls are essentially free, because the call and return addresses
  * are completely predictable.)
  *
- * These macros rely on the standard gcc "regparm(3)" calling
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
  * convention, in which the first three arguments are placed in %eax,
  * %edx, %ecx (in that order), and the remaining arguments are placed
  * on the stack.  All caller-save registers (eax,edx,ecx) are expected
  * to be modified (either clobbered or used for return values).
+ * x86_64, on the other hand, already specifies a register-based calling
+ * convention, returning in %rax, with parameters going in %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also has to clobber all caller-saved registers, which
+ * unfortunately are quite a few (r8 - r11).
  *
  * The call instruction itself is marked by placing its start address
  * and size into the .parainstructions section, so that
@@ -356,10 +400,12 @@ int paravirt_disable_iospace(void);
  * the return type.  The macro then uses sizeof() on that type to
  * determine whether its a 32 or 64 bit value, and places the return
  * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
- * 64-bit).
+ * 64-bit). For x86_64 machines, it just returns at %rax regardless of
+ * the return value size.
  *
  * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
- * in low,high order.
+ * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
+ * in low,high order
  *
  * Small structures are passed and returned in registers.  The macro
  * calling convention can't directly deal with this, so the wrapper
@@ -369,46 +415,67 @@ int paravirt_disable_iospace(void);
  * means that all uses must be wrapped in inline functions.  This also
  * makes sure the incoming and outgoing types are always correct.
  */
+#ifdef CONFIG_X86_32
+#define PVOP_VCALL_ARGS                        unsigned long __eax, __edx, __ecx
+#define PVOP_CALL_ARGS                 PVOP_VCALL_ARGS
+#define PVOP_VCALL_CLOBBERS            "=a" (__eax), "=d" (__edx),     \
+                                       "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS             PVOP_VCALL_CLOBBERS
+#define EXTRA_CLOBBERS
+#define VEXTRA_CLOBBERS
+#else
+#define PVOP_VCALL_ARGS                unsigned long __edi, __esi, __edx, __ecx
+#define PVOP_CALL_ARGS         PVOP_VCALL_ARGS, __eax
+#define PVOP_VCALL_CLOBBERS    "=D" (__edi),                           \
+                               "=S" (__esi), "=d" (__edx),             \
+                               "=c" (__ecx)
+
+#define PVOP_CALL_CLOBBERS     PVOP_VCALL_CLOBBERS, "=a" (__eax)
+
+#define EXTRA_CLOBBERS  , "r8", "r9", "r10", "r11"
+#define VEXTRA_CLOBBERS         , "rax", "r8", "r9", "r10", "r11"
+#endif
+
 #define __PVOP_CALL(rettype, op, pre, post, ...)                       \
        ({                                                              \
                rettype __ret;                                          \
-               unsigned long __eax, __edx, __ecx;                      \
+               PVOP_CALL_ARGS;                                 \
+               /* This is 32-bit specific, but is okay in 64-bit */    \
+               /* since this condition will never hold */              \
                if (sizeof(rettype) > sizeof(unsigned long)) {          \
                        asm volatile(pre                                \
                                     paravirt_alt(PARAVIRT_CALL)        \
                                     post                               \
-                                    : "=a" (__eax), "=d" (__edx),      \
-                                      "=c" (__ecx)                     \
+                                    : PVOP_CALL_CLOBBERS               \
                                     : paravirt_type(op),               \
                                       paravirt_clobber(CLBR_ANY),      \
                                       ##__VA_ARGS__                    \
-                                    : "memory", "cc");                 \
+                                    : "memory", "cc" EXTRA_CLOBBERS);  \
                        __ret = (rettype)((((u64)__edx) << 32) | __eax); \
                } else {                                                \
                        asm volatile(pre                                \
                                     paravirt_alt(PARAVIRT_CALL)        \
                                     post                               \
-                                    : "=a" (__eax), "=d" (__edx),      \
-                                      "=c" (__ecx)                     \
+                                    : PVOP_CALL_CLOBBERS               \
                                     : paravirt_type(op),               \
                                       paravirt_clobber(CLBR_ANY),      \
                                       ##__VA_ARGS__                    \
-                                    : "memory", "cc");                 \
+                                    : "memory", "cc" EXTRA_CLOBBERS);  \
                        __ret = (rettype)__eax;                         \
                }                                                       \
                __ret;                                                  \
        })
 #define __PVOP_VCALL(op, pre, post, ...)                               \
        ({                                                              \
-               unsigned long __eax, __edx, __ecx;                      \
+               PVOP_VCALL_ARGS;                                        \
                asm volatile(pre                                        \
                             paravirt_alt(PARAVIRT_CALL)                \
                             post                                       \
-                            : "=a" (__eax), "=d" (__edx), "=c" (__ecx) \
+                            : PVOP_VCALL_CLOBBERS                      \
                             : paravirt_type(op),                       \
                               paravirt_clobber(CLBR_ANY),              \
                               ##__VA_ARGS__                            \
-                            : "memory", "cc");                         \
+                            : "memory", "cc" VEXTRA_CLOBBERS);         \
        })
 
 #define PVOP_CALL0(rettype, op)                                                \
@@ -417,22 +484,26 @@ int paravirt_disable_iospace(void);
        __PVOP_VCALL(op, "", "")
 
 #define PVOP_CALL1(rettype, op, arg1)                                  \
-       __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)))
+       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
 #define PVOP_VCALL1(op, arg1)                                          \
-       __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)))
+       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
 
 #define PVOP_CALL2(rettype, op, arg1, arg2)                            \
-       __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
+       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
+       "1" ((unsigned long)(arg2)))
 #define PVOP_VCALL2(op, arg1, arg2)                                    \
-       __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1" ((u32)(arg2)))
+       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
+       "1" ((unsigned long)(arg2)))
 
 #define PVOP_CALL3(rettype, op, arg1, arg2, arg3)                      \
-       __PVOP_CALL(rettype, op, "", "", "0" ((u32)(arg1)),             \
-                   "1"((u32)(arg2)), "2"((u32)(arg3)))
+       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
+       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
 #define PVOP_VCALL3(op, arg1, arg2, arg3)                              \
-       __PVOP_VCALL(op, "", "", "0" ((u32)(arg1)), "1"((u32)(arg2)),   \
-                    "2"((u32)(arg3)))
+       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
+       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
 
+/* This is the only difference in x86_64. We can make it much simpler */
+#ifdef CONFIG_X86_32
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)                        \
        __PVOP_CALL(rettype, op,                                        \
                    "push %[_arg4];", "lea 4(%%esp),%%esp;",            \
@@ -443,16 +514,26 @@ int paravirt_disable_iospace(void);
                    "push %[_arg4];", "lea 4(%%esp),%%esp;",            \
                    "0" ((u32)(arg1)), "1" ((u32)(arg2)),               \
                    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#else
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)                        \
+       __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),   \
+       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),         \
+       "3"((unsigned long)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)                                \
+       __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),           \
+       "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),         \
+       "3"((unsigned long)(arg4)))
+#endif
 
 static inline int paravirt_enabled(void)
 {
        return pv_info.paravirt_enabled;
 }
 
-static inline void load_esp0(struct tss_struct *tss,
+static inline void load_sp0(struct tss_struct *tss,
                             struct thread_struct *thread)
 {
-       PVOP_VCALL2(pv_cpu_ops.load_esp0, tss, thread);
+       PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
 }
 
 #define ARCH_SETUP                     pv_init_ops.arch_setup();
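To ground the pv_*_ops plumbing above, a hedged sketch of how a hypervisor back end overrides a single hook while leaving everything else native; the my_guest_* names are hypothetical, while pv_info, pv_cpu_ops.load_sp0 and native_load_sp0() are assumed from this header and <asm/processor.h>.

#include <linux/init.h>
#include <asm/paravirt.h>
#include <asm/processor.h>

static void my_guest_load_sp0(struct tss_struct *tss, struct thread_struct *t)
{
	/* a real back end would tell the hypervisor about the new kernel
	 * stack here; for the sketch we just fall back to the native code */
	native_load_sp0(tss, t);
}

static void __init my_guest_paravirt_init(void)
{
	pv_info.name = "my_guest";
	pv_cpu_ops.load_sp0 = my_guest_load_sp0;
}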
@@ -540,6 +621,18 @@ static inline void write_cr4(unsigned long x)
        PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
 }
 
+#ifdef CONFIG_X86_64
+static inline unsigned long read_cr8(void)
+{
+       return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+}
+
+static inline void write_cr8(unsigned long x)
+{
+       PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+}
+#endif
+
 static inline void raw_safe_halt(void)
 {
        PVOP_VCALL0(pv_irq_ops.safe_halt);
@@ -613,8 +706,6 @@ static inline unsigned long long paravirt_sched_clock(void)
 }
 #define calculate_cpu_khz() (pv_time_ops.get_cpu_khz())
 
-#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
-
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
        return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
@@ -626,15 +717,36 @@ static inline unsigned long long paravirt_read_pmc(int counter)
        high = _l >> 32;                        \
 } while(0)
 
+static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
+{
+       return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
+}
+
+#define rdtscp(low, high, aux)                         \
+do {                                                   \
+       int __aux;                                      \
+       unsigned long __val = paravirt_rdtscp(&__aux);  \
+       (low) = (u32)__val;                             \
+       (high) = (u32)(__val >> 32);                    \
+       (aux) = __aux;                                  \
+} while (0)
+
+#define rdtscpll(val, aux)                             \
+do {                                                   \
+       unsigned long __aux;                            \
+       val = paravirt_rdtscp(&__aux);                  \
+       (aux) = __aux;                                  \
+} while (0)
+
 static inline void load_TR_desc(void)
 {
        PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
 }
-static inline void load_gdt(const struct Xgt_desc_struct *dtr)
+static inline void load_gdt(const struct desc_ptr *dtr)
 {
        PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
 }
-static inline void load_idt(const struct Xgt_desc_struct *dtr)
+static inline void load_idt(const struct desc_ptr *dtr)
 {
        PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
 }
@@ -642,11 +754,11 @@ static inline void set_ldt(const void *addr, unsigned entries)
 {
        PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
 }
-static inline void store_gdt(struct Xgt_desc_struct *dtr)
+static inline void store_gdt(struct desc_ptr *dtr)
 {
        PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
 }
-static inline void store_idt(struct Xgt_desc_struct *dtr)
+static inline void store_idt(struct desc_ptr *dtr)
 {
        PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
 }
@@ -659,17 +771,22 @@ static inline void load_TLS(struct thread_struct *t, unsigned cpu)
 {
        PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
 }
-static inline void write_ldt_entry(void *dt, int entry, u32 low, u32 high)
+
+static inline void write_ldt_entry(struct desc_struct *dt, int entry,
+                                  const void *desc)
 {
-       PVOP_VCALL4(pv_cpu_ops.write_ldt_entry, dt, entry, low, high);
+       PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
 }
-static inline void write_gdt_entry(void *dt, int entry, u32 low, u32 high)
+
+static inline void write_gdt_entry(struct desc_struct *dt, int entry,
+                                  void *desc, int type)
 {
-       PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, low, high);
+       PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
 }
-static inline void write_idt_entry(void *dt, int entry, u32 low, u32 high)
+
+static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
 {
-       PVOP_VCALL4(pv_cpu_ops.write_idt_entry, dt, entry, low, high);
+       PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
 }
 static inline void set_iopl_mask(unsigned mask)
 {
@@ -690,17 +807,17 @@ static inline void slow_down_io(void) {
 /*
  * Basic functions accessing APICs.
  */
-static inline void apic_write(unsigned long reg, unsigned long v)
+static inline void apic_write(unsigned long reg, u32 v)
 {
        PVOP_VCALL2(pv_apic_ops.apic_write, reg, v);
 }
 
-static inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static inline void apic_write_atomic(unsigned long reg, u32 v)
 {
        PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v);
 }
 
-static inline unsigned long apic_read(unsigned long reg)
+static inline u32 apic_read(unsigned long reg)
 {
        return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg);
 }
@@ -786,9 +903,9 @@ static inline void paravirt_release_pt(unsigned pfn)
        PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
 }
 
-static inline void paravirt_alloc_pd(unsigned pfn)
+static inline void paravirt_alloc_pd(struct mm_struct *mm, unsigned pfn)
 {
-       PVOP_VCALL1(pv_mmu_ops.alloc_pd, pfn);
+       PVOP_VCALL2(pv_mmu_ops.alloc_pd, mm, pfn);
 }
 
 static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
@@ -822,128 +939,236 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
        PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
 }
 
-#ifdef CONFIG_X86_PAE
-static inline pte_t __pte(unsigned long long val)
+static inline pte_t __pte(pteval_t val)
 {
-       unsigned long long ret = PVOP_CALL2(unsigned long long,
-                                           pv_mmu_ops.make_pte,
-                                           val, val >> 32);
-       return (pte_t) { ret, ret >> 32 };
+       pteval_t ret;
+
+       if (sizeof(pteval_t) > sizeof(long))
+               ret = PVOP_CALL2(pteval_t,
+                                pv_mmu_ops.make_pte,
+                                val, (u64)val >> 32);
+       else
+               ret = PVOP_CALL1(pteval_t,
+                                pv_mmu_ops.make_pte,
+                                val);
+
+       return (pte_t) { .pte = ret };
 }
 
-static inline pmd_t __pmd(unsigned long long val)
+static inline pteval_t pte_val(pte_t pte)
 {
-       return (pmd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pmd,
-                                   val, val >> 32) };
+       pteval_t ret;
+
+       if (sizeof(pteval_t) > sizeof(long))
+               ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
+                                pte.pte, (u64)pte.pte >> 32);
+       else
+               ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
+                                pte.pte);
+
+       return ret;
 }
 
-static inline pgd_t __pgd(unsigned long long val)
+static inline pgd_t __pgd(pgdval_t val)
 {
-       return (pgd_t) { PVOP_CALL2(unsigned long long, pv_mmu_ops.make_pgd,
-                                   val, val >> 32) };
+       pgdval_t ret;
+
+       if (sizeof(pgdval_t) > sizeof(long))
+               ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
+                                val, (u64)val >> 32);
+       else
+               ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
+                                val);
+
+       return (pgd_t) { ret };
 }
 
-static inline unsigned long long pte_val(pte_t x)
+static inline pgdval_t pgd_val(pgd_t pgd)
 {
-       return PVOP_CALL2(unsigned long long, pv_mmu_ops.pte_val,
-                         x.pte_low, x.pte_high);
+       pgdval_t ret;
+
+       if (sizeof(pgdval_t) > sizeof(long))
+               ret =  PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
+                                 pgd.pgd, (u64)pgd.pgd >> 32);
+       else
+               ret =  PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
+                                 pgd.pgd);
+
+       return ret;
 }
 
-static inline unsigned long long pmd_val(pmd_t x)
+static inline void set_pte(pte_t *ptep, pte_t pte)
 {
-       return PVOP_CALL2(unsigned long long, pv_mmu_ops.pmd_val,
-                         x.pmd, x.pmd >> 32);
+       if (sizeof(pteval_t) > sizeof(long))
+               PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
+                           pte.pte, (u64)pte.pte >> 32);
+       else
+               PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
+                           pte.pte);
 }
 
-static inline unsigned long long pgd_val(pgd_t x)
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+                             pte_t *ptep, pte_t pte)
 {
-       return PVOP_CALL2(unsigned long long, pv_mmu_ops.pgd_val,
-                         x.pgd, x.pgd >> 32);
+       if (sizeof(pteval_t) > sizeof(long))
+               /* 5 arg words */
+               pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
+       else
+               PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
 }
 
-static inline void set_pte(pte_t *ptep, pte_t pteval)
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
-       PVOP_VCALL3(pv_mmu_ops.set_pte, ptep, pteval.pte_low, pteval.pte_high);
+       pmdval_t val = native_pmd_val(pmd);
+
+       if (sizeof(pmdval_t) > sizeof(long))
+               PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
+       else
+               PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep, pte_t pteval)
+#if PAGETABLE_LEVELS >= 3
+static inline pmd_t __pmd(pmdval_t val)
 {
-       /* 5 arg words */
-       pv_mmu_ops.set_pte_at(mm, addr, ptep, pteval);
+       pmdval_t ret;
+
+       if (sizeof(pmdval_t) > sizeof(long))
+               ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
+                                val, (u64)val >> 32);
+       else
+               ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
+                                val);
+
+       return (pmd_t) { ret };
 }
 
-static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
+static inline pmdval_t pmd_val(pmd_t pmd)
 {
-       PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
-                   pteval.pte_low, pteval.pte_high);
+       pmdval_t ret;
+
+       if (sizeof(pmdval_t) > sizeof(long))
+               ret =  PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
+                                 pmd.pmd, (u64)pmd.pmd >> 32);
+       else
+               ret =  PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
+                                 pmd.pmd);
+
+       return ret;
 }
 
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
-                                  pte_t *ptep, pte_t pte)
+static inline void set_pud(pud_t *pudp, pud_t pud)
 {
-       /* 5 arg words */
-       pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
+       pudval_t val = native_pud_val(pud);
+
+       if (sizeof(pudval_t) > sizeof(long))
+               PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
+                           val, (u64)val >> 32);
+       else
+               PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
+                           val);
+}
+#if PAGETABLE_LEVELS == 4
+static inline pud_t __pud(pudval_t val)
+{
+       pudval_t ret;
+
+       if (sizeof(pudval_t) > sizeof(long))
+               ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
+                                val, (u64)val >> 32);
+       else
+               ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
+                                val);
+
+       return (pud_t) { ret };
 }
 
-static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static inline pudval_t pud_val(pud_t pud)
 {
-       PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp,
-                   pmdval.pmd, pmdval.pmd >> 32);
+       pudval_t ret;
+
+       if (sizeof(pudval_t) > sizeof(long))
+               ret =  PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
+                                 pud.pud, (u64)pud.pud >> 32);
+       else
+               ret =  PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
+                                 pud.pud);
+
+       return ret;
 }
 
-static inline void set_pud(pud_t *pudp, pud_t pudval)
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-       PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
-                   pudval.pgd.pgd, pudval.pgd.pgd >> 32);
+       pgdval_t val = native_pgd_val(pgd);
+
+       if (sizeof(pgdval_t) > sizeof(long))
+               PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
+                           val, (u64)val >> 32);
+       else
+               PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
+                           val);
 }
 
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline void pgd_clear(pgd_t *pgdp)
 {
-       PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+       set_pgd(pgdp, __pgd(0));
 }
 
-static inline void pmd_clear(pmd_t *pmdp)
+static inline void pud_clear(pud_t *pudp)
 {
-       PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+       set_pud(pudp, __pud(0));
 }
 
-#else  /* !CONFIG_X86_PAE */
+#endif /* PAGETABLE_LEVELS == 4 */
 
-static inline pte_t __pte(unsigned long val)
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_X86_PAE
+/* Special-case pte-setting operations for PAE, which can't update a
+   64-bit pte atomically */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-       return (pte_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pte, val) };
+       PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
+                   pte.pte, pte.pte >> 32);
 }
 
-static inline pgd_t __pgd(unsigned long val)
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+                                  pte_t *ptep, pte_t pte)
 {
-       return (pgd_t) { PVOP_CALL1(unsigned long, pv_mmu_ops.make_pgd, val) };
+       /* 5 arg words */
+       pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
 }
 
-static inline unsigned long pte_val(pte_t x)
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+                            pte_t *ptep)
 {
-       return PVOP_CALL1(unsigned long, pv_mmu_ops.pte_val, x.pte_low);
+       PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
 }
 
-static inline unsigned long pgd_val(pgd_t x)
+static inline void pmd_clear(pmd_t *pmdp)
+{
+       PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+}
+#else  /* !CONFIG_X86_PAE */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-       return PVOP_CALL1(unsigned long, pv_mmu_ops.pgd_val, x.pgd);
+       set_pte(ptep, pte);
 }
 
-static inline void set_pte(pte_t *ptep, pte_t pteval)
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
+                                  pte_t *ptep, pte_t pte)
 {
-       PVOP_VCALL2(pv_mmu_ops.set_pte, ptep, pteval.pte_low);
+       set_pte(ptep, pte);
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep, pte_t pteval)
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+                            pte_t *ptep)
 {
-       PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pteval.pte_low);
+       set_pte_at(mm, addr, ptep, __pte(0));
 }
 
-static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+static inline void pmd_clear(pmd_t *pmdp)
 {
-       PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, pmdval.pud.pgd.pgd);
+       set_pmd(pmdp, __pmd(0));
 }
 #endif /* CONFIG_X86_PAE */
 
@@ -1014,52 +1239,68 @@ struct paravirt_patch_site {
 extern struct paravirt_patch_site __parainstructions[],
        __parainstructions_end[];
 
+#ifdef CONFIG_X86_32
+#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
+#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_FLAGS_ARG "0"
+#define PV_EXTRA_CLOBBERS
+#define PV_VEXTRA_CLOBBERS
+#else
+/* We save some registers, but saving all of them would be too much. We
+ * clobber all caller-saved registers except the argument register. */
+#define PV_SAVE_REGS "pushq %%rdi;"
+#define PV_RESTORE_REGS "popq %%rdi;"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
+#define PV_FLAGS_ARG "D"
+#endif
+
 static inline unsigned long __raw_local_save_flags(void)
 {
        unsigned long f;
 
-       asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
+       asm volatile(paravirt_alt(PV_SAVE_REGS
                                  PARAVIRT_CALL
-                                 "popl %%edx; popl %%ecx")
+                                 PV_RESTORE_REGS)
                     : "=a"(f)
                     : paravirt_type(pv_irq_ops.save_fl),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "cc");
+                    : "memory", "cc" PV_VEXTRA_CLOBBERS);
        return f;
 }
 
 static inline void raw_local_irq_restore(unsigned long f)
 {
-       asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
+       asm volatile(paravirt_alt(PV_SAVE_REGS
                                  PARAVIRT_CALL
-                                 "popl %%edx; popl %%ecx")
+                                 PV_RESTORE_REGS)
                     : "=a"(f)
-                    : "0"(f),
+                    : PV_FLAGS_ARG(f),
                       paravirt_type(pv_irq_ops.restore_fl),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "cc");
+                    : "memory", "cc" PV_EXTRA_CLOBBERS);
 }
 
 static inline void raw_local_irq_disable(void)
 {
-       asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
+       asm volatile(paravirt_alt(PV_SAVE_REGS
                                  PARAVIRT_CALL
-                                 "popl %%edx; popl %%ecx")
+                                 PV_RESTORE_REGS)
                     :
                     : paravirt_type(pv_irq_ops.irq_disable),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "eax", "cc");
+                    : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
 }
 
 static inline void raw_local_irq_enable(void)
 {
-       asm volatile(paravirt_alt("pushl %%ecx; pushl %%edx;"
+       asm volatile(paravirt_alt(PV_SAVE_REGS
                                  PARAVIRT_CALL
-                                 "popl %%edx; popl %%ecx")
+                                 PV_RESTORE_REGS)
                     :
                     : paravirt_type(pv_irq_ops.irq_enable),
                       paravirt_clobber(CLBR_EAX)
-                    : "memory", "eax", "cc");
+                    : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
 }
 
 static inline unsigned long __raw_local_irq_save(void)
@@ -1071,27 +1312,6 @@ static inline unsigned long __raw_local_irq_save(void)
        return f;
 }
 
-#define CLI_STRING                                                     \
-       _paravirt_alt("pushl %%ecx; pushl %%edx;"                       \
-                     "call *%[paravirt_cli_opptr];"                    \
-                     "popl %%edx; popl %%ecx",                         \
-                     "%c[paravirt_cli_type]", "%c[paravirt_clobber]")
-
-#define STI_STRING                                                     \
-       _paravirt_alt("pushl %%ecx; pushl %%edx;"                       \
-                     "call *%[paravirt_sti_opptr];"                    \
-                     "popl %%edx; popl %%ecx",                         \
-                     "%c[paravirt_sti_type]", "%c[paravirt_clobber]")
-
-#define CLI_STI_CLOBBERS , "%eax"
-#define CLI_STI_INPUT_ARGS                                             \
-       ,                                                               \
-       [paravirt_cli_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_disable)),               \
-       [paravirt_cli_opptr] "m" (pv_irq_ops.irq_disable),              \
-       [paravirt_sti_type] "i" (PARAVIRT_PATCH(pv_irq_ops.irq_enable)),                \
-       [paravirt_sti_opptr] "m" (pv_irq_ops.irq_enable),               \
-       paravirt_clobber(CLBR_EAX)
-
 /* Make sure as little as possible of this mess escapes. */
 #undef PARAVIRT_CALL
 #undef __PVOP_CALL
@@ -1109,43 +1329,72 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #else  /* __ASSEMBLY__ */
 
-#define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
-
-#define PARA_SITE(ptype, clobbers, ops)                \
+#define _PVSITE(ptype, clobbers, ops, word, algn)      \
 771:;                                          \
        ops;                                    \
 772:;                                          \
        .pushsection .parainstructions,"a";     \
-        .long 771b;                            \
+        .align algn;                           \
+        word 771b;                             \
         .byte ptype;                           \
         .byte 772b-771b;                       \
         .short clobbers;                       \
        .popsection
 
+
+#ifdef CONFIG_X86_64
+#define PV_SAVE_REGS   pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
+#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+#define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+#else
+#define PV_SAVE_REGS   pushl %eax; pushl %edi; pushl %ecx; pushl %edx
+#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
+#endif
+
 #define INTERRUPT_RETURN                                               \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,       \
                  jmp *%cs:pv_cpu_ops+PV_CPU_iret)
 
 #define DISABLE_INTERRUPTS(clobbers)                                   \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
-                 pushl %eax; pushl %ecx; pushl %edx;                   \
+                 PV_SAVE_REGS;                 \
                  call *%cs:pv_irq_ops+PV_IRQ_irq_disable;              \
-                 popl %edx; popl %ecx; popl %eax)                      \
+                 PV_RESTORE_REGS;)                     \
 
 #define ENABLE_INTERRUPTS(clobbers)                                    \
        PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,  \
-                 pushl %eax; pushl %ecx; pushl %edx;                   \
+                 PV_SAVE_REGS;                 \
                  call *%cs:pv_irq_ops+PV_IRQ_irq_enable;               \
-                 popl %edx; popl %ecx; popl %eax)
+                 PV_RESTORE_REGS;)
+
+#define ENABLE_INTERRUPTS_SYSCALL_RET                                  \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\
+                 CLBR_NONE,                                            \
+                 jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)
 
-#define ENABLE_INTERRUPTS_SYSEXIT                                             \
-       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), CLBR_NONE,\
-                 jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_sysexit)
 
+#ifdef CONFIG_X86_32
 #define GET_CR0_INTO_EAX                       \
        push %ecx; push %edx;                   \
        call *pv_cpu_ops+PV_CPU_read_cr0;       \
        pop %edx; pop %ecx
+#else
+#define SWAPGS                                                         \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,     \
+                 PV_SAVE_REGS;                                         \
+                 call *pv_cpu_ops+PV_CPU_swapgs;                       \
+                 PV_RESTORE_REGS                                       \
+                )
+
+#define GET_CR2_INTO_RCX                       \
+       call *pv_mmu_ops+PV_MMU_read_cr2;       \
+       movq %rax, %rcx;                        \
+       xorq %rax, %rax;
+
+#endif
 
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */
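For illustration, a minimal userspace sketch of the dispatch idiom the unified __pte()/pte_val() helpers above rely on: when the page-table entry type is wider than long (32-bit PAE), the value is split into low and high 32-bit call arguments, otherwise it is passed through as a single word. The names demo_pteval_t, make_pte_split and make_pte_word are invented for this sketch and are not kernel interfaces.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_pteval_t;         /* PAE-style: entries wider than a 32-bit long */

/* stand-ins for the two calling conventions of pv_mmu_ops.make_pte */
static demo_pteval_t make_pte_split(uint32_t low, uint32_t high)
{
        return ((demo_pteval_t)high << 32) | low;
}

static demo_pteval_t make_pte_word(unsigned long val)
{
        return val;
}

static demo_pteval_t demo_make_pte(demo_pteval_t val)
{
        /* sizeof() is a compile-time constant, so only one branch survives */
        if (sizeof(demo_pteval_t) > sizeof(long))
                return make_pte_split((uint32_t)val, (uint32_t)(val >> 32));
        else
                return make_pte_word((unsigned long)val);
}

int main(void)
{
        demo_pteval_t pte = demo_make_pte(0x8000000000000063ULL);

        printf("pte = %#llx\n", (unsigned long long)pte);
        return 0;
}

On an LP64 build the plain-word path is taken; on a 32-bit build the value is split, which is exactly what the (u64)val >> 32 arguments do in the helpers above.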
index e883619663476e7f0659e6ebafb3c5d1a6712870..c61190cb9e12a81ea6b1e75caa5e46d78daf6ace 100644 (file)
@@ -66,6 +66,7 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 
 #ifdef CONFIG_PCI
+extern void early_quirks(void);
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
                                        enum pci_dma_burst_strategy *strat,
                                        unsigned long *strategy_parameter)
@@ -73,9 +74,10 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
        *strat = PCI_DMA_BURST_INFINITY;
        *strategy_parameter = ~0UL;
 }
+#else
+static inline void early_quirks(void) { }
 #endif
 
-
 #endif  /* __KERNEL__ */
 
 #ifdef CONFIG_X86_32
@@ -90,6 +92,19 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 /* generic pci stuff */
 #include <asm-generic/pci.h>
 
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(struct pci_bus *bus)
+{
+       struct pci_sysdata *sd = bus->sysdata;
 
+       return sd->node;
+}
+
+static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
+{
+       return node_to_cpumask(__pcibus_to_node(bus));
+}
+#endif
 
 #endif
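The early_quirks() hunk above uses a common header idiom: declare the real function under CONFIG_PCI and provide an empty static inline stub otherwise, so call sites never need an #ifdef of their own. A self-contained sketch of the same pattern, with an invented DEMO_HAVE_PCI switch and demo_early_quirks() function (neither is a kernel symbol):

#include <stdio.h>

#define DEMO_HAVE_PCI 1                 /* set to 0 to get the empty stub */

#if DEMO_HAVE_PCI
static void demo_early_quirks(void)     /* the "real" implementation */
{
        printf("scanning for early PCI quirks\n");
}
#else
static inline void demo_early_quirks(void) { }  /* compiles away entirely */
#endif

int main(void)
{
        demo_early_quirks();            /* the caller stays ifdef-free */
        return 0;
}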
index ef54226a9325a926a1c482ee7ce03d1410478f7b..374690314539c6858ad465396e6ba8397048f8da 100644 (file)
@@ -26,7 +26,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int l
 
 
 extern void pci_iommu_alloc(void);
-extern int iommu_setup(char *opt);
 
 /* The PCI address space does equal the physical memory
  * address space.  The networking and block device layers use
index 35962bbe5e72ad1219e576c911cf9af1f6aaecfe..c0305bff0f19349316ef4d61efa85f4a5e38f7a2 100644 (file)
@@ -7,22 +7,22 @@
 #include <linux/cache.h>
 #include <asm/page.h>
 
-/* Per processor datastructure. %gs points to it while the kernel runs */ 
+/* Per-processor data structure. %gs points to it while the kernel runs */
 struct x8664_pda {
        struct task_struct *pcurrent;   /* 0  Current process */
        unsigned long data_offset;      /* 8 Per cpu data offset from linker
                                           address */
-       unsigned long kernelstack;  /* 16 top of kernel stack for current */
-       unsigned long oldrsp;       /* 24 user rsp for system call */
-        int irqcount;              /* 32 Irq nesting counter. Starts with -1 */
-       int cpunumber;              /* 36 Logical CPU number */
+       unsigned long kernelstack;      /* 16 top of kernel stack for current */
+       unsigned long oldrsp;           /* 24 user rsp for system call */
+       int irqcount;                   /* 32 Irq nesting counter. Starts at -1 */
+       unsigned int cpunumber;         /* 36 Logical CPU number */
 #ifdef CONFIG_CC_STACKPROTECTOR
        unsigned long stack_canary;     /* 40 stack canary value */
                                        /* gcc-ABI: this canary MUST be at
                                           offset 40!!! */
 #endif
        char *irqstackptr;
-       int nodenumber;             /* number of current node */
+       unsigned int nodenumber;        /* number of current node */
        unsigned int __softirq_pending;
        unsigned int __nmi_count;       /* number of NMI on this CPUs */
        short mmu_state;
@@ -40,13 +40,14 @@ struct x8664_pda {
 
 extern struct x8664_pda *_cpu_pda[];
 extern struct x8664_pda boot_cpu_pda[];
+extern void pda_init(int);
 
 #define cpu_pda(i) (_cpu_pda[i])
 
-/* 
+/*
  * There is no fast way to get the base address of the PDA, all the accesses
  * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
- */ 
+ */
 extern void __bad_pda_field(void) __attribute__((noreturn));
 
 /*
@@ -57,70 +58,70 @@ extern struct x8664_pda _proxy_pda;
 
 #define pda_offset(field) offsetof(struct x8664_pda, field)
 
-#define pda_to_op(op,field,val) do {           \
+#define pda_to_op(op, field, val) do {         \
        typedef typeof(_proxy_pda.field) T__;   \
        if (0) { T__ tmp__; tmp__ = (val); }    /* type checking */ \
        switch (sizeof(_proxy_pda.field)) {     \
        case 2:                                 \
-               asm(op "w %1,%%gs:%c2" :        \
+               asm(op "w %1,%%gs:%c2" :        \
                    "+m" (_proxy_pda.field) :   \
                    "ri" ((T__)val),            \
-                   "i"(pda_offset(field)));    \
-               break;                          \
+                   "i"(pda_offset(field)));    \
+               break;                          \
        case 4:                                 \
-               asm(op "l %1,%%gs:%c2" :        \
+               asm(op "l %1,%%gs:%c2" :        \
                    "+m" (_proxy_pda.field) :   \
                    "ri" ((T__)val),            \
-                   "i" (pda_offset(field)));   \
+                   "i" (pda_offset(field)));   \
                break;                          \
        case 8:                                 \
-               asm(op "q %1,%%gs:%c2":         \
+               asm(op "q %1,%%gs:%c2":         \
                    "+m" (_proxy_pda.field) :   \
                    "ri" ((T__)val),            \
-                   "i"(pda_offset(field)));    \
+                   "i"(pda_offset(field)));    \
                break;                          \
-       default:                                \
+       default:                                \
                __bad_pda_field();              \
-       }                                       \
-       } while (0)
+       }                                       \
+       } while (0)
 
 #define pda_from_op(op,field) ({               \
        typeof(_proxy_pda.field) ret__;         \
        switch (sizeof(_proxy_pda.field)) {     \
-               case 2:                                 \
-               asm(op "w %%gs:%c1,%0" :        \
+       case 2:                                 \
+               asm(op "w %%gs:%c1,%0" :        \
                    "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
+                   "i" (pda_offset(field)),    \
+                   "m" (_proxy_pda.field));    \
                 break;                         \
        case 4:                                 \
                asm(op "l %%gs:%c1,%0":         \
                    "=r" (ret__):               \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
+                   "i" (pda_offset(field)),    \
+                   "m" (_proxy_pda.field));    \
                 break;                         \
-       case 8:                                 \
+       case 8:                                 \
                asm(op "q %%gs:%c1,%0":         \
                    "=r" (ret__) :              \
-                   "i" (pda_offset(field)),    \
-                   "m" (_proxy_pda.field));    \
+                   "i" (pda_offset(field)),    \
+                   "m" (_proxy_pda.field));    \
                 break;                         \
-       default:                                \
+       default:                                \
                __bad_pda_field();              \
        }                                       \
        ret__; })
 
-#define read_pda(field) pda_from_op("mov",field)
-#define write_pda(field,val) pda_to_op("mov",field,val)
-#define add_pda(field,val) pda_to_op("add",field,val)
-#define sub_pda(field,val) pda_to_op("sub",field,val)
-#define or_pda(field,val) pda_to_op("or",field,val)
+#define read_pda(field)                pda_from_op("mov", field)
+#define write_pda(field, val)  pda_to_op("mov", field, val)
+#define add_pda(field, val)    pda_to_op("add", field, val)
+#define sub_pda(field, val)    pda_to_op("sub", field, val)
+#define or_pda(field, val)     pda_to_op("or", field, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define test_and_clear_bit_pda(bit,field) ({           \
+#define test_and_clear_bit_pda(bit, field) ({          \
        int old__;                                              \
        asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0"            \
-           : "=r" (old__), "+m" (_proxy_pda.field)             \
+           : "=r" (old__), "+m" (_proxy_pda.field)             \
            : "dIr" (bit), "i" (pda_offset(field)) : "memory"); \
        old__;                                                  \
 })
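The pda_to_op()/pda_from_op() macros above pick the mov width (w/l/q) by switching on sizeof() of the field, and turn any unsupported width into a link failure through __bad_pda_field(). A rough userspace sketch of the same size dispatch, without the %gs segment access; struct demo_pda, demo_read_pda() and its fields are invented for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_pda {
        uint16_t mmu_state;
        uint32_t cpunumber;
        uint64_t kernelstack;
};

static struct demo_pda pda;

/* choose the access width from the field size, as pda_from_op() does;
 * an unsupported size falls through to the default branch (the kernel
 * calls __bad_pda_field() there, which fails at link time) */
#define demo_read_pda(field) ({                                                \
        typeof(pda.field) ret__ = 0;                                           \
        switch (sizeof(pda.field)) {                                           \
        case 2: { uint16_t v; memcpy(&v, &pda.field, 2); ret__ = v; break; }   \
        case 4: { uint32_t v; memcpy(&v, &pda.field, 4); ret__ = v; break; }   \
        case 8: { uint64_t v; memcpy(&v, &pda.field, 8); ret__ = v; break; }   \
        }                                                                      \
        ret__; })

int main(void)
{
        pda.cpunumber = 3;
        pda.kernelstack = 0xffff880000001000ULL;

        printf("cpu %u, stack %#llx\n",
               (unsigned)demo_read_pda(cpunumber),
               (unsigned long long)demo_read_pda(kernelstack));
        return 0;
}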
index a1aaad274ccae82a8b9ac36671505b6fc0db6fda..0dec00f27eb45db5a853cfa621a2513d1b860953 100644 (file)
@@ -1,5 +1,142 @@
-#ifdef CONFIG_X86_32
-# include "percpu_32.h"
-#else
-# include "percpu_64.h"
+#ifndef _ASM_X86_PERCPU_H_
+#define _ASM_X86_PERCPU_H_
+
+#ifdef CONFIG_X86_64
+#include <linux/compiler.h>
+
+/* Same as asm-generic/percpu.h, except that we store the per cpu offset
+   in the PDA. Longer term the PDA and every per cpu variable
+   should be just put into a single section and referenced directly
+   from %gs */
+
+#ifdef CONFIG_SMP
+#include <asm/pda.h>
+
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+#define __my_cpu_offset read_pda(data_offset)
+
+#define per_cpu_offset(x) (__per_cpu_offset(x))
+
 #endif
+#include <asm-generic/percpu.h>
+
+DECLARE_PER_CPU(struct x8664_pda, pda);
+
+#else /* CONFIG_X86_64 */
+
+#ifdef __ASSEMBLY__
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    reg - 32bit register
+ *
+ * The resulting address is stored in the "reg" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, reg)                              \
+       movl %fs:per_cpu__##this_cpu_off, reg;          \
+       lea per_cpu__##var(reg), reg
+#define PER_CPU_VAR(var)       %fs:per_cpu__##var
+#else /* ! SMP */
+#define PER_CPU(var, reg)                      \
+       movl $per_cpu__##var, reg
+#define PER_CPU_VAR(var)       per_cpu__##var
+#endif /* SMP */
+
+#else /* ...!ASSEMBLY */
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ *    var - variable name
+ *    cpu - 32bit register containing the current CPU number
+ *
+ * The resulting address is stored in the "cpu" argument.
+ *
+ * Example:
+ *    PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+
+#define __my_cpu_offset x86_read_percpu(this_cpu_off)
+
+/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
+#define __percpu_seg "%%fs:"
+
+#else  /* !SMP */
+
+#define __percpu_seg ""
+
+#endif /* SMP */
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
+/* For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though). */
+extern void __bad_percpu_size(void);
+
+#define percpu_to_op(op,var,val)                               \
+       do {                                                    \
+               typedef typeof(var) T__;                        \
+               if (0) { T__ tmp__; tmp__ = (val); }            \
+               switch (sizeof(var)) {                          \
+               case 1:                                         \
+                       asm(op "b %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               case 2:                                         \
+                       asm(op "w %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               case 4:                                         \
+                       asm(op "l %1,"__percpu_seg"%0"          \
+                           : "+m" (var)                        \
+                           :"ri" ((T__)val));                  \
+                       break;                                  \
+               default: __bad_percpu_size();                   \
+               }                                               \
+       } while (0)
+
+#define percpu_from_op(op,var)                                 \
+       ({                                                      \
+               typeof(var) ret__;                              \
+               switch (sizeof(var)) {                          \
+               case 1:                                         \
+                       asm(op "b "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               case 2:                                         \
+                       asm(op "w "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               case 4:                                         \
+                       asm(op "l "__percpu_seg"%1,%0"          \
+                           : "=r" (ret__)                      \
+                           : "m" (var));                       \
+                       break;                                  \
+               default: __bad_percpu_size();                   \
+               }                                               \
+               ret__; })
+
+#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
+#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
+#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
+#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
+#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
+#endif /* !__ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+#endif /* _ASM_X86_PERCPU_H_ */
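A small sketch of how the __percpu_seg string is pasted into the instruction template by percpu_from_op() above: on !SMP the prefix is empty and the access is an ordinary memory operand, on SMP it becomes %fs:. This uses GCC-style inline asm on x86 and mirrors the 32-bit movl case only; DEMO_SEG, this_cpu_value and demo_read() are invented names.

#include <stdio.h>

#define DEMO_SEG ""                     /* "" on !SMP; the SMP build uses "%%fs:" */

static int this_cpu_value = 42;

/* string concatenation builds the final template, "movl %1,%0" here,
 * or "movl %%fs:%1,%0" if DEMO_SEG carried the segment prefix */
#define demo_read(var) ({                                              \
        int ret__;                                                     \
        asm("movl " DEMO_SEG "%1,%0" : "=r" (ret__) : "m" (var));      \
        ret__; })

int main(void)
{
        printf("%d\n", demo_read(this_cpu_value));      /* prints 42 */
        return 0;
}

Only the empty-prefix form is meant to run standalone; the %fs: variant needs the per-cpu segment base set up by the kernel.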
diff --git a/include/asm-x86/percpu_32.h b/include/asm-x86/percpu_32.h
deleted file mode 100644 (file)
index a7ebd43..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef __ARCH_I386_PERCPU__
-#define __ARCH_I386_PERCPU__
-
-#ifdef __ASSEMBLY__
-
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    reg - 32bit register
- *
- * The resulting address is stored in the "reg" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-#define PER_CPU(var, reg)                              \
-       movl %fs:per_cpu__##this_cpu_off, reg;          \
-       lea per_cpu__##var(reg), reg
-#define PER_CPU_VAR(var)       %fs:per_cpu__##var
-#else /* ! SMP */
-#define PER_CPU(var, reg)                      \
-       movl $per_cpu__##var, reg
-#define PER_CPU_VAR(var)       per_cpu__##var
-#endif /* SMP */
-
-#else /* ...!ASSEMBLY */
-
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
-#ifdef CONFIG_SMP
-/* Same as generic implementation except for optimized local access. */
-#define __GENERIC_PER_CPU
-
-/* This is used for other cpus to find our section. */
-extern unsigned long __per_cpu_offset[];
-
-#define per_cpu_offset(x) (__per_cpu_offset[x])
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_aligned_in_smp
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_indentifier_##var(void);      \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
-
-#define __raw_get_cpu_var(var) (*({                                    \
-       extern int simple_indentifier_##var(void);                      \
-       RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off));     \
-}))
-
-#define __get_cpu_var(var) __raw_get_cpu_var(var)
-
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)                     \
-do {                                                           \
-       unsigned int __i;                                       \
-       for_each_possible_cpu(__i)                              \
-               memcpy((pcpudst)+__per_cpu_offset[__i],         \
-                      (src), (size));                          \
-} while (0)
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
-#define __percpu_seg "%%fs:"
-#else  /* !SMP */
-#include <asm-generic/percpu.h>
-#define __percpu_seg ""
-#endif /* SMP */
-
-/* For arch-specific code, we can use direct single-insn ops (they
- * don't give an lvalue though). */
-extern void __bad_percpu_size(void);
-
-#define percpu_to_op(op,var,val)                               \
-       do {                                                    \
-               typedef typeof(var) T__;                        \
-               if (0) { T__ tmp__; tmp__ = (val); }            \
-               switch (sizeof(var)) {                          \
-               case 1:                                         \
-                       asm(op "b %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               case 2:                                         \
-                       asm(op "w %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               case 4:                                         \
-                       asm(op "l %1,"__percpu_seg"%0"          \
-                           : "+m" (var)                        \
-                           :"ri" ((T__)val));                  \
-                       break;                                  \
-               default: __bad_percpu_size();                   \
-               }                                               \
-       } while (0)
-
-#define percpu_from_op(op,var)                                 \
-       ({                                                      \
-               typeof(var) ret__;                              \
-               switch (sizeof(var)) {                          \
-               case 1:                                         \
-                       asm(op "b "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               case 2:                                         \
-                       asm(op "w "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               case 4:                                         \
-                       asm(op "l "__percpu_seg"%1,%0"          \
-                           : "=r" (ret__)                      \
-                           : "m" (var));                       \
-                       break;                                  \
-               default: __bad_percpu_size();                   \
-               }                                               \
-               ret__; })
-
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val)
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ARCH_I386_PERCPU__ */
diff --git a/include/asm-x86/percpu_64.h b/include/asm-x86/percpu_64.h
deleted file mode 100644 (file)
index 5abd482..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _ASM_X8664_PERCPU_H_
-#define _ASM_X8664_PERCPU_H_
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
-   in the PDA. Longer term the PDA and every per cpu variable
-   should be just put into a single section and referenced directly
-   from %gs */
-
-#ifdef CONFIG_SMP
-
-#include <asm/pda.h>
-
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
-#define __my_cpu_offset() read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
-/* Separate out the type, so (int[3], foo) works. */
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)              \
-    __attribute__((__section__(".data.percpu.shared_aligned"))) \
-    __typeof__(type) per_cpu__##name                           \
-    ____cacheline_internodealigned_in_smp
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*({                         \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)); }))
-#define __get_cpu_var(var) (*({                                \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
-#define __raw_get_cpu_var(var) (*({                    \
-       extern int simple_identifier_##var(void);       \
-       RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
-
-/* A macro to avoid #include hell... */
-#define percpu_modcopy(pcpudst, src, size)                     \
-do {                                                           \
-       unsigned int __i;                                       \
-       for_each_possible_cpu(__i)                              \
-               memcpy((pcpudst)+__per_cpu_offset(__i),         \
-                      (src), (size));                          \
-} while (0)
-
-extern void setup_per_cpu_areas(void);
-
-#else /* ! SMP */
-
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) per_cpu__##name
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)      \
-    DEFINE_PER_CPU(type, name)
-
-#define per_cpu(var, cpu)                      (*((void)(cpu), &per_cpu__##var))
-#define __get_cpu_var(var)                     per_cpu__##var
-#define __raw_get_cpu_var(var)                 per_cpu__##var
-
-#endif /* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-
-#endif /* _ASM_X8664_PERCPU_H_ */
index f2fc33ceb9f21419121a4bfbdf8093f73ca5dad7..10c2b452e64c87b4a1abf0d8058e4d48eabbc5a5 100644 (file)
@@ -3,31 +3,33 @@
 
 #include <linux/threads.h>
 #include <linux/mm.h>          /* for struct page */
+#include <asm/tlb.h>
+#include <asm-generic/tlb.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
 #define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(pfn) do { } while (0)
-#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
 #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
 #define paravirt_release_pt(pfn) do { } while (0)
 #define paravirt_release_pd(pfn) do { } while (0)
 #endif
 
-#define pmd_populate_kernel(mm, pmd, pte)                      \
-do {                                                           \
-       paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);         \
-       set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));           \
-} while (0)
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+                                      pmd_t *pmd, pte_t *pte)
+{
+       paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
+       set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+{
+       unsigned long pfn = page_to_pfn(pte);
 
-#define pmd_populate(mm, pmd, pte)                             \
-do {                                                           \
-       paravirt_alloc_pt(mm, page_to_pfn(pte));                \
-       set_pmd(pmd, __pmd(_PAGE_TABLE +                        \
-               ((unsigned long long)page_to_pfn(pte) <<        \
-                       (unsigned long long) PAGE_SHIFT)));     \
-} while (0)
+       paravirt_alloc_pt(mm, pfn);
+       set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
 
 /*
  * Allocate and free page tables.
@@ -49,20 +51,55 @@ static inline void pte_free(struct page *pte)
 }
 
 
-#define __pte_free_tlb(tlb,pte)                                        \
-do {                                                                   \
-       paravirt_release_pt(page_to_pfn(pte));                          \
-       tlb_remove_page((tlb),(pte));                                   \
-} while (0)
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+       paravirt_release_pt(page_to_pfn(pte));
+       tlb_remove_page(tlb, pte);
+}
 
 #ifdef CONFIG_X86_PAE
 /*
  * In the PAE case we free the pmds as part of the pgd.
  */
-#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
-#define pmd_free(x)                    do { } while (0)
-#define __pmd_free_tlb(tlb,x)          do { } while (0)
-#define pud_populate(mm, pmd, pte)     BUG()
-#endif
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(pmd_t *pmd)
+{
+       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+       free_page((unsigned long)pmd);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+       /* This is called just after the pmd has been detached from
+          the pgd, which requires a full tlb flush to be recognized
+          by the CPU.  Rather than incurring multiple tlb flushes
+          while the address space is being pulled down, make the tlb
+          gathering machinery do a full flush when we're done. */
+       tlb->fullmm = 1;
+
+       paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+       tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+       paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+       /* Note: almost everything apart from _PAGE_PRESENT is
+          reserved at the pmd (PDPT) level. */
+       set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+       /*
+        * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+        * the TLB via cr3 if the top-level pgd is changed...
+        */
+       if (mm == current->active_mm)
+               write_cr3(read_cr3());
+}
+#endif /* CONFIG_X86_PAE */
 
 #endif /* _I386_PGALLOC_H */
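The rewritten pmd_populate() above widens the pfn to pteval_t before shifting, so that under PAE page frames above 4GB are not truncated, and then ORs in _PAGE_TABLE. A tiny sketch of that composition; the DEMO_* names and demo_mk_table_entry() are invented, and 0x67 matches the _PAGE_TABLE bits defined later in pgtable.h:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_TABLE 0x67ULL         /* PRESENT|RW|USER|ACCESSED|DIRTY */

typedef uint64_t demo_pteval_t;         /* 64-bit entries, as with PAE */

/* widen the pfn first, then shift, then add the permission bits */
static demo_pteval_t demo_mk_table_entry(unsigned long pfn)
{
        return ((demo_pteval_t)pfn << DEMO_PAGE_SHIFT) | DEMO_PAGE_TABLE;
}

int main(void)
{
        /* a frame above the 4GB boundary: pfn 0x123456 -> phys 0x123456000 */
        printf("pmd entry = %#llx\n",
               (unsigned long long)demo_mk_table_entry(0x123456));
        return 0;
}

Without the cast, a 32-bit unsigned long shift would lose the high bits of the physical address, which is the truncation the new code avoids.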
index 84b03cf56a791ce9130914544059b8df1f12b923..701404fab308874133bcbe582dcbcb202ed58eb8 100644 (file)
@@ -15,30 +15,31 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
 {
        *ptep = pte;
 }
-static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                                    pte_t *ptep , pte_t pte)
-{
-       native_set_pte(ptep, pte);
-}
+
 static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
        *pmdp = pmd;
 }
-#ifndef CONFIG_PARAVIRT
-#define set_pte(pteptr, pteval)                native_set_pte(pteptr, pteval)
-#define set_pte_at(mm,addr,ptep,pteval) native_set_pte_at(mm, addr, ptep, pteval)
-#define set_pmd(pmdptr, pmdval)                native_set_pmd(pmdptr, pmdval)
-#endif
 
-#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
-#define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+       native_set_pte(ptep, pte);
+}
 
-#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
-#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
+static inline void native_set_pte_present(struct mm_struct *mm, unsigned long addr,
+                                         pte_t *ptep, pte_t pte)
+{
+       native_set_pte(ptep, pte);
+}
+
+static inline void native_pmd_clear(pmd_t *pmdp)
+{
+       native_set_pmd(pmdp, __pmd(0));
+}
 
 static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
 {
-       *xp = __pte(0);
+       *xp = native_make_pte(0);
 }
 
 #ifdef CONFIG_SMP
@@ -53,16 +54,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
 #define pte_page(x)            pfn_to_page(pte_pfn(x))
 #define pte_none(x)            (!(x).pte_low)
 #define pte_pfn(x)             (pte_val(x) >> PAGE_SHIFT)
-#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-
-/*
- * All present pages are kernel-executable:
- */
-static inline int pte_exec_kernel(pte_t pte)
-{
-       return 1;
-}
 
 /*
  * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
@@ -74,13 +65,13 @@ static inline int pte_exec_kernel(pte_t pte)
        ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
 
 #define pgoff_to_pte(off) \
-       ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
+       ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
 
 /* Encode and de-code a swap entry */
 #define __swp_type(x)                  (((x).val >> 1) & 0x1f)
 #define __swp_offset(x)                        ((x).val >> 8)
 #define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
 #define __pte_to_swp_entry(pte)                ((swp_entry_t) { (pte).pte_low })
-#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
+#define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
 
 #endif /* _I386_PGTABLE_2LEVEL_H */
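The designated-initializer change to pgoff_to_pte() above keeps the existing nonlinear-file encoding: the offset is folded around the reserved pte bits (0, 6 and 7), with bits 1-5 holding the low five offset bits and bits 8 and up holding the rest. A round-trip sketch of that packing; the demo_* names are invented, and 0x40 is _PAGE_FILE (bit 6):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define DEMO_PAGE_FILE 0x40u            /* bit 6, as _PAGE_FILE above */

/* pack a nonlinear-file page offset into the free bits of a 2-level pte */
static uint32_t demo_pgoff_to_pte(uint32_t off)
{
        return ((off & 0x1f) << 1) + ((off >> 5) << 8) + DEMO_PAGE_FILE;
}

/* recover the offset: bits 1-5 hold the low 5 bits, bits 8+ hold the rest */
static uint32_t demo_pte_to_pgoff(uint32_t pte_low)
{
        return ((pte_low >> 1) & 0x1f) + ((pte_low >> 8) << 5);
}

int main(void)
{
        uint32_t off = 0x123456;

        assert(demo_pte_to_pgoff(demo_pgoff_to_pte(off)) == off);
        printf("offset %#x survives the round trip\n", off);
        return 0;
}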
index 948a334141184f09e486baa971ed321cb6727f6f..a195c3e757b9ef95931d4c7341fdd2ab02d44464 100644 (file)
 #define pgd_ERROR(e) \
        printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
 
-#define pud_none(pud)                          0
-#define pud_bad(pud)                           0
-#define pud_present(pud)                       1
 
-/*
- * All present pages with !NX bit are kernel-executable:
- */
-static inline int pte_exec_kernel(pte_t pte)
+static inline int pud_none(pud_t pud)
+{
+       return pud_val(pud) == 0;
+}
+static inline int pud_bad(pud_t pud)
+{
+       return (pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+static inline int pud_present(pud_t pud)
 {
-       return !(pte_val(pte) & _PAGE_NX);
+       return pud_val(pud) & _PAGE_PRESENT;
 }
 
 /* Rules for using set_pte: the pte being assigned *must* be
@@ -39,11 +41,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
        smp_wmb();
        ptep->pte_low = pte.pte_low;
 }
-static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
-                                    pte_t *ptep , pte_t pte)
-{
-       native_set_pte(ptep, pte);
-}
 
 /*
  * Since this is only called on user PTEs, and the page fault handler
@@ -71,7 +68,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
 }
 static inline void native_set_pud(pud_t *pudp, pud_t pud)
 {
-       *pudp = pud;
+       set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
 }
 
 /*
@@ -94,24 +91,29 @@ static inline void native_pmd_clear(pmd_t *pmd)
        *(tmp + 1) = 0;
 }
 
-#ifndef CONFIG_PARAVIRT
-#define set_pte(ptep, pte)                     native_set_pte(ptep, pte)
-#define set_pte_at(mm, addr, ptep, pte)                native_set_pte_at(mm, addr, ptep, pte)
-#define set_pte_present(mm, addr, ptep, pte)   native_set_pte_present(mm, addr, ptep, pte)
-#define set_pte_atomic(ptep, pte)              native_set_pte_atomic(ptep, pte)
-#define set_pmd(pmdp, pmd)                     native_set_pmd(pmdp, pmd)
-#define set_pud(pudp, pud)                     native_set_pud(pudp, pud)
-#define pte_clear(mm, addr, ptep)              native_pte_clear(mm, addr, ptep)
-#define pmd_clear(pmd)                         native_pmd_clear(pmd)
-#endif
-
-/*
- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
- * the TLB via cr3 if the top-level pgd is changed...
- * We do not let the generic code free and clear pgd entries due to
- * this erratum.
- */
-static inline void pud_clear (pud_t * pud) { }
+static inline void pud_clear(pud_t *pudp)
+{
+       set_pud(pudp, __pud(0));
+
+       /*
+        * In principle we need to do a cr3 reload here to make sure
+        * the processor recognizes the changed pgd.  In practice, all
+        * the places where pud_clear() gets called are followed by
+        * full tlb flushes anyway, so we can defer the cost here.
+        *
+        * Specifically:
+        *
+        * mm/memory.c:free_pmd_range() - immediately after the
+        * pud_clear() it does a pmd_free_tlb().  We change the
+        * mmu_gather structure to do a full tlb flush (which has the
+        * effect of reloading cr3) when the pagetable free is
+        * complete.
+        *
+        * arch/x86/mm/hugetlbpage.c:huge_pmd_unshare() - the call to
+        * this is followed by a flush_tlb_range, which on x86 does a
+        * full tlb flush.
+        */
+}
 
 #define pud_page(pud) \
 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
@@ -155,21 +157,7 @@ static inline int pte_none(pte_t pte)
 
 static inline unsigned long pte_pfn(pte_t pte)
 {
-       return pte_val(pte) >> PAGE_SHIFT;
-}
-
-extern unsigned long long __supported_pte_mask;
-
-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
-{
-       return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
-                     pgprot_val(pgprot)) & __supported_pte_mask);
-}
-
-static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
-{
-       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
-                     pgprot_val(pgprot)) & __supported_pte_mask);
+       return (pte_val(pte) & ~_PAGE_NX) >> PAGE_SHIFT;
 }
 
 /*
@@ -177,7 +165,7 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
  * put the 32 bits of offset into the high part.
  */
 #define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
+#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
 #define PTE_FILE_MAX_BITS       32
 
 /* Encode and de-code a swap entry */
@@ -185,8 +173,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 #define __swp_offset(x)                        ((x).val >> 5)
 #define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
 #define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
-#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
-
-#define __pmd_free_tlb(tlb, x)         do { } while (0)
+#define __swp_entry_to_pte(x)          ((pte_t){ { .pte_high = (x).val } })
 
 #endif /* _I386_PGTABLE_3LEVEL_H */
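pte_pfn() above now masks off _PAGE_NX before shifting, because under PAE the NX bit lives in bit 63 and would otherwise leak into the returned frame number. A short sketch of the masking, with invented demo_* names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_NX (1ULL << 63)       /* bit 63, as _PAGE_NX */

/* strip NX before shifting so bit 63 does not pollute the pfn */
static unsigned long long demo_pte_pfn(uint64_t pte)
{
        return (pte & ~DEMO_PAGE_NX) >> DEMO_PAGE_SHIFT;
}

int main(void)
{
        uint64_t pte = DEMO_PAGE_NX | (0xabcdeULL << DEMO_PAGE_SHIFT) | 0x63;

        printf("pfn = %#llx\n", demo_pte_pfn(pte));     /* prints 0xabcde */
        return 0;
}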
index 1039140652af322e403722c7716d2f3d61dd9739..cd2524f074525968a01917d9495711948e0c487d 100644 (file)
@@ -1,5 +1,364 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
+#define FIRST_USER_ADDRESS     0
+
+#define _PAGE_BIT_PRESENT      0
+#define _PAGE_BIT_RW           1
+#define _PAGE_BIT_USER         2
+#define _PAGE_BIT_PWT          3
+#define _PAGE_BIT_PCD          4
+#define _PAGE_BIT_ACCESSED     5
+#define _PAGE_BIT_DIRTY                6
+#define _PAGE_BIT_FILE         6
+#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
+#define _PAGE_BIT_UNUSED2      10
+#define _PAGE_BIT_UNUSED3      11
+#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
+
+/*
+ * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a
+ * sign-extended value on 32-bit with all 1's in the upper word,
+ * which preserves the upper pte values on 64-bit ptes:
+ */
+#define _PAGE_PRESENT  (_AC(1, L)<<_PAGE_BIT_PRESENT)
+#define _PAGE_RW       (_AC(1, L)<<_PAGE_BIT_RW)
+#define _PAGE_USER     (_AC(1, L)<<_PAGE_BIT_USER)
+#define _PAGE_PWT      (_AC(1, L)<<_PAGE_BIT_PWT)
+#define _PAGE_PCD      (_AC(1, L)<<_PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY    (_AC(1, L)<<_PAGE_BIT_DIRTY)
+#define _PAGE_PSE      (_AC(1, L)<<_PAGE_BIT_PSE)      /* 2MB page */
+#define _PAGE_GLOBAL   (_AC(1, L)<<_PAGE_BIT_GLOBAL)   /* Global TLB entry */
+#define _PAGE_UNUSED1  (_AC(1, L)<<_PAGE_BIT_UNUSED1)
+#define _PAGE_UNUSED2  (_AC(1, L)<<_PAGE_BIT_UNUSED2)
+#define _PAGE_UNUSED3  (_AC(1, L)<<_PAGE_BIT_UNUSED3)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX       (_AC(1, ULL) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX       0
+#endif
+
+/* If _PAGE_PRESENT is clear, we use these: */
+#define _PAGE_FILE     _PAGE_DIRTY     /* nonlinear file mapping, saved PTE; unset:swap */
+#define _PAGE_PROTNONE _PAGE_PSE       /* if the user mapped it with PROT_NONE;
+                                          pte_present gives true */
+
+#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC         __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY              PAGE_COPY_NOEXEC
+#define PAGE_READONLY          __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC     __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+
+#ifdef CONFIG_X86_32
+#define _PAGE_KERNEL_EXEC \
+       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#ifndef __ASSEMBLY__
+extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
+#endif /* __ASSEMBLY__ */
+#else
+#define __PAGE_KERNEL_EXEC                                             \
+       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define __PAGE_KERNEL          (__PAGE_KERNEL_EXEC | _PAGE_NX)
+#endif
+
+#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX               (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE     (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VSYSCALL         (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#ifdef CONFIG_X86_32
+# define MAKE_GLOBAL(x)                        __pgprot((x))
+#else
+# define MAKE_GLOBAL(x)                        __pgprot((x) | _PAGE_GLOBAL)
+#endif
+
+#define PAGE_KERNEL                    MAKE_GLOBAL(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO                 MAKE_GLOBAL(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC               MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX                 MAKE_GLOBAL(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_NOCACHE            MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_EXEC_NOCACHE       MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE              MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_EXEC         MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL           MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE   MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+
+/*         xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+#ifndef __ASSEMBLY__
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)         { return pte_val(pte) & _PAGE_DIRTY; }
+static inline int pte_young(pte_t pte)         { return pte_val(pte) & _PAGE_ACCESSED; }
+static inline int pte_write(pte_t pte)         { return pte_val(pte) & _PAGE_RW; }
+static inline int pte_file(pte_t pte)          { return pte_val(pte) & _PAGE_FILE; }
+static inline int pte_huge(pte_t pte)          { return pte_val(pte) & _PAGE_PSE; }
+static inline int pte_global(pte_t pte)        { return pte_val(pte) & _PAGE_GLOBAL; }
+static inline int pte_exec(pte_t pte)          { return !(pte_val(pte) & _PAGE_NX); }
+
+static inline int pmd_large(pmd_t pte) {
+       return (pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) ==
+               (_PAGE_PSE|_PAGE_PRESENT);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)     { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); }
+static inline pte_t pte_mkold(pte_t pte)       { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); }
+static inline pte_t pte_wrprotect(pte_t pte)   { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_RW); }
+static inline pte_t pte_mkexec(pte_t pte)      { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_NX); }
+static inline pte_t pte_mkdirty(pte_t pte)     { return __pte(pte_val(pte) | _PAGE_DIRTY); }
+static inline pte_t pte_mkyoung(pte_t pte)     { return __pte(pte_val(pte) | _PAGE_ACCESSED); }
+static inline pte_t pte_mkwrite(pte_t pte)     { return __pte(pte_val(pte) | _PAGE_RW); }
+static inline pte_t pte_mkhuge(pte_t pte)      { return __pte(pte_val(pte) | _PAGE_PSE); }
+static inline pte_t pte_clrhuge(pte_t pte)     { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_PSE); }
+static inline pte_t pte_mkglobal(pte_t pte)    { return __pte(pte_val(pte) | _PAGE_GLOBAL); }
+static inline pte_t pte_clrglobal(pte_t pte)   { return __pte(pte_val(pte) & ~(pteval_t)_PAGE_GLOBAL); }
+
+extern pteval_t __supported_pte_mask;
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+       return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
+                     pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+       return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
+                     pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+       pteval_t val = pte_val(pte);
+
+       /*
+        * Chop off the NX bit (if present), and add the NX portion of
+        * the newprot (if present):
+        */
+       val &= _PAGE_CHG_MASK & ~_PAGE_NX;
+       val |= pgprot_val(newprot) & __supported_pte_mask;
+
+       return __pte(val);
+}
+
+#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX))
+
+#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else  /* !CONFIG_PARAVIRT */
+#define set_pte(ptep, pte)             native_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte)        native_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_present(mm, addr, ptep, pte)                           \
+       native_set_pte_present(mm, addr, ptep, pte)
+#define set_pte_atomic(ptep, pte)                                      \
+       native_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd)             native_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd)             native_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd)                 native_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud)            native_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud)                 native_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep)      native_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd)                 native_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep)              do { } while (0)
+#define pte_update_defer(mm, addr, ptep)        do { } while (0)
+#endif /* CONFIG_PARAVIRT */
+
+#endif /* __ASSEMBLY__ */
+
 #ifdef CONFIG_X86_32
 # include "pgtable_32.h"
 #else
 # include "pgtable_64.h"
 #endif
+
+#ifndef __ASSEMBLY__
+
+enum {
+       PG_LEVEL_NONE,
+       PG_LEVEL_4K,
+       PG_LEVEL_2M,
+       PG_LEVEL_1G,
+};
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, int *level);
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
+{
+       pte_t res = *ptep;
+
+       /* Pure native function needs no input for mm, addr */
+       native_pte_clear(NULL, 0, ptep);
+       return res;
+}
+
+static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                                    pte_t *ptep , pte_t pte)
+{
+       native_set_pte(ptep, pte);
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces.  It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush.  The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep)             do { } while (0)
+#define pte_update_defer(mm, addr, ptep)       do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)                \
+({                                                                     \
+       int __changed = !pte_same(*(ptep), entry);                      \
+       if (__changed && dirty) {                                       \
+               *ptep = entry;                                          \
+               pte_update_defer((vma)->vm_mm, (address), (ptep));      \
+               flush_tlb_page(vma, address);                           \
+       }                                                               \
+       __changed;                                                      \
+})
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young(vma, addr, ptep) ({                  \
+       int __ret = 0;                                                  \
+       if (pte_young(*(ptep)))                                         \
+               __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,          \
+                                          &(ptep)->pte);               \
+       if (__ret)                                                      \
+               pte_update((vma)->vm_mm, addr, ptep);                   \
+       __ret;                                                          \
+})
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young(vma, address, ptep)                     \
+({                                                                     \
+       int __young;                                                    \
+       __young = ptep_test_and_clear_young((vma), (address), (ptep));  \
+       if (__young)                                                    \
+               flush_tlb_page(vma, address);                           \
+       __young;                                                        \
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       pte_t pte = native_ptep_get_and_clear(ptep);
+       pte_update(mm, addr, ptep);
+       return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
+{
+       pte_t pte;
+       if (full) {
+               /*
+                * Full address destruction in progress; paravirt does not
+                * care about updates and native needs no locking
+                */
+               pte = native_local_ptep_get_and_clear(ptep);
+       } else {
+               pte = ptep_get_and_clear(mm, addr, ptep);
+       }
+       return pte;
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+       clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+       pte_update(mm, addr, ptep);
+}
+
+#include <asm-generic/pgtable.h>
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
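A minimal sketch of the pte_update() convention spelled out above: any PTE write that bypasses the set_pte()/pte_clear() interfaces should be followed by pte_update() (or pte_update_defer() under the same page-table lock) so shadow-paging hypervisors can resynchronize; both are no-ops natively. Kernel context is assumed and the function name is hypothetical.

static void example_raw_user_pte_write(struct mm_struct *mm,
                                       unsigned long addr,
                                       pte_t *ptep, pte_t entry)
{
        /* raw store that bypasses the set_pte() interface ... */
        *ptep = entry;
        /* ... so notify shadow-mode hypervisors; a no-op on bare metal */
        pte_update(mm, addr, ptep);
        /* a P->P protection/access change would also need flush_tlb_page() */
}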
index ed3e70d8d04bfc3a94a56f38b177850d756cad1a..21e70fbf1dae55cae5ab87cd19c4cbaba3dc57b7 100644 (file)
 struct mm_struct;
 struct vm_area_struct;
 
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-extern unsigned long empty_zero_page[1024];
 extern pgd_t swapper_pg_dir[1024];
 extern struct kmem_cache *pmd_cache;
-extern spinlock_t pgd_lock;
-extern struct page *pgd_list;
 void check_pgt_cache(void);
 
-void pmd_ctor(struct kmem_cache *, void *);
-void pgtable_cache_init(void);
+static inline void pgtable_cache_init(void) {}
 void paging_init(void);
 
 
@@ -58,9 +49,6 @@ void paging_init(void);
 #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
-#define FIRST_USER_ADDRESS     0
-
 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
 #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
 
@@ -84,113 +72,6 @@ void paging_init(void);
 # define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
 #endif
 
-/*
- * _PAGE_PSE set in the page directory entry just means that
- * the page directory entry points directly to a 4MB-aligned block of
- * memory. 
- */
-#define _PAGE_BIT_PRESENT      0
-#define _PAGE_BIT_RW           1
-#define _PAGE_BIT_USER         2
-#define _PAGE_BIT_PWT          3
-#define _PAGE_BIT_PCD          4
-#define _PAGE_BIT_ACCESSED     5
-#define _PAGE_BIT_DIRTY                6
-#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
-#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
-#define _PAGE_BIT_UNUSED2      10
-#define _PAGE_BIT_UNUSED3      11
-#define _PAGE_BIT_NX           63
-
-#define _PAGE_PRESENT  0x001
-#define _PAGE_RW       0x002
-#define _PAGE_USER     0x004
-#define _PAGE_PWT      0x008
-#define _PAGE_PCD      0x010
-#define _PAGE_ACCESSED 0x020
-#define _PAGE_DIRTY    0x040
-#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
-#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
-#define _PAGE_UNUSED1  0x200   /* available for programmer */
-#define _PAGE_UNUSED2  0x400
-#define _PAGE_UNUSED3  0x800
-
-/* If _PAGE_PRESENT is clear, we use these: */
-#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
-                                  pte_present gives true */
-#ifdef CONFIG_X86_PAE
-#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
-#else
-#define _PAGE_NX       0
-#endif
-
-#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-#define PAGE_NONE \
-       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED \
-       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-
-#define PAGE_SHARED_EXEC \
-       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC \
-       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC \
-       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY \
-       PAGE_COPY_NOEXEC
-#define PAGE_READONLY \
-       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC \
-       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-
-#define _PAGE_KERNEL \
-       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define _PAGE_KERNEL_EXEC \
-       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-
-extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
-#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX               (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
-#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX         __pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
-
-/*
- * The i386 can't do page protection for execute, and considers that
- * the same are read. Also, write permissions imply read permissions.
- * This is the closest we can get..
- */
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_EXEC
-#define __P101 PAGE_READONLY_EXEC
-#define __P110 PAGE_COPY_EXEC
-#define __P111 PAGE_COPY_EXEC
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_EXEC
-#define __S101 PAGE_READONLY_EXEC
-#define __S110 PAGE_SHARED_EXEC
-#define __S111 PAGE_SHARED_EXEC
-
 /*
  * Define this if things work differently on an i386 and an i486:
  * it will (on an i486) warn about kernel memory accesses that are
@@ -211,133 +92,12 @@ extern unsigned long pg0[];
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
 
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
-static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
-static inline int pte_huge(pte_t pte)          { return (pte).pte_low & _PAGE_PSE; }
-
-/*
- * The following only works if pte_present() is not true.
- */
-static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
-
-static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
-static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
-static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= _PAGE_PSE; return pte; }
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
 # include <asm/pgtable-2level.h>
 #endif
 
-#ifndef CONFIG_PARAVIRT
-/*
- * Rules for using pte_update - it must be called after any PTE update which
- * has not been done using the set_pte / clear_pte interfaces.  It is used by
- * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
- * updates should either be sets, clears, or set_pte_atomic for P->P
- * transitions, which means this hook should only be called for user PTEs.
- * This hook implies a P->P protection or access change has taken place, which
- * requires a subsequent TLB flush.  The notification can optionally be delayed
- * until the TLB flush event by using the pte_update_defer form of the
- * interface, but care must be taken to assure that the flush happens while
- * still holding the same page table lock so that the shadow and primary pages
- * do not become out of sync on SMP.
- */
-#define pte_update(mm, addr, ptep)             do { } while (0)
-#define pte_update_defer(mm, addr, ptep)       do { } while (0)
-#endif
-
-/* local pte updates need not use xchg for locking */
-static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
-{
-       pte_t res = *ptep;
-
-       /* Pure native function needs no input for mm, addr */
-       native_pte_clear(NULL, 0, ptep);
-       return res;
-}
-
-/*
- * We only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time.
- */
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty)                \
-({                                                                     \
-       int __changed = !pte_same(*(ptep), entry);                      \
-       if (__changed && dirty) {                                       \
-               (ptep)->pte_low = (entry).pte_low;                      \
-               pte_update_defer((vma)->vm_mm, (address), (ptep));      \
-               flush_tlb_page(vma, address);                           \
-       }                                                               \
-       __changed;                                                      \
-})
-
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(vma, addr, ptep) ({                  \
-       int __ret = 0;                                                  \
-       if (pte_young(*(ptep)))                                         \
-               __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,          \
-                                               &(ptep)->pte_low);      \
-       if (__ret)                                                      \
-               pte_update((vma)->vm_mm, addr, ptep);                   \
-       __ret;                                                          \
-})
-
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep)                     \
-({                                                                     \
-       int __young;                                                    \
-       __young = ptep_test_and_clear_young((vma), (address), (ptep));  \
-       if (__young)                                                    \
-               flush_tlb_page(vma, address);                           \
-       __young;                                                        \
-})
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       pte_t pte = native_ptep_get_and_clear(ptep);
-       pte_update(mm, addr, ptep);
-       return pte;
-}
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
-{
-       pte_t pte;
-       if (full) {
-               /*
-                * Full address destruction in progress; paravirt does not
-                * care about updates and native needs no locking
-                */
-               pte = native_local_ptep_get_and_clear(ptep);
-       } else {
-               pte = ptep_get_and_clear(mm, addr, ptep);
-       }
-       return pte;
-}
-
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
-       pte_update(mm, addr, ptep);
-}
-
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
@@ -367,25 +127,6 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 
 #define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
 
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-       pte.pte_low &= _PAGE_CHG_MASK;
-       pte.pte_low |= pgprot_val(newprot);
-#ifdef CONFIG_X86_PAE
-       /*
-        * Chop off the NX bit (if present), and add the NX portion of
-        * the newprot (if present):
-        */
-       pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
-       pte.pte_high |= (pgprot_val(newprot) >> 32) & \
-                                       (__supported_pte_mask >> 32);
-#endif
-       return pte;
-}
-
-#define pmd_large(pmd) \
-((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
-
 /*
  * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
  *
@@ -432,26 +173,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pmd_page_vaddr(pmd) \
                ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
-/*
- * Helper function that returns the kernel pagetable entry controlling
- * the virtual address 'address'. NULL means no pagetable entry present.
- * NOTE: the return type is pte_t but if the pmd is PSE then we return it
- * as a pte too.
- */
-extern pte_t *lookup_address(unsigned long address);
-
-/*
- * Make a given kernel text page executable/non-executable.
- * Returns the previous executability setting of that page (which
- * is used to restore the previous state). Used by the SMP bootup code.
- * NOTE: this is an __init function for security reasons.
- */
-#ifdef CONFIG_X86_PAE
- extern int set_kernel_exec(unsigned long vaddr, int enable);
-#else
- static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
-#endif
-
 #if defined(CONFIG_HIGHPTE)
 #define pte_offset_map(dir, address) \
        ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
@@ -497,13 +218,17 @@ static inline void paravirt_pagetable_setup_done(pgd_t *base)
 
 #endif /* !__ASSEMBLY__ */
 
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
 #ifdef CONFIG_FLATMEM
 #define kern_addr_valid(addr)  (1)
-#endif /* CONFIG_FLATMEM */
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
 
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
                remap_pfn_range(vma, vaddr, pfn, size, prot)
 
-#include <asm-generic/pgtable.h>
-
 #endif /* _I386_PGTABLE_H */
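The lookup_address() declaration above now takes a level out-parameter together with the PG_LEVEL_* enum, so callers can tell whether a kernel virtual address is mapped by a 4K PTE or by a large-page entry returned as a pte_t. A minimal sketch of a caller, assuming kernel context (the function name is hypothetical):

static int example_mapped_by_large_page(unsigned long vaddr)
{
        int level;
        pte_t *pte = lookup_address(vaddr, &level);

        if (!pte || !pte_present(*pte))
                return 0;                       /* not mapped */
        return level == PG_LEVEL_2M || level == PG_LEVEL_1G;
}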
index 9b0ff477b39e7dc2021196619148969bffe2b355..6e615a103c2ff485fdd59b18b081243325baf114 100644 (file)
@@ -17,22 +17,16 @@ extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pgd_t init_level4_pgt[];
-extern unsigned long __supported_pte_mask;
 
 #define swapper_pg_dir init_level4_pgt
 
 extern void paging_init(void);
 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
 
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-
 #endif /* !__ASSEMBLY__ */
 
+#define SHARED_KERNEL_PMD      1
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
@@ -71,57 +65,68 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #define pgd_none(x)    (!pgd_val(x))
 #define pud_none(x)    (!pud_val(x))
 
-static inline void set_pte(pte_t *dst, pte_t val)
+struct mm_struct;
+
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+                                   pte_t *ptep)
+{
+       *ptep = native_make_pte(0);
+}
+
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
 {
-       pte_val(*dst) = pte_val(val);
-} 
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+       *ptep = pte;
+}
 
-static inline void set_pmd(pmd_t *dst, pmd_t val)
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-        pmd_val(*dst) = pmd_val(val); 
-} 
+       native_set_pte(ptep, pte);
+}
 
-static inline void set_pud(pud_t *dst, pud_t val)
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
 {
-       pud_val(*dst) = pud_val(val);
+#ifdef CONFIG_SMP
+       return native_make_pte(xchg(&xp->pte, 0));
+#else
+       /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */
+       pte_t ret = *xp;
+       native_pte_clear(NULL, 0, xp);
+       return ret;
+#endif
 }
 
-static inline void pud_clear (pud_t *pud)
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
-       set_pud(pud, __pud(0));
+       *pmdp = pmd;
 }
 
-static inline void set_pgd(pgd_t *dst, pgd_t val)
+static inline void native_pmd_clear(pmd_t *pmd)
 {
-       pgd_val(*dst) = pgd_val(val); 
-} 
+       native_set_pmd(pmd, native_make_pmd(0));
+}
 
-static inline void pgd_clear (pgd_t * pgd)
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
 {
-       set_pgd(pgd, __pgd(0));
+       *pudp = pud;
 }
 
-#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte, 0))
+static inline void native_pud_clear(pud_t *pud)
+{
+       native_set_pud(pud, native_make_pud(0));
+}
 
-struct mm_struct;
+static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       *pgdp = pgd;
+}
 
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
+static inline void native_pgd_clear(pgd_t * pgd)
 {
-       pte_t pte;
-       if (full) {
-               pte = *ptep;
-               *ptep = __pte(0);
-       } else {
-               pte = ptep_get_and_clear(mm, addr, ptep);
-       }
-       return pte;
+       native_set_pgd(pgd, native_make_pgd(0));
 }
 
 #define pte_same(a, b)         ((a).pte == (b).pte)
 
-#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
-
 #endif /* !__ASSEMBLY__ */
 
 #define PMD_SIZE       (_AC(1,UL) << PMD_SHIFT)
@@ -131,8 +136,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define PGDIR_SIZE     (_AC(1,UL) << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
-#define FIRST_USER_ADDRESS     0
 
 #define MAXMEM          _AC(0x3fffffffffff, UL)
 #define VMALLOC_START    _AC(0xffffc20000000000, UL)
@@ -142,91 +145,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define MODULES_END      _AC(0xfffffffffff00000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
-#define _PAGE_BIT_PRESENT      0
-#define _PAGE_BIT_RW           1
-#define _PAGE_BIT_USER         2
-#define _PAGE_BIT_PWT          3
-#define _PAGE_BIT_PCD          4
-#define _PAGE_BIT_ACCESSED     5
-#define _PAGE_BIT_DIRTY                6
-#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
-#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
-#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
-
-#define _PAGE_PRESENT  0x001
-#define _PAGE_RW       0x002
-#define _PAGE_USER     0x004
-#define _PAGE_PWT      0x008
-#define _PAGE_PCD      0x010
-#define _PAGE_ACCESSED 0x020
-#define _PAGE_DIRTY    0x040
-#define _PAGE_PSE      0x080   /* 2MB page */
-#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
-
-#define _PAGE_PROTNONE 0x080   /* If not present */
-#define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
-
-#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
-
-#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL \
-       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_EXEC \
-       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_KERNEL_NOCACHE \
-       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_RO \
-       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_VSYSCALL \
-       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
-       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
-#define __PAGE_KERNEL_LARGE \
-       (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC \
-       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-
-#define MAKE_GLOBAL(x) __pgprot((x) | _PAGE_GLOBAL)
-
-#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
-#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
-
-/*         xwr */
-#define __P000 PAGE_NONE
-#define __P001 PAGE_READONLY
-#define __P010 PAGE_COPY
-#define __P011 PAGE_COPY
-#define __P100 PAGE_READONLY_EXEC
-#define __P101 PAGE_READONLY_EXEC
-#define __P110 PAGE_COPY_EXEC
-#define __P111 PAGE_COPY_EXEC
-
-#define __S000 PAGE_NONE
-#define __S001 PAGE_READONLY
-#define __S010 PAGE_SHARED
-#define __S011 PAGE_SHARED
-#define __S100 PAGE_READONLY_EXEC
-#define __S101 PAGE_READONLY_EXEC
-#define __S110 PAGE_SHARED_EXEC
-#define __S111 PAGE_SHARED_EXEC
-
 #ifndef __ASSEMBLY__
 
 static inline unsigned long pgd_bad(pgd_t pgd)
@@ -246,66 +164,16 @@ static inline unsigned long pmd_bad(pmd_t pmd)
 
 #define pte_none(x)    (!pte_val(x))
 #define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))        /* FIXME: is this
-                                                  right? */
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))        /* FIXME: is this right? */
 #define pte_page(x)    pfn_to_page(pte_pfn(x))
 #define pte_pfn(x)  ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
-{
-       pte_t pte;
-       pte_val(pte) = (page_nr << PAGE_SHIFT);
-       pte_val(pte) |= pgprot_val(pgprot);
-       pte_val(pte) &= __supported_pte_mask;
-       return pte;
-}
-
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
-static inline int pte_dirty(pte_t pte)         { return pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)         { return pte_val(pte) & _PAGE_ACCESSED; }
-static inline int pte_write(pte_t pte)         { return pte_val(pte) & _PAGE_RW; }
-static inline int pte_file(pte_t pte)          { return pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte)          { return pte_val(pte) & _PAGE_PSE; }
-
-static inline pte_t pte_mkclean(pte_t pte)     { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
-static inline pte_t pte_mkold(pte_t pte)       { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; }
-static inline pte_t pte_wrprotect(pte_t pte)   { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; }
-static inline pte_t pte_mkexec(pte_t pte)      { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_NX)); return pte; }
-static inline pte_t pte_mkdirty(pte_t pte)     { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
-static inline pte_t pte_mkyoung(pte_t pte)     { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)     { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)      { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
-static inline pte_t pte_clrhuge(pte_t pte)     { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; }
-
-struct vm_area_struct;
-
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-       if (!pte_young(*ptep))
-               return 0;
-       return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
-}
-
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       clear_bit(_PAGE_BIT_RW, &ptep->pte);
-}
-
 /*
  * Macro to mark a page protection value as "uncacheable".
  */
 #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
 
-static inline int pmd_large(pmd_t pte) { 
-       return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
-}      
-
 
 /*
  * Conversion functions: convert a page and protection to a page entry,
@@ -340,29 +208,18 @@ static inline int pmd_large(pmd_t pte) {
                        pmd_index(address))
 #define pmd_none(x)    (!pmd_val(x))
 #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
-#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
 #define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE })
 #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
 
 /* PTE - Level 1 access. */
 
 /* page, protection -> pte */
 #define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
-#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
  
-/* Change flags of a PTE */
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{ 
-       pte_val(pte) &= _PAGE_CHG_MASK;
-       pte_val(pte) |= pgprot_val(newprot);
-       pte_val(pte) &= __supported_pte_mask;
-       return pte; 
-}
-
 #define pte_index(address) \
                (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
@@ -376,40 +233,20 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #define update_mmu_cache(vma,address,pte) do { } while (0)
 
-/* We only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time. */
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-({                                                                       \
-       int __changed = !pte_same(*(__ptep), __entry);                    \
-       if (__changed && __dirty) {                                       \
-               set_pte(__ptep, __entry);                                 \
-               flush_tlb_page(__vma, __address);                         \
-       }                                                                 \
-       __changed;                                                        \
-})
-
 /* Encode and de-code a swap entry */
 #define __swp_type(x)                  (((x).val >> 1) & 0x3f)
 #define __swp_offset(x)                        ((x).val >> 8)
 #define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
 #define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
-
-extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
+#define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
 
 extern int kern_addr_valid(unsigned long addr); 
 
-pte_t *lookup_address(unsigned long addr);
-
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
                remap_pfn_range(vma, vaddr, pfn, size, prot)
 
 #define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
 #define pgtable_cache_init()   do { } while (0)
 #define check_pgt_cache()      do { } while (0)
@@ -422,12 +259,7 @@ pte_t *lookup_address(unsigned long addr);
 #define        kc_offset_to_vaddr(o) \
    (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
 #define __HAVE_ARCH_PTE_SAME
-#include <asm-generic/pgtable.h>
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _X86_64_PGTABLE_H */
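On SMP, native_ptep_get_and_clear() above uses xchg() so that a dirty or accessed bit set concurrently by the hardware cannot be lost between reading and clearing the entry; the shared header's ptep_get_and_clear() layers pte_update() on top. A minimal sketch of how a teardown path might use it, assuming kernel context (the function name and the dirty-handling step are placeholders):

static pte_t example_zap_one_pte(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep)
{
        /* atomically read and clear the entry (xchg on SMP) */
        pte_t old = ptep_get_and_clear(mm, addr, ptep);

        if (pte_dirty(old)) {
                /* the caller would propagate the dirty state to the
                   backing struct page here */
        }
        return old;
}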
index 46e1c04e309c790c77be80a6e020ded9e58cc319..ab4d0c2a3f8f286b792e2b2b724c6186849fb5e3 100644 (file)
@@ -1,5 +1,842 @@
+#ifndef __ASM_X86_PROCESSOR_H
+#define __ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* migration helpers, for KVM - will be removed in 2.6.25: */
+#include <asm/vm86.h>
+#define Xgt_desc_struct        desc_ptr
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+       void *pc;
+       asm volatile("mov $1f,%0\n1:":"=r" (pc));
+       return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
+#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
+#else
+#define ARCH_MIN_TASKALIGN     16
+#define ARCH_MIN_MMSTRUCT_ALIGN        0
+#endif
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+       __u8    x86;            /* CPU family */
+       __u8    x86_vendor;     /* CPU vendor */
+       __u8    x86_model;
+       __u8    x86_mask;
+#ifdef CONFIG_X86_32
+       char    wp_works_ok;    /* It doesn't on 386's */
+       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
+       char    hard_math;
+       char    rfu;
+       char    fdiv_bug;
+       char    f00f_bug;
+       char    coma_bug;
+       char    pad0;
+#else
+       /* number of 4K pages in DTLB/ITLB combined */
+       int     x86_tlbsize;
+       __u8    x86_virt_bits, x86_phys_bits;
+       /* cpuid returned core id bits */
+       __u8    x86_coreid_bits;
+       /* Max extended CPUID function supported */
+       __u32   extended_cpuid_level;
+#endif
+       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
+       __u32   x86_capability[NCAPINTS];
+       char    x86_vendor_id[16];
+       char    x86_model_id[64];
+       int     x86_cache_size;  /* in KB - valid for CPUs which support this
+                                   call */
+       int     x86_cache_alignment;    /* In bytes */
+       int     x86_power;
+       unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
+#endif
+       u16 x86_max_cores;              /* cpuid returned max cores value */
+       u16 apicid;
+       u16 x86_clflush_size;
+#ifdef CONFIG_SMP
+       u16 booted_cores;               /* number of cores as seen by OS */
+       u16 phys_proc_id;               /* Physical processor id. */
+       u16 cpu_core_id;                /* Core id */
+       u16 cpu_index;                  /* index into per_cpu list */
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_NEXGEN 4
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+extern struct tss_struct doublefault_tss;
+extern __u32 cleared_cpu_caps[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
+#define current_cpu_data       cpu_data(smp_processor_id())
+#else
+#define cpu_data(cpu)          boot_cpu_data
+#define current_cpu_data       boot_cpu_data
+#endif
+
+void cpu_detect(struct cpuinfo_x86 *c);
+
+extern void identify_cpu(struct cpuinfo_x86 *);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64)
+extern void detect_ht(struct cpuinfo_x86 *c);
+#else
+static inline void detect_ht(struct cpuinfo_x86 *c) {}
+#endif
+
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+                                        unsigned int *ecx, unsigned int *edx)
+{
+       /* ecx is often an input as well as an output. */
+       __asm__("cpuid"
+               : "=a" (*eax),
+                 "=b" (*ebx),
+                 "=c" (*ecx),
+                 "=d" (*edx)
+               : "0" (*eax), "2" (*ecx));
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+       write_cr3(__pa(pgdir));
+}
+
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+       unsigned short  back_link, __blh;
+       unsigned long   sp0;
+       unsigned short  ss0, __ss0h;
+       unsigned long   sp1;
+       unsigned short  ss1, __ss1h;    /* ss1 caches MSR_IA32_SYSENTER_CS */
+       unsigned long   sp2;
+       unsigned short  ss2, __ss2h;
+       unsigned long   __cr3;
+       unsigned long   ip;
+       unsigned long   flags;
+       unsigned long   ax, cx, dx, bx;
+       unsigned long   sp, bp, si, di;
+       unsigned short  es, __esh;
+       unsigned short  cs, __csh;
+       unsigned short  ss, __ssh;
+       unsigned short  ds, __dsh;
+       unsigned short  fs, __fsh;
+       unsigned short  gs, __gsh;
+       unsigned short  ldt, __ldth;
+       unsigned short  trace, io_bitmap_base;
+} __attribute__((packed));
+#else
+struct x86_hw_tss {
+       u32 reserved1;
+       u64 sp0;
+       u64 sp1;
+       u64 sp2;
+       u64 reserved2;
+       u64 ist[7];
+       u32 reserved3;
+       u32 reserved4;
+       u16 reserved5;
+       u16 io_bitmap_base;
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+
+/*
+ * Size of io_bitmap.
+ */
+#define IO_BITMAP_BITS  65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
+
+struct tss_struct {
+       struct x86_hw_tss x86_tss;
+
+       /*
+        * The extra 1 is there because the CPU will access an
+        * additional byte beyond the end of the IO permission
+        * bitmap. The extra byte must be all 1 bits, and must
+        * be within the limit.
+        */
+       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
+       /*
+        * Cache the current maximum and the last task that used the bitmap:
+        */
+       unsigned long io_bitmap_max;
+       struct thread_struct *io_bitmap_owner;
+       /*
+        * pads the TSS to be cacheline-aligned (size is 0x100)
+        */
+       unsigned long __cacheline_filler[35];
+       /*
+        * .. and then another 0x100 bytes for emergency kernel stack
+        */
+       unsigned long stack[64];
+} __attribute__((packed));
+
+DECLARE_PER_CPU(struct tss_struct, init_tss);
+
+/* Save the original ist values for checking stack pointers during debugging */
+struct orig_ist {
+       unsigned long ist[7];
+};
+
+#define        MXCSR_DEFAULT           0x1f80
+
+struct i387_fsave_struct {
+       u32     cwd;
+       u32     swd;
+       u32     twd;
+       u32     fip;
+       u32     fcs;
+       u32     foo;
+       u32     fos;
+       u32     st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
+       u32     status;         /* software status information */
+};
+
+struct i387_fxsave_struct {
+       u16     cwd;
+       u16     swd;
+       u16     twd;
+       u16     fop;
+       union {
+               struct {
+                       u64     rip;
+                       u64     rdp;
+               };
+               struct {
+                       u32     fip;
+                       u32     fcs;
+                       u32     foo;
+                       u32     fos;
+               };
+       };
+       u32     mxcsr;
+       u32     mxcsr_mask;
+       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
+       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
+       u32     padding[24];
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+       u32     cwd;
+       u32     swd;
+       u32     twd;
+       u32     fip;
+       u32     fcs;
+       u32     foo;
+       u32     fos;
+       u32     st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
+       u8      ftop, changed, lookahead, no_update, rm, alimit;
+       struct info     *info;
+       u32     entry_eip;
+};
+
+union i387_union {
+       struct i387_fsave_struct        fsave;
+       struct i387_fxsave_struct       fxsave;
+       struct i387_soft_struct         soft;
+};
+
+#ifdef CONFIG_X86_32
+/*
+ * the following now lives in the per cpu area:
+ * extern      int cpu_llc_id[NR_CPUS];
+ */
+DECLARE_PER_CPU(u8, cpu_llc_id);
+#else
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+extern void print_cpu_info(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+struct thread_struct {
+/* cached TLS descriptors. */
+       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+       unsigned long   sp0;
+       unsigned long   sp;
+#ifdef CONFIG_X86_32
+       unsigned long   sysenter_cs;
+#else
+       unsigned long   usersp; /* Copy from PDA */
+       unsigned short  es, ds, fsindex, gsindex;
+#endif
+       unsigned long   ip;
+       unsigned long   fs;
+       unsigned long   gs;
+/* Hardware debugging registers */
+       unsigned long   debugreg0;
+       unsigned long   debugreg1;
+       unsigned long   debugreg2;
+       unsigned long   debugreg3;
+       unsigned long   debugreg6;
+       unsigned long   debugreg7;
+/* fault info */
+       unsigned long   cr2, trap_no, error_code;
+/* floating point info */
+       union i387_union        i387 __attribute__((aligned(16)));
+#ifdef CONFIG_X86_32
+/* virtual 86 mode info */
+       struct vm86_struct __user *vm86_info;
+       unsigned long           screen_bitmap;
+       unsigned long           v86flags, v86mask, saved_sp0;
+       unsigned int            saved_fs, saved_gs;
+#endif
+/* IO permissions */
+       unsigned long   *io_bitmap_ptr;
+       unsigned long   iopl;
+/* max allowed port in the bitmap, in bytes: */
+       unsigned io_bitmap_max;
+/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
+       unsigned long   debugctlmsr;
+/* Debug Store - if not 0 points to a DS Save Area configuration;
+ *               goes into MSR_IA32_DS_AREA */
+       unsigned long   ds_area_msr;
+};
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+       unsigned long val = 0;  /* Damn you, gcc! */
+
+       switch (regno) {
+       case 0:
+               asm("mov %%db0, %0" :"=r" (val)); break;
+       case 1:
+               asm("mov %%db1, %0" :"=r" (val)); break;
+       case 2:
+               asm("mov %%db2, %0" :"=r" (val)); break;
+       case 3:
+               asm("mov %%db3, %0" :"=r" (val)); break;
+       case 6:
+               asm("mov %%db6, %0" :"=r" (val)); break;
+       case 7:
+               asm("mov %%db7, %0" :"=r" (val)); break;
+       default:
+               BUG();
+       }
+       return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+       switch (regno) {
+       case 0:
+               asm("mov %0,%%db0"      : /* no output */ :"r" (value));
+               break;
+       case 1:
+               asm("mov %0,%%db1"      : /* no output */ :"r" (value));
+               break;
+       case 2:
+               asm("mov %0,%%db2"      : /* no output */ :"r" (value));
+               break;
+       case 3:
+               asm("mov %0,%%db3"      : /* no output */ :"r" (value));
+               break;
+       case 6:
+               asm("mov %0,%%db6"      : /* no output */ :"r" (value));
+               break;
+       case 7:
+               asm("mov %0,%%db7"      : /* no output */ :"r" (value));
+               break;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void native_set_iopl_mask(unsigned mask)
+{
+#ifdef CONFIG_X86_32
+       unsigned int reg;
+       __asm__ __volatile__ ("pushfl;"
+                             "popl %0;"
+                             "andl %1, %0;"
+                             "orl %2, %0;"
+                             "pushl %0;"
+                             "popfl"
+                               : "=&r" (reg)
+                               : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+#endif
+}
+
+static inline void native_load_sp0(struct tss_struct *tss,
+                                  struct thread_struct *thread)
+{
+       tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+       /* Only happens when SEP is enabled, no need to test "SEP"arately */
+       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+               tss->x86_tss.ss1 = thread->sysenter_cs;
+               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+       }
+#endif
+}
+
+static inline void native_swapgs(void)
+{
+#ifdef CONFIG_X86_64
+       asm volatile("swapgs" ::: "memory");
+#endif
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+#define paravirt_enabled() 0
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)                            \
+       (var) = native_get_debugreg(register)
+#define set_debugreg(value, register)                          \
+       native_set_debugreg(register, value)
+
+static inline void load_sp0(struct tss_struct *tss,
+                           struct thread_struct *thread)
+{
+       native_load_sp0(tss, thread);
+}
+
+#define set_iopl_mask native_set_iopl_mask
+#define SWAPGS swapgs
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long mmu_cr4_features;
+
+static inline void set_in_cr4(unsigned long mask)
+{
+       unsigned cr4;
+       mmu_cr4_features |= mask;
+       cr4 = read_cr4();
+       cr4 |= mask;
+       write_cr4(cr4);
+}
+
+static inline void clear_in_cr4(unsigned long mask)
+{
+       unsigned cr4;
+       mmu_cr4_features &= ~mask;
+       cr4 = read_cr4();
+       cr4 &= ~mask;
+       write_cr4(cr4);
+}
+
+struct microcode_header {
+       unsigned int hdrver;
+       unsigned int rev;
+       unsigned int date;
+       unsigned int sig;
+       unsigned int cksum;
+       unsigned int ldrver;
+       unsigned int pf;
+       unsigned int datasize;
+       unsigned int totalsize;
+       unsigned int reserved[3];
+};
+
+struct microcode {
+       struct microcode_header hdr;
+       unsigned int bits[0];
+};
+
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+       unsigned int sig;
+       unsigned int pf;
+       unsigned int cksum;
+};
+
+struct extended_sigtable {
+       unsigned int count;
+       unsigned int cksum;
+       unsigned int reserved[3];
+       struct extended_signature sigs[0];
+};
+
+typedef struct {
+       unsigned long seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy status */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+                        unsigned int *eax, unsigned int *ebx,
+                        unsigned int *ecx, unsigned int *edx)
+{
+       *eax = op;
+       *ecx = 0;
+       __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+                              unsigned int *eax, unsigned int *ebx,
+                              unsigned int *ecx, unsigned int *edx)
+{
+       *eax = op;
+       *ecx = count;
+       __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       cpuid(op, &eax, &ebx, &ecx, &edx);
+       return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       cpuid(op, &eax, &ebx, &ecx, &edx);
+       return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       cpuid(op, &eax, &ebx, &ecx, &edx);
+       return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       cpuid(op, &eax, &ebx, &ecx, &edx);
+       return edx;
+}
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+       __asm__ __volatile__("rep;nop": : :"memory");
+}
+
+/* Stop speculative execution */
+static inline void sync_core(void)
+{
+       int tmp;
+       asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+                                         : "ebx", "ecx", "edx", "memory");
+}
+
+#define cpu_relax()   rep_nop()
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+               unsigned long edx)
+{
+       /* "monitor %eax,%ecx,%edx;" */
+       asm volatile(
+               ".byte 0x0f,0x01,0xc8;"
+               : :"a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+       /* "mwait %eax,%ecx;" */
+       asm volatile(
+               ".byte 0x0f,0x01,0xc9;"
+               : :"a" (eax), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+       /* "mwait %eax,%ecx;" */
+       asm volatile(
+               "sti; .byte 0x0f,0x01,0xc9;"
+               : :"a" (eax), "c" (ecx));
+}
+
+extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
+
+extern int force_mwait;
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+
+extern unsigned long boot_option_idle_override;
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(void);
+extern void cpu_init(void);
+extern void init_gdt(int cpu);
+
+/* From the system description table in BIOS.  Mostly for MCA use, but
+ * others may find it useful. */
+extern unsigned int machine_id;
+extern unsigned int machine_submodel_id;
+extern unsigned int BIOS_revision;
+extern unsigned int mca_pentium_flag;
+
+/* Boot loader type from the setup header */
+extern int bootloader_type;
+
+extern char ignore_fpu_irq;
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+#define BASE_PREFETCH  ASM_NOP4
+#define ARCH_HAS_PREFETCH
+#else
+#define BASE_PREFETCH  "prefetcht0 (%1)"
+#endif
+
+/* Prefetch instructions for Pentium III and AMD Athlon */
+/* It's not worth caring about 3dnow! prefetches for the K6
+   because they are microcoded there and very slow.
+   However, we currently don't do prefetches for pre-XP Athlons;
+   that should be fixed. */
+static inline void prefetch(const void *x)
+{
+       alternative_input(BASE_PREFETCH,
+                         "prefetchnta (%1)",
+                         X86_FEATURE_XMM,
+                         "r" (x));
+}
+
+/* 3dnow! prefetch to get an exclusive cache line. Useful for
+   spinlocks to avoid one state transition in the cache coherency protocol. */
+static inline void prefetchw(const void *x)
+{
+       alternative_input(BASE_PREFETCH,
+                         "prefetchw (%1)",
+                         X86_FEATURE_3DNOW,
+                         "r" (x));
+}
+
+#define spin_lock_prefetch(x)  prefetchw(x)
 #ifdef CONFIG_X86_32
-# include "processor_32.h"
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE      (PAGE_OFFSET)
+
+#define INIT_THREAD  {                                                 \
+       .sp0 = sizeof(init_stack) + (long)&init_stack,                  \
+       .vm86_info = NULL,                                              \
+       .sysenter_cs = __KERNEL_CS,                                     \
+       .io_bitmap_ptr = NULL,                                          \
+       .fs = __KERNEL_PERCPU,                                          \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS  {                                                    \
+       .x86_tss = {                                                    \
+               .sp0            = sizeof(init_stack) + (long)&init_stack, \
+               .ss0            = __KERNEL_DS,                          \
+               .ss1            = __KERNEL_CS,                          \
+               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,             \
+        },                                                             \
+       .io_bitmap      = { [0 ... IO_BITMAP_LONGS] = ~0 },             \
+}
+
+#define start_thread(regs, new_eip, new_esp) do {              \
+       __asm__("movl %0,%%gs": :"r" (0));                      \
+       regs->fs = 0;                                           \
+       set_fs(USER_DS);                                        \
+       regs->ds = __USER_DS;                                   \
+       regs->es = __USER_DS;                                   \
+       regs->ss = __USER_DS;                                   \
+       regs->cs = __USER_CS;                                   \
+       regs->ip = new_eip;                                     \
+       regs->sp = new_esp;                                     \
+} while (0)
+
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info)                                                 \
+({                                                                     \
+       unsigned long *__ptr = (unsigned long *)(info);                 \
+       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
+})
+
+/*
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
+ * on the stack (interrupt gate does not save these registers
+ * when switching to the same priv ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain the
+ * completely wrong values.
+ */
+#define task_pt_regs(task)                                             \
+({                                                                     \
+       struct pt_regs *__regs__;                                       \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ - 1;                                                   \
+})
+
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
 #else
-# include "processor_64.h"
+/*
+ * User space process size: 47 bits minus one guard page.
+ */
+#define TASK_SIZE64    (0x800000000000UL - 4096)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+                          0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? \
+                                IA32_PAGE_OFFSET : TASK_SIZE64)
+#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? \
+                                 IA32_PAGE_OFFSET : TASK_SIZE64)
+
+#define INIT_THREAD  { \
+       .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS  { \
+       .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define start_thread(regs, new_rip, new_rsp) do {                           \
+       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));  \
+       load_gs_index(0);                                                    \
+       (regs)->ip = (new_rip);                                              \
+       (regs)->sp = (new_rsp);                                              \
+       write_pda(oldrsp, (new_rsp));                                        \
+       (regs)->cs = __USER_CS;                                              \
+       (regs)->ss = __USER_DS;                                              \
+       (regs)->flags = 0x200;                                               \
+       set_fs(USER_DS);                                                     \
+} while (0)
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
+#endif /* CONFIG_X86_64 */
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+
 #endif
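
For illustration only (not part of the commit): a minimal userspace sketch that mirrors the unified cpuid() helpers above, including the %ecx pre-clear the header comment calls out for the Cyrix MII, and prints the vendor string from leaf 0. The demo_cpuid() name and the standalone program are invented for the example.

/* build with: gcc -O2 cpuid_demo.c */
#include <stdio.h>
#include <string.h>

static inline void demo_cpuid(unsigned int op,
                              unsigned int *eax, unsigned int *ebx,
                              unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = 0;       /* clear %ecx up front, as the header comment warns */
        asm volatile("cpuid"
                     : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                     : "0" (*eax), "2" (*ecx));
}

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        demo_cpuid(0, &eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);    /* vendor string lives in EBX:EDX:ECX */
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("max basic leaf %u, vendor \"%s\"\n", eax, vendor);
        return 0;
}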
diff --git a/include/asm-x86/processor_32.h b/include/asm-x86/processor_32.h
deleted file mode 100644 (file)
index 13976b0..0000000
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * include/asm-i386/processor.h
- *
- * Copyright (C) 1994 Linus Torvalds
- */
-
-#ifndef __ASM_I386_PROCESSOR_H
-#define __ASM_I386_PROCESSOR_H
-
-#include <asm/vm86.h>
-#include <asm/math_emu.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/sigcontext.h>
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
-#include <asm/system.h>
-#include <linux/cache.h>
-#include <linux/threads.h>
-#include <asm/percpu.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <asm/processor-flags.h>
-
-/* flag for disabling the tsc */
-extern int tsc_disable;
-
-struct desc_struct {
-       unsigned long a,b;
-};
-
-#define desc_empty(desc) \
-               (!((desc)->a | (desc)->b))
-
-#define desc_equal(desc1, desc2) \
-               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
-/*
- * Default implementation of macro that returns current
- * instruction pointer ("program counter").
- */
-#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
-
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-       __u8    x86;            /* CPU family */
-       __u8    x86_vendor;     /* CPU vendor */
-       __u8    x86_model;
-       __u8    x86_mask;
-       char    wp_works_ok;    /* It doesn't on 386's */
-       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
-       char    hard_math;
-       char    rfu;
-               int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
-       unsigned long   x86_capability[NCAPINTS];
-       char    x86_vendor_id[16];
-       char    x86_model_id[64];
-       int     x86_cache_size;  /* in KB - valid for CPUS which support this
-                                   call  */
-       int     x86_cache_alignment;    /* In bytes */
-       char    fdiv_bug;
-       char    f00f_bug;
-       char    coma_bug;
-       char    pad0;
-       int     x86_power;
-       unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
-       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
-#endif
-       unsigned char x86_max_cores;    /* cpuid returned max cores value */
-       unsigned char apicid;
-       unsigned short x86_clflush_size;
-#ifdef CONFIG_SMP
-       unsigned char booted_cores;     /* number of cores as seen by OS */
-       __u8 phys_proc_id;              /* Physical processor id. */
-       __u8 cpu_core_id;               /* Core id */
-       __u8 cpu_index;                 /* index into per_cpu list */
-#endif
-} __attribute__((__aligned__(SMP_CACHE_BYTES)));
-
-#define X86_VENDOR_INTEL 0
-#define X86_VENDOR_CYRIX 1
-#define X86_VENDOR_AMD 2
-#define X86_VENDOR_UMC 3
-#define X86_VENDOR_NEXGEN 4
-#define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_TRANSMETA 7
-#define X86_VENDOR_NSC 8
-#define X86_VENDOR_NUM 9
-#define X86_VENDOR_UNKNOWN 0xff
-
-/*
- * capabilities of CPUs
- */
-
-extern struct cpuinfo_x86 boot_cpu_data;
-extern struct cpuinfo_x86 new_cpu_data;
-extern struct tss_struct doublefault_tss;
-DECLARE_PER_CPU(struct tss_struct, init_tss);
-
-#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
-#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
-#define current_cpu_data       cpu_data(smp_processor_id())
-#else
-#define cpu_data(cpu)          boot_cpu_data
-#define current_cpu_data       boot_cpu_data
-#endif
-
-/*
- * the following now lives in the per cpu area:
- * extern      int cpu_llc_id[NR_CPUS];
- */
-DECLARE_PER_CPU(u8, cpu_llc_id);
-extern char ignore_fpu_irq;
-
-void __init cpu_detect(struct cpuinfo_x86 *c);
-
-extern void identify_boot_cpu(void);
-extern void identify_secondary_cpu(struct cpuinfo_x86 *);
-extern void print_cpu_info(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
-
-#ifdef CONFIG_X86_HT
-extern void detect_ht(struct cpuinfo_x86 *c);
-#else
-static inline void detect_ht(struct cpuinfo_x86 *c) {}
-#endif
-
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
-                                        unsigned int *ecx, unsigned int *edx)
-{
-       /* ecx is often an input as well as an output. */
-       __asm__("cpuid"
-               : "=a" (*eax),
-                 "=b" (*ebx),
-                 "=c" (*ecx),
-                 "=d" (*edx)
-               : "0" (*eax), "2" (*ecx));
-}
-
-#define load_cr3(pgdir) write_cr3(__pa(pgdir))
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-
-static inline void set_in_cr4 (unsigned long mask)
-{
-       unsigned cr4;
-       mmu_cr4_features |= mask;
-       cr4 = read_cr4();
-       cr4 |= mask;
-       write_cr4(cr4);
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
-       unsigned cr4;
-       mmu_cr4_features &= ~mask;
-       cr4 = read_cr4();
-       cr4 &= ~mask;
-       write_cr4(cr4);
-}
-
-/* Stop speculative execution */
-static inline void sync_core(void)
-{
-       int tmp;
-       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-}
-
-static inline void __monitor(const void *eax, unsigned long ecx,
-               unsigned long edx)
-{
-       /* "monitor %eax,%ecx,%edx;" */
-       asm volatile(
-               ".byte 0x0f,0x01,0xc8;"
-               : :"a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-       /* "mwait %eax,%ecx;" */
-       asm volatile(
-               ".byte 0x0f,0x01,0xc9;"
-               : :"a" (eax), "c" (ecx));
-}
-
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
-/* from system description table in BIOS.  Mostly for MCA use, but
-others may find it useful. */
-extern unsigned int machine_id;
-extern unsigned int machine_submodel_id;
-extern unsigned int BIOS_revision;
-extern unsigned int mca_pentium_flag;
-
-/* Boot loader type from the setup header */
-extern int bootloader_type;
-
-/*
- * User space process size: 3GB (default).
- */
-#define TASK_SIZE      (PAGE_OFFSET)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
-
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
-extern void hard_disable_TSC(void);
-extern void disable_TSC(void);
-extern void hard_enable_TSC(void);
-
-/*
- * Size of io_bitmap.
- */
-#define IO_BITMAP_BITS  65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
-#define INVALID_IO_BITMAP_OFFSET 0x8000
-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
-
-struct i387_fsave_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       long    status;         /* software status information */
-};
-
-struct i387_fxsave_struct {
-       unsigned short  cwd;
-       unsigned short  swd;
-       unsigned short  twd;
-       unsigned short  fop;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    mxcsr;
-       long    mxcsr_mask;
-       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
-       long    padding[56];
-} __attribute__ ((aligned (16)));
-
-struct i387_soft_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
-       struct info     *info;
-       unsigned long   entry_eip;
-};
-
-union i387_union {
-       struct i387_fsave_struct        fsave;
-       struct i387_fxsave_struct       fxsave;
-       struct i387_soft_struct soft;
-};
-
-typedef struct {
-       unsigned long seg;
-} mm_segment_t;
-
-struct thread_struct;
-
-/* This is the TSS defined by the hardware. */
-struct i386_hw_tss {
-       unsigned short  back_link,__blh;
-       unsigned long   esp0;
-       unsigned short  ss0,__ss0h;
-       unsigned long   esp1;
-       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
-       unsigned long   esp2;
-       unsigned short  ss2,__ss2h;
-       unsigned long   __cr3;
-       unsigned long   eip;
-       unsigned long   eflags;
-       unsigned long   eax,ecx,edx,ebx;
-       unsigned long   esp;
-       unsigned long   ebp;
-       unsigned long   esi;
-       unsigned long   edi;
-       unsigned short  es, __esh;
-       unsigned short  cs, __csh;
-       unsigned short  ss, __ssh;
-       unsigned short  ds, __dsh;
-       unsigned short  fs, __fsh;
-       unsigned short  gs, __gsh;
-       unsigned short  ldt, __ldth;
-       unsigned short  trace, io_bitmap_base;
-} __attribute__((packed));
-
-struct tss_struct {
-       struct i386_hw_tss x86_tss;
-
-       /*
-        * The extra 1 is there because the CPU will access an
-        * additional byte beyond the end of the IO permission
-        * bitmap. The extra byte must be all 1 bits, and must
-        * be within the limit.
-        */
-       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
-       /*
-        * Cache the current maximum and the last task that used the bitmap:
-        */
-       unsigned long io_bitmap_max;
-       struct thread_struct *io_bitmap_owner;
-       /*
-        * pads the TSS to be cacheline-aligned (size is 0x100)
-        */
-       unsigned long __cacheline_filler[35];
-       /*
-        * .. and then another 0x100 bytes for emergency kernel stack
-        */
-       unsigned long stack[64];
-} __attribute__((packed));
-
-#define ARCH_MIN_TASKALIGN     16
-
-struct thread_struct {
-/* cached TLS descriptors. */
-       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
-       unsigned long   esp0;
-       unsigned long   sysenter_cs;
-       unsigned long   eip;
-       unsigned long   esp;
-       unsigned long   fs;
-       unsigned long   gs;
-/* Hardware debugging registers */
-       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
-/* fault info */
-       unsigned long   cr2, trap_no, error_code;
-/* floating point info */
-       union i387_union        i387;
-/* virtual 86 mode info */
-       struct vm86_struct __user * vm86_info;
-       unsigned long           screen_bitmap;
-       unsigned long           v86flags, v86mask, saved_esp0;
-       unsigned int            saved_fs, saved_gs;
-/* IO permissions */
-       unsigned long   *io_bitmap_ptr;
-       unsigned long   iopl;
-/* max allowed port in the bitmap, in bytes: */
-       unsigned long   io_bitmap_max;
-};
-
-#define INIT_THREAD  {                                                 \
-       .esp0 = sizeof(init_stack) + (long)&init_stack,                 \
-       .vm86_info = NULL,                                              \
-       .sysenter_cs = __KERNEL_CS,                                     \
-       .io_bitmap_ptr = NULL,                                          \
-       .fs = __KERNEL_PERCPU,                                          \
-}
-
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS  {                                                    \
-       .x86_tss = {                                                    \
-               .esp0           = sizeof(init_stack) + (long)&init_stack, \
-               .ss0            = __KERNEL_DS,                          \
-               .ss1            = __KERNEL_CS,                          \
-               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,             \
-        },                                                             \
-       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
-}
-
-#define start_thread(regs, new_eip, new_esp) do {              \
-       __asm__("movl %0,%%gs": :"r" (0));                      \
-       regs->xfs = 0;                                          \
-       set_fs(USER_DS);                                        \
-       regs->xds = __USER_DS;                                  \
-       regs->xes = __USER_DS;                                  \
-       regs->xss = __USER_DS;                                  \
-       regs->xcs = __USER_CS;                                  \
-       regs->eip = new_eip;                                    \
-       regs->esp = new_esp;                                    \
-} while (0)
-
-/* Forward declaration, a strange C thing */
-struct task_struct;
-struct mm_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
-/*
- * create a kernel thread without removing it from tasklists
- */
-extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack);
-
-unsigned long get_wchan(struct task_struct *p);
-
-#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info)                                                 \
-({                                                                     \
-       unsigned long *__ptr = (unsigned long *)(info);                 \
-       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
-})
-
-/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessable even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the xss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
-       __regs__ - 1;                                                   \
-})
-
-#define KSTK_EIP(task) (task_pt_regs(task)->eip)
-#define KSTK_ESP(task) (task_pt_regs(task)->esp)
-
-
-struct microcode_header {
-       unsigned int hdrver;
-       unsigned int rev;
-       unsigned int date;
-       unsigned int sig;
-       unsigned int cksum;
-       unsigned int ldrver;
-       unsigned int pf;
-       unsigned int datasize;
-       unsigned int totalsize;
-       unsigned int reserved[3];
-};
-
-struct microcode {
-       struct microcode_header hdr;
-       unsigned int bits[0];
-};
-
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-       unsigned int sig;
-       unsigned int pf;
-       unsigned int cksum;
-};
-
-struct extended_sigtable {
-       unsigned int count;
-       unsigned int cksum;
-       unsigned int reserved[3];
-       struct extended_signature sigs[0];
-};
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-       __asm__ __volatile__("rep;nop": : :"memory");
-}
-
-#define cpu_relax()    rep_nop()
-
-static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-       tss->x86_tss.esp0 = thread->esp0;
-       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
-       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-               tss->x86_tss.ss1 = thread->sysenter_cs;
-               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-       }
-}
-
-
-static inline unsigned long native_get_debugreg(int regno)
-{
-       unsigned long val = 0;  /* Damn you, gcc! */
-
-       switch (regno) {
-       case 0:
-               asm("movl %%db0, %0" :"=r" (val)); break;
-       case 1:
-               asm("movl %%db1, %0" :"=r" (val)); break;
-       case 2:
-               asm("movl %%db2, %0" :"=r" (val)); break;
-       case 3:
-               asm("movl %%db3, %0" :"=r" (val)); break;
-       case 6:
-               asm("movl %%db6, %0" :"=r" (val)); break;
-       case 7:
-               asm("movl %%db7, %0" :"=r" (val)); break;
-       default:
-               BUG();
-       }
-       return val;
-}
-
-static inline void native_set_debugreg(int regno, unsigned long value)
-{
-       switch (regno) {
-       case 0:
-               asm("movl %0,%%db0"     : /* no output */ :"r" (value));
-               break;
-       case 1:
-               asm("movl %0,%%db1"     : /* no output */ :"r" (value));
-               break;
-       case 2:
-               asm("movl %0,%%db2"     : /* no output */ :"r" (value));
-               break;
-       case 3:
-               asm("movl %0,%%db3"     : /* no output */ :"r" (value));
-               break;
-       case 6:
-               asm("movl %0,%%db6"     : /* no output */ :"r" (value));
-               break;
-       case 7:
-               asm("movl %0,%%db7"     : /* no output */ :"r" (value));
-               break;
-       default:
-               BUG();
-       }
-}
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void native_set_iopl_mask(unsigned mask)
-{
-       unsigned int reg;
-       __asm__ __volatile__ ("pushfl;"
-                             "popl %0;"
-                             "andl %1, %0;"
-                             "orl %2, %0;"
-                             "pushl %0;"
-                             "popfl"
-                               : "=&r" (reg)
-                               : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define paravirt_enabled() 0
-#define __cpuid native_cpuid
-
-static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-       native_load_esp0(tss, thread);
-}
-
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)                            \
-       (var) = native_get_debugreg(register)
-#define set_debugreg(value, register)                          \
-       native_set_debugreg(register, value)
-
-#define set_iopl_mask native_set_iopl_mask
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
-                        unsigned int *eax, unsigned int *ebx,
-                        unsigned int *ecx, unsigned int *edx)
-{
-       *eax = op;
-       *ecx = 0;
-       __cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
-                              unsigned int *eax, unsigned int *ebx,
-                              unsigned int *ecx, unsigned int *edx)
-{
-       *eax = op;
-       *ecx = count;
-       __cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       cpuid(op, &eax, &ebx, &ecx, &edx);
-       return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       cpuid(op, &eax, &ebx, &ecx, &edx);
-       return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       cpuid(op, &eax, &ebx, &ecx, &edx);
-       return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       cpuid(op, &eax, &ebx, &ecx, &edx);
-       return edx;
-}
-
-/* generic versions from gas */
-#define GENERIC_NOP1   ".byte 0x90\n"
-#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
-#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
-#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
-#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
-#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
-
-/* Opteron nops */
-#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2        ".byte 0x66,0x90\n" 
-#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
-#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
-#define K8_NOP5        K8_NOP3 K8_NOP2 
-#define K8_NOP6        K8_NOP3 K8_NOP3
-#define K8_NOP7        K8_NOP4 K8_NOP3
-#define K8_NOP8        K8_NOP4 K8_NOP4
-
-/* K7 nops */
-/* uses eax dependencies (arbitary choice) */
-#define K7_NOP1  GENERIC_NOP1
-#define K7_NOP2        ".byte 0x8b,0xc0\n" 
-#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
-#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
-#define K7_NOP5        K7_NOP4 ASM_NOP1
-#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
-#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
-#define K7_NOP8        K7_NOP7 ASM_NOP1
-
-/* P6 nops */
-/* uses eax dependencies (Intel-recommended choice) */
-#define P6_NOP1        GENERIC_NOP1
-#define P6_NOP2        ".byte 0x66,0x90\n"
-#define P6_NOP3        ".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4        ".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5        ".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6        ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7        ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8        ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
-
-#ifdef CONFIG_MK8
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
-#elif defined(CONFIG_MK7)
-#define ASM_NOP1 K7_NOP1
-#define ASM_NOP2 K7_NOP2
-#define ASM_NOP3 K7_NOP3
-#define ASM_NOP4 K7_NOP4
-#define ASM_NOP5 K7_NOP5
-#define ASM_NOP6 K7_NOP6
-#define ASM_NOP7 K7_NOP7
-#define ASM_NOP8 K7_NOP8
-#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \
-      defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \
-      defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
-#else
-#define ASM_NOP1 GENERIC_NOP1
-#define ASM_NOP2 GENERIC_NOP2
-#define ASM_NOP3 GENERIC_NOP3
-#define ASM_NOP4 GENERIC_NOP4
-#define ASM_NOP5 GENERIC_NOP5
-#define ASM_NOP6 GENERIC_NOP6
-#define ASM_NOP7 GENERIC_NOP7
-#define ASM_NOP8 GENERIC_NOP8
-#endif
-
-#define ASM_NOP_MAX 8
-
-/* Prefetch instructions for Pentium III and AMD Athlon */
-/* It's not worth to care about 3dnow! prefetches for the K6
-   because they are microcoded there and very slow.
-   However we don't do prefetches for pre XP Athlons currently
-   That should be fixed. */
-#define ARCH_HAS_PREFETCH
-static inline void prefetch(const void *x)
-{
-       alternative_input(ASM_NOP4,
-                         "prefetchnta (%1)",
-                         X86_FEATURE_XMM,
-                         "r" (x));
-}
-
-#define ARCH_HAS_PREFETCH
-#define ARCH_HAS_PREFETCHW
-#define ARCH_HAS_SPINLOCK_PREFETCH
-
-/* 3dnow! prefetch to get an exclusive cache line. Useful for 
-   spinlocks to avoid one state transition in the cache coherency protocol. */
-static inline void prefetchw(const void *x)
-{
-       alternative_input(ASM_NOP4,
-                         "prefetchw (%1)",
-                         X86_FEATURE_3DNOW,
-                         "r" (x));
-}
-#define spin_lock_prefetch(x)  prefetchw(x)
-
-extern void select_idle_routine(const struct cpuinfo_x86 *c);
-
-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-
-extern unsigned long boot_option_idle_override;
-extern void enable_sep_cpu(void);
-extern int sysenter_setup(void);
-
-/* Defined in head.S */
-extern struct Xgt_desc_struct early_gdt_descr;
-
-extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
-extern void cpu_init(void);
-extern void init_gdt(int cpu);
-
-extern int force_mwait;
-
-#endif /* __ASM_I386_PROCESSOR_H */
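
A side note on rep_nop()/cpu_relax(), which both the deleted header above and the unified header keep: the "rep; nop" encoding is the PAUSE hint for busy-wait loops. The sketch below, with invented demo_* names, shows the usual pattern of spinning on a flag with the hint in the loop body; it is an illustration under those assumptions, not code from this patch.

#include <stdatomic.h>

static atomic_int ready;

static inline void demo_cpu_relax(void)
{
        __asm__ __volatile__("rep; nop" : : : "memory");        /* PAUSE hint */
}

void demo_set_ready(void)
{
        atomic_store_explicit(&ready, 1, memory_order_release);
}

void demo_wait_for_ready(void)
{
        /* back off politely while polling the flag another thread will set */
        while (!atomic_load_explicit(&ready, memory_order_acquire))
                demo_cpu_relax();
}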
diff --git a/include/asm-x86/processor_64.h b/include/asm-x86/processor_64.h
deleted file mode 100644 (file)
index e4f1997..0000000
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * include/asm-x86_64/processor.h
- *
- * Copyright (C) 1994 Linus Torvalds
- */
-
-#ifndef __ASM_X86_64_PROCESSOR_H
-#define __ASM_X86_64_PROCESSOR_H
-
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/types.h>
-#include <asm/sigcontext.h>
-#include <asm/cpufeature.h>
-#include <linux/threads.h>
-#include <asm/msr.h>
-#include <asm/current.h>
-#include <asm/system.h>
-#include <asm/mmsegment.h>
-#include <asm/percpu.h>
-#include <linux/personality.h>
-#include <linux/cpumask.h>
-#include <asm/processor-flags.h>
-
-#define TF_MASK                0x00000100
-#define IF_MASK                0x00000200
-#define IOPL_MASK      0x00003000
-#define NT_MASK                0x00004000
-#define VM_MASK                0x00020000
-#define AC_MASK                0x00040000
-#define VIF_MASK       0x00080000      /* virtual interrupt flag */
-#define VIP_MASK       0x00100000      /* virtual interrupt pending */
-#define ID_MASK                0x00200000
-
-#define desc_empty(desc) \
-               (!((desc)->a | (desc)->b))
-
-#define desc_equal(desc1, desc2) \
-               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
-
-/*
- * Default implementation of macro that returns current
- * instruction pointer ("program counter").
- */
-#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
-
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- */
-
-struct cpuinfo_x86 {
-       __u8    x86;            /* CPU family */
-       __u8    x86_vendor;     /* CPU vendor */
-       __u8    x86_model;
-       __u8    x86_mask;
-       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
-       __u32   x86_capability[NCAPINTS];
-       char    x86_vendor_id[16];
-       char    x86_model_id[64];
-       int     x86_cache_size;  /* in KB */
-       int     x86_clflush_size;
-       int     x86_cache_alignment;
-       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined(in pages)*/
-        __u8    x86_virt_bits, x86_phys_bits;
-       __u8    x86_max_cores;  /* cpuid returned max cores value */
-        __u32   x86_power;     
-       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
-       unsigned long loops_per_jiffy;
-#ifdef CONFIG_SMP
-       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
-#endif
-       __u8    apicid;
-#ifdef CONFIG_SMP
-       __u8    booted_cores;   /* number of cores as seen by OS */
-       __u8    phys_proc_id;   /* Physical Processor id. */
-       __u8    cpu_core_id;    /* Core id. */
-       __u8    cpu_index;      /* index into per_cpu list */
-#endif
-} ____cacheline_aligned;
-
-#define X86_VENDOR_INTEL 0
-#define X86_VENDOR_CYRIX 1
-#define X86_VENDOR_AMD 2
-#define X86_VENDOR_UMC 3
-#define X86_VENDOR_NEXGEN 4
-#define X86_VENDOR_CENTAUR 5
-#define X86_VENDOR_TRANSMETA 7
-#define X86_VENDOR_NUM 8
-#define X86_VENDOR_UNKNOWN 0xff
-
-#ifdef CONFIG_SMP
-DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
-#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
-#define current_cpu_data       cpu_data(smp_processor_id())
-#else
-#define cpu_data(cpu)          boot_cpu_data
-#define current_cpu_data       boot_cpu_data
-#endif
-
-extern char ignore_irq13;
-
-extern void identify_cpu(struct cpuinfo_x86 *);
-extern void print_cpu_info(struct cpuinfo_x86 *);
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-
-static inline void set_in_cr4 (unsigned long mask)
-{
-       mmu_cr4_features |= mask;
-       __asm__("movq %%cr4,%%rax\n\t"
-               "orq %0,%%rax\n\t"
-               "movq %%rax,%%cr4\n"
-               : : "irg" (mask)
-               :"ax");
-}
-
-static inline void clear_in_cr4 (unsigned long mask)
-{
-       mmu_cr4_features &= ~mask;
-       __asm__("movq %%cr4,%%rax\n\t"
-               "andq %0,%%rax\n\t"
-               "movq %%rax,%%cr4\n"
-               : : "irg" (~mask)
-               :"ax");
-}
-
-
-/*
- * User space process size. 47bits minus one guard page.
- */
-#define TASK_SIZE64    (0x800000000000UL - 4096)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
-
-#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
-#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
-
-#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
-
-/*
- * Size of io_bitmap.
- */
-#define IO_BITMAP_BITS  65536
-#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
-#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
-#define INVALID_IO_BITMAP_OFFSET 0x8000
-
-struct i387_fxsave_struct {
-       u16     cwd;
-       u16     swd;
-       u16     twd;
-       u16     fop;
-       u64     rip;
-       u64     rdp; 
-       u32     mxcsr;
-       u32     mxcsr_mask;
-       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
-       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
-       u32     padding[24];
-} __attribute__ ((aligned (16)));
-
-union i387_union {
-       struct i387_fxsave_struct       fxsave;
-};
-
-struct tss_struct {
-       u32 reserved1;
-       u64 rsp0;       
-       u64 rsp1;
-       u64 rsp2;
-       u64 reserved2;
-       u64 ist[7];
-       u32 reserved3;
-       u32 reserved4;
-       u16 reserved5;
-       u16 io_bitmap_base;
-       /*
-        * The extra 1 is there because the CPU will access an
-        * additional byte beyond the end of the IO permission
-        * bitmap. The extra byte must be all 1 bits, and must
-        * be within the limit. Thus we have:
-        *
-        * 128 bytes, the bitmap itself, for ports 0..0x3ff
-        * 8 bytes, for an extra "long" of ~0UL
-        */
-       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
-} __attribute__((packed)) ____cacheline_aligned;
-
-
-extern struct cpuinfo_x86 boot_cpu_data;
-DECLARE_PER_CPU(struct tss_struct,init_tss);
-/* Save the original ist values for checking stack pointers during debugging */
-struct orig_ist {
-       unsigned long ist[7];
-};
-DECLARE_PER_CPU(struct orig_ist, orig_ist);
-
-#ifdef CONFIG_X86_VSMP
-#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
-#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
-#else
-#define ARCH_MIN_TASKALIGN     16
-#define ARCH_MIN_MMSTRUCT_ALIGN        0
-#endif
-
-struct thread_struct {
-       unsigned long   rsp0;
-       unsigned long   rsp;
-       unsigned long   userrsp;        /* Copy from PDA */ 
-       unsigned long   fs;
-       unsigned long   gs;
-       unsigned short  es, ds, fsindex, gsindex;       
-/* Hardware debugging registers */
-       unsigned long   debugreg0;  
-       unsigned long   debugreg1;  
-       unsigned long   debugreg2;  
-       unsigned long   debugreg3;  
-       unsigned long   debugreg6;  
-       unsigned long   debugreg7;  
-/* fault info */
-       unsigned long   cr2, trap_no, error_code;
-/* floating point info */
-       union i387_union        i387  __attribute__((aligned(16)));
-/* IO permissions. the bitmap could be moved into the GDT, that would make
-   switch faster for a limited number of ioperm using tasks. -AK */
-       int             ioperm;
-       unsigned long   *io_bitmap_ptr;
-       unsigned io_bitmap_max;
-/* cached TLS descriptors. */
-       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
-} __attribute__((aligned(16)));
-
-#define INIT_THREAD  { \
-       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_TSS  { \
-       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_MMAP \
-{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
-
-#define start_thread(regs,new_rip,new_rsp) do { \
-       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
-       load_gs_index(0);                                                       \
-       (regs)->rip = (new_rip);                                                 \
-       (regs)->rsp = (new_rsp);                                                 \
-       write_pda(oldrsp, (new_rsp));                                            \
-       (regs)->cs = __USER_CS;                                                  \
-       (regs)->ss = __USER_DS;                                                  \
-       (regs)->eflags = 0x200;                                                  \
-       set_fs(USER_DS);                                                         \
-} while(0) 
-
-#define get_debugreg(var, register)                            \
-               __asm__("movq %%db" #register ", %0"            \
-                       :"=r" (var))
-#define set_debugreg(value, register)                  \
-               __asm__("movq %0,%%db" #register                \
-                       : /* no output */                       \
-                       :"r" (value))
-
-struct task_struct;
-struct mm_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-/* Prepare to copy thread state - unlazy all lazy status */
-extern void prepare_to_copy(struct task_struct *tsk);
-
-/*
- * create a kernel thread without removing it from tasklists
- */
-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
-
-extern unsigned long get_wchan(struct task_struct *p);
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
-#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
-#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
-
-
-struct microcode_header {
-       unsigned int hdrver;
-       unsigned int rev;
-       unsigned int date;
-       unsigned int sig;
-       unsigned int cksum;
-       unsigned int ldrver;
-       unsigned int pf;
-       unsigned int datasize;
-       unsigned int totalsize;
-       unsigned int reserved[3];
-};
-
-struct microcode {
-       struct microcode_header hdr;
-       unsigned int bits[0];
-};
-
-typedef struct microcode microcode_t;
-typedef struct microcode_header microcode_header_t;
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-       unsigned int sig;
-       unsigned int pf;
-       unsigned int cksum;
-};
-
-struct extended_sigtable {
-       unsigned int count;
-       unsigned int cksum;
-       unsigned int reserved[3];
-       struct extended_signature sigs[0];
-};
-
-
-#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
-#else
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
-#endif
-
-/* Opteron nops */
-#define K8_NOP1 ".byte 0x90\n"
-#define K8_NOP2        ".byte 0x66,0x90\n" 
-#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
-#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
-#define K8_NOP5        K8_NOP3 K8_NOP2 
-#define K8_NOP6        K8_NOP3 K8_NOP3
-#define K8_NOP7        K8_NOP4 K8_NOP3
-#define K8_NOP8        K8_NOP4 K8_NOP4
-
-/* P6 nops */
-/* uses eax dependencies (Intel-recommended choice) */
-#define P6_NOP1        ".byte 0x90\n"
-#define P6_NOP2        ".byte 0x66,0x90\n"
-#define P6_NOP3        ".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4        ".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5        ".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6        ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7        ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8        ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
-
-#define ASM_NOP_MAX 8
-
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
-{
-       __asm__ __volatile__("rep;nop": : :"memory");
-}
-
-/* Stop speculative execution */
-static inline void sync_core(void)
-{ 
-       int tmp;
-       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-} 
-
-#define ARCH_HAS_PREFETCHW 1
-static inline void prefetchw(void *x) 
-{ 
-       alternative_input("prefetcht0 (%1)",
-                         "prefetchw (%1)",
-                         X86_FEATURE_3DNOW,
-                         "r" (x));
-} 
-
-#define ARCH_HAS_SPINLOCK_PREFETCH 1
-
-#define spin_lock_prefetch(x)  prefetchw(x)
-
-#define cpu_relax()   rep_nop()
-
-static inline void __monitor(const void *eax, unsigned long ecx,
-               unsigned long edx)
-{
-       /* "monitor %eax,%ecx,%edx;" */
-       asm volatile(
-               ".byte 0x0f,0x01,0xc8;"
-               : :"a" (eax), "c" (ecx), "d"(edx));
-}
-
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-       /* "mwait %eax,%ecx;" */
-       asm volatile(
-               ".byte 0x0f,0x01,0xc9;"
-               : :"a" (eax), "c" (ecx));
-}
-
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-       /* "mwait %eax,%ecx;" */
-       asm volatile(
-               "sti; .byte 0x0f,0x01,0xc9;"
-               : :"a" (eax), "c" (ecx));
-}
-
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
-#define stack_current() \
-({                                                             \
-       struct thread_info *ti;                                 \
-       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
-       ti->task;                                       \
-})
-
-#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-
-extern unsigned long boot_option_idle_override;
-/* Boot loader type from the setup header */
-extern int bootloader_type;
-
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
-#endif /* __ASM_X86_64_PROCESSOR_H */
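
Quick arithmetic check (illustration only, using the constants visible in the diff): TASK_SIZE64 is the 47-bit user address-space limit minus one 4 KiB guard page, and the 32-bit compat limit depends on the ADDR_LIMIT_3GB personality. The tiny program below just prints those values.

#include <stdio.h>

int main(void)
{
        unsigned long task_size64 = 0x800000000000UL - 4096;    /* 47 bits - guard page */
        unsigned long ia32_3gb    = 0xc0000000UL;               /* ADDR_LIMIT_3GB personality */
        unsigned long ia32_deflt  = 0xFFFFe000UL;               /* default compat limit */

        printf("TASK_SIZE64      = %#lx\n", task_size64);
        printf("IA32 (3GB pers.) = %#lx\n", ia32_3gb);
        printf("IA32 (default)   = %#lx\n", ia32_deflt);
        return 0;
}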
index dabba55f7ed8814543f546bbedebeedae48f4a38..68563c0709ac438495c8b0abf6ac16f90337ee49 100644 (file)
@@ -5,87 +5,24 @@
 
 /* misc architecture specific prototypes */
 
-struct cpuinfo_x86; 
-struct pt_regs;
-
-extern void start_kernel(void);
-extern void pda_init(int); 
-
 extern void early_idt_handler(void);
 
-extern void mcheck_init(struct cpuinfo_x86 *c);
 extern void init_memory_mapping(unsigned long start, unsigned long end);
 
-extern void system_call(void); 
-extern int kernel_syscall(void);
+extern void system_call(void);
 extern void syscall_init(void);
 
 extern void ia32_syscall(void);
-extern void ia32_cstar_target(void); 
-extern void ia32_sysenter_target(void); 
-
-extern void config_acpi_tables(void);
-extern void ia32_syscall(void);
-
-extern int pmtimer_mark_offset(void);
-extern void pmtimer_resume(void);
-extern void pmtimer_wait(unsigned);
-extern unsigned int do_gettimeoffset_pm(void);
-#ifdef CONFIG_X86_PM_TIMER
-extern u32 pmtmr_ioport;
-#else
-#define pmtmr_ioport 0
-#endif
-extern int nohpet;
-
-extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
-
-extern void early_identify_cpu(struct cpuinfo_x86 *c);
-
-extern int k8_scan_nodes(unsigned long start, unsigned long end);
-
-extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern unsigned long numa_free_all_bootmem(void);
+extern void ia32_cstar_target(void);
+extern void ia32_sysenter_target(void);
 
 extern void reserve_bootmem_generic(unsigned long phys, unsigned len);
 
-extern void load_gs_index(unsigned gs);
-
-extern unsigned long end_pfn_map; 
-
-extern void show_trace(struct task_struct *, struct pt_regs *, unsigned long * rsp);
-extern void show_registers(struct pt_regs *regs);
-
-extern void exception_table_check(void);
-
-extern void acpi_reserve_bootmem(void);
-
-extern void swap_low_mappings(void);
-
-extern void __show_regs(struct pt_regs * regs);
-extern void show_regs(struct pt_regs * regs);
-
 extern void syscall32_cpu_init(void);
 
-extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end);
-
-extern void early_quirks(void);
 extern void check_efer(void);
 
-extern void select_idle_routine(const struct cpuinfo_x86 *c);
-
-extern unsigned long table_start, table_end;
-
-extern int exception_trace;
-extern unsigned cpu_khz;
-extern unsigned tsc_khz;
-
 extern int reboot_force;
-extern int notsc_setup(char *);
-
-extern int gsi_irq_sharing(int gsi);
-
-extern int force_mwait;
 
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
 
index 7524e12338332d6ee54fc472086caa2babb29c2a..81a8ee4c55fc50a941373d5c16de01da455bdee7 100644 (file)
 # define PTRACE_SYSEMU_SINGLESTEP 32
 #endif
 
+#define PTRACE_SINGLEBLOCK     33      /* resume execution until next branch */
+
+#ifndef __ASSEMBLY__
+
+#include <asm/types.h>
+
+/* configuration/status structure used in PTRACE_BTS_CONFIG and
+   PTRACE_BTS_STATUS commands.
+*/
+struct ptrace_bts_config {
+       /* requested or actual size of BTS buffer in bytes */
+       u32 size;
+       /* bitmask of below flags */
+       u32 flags;
+       /* buffer overflow signal */
+       u32 signal;
+       /* actual size of bts_struct in bytes */
+       u32 bts_size;
+};
+#endif
+
+#define PTRACE_BTS_O_TRACE     0x1 /* branch trace */
+#define PTRACE_BTS_O_SCHED     0x2 /* scheduling events w/ jiffies */
+#define PTRACE_BTS_O_SIGNAL     0x4 /* send SIG<signal> on buffer overflow
+                                      instead of wrapping around */
+#define PTRACE_BTS_O_CUT_SIZE  0x8 /* cut requested size to max available
+                                      instead of failing */
+
+#define PTRACE_BTS_CONFIG      40
+/* Configure branch trace recording.
+   ADDR points to a struct ptrace_bts_config.
+   DATA gives the size of that buffer.
+   A new buffer is allocated iff the size changes.
+   Returns the number of bytes read.
+*/
+#define PTRACE_BTS_STATUS      41
+/* Return the current configuration in a struct ptrace_bts_config
+   pointed to by ADDR; DATA gives the size of that buffer.
+   Returns the number of bytes written.
+*/
+#define PTRACE_BTS_SIZE                42
+/* Return the number of available BTS records.
+   DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_GET         43
+/* Get a single BTS record.
+   DATA defines the index into the BTS array, where 0 is the newest
+   entry, and higher indices refer to older entries.
+   ADDR is pointing to struct bts_struct (see asm/ds.h).
+*/
+#define PTRACE_BTS_CLEAR       44
+/* Clear the BTS buffer.
+   DATA and ADDR are ignored.
+*/
+#define PTRACE_BTS_DRAIN       45
+/* Read all available BTS records and clear the buffer.
+   ADDR points to an array of struct bts_struct.
+   DATA gives the size of that buffer.
+   BTS records are read from oldest to newest.
+   Returns number of BTS records drained.
+*/
+
 #endif
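
A hedged userspace sketch (not part of the commit) of how a tracer might turn on branch-trace recording with the requests defined above. The DEMO_* macros and demo_bts_config mirror the values in the diff; whether a given kernel accepts the request depends on BTS support actually being configured in, and error handling is deliberately minimal.

#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#define DEMO_PTRACE_BTS_CONFIG 40
#define DEMO_BTS_O_TRACE       0x1
#define DEMO_BTS_O_CUT_SIZE    0x8

struct demo_bts_config {                /* mirrors struct ptrace_bts_config above */
        uint32_t size;
        uint32_t flags;
        uint32_t signal;
        uint32_t bts_size;
};

long demo_enable_bts(pid_t child)
{
        struct demo_bts_config cfg = {
                .size  = 4096,                                   /* requested buffer size */
                .flags = DEMO_BTS_O_TRACE | DEMO_BTS_O_CUT_SIZE, /* trim size if too big */
        };

        /* child must already be stopped under ptrace; ADDR = &cfg, DATA = sizeof(cfg) */
        return ptrace(DEMO_PTRACE_BTS_CONFIG, child, &cfg,
                      (void *)(unsigned long)sizeof(cfg));
}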
index 51ddb2590870f96d0e6f0b373cd7a91e22b61d49..d9e04b46a44069a56565729085fc11b67399696d 100644 (file)
@@ -4,12 +4,15 @@
 #include <linux/compiler.h>    /* For __user */
 #include <asm/ptrace-abi.h>
 
+
 #ifndef __ASSEMBLY__
 
 #ifdef __i386__
 /* this struct defines the way the registers are stored on the
    stack during a system call. */
 
+#ifndef __KERNEL__
+
 struct pt_regs {
        long ebx;
        long ecx;
@@ -21,7 +24,7 @@ struct pt_regs {
        int  xds;
        int  xes;
        int  xfs;
-       /* int  xgs; */
+       /* int  gs; */
        long orig_eax;
        long eip;
        int  xcs;
@@ -30,44 +33,37 @@ struct pt_regs {
        int  xss;
 };
 
-#ifdef __KERNEL__
+#else /* __KERNEL__ */
+
+struct pt_regs {
+       long bx;
+       long cx;
+       long dx;
+       long si;
+       long di;
+       long bp;
+       long ax;
+       int  ds;
+       int  es;
+       int  fs;
+       /* int  gs; */
+       long orig_ax;
+       long ip;
+       int  cs;
+       long flags;
+       long sp;
+       int  ss;
+};
 
 #include <asm/vm86.h>
 #include <asm/segment.h>
 
-struct task_struct;
-extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
-
-/*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value.  This tricky test checks that with
- * one comparison.  Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
- */
-static inline int user_mode(struct pt_regs *regs)
-{
-       return (regs->xcs & SEGMENT_RPL_MASK) == USER_RPL;
-}
-static inline int user_mode_vm(struct pt_regs *regs)
-{
-       return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL;
-}
-static inline int v8086_mode(struct pt_regs *regs)
-{
-       return (regs->eflags & VM_MASK);
-}
-
-#define instruction_pointer(regs) ((regs)->eip)
-#define frame_pointer(regs) ((regs)->ebp)
-#define stack_pointer(regs) ((unsigned long)(regs))
-#define regs_return_value(regs) ((regs)->eax)
-
-extern unsigned long profile_pc(struct pt_regs *regs);
 #endif /* __KERNEL__ */
 
 #else /* __i386__ */
 
+#ifndef __KERNEL__
+
 struct pt_regs {
        unsigned long r15;
        unsigned long r14;
@@ -96,47 +92,143 @@ struct pt_regs {
 /* top of stack page */
 };
 
+#else /* __KERNEL__ */
+
+struct pt_regs {
+       unsigned long r15;
+       unsigned long r14;
+       unsigned long r13;
+       unsigned long r12;
+       unsigned long bp;
+       unsigned long bx;
+/* arguments: non interrupts/non tracing syscalls only save up to here */

+       unsigned long r11;
+       unsigned long r10;
+       unsigned long r9;
+       unsigned long r8;
+       unsigned long ax;
+       unsigned long cx;
+       unsigned long dx;
+       unsigned long si;
+       unsigned long di;
+       unsigned long orig_ax;
+/* end of arguments */
+/* cpu exception frame or undefined */
+       unsigned long ip;
+       unsigned long cs;
+       unsigned long flags;
+       unsigned long sp;
+       unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
 #ifdef __KERNEL__
 
-#define user_mode(regs) (!!((regs)->cs & 3))
-#define user_mode_vm(regs) user_mode(regs)
-#define instruction_pointer(regs) ((regs)->rip)
-#define frame_pointer(regs) ((regs)->rbp)
-#define stack_pointer(regs) ((regs)->rsp)
-#define regs_return_value(regs) ((regs)->rax)
+/* the DS BTS struct is used for ptrace as well */
+#include <asm/ds.h>
+
+struct task_struct;
+
+extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
 
 extern unsigned long profile_pc(struct pt_regs *regs);
+
+extern unsigned long
+convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
+
+#ifdef CONFIG_X86_32
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
+#else
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+#endif
 
-struct task_struct;
+#define regs_return_value(regs) ((regs)->ax)
+
+/*
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
+static inline int user_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+       return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+#else
+       return !!(regs->cs & 3);
+#endif
+}
+
+static inline int user_mode_vm(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+       return ((regs->cs & SEGMENT_RPL_MASK) |
+               (regs->flags & VM_MASK)) >= USER_RPL;
+#else
+       return user_mode(regs);
+#endif
+}
+
+static inline int v8086_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+       return (regs->flags & VM_MASK);
+#else
+       return 0;       /* No V86 mode support in long mode */
+#endif
+}
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps.  So regs will be the current sp.
+ *
+ * This is valid only for kernel mode traps.
+ */
+static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+       return (unsigned long)regs;
+#else
+       return regs->sp;
+#endif
+}
+
+static inline unsigned long instruction_pointer(struct pt_regs *regs)
+{
+       return regs->ip;
+}
+
+static inline unsigned long frame_pointer(struct pt_regs *regs)
+{
+       return regs->bp;
+}
+
+/*
+ * These are defined as per linux/ptrace.h, which see.
+ */
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
+extern void user_enable_block_step(struct task_struct *);
+#ifdef CONFIG_X86_DEBUGCTLMSR
+#define arch_has_block_step()  (1)
+#else
+#define arch_has_block_step()  (boot_cpu_data.x86 >= 6)
+#endif
+
+struct user_desc;
+extern int do_get_thread_area(struct task_struct *p, int idx,
+                             struct user_desc __user *info);
+extern int do_set_thread_area(struct task_struct *p, int idx,
+                             struct user_desc __user *info, int can_allocate);
 
-extern unsigned long
-convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
-
-enum {
-       EF_CF   = 0x00000001,
-       EF_PF   = 0x00000004,
-       EF_AF   = 0x00000010,
-       EF_ZF   = 0x00000040,
-       EF_SF   = 0x00000080,
-       EF_TF   = 0x00000100,
-       EF_IE   = 0x00000200,
-       EF_DF   = 0x00000400,
-       EF_OF   = 0x00000800,
-       EF_IOPL = 0x00003000,
-       EF_IOPL_RING0 = 0x00000000,
-       EF_IOPL_RING1 = 0x00001000,
-       EF_IOPL_RING2 = 0x00002000,
-       EF_NT   = 0x00004000,   /* nested task */
-       EF_RF   = 0x00010000,   /* resume */
-       EF_VM   = 0x00020000,   /* virtual mode */
-       EF_AC   = 0x00040000,   /* alignment */
-       EF_VIF  = 0x00080000,   /* virtual interrupt */
-       EF_VIP  = 0x00100000,   /* virtual interrupt pending */
-       EF_ID   = 0x00200000,   /* id */
-};
 #endif /* __KERNEL__ */
-#endif /* !__i386__ */
+
 #endif /* !__ASSEMBLY__ */
 
 #endif
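
The unified user_mode_vm() above keeps the old i386 one-comparison trick. With SEGMENT_RPL_MASK = 0x3, USER_RPL = 3 and VM_MASK (EFLAGS.VM) = 0x20000, OR-ing the RPL bits of the saved CS with the VM flag is >= 3 exactly when either CS had RPL 3 or the CPU was in virtual-8086 mode. A small stand-alone check of that claim, with the constants written out by hand so it compiles outside the kernel:

    #include <assert.h>

    #define SEGMENT_RPL_MASK 0x3
    #define USER_RPL         0x3
    #define VM_MASK          0x20000   /* EFLAGS.VM */

    /* Mirror of the kernel's one-comparison test */
    static int user_mode_vm_check(unsigned long cs, unsigned long flags)
    {
            return ((cs & SEGMENT_RPL_MASK) | (flags & VM_MASK)) >= USER_RPL;
    }

    int main(void)
    {
            assert( user_mode_vm_check(0x73, 0));        /* user CS, RPL 3         */
            assert(!user_mode_vm_check(0x60, 0));        /* kernel CS, RPL 0       */
            assert( user_mode_vm_check(0x60, VM_MASK));  /* vm86: RPL 0 but VM set */
            return 0;
    }
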
index 9b6dd093a9f740364e129b7713f9b3e27f7b1ea3..46f725b0bc823e5e73bbbeff1327f948ed9b1c5c 100644 (file)
@@ -1,5 +1,20 @@
-#ifdef CONFIG_X86_32
-# include "resume-trace_32.h"
-#else
-# include "resume-trace_64.h"
+#ifndef _ASM_X86_RESUME_TRACE_H
+#define _ASM_X86_RESUME_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user) do {                                        \
+       if (pm_trace_enabled) {                                 \
+               void *tracedata;                                \
+               asm volatile(_ASM_MOV_UL " $1f,%0\n"            \
+                       ".section .tracedata,\"a\"\n"           \
+                       "1:\t.word %c1\n\t"                     \
+                       _ASM_PTR " %c2\n"                       \
+                       ".previous"                             \
+                       :"=r" (tracedata)                       \
+                       : "i" (__LINE__), "i" (__FILE__));      \
+               generate_resume_trace(tracedata, user);         \
+       }                                                       \
+} while (0)
+
 #endif
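
The unified TRACE_RESUME() above is the old 32- and 64-bit variants folded together: _ASM_MOV_UL picks movl vs. movq and _ASM_PTR picks .long vs. .quad, so each use site drops a (line number, file-name pointer) record into the .tracedata section and hands its address to generate_resume_trace(). Conceptually each record looks like the struct below; this is only an illustration of the layout the asm emits (a 16-bit line number followed by a native pointer), not a declaration taken from this patch.

    /* Illustrative layout of one .tracedata record emitted by TRACE_RESUME():
     *   .word   __LINE__     -> 16-bit line number
     *   _ASM_PTR __FILE__    -> pointer to the file-name string literal
     * (packing details are whatever the assembler produces; shown as a sketch)
     */
    struct resume_trace_record {
            unsigned short line;
            const char *file;
    } __attribute__((packed));

On resume, the consumer hashes this (file, line) pair into persistent hardware state so the last trace point reached before a resume hang can be recovered after the next boot.
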
diff --git a/include/asm-x86/resume-trace_32.h b/include/asm-x86/resume-trace_32.h
deleted file mode 100644 (file)
index ec9cfd6..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#define TRACE_RESUME(user) do {                                        \
-       if (pm_trace_enabled) {                                 \
-               void *tracedata;                                \
-               asm volatile("movl $1f,%0\n"                    \
-                       ".section .tracedata,\"a\"\n"           \
-                       "1:\t.word %c1\n"                       \
-                       "\t.long %c2\n"                         \
-                       ".previous"                             \
-                       :"=r" (tracedata)                       \
-                       : "i" (__LINE__), "i" (__FILE__));      \
-               generate_resume_trace(tracedata, user);         \
-       }                                                       \
-} while (0)
diff --git a/include/asm-x86/resume-trace_64.h b/include/asm-x86/resume-trace_64.h
deleted file mode 100644 (file)
index 34bf998..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#define TRACE_RESUME(user) do {                                        \
-       if (pm_trace_enabled) {                                 \
-               void *tracedata;                                \
-               asm volatile("movq $1f,%0\n"                    \
-                       ".section .tracedata,\"a\"\n"           \
-                       "1:\t.word %c1\n"                       \
-                       "\t.quad %c2\n"                         \
-                       ".previous"                             \
-                       :"=r" (tracedata)                       \
-                       : "i" (__LINE__), "i" (__FILE__));      \
-               generate_resume_trace(tracedata, user);         \
-       }                                                       \
-} while (0)
index c7350f6d20158302e2b89609c24084a25f140eba..97cdcc9887ba9246649215a5dfd457440f95b8f7 100644 (file)
@@ -1,6 +1,6 @@
 /*
- * Derived from include/asm-i386/mach-summit/mach_mpparse.h
- *          and include/asm-i386/mach-default/bios_ebda.h
+ * Derived from include/asm-x86/mach-summit/mach_mpparse.h
+ *          and include/asm-x86/mach-default/bios_ebda.h
  *
  * Author: Laurent Vivier <Laurent.Vivier@bull.net>
  */
index f2b64a429e6b8eae6580852ff12b1ca5cb25a329..6a8c0d6451080658c6b2bee54fad409b11d47a9a 100644 (file)
@@ -2,7 +2,6 @@
 #define _ASM_X86_RWLOCK_H
 
 #define RW_LOCK_BIAS            0x01000000
-#define RW_LOCK_BIAS_STR       "0x01000000"
 
 /* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
 
index 041906f3c6df73375abda597ba38e63922ffb542..520a379f4b80490dd9e897d48120d89aef86bc13 100644 (file)
@@ -2,7 +2,7 @@
  *
  * Written by David Howells (dhowells@redhat.com).
  *
- * Derived from asm-i386/semaphore.h
+ * Derived from asm-x86/semaphore.h
  *
  *
  * The MSW of the count is the negated number of active writers and waiting
 
 struct rwsem_waiter;
 
-extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
-extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *sem));
+extern asmregparm struct rw_semaphore *
+ rwsem_down_read_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_down_write_failed(struct rw_semaphore *sem);
+extern asmregparm struct rw_semaphore *
+ rwsem_wake(struct rw_semaphore *);
+extern asmregparm struct rw_semaphore *
+ rwsem_downgrade_wake(struct rw_semaphore *sem);
 
 /*
  * the semaphore definition
index 3a1e76257a27635419ed72aa2822cfc50b1743ff..d13c197866d627daa572c6928034e639dc2b17f2 100644 (file)
@@ -1,5 +1,35 @@
+#ifndef _ASM_X86_SCATTERLIST_H
+#define _ASM_X86_SCATTERLIST_H
+
+#include <asm/types.h>
+
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+       unsigned long   sg_magic;
+#endif
+       unsigned long   page_link;
+       unsigned int    offset;
+       unsigned int    length;
+       dma_addr_t      dma_address;
+#ifdef CONFIG_X86_64
+       unsigned int    dma_length;
+#endif
+};
+
+#define ARCH_HAS_SG_CHAIN
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+
+/*
+ * These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg)     ((sg)->dma_address)
 #ifdef CONFIG_X86_32
-# include "scatterlist_32.h"
+# define sg_dma_len(sg)                ((sg)->length)
 #else
-# include "scatterlist_64.h"
+# define sg_dma_len(sg)                ((sg)->dma_length)
+#endif
+
 #endif
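
The merged scatterlist.h keeps the behavioural difference inside sg_dma_len(): on 32-bit it is the original length, while on 64-bit the IOMMU may have merged entries, so the mapped length lives in dma_length. The intended usage pattern, per the comment above, is to walk only the count returned by the mapping call. A hedged sketch of a driver loop (the entry points here are the generic DMA API of the same era, not something introduced by this patch):

    #include <linux/dma-mapping.h>
    #include <linux/scatterlist.h>
    #include <linux/errno.h>

    /* Illustrative only: map an sg list and program each mapped segment. */
    static int program_segments(struct device *dev, struct scatterlist *sgl, int nents)
    {
            struct scatterlist *sg;
            int i, mapped;

            mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);
            if (!mapped)
                    return -ENOMEM;

            /* walk only the entries the mapping call returned */
            for_each_sg(sgl, sg, mapped, i) {
                    dma_addr_t addr = sg_dma_address(sg);
                    unsigned int len = sg_dma_len(sg); /* length vs. dma_length per arch */
                    /* hand (addr, len) to the hypothetical hardware here */
                    (void)addr; (void)len;
            }
            return mapped;
    }
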
diff --git a/include/asm-x86/scatterlist_32.h b/include/asm-x86/scatterlist_32.h
deleted file mode 100644 (file)
index 0e7d997..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _I386_SCATTERLIST_H
-#define _I386_SCATTERLIST_H
-
-#include <asm/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-    unsigned long      sg_magic;
-#endif
-    unsigned long      page_link;
-    unsigned int       offset;
-    dma_addr_t         dma_address;
-    unsigned int       length;
-};
-
-#define ARCH_HAS_SG_CHAIN
-
-/* These macros should be used after a pci_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns.
- */
-#define sg_dma_address(sg)     ((sg)->dma_address)
-#define sg_dma_len(sg)         ((sg)->length)
-
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
-#endif /* !(_I386_SCATTERLIST_H) */
diff --git a/include/asm-x86/scatterlist_64.h b/include/asm-x86/scatterlist_64.h
deleted file mode 100644 (file)
index 1847c72..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _X8664_SCATTERLIST_H
-#define _X8664_SCATTERLIST_H
-
-#include <asm/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-    unsigned long      sg_magic;
-#endif
-    unsigned long      page_link;
-    unsigned int       offset;
-    unsigned int       length;
-    dma_addr_t         dma_address;
-    unsigned int        dma_length;
-};
-
-#define ARCH_HAS_SG_CHAIN
-
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
-/* These macros should be used after a pci_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns.
- */
-#define sg_dma_address(sg)     ((sg)->dma_address)
-#define sg_dma_len(sg)         ((sg)->dma_length)
-
-#endif 
index 605068280e2852435a84377f84e26c05df36138e..23f0535fec618beb3c909924ff82d8c4b063b0ca 100644 (file)
@@ -1,5 +1,204 @@
+#ifndef _ASM_X86_SEGMENT_H_
+#define _ASM_X86_SEGMENT_H_
+
+/* Simple and small GDT entries for booting only */
+
+#define GDT_ENTRY_BOOT_CS      2
+#define __BOOT_CS              (GDT_ENTRY_BOOT_CS * 8)
+
+#define GDT_ENTRY_BOOT_DS      (GDT_ENTRY_BOOT_CS + 1)
+#define __BOOT_DS              (GDT_ENTRY_BOOT_DS * 8)
+
+#define GDT_ENTRY_BOOT_TSS     (GDT_ENTRY_BOOT_CS + 2)
+#define __BOOT_TSS             (GDT_ENTRY_BOOT_TSS * 8)
+
 #ifdef CONFIG_X86_32
-# include "segment_32.h"
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ *   0 - null
+ *   1 - reserved
+ *   2 - reserved
+ *   3 - reserved
+ *
+ *   4 - unused                        <==== new cacheline
+ *   5 - unused
+ *
+ *  ------- start of TLS (Thread-Local Storage) segments:
+ *
+ *   6 - TLS segment #1                        [ glibc's TLS segment ]
+ *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
+ *   8 - TLS segment #3
+ *   9 - reserved
+ *  10 - reserved
+ *  11 - reserved
+ *
+ *  ------- start of kernel segments:
+ *
+ *  12 - kernel code segment           <==== new cacheline
+ *  13 - kernel data segment
+ *  14 - default user CS
+ *  15 - default user DS
+ *  16 - TSS
+ *  17 - LDT
+ *  18 - PNPBIOS support (16->32 gate)
+ *  19 - PNPBIOS support
+ *  20 - PNPBIOS support
+ *  21 - PNPBIOS support
+ *  22 - PNPBIOS support
+ *  23 - APM BIOS support
+ *  24 - APM BIOS support
+ *  25 - APM BIOS support
+ *
+ *  26 - ESPFIX small SS
+ *  27 - per-cpu                       [ offset to per-cpu data area ]
+ *  28 - unused
+ *  29 - unused
+ *  30 - unused
+ *  31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN      6
+#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_DEFAULT_USER_CS      14
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
+
+#define GDT_ENTRY_DEFAULT_USER_DS      15
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
+
+#define GDT_ENTRY_KERNEL_BASE  12
+
+#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
+
+#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
+
+#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
+#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
+
+#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
+#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
+
+#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
+
+#define GDT_ENTRY_PERCPU                       (GDT_ENTRY_KERNEL_BASE + 15)
+#ifdef CONFIG_SMP
+#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
 #else
-# include "segment_64.h"
+#define __KERNEL_PERCPU 0
+#endif
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS      31
+
+/*
+ * The GDT has 32 entries
+ */
+#define GDT_ENTRIES 32
+
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
+#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
+#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
+#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
+#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK       0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK                0x4
+
+/* User mode is privilege level 3 */
+#define USER_RPL               0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT            0x4
+#define SEGMENT_GDT            0x0
+
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
+#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
+
+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
+#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
+
+#else
+#include <asm/cache.h>
+
+#define __KERNEL_CS    0x10
+#define __KERNEL_DS    0x18
+
+#define __KERNEL32_CS   0x08
+
+/*
+ * we cannot use the same code segment descriptor for user and kernel
+ * -- not even in the long flat mode, because of different DPL /kkeil
+ * The segment offset needs to contain a RPL. Grr. -AK
+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ */
+
+#define __USER32_CS   0x23   /* 4*8+3 */
+#define __USER_DS     0x2b   /* 5*8+3 */
+#define __USER_CS     0x33   /* 6*8+3 */
+#define __USER32_DS    __USER_DS
+
+#define GDT_ENTRY_TSS 8        /* needs two entries */
+#define GDT_ENTRY_LDT 10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+#define GDT_ENTRY_PER_CPU 15   /* Abused to load per CPU data from limit */
+#define __PER_CPU_SEG  (GDT_ENTRY_PER_CPU * 8 + 3)
+
+/* TLS indexes for 64bit - hardcoded in arch_prctl */
+#define FS_TLS 0
+#define GS_TLS 1
+
+#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+#define GDT_ENTRIES 16
+
+#endif
+
+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl()  0
+#endif
+
+/* User mode is privilege level 3 */
+#define USER_RPL               0x3
+/* LDT segment has TI set, GDT has it cleared */
+#define SEGMENT_LDT            0x4
+#define SEGMENT_GDT            0x0
+
+/* Bottom two bits of selector give the ring privilege level */
+#define SEGMENT_RPL_MASK       0x3
+/* Bit 2 is table indicator (LDT/GDT) */
+#define SEGMENT_TI_MASK                0x4
+
+#define IDT_ENTRIES 256
+#define GDT_SIZE (GDT_ENTRIES * 8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+extern const char early_idt_handlers[IDT_ENTRIES][10];
+#endif
+#endif
+
 #endif
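
A segment selector is just (GDT index << 3) | TI | RPL, which is why the new segment.h can derive every __FOO selector from its GDT_ENTRY_FOO index. With the 32-bit layout above, GDT_ENTRY_DEFAULT_USER_CS = 14 gives __USER_CS = 14*8 + 3 = 0x73, and GDT_ENTRY_KERNEL_CS = 12 gives __KERNEL_CS = 0x60. A tiny stand-alone decoder (constants copied by hand so it builds anywhere) shows the same split that SEGMENT_RPL_MASK and SEGMENT_TI_MASK rely on:

    #include <stdio.h>

    #define SEGMENT_RPL_MASK 0x3   /* bits 0-1: requested privilege level */
    #define SEGMENT_TI_MASK  0x4   /* bit 2: 0 = GDT, 1 = LDT */

    static void decode(unsigned short sel)
    {
            printf("selector 0x%02x: index %u, %s, RPL %u\n",
                   sel, sel >> 3,
                   (sel & SEGMENT_TI_MASK) ? "LDT" : "GDT",
                   sel & SEGMENT_RPL_MASK);
    }

    int main(void)
    {
            decode(0x73);   /* __USER_CS on 32-bit:   index 14, GDT, RPL 3 */
            decode(0x60);   /* __KERNEL_CS on 32-bit: index 12, GDT, RPL 0 */
            decode(0x33);   /* __USER_CS on 64-bit:   index 6,  GDT, RPL 3 */
            return 0;
    }
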
diff --git a/include/asm-x86/segment_32.h b/include/asm-x86/segment_32.h
deleted file mode 100644 (file)
index 597a47c..0000000
+++ /dev/null
@@ -1,148 +0,0 @@
-#ifndef _ASM_SEGMENT_H
-#define _ASM_SEGMENT_H
-
-/*
- * The layout of the per-CPU GDT under Linux:
- *
- *   0 - null
- *   1 - reserved
- *   2 - reserved
- *   3 - reserved
- *
- *   4 - unused                        <==== new cacheline
- *   5 - unused
- *
- *  ------- start of TLS (Thread-Local Storage) segments:
- *
- *   6 - TLS segment #1                        [ glibc's TLS segment ]
- *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
- *   8 - TLS segment #3
- *   9 - reserved
- *  10 - reserved
- *  11 - reserved
- *
- *  ------- start of kernel segments:
- *
- *  12 - kernel code segment           <==== new cacheline
- *  13 - kernel data segment
- *  14 - default user CS
- *  15 - default user DS
- *  16 - TSS
- *  17 - LDT
- *  18 - PNPBIOS support (16->32 gate)
- *  19 - PNPBIOS support
- *  20 - PNPBIOS support
- *  21 - PNPBIOS support
- *  22 - PNPBIOS support
- *  23 - APM BIOS support
- *  24 - APM BIOS support
- *  25 - APM BIOS support 
- *
- *  26 - ESPFIX small SS
- *  27 - per-cpu                       [ offset to per-cpu data area ]
- *  28 - unused
- *  29 - unused
- *  30 - unused
- *  31 - TSS for double fault handler
- */
-#define GDT_ENTRY_TLS_ENTRIES  3
-#define GDT_ENTRY_TLS_MIN      6
-#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
-
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
-
-#define GDT_ENTRY_DEFAULT_USER_CS      14
-#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
-
-#define GDT_ENTRY_DEFAULT_USER_DS      15
-#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
-
-#define GDT_ENTRY_KERNEL_BASE  12
-
-#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
-#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
-
-#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
-#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
-
-#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
-#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
-
-#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
-#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
-
-#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
-#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
-
-#define GDT_ENTRY_PERCPU                       (GDT_ENTRY_KERNEL_BASE + 15)
-#ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
-#else
-#define __KERNEL_PERCPU 0
-#endif
-
-#define GDT_ENTRY_DOUBLEFAULT_TSS      31
-
-/*
- * The GDT has 32 entries
- */
-#define GDT_ENTRIES 32
-#define GDT_SIZE (GDT_ENTRIES * 8)
-
-/* Simple and small GDT entries for booting only */
-
-#define GDT_ENTRY_BOOT_CS              2
-#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
-
-#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
-
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
-#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
-#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
-#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
-#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
-
-/*
- * The interrupt descriptor table has room for 256 idt's,
- * the global descriptor table is dependent on the number
- * of tasks we can have..
- */
-#define IDT_ENTRIES 256
-
-/* Bottom two bits of selector give the ring privilege level */
-#define SEGMENT_RPL_MASK       0x3
-/* Bit 2 is table indicator (LDT/GDT) */
-#define SEGMENT_TI_MASK                0x4
-
-/* User mode is privilege level 3 */
-#define USER_RPL               0x3
-/* LDT segment has TI set, GDT has it cleared */
-#define SEGMENT_LDT            0x4
-#define SEGMENT_GDT            0x0
-
-#ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl()  0
-#endif
-/*
- * Matching rules for certain types of segments.
- */
-
-/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
-#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
-
-/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
-#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
-
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
-
-#endif
diff --git a/include/asm-x86/segment_64.h b/include/asm-x86/segment_64.h
deleted file mode 100644 (file)
index 04b8ab2..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _ASM_SEGMENT_H
-#define _ASM_SEGMENT_H
-
-#include <asm/cache.h>
-
-/* Simple and small GDT entries for booting only */
-
-#define GDT_ENTRY_BOOT_CS              2
-#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
-
-#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
-#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
-
-#define __KERNEL_CS    0x10
-#define __KERNEL_DS    0x18
-
-#define __KERNEL32_CS   0x08
-
-/* 
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil 
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) 
- */
-
-#define __USER32_CS   0x23   /* 4*8+3 */ 
-#define __USER_DS     0x2b   /* 5*8+3 */ 
-#define __USER_CS     0x33   /* 6*8+3 */ 
-#define __USER32_DS    __USER_DS 
-
-#define GDT_ENTRY_TSS 8        /* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
-
-#define GDT_ENTRY_TLS_ENTRIES 3
-
-#define GDT_ENTRY_PER_CPU 15   /* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG  (GDT_ENTRY_PER_CPU * 8 + 3)
-
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0       
-#define GS_TLS 1       
-
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
-
-#define IDT_ENTRIES 256
-#define GDT_ENTRIES 16
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) 
-
-#endif
index 835c1d751a9f8cdcb7270a8034c9cdbb97309e13..ac96d3804d0c8cea437308d90bb282b28ae058be 100644 (file)
@@ -83,10 +83,10 @@ static inline void init_MUTEX_LOCKED (struct semaphore *sem)
        sema_init(sem, 0);
 }
 
-fastcall void __down_failed(void /* special register calling convention */);
-fastcall int  __down_failed_interruptible(void  /* params in registers */);
-fastcall int  __down_failed_trylock(void  /* params in registers */);
-fastcall void __up_wakeup(void /* special register calling convention */);
+extern asmregparm void __down_failed(atomic_t *count_ptr);
+extern asmregparm int  __down_failed_interruptible(atomic_t *count_ptr);
+extern asmregparm int  __down_failed_trylock(atomic_t *count_ptr);
+extern asmregparm void __up_wakeup(atomic_t *count_ptr);
 
 /*
  * This is ugly, but we want the default case to fall through.
index 24d786e07b49816bb919e901f33e761d15adcd40..071e054abd82f91eb68eb96c37d7c41cf4e76299 100644 (file)
@@ -3,6 +3,13 @@
 
 #define COMMAND_LINE_SIZE 2048
 
+#ifndef __ASSEMBLY__
+char *machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+#define paravirt_post_allocator_init() do {} while (0)
+#endif
+#endif /* __ASSEMBLY__ */
+
 #ifdef __KERNEL__
 
 #ifdef __i386__
@@ -51,9 +58,7 @@ void __init add_memory_region(unsigned long long start,
 
 extern unsigned long init_pg_tables_end;
 
-#ifndef CONFIG_PARAVIRT
-#define paravirt_post_allocator_init() do {} while (0)
-#endif
+
 
 #endif /* __i386__ */
 #endif /* _SETUP */
index c047f9dc3423311651410807bfb61d2a35cbf354..681deade5f0005d5ac0938104b377712997257a8 100644 (file)
@@ -63,20 +63,20 @@ struct sigcontext {
        unsigned short fs, __fsh;
        unsigned short es, __esh;
        unsigned short ds, __dsh;
-       unsigned long edi;
-       unsigned long esi;
-       unsigned long ebp;
-       unsigned long esp;
-       unsigned long ebx;
-       unsigned long edx;
-       unsigned long ecx;
-       unsigned long eax;
+       unsigned long di;
+       unsigned long si;
+       unsigned long bp;
+       unsigned long sp;
+       unsigned long bx;
+       unsigned long dx;
+       unsigned long cx;
+       unsigned long ax;
        unsigned long trapno;
        unsigned long err;
-       unsigned long eip;
+       unsigned long ip;
        unsigned short cs, __csh;
-       unsigned long eflags;
-       unsigned long esp_at_signal;
+       unsigned long flags;
+       unsigned long sp_at_signal;
        unsigned short ss, __ssh;
        struct _fpstate __user * fpstate;
        unsigned long oldmask;
@@ -111,16 +111,16 @@ struct sigcontext {
        unsigned long r13;
        unsigned long r14;
        unsigned long r15;
-       unsigned long rdi;
-       unsigned long rsi;
-       unsigned long rbp;
-       unsigned long rbx;
-       unsigned long rdx;
-       unsigned long rax;
-       unsigned long rcx;
-       unsigned long rsp;
-       unsigned long rip;
-       unsigned long eflags;           /* RFLAGS */
+       unsigned long di;
+       unsigned long si;
+       unsigned long bp;
+       unsigned long bx;
+       unsigned long dx;
+       unsigned long ax;
+       unsigned long cx;
+       unsigned long sp;
+       unsigned long ip;
+       unsigned long flags;
        unsigned short cs;
        unsigned short gs;
        unsigned short fs;
index 3d657038ab7c3769b739d546ecdd13ce6a0d9200..6ffab4fd593a6d23ed0b49be94b8b09fa69a19d7 100644 (file)
@@ -48,20 +48,20 @@ struct sigcontext_ia32 {
        unsigned short fs, __fsh;
        unsigned short es, __esh;
        unsigned short ds, __dsh;
-       unsigned int edi;
-       unsigned int esi;
-       unsigned int ebp;
-       unsigned int esp;
-       unsigned int ebx;
-       unsigned int edx;
-       unsigned int ecx;
-       unsigned int eax;
+       unsigned int di;
+       unsigned int si;
+       unsigned int bp;
+       unsigned int sp;
+       unsigned int bx;
+       unsigned int dx;
+       unsigned int cx;
+       unsigned int ax;
        unsigned int trapno;
        unsigned int err;
-       unsigned int eip;
+       unsigned int ip;
        unsigned short cs, __csh;
-       unsigned int eflags;
-       unsigned int esp_at_signal;
+       unsigned int flags;
+       unsigned int sp_at_signal;
        unsigned short ss, __ssh;
        unsigned int fpstate;           /* really (struct _fpstate_ia32 *) */
        unsigned int oldmask;
index 987a422a2c788975a2996cfe4f453d67d4231923..aee7eca585ab07d400cd9f4a66d1f6e7eb3fc2ce 100644 (file)
@@ -245,21 +245,14 @@ static __inline__ int sigfindinword(unsigned long word)
 
 struct pt_regs;
 
-#define ptrace_signal_deliver(regs, cookie)            \
-       do {                                            \
-               if (current->ptrace & PT_DTRACE) {      \
-                       current->ptrace &= ~PT_DTRACE;  \
-                       (regs)->eflags &= ~TF_MASK;     \
-               }                                       \
-       } while (0)
-
 #else /* __i386__ */
 
 #undef __HAVE_ARCH_SIG_BITOPS
 
+#endif /* !__i386__ */
+
 #define ptrace_signal_deliver(regs, cookie) do { } while (0)
 
-#endif /* !__i386__ */
 #endif /* __KERNEL__ */
 #endif /* __ASSEMBLY__ */
 
index e10b7affdfe5097e496412c5bd6ea768b3c0f245..56152e31228794ad8ddb685aaacf65aeef82ec6c 100644 (file)
@@ -1,51 +1,41 @@
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+
 /*
  * We need the APIC definitions automatically as part of 'smp.h'
  */
-#ifndef __ASSEMBLY__
-#include <linux/kernel.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+#  include <asm/io_apic.h>
+# endif
 #endif
 
-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
-#include <linux/bitops.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#ifdef CONFIG_X86_IO_APIC
-#include <asm/io_apic.h>
-#endif
-#endif
+extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_callin_map;
 
-#define BAD_APICID 0xFFu
-#ifdef CONFIG_SMP
-#ifndef __ASSEMBLY__
+extern int smp_num_siblings;
+extern unsigned int num_processors;
 
-/*
- * Private routines/data
- */
 extern void smp_alloc_memory(void);
-extern int pic_mode;
-extern int smp_num_siblings;
-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
-DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+extern void lock_ipi_call_lock(void);
+extern void unlock_ipi_call_lock(void);
 
 extern void (*mtrr_hook) (void);
 extern void zap_low_mappings (void);
-extern void lock_ipi_call_lock(void);
-extern void unlock_ipi_call_lock(void);
 
-#define MAX_APICID 256
 extern u8 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
-
-#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
+extern void *x86_cpu_to_apicid_early_ptr;
 
-extern void set_cpu_sibling_map(int cpu);
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(u8, cpu_llc_id);
+DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void cpu_exit_clear(void);
@@ -53,6 +43,9 @@ extern void cpu_uninit(void);
 extern void remove_siblinginfo(int cpu);
 #endif
 
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
 struct smp_ops
 {
        void (*smp_prepare_boot_cpu)(void);
@@ -67,6 +60,7 @@ struct smp_ops
                                      int wait);
 };
 
+#ifdef CONFIG_SMP
 extern struct smp_ops smp_ops;
 
 static inline void smp_prepare_boot_cpu(void)
@@ -107,10 +101,12 @@ int native_cpu_up(unsigned int cpunum);
 void native_smp_cpus_done(unsigned int max_cpus);
 
 #ifndef CONFIG_PARAVIRT
-#define startup_ipi_hook(phys_apicid, start_eip, start_esp)            \
-do { } while (0)
+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
 #endif
 
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+
 /*
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),
@@ -119,9 +115,11 @@ do { } while (0)
 DECLARE_PER_CPU(int, cpu_number);
 #define raw_smp_processor_id() (x86_read_percpu(cpu_number))
 
-extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_callin_map;
-extern cpumask_t cpu_possible_map;
+#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
+
+extern int safe_smp_processor_id(void);
+
+void __cpuinit smp_store_cpu_info(int id);
 
 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
 static inline int num_booting_cpus(void)
@@ -129,56 +127,39 @@ static inline int num_booting_cpus(void)
        return cpus_weight(cpu_callout_map);
 }
 
-extern int safe_smp_processor_id(void);
-extern int __cpu_disable(void);
-extern void __cpu_die(unsigned int cpu);
-extern unsigned int num_processors;
-
-void __cpuinit smp_store_cpu_info(int id);
-
-#endif /* !__ASSEMBLY__ */
-
 #else /* CONFIG_SMP */
 
 #define safe_smp_processor_id()                0
 #define cpu_physical_id(cpu)           boot_cpu_physical_apicid
 
-#define NO_PROC_ID             0xFF            /* No processor magic marker */
-
-#endif /* CONFIG_SMP */
-
-#ifndef __ASSEMBLY__
+#endif /* !CONFIG_SMP */
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
-#ifdef APIC_DEFINITION
+static __inline int logical_smp_processor_id(void)
+{
+       /* we don't want to mark this access volatile - bad code generation */
+       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+# ifdef APIC_DEFINITION
 extern int hard_smp_processor_id(void);
-#else
-#include <mach_apicdef.h>
+# else
+#  include <mach_apicdef.h>
 static inline int hard_smp_processor_id(void)
 {
        /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
+       return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }
-#endif /* APIC_DEFINITION */
+# endif /* APIC_DEFINITION */
 
 #else /* CONFIG_X86_LOCAL_APIC */
 
-#ifndef CONFIG_SMP
-#define hard_smp_processor_id()                0
-#endif
+# ifndef CONFIG_SMP
+#  define hard_smp_processor_id()      0
+# endif
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
-extern u8 apicid_2_node[];
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static __inline int logical_smp_processor_id(void)
-{
-       /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
-}
-#endif
-#endif
-
+#endif /* !ASSEMBLY */
 #endif
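
The smp_32.h cleanup above also switches the fixed-MMIO APIC reads from unsigned long to u32, making the 32-bit access width explicit. On these pre-x2APIC parts, hard_smp_processor_id() reads the local APIC ID register at APIC_BASE + APIC_ID and extracts the ID from the top byte. The helper below spells out the bit extraction; the 31:24 bit position mirrors the default mach_apicdef.h and is an assumption of this note rather than a quote from the patch.

    /* What GET_APIC_ID() boils down to on the default (xAPIC) sub-arch:
     * the physical APIC ID sits in bits 31:24 of the ID register value. */
    static inline unsigned int example_get_apic_id(unsigned int id_reg)
    {
            return (id_reg >> 24) & 0xff;
    }

The read itself is deliberately not marked volatile, as the comment in the code notes, because the compiler otherwise generates noticeably worse code for this fixed mapping.
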
index ab612b0ff270a173a4300d1d6ed6bf2bdd9b3d65..e0a75519ad216e714a1d5430d183c2608dd1a3c4 100644 (file)
 #ifndef __ASM_SMP_H
 #define __ASM_SMP_H
 
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <linux/bitops.h>
 #include <linux/init.h>
-extern int disable_apic;
 
-#include <asm/mpspec.h>
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
 #include <asm/apic.h>
 #include <asm/io_apic.h>
-#include <asm/thread_info.h>
-
-#ifdef CONFIG_SMP
-
+#include <asm/mpspec.h>
 #include <asm/pda.h>
+#include <asm/thread_info.h>
 
-struct pt_regs;
-
-extern cpumask_t cpu_present_mask;
-extern cpumask_t cpu_possible_map;
-extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_initialized;
 
-/*
- * Private routines/data
- */
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+
 extern void smp_alloc_memory(void);
-extern volatile unsigned long smp_invalidate_needed;
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
-extern int smp_num_siblings;
-extern void smp_send_reschedule(int cpu);
+
 extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
                                  void *info, int wait);
 
-/*
- * cpu_sibling_map and cpu_core_map now live
- * in the per cpu area
- *
- * extern cpumask_t cpu_sibling_map[NR_CPUS];
- * extern cpumask_t cpu_core_map[NR_CPUS];
- */
+extern u16 __initdata x86_cpu_to_apicid_init[];
+extern u16 __initdata x86_bios_cpu_apicid_init[];
+extern void *x86_cpu_to_apicid_early_ptr;
+extern void *x86_bios_cpu_apicid_early_ptr;
+
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u8, cpu_llc_id);
-
-#define SMP_TRAMPOLINE_BASE 0x6000
-
-/*
- * On x86 all CPUs are mapped 1:1 to the APIC space.
- * This simplifies scheduling and IPI sending and
- * compresses data structures.
- */
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
 
-static inline int num_booting_cpus(void)
+static inline int cpu_present_to_apicid(int mps_cpu)
 {
-       return cpus_weight(cpu_callout_map);
+       if (cpu_present(mps_cpu))
+               return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+       else
+               return BAD_APICID;
 }
 
-#define raw_smp_processor_id() read_pda(cpunumber)
+#ifdef CONFIG_SMP
+
+#define SMP_TRAMPOLINE_BASE 0x6000
 
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
-extern unsigned num_processors;
 extern unsigned __cpuinitdata disabled_cpus;
 
-#define NO_PROC_ID             0xFF            /* No processor magic marker */
-
-#endif /* CONFIG_SMP */
+#define raw_smp_processor_id() read_pda(cpunumber)
+#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
 
-#define safe_smp_processor_id()                smp_processor_id()
-
-static inline int hard_smp_processor_id(void)
-{
-       /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
-}
+#define stack_smp_processor_id()                                       \
+       ({                                                              \
+       struct thread_info *ti;                                         \
+       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
+       ti->cpu;                                                        \
+})
 
 /*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
+ * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies
+ * scheduling and IPI sending and compresses data structures.
  */
-extern u8 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);        /* physical ID */
-extern u8 bios_cpu_apicid[];
-
-static inline int cpu_present_to_apicid(int mps_cpu)
+static inline int num_booting_cpus(void)
 {
-       if (mps_cpu < NR_CPUS)
-               return (int)bios_cpu_apicid[mps_cpu];
-       else
-               return BAD_APICID;
+       return cpus_weight(cpu_callout_map);
 }
 
-#ifndef CONFIG_SMP
+extern void smp_send_reschedule(int cpu);
+
+#else /* CONFIG_SMP */
+
+extern unsigned int boot_cpu_id;
+#define cpu_physical_id(cpu)   boot_cpu_id
 #define stack_smp_processor_id() 0
-#define cpu_logical_map(x) (x)
-#else
-#include <asm/thread_info.h>
-#define stack_smp_processor_id() \
-({                                                             \
-       struct thread_info *ti;                                 \
-       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
-       ti->cpu;                                                \
-})
-#endif
+
+#endif /* !CONFIG_SMP */
+
+#define safe_smp_processor_id()                smp_processor_id()
 
 static __inline int logical_smp_processor_id(void)
 {
        /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
+       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+}
+
+static inline int hard_smp_processor_id(void)
+{
+       /* we don't want to mark this access volatile - bad code generation */
+       return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }
 
-#ifdef CONFIG_SMP
-#define cpu_physical_id(cpu)           per_cpu(x86_cpu_to_apicid, cpu)
-#else
-extern unsigned int boot_cpu_id;
-#define cpu_physical_id(cpu)           boot_cpu_id
-#endif /* !CONFIG_SMP */
 #endif
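
The 64-bit stack_smp_processor_id() macro above works because every kernel stack is THREAD_SIZE-aligned and struct thread_info sits at its base, so masking the current %rsp with CURRENT_MASK lands on the thread_info, whose ->cpu field can then be read without touching the PDA. A rough C equivalent, assuming CURRENT_MASK is ~(THREAD_SIZE - 1) and the usual thread_info definitions are in scope:

    /* Rough C equivalent of stack_smp_processor_id(); illustrative only. */
    static inline int example_stack_cpu(void)
    {
            unsigned long sp;
            struct thread_info *ti;

            asm("movq %%rsp, %0" : "=r" (sp));
            ti = (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
            return ti->cpu;
    }
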
 
index 3f203b1d9ee8d6dc94dfb6d8326452c3eb5e42ca..fa58cd55411a5386045d704860510369cd802389 100644 (file)
@@ -1,5 +1,34 @@
+#ifndef _ASM_X86_SPARSEMEM_H
+#define _ASM_X86_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the flags
+ *    field of the struct page
+ *
+ * SECTION_SIZE_BITS           2^n: size of each section
+ * MAX_PHYSADDR_BITS           2^n: max size of physical address space
+ * MAX_PHYSMEM_BITS            2^n: how much memory we can have in that space
+ *
+ */
+
 #ifdef CONFIG_X86_32
-# include "sparsemem_32.h"
-#else
-# include "sparsemem_64.h"
+# ifdef CONFIG_X86_PAE
+#  define SECTION_SIZE_BITS    30
+#  define MAX_PHYSADDR_BITS    36
+#  define MAX_PHYSMEM_BITS     36
+# else
+#  define SECTION_SIZE_BITS    26
+#  define MAX_PHYSADDR_BITS    32
+#  define MAX_PHYSMEM_BITS     32
+# endif
+#else /* CONFIG_X86_32 */
+# define SECTION_SIZE_BITS     27 /* matt - 128 is convenient right now */
+# define MAX_PHYSADDR_BITS     40
+# define MAX_PHYSMEM_BITS      40
+#endif
+
+#endif /* CONFIG_SPARSEMEM */
 #endif
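
The unified sparsemem.h makes the size trade-off easy to read off: each memory section spans 2^SECTION_SIZE_BITS bytes, and the section array has to cover 2^MAX_PHYSMEM_BITS bytes, so the maximum section count is 1 << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS). Plugging in the values above:

    /* 32-bit, PAE:     2^30 = 1 GiB sections,   2^36 = 64 GiB -> at most 64 sections
     * 32-bit, non-PAE: 2^26 = 64 MiB sections,  2^32 = 4 GiB  -> at most 64 sections
     * 64-bit:          2^27 = 128 MiB sections, 2^40 = 1 TiB  -> at most 8192 sections
     */
    #define SECTIONS_PER_ARCH(physmem_bits, section_bits) \
            (1UL << ((physmem_bits) - (section_bits)))
    /* 64-bit example: SECTIONS_PER_ARCH(40, 27) == 8192 */
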
diff --git a/include/asm-x86/sparsemem_32.h b/include/asm-x86/sparsemem_32.h
deleted file mode 100644 (file)
index cfeed99..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _I386_SPARSEMEM_H
-#define _I386_SPARSEMEM_H
-#ifdef CONFIG_SPARSEMEM
-
-/*
- * generic non-linear memory support:
- *
- * 1) we will not split memory into more chunks than will fit into the
- *    flags field of the struct page
- */
-
-/*
- * SECTION_SIZE_BITS           2^N: how big each section will be
- * MAX_PHYSADDR_BITS           2^N: how much physical address space we have
- * MAX_PHYSMEM_BITS            2^N: how much memory we can have in that space
- */
-#ifdef CONFIG_X86_PAE
-#define SECTION_SIZE_BITS       30
-#define MAX_PHYSADDR_BITS       36
-#define MAX_PHYSMEM_BITS       36
-#else
-#define SECTION_SIZE_BITS       26
-#define MAX_PHYSADDR_BITS       32
-#define MAX_PHYSMEM_BITS       32
-#endif
-
-/* XXX: FIXME -- wli */
-#define kern_addr_valid(kaddr)  (0)
-
-#endif /* CONFIG_SPARSEMEM */
-#endif /* _I386_SPARSEMEM_H */
diff --git a/include/asm-x86/sparsemem_64.h b/include/asm-x86/sparsemem_64.h
deleted file mode 100644 (file)
index dabb167..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _ASM_X86_64_SPARSEMEM_H
-#define _ASM_X86_64_SPARSEMEM_H 1
-
-#ifdef CONFIG_SPARSEMEM
-
-/*
- * generic non-linear memory support:
- *
- * 1) we will not split memory into more chunks than will fit into the flags
- *    field of the struct page
- *
- * SECTION_SIZE_BITS           2^n: size of each section
- * MAX_PHYSADDR_BITS           2^n: max size of physical address space
- * MAX_PHYSMEM_BITS            2^n: how much memory we can have in that space
- *
- */
-
-#define SECTION_SIZE_BITS      27 /* matt - 128 is convenient right now */
-#define MAX_PHYSADDR_BITS      40
-#define MAX_PHYSMEM_BITS       40
-
-extern int early_pfn_to_nid(unsigned long pfn);
-
-#endif /* CONFIG_SPARSEMEM */
-
-#endif /* _ASM_X86_64_SPARSEMEM_H */
index d74d85e71dcb071b2be8688a726f3a36f2a68b9f..23804c1890ffcd0780ba4c6171896db99d46a658 100644 (file)
@@ -1,5 +1,296 @@
+#ifndef _X86_SPINLOCK_H_
+#define _X86_SPINLOCK_H_
+
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations.  There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which are currently limited to 256
+ * CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
 #ifdef CONFIG_X86_32
-# include "spinlock_32.h"
+typedef char _slock_t;
+# define LOCK_INS_DEC "decb"
+# define LOCK_INS_XCH "xchgb"
+# define LOCK_INS_MOV "movb"
+# define LOCK_INS_CMP "cmpb"
+# define LOCK_PTR_REG "a"
 #else
-# include "spinlock_64.h"
+typedef int _slock_t;
+# define LOCK_INS_DEC "decl"
+# define LOCK_INS_XCH "xchgl"
+# define LOCK_INS_MOV "movl"
+# define LOCK_INS_CMP "cmpl"
+# define LOCK_PTR_REG "D"
+#endif
+
+#if defined(CONFIG_X86_32) && \
+       (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
+/*
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ *
+ * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
+ * save some instructions and make the code more elegant. There really isn't
+ * much between them in performance though, especially as locks are out of line.
+ */
+#if (NR_CPUS < 256)
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+       int tmp = *(volatile signed int *)(&(lock)->slock);
+
+       return (((tmp >> 8) & 0xff) != (tmp & 0xff));
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+       int tmp = *(volatile signed int *)(&(lock)->slock);
+
+       return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
+}
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+       short inc = 0x0100;
+
+       __asm__ __volatile__ (
+               LOCK_PREFIX "xaddw %w0, %1\n"
+               "1:\t"
+               "cmpb %h0, %b0\n\t"
+               "je 2f\n\t"
+               "rep ; nop\n\t"
+               "movb %1, %b0\n\t"
+               /* don't need lfence here, because loads are in-order */
+               "jmp 1b\n"
+               "2:"
+               :"+Q" (inc), "+m" (lock->slock)
+               :
+               :"memory", "cc");
+}
+
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+       int tmp;
+       short new;
+
+       asm volatile(
+               "movw %2,%w0\n\t"
+               "cmpb %h0,%b0\n\t"
+               "jne 1f\n\t"
+               "movw %w0,%w1\n\t"
+               "incb %h1\n\t"
+               "lock ; cmpxchgw %w1,%2\n\t"
+               "1:"
+               "sete %b1\n\t"
+               "movzbl %b1,%0\n\t"
+               :"=&a" (tmp), "=Q" (new), "+m" (lock->slock)
+               :
+               : "memory", "cc");
+
+       return tmp;
+}
+
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+       __asm__ __volatile__(
+               UNLOCK_LOCK_PREFIX "incb %0"
+               :"+m" (lock->slock)
+               :
+               :"memory", "cc");
+}
+#else
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+       int tmp = *(volatile signed int *)(&(lock)->slock);
+
+       return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+       int tmp = *(volatile signed int *)(&(lock)->slock);
+
+       return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
+}
+
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+       int inc = 0x00010000;
+       int tmp;
+
+       __asm__ __volatile__ (
+               "lock ; xaddl %0, %1\n"
+               "movzwl %w0, %2\n\t"
+               "shrl $16, %0\n\t"
+               "1:\t"
+               "cmpl %0, %2\n\t"
+               "je 2f\n\t"
+               "rep ; nop\n\t"
+               "movzwl %1, %2\n\t"
+               /* don't need lfence here, because loads are in-order */
+               "jmp 1b\n"
+               "2:"
+               :"+Q" (inc), "+m" (lock->slock), "=r" (tmp)
+               :
+               :"memory", "cc");
+}
+
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+       int tmp;
+       int new;
+
+       asm volatile(
+               "movl %2,%0\n\t"
+               "movl %0,%1\n\t"
+               "roll $16, %0\n\t"
+               "cmpl %0,%1\n\t"
+               "jne 1f\n\t"
+               "addl $0x00010000, %1\n\t"
+               "lock ; cmpxchgl %1,%2\n\t"
+               "1:"
+               "sete %b1\n\t"
+               "movzbl %b1,%0\n\t"
+               :"=&a" (tmp), "=r" (new), "+m" (lock->slock)
+               :
+               : "memory", "cc");
+
+       return tmp;
+}
+
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+       __asm__ __volatile__(
+               UNLOCK_LOCK_PREFIX "incw %0"
+               :"+m" (lock->slock)
+               :
+               :"memory", "cc");
+}
+#endif
+
+static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
+{
+       while (__raw_spin_is_locked(lock))
+               cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
+
+/**
+ * read_can_lock - would read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_read_can_lock(raw_rwlock_t *lock)
+{
+       return (int)(lock)->lock > 0;
+}
+
+/**
+ * write_can_lock - would write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int __raw_write_can_lock(raw_rwlock_t *lock)
+{
+       return (lock)->lock == RW_LOCK_BIAS;
+}
+
+static inline void __raw_read_lock(raw_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+                    "jns 1f\n"
+                    "call __read_lock_failed\n\t"
+                    "1:\n"
+                    ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void __raw_write_lock(raw_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+                    "jz 1f\n"
+                    "call __write_lock_failed\n\t"
+                    "1:\n"
+                    ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+}
+
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
+{
+       atomic_t *count = (atomic_t *)lock;
+
+       atomic_dec(count);
+       if (atomic_read(count) >= 0)
+               return 1;
+       atomic_inc(count);
+       return 0;
+}
+
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
+{
+       atomic_t *count = (atomic_t *)lock;
+
+       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+               return 1;
+       atomic_add(RW_LOCK_BIAS, count);
+       return 0;
+}
+
+static inline void __raw_read_unlock(raw_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+}
+
+static inline void __raw_write_unlock(raw_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX "addl %1, %0"
+                    : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define _raw_spin_relax(lock)  cpu_relax()
+#define _raw_read_relax(lock)  cpu_relax()
+#define _raw_write_relax(lock) cpu_relax()
+
 #endif
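
The inline assembly in the NR_CPUS < 256 block above is the byte-sized encoding of the ticket lock that the big comment describes: the low byte of ->slock is the "now serving" number, the high byte is the "next ticket" number, and the xaddw grabs a ticket and reads the current head in one atomic step. A plain-C sketch of the same algorithm, using GCC builtins rather than the kernel's primitives and a two-field struct instead of the packed 16-bit word:

    /* Minimal ticket-lock sketch; illustrative, not the kernel implementation. */
    struct ticket_lock {
            unsigned char head;   /* now serving            (low byte of ->slock)  */
            unsigned char tail;   /* next ticket to hand out (high byte of ->slock) */
    };

    static void ticket_lock(struct ticket_lock *lock)
    {
            /* atomically take a ticket; the old value is our place in the queue */
            unsigned char me = __sync_fetch_and_add(&lock->tail, 1);

            while (*(volatile unsigned char *)&lock->head != me)
                    __builtin_ia32_pause();   /* rep; nop, as in the asm above */
    }

    static void ticket_unlock(struct ticket_lock *lock)
    {
            (*(volatile unsigned char *)&lock->head)++;   /* serve the next waiter */
    }

Fairness falls out of the ticket numbering: waiters are served strictly in the order they took tickets. Unlock is a plain increment here for the same reason the kernel can use an unlocked incb once the PPro-errata case is excluded: only the current owner ever writes the head.
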
diff --git a/include/asm-x86/spinlock_32.h b/include/asm-x86/spinlock_32.h
deleted file mode 100644 (file)
index d3bcebe..0000000
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <linux/compiler.h>
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define CLI_STRING     "cli"
-#define STI_STRING     "sti"
-#define CLI_STI_CLOBBERS
-#define CLI_STI_INPUT_ARGS
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- *
- * Simple spin lock operations.  There are two variants, one clears IRQ's
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- *
- * (the type definitions are in asm/spinlock_types.h)
- */
-
-static inline int __raw_spin_is_locked(raw_spinlock_t *x)
-{
-       return *(volatile signed char *)(&(x)->slock) <= 0;
-}
-
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
-{
-       asm volatile("\n1:\t"
-                    LOCK_PREFIX " ; decb %0\n\t"
-                    "jns 3f\n"
-                    "2:\t"
-                    "rep;nop\n\t"
-                    "cmpb $0,%0\n\t"
-                    "jle 2b\n\t"
-                    "jmp 1b\n"
-                    "3:\n\t"
-                    : "+m" (lock->slock) : : "memory");
-}
-
-/*
- * It is easier for the lock validator if interrupts are not re-enabled
- * in the middle of a lock-acquire. This is a performance feature anyway
- * so we turn it off:
- *
- * NOTE: there's an irqs-on section here, which normally would have to be
- * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant.
- */
-#ifndef CONFIG_PROVE_LOCKING
-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
-{
-       asm volatile(
-               "\n1:\t"
-               LOCK_PREFIX " ; decb %[slock]\n\t"
-               "jns 5f\n"
-               "2:\t"
-               "testl $0x200, %[flags]\n\t"
-               "jz 4f\n\t"
-               STI_STRING "\n"
-               "3:\t"
-               "rep;nop\n\t"
-               "cmpb $0, %[slock]\n\t"
-               "jle 3b\n\t"
-               CLI_STRING "\n\t"
-               "jmp 1b\n"
-               "4:\t"
-               "rep;nop\n\t"
-               "cmpb $0, %[slock]\n\t"
-               "jg 1b\n\t"
-               "jmp 4b\n"
-               "5:\n\t"
-               : [slock] "+m" (lock->slock)
-               : [flags] "r" (flags)
-                 CLI_STI_INPUT_ARGS
-               : "memory" CLI_STI_CLOBBERS);
-}
-#endif
-
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
-{
-       char oldval;
-       asm volatile(
-               "xchgb %b0,%1"
-               :"=q" (oldval), "+m" (lock->slock)
-               :"0" (0) : "memory");
-       return oldval > 0;
-}
-
-/*
- * __raw_spin_unlock based on writing $1 to the low byte.
- * This method works. Despite all the confusion.
- * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
- * (PPro errata 66, 92)
- */
-
-#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
-
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
-{
-       asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory");
-}
-
-#else
-
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
-{
-       char oldval = 1;
-
-       asm volatile("xchgb %b0, %1"
-                    : "=q" (oldval), "+m" (lock->slock)
-                    : "0" (oldval) : "memory");
-}
-
-#endif
-
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
-{
-       while (__raw_spin_is_locked(lock))
-               cpu_relax();
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- *
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- *
- * The inline assembly is non-obvious. Think about it.
- *
- * Changed to use the same technique as rw semaphores.  See
- * semaphore.h for details.  -ben
- *
- * the helpers are in arch/i386/kernel/semaphore.c
- */
-
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int __raw_read_can_lock(raw_rwlock_t *x)
-{
-       return (int)(x)->lock > 0;
-}
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int __raw_write_can_lock(raw_rwlock_t *x)
-{
-       return (x)->lock == RW_LOCK_BIAS;
-}
-
-static inline void __raw_read_lock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
-                    "jns 1f\n"
-                    "call __read_lock_failed\n\t"
-                    "1:\n"
-                    ::"a" (rw) : "memory");
-}
-
-static inline void __raw_write_lock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX " subl $" RW_LOCK_BIAS_STR ",(%0)\n\t"
-                    "jz 1f\n"
-                    "call __write_lock_failed\n\t"
-                    "1:\n"
-                    ::"a" (rw) : "memory");
-}
-
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
-{
-       atomic_t *count = (atomic_t *)lock;
-       atomic_dec(count);
-       if (atomic_read(count) >= 0)
-               return 1;
-       atomic_inc(count);
-       return 0;
-}
-
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
-{
-       atomic_t *count = (atomic_t *)lock;
-       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
-               return 1;
-       atomic_add(RW_LOCK_BIAS, count);
-       return 0;
-}
-
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
-}
-
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ", %0"
-                                : "+m" (rw->lock) : : "memory");
-}
-
-#define _raw_spin_relax(lock)  cpu_relax()
-#define _raw_read_relax(lock)  cpu_relax()
-#define _raw_write_relax(lock) cpu_relax()
-
-#endif /* __ASM_SPINLOCK_H */
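
The deleted 32-bit header spins with a LOCK-prefixed decb, backs off with rep;nop (PAUSE), and implements trylock with xchgb. The following is a minimal userspace sketch of the same byte-lock protocol, assuming C11 atomics and GCC on x86; it is illustrative only, not the code being removed.

/* spinlock_byte_demo.c -- hypothetical userspace analogue of the xchgb
 * trylock and decb spin loop above. Build: gcc -std=c11 -O2 */
#include <stdatomic.h>
#include <stdio.h>

typedef struct { atomic_schar slock; } demo_spinlock_t;    /* 1 = unlocked */

static int demo_spin_trylock(demo_spinlock_t *l)
{
        /* xchgb: atomically swap in 0 (locked); an old value > 0 means we won. */
        return atomic_exchange(&l->slock, 0) > 0;
}

static void demo_spin_lock(demo_spinlock_t *l)
{
        /* decb/jns: the lock is ours if the byte was still positive; otherwise
         * back out, spin politely (the kernel's rep;nop is PAUSE) and retry. */
        while (atomic_fetch_sub(&l->slock, 1) <= 0) {
                atomic_fetch_add(&l->slock, 1);            /* undo our decrement */
                while (atomic_load(&l->slock) <= 0)
                        __builtin_ia32_pause();            /* cpu_relax() analogue */
        }
}

static void demo_spin_unlock(demo_spinlock_t *l)
{
        atomic_store(&l->slock, 1);                        /* movb $1, slock */
}

int main(void)
{
        demo_spinlock_t l = { 1 };

        demo_spin_lock(&l);
        printf("trylock while held:  %d\n", demo_spin_trylock(&l));  /* 0 */
        demo_spin_unlock(&l);
        printf("trylock after unlock: %d\n", demo_spin_trylock(&l)); /* 1 */
        return 0;
}
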
diff --git a/include/asm-x86/spinlock_64.h b/include/asm-x86/spinlock_64.h
deleted file mode 100644 (file)
index 88bf981..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef __ASM_SPINLOCK_H
-#define __ASM_SPINLOCK_H
-
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- *
- * Simple spin lock operations.  There are two variants, one clears IRQ's
- * on the local processor, one does not.
- *
- * We make no fairness assumptions. They have a cost.
- *
- * (the type definitions are in asm/spinlock_types.h)
- */
-
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
-{
-       return *(volatile signed int *)(&(lock)->slock) <= 0;
-}
-
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
-{
-       asm volatile(
-               "\n1:\t"
-               LOCK_PREFIX " ; decl %0\n\t"
-               "jns 2f\n"
-               "3:\n"
-               "rep;nop\n\t"
-               "cmpl $0,%0\n\t"
-               "jle 3b\n\t"
-               "jmp 1b\n"
-               "2:\t" : "=m" (lock->slock) : : "memory");
-}
-
-/*
- * Same as __raw_spin_lock, but reenable interrupts during spinning.
- */
-#ifndef CONFIG_PROVE_LOCKING
-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
-{
-       asm volatile(
-               "\n1:\t"
-               LOCK_PREFIX " ; decl %0\n\t"
-               "jns 5f\n"
-               "testl $0x200, %1\n\t"  /* interrupts were disabled? */
-               "jz 4f\n\t"
-               "sti\n"
-               "3:\t"
-               "rep;nop\n\t"
-               "cmpl $0, %0\n\t"
-               "jle 3b\n\t"
-               "cli\n\t"
-               "jmp 1b\n"
-               "4:\t"
-               "rep;nop\n\t"
-               "cmpl $0, %0\n\t"
-               "jg 1b\n\t"
-               "jmp 4b\n"
-               "5:\n\t"
-               : "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
-}
-#endif
-
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
-{
-       int oldval;
-
-       asm volatile(
-               "xchgl %0,%1"
-               :"=q" (oldval), "=m" (lock->slock)
-               :"0" (0) : "memory");
-
-       return oldval > 0;
-}
-
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
-{
-       asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory");
-}
-
-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
-{
-       while (__raw_spin_is_locked(lock))
-               cpu_relax();
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- *
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
- */
-
-static inline int __raw_read_can_lock(raw_rwlock_t *lock)
-{
-       return (int)(lock)->lock > 0;
-}
-
-static inline int __raw_write_can_lock(raw_rwlock_t *lock)
-{
-       return (lock)->lock == RW_LOCK_BIAS;
-}
-
-static inline void __raw_read_lock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t"
-                    "jns 1f\n"
-                    "call __read_lock_failed\n"
-                    "1:\n"
-                    ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory");
-}
-
-static inline void __raw_write_lock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t"
-                    "jz 1f\n"
-                    "\tcall __write_lock_failed\n\t"
-                    "1:\n"
-                    ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory");
-}
-
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
-{
-       atomic_t *count = (atomic_t *)lock;
-       atomic_dec(count);
-       if (atomic_read(count) >= 0)
-               return 1;
-       atomic_inc(count);
-       return 0;
-}
-
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
-{
-       atomic_t *count = (atomic_t *)lock;
-       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
-               return 1;
-       atomic_add(RW_LOCK_BIAS, count);
-       return 0;
-}
-
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX " ; incl %0" :"=m" (rw->lock) : : "memory");
-}
-
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
-{
-       asm volatile(LOCK_PREFIX " ; addl $" RW_LOCK_BIAS_STR ",%0"
-                               : "=m" (rw->lock) : : "memory");
-}
-
-#define _raw_spin_relax(lock)  cpu_relax()
-#define _raw_read_relax(lock)  cpu_relax()
-#define _raw_write_relax(lock) cpu_relax()
-
-#endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-x86/spinlock_types.h b/include/asm-x86/spinlock_types.h
index 4da9345c15001803a3e7786811f685a0ec5c4616..9029cf78cf5dce2ece4ede3251a300697d74f9b2 100644 (file)
@@ -9,7 +9,7 @@ typedef struct {
        unsigned int slock;
 } raw_spinlock_t;
 
-#define __RAW_SPIN_LOCK_UNLOCKED       { 1 }
+#define __RAW_SPIN_LOCK_UNLOCKED       { 0 }
 
 typedef struct {
        unsigned int lock;
diff --git a/include/asm-x86/stacktrace.h b/include/asm-x86/stacktrace.h
index 70dd5bae32350fbd0c1d90cabdbeda9cf0740818..30f82526a8e285547ea890febb757890d3ad4cde 100644 (file)
@@ -9,12 +9,13 @@ struct stacktrace_ops {
        void (*warning)(void *data, char *msg);
        /* msg must contain %s for the symbol */
        void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
-       void (*address)(void *data, unsigned long address);
+       void (*address)(void *data, unsigned long address, int reliable);
        /* On negative return stop dumping */
        int (*stack)(void *data, char *name);
 };
 
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data);
 
 #endif
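
The stacktrace.h hunk above adds a frame-pointer argument to dump_trace() and a reliable flag to the address() callback. A hypothetical consumer of the new signatures might look like the sketch below; only the struct layout and the dump_trace() prototype come from the header, the callback names and bodies are illustrative.

/* Illustrative sketch only -- a hypothetical dump_trace() user. */
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <asm/stacktrace.h>

static void demo_warning(void *data, char *msg)
{
        printk("%s\n", msg);
}

static void demo_warning_symbol(void *data, char *msg, unsigned long symbol)
{
        print_symbol(msg, symbol);     /* msg must contain %s for the symbol */
        printk("\n");
}

static int demo_stack(void *data, char *name)
{
        printk(" <%s>\n", name);
        return 0;                      /* a negative return would stop dumping */
}

static void demo_address(void *data, unsigned long address, int reliable)
{
        /* Unreliable entries are frame guesses; mark them like oops output. */
        printk(" [<%08lx>]%s\n", address, reliable ? "" : " ?");
}

static const struct stacktrace_ops demo_ops = {
        .warning        = demo_warning,
        .warning_symbol = demo_warning_symbol,
        .stack          = demo_stack,
        .address        = demo_address,
};

static void demo_dump_current(void)
{
        /* NULL task/regs/stack and bp == 0 mean "start from the current
         * context"; the last argument is the opaque data cookie. */
        dump_trace(NULL, NULL, NULL, 0, &demo_ops, NULL);
}
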
diff --git a/include/asm-x86/suspend_32.h b/include/asm-x86/suspend_32.h
index a2520732ffd6a30b8f769f7a053c31c505944eaf..1bbda3ad7796002f40c52df81a02053b2f1a875d 100644 (file)
@@ -12,8 +12,8 @@ static inline int arch_prepare_suspend(void) { return 0; }
 struct saved_context {
        u16 es, fs, gs, ss;
        unsigned long cr0, cr2, cr3, cr4;
-       struct Xgt_desc_struct gdt;
-       struct Xgt_desc_struct idt;
+       struct desc_ptr gdt;
+       struct desc_ptr idt;
        u16 ldt;
        u16 tss;
        unsigned long tr;
diff --git a/include/asm-x86/suspend_64.h b/include/asm-x86/suspend_64.h
index c505a76bcf6e93862b9e77dcccbe3afa02b14629..2eb92cb81a0d2c7c2fac3a8ea227835648cfaf0c 100644 (file)
@@ -15,7 +15,14 @@ arch_prepare_suspend(void)
        return 0;
 }
 
-/* Image of the saved processor state. If you touch this, fix acpi/wakeup.S. */
+/*
+ * Image of the saved processor state, used by the low level ACPI suspend to
+ * RAM code and by the low level hibernation code.
+ *
+ * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
+ * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
+ * still work as required.
+ */
 struct saved_context {
        struct pt_regs regs;
        u16 ds, es, fs, gs, ss;
@@ -38,8 +45,6 @@ struct saved_context {
 #define loaddebug(thread,register) \
        set_debugreg((thread)->debugreg##register, register)
 
-extern void fix_processor_context(void);
-
 /* routines for saving/restoring kernel state */
 extern int acpi_save_state_mem(void);
 extern char core_restore_code;
diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index 692562b48f2a6534074df384a7dff97217ea06a8..ee32ef9367f4f25b1c87bf435d2c80ed8586c9c9 100644 (file)
@@ -1,5 +1,414 @@
+#ifndef _ASM_X86_SYSTEM_H_
+#define _ASM_X86_SYSTEM_H_
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeature.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+#ifdef CONFIG_X86_32
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+extern struct task_struct *FASTCALL(__switch_to(struct task_struct *prev,
+                                               struct task_struct *next));
+
+/*
+ * Saving eflags is important. It switches not only IOPL between tasks,
+ * it also protects other tasks from NT leaking through sysenter etc.
+ */
+#define switch_to(prev, next, last) do {                               \
+       unsigned long esi, edi;                                         \
+       asm volatile("pushfl\n\t"               /* Save flags */        \
+                    "pushl %%ebp\n\t"                                  \
+                    "movl %%esp,%0\n\t"        /* save ESP */          \
+                    "movl %5,%%esp\n\t"        /* restore ESP */       \
+                    "movl $1f,%1\n\t"          /* save EIP */          \
+                    "pushl %6\n\t"             /* restore EIP */       \
+                    "jmp __switch_to\n"                                \
+                    "1:\t"                                             \
+                    "popl %%ebp\n\t"                                   \
+                    "popfl"                                            \
+                    :"=m" (prev->thread.sp), "=m" (prev->thread.ip),   \
+                     "=a" (last), "=S" (esi), "=D" (edi)               \
+                    :"m" (next->thread.sp), "m" (next->thread.ip),     \
+                     "2" (prev), "d" (next));                          \
+} while (0)
+
+/*
+ * disable hlt during certain critical i/o operations
+ */
+#define HAVE_DISABLE_HLT
+#else
+#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
+#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
+
+/* frame pointer must be last for get_wchan */
+#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+
+#define __EXTRA_CLOBBER  \
+       , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
+         "r12", "r13", "r14", "r15"
+
+/* Save restore flags to clear handle leaking NT */
+#define switch_to(prev, next, last) \
+       asm volatile(SAVE_CONTEXT                                                   \
+            "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
+            "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
+            "call __switch_to\n\t"                                       \
+            ".globl thread_return\n"                                     \
+            "thread_return:\n\t"                                         \
+            "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
+            "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
+            LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
+            "movq %%rax,%%rdi\n\t"                                       \
+            "jc   ret_from_fork\n\t"                                     \
+            RESTORE_CONTEXT                                              \
+            : "=a" (last)                                                \
+            : [next] "S" (next), [prev] "D" (prev),                      \
+              [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
+              [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
+              [tif_fork] "i" (TIF_FORK),                                 \
+              [thread_info] "i" (offsetof(struct task_struct, stack)),   \
+              [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))  \
+            : "memory", "cc" __EXTRA_CLOBBER)
+#endif
+
+#ifdef __KERNEL__
+#define _set_base(addr, base) do { unsigned long __pr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+       "rorl $16,%%edx\n\t" \
+       "movb %%dl,%2\n\t" \
+       "movb %%dh,%3" \
+       :"=&d" (__pr) \
+       :"m" (*((addr)+2)), \
+        "m" (*((addr)+4)), \
+        "m" (*((addr)+7)), \
+        "0" (base) \
+       ); } while (0)
+
+#define _set_limit(addr, limit) do { unsigned long __lr; \
+__asm__ __volatile__ ("movw %%dx,%1\n\t" \
+       "rorl $16,%%edx\n\t" \
+       "movb %2,%%dh\n\t" \
+       "andb $0xf0,%%dh\n\t" \
+       "orb %%dh,%%dl\n\t" \
+       "movb %%dl,%2" \
+       :"=&d" (__lr) \
+       :"m" (*(addr)), \
+        "m" (*((addr)+6)), \
+        "0" (limit) \
+       ); } while (0)
+
+#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
+#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
+
+extern void load_gs_index(unsigned);
+
+/*
+ * Load a segment. Fall back on loading the zero
+ * segment if something goes wrong..
+ */
+#define loadsegment(seg, value)                        \
+       asm volatile("\n"                       \
+               "1:\t"                          \
+               "movl %k0,%%" #seg "\n"         \
+               "2:\n"                          \
+               ".section .fixup,\"ax\"\n"      \
+               "3:\t"                          \
+               "movl %k1, %%" #seg "\n\t"      \
+               "jmp 2b\n"                      \
+               ".previous\n"                   \
+               ".section __ex_table,\"a\"\n\t" \
+               _ASM_ALIGN "\n\t"               \
+               _ASM_PTR " 1b,3b\n"             \
+               ".previous"                     \
+               : :"r" (value), "r" (0))
+
+
+/*
+ * Save a segment register away
+ */
+#define savesegment(seg, value) \
+       asm volatile("mov %%" #seg ",%0":"=rm" (value))
+
+static inline unsigned long get_limit(unsigned long segment)
+{
+       unsigned long __limit;
+       __asm__("lsll %1,%0"
+               :"=r" (__limit):"r" (segment));
+       return __limit+1;
+}
+
+static inline void native_clts(void)
+{
+       asm volatile ("clts");
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+static unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order));
+       return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+       asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr2,%0\n\t" :"=r" (val), "=m" (__force_order));
+       return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+       asm volatile("mov %0,%%cr2": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr3(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order));
+       return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+       asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order));
+       return val;
+}
+
+static inline unsigned long native_read_cr4_safe(void)
+{
+       unsigned long val;
+       /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
+        * exists, so it will never fail. */
+#ifdef CONFIG_X86_32
+       asm volatile("1: mov %%cr4, %0          \n"
+               "2:                             \n"
+               ".section __ex_table,\"a\"      \n"
+               ".long 1b,2b                    \n"
+               ".previous                      \n"
+               : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+       val = native_read_cr4();
+#endif
+       return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+       asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+       unsigned long cr8;
+       asm volatile("movq %%cr8,%0" : "=r" (cr8));
+       return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+       asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+       asm volatile("wbinvd": : :"memory");
+}
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define read_cr0()     (native_read_cr0())
+#define write_cr0(x)   (native_write_cr0(x))
+#define read_cr2()     (native_read_cr2())
+#define write_cr2(x)   (native_write_cr2(x))
+#define read_cr3()     (native_read_cr3())
+#define write_cr3(x)   (native_write_cr3(x))
+#define read_cr4()     (native_read_cr4())
+#define read_cr4_safe()        (native_read_cr4_safe())
+#define write_cr4(x)   (native_write_cr4(x))
+#define wbinvd()       (native_wbinvd())
+#ifdef CONFIG_X86_64
+#define read_cr8()     (native_read_cr8())
+#define write_cr8(x)   (native_write_cr8(x))
+#endif
+
+/* Clear the 'TS' bit */
+#define clts()         (native_clts())
+
+#endif/* CONFIG_PARAVIRT */
+
+#define stts() write_cr0(8 | read_cr0())
+
+#endif /* __KERNEL__ */
+
+static inline void clflush(void *__p)
+{
+       asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+}
+
+#define nop() __asm__ __volatile__ ("nop")
+
+void disable_hlt(void);
+void enable_hlt(void);
+
+extern int es7000_plat;
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void default_idle(void);
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
 #ifdef CONFIG_X86_32
-# include "system_32.h"
+/*
+ * For now, "wmb()" doesn't actually do anything, as all
+ * Intel CPU's follow what Intel calls a *Processor Order*,
+ * in which all writes are seen in the program order even
+ * outside the CPU.
+ *
+ * I expect future Intel CPU's to have a weaker ordering,
+ * but I'd also expect them to finally get their act together
+ * and add some real memory barriers if so.
+ *
+ * Some non intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
 #else
-# include "system_64.h"
+#define mb()   asm volatile("mfence":::"memory")
+#define rmb()  asm volatile("lfence":::"memory")
+#define wmb()  asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * read_barrier_depends - Flush all pending reads that subsequents reads
+ * depend on.
+ *
+ * No data-dependent reads from memory-like regions are ever reordered
+ * over this barrier.  All reads preceding this primitive are guaranteed
+ * to access memory (but not necessarily other CPUs' caches) before any
+ * reads following this primitive that depend on the data return by
+ * any of the preceding reads.  This primitive is much lighter weight than
+ * rmb() on most CPUs, and is never heavier weight than is
+ * rmb().
+ *
+ * These ordering constraints are respected by both the local CPU
+ * and the compiler.
+ *
+ * Ordering is not guaranteed by anything other than these primitives,
+ * not even by data dependencies.  See the documentation for
+ * memory_barrier() for examples and URLs to more information.
+ *
+ * For example, the following code would force ordering (the initial
+ * value of "a" is zero, "b" is one, and "p" is "&a"):
+ *
+ * <programlisting>
+ *     CPU 0                           CPU 1
+ *
+ *     b = 2;
+ *     memory_barrier();
+ *     p = &b;                         q = p;
+ *                                     read_barrier_depends();
+ *                                     d = *q;
+ * </programlisting>
+ *
+ * because the read of "*q" depends on the read of "p" and these
+ * two reads are separated by a read_barrier_depends().  However,
+ * the following code, with the same initial values for "a" and "b":
+ *
+ * <programlisting>
+ *     CPU 0                           CPU 1
+ *
+ *     a = 2;
+ *     memory_barrier();
+ *     b = 3;                          y = b;
+ *                                     read_barrier_depends();
+ *                                     x = a;
+ * </programlisting>
+ *
+ * does not enforce ordering, since there is no data dependency between
+ * the read of "a" and the read of "b".  Therefore, on some CPUs, such
+ * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
+ * in cases like this where there are no data dependencies.
+ **/
+
+#define read_barrier_depends() do { } while (0)
+
+#ifdef CONFIG_SMP
+#define smp_mb()       mb()
+#ifdef CONFIG_X86_PPRO_FENCE
+# define smp_rmb()     rmb()
+#else
+# define smp_rmb()     barrier()
+#endif
+#ifdef CONFIG_X86_OOSTORE
+# define smp_wmb()     wmb()
+#else
+# define smp_wmb()     barrier()
+#endif
+#define smp_read_barrier_depends()     read_barrier_depends()
+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+#else
+#define smp_mb()       barrier()
+#define smp_rmb()      barrier()
+#define smp_wmb()      barrier()
+#define smp_read_barrier_depends()     do { } while (0)
+#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#endif
+
+/*
+ * Stop RDTSC speculation. This is needed when you need to use RDTSC
+ * (or get_cycles or vread that possibly accesses the TSC) in a defined
+ * code region.
+ *
+ * (Could use an alternative three way for this if there was one.)
+ */
+static inline void rdtsc_barrier(void)
+{
+       alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
+       alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+}
+
 #endif
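
The read_barrier_depends() comment above walks through publishing data via a pointer on CPU 0 and consuming it through a dependent load on CPU 1. The same two-thread example can be written as a hypothetical userspace C11 program, with a release store standing in for memory_barrier() and memory_order_consume standing in for read_barrier_depends(); this is an analogue, not the kernel primitives.

/* publish_demo.c -- userspace analogue of the CPU 0 / CPU 1 example in the
 * read_barrier_depends() comment above. Build: gcc -std=c11 -O2 -pthread */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int a = 0, b = 1;
static _Atomic(int *) p = &a;

static void *cpu0(void *arg)
{
        b = 2;
        /* "memory_barrier()" in the example: order the store to b before
         * the publication of the pointer. */
        atomic_store_explicit(&p, &b, memory_order_release);
        return NULL;
}

static void *cpu1(void *arg)
{
        /* "q = p; read_barrier_depends(); d = *q;" -- the dependent load.
         * memory_order_consume is the C11 spelling of a data-dependency
         * barrier; most compilers promote it to the stronger acquire. */
        int *q = atomic_load_explicit(&p, memory_order_consume);
        int d = *q;
        printf("d = %d\n", d);   /* 0 (old pointer, *q == a) or 2; never the stale 1 */
        return NULL;
}

int main(void)
{
        pthread_t t0, t1;
        pthread_create(&t0, NULL, cpu0, NULL);
        pthread_create(&t1, NULL, cpu1, NULL);
        pthread_join(t0, NULL);
        pthread_join(t1, NULL);
        return 0;
}
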
diff --git a/include/asm-x86/system_32.h b/include/asm-x86/system_32.h
deleted file mode 100644 (file)
index ef84688..0000000
+++ /dev/null
@@ -1,320 +0,0 @@
-#ifndef __ASM_SYSTEM_H
-#define __ASM_SYSTEM_H
-
-#include <linux/kernel.h>
-#include <asm/segment.h>
-#include <asm/cpufeature.h>
-#include <asm/cmpxchg.h>
-
-#ifdef __KERNEL__
-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
-
-struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
-extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
-
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
-#define switch_to(prev,next,last) do {                                 \
-       unsigned long esi,edi;                                          \
-       asm volatile("pushfl\n\t"               /* Save flags */        \
-                    "pushl %%ebp\n\t"                                  \
-                    "movl %%esp,%0\n\t"        /* save ESP */          \
-                    "movl %5,%%esp\n\t"        /* restore ESP */       \
-                    "movl $1f,%1\n\t"          /* save EIP */          \
-                    "pushl %6\n\t"             /* restore EIP */       \
-                    "jmp __switch_to\n"                                \
-                    "1:\t"                                             \
-                    "popl %%ebp\n\t"                                   \
-                    "popfl"                                            \
-                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
-                     "=a" (last),"=S" (esi),"=D" (edi)                 \
-                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
-                     "2" (prev), "d" (next));                          \
-} while (0)
-
-#define _set_base(addr,base) do { unsigned long __pr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-       "rorl $16,%%edx\n\t" \
-       "movb %%dl,%2\n\t" \
-       "movb %%dh,%3" \
-       :"=&d" (__pr) \
-       :"m" (*((addr)+2)), \
-        "m" (*((addr)+4)), \
-        "m" (*((addr)+7)), \
-         "0" (base) \
-        ); } while(0)
-
-#define _set_limit(addr,limit) do { unsigned long __lr; \
-__asm__ __volatile__ ("movw %%dx,%1\n\t" \
-       "rorl $16,%%edx\n\t" \
-       "movb %2,%%dh\n\t" \
-       "andb $0xf0,%%dh\n\t" \
-       "orb %%dh,%%dl\n\t" \
-       "movb %%dl,%2" \
-       :"=&d" (__lr) \
-       :"m" (*(addr)), \
-        "m" (*((addr)+6)), \
-        "0" (limit) \
-        ); } while(0)
-
-#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg,value)                 \
-       asm volatile("\n"                       \
-               "1:\t"                          \
-               "mov %0,%%" #seg "\n"           \
-               "2:\n"                          \
-               ".section .fixup,\"ax\"\n"      \
-               "3:\t"                          \
-               "pushl $0\n\t"                  \
-               "popl %%" #seg "\n\t"           \
-               "jmp 2b\n"                      \
-               ".previous\n"                   \
-               ".section __ex_table,\"a\"\n\t" \
-               ".align 4\n\t"                  \
-               ".long 1b,3b\n"                 \
-               ".previous"                     \
-               : :"rm" (value))
-
-/*
- * Save a segment register away
- */
-#define savesegment(seg, value) \
-       asm volatile("mov %%" #seg ",%0":"=rm" (value))
-
-
-static inline void native_clts(void)
-{
-       asm volatile ("clts");
-}
-
-static inline unsigned long native_read_cr0(void)
-{
-       unsigned long val;
-       asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
-       return val;
-}
-
-static inline void native_write_cr0(unsigned long val)
-{
-       asm volatile("movl %0,%%cr0": :"r" (val));
-}
-
-static inline unsigned long native_read_cr2(void)
-{
-       unsigned long val;
-       asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
-       return val;
-}
-
-static inline void native_write_cr2(unsigned long val)
-{
-       asm volatile("movl %0,%%cr2": :"r" (val));
-}
-
-static inline unsigned long native_read_cr3(void)
-{
-       unsigned long val;
-       asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
-       return val;
-}
-
-static inline void native_write_cr3(unsigned long val)
-{
-       asm volatile("movl %0,%%cr3": :"r" (val));
-}
-
-static inline unsigned long native_read_cr4(void)
-{
-       unsigned long val;
-       asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
-       return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
-       unsigned long val;
-       /* This could fault if %cr4 does not exist */
-       asm volatile("1: movl %%cr4, %0         \n"
-               "2:                             \n"
-               ".section __ex_table,\"a\"      \n"
-               ".long 1b,2b                    \n"
-               ".previous                      \n"
-               : "=r" (val): "0" (0));
-       return val;
-}
-
-static inline void native_write_cr4(unsigned long val)
-{
-       asm volatile("movl %0,%%cr4": :"r" (val));
-}
-
-static inline void native_wbinvd(void)
-{
-       asm volatile("wbinvd": : :"memory");
-}
-
-static inline void clflush(volatile void *__p)
-{
-       asm volatile("clflush %0" : "+m" (*(char __force *)__p));
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define read_cr0()     (native_read_cr0())
-#define write_cr0(x)   (native_write_cr0(x))
-#define read_cr2()     (native_read_cr2())
-#define write_cr2(x)   (native_write_cr2(x))
-#define read_cr3()     (native_read_cr3())
-#define write_cr3(x)   (native_write_cr3(x))
-#define read_cr4()     (native_read_cr4())
-#define read_cr4_safe()        (native_read_cr4_safe())
-#define write_cr4(x)   (native_write_cr4(x))
-#define wbinvd()       (native_wbinvd())
-
-/* Clear the 'TS' bit */
-#define clts()         (native_clts())
-
-#endif/* CONFIG_PARAVIRT */
-
-/* Set the 'TS' bit */
-#define stts() write_cr0(8 | read_cr0())
-
-#endif /* __KERNEL__ */
-
-static inline unsigned long get_limit(unsigned long segment)
-{
-       unsigned long __limit;
-       __asm__("lsll %1,%0"
-               :"=r" (__limit):"r" (segment));
-       return __limit+1;
-}
-
-#define nop() __asm__ __volatile__ ("nop")
-
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- *
- * For now, "wmb()" doesn't actually do anything, as all
- * Intel CPU's follow what Intel calls a *Processor Order*,
- * in which all writes are seen in the program order even
- * outside the CPU.
- *
- * I expect future Intel CPU's to have a weaker ordering,
- * but I'd also expect them to finally get their act together
- * and add some real memory barriers if so.
- *
- * Some non intel clones support out of order store. wmb() ceases to be a
- * nop for these.
- */
-
-#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
-#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
-#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
-
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *     CPU 0                           CPU 1
- *
- *     b = 2;
- *     memory_barrier();
- *     p = &b;                         q = p;
- *                                     read_barrier_depends();
- *                                     d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *     CPU 0                           CPU 1
- *
- *     a = 2;
- *     memory_barrier();
- *     b = 3;                          y = b;
- *                                     read_barrier_depends();
- *                                     x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- **/
-
-#define read_barrier_depends() do { } while(0)
-
-#ifdef CONFIG_SMP
-#define smp_mb()       mb()
-#ifdef CONFIG_X86_PPRO_FENCE
-# define smp_rmb()     rmb()
-#else
-# define smp_rmb()     barrier()
-#endif
-#ifdef CONFIG_X86_OOSTORE
-# define smp_wmb()     wmb()
-#else
-# define smp_wmb()     barrier()
-#endif
-#define smp_read_barrier_depends()     read_barrier_depends()
-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-#else
-#define smp_mb()       barrier()
-#define smp_rmb()      barrier()
-#define smp_wmb()      barrier()
-#define smp_read_barrier_depends()     do { } while(0)
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif
-
-#include <linux/irqflags.h>
-
-/*
- * disable hlt during certain critical i/o operations
- */
-#define HAVE_DISABLE_HLT
-void disable_hlt(void);
-void enable_hlt(void);
-
-extern int es7000_plat;
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
-void default_idle(void);
-void __show_registers(struct pt_regs *, int all);
-
-#endif
diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h
index 6e9e4841a2da25836a997f07941676a6c599459a..97fa251ccb2b088d055e0332fb97d50e5aeb480d 100644 (file)
@@ -1,126 +1,9 @@
 #ifndef __ASM_SYSTEM_H
 #define __ASM_SYSTEM_H
 
-#include <linux/kernel.h>
 #include <asm/segment.h>
 #include <asm/cmpxchg.h>
 
-#ifdef __KERNEL__
-
-/* entries in ARCH_DLINFO: */
-#ifdef CONFIG_IA32_EMULATION
-# define AT_VECTOR_SIZE_ARCH 2
-#else
-# define AT_VECTOR_SIZE_ARCH 1
-#endif
-
-#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
-#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
-
-#define __EXTRA_CLOBBER  \
-       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
-
-/* Save restore flags to clear handle leaking NT */
-#define switch_to(prev,next,last) \
-       asm volatile(SAVE_CONTEXT                                                   \
-                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
-                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
-                    "call __switch_to\n\t"                                       \
-                    ".globl thread_return\n"                                   \
-                    "thread_return:\n\t"                                           \
-                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
-                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
-                    LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"        \
-                    "movq %%rax,%%rdi\n\t"                                       \
-                    "jc   ret_from_fork\n\t"                                     \
-                    RESTORE_CONTEXT                                                \
-                    : "=a" (last)                                                \
-                    : [next] "S" (next), [prev] "D" (prev),                      \
-                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
-                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
-                      [tif_fork] "i" (TIF_FORK),                         \
-                      [thread_info] "i" (offsetof(struct task_struct, stack)), \
-                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
-                    : "memory", "cc" __EXTRA_CLOBBER)
-    
-extern void load_gs_index(unsigned); 
-
-/*
- * Load a segment. Fall back on loading the zero
- * segment if something goes wrong..
- */
-#define loadsegment(seg,value) \
-       asm volatile("\n"                       \
-               "1:\t"                          \
-               "movl %k0,%%" #seg "\n"         \
-               "2:\n"                          \
-               ".section .fixup,\"ax\"\n"      \
-               "3:\t"                          \
-               "movl %1,%%" #seg "\n\t"        \
-               "jmp 2b\n"                      \
-               ".previous\n"                   \
-               ".section __ex_table,\"a\"\n\t" \
-               ".align 8\n\t"                  \
-               ".quad 1b,3b\n"                 \
-               ".previous"                     \
-               : :"r" (value), "r" (0))
-
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-
-static inline unsigned long read_cr0(void)
-{ 
-       unsigned long cr0;
-       asm volatile("movq %%cr0,%0" : "=r" (cr0));
-       return cr0;
-}
-
-static inline void write_cr0(unsigned long val) 
-{ 
-       asm volatile("movq %0,%%cr0" :: "r" (val));
-}
-
-static inline unsigned long read_cr2(void)
-{
-       unsigned long cr2;
-       asm volatile("movq %%cr2,%0" : "=r" (cr2));
-       return cr2;
-}
-
-static inline void write_cr2(unsigned long val)
-{
-       asm volatile("movq %0,%%cr2" :: "r" (val));
-}
-
-static inline unsigned long read_cr3(void)
-{ 
-       unsigned long cr3;
-       asm volatile("movq %%cr3,%0" : "=r" (cr3));
-       return cr3;
-}
-
-static inline void write_cr3(unsigned long val)
-{
-       asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
-}
-
-static inline unsigned long read_cr4(void)
-{ 
-       unsigned long cr4;
-       asm volatile("movq %%cr4,%0" : "=r" (cr4));
-       return cr4;
-}
-
-static inline void write_cr4(unsigned long val)
-{ 
-       asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
-}
 
 static inline unsigned long read_cr8(void)
 {
@@ -134,52 +17,6 @@ static inline void write_cr8(unsigned long val)
        asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
 }
 
-#define stts() write_cr0(8 | read_cr0())
-
-#define wbinvd() \
-       __asm__ __volatile__ ("wbinvd": : :"memory")
-
-#endif /* __KERNEL__ */
-
-static inline void clflush(volatile void *__p)
-{
-       asm volatile("clflush %0" : "+m" (*(char __force *)__p));
-}
-
-#define nop() __asm__ __volatile__ ("nop")
-
-#ifdef CONFIG_SMP
-#define smp_mb()       mb()
-#define smp_rmb()      barrier()
-#define smp_wmb()      barrier()
-#define smp_read_barrier_depends()     do {} while(0)
-#else
-#define smp_mb()       barrier()
-#define smp_rmb()      barrier()
-#define smp_wmb()      barrier()
-#define smp_read_barrier_depends()     do {} while(0)
-#endif
-
-    
-/*
- * Force strict CPU ordering.
- * And yes, this is required on UP too when we're talking
- * to devices.
- */
-#define mb()   asm volatile("mfence":::"memory")
-#define rmb()  asm volatile("lfence":::"memory")
-#define wmb()  asm volatile("sfence" ::: "memory")
-
-#define read_barrier_depends() do {} while(0)
-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-
-#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
-
 #include <linux/irqflags.h>
 
-void cpu_idle_wait(void);
-
-extern unsigned long arch_align_stack(unsigned long sp);
-extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
-
 #endif
diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
index a516e9192f1135a2a6b310caf5aaf0525df43220..5bd508260ffbf6b5b925023a96249526972b1556 100644 (file)
@@ -138,6 +138,10 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_IO_BITMAP          18      /* uses I/O bitmap */
 #define TIF_FREEZE             19      /* is freezing for suspend */
 #define TIF_NOTSC              20      /* TSC is not accessible in userland */
+#define TIF_FORCED_TF          21      /* true if TF in eflags artificially */
+#define TIF_DEBUGCTLMSR                22      /* uses thread_struct.debugctlmsr */
+#define TIF_DS_AREA_MSR        23      /* uses thread_struct.ds_area_msr */
+#define TIF_BTS_TRACE_TS        24      /* record scheduling event timestamps */
 
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
@@ -153,6 +157,10 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_IO_BITMAP         (1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE            (1<<TIF_FREEZE)
 #define _TIF_NOTSC             (1<<TIF_NOTSC)
+#define _TIF_FORCED_TF         (1<<TIF_FORCED_TF)
+#define _TIF_DEBUGCTLMSR       (1<<TIF_DEBUGCTLMSR)
+#define _TIF_DS_AREA_MSR       (1<<TIF_DS_AREA_MSR)
+#define _TIF_BTS_TRACE_TS      (1<<TIF_BTS_TRACE_TS)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
@@ -162,8 +170,12 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_ALLWORK_MASK      (0x0000FFFF & ~_TIF_SECCOMP)
 
 /* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW_NEXT (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUG)
-#define _TIF_WORK_CTXSW_PREV (_TIF_IO_BITMAP | _TIF_NOTSC)
+#define _TIF_WORK_CTXSW \
+    (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \
+     _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS)
+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG)
+
 
 /*
  * Thread-synchronous status.
diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
index 7f6ee68f000203d17a6ee747e6df85a55535053b..9b531ea015a83a491ed810acaaed074c6069a76b 100644 (file)
@@ -21,7 +21,7 @@
 #ifndef __ASSEMBLY__
 struct task_struct;
 struct exec_domain;
-#include <asm/mmsegment.h>
+#include <asm/processor.h>
 
 struct thread_info {
        struct task_struct      *task;          /* main task structure */
@@ -33,6 +33,9 @@ struct thread_info {
 
        mm_segment_t            addr_limit;     
        struct restart_block    restart_block;
+#ifdef CONFIG_IA32_EMULATION
+       void __user             *sysenter_return;
+#endif
 };
 #endif
 
@@ -74,20 +77,14 @@ static inline struct thread_info *stack_thread_info(void)
 
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
-#define alloc_thread_info(tsk)                                 \
-    ({                                                         \
-       struct thread_info *ret;                                \
-                                                               \
-       ret = ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)); \
-       if (ret)                                                \
-               memset(ret, 0, THREAD_SIZE);                    \
-       ret;                                                    \
-    })
+#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
 #else
-#define alloc_thread_info(tsk) \
-       ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
+#define THREAD_FLAGS GFP_KERNEL
 #endif
 
+#define alloc_thread_info(tsk) \
+       ((struct thread_info *) __get_free_pages(THREAD_FLAGS, THREAD_ORDER))
+
 #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
 
 #else /* !__ASSEMBLY__ */
@@ -124,6 +121,10 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_DEBUG              21      /* uses debug registers */
 #define TIF_IO_BITMAP          22      /* uses I/O bitmap */
 #define TIF_FREEZE             23      /* is freezing for suspend */
+#define TIF_FORCED_TF          24      /* true if TF in eflags artificially */
+#define TIF_DEBUGCTLMSR                25      /* uses thread_struct.debugctlmsr */
+#define TIF_DS_AREA_MSR        26      /* uses thread_struct.ds_area_msr */
+#define TIF_BTS_TRACE_TS       27      /* record scheduling event timestamps */
 
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING                (1<<TIF_SIGPENDING)
@@ -141,6 +142,10 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_DEBUG             (1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP         (1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE            (1<<TIF_FREEZE)
+#define _TIF_FORCED_TF         (1<<TIF_FORCED_TF)
+#define _TIF_DEBUGCTLMSR       (1<<TIF_DEBUGCTLMSR)
+#define _TIF_DS_AREA_MSR       (1<<TIF_DS_AREA_MSR)
+#define _TIF_BTS_TRACE_TS      (1<<TIF_BTS_TRACE_TS)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
@@ -152,7 +157,10 @@ static inline struct thread_info *stack_thread_info(void)
        (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
 
 /* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
+#define _TIF_WORK_CTXSW \
+    (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
+#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE     0x10000000
 
diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h
index eac011366dc2c09337abfe3d146bafcb01b1d686..68779b048a3edba6f3371bb459735d8a170aec9f 100644 (file)
@@ -1,8 +1,12 @@
-#ifndef _ASMi386_TIME_H
-#define _ASMi386_TIME_H
+#ifndef _ASMX86_TIME_H
+#define _ASMX86_TIME_H
 
+extern void (*late_time_init)(void);
+extern void hpet_time_init(void);
+
+#include <asm/mc146818rtc.h>
+#ifdef CONFIG_X86_32
 #include <linux/efi.h>
-#include "mach_time.h"
 
 static inline unsigned long native_get_wallclock(void)
 {
@@ -28,8 +32,20 @@ static inline int native_set_wallclock(unsigned long nowtime)
        return retval;
 }
 
-extern void (*late_time_init)(void);
-extern void hpet_time_init(void);
+#else
+extern void native_time_init_hook(void);
+
+static inline unsigned long native_get_wallclock(void)
+{
+       return mach_get_cmos_time();
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+       return mach_set_rtc_mmss(nowtime);
+}
+
+#endif
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
diff --git a/include/asm-x86/timer.h b/include/asm-x86/timer.h
index 0db7e994fb8b4f40ad9a96ecf2258d7fd6dc57f6..4f6fcb050c11577d4c0640f26513674c1669f4d2 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASMi386_TIMER_H
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/percpu.h>
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -16,7 +17,7 @@ extern int recalibrate_cpu_khz(void);
 #define calculate_cpu_khz() native_calculate_cpu_khz()
 #endif
 
-/* Accellerators for sched_clock()
+/* Accelerators for sched_clock()
  * convert from cycles(64bits) => nanoseconds (64bits)
  *  basic equation:
  *             ns = cycles / (freq / ns_per_sec)
@@ -31,20 +32,32 @@ extern int recalibrate_cpu_khz(void);
  *     And since SC is a constant power of two, we can convert the div
  *  into a shift.
  *
- *  We can use khz divisor instead of mhz to keep a better percision, since
+ *  We can use khz divisor instead of mhz to keep a better precision, since
  *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
  *  (mathieu.desnoyers@polymtl.ca)
  *
  *                     -johnstul@us.ibm.com "math is hard, lets go shopping!"
  */
-extern unsigned long cyc2ns_scale __read_mostly;
+
+DECLARE_PER_CPU(unsigned long, cyc2ns);
 
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
 {
-       return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+       return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
 }
 
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+       unsigned long long ns;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       ns = __cycles_2_ns(cyc);
+       local_irq_restore(flags);
+
+       return ns;
+}
 
 #endif
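
The cycles_2_ns() comment above describes a fixed-point conversion: precompute scale = (10^6 << 10) / cpu_khz once, then every conversion is a multiply and a shift. A worked userspace example with a hypothetical 2 GHz (2,000,000 kHz) clock; the helper follows the formula sketched in the comment, the rest is illustrative.

/* cyc2ns_demo.c -- worked example of the cycles->ns conversion documented
 * above; the 2 GHz value is hypothetical. Build: gcc -std=c11 -O2 */
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10          /* 2^10, as in the header */

static unsigned long cyc2ns_scale;

static void set_cyc2ns_scale(unsigned long cpu_khz)
{
        /* scale = (ns per kilocycle) << 10 = (10^6 / khz) << 10 */
        cyc2ns_scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

static unsigned long long demo_cycles_2_ns(unsigned long long cyc)
{
        return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
        set_cyc2ns_scale(2000000);      /* hypothetical 2.0 GHz clock */

        /* scale = (10^6 << 10) / 2*10^6 = 512, so ns = cyc * 512 >> 10
         * = cyc / 2: one nanosecond per two cycles at 2 GHz. */
        printf("scale    = %lu\n", cyc2ns_scale);                   /* 512 */
        printf("1000 cyc = %llu ns\n", demo_cycles_2_ns(1000));     /* 500 */
        printf("1e9 cyc  = %llu ns\n",
               demo_cycles_2_ns(1000000000ULL));                    /* 500000000, i.e. 0.5 s */
        return 0;
}
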
diff --git a/include/asm-x86/timex.h b/include/asm-x86/timex.h
index 39a21ab030f02a6244746bc1f706c884cdd59e8b..27cfd6c599bad2ab649806c3aecefc69cb602947 100644 (file)
@@ -7,6 +7,8 @@
 
 #ifdef CONFIG_X86_ELAN
 #  define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
+#elif defined(CONFIG_X86_RDC321X)
+#  define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
 #else
 #  define PIT_TICK_RATE 1193182 /* Underlying HZ */
 #endif
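
PIT_TICK_RATE above is the input clock of the i8253/i8254 timer; the periodic tick is produced by programming a divisor derived from it. A worked example with the standard 1,193,182 Hz rate and a hypothetical HZ of 250 (the rounded-division idiom below follows the usual kernel LATCH formula, not code from this hunk):

/* pit_latch_demo.c -- worked divisor calculation for the PIT rates above;
 * HZ = 250 is a hypothetical configuration. Build: gcc -std=c11 -O2 */
#include <stdio.h>

#define PIT_TICK_RATE   1193182         /* standard PC value from the hunk */
#define HZ              250             /* hypothetical kernel tick rate */

/* Rounded divisor programmed into the PIT (kernel-idiom rounding). */
#define LATCH           ((PIT_TICK_RATE + HZ / 2) / HZ)

int main(void)
{
        /* 1193182 / 250 = 4772.728, so the PIT is programmed with 4773 and
         * the real tick period is LATCH / PIT_TICK_RATE = ~4.0002 ms. */
        printf("LATCH       = %d\n", LATCH);                         /* 4773 */
        printf("actual tick = %.6f ms\n", 1000.0 * LATCH / PIT_TICK_RATE);
        printf("actual HZ   = %.4f\n", (double)PIT_TICK_RATE / LATCH);
        return 0;
}
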
diff --git a/include/asm-x86/tlbflush.h b/include/asm-x86/tlbflush.h
index 9af4cc83a1afc1be1a3078edbe9eb6231eb47546..3998709ed63795a472d41ce817ef53900a0b1068 100644 (file)
@@ -1,5 +1,158 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+static inline void __native_flush_tlb(void)
+{
+       write_cr3(read_cr3());
+}
+
+static inline void __native_flush_tlb_global(void)
+{
+       unsigned long cr4 = read_cr4();
+
+       /* clear PGE */
+       write_cr4(cr4 & ~X86_CR4_PGE);
+       /* write old PGE again and flush TLBs */
+       write_cr4(cr4);
+}
+
+static inline void __native_flush_tlb_single(unsigned long addr)
+{
+       __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory");
+}
+
+static inline void __flush_tlb_all(void)
+{
+       if (cpu_has_pge)
+               __flush_tlb_global();
+       else
+               __flush_tlb();
+}
+
+static inline void __flush_tlb_one(unsigned long addr)
+{
+       if (cpu_has_invlpg)
+               __flush_tlb_single(addr);
+       else
+               __flush_tlb();
+}
+
 #ifdef CONFIG_X86_32
-# include "tlbflush_32.h"
+# define TLB_FLUSH_ALL 0xffffffff
 #else
-# include "tlbflush_64.h"
+# define TLB_FLUSH_ALL -1ULL
+#endif
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb() flushes the current mm struct TLBs
+ *  - flush_tlb_all() flushes all processes TLBs
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *  - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. Might be worth trying if for a small
+ * range a few INVLPGs in a row are a win.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+       if (mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+                                 unsigned long addr)
+{
+       if (vma->vm_mm == current->active_mm)
+               __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       if (vma->vm_mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+                                          struct mm_struct *mm,
+                                          unsigned long va)
+{
+}
+
+#else  /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+extern void flush_tlb_all(void);
+extern void flush_tlb_current_task(void);
+extern void flush_tlb_mm(struct mm_struct *);
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+
+#define flush_tlb()    flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       flush_tlb_mm(vma->vm_mm);
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
+                            unsigned long va);
+
+#define TLBSTATE_OK    1
+#define TLBSTATE_LAZY  2
+
+#ifdef CONFIG_X86_32
+struct tlb_state
+{
+       struct mm_struct *active_mm;
+       int state;
+       char __cacheline_padding[L1_CACHE_BYTES-8];
+};
+DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
+#endif
+
+#endif /* SMP */
+
+#ifndef CONFIG_PARAVIRT
+#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
 #endif
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+                                         unsigned long end)
+{
+       flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
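
The comment block above enumerates the flush_tlb_*() entry points. As a hypothetical usage sketch (not code from this patch): a path that rewrites a single user PTE pairs the page-table update with a single-page flush, while kernel-address changes go through the range helper, which the header currently implements as a full flush.

/* Illustrative sketch only -- hypothetical callers of the interface above. */
#include <linux/mm.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

static void demo_update_one_pte(struct vm_area_struct *vma, unsigned long addr,
                                pte_t *ptep, pte_t newpte)
{
        set_pte_at(vma->vm_mm, addr, ptep, newpte);     /* rewrite the mapping */
        flush_tlb_page(vma, addr);                      /* drop only this translation */
}

static void demo_flush_kernel_mapping(unsigned long start, unsigned long end)
{
        /* Kernel mappings have no vma/mm, so use the range helper; per the
         * header above this currently falls back to flush_tlb_all(). */
        flush_tlb_kernel_range(start, end);
}
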
diff --git a/include/asm-x86/tlbflush_32.h b/include/asm-x86/tlbflush_32.h
deleted file mode 100644 (file)
index 2bd5b95..0000000
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef _I386_TLBFLUSH_H
-#define _I386_TLBFLUSH_H
-
-#include <linux/mm.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define __flush_tlb() __native_flush_tlb()
-#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
-#endif
-
-#define __native_flush_tlb()                                           \
-       do {                                                            \
-               unsigned int tmpreg;                                    \
-                                                                       \
-               __asm__ __volatile__(                                   \
-                       "movl %%cr3, %0;              \n"               \
-                       "movl %0, %%cr3;  # flush TLB \n"               \
-                       : "=r" (tmpreg)                                 \
-                       :: "memory");                                   \
-       } while (0)
-
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-#define __native_flush_tlb_global()                                    \
-       do {                                                            \
-               unsigned int tmpreg, cr4, cr4_orig;                     \
-                                                                       \
-               __asm__ __volatile__(                                   \
-                       "movl %%cr4, %2;  # turn off PGE     \n"        \
-                       "movl %2, %1;                        \n"        \
-                       "andl %3, %1;                        \n"        \
-                       "movl %1, %%cr4;                     \n"        \
-                       "movl %%cr3, %0;                     \n"        \
-                       "movl %0, %%cr3;  # flush TLB        \n"        \
-                       "movl %2, %%cr4;  # turn PGE back on \n"        \
-                       : "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig) \
-                       : "i" (~X86_CR4_PGE)                            \
-                       : "memory");                                    \
-       } while (0)
-
-#define __native_flush_tlb_single(addr)                                \
-       __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
-
-# define __flush_tlb_all()                                             \
-       do {                                                            \
-               if (cpu_has_pge)                                        \
-                       __flush_tlb_global();                           \
-               else                                                    \
-                       __flush_tlb();                                  \
-       } while (0)
-
-#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
-
-#ifdef CONFIG_X86_INVLPG
-# define __flush_tlb_one(addr) __flush_tlb_single(addr)
-#else
-# define __flush_tlb_one(addr)                                         \
-       do {                                                            \
-               if (cpu_has_invlpg)                                     \
-                       __flush_tlb_single(addr);                       \
-               else                                                    \
-                       __flush_tlb();                                  \
-       } while (0)
-#endif
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *  - flush_tlb_others(cpumask, mm, va) flushes a TLBs on other cpus
- *
- * ..but the i386 has somewhat limited tlb flushing capabilities,
- * and page-granular flushes are available only on i486 and up.
- */
-
-#define TLB_FLUSH_ALL  0xffffffff
-
-
-#ifndef CONFIG_SMP
-
-#include <linux/sched.h>
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-       if (mm == current->active_mm)
-               __flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-       unsigned long addr)
-{
-       if (vma->vm_mm == current->active_mm)
-               __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-       unsigned long start, unsigned long end)
-{
-       if (vma->vm_mm == current->active_mm)
-               __flush_tlb();
-}
-
-static inline void native_flush_tlb_others(const cpumask_t *cpumask,
-                                          struct mm_struct *mm, unsigned long va)
-{
-}
-
-#else  /* SMP */
-
-#include <asm/smp.h>
-
-#define local_flush_tlb() \
-       __flush_tlb()
-
-extern void flush_tlb_all(void);
-extern void flush_tlb_current_task(void);
-extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
-
-#define flush_tlb()    flush_tlb_current_task()
-
-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
-{
-       flush_tlb_mm(vma->vm_mm);
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
-                            unsigned long va);
-
-#define TLBSTATE_OK    1
-#define TLBSTATE_LAZY  2
-
-struct tlb_state
-{
-       struct mm_struct *active_mm;
-       int state;
-       char __cacheline_padding[L1_CACHE_BYTES-8];
-};
-DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
-#endif /* SMP */
-
-#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va)         \
-       native_flush_tlb_others(&mask, mm, va)
-#endif
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-                                       unsigned long end)
-{
-       flush_tlb_all();
-}
-
-#endif /* _I386_TLBFLUSH_H */
diff --git a/include/asm-x86/tlbflush_64.h b/include/asm-x86/tlbflush_64.h
deleted file mode 100644 (file)
index 7731fd2..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-#ifndef _X8664_TLBFLUSH_H
-#define _X8664_TLBFLUSH_H
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <asm/processor.h>
-#include <asm/system.h>
-
-static inline void __flush_tlb(void)
-{
-       write_cr3(read_cr3());
-}
-
-static inline void __flush_tlb_all(void)
-{
-       unsigned long cr4 = read_cr4();
-       write_cr4(cr4 & ~X86_CR4_PGE);  /* clear PGE */
-       write_cr4(cr4);                 /* write old PGE again and flush TLBs */
-}
-
-#define __flush_tlb_one(addr) \
-       __asm__ __volatile__("invlpg (%0)" :: "r" (addr) : "memory")
-
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes TLBs
- *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
- */
-
-#ifndef CONFIG_SMP
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-       if (mm == current->active_mm)
-               __flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-       unsigned long addr)
-{
-       if (vma->vm_mm == current->active_mm)
-               __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-       unsigned long start, unsigned long end)
-{
-       if (vma->vm_mm == current->active_mm)
-               __flush_tlb();
-}
-
-#else
-
-#include <asm/smp.h>
-
-#define local_flush_tlb() \
-       __flush_tlb()
-
-extern void flush_tlb_all(void);
-extern void flush_tlb_current_task(void);
-extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
-
-#define flush_tlb()    flush_tlb_current_task()
-
-static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
-{
-       flush_tlb_mm(vma->vm_mm);
-}
-
-#define TLBSTATE_OK    1
-#define TLBSTATE_LAZY  2
-
-/* Roughly an IPI every 20MB with 4k pages for freeing page table
-   ranges. Cost is about 42k of memory for each CPU. */
-#define ARCH_FREE_PTE_NR 5350  
-
-#endif
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-                                       unsigned long end)
-{
-       flush_tlb_all();
-}
-
-#endif /* _X8664_TLBFLUSH_H */
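Both deleted headers carry the same comment block listing the flush_tlb_*() entry points, and the unified <asm/tlbflush.h> added above keeps exactly that interface for UP and SMP builds. A minimal sketch of how a caller picks the narrowest flush (illustrative only, not part of this commit; the example functions are made up):

/* Illustrative sketch, not from this commit: choose the narrowest flush
 * that covers the page-table change just made. */
static void example_update_pte(struct vm_area_struct *vma, unsigned long addr)
{
	/* ... the PTE for 'addr' would be rewritten here ... */
	flush_tlb_page(vma, addr);              /* one user page */
}

static void example_teardown(struct mm_struct *mm,
			     unsigned long kstart, unsigned long kend)
{
	flush_tlb_mm(mm);                       /* whole address space */
	flush_tlb_kernel_range(kstart, kend);   /* a kernel virtual range */
}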
index b10fde9798ea56a5633d6afd6f19f38fab533f40..8af05a93f09714dd16fd54b7e3119960e554f103 100644 (file)
@@ -1,5 +1,188 @@
+/*
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+#ifdef CONFIG_NUMA
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+
+/* Mappings between logical cpu number and node number */
 #ifdef CONFIG_X86_32
-# include "topology_32.h"
+extern int cpu_to_node_map[];
+
 #else
-# include "topology_64.h"
+DECLARE_PER_CPU(int, x86_cpu_to_node_map);
+extern int x86_cpu_to_node_map_init[];
+extern void *x86_cpu_to_node_map_early_ptr;
+/* Returns the number of the current Node. */
+#define numa_node_id()         (early_cpu_to_node(raw_smp_processor_id()))
+#endif
+
+extern cpumask_t node_to_cpumask_map[];
+
+#define NUMA_NO_NODE   (-1)
+
+/* Returns the number of the node containing CPU 'cpu' */
+#ifdef CONFIG_X86_32
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)
+static inline int cpu_to_node(int cpu)
+{
+       return cpu_to_node_map[cpu];
+}
+
+#else /* CONFIG_X86_64 */
+static inline int early_cpu_to_node(int cpu)
+{
+       int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
+
+       if (cpu_to_node_map)
+               return cpu_to_node_map[cpu];
+       else if (per_cpu_offset(cpu))
+               return per_cpu(x86_cpu_to_node_map, cpu);
+       else
+               return NUMA_NO_NODE;
+}
+
+static inline int cpu_to_node(int cpu)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+       if (x86_cpu_to_node_map_early_ptr) {
+               printk(KERN_NOTICE "cpu_to_node(%d): usage too early!\n",
+                       (int)cpu);
+               dump_stack();
+               return ((int *)x86_cpu_to_node_map_early_ptr)[cpu];
+       }
+#endif
+       if (per_cpu_offset(cpu))
+               return per_cpu(x86_cpu_to_node_map, cpu);
+       else
+               return NUMA_NO_NODE;
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Returns the number of the node containing Node 'node'. This
+ * architecture is flat, so it is a pretty simple function!
+ */
+#define parent_node(node) (node)
+
+/* Returns a bitmask of CPUs on Node 'node'. */
+static inline cpumask_t node_to_cpumask(int node)
+{
+       return node_to_cpumask_map[node];
+}
+
+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+       cpumask_t mask = node_to_cpumask(node);
+
+       return first_cpu(mask);
+}
+
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
+
+#ifdef CONFIG_X86_32
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+extern unsigned long node_remap_size[];
+#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
+
+# ifdef CONFIG_X86_HT
+#  define ENABLE_TOPO_DEFINES
+# endif
+
+# define SD_CACHE_NICE_TRIES   1
+# define SD_IDLE_IDX           1
+# define SD_NEWIDLE_IDX                2
+# define SD_FORKEXEC_IDX       0
+
+#else
+
+# ifdef CONFIG_SMP
+#  define ENABLE_TOPO_DEFINES
+# endif
+
+# define SD_CACHE_NICE_TRIES   2
+# define SD_IDLE_IDX           2
+# define SD_NEWIDLE_IDX                0
+# define SD_FORKEXEC_IDX       1
+
+#endif
+
+/* sched_domains SD_NODE_INIT for NUMAQ machines */
+#define SD_NODE_INIT (struct sched_domain) {           \
+       .span                   = CPU_MASK_NONE,        \
+       .parent                 = NULL,                 \
+       .child                  = NULL,                 \
+       .groups                 = NULL,                 \
+       .min_interval           = 8,                    \
+       .max_interval           = 32,                   \
+       .busy_factor            = 32,                   \
+       .imbalance_pct          = 125,                  \
+       .cache_nice_tries       = SD_CACHE_NICE_TRIES,  \
+       .busy_idx               = 3,                    \
+       .idle_idx               = SD_IDLE_IDX,          \
+       .newidle_idx            = SD_NEWIDLE_IDX,       \
+       .wake_idx               = 1,                    \
+       .forkexec_idx           = SD_FORKEXEC_IDX,      \
+       .flags                  = SD_LOAD_BALANCE       \
+                               | SD_BALANCE_EXEC       \
+                               | SD_BALANCE_FORK       \
+                               | SD_SERIALIZE          \
+                               | SD_WAKE_BALANCE,      \
+       .last_balance           = jiffies,              \
+       .balance_interval       = 1,                    \
+       .nr_balance_failed      = 0,                    \
+}
+
+#ifdef CONFIG_X86_64_ACPI_NUMA
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+#endif
+
+#else /* CONFIG_NUMA */
+
+#include <asm-generic/topology.h>
+
+#endif
+
+extern cpumask_t cpu_coregroup_map(int cpu);
+
+#ifdef ENABLE_TOPO_DEFINES
+#define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
+#define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
+#define topology_core_siblings(cpu)            (per_cpu(cpu_core_map, cpu))
+#define topology_thread_siblings(cpu)          (per_cpu(cpu_sibling_map, cpu))
+#endif
+
+#ifdef CONFIG_SMP
+#define mc_capable()                   (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()                  (smp_num_siblings > 1)
+#endif
+
 #endif
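The unified topology.h above documents cpu_to_node(), node_to_cpumask(), node_to_first_cpu() and parent_node() in one place instead of the two per-arch copies deleted below. A minimal sketch of walking the NUMA layout with these helpers (illustrative only, not part of this commit; for_each_online_node() comes from linux/nodemask.h):

/* Illustrative sketch, not from this commit. */
static void __init example_print_node_layout(void)
{
	int node;

	for_each_online_node(node) {
		cpumask_t cpus = node_to_cpumask(node);

		printk(KERN_INFO "node %d: first cpu %d, parent node %d\n",
		       node, first_cpu(cpus), parent_node(node));
	}
}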
diff --git a/include/asm-x86/topology_32.h b/include/asm-x86/topology_32.h
deleted file mode 100644 (file)
index 9040f5a..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * linux/include/asm-i386/topology.h
- *
- * Written by: Matthew Dobson, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.          
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#ifndef _ASM_I386_TOPOLOGY_H
-#define _ASM_I386_TOPOLOGY_H
-
-#ifdef CONFIG_X86_HT
-#define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
-#define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
-#define topology_core_siblings(cpu)            (per_cpu(cpu_core_map, cpu))
-#define topology_thread_siblings(cpu)          (per_cpu(cpu_sibling_map, cpu))
-#endif
-
-#ifdef CONFIG_NUMA
-
-#include <asm/mpspec.h>
-
-#include <linux/cpumask.h>
-
-/* Mappings between logical cpu number and node number */
-extern cpumask_t node_2_cpu_mask[];
-extern int cpu_2_node[];
-
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{ 
-       return cpu_2_node[cpu];
-}
-
-/* Returns the number of the node containing Node 'node'.  This architecture is flat, 
-   so it is a pretty simple function! */
-#define parent_node(node) (node)
-
-/* Returns a bitmask of CPUs on Node 'node'. */
-static inline cpumask_t node_to_cpumask(int node)
-{
-       return node_2_cpu_mask[node];
-}
-
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{ 
-       cpumask_t mask = node_to_cpumask(node);
-       return first_cpu(mask);
-}
-
-#define pcibus_to_node(bus) ((struct pci_sysdata *)((bus)->sysdata))->node
-#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus))
-
-/* sched_domains SD_NODE_INIT for NUMAQ machines */
-#define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
-       .parent                 = NULL,                 \
-       .child                  = NULL,                 \
-       .groups                 = NULL,                 \
-       .min_interval           = 8,                    \
-       .max_interval           = 32,                   \
-       .busy_factor            = 32,                   \
-       .imbalance_pct          = 125,                  \
-       .cache_nice_tries       = 1,                    \
-       .busy_idx               = 3,                    \
-       .idle_idx               = 1,                    \
-       .newidle_idx            = 2,                    \
-       .wake_idx               = 1,                    \
-       .flags                  = SD_LOAD_BALANCE       \
-                               | SD_BALANCE_EXEC       \
-                               | SD_BALANCE_FORK       \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
-       .last_balance           = jiffies,              \
-       .balance_interval       = 1,                    \
-       .nr_balance_failed      = 0,                    \
-}
-
-extern unsigned long node_start_pfn[];
-extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
-
-#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
-
-#else /* !CONFIG_NUMA */
-/*
- * Other i386 platforms should define their own version of the 
- * above macros here.
- */
-
-#include <asm-generic/topology.h>
-
-#endif /* CONFIG_NUMA */
-
-extern cpumask_t cpu_coregroup_map(int cpu);
-
-#ifdef CONFIG_SMP
-#define mc_capable()   (boot_cpu_data.x86_max_cores > 1)
-#define smt_capable()  (smp_num_siblings > 1)
-#endif
-
-#endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86/topology_64.h b/include/asm-x86/topology_64.h
deleted file mode 100644 (file)
index a718dda..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef _ASM_X86_64_TOPOLOGY_H
-#define _ASM_X86_64_TOPOLOGY_H
-
-
-#ifdef CONFIG_NUMA
-
-#include <asm/mpspec.h>
-#include <linux/bitops.h>
-
-extern cpumask_t cpu_online_map;
-
-extern unsigned char cpu_to_node[];
-extern cpumask_t     node_to_cpumask[];
-
-#ifdef CONFIG_ACPI_NUMA
-extern int __node_distance(int, int);
-#define node_distance(a,b) __node_distance(a,b)
-/* #else fallback version */
-#endif
-
-#define cpu_to_node(cpu)               (cpu_to_node[cpu])
-#define parent_node(node)              (node)
-#define node_to_first_cpu(node)        (first_cpu(node_to_cpumask[node]))
-#define node_to_cpumask(node)          (node_to_cpumask[node])
-#define pcibus_to_node(bus)    ((struct pci_sysdata *)((bus)->sysdata))->node
-#define pcibus_to_cpumask(bus)         node_to_cpumask(pcibus_to_node(bus));
-
-#define numa_node_id()                 read_pda(nodenumber)
-
-/* sched_domains SD_NODE_INIT for x86_64 machines */
-#define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
-       .parent                 = NULL,                 \
-       .child                  = NULL,                 \
-       .groups                 = NULL,                 \
-       .min_interval           = 8,                    \
-       .max_interval           = 32,                   \
-       .busy_factor            = 32,                   \
-       .imbalance_pct          = 125,                  \
-       .cache_nice_tries       = 2,                    \
-       .busy_idx               = 3,                    \
-       .idle_idx               = 2,                    \
-       .newidle_idx            = 0,                    \
-       .wake_idx               = 1,                    \
-       .forkexec_idx           = 1,                    \
-       .flags                  = SD_LOAD_BALANCE       \
-                               | SD_BALANCE_FORK       \
-                               | SD_BALANCE_EXEC       \
-                               | SD_SERIALIZE          \
-                               | SD_WAKE_BALANCE,      \
-       .last_balance           = jiffies,              \
-       .balance_interval       = 1,                    \
-       .nr_balance_failed      = 0,                    \
-}
-
-#endif
-
-#ifdef CONFIG_SMP
-#define topology_physical_package_id(cpu)      (cpu_data(cpu).phys_proc_id)
-#define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
-#define topology_core_siblings(cpu)            (per_cpu(cpu_core_map, cpu))
-#define topology_thread_siblings(cpu)          (per_cpu(cpu_sibling_map, cpu))
-#define mc_capable()                   (boot_cpu_data.x86_max_cores > 1)
-#define smt_capable()                  (smp_num_siblings > 1)
-#endif
-
-#include <asm-generic/topology.h>
-
-extern cpumask_t cpu_coregroup_map(int cpu);
-
-#endif
index 6baab30dc2c8be051e81286bcb9e0ce97d37d724..7d3e27f7d484d8ce848951b4f0b48a4d285242c0 100644 (file)
@@ -17,6 +17,8 @@ typedef unsigned long long cycles_t;
 extern unsigned int cpu_khz;
 extern unsigned int tsc_khz;
 
+extern void disable_TSC(void);
+
 static inline cycles_t get_cycles(void)
 {
        unsigned long long ret = 0;
@@ -25,39 +27,22 @@ static inline cycles_t get_cycles(void)
        if (!cpu_has_tsc)
                return 0;
 #endif
-
-#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
        rdtscll(ret);
-#endif
+
        return ret;
 }
 
-/* Like get_cycles, but make sure the CPU is synchronized. */
-static __always_inline cycles_t get_cycles_sync(void)
+static inline cycles_t vget_cycles(void)
 {
-       unsigned long long ret;
-       unsigned eax, edx;
-
-       /*
-        * Use RDTSCP if possible; it is guaranteed to be synchronous
-        * and doesn't cause a VMEXIT on Hypervisors
-        */
-       alternative_io(ASM_NOP3, ".byte 0x0f,0x01,0xf9", X86_FEATURE_RDTSCP,
-                      ASM_OUTPUT2("=a" (eax), "=d" (edx)),
-                      "a" (0U), "d" (0U) : "ecx", "memory");
-       ret = (((unsigned long long)edx) << 32) | ((unsigned long long)eax);
-       if (ret)
-               return ret;
-
        /*
-        * Don't do an additional sync on CPUs where we know
-        * RDTSC is already synchronous:
+        * We only do VDSOs on TSC capable CPUs, so this shouldn't
+        * access boot_cpu_data (which is not VDSO-safe):
         */
-       alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-                         "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
-       rdtscll(ret);
-
-       return ret;
+#ifndef CONFIG_X86_TSC
+       if (!cpu_has_tsc)
+               return 0;
+#endif
+       return (cycles_t) __native_read_tsc();
 }
 
 extern void tsc_init(void);
@@ -73,8 +58,7 @@ int check_tsc_unstable(void);
 extern void check_tsc_sync_source(int cpu);
 extern void check_tsc_sync_target(void);
 
-#ifdef CONFIG_X86_64
 extern void tsc_calibrate(void);
-#endif
+extern int notsc_setup(char *);
 
 #endif
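After this change get_cycles() always issues rdtscll() once the cpu_has_tsc check passes, and vget_cycles() is the variant safe to use from the vDSO. The usual delta-timing idiom built on it looks roughly like this (illustrative only, not part of this commit; work_under_test() is a made-up function):

/* Illustrative sketch, not from this commit. */
extern void work_under_test(void);      /* hypothetical code being timed */

static void example_time_section(void)
{
	cycles_t t0, t1;

	t0 = get_cycles();              /* returns 0 on TSC-less CPUs */
	work_under_test();
	t1 = get_cycles();

	printk(KERN_DEBUG "section took %llu cycles\n",
	       (unsigned long long)(t1 - t0));
}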
index f4ce8768ad443648fc637bf1cdb128e25a4f8a44..31d79470271942af7e4f8449294761b0d383b619 100644 (file)
@@ -65,6 +65,8 @@ struct exception_table_entry
        unsigned long insn, fixup;
 };
 
+extern int fixup_exception(struct pt_regs *regs);
+
 #define ARCH_HAS_SEARCH_EXTABLE
 
 /*
index 9b15545eb9b5bd7d330f7ee0ccc240f7f6e06dec..8d8f9b5adbb971a740ccf73476df86c280dd5d89 100644 (file)
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
-
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
index 0e85d2a5e33a82f01d87743523a29cf3e327632b..ed8b8fc6906c583878346c6b687f5b97692a12f6 100644 (file)
@@ -75,13 +75,23 @@ struct user_fxsr_struct {
  * doesn't use the extra segment registers)
  */
 struct user_regs_struct {
-       long ebx, ecx, edx, esi, edi, ebp, eax;
-       unsigned short ds, __ds, es, __es;
-       unsigned short fs, __fs, gs, __gs;
-       long orig_eax, eip;
-       unsigned short cs, __cs;
-       long eflags, esp;
-       unsigned short ss, __ss;
+       unsigned long   bx;
+       unsigned long   cx;
+       unsigned long   dx;
+       unsigned long   si;
+       unsigned long   di;
+       unsigned long   bp;
+       unsigned long   ax;
+       unsigned long   ds;
+       unsigned long   es;
+       unsigned long   fs;
+       unsigned long   gs;
+       unsigned long   orig_ax;
+       unsigned long   ip;
+       unsigned long   cs;
+       unsigned long   flags;
+       unsigned long   sp;
+       unsigned long   ss;
 };
 
 /* When the kernel dumps core, it starts by dumping the user struct -
index 12785c649ac5238804d0f15759812fad8a7e362d..a5449d456cc0a7119037ebae4171d64397ecb27f 100644 (file)
  * and both the standard and SIMD floating point data can be accessed via
  * the new ptrace requests.  In either case, changes to the FPU environment
  * will be reflected in the task's state as expected.
- * 
+ *
  * x86-64 support by Andi Kleen.
  */
 
 /* This matches the 64bit FXSAVE format as defined by AMD. It is the same
    as the 32bit format defined by Intel, except that the selector:offset pairs for
-   data and eip are replaced with flat 64bit pointers. */ 
+   data and eip are replaced with flat 64bit pointers. */
 struct user_i387_struct {
        unsigned short  cwd;
        unsigned short  swd;
@@ -65,13 +65,34 @@ struct user_i387_struct {
  * Segment register layout in coredumps.
  */
 struct user_regs_struct {
-       unsigned long r15,r14,r13,r12,rbp,rbx,r11,r10;
-       unsigned long r9,r8,rax,rcx,rdx,rsi,rdi,orig_rax;
-       unsigned long rip,cs,eflags;
-       unsigned long rsp,ss;
-       unsigned long fs_base, gs_base;
-       unsigned long ds,es,fs,gs; 
-}; 
+       unsigned long   r15;
+       unsigned long   r14;
+       unsigned long   r13;
+       unsigned long   r12;
+       unsigned long   bp;
+       unsigned long   bx;
+       unsigned long   r11;
+       unsigned long   r10;
+       unsigned long   r9;
+       unsigned long   r8;
+       unsigned long   ax;
+       unsigned long   cx;
+       unsigned long   dx;
+       unsigned long   si;
+       unsigned long   di;
+       unsigned long   orig_ax;
+       unsigned long   ip;
+       unsigned long   cs;
+       unsigned long   flags;
+       unsigned long   sp;
+       unsigned long   ss;
+       unsigned long   fs_base;
+       unsigned long   gs_base;
+       unsigned long   ds;
+       unsigned long   es;
+       unsigned long   fs;
+       unsigned long   gs;
+};
 
 /* When the kernel dumps core, it starts by dumping the user struct -
    this will be used by gdb to figure out where the data and stack segments
@@ -94,7 +115,7 @@ struct user{
                                   This is actually the bottom of the stack,
                                   the top of the stack is always found in the
                                   esp register.  */
-  long int signal;                     /* Signal that caused the core dump. */
+  long int signal;             /* Signal that caused the core dump. */
   int reserved;                        /* No longer used */
   int pad1;
   struct user_pt_regs * u_ar0; /* Used by gdb to help find the values for */
diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h
new file mode 100644 (file)
index 0000000..629bcb6
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef _ASM_X86_VDSO_H
+#define _ASM_X86_VDSO_H        1
+
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO64_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO64_SYMBOL(base, name) ({           \
+       extern const char VDSO64_##name[];      \
+       (void *) (VDSO64_##name - VDSO64_PRELINK + (unsigned long) (base)); })
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+extern const char VDSO32_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO32_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO32_SYMBOL(base, name) ({           \
+       extern const char VDSO32_##name[];      \
+       (void *) (VDSO32_##name - VDSO32_PRELINK + (unsigned long) (base)); })
+#endif
+
+#endif /* asm-x86/vdso.h */
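VDSO64_SYMBOL() and VDSO32_SYMBOL() above translate a symbol of the prelinked vDSO image into a pointer inside a particular mapping of that image. Very roughly (illustrative only, not part of this commit; the base pointer and the symbol name are placeholders):

/* Illustrative sketch, not from this commit. */
static void *example_lookup(void *vdso_base)
{
	/* 'sigreturn' stands in for whatever VDSO32_<name> symbol the
	 * vDSO linker script really defines (hypothetical here). */
	return VDSO32_SYMBOL(vdso_base, sigreturn);
}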
index f01c49f5d108d50666535d0fb1e52cc1bdc97c2d..17b3700949bfe3e90ef495fd8e09c60f7450c129 100644 (file)
@@ -36,6 +36,8 @@ extern volatile unsigned long __jiffies;
 extern int vgetcpu_mode;
 extern struct timezone sys_tz;
 
+extern void map_vsyscall(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_64_VSYSCALL_H_ */
diff --git a/include/asm-x86/vsyscall32.h b/include/asm-x86/vsyscall32.h
deleted file mode 100644 (file)
index c631c08..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _ASM_VSYSCALL32_H
-#define _ASM_VSYSCALL32_H 1
-
-/* Values need to match arch/x86_64/ia32/vsyscall.lds */
-
-#ifdef __ASSEMBLY__
-#define VSYSCALL32_BASE 0xffffe000
-#define VSYSCALL32_SYSEXIT (VSYSCALL32_BASE + 0x410)
-#else
-#define VSYSCALL32_BASE 0xffffe000UL
-#define VSYSCALL32_END (VSYSCALL32_BASE + PAGE_SIZE)
-#define VSYSCALL32_EHDR ((const struct elf32_hdr *) VSYSCALL32_BASE)
-
-#define VSYSCALL32_VSYSCALL ((void *)VSYSCALL32_BASE + 0x400) 
-#define VSYSCALL32_SYSEXIT ((void *)VSYSCALL32_BASE + 0x410)
-#define VSYSCALL32_SIGRETURN ((void __user *)VSYSCALL32_BASE + 0x500) 
-#define VSYSCALL32_RTSIGRETURN ((void __user *)VSYSCALL32_BASE + 0x600) 
-#endif
-
-#endif
index 23c86cef3b258a8611e765bd168e1c89054a19f6..a41ef1bdd424b36465df9a402f419759a606cf9c 100644 (file)
@@ -1,6 +1,4 @@
 /*
- * include/asm-i386/xor.h
- *
  * Optimized RAID-5 checksumming functions for MMX and SSE.
  *
  * This program is free software; you can redistribute it and/or modify
index f942fcc218312ec70d6eb9d1793ba43cd98ad355..1eee7fcb2420682f613eb7f2528a7056f38d7e04 100644 (file)
@@ -1,6 +1,4 @@
 /*
- * include/asm-x86_64/xor.h
- *
  * Optimized RAID-5 checksumming functions for MMX and SSE.
  *
  * This program is free software; you can redistribute it and/or modify
index 27b9350052b46dd7ed8a848174f3d4a26e490eca..85b2482cc736c782307c99afd26a7b1f834f38d9 100644 (file)
@@ -100,7 +100,6 @@ header-y += iso_fs.h
 header-y += ixjuser.h
 header-y += jffs2.h
 header-y += keyctl.h
-header-y += kvm.h
 header-y += limits.h
 header-y += lock_dlm_plock.h
 header-y += magic.h
@@ -256,6 +255,7 @@ unifdef-y += kd.h
 unifdef-y += kernelcapi.h
 unifdef-y += kernel.h
 unifdef-y += keyboard.h
+unifdef-$(CONFIG_HAVE_KVM) += kvm.h
 unifdef-y += llc.h
 unifdef-y += loop.h
 unifdef-y += lp.h
index 1d0ef1ae80362d50f1b620b54c01313d6c46b936..7e3d2859be50230b9b2755288a53d8fb693145b0 100644 (file)
@@ -25,6 +25,8 @@ static inline u32 acpi_pm_read_early(void)
        return acpi_pm_read_verified() & ACPI_PM_MASK;
 }
 
+extern void pmtimer_wait(unsigned);
+
 #else
 
 static inline u32 acpi_pm_read_early(void)
index c687816928380ab5e3bb031b05501f56fa6efc53..bdd6f5de5fc4bb654f489dab723e3cb09f66f3d0 100644 (file)
 #define AUDIT_MAC_IPSEC_ADDSPD 1413    /* Not used */
 #define AUDIT_MAC_IPSEC_DELSPD 1414    /* Not used */
 #define AUDIT_MAC_IPSEC_EVENT  1415    /* Audit an IPSec event */
+#define AUDIT_MAC_UNLBL_STCADD 1416    /* NetLabel: add a static label */
+#define AUDIT_MAC_UNLBL_STCDEL 1417    /* NetLabel: del a static label */
 
 #define AUDIT_FIRST_KERN_ANOM_MSG   1700
 #define AUDIT_LAST_KERN_ANOM_MSG    1799
index 71e7a847dffce22d7cedef1357df99d0cc5bbbbb..e18d4192f6e83992278f344ab7d4f7f885595dcd 100644 (file)
@@ -825,6 +825,7 @@ static inline void exit_io_context(void)
 {
 }
 
+struct io_context;
 static inline int put_io_context(struct io_context *ioc)
 {
        return 1;
index 06dadba349acbd46d6ed8ba702153c44ba13b65e..cfc3147e5cf9a30fb87fdde7a972c5cb1635f574 100644 (file)
@@ -282,10 +282,10 @@ static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
        __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
 }
 
-extern int blk_trace_setup(request_queue_t *q, char *name, dev_t dev,
+extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                           char __user *arg);
-extern int blk_trace_startstop(request_queue_t *q, int start);
-extern int blk_trace_remove(request_queue_t *q);
+extern int blk_trace_startstop(struct request_queue *q, int start);
+extern int blk_trace_remove(struct request_queue *q);
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)                (-ENOTTY)
index 107787aacb647c95c37f53bf94d2660486d687a3..85778a4b120906870970f25f8e987fdc7ece95e6 100644 (file)
@@ -103,7 +103,7 @@ struct clocksource {
 #define CLOCK_SOURCE_VALID_FOR_HRES            0x20
 
 /* simplify initialization of mask field */
-#define CLOCKSOURCE_MASK(bits) (cycle_t)(bits<64 ? ((1ULL<<bits)-1) : -1)
+#define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
 
 /**
  * clocksource_khz2mult - calculates mult from khz and shift
@@ -215,6 +215,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 
 /* used to install a new clocksource */
 extern int clocksource_register(struct clocksource*);
+extern void clocksource_unregister(struct clocksource*);
 extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_resume(void);
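The added parentheses around bits in CLOCKSOURCE_MASK() matter as soon as the argument is an expression rather than a plain literal. As a constructed example (the argument here is hypothetical, not from this commit): with the old definition, CLOCKSOURCE_MASK(x ? 32 : 24) expanded its shift as (1ULL << x) ? 32 : 24, because << binds tighter than ?:, so the mask came out as 31 or 23; with (bits) parenthesized, the whole conditional becomes the shift count, as intended.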
index 0e69d2cf14aa0c204e21edb26bd12191ee818b3e..d38655f2be7088e3bce8d63476ec525fdf347511 100644 (file)
@@ -191,6 +191,10 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
                compat_ulong_t __user *outp, compat_ulong_t __user *exp,
                struct compat_timeval __user *tvp);
 
+asmlinkage long compat_sys_wait4(compat_pid_t pid,
+                                compat_uint_t *stat_addr, int options,
+                                struct compat_rusage *ru);
+
 #define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))
 
 #define BITS_TO_COMPAT_LONGS(bits) \
@@ -239,6 +243,17 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
                compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
                const compat_ulong_t __user *new_nodes);
 
+extern int compat_ptrace_request(struct task_struct *child,
+                                compat_long_t request,
+                                compat_ulong_t addr, compat_ulong_t data);
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
+extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+                              compat_ulong_t addr, compat_ulong_t data);
+asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
+                                 compat_long_t addr, compat_long_t data);
+#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
+
 /*
  * epoll (fs/eventpoll.c) compat bits follow ...
  */
index 07b300bfe34b01f27d424b7f5c63f43130582f7d..c22c707c455d6b379afa6c2642640e6b6277541e 100644 (file)
@@ -7,13 +7,18 @@
  * C code.  Therefore we cannot annotate them always with
  * 'UL' and other type specifiers unilaterally.  We
  * use the following macros to deal with this.
+ *
+ * Similarly, _AT() will cast an expression with a type in C, but
+ * leave it unchanged in asm.
  */
 
 #ifdef __ASSEMBLY__
 #define _AC(X,Y)       X
+#define _AT(T,X)       X
 #else
 #define __AC(X,Y)      (X##Y)
 #define _AC(X,Y)       __AC(X,Y)
+#define _AT(T,X)       ((T)(X))
 #endif
 
 #endif /* !(_LINUX_CONST_H) */
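_AT(T,X) mirrors _AC(): in C it applies the cast, in assembly it leaves the bare token, so the constant stays usable from .S files. A one-line sketch of the intended use (illustrative only, not part of this commit; the constant name is made up):

/* Illustrative, not from this commit: expands to plain 0xfff in assembly
 * and to ((unsigned long)(0xfff)) in C. */
#define EXAMPLE_FLAGS_MASK	_AT(unsigned long, 0xfff)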
index 85bd790c201eeaafb62da1b03006c13b29e61598..7047f58306a7a24fba73076f255551c59866cc4e 100644 (file)
@@ -218,8 +218,8 @@ int __first_cpu(const cpumask_t *srcp);
 int __next_cpu(int n, const cpumask_t *srcp);
 #define next_cpu(n, src) __next_cpu((n), &(src))
 #else
-#define first_cpu(src)         0
-#define next_cpu(n, src)       1
+#define first_cpu(src)         ({ (void)(src); 0; })
+#define next_cpu(n, src)       ({ (void)(src); 1; })
 #endif
 
 #define cpumask_of_cpu(cpu)                                            \
index 1880208964d6c9a4a75d332927a0ef3e23b19191..db375be333c708a2e556c5424c95a0aa91802836 100644 (file)
@@ -84,6 +84,9 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data,
 struct device *bus_find_device(struct bus_type *bus, struct device *start,
                               void *data,
                               int (*match)(struct device *dev, void *data));
+struct device *bus_find_device_by_name(struct bus_type *bus,
+                                      struct device *start,
+                                      const char *name);
 
 int __must_check bus_for_each_drv(struct bus_type *bus,
                                  struct device_driver *start, void *data,
index 576e83bd6d88e5b6dfcad00a4136106c3c87dbec..7ceb24d87c1a970b06835f5c645bb0ab2e31f1d0 100644 (file)
@@ -355,6 +355,7 @@ typedef struct elf64_shdr {
 #define NT_AUXV                6
 #define NT_PRXFPREG     0x46e62b7f      /* copied from gdb5.1/include/elf/common.h */
 #define NT_PPC_VMX     0x100           /* PowerPC Altivec/VMX registers */
+#define NT_386_TLS     0x200           /* i386 TLS slots (struct user_desc) */
 
 
 /* Note header in a PT_NOTE section */
index 707f7cb9e795cb8ab5fdd0c84e7a9791605156a5..9cd94bfd07e54ed19675457cdcf203ca316c82a5 100644 (file)
@@ -64,7 +64,7 @@ struct hpet {
  */
 
 #define        Tn_INT_ROUTE_CAP_MASK           (0xffffffff00000000ULL)
-#define        Tn_INI_ROUTE_CAP_SHIFT          (32UL)
+#define        Tn_INT_ROUTE_CAP_SHIFT          (32UL)
 #define        Tn_FSB_INT_DELCAP_MASK          (0x8000UL)
 #define        Tn_FSB_INT_DELCAP_SHIFT         (15)
 #define        Tn_FSB_EN_CNF_MASK              (0x4000UL)
@@ -115,9 +115,6 @@ static inline void hpet_reserve_timer(struct hpet_data *hd, int timer)
 }
 
 int hpet_alloc(struct hpet_data *);
-int hpet_register(struct hpet_task *, int);
-int hpet_unregister(struct hpet_task *);
-int hpet_control(struct hpet_task *, unsigned int, unsigned long);
 
 #endif /* __KERNEL__ */
 
diff --git a/include/linux/init_ohci1394_dma.h b/include/linux/init_ohci1394_dma.h
new file mode 100644 (file)
index 0000000..3c03a4b
--- /dev/null
@@ -0,0 +1,4 @@
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+extern int __initdata init_ohci1394_dma_early;
+extern void __init init_ohci1394_dma_on_all_controllers(void);
+#endif
index 6187a8567bc74d5ba8f6233dac7db942d275d63f..605d237364d2d92a442615c530e563a133de65a9 100644 (file)
@@ -8,6 +8,7 @@
 #ifndef _LINUX_IOPORT_H
 #define _LINUX_IOPORT_H
 
+#ifndef __ASSEMBLY__
 #include <linux/compiler.h>
 #include <linux/types.h>
 /*
@@ -153,4 +154,5 @@ extern struct resource * __devm_request_region(struct device *dev,
 extern void __devm_release_region(struct device *dev, struct resource *parent,
                                  resource_size_t start, resource_size_t n);
 
+#endif /* __ASSEMBLY__ */
 #endif /* _LINUX_IOPORT_H */
index a7283c9beadff498968eb1ff506b5369077ae96c..ff356b2ee478d3dd006e67a76427a467525df7ae 100644 (file)
@@ -194,6 +194,9 @@ static inline int log_buf_read(int idx) { return 0; }
 static inline int log_buf_copy(char *dest, int idx, int len) { return 0; }
 #endif
 
+extern void __attribute__((format(printf, 1, 2)))
+       early_printk(const char *fmt, ...);
+
 unsigned long int_sqrt(unsigned long);
 
 extern int printk_ratelimit(void);
index 81891581e89ba544a527aa994386374b4c1c2146..6168c0a441724f198f1999e0f35ce88012750d59 100644 (file)
@@ -182,6 +182,15 @@ static inline void kretprobe_assert(struct kretprobe_instance *ri,
        }
 }
 
+#ifdef CONFIG_KPROBES_SANITY_TEST
+extern int init_test_probes(void);
+#else
+static inline int init_test_probes(void)
+{
+       return 0;
+}
+#endif /* CONFIG_KPROBES_SANITY_TEST */
+
 extern spinlock_t kretprobe_lock;
 extern struct mutex kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
@@ -227,6 +236,7 @@ void unregister_kretprobe(struct kretprobe *rp);
 
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
+
 #else /* CONFIG_KPROBES */
 
 #define __kprobes      /**/
index 057a7f34ee36b2aec3fdfad395b0f7c4323796ec..4de4fd2d8607d19df5f9eb4cb3a88325fd3d60f8 100644 (file)
@@ -9,12 +9,10 @@
 
 #include <asm/types.h>
 #include <linux/ioctl.h>
+#include <asm/kvm.h>
 
 #define KVM_API_VERSION 12
 
-/* Architectural interrupt line count. */
-#define KVM_NR_INTERRUPTS 256
-
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
        __u32 slot;
@@ -23,17 +21,19 @@ struct kvm_memory_region {
        __u64 memory_size; /* bytes */
 };
 
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-
-struct kvm_memory_alias {
-       __u32 slot;  /* this has a different namespace than memory slots */
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+       __u32 slot;
        __u32 flags;
        __u64 guest_phys_addr;
-       __u64 memory_size;
-       __u64 target_phys_addr;
+       __u64 memory_size; /* bytes */
+       __u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+
+
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
        /*
@@ -45,62 +45,18 @@ struct kvm_irq_level {
        __u32 level;
 };
 
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
-       __u8 last_irr;  /* edge detection */
-       __u8 irr;               /* interrupt request register */
-       __u8 imr;               /* interrupt mask register */
-       __u8 isr;               /* interrupt service register */
-       __u8 priority_add;      /* highest irq priority */
-       __u8 irq_base;
-       __u8 read_reg_select;
-       __u8 poll;
-       __u8 special_mask;
-       __u8 init_state;
-       __u8 auto_eoi;
-       __u8 rotate_on_auto_eoi;
-       __u8 special_fully_nested_mode;
-       __u8 init4;             /* true if 4 byte init */
-       __u8 elcr;              /* PIIX edge/trigger selection */
-       __u8 elcr_mask;
-};
-
-#define KVM_IOAPIC_NUM_PINS  24
-struct kvm_ioapic_state {
-       __u64 base_address;
-       __u32 ioregsel;
-       __u32 id;
-       __u32 irr;
-       __u32 pad;
-       union {
-               __u64 bits;
-               struct {
-                       __u8 vector;
-                       __u8 delivery_mode:3;
-                       __u8 dest_mode:1;
-                       __u8 delivery_status:1;
-                       __u8 polarity:1;
-                       __u8 remote_irr:1;
-                       __u8 trig_mode:1;
-                       __u8 mask:1;
-                       __u8 reserve:7;
-                       __u8 reserved[4];
-                       __u8 dest_id;
-               } fields;
-       } redirtbl[KVM_IOAPIC_NUM_PINS];
-};
-
-#define KVM_IRQCHIP_PIC_MASTER   0
-#define KVM_IRQCHIP_PIC_SLAVE    1
-#define KVM_IRQCHIP_IOAPIC       2
 
 struct kvm_irqchip {
        __u32 chip_id;
        __u32 pad;
         union {
                char dummy[512];  /* reserving space */
+#ifdef CONFIG_X86
                struct kvm_pic_state pic;
+#endif
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
                struct kvm_ioapic_state ioapic;
+#endif
        } chip;
 };
 
@@ -116,6 +72,7 @@ struct kvm_irqchip {
 #define KVM_EXIT_FAIL_ENTRY       9
 #define KVM_EXIT_INTR             10
 #define KVM_EXIT_SET_TPR          11
+#define KVM_EXIT_TPR_ACCESS       12
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -174,90 +131,17 @@ struct kvm_run {
                        __u32 longmode;
                        __u32 pad;
                } hypercall;
+               /* KVM_EXIT_TPR_ACCESS */
+               struct {
+                       __u64 rip;
+                       __u32 is_write;
+                       __u32 pad;
+               } tpr_access;
                /* Fix the size of the union. */
                char padding[256];
        };
 };
 
-/* for KVM_GET_REGS and KVM_SET_REGS */
-struct kvm_regs {
-       /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
-       __u64 rax, rbx, rcx, rdx;
-       __u64 rsi, rdi, rsp, rbp;
-       __u64 r8,  r9,  r10, r11;
-       __u64 r12, r13, r14, r15;
-       __u64 rip, rflags;
-};
-
-/* for KVM_GET_FPU and KVM_SET_FPU */
-struct kvm_fpu {
-       __u8  fpr[8][16];
-       __u16 fcw;
-       __u16 fsw;
-       __u8  ftwx;  /* in fxsave format */
-       __u8  pad1;
-       __u16 last_opcode;
-       __u64 last_ip;
-       __u64 last_dp;
-       __u8  xmm[16][16];
-       __u32 mxcsr;
-       __u32 pad2;
-};
-
-/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
-#define KVM_APIC_REG_SIZE 0x400
-struct kvm_lapic_state {
-       char regs[KVM_APIC_REG_SIZE];
-};
-
-struct kvm_segment {
-       __u64 base;
-       __u32 limit;
-       __u16 selector;
-       __u8  type;
-       __u8  present, dpl, db, s, l, g, avl;
-       __u8  unusable;
-       __u8  padding;
-};
-
-struct kvm_dtable {
-       __u64 base;
-       __u16 limit;
-       __u16 padding[3];
-};
-
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
-       /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
-       struct kvm_segment cs, ds, es, fs, gs, ss;
-       struct kvm_segment tr, ldt;
-       struct kvm_dtable gdt, idt;
-       __u64 cr0, cr2, cr3, cr4, cr8;
-       __u64 efer;
-       __u64 apic_base;
-       __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
-};
-
-struct kvm_msr_entry {
-       __u32 index;
-       __u32 reserved;
-       __u64 data;
-};
-
-/* for KVM_GET_MSRS and KVM_SET_MSRS */
-struct kvm_msrs {
-       __u32 nmsrs; /* number of msrs in entries */
-       __u32 pad;
-
-       struct kvm_msr_entry entries[0];
-};
-
-/* for KVM_GET_MSR_INDEX_LIST */
-struct kvm_msr_list {
-       __u32 nmsrs; /* number of msrs in entries */
-       __u32 indices[0];
-};
-
 /* for KVM_TRANSLATE */
 struct kvm_translation {
        /* in */
@@ -302,28 +186,24 @@ struct kvm_dirty_log {
        };
 };
 
-struct kvm_cpuid_entry {
-       __u32 function;
-       __u32 eax;
-       __u32 ebx;
-       __u32 ecx;
-       __u32 edx;
-       __u32 padding;
-};
-
-/* for KVM_SET_CPUID */
-struct kvm_cpuid {
-       __u32 nent;
-       __u32 padding;
-       struct kvm_cpuid_entry entries[0];
-};
-
 /* for KVM_SET_SIGNAL_MASK */
 struct kvm_signal_mask {
        __u32 len;
        __u8  sigset[0];
 };
 
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+       __u32 enabled;
+       __u32 flags;
+       __u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+       __u64 vapic_addr;
+};
+
 #define KVMIO 0xAE
 
 /*
@@ -347,11 +227,21 @@ struct kvm_signal_mask {
  */
 #define KVM_CAP_IRQCHIP          0
 #define KVM_CAP_HLT      1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_EXT_CPUID 5
+#define KVM_CAP_VAPIC 6
 
 /*
  * ioctls for VM fds
  */
 #define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+                                       struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
@@ -359,6 +249,7 @@ struct kvm_signal_mask {
 #define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
+#define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
 /* Device model IOC */
 #define KVM_CREATE_IRQCHIP       _IO(KVMIO,  0x60)
 #define KVM_IRQ_LINE             _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@ struct kvm_signal_mask {
 #define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
 #define KVM_GET_LAPIC             _IOR(KVMIO,  0x8e, struct kvm_lapic_state)
 #define KVM_SET_LAPIC             _IOW(KVMIO,  0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO,  0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
 
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644 (file)
index 0000000..ea4764b
--- /dev/null
@@ -0,0 +1,299 @@
+#ifndef __KVM_HOST_H
+#define __KVM_HOST_H
+
+/*
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/hardirq.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <asm/signal.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+#include <linux/kvm_types.h>
+
+#include <asm/kvm_host.h>
+
+#define KVM_MAX_VCPUS 4
+#define KVM_MEMORY_SLOTS 8
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+#define KVM_PIO_PAGE_OFFSET 1
+
+/*
+ * vcpu->requests bit members
+ */
+#define KVM_REQ_TLB_FLUSH          0
+#define KVM_REQ_MIGRATE_TIMER      1
+#define KVM_REQ_REPORT_TPR_ACCESS  2
+
+struct kvm_vcpu;
+extern struct kmem_cache *kvm_vcpu_cache;
+
+struct kvm_guest_debug {
+       int enabled;
+       unsigned long bp[4];
+       int singlestep;
+};
+
+/*
+ * It would be nice to use something smarter than a linear search, TBD...
+ * Thankfully we don't expect many devices to register (famous last words :),
+ * so until then it will suffice.  At least it's abstracted so we can change
+ * in one place.
+ */
+struct kvm_io_bus {
+       int                   dev_count;
+#define NR_IOBUS_DEVS 6
+       struct kvm_io_device *devs[NR_IOBUS_DEVS];
+};
+
+void kvm_io_bus_init(struct kvm_io_bus *bus);
+void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
+                            struct kvm_io_device *dev);
+
+struct kvm_vcpu {
+       struct kvm *kvm;
+       struct preempt_notifier preempt_notifier;
+       int vcpu_id;
+       struct mutex mutex;
+       int   cpu;
+       struct kvm_run *run;
+       int guest_mode;
+       unsigned long requests;
+       struct kvm_guest_debug guest_debug;
+       int fpu_active;
+       int guest_fpu_loaded;
+       wait_queue_head_t wq;
+       int sigset_active;
+       sigset_t sigset;
+       struct kvm_vcpu_stat stat;
+
+#ifdef CONFIG_HAS_IOMEM
+       int mmio_needed;
+       int mmio_read_completed;
+       int mmio_is_write;
+       int mmio_size;
+       unsigned char mmio_data[8];
+       gpa_t mmio_phys_addr;
+#endif
+
+       struct kvm_vcpu_arch arch;
+};
+
+struct kvm_memory_slot {
+       gfn_t base_gfn;
+       unsigned long npages;
+       unsigned long flags;
+       unsigned long *rmap;
+       unsigned long *dirty_bitmap;
+       unsigned long userspace_addr;
+       int user_alloc;
+};
+
+struct kvm {
+       struct mutex lock; /* protects the vcpus array and APIC accesses */
+       spinlock_t mmu_lock;
+       struct mm_struct *mm; /* userspace tied to this vm */
+       int nmemslots;
+       struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
+                                       KVM_PRIVATE_MEM_SLOTS];
+       struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+       struct list_head vm_list;
+       struct file *filp;
+       struct kvm_io_bus mmio_bus;
+       struct kvm_io_bus pio_bus;
+       struct kvm_vm_stat stat;
+       struct kvm_arch arch;
+};
+
+/* The guest did something we don't support. */
+#define pr_unimpl(vcpu, fmt, ...)                                      \
+ do {                                                                  \
+       if (printk_ratelimit())                                         \
+               printk(KERN_ERR "kvm: %i: cpu%i " fmt,                  \
+                      current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
+ } while (0)
+
+#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
+#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_put(struct kvm_vcpu *vcpu);
+
+void decache_vcpus_on_cpu(int cpu);
+
+
+int kvm_init(void *opaque, unsigned int vcpu_size,
+                 struct module *module);
+void kvm_exit(void);
+
+#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
+#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
+static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
+struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
+
+extern struct page *bad_page;
+
+int is_error_page(struct page *page);
+int kvm_is_error_hva(unsigned long addr);
+int kvm_set_memory_region(struct kvm *kvm,
+                         struct kvm_userspace_memory_region *mem,
+                         int user_alloc);
+int __kvm_set_memory_region(struct kvm *kvm,
+                           struct kvm_userspace_memory_region *mem,
+                           int user_alloc);
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc);
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page_clean(struct page *page);
+void kvm_release_page_dirty(struct page *page);
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+                       int len);
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+                         unsigned long len);
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+                        int offset, int len);
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+                   unsigned long len);
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
+
+void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_resched(struct kvm_vcpu *vcpu);
+void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+void kvm_flush_remote_tlbs(struct kvm *kvm);
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg);
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+
+int kvm_dev_ioctl_check_extension(long ext);
+
+int kvm_get_dirty_log(struct kvm *kvm,
+                       struct kvm_dirty_log *log, int *is_dirty);
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                               struct kvm_dirty_log *log);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+                                  struct
+                                  kvm_userspace_memory_region *mem,
+                                  int user_alloc);
+long kvm_arch_vm_ioctl(struct file *filp,
+                      unsigned int ioctl, unsigned long arg);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                   struct kvm_translation *tr);
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                 struct kvm_sregs *sregs);
+int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
+                                   struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+
+int kvm_arch_init(void *opaque);
+void kvm_arch_exit(void);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+
+int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_arch_hardware_enable(void *garbage);
+void kvm_arch_hardware_disable(void *garbage);
+int kvm_arch_hardware_setup(void);
+void kvm_arch_hardware_unsetup(void);
+void kvm_arch_check_processor_compat(void *rtn);
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+
+void kvm_free_physmem(struct kvm *kvm);
+
+struct  kvm *kvm_arch_create_vm(void);
+void kvm_arch_destroy_vm(struct kvm *kvm);
+
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+static inline void kvm_guest_enter(void)
+{
+       account_system_vtime(current);
+       current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+       account_system_vtime(current);
+       current->flags &= ~PF_VCPU;
+}
+
+static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+       return slot - kvm->memslots;
+}
+
+static inline gpa_t gfn_to_gpa(gfn_t gfn)
+{
+       return (gpa_t)gfn << PAGE_SHIFT;
+}
+
+static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+       set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
+enum kvm_stat_kind {
+       KVM_STAT_VM,
+       KVM_STAT_VCPU,
+};
+
+struct kvm_stats_debugfs_item {
+       const char *name;
+       int offset;
+       enum kvm_stat_kind kind;
+       struct dentry *dentry;
+};
+extern struct kvm_stats_debugfs_item debugfs_entries[];
+
+#endif
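
The declarations above form the generic guest-memory API exported by the KVM core (gfn_to_page(), kvm_read_guest(), kvm_write_guest(), the dirty-log helpers, and so on). As a hedged sketch, not taken from this commit, arch code could read and update a guest-physical word like this; the gpa value and the minimal error handling are illustrative only:

/* Sketch only: read-modify-write of a guest-physical word using the
 * accessors declared above; kvm_read_guest()/kvm_write_guest() return
 * 0 on success. */
static int example_touch_guest_word(struct kvm *kvm, gpa_t gpa)
{
	u32 val;
	int r;

	r = kvm_read_guest(kvm, gpa, &val, sizeof(val));
	if (r)
		return r;
	val++;
	return kvm_write_guest(kvm, gpa, &val, sizeof(val));
}
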
index 3b292565a693c2363283b50fdac72880e5cca0b2..5497aac0d2f829d4efe3df98f8c34496d947243f 100644 (file)
@@ -2,72 +2,30 @@
 #define __LINUX_KVM_PARA_H
 
 /*
- * Guest OS interface for KVM paravirtualization
- *
- * Note: this interface is totally experimental, and is certain to change
- *       as we make progress.
+ * This header file provides a method for making a hypercall to the host.
+ * Architectures should define:
+ * - kvm_hypercall0, kvm_hypercall1...
+ * - kvm_arch_para_features
+ * - kvm_para_available
  */
 
-/*
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- *
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
- */
-struct kvm_vcpu_para_state {
-       /*
-        * API version information for compatibility. If there's any support
-        * mismatch (too old host trying to execute too new guest) then
-        * the host will deny entry into paravirtual mode. Any other
-        * combination (new host + old guest and new host + new guest)
-        * is supposed to work - new host versions will support all old
-        * guest API versions.
-        */
-       u32 guest_version;
-       u32 host_version;
-       u32 size;
-       u32 ret;
-
-       /*
-        * The address of the vm exit instruction (VMCALL or VMMCALL),
-        * which the host will patch according to the CPU model the
-        * VM runs on:
-        */
-       u64 hypercall_gpa;
-
-} __attribute__ ((aligned(PAGE_SIZE)));
-
-#define KVM_PARA_API_VERSION 1
-
-/*
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
- * either.
- */
-#define MSR_KVM_API_MAGIC 0x87655678
+/* Return values for hypercalls */
+#define KVM_ENOSYS             1000
 
-#define KVM_EINVAL 1
+#define KVM_HC_VAPIC_POLL_IRQ            1
 
 /*
- * Hypercall calling convention:
- *
- * Each hypercall may have 0-6 parameters.
- *
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- *
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- *
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
- *
- * No registers are clobbered by the hypercall, except that the
- * return value is in RAX.
+ * Hypercalls use architecture-specific instructions, provided by <asm/kvm_para.h>.
  */
-#define __NR_hypercalls                        0
+#include <asm/kvm_para.h>
+
+#ifdef __KERNEL__
+static inline int kvm_para_has_feature(unsigned int feature)
+{
+       if (kvm_arch_para_features() & (1UL << feature))
+               return 1;
+       return 0;
+}
+#endif /* __KERNEL__ */
+#endif /* __LINUX_KVM_PARA_H */
 
-#endif
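
Per the new comment, each architecture supplies kvm_para_available(), kvm_arch_para_features() and the kvm_hypercallN() wrappers in <asm/kvm_para.h>; kvm_para_has_feature() above is the generic probe built on top. A hedged sketch of guest-side detection, where the feature bit number is made up purely for illustration:

/* Sketch only: guest-side KVM detection.  EXAMPLE_FEATURE is a
 * made-up feature bit; real feature numbers come from the host ABI. */
#define EXAMPLE_FEATURE	0

static int example_probe_kvm(void)
{
	if (!kvm_para_available())	/* arch-provided, per the comment above */
		return 0;
	return kvm_para_has_feature(EXAMPLE_FEATURE);
}
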
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644 (file)
index 0000000..1c4e46d
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ */
+
+#ifndef __KVM_TYPES_H__
+#define __KVM_TYPES_H__
+
+#include <asm/types.h>
+
+/*
+ * Address types:
+ *
+ *  gva - guest virtual address
+ *  gpa - guest physical address
+ *  gfn - guest frame number
+ *  hva - host virtual address
+ *  hpa - host physical address
+ *  hfn - host frame number
+ */
+
+typedef unsigned long  gva_t;
+typedef u64            gpa_t;
+typedef unsigned long  gfn_t;
+
+typedef unsigned long  hva_t;
+typedef u64            hpa_t;
+typedef unsigned long  hfn_t;
+
+struct kvm_pio_request {
+       unsigned long count;
+       int cur_count;
+       struct page *guest_pages[2];
+       unsigned guest_page_offset;
+       int in;
+       int port;
+       int size;
+       int string;
+       int down;
+       int rep;
+};
+
+#endif /* __KVM_TYPES_H__ */
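
These typedefs are what the inline helpers earlier in this patch (e.g. gfn_to_gpa()) operate on: a gfn is simply the guest physical address shifted down by PAGE_SHIFT. A trivial sketch of the relationship, using an arbitrary frame number:

/* Sketch only: the gfn/gpa relationship implied by the typedefs above. */
static void example_addr_types(void)
{
	gfn_t gfn = 0x1234;			/* arbitrary guest frame number */
	gpa_t gpa = (gpa_t)gfn << PAGE_SHIFT;	/* same as gfn_to_gpa(gfn) */
	gfn_t back = (gfn_t)(gpa >> PAGE_SHIFT);

	BUG_ON(back != gfn);
}
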
index ff203dd029191d5b578707ce184655bcc02967da..3faf599ea58ec4cac70121f874efa9ec4d6ca5f4 100644 (file)
 #define asmlinkage CPP_ASMLINKAGE
 #endif
 
+#ifndef asmregparm
+# define asmregparm
+#endif
+
 #ifndef prevent_tail_call
 # define prevent_tail_call(ret) do { } while (0)
 #endif
   .size name, .-name
 #endif
 
+/* If symbol 'name' is treated as a subroutine (gets called, and returns)
+ * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of
+ * static analysis tools such as stack depth analyzer.
+ */
 #ifndef ENDPROC
 #define ENDPROC(name) \
   .type name, @function; \
index 6f1637c61e10eaef35f0a5e6a27145c273b5d95b..3d25bcd139d166b1173e4691ffa5670c2950cdb1 100644 (file)
@@ -32,10 +32,27 @@ struct nlmsvc_binding {
 
 extern struct nlmsvc_binding * nlmsvc_ops;
 
+/*
+ * Similar to nfs_client_initdata, but without the NFS-specific
+ * rpc_ops field.
+ */
+struct nlmclnt_initdata {
+       const char              *hostname;
+       const struct sockaddr   *address;
+       size_t                  addrlen;
+       unsigned short          protocol;
+       u32                     nfs_version;
+};
+
 /*
  * Functions exported by the lockd module
  */
-extern int     nlmclnt_proc(struct inode *, int, struct file_lock *);
+
+extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init);
+extern void    nlmclnt_done(struct nlm_host *host);
+
+extern int     nlmclnt_proc(struct nlm_host *host, int cmd,
+                                       struct file_lock *fl);
 extern int     lockd_up(int proto);
 extern void    lockd_down(void);
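
nlmclnt_init()/nlmclnt_done() replace the old inode-based lockd entry point, and nlmclnt_proc() now takes the nlm_host they manage. A hedged sketch of obtaining a handle at mount time; the hostname, protocol and NFS version are placeholders:

/* Sketch only: acquiring an NLM client handle with the interface
 * declared above; pair with nlmclnt_done() when the mount goes away. */
static struct nlm_host *example_lockd_setup(const struct sockaddr *addr,
					    size_t addrlen)
{
	struct nlmclnt_initdata init = {
		.hostname	= "server.example",	/* placeholder */
		.address	= addr,
		.addrlen	= addrlen,
		.protocol	= IPPROTO_TCP,
		.nfs_version	= 3,
	};

	return nlmclnt_init(&init);
}
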
 
index 1897ca223eca99c402523994c04e64287b769cbd..1bba6789a50a39cd1139a7b0556df38fcd78624e 100644 (file)
@@ -1118,9 +1118,21 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
-#ifndef CONFIG_DEBUG_PAGEALLOC
+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern int debug_pagealloc_enabled;
+
+extern void kernel_map_pages(struct page *page, int numpages, int enable);
+
+static inline void enable_debug_pagealloc(void)
+{
+       debug_pagealloc_enabled = 1;
+}
+#else
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable) {}
+static inline void enable_debug_pagealloc(void)
+{
+}
 #endif
 
 extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
@@ -1146,6 +1158,7 @@ extern int randomize_va_space;
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
+void print_vma_addr(char *prefix, unsigned long rip);
 
 struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
index 2d15d4aac09407e94ee859c3c1a76c1dd9717041..099ddb4481c07d9d64b4708db3ce519de622ed84 100644 (file)
@@ -196,28 +196,67 @@ struct nfs_inode {
 #define NFS_INO_STALE          (2)             /* possible stale inode */
 #define NFS_INO_ACL_LRU_SET    (3)             /* Inode is on the LRU list */
 
-static inline struct nfs_inode *NFS_I(struct inode *inode)
+static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
        return container_of(inode, struct nfs_inode, vfs_inode);
 }
-#define NFS_SB(s)              ((struct nfs_server *)(s->s_fs_info))
 
-#define NFS_FH(inode)                  (&NFS_I(inode)->fh)
-#define NFS_SERVER(inode)              (NFS_SB(inode->i_sb))
-#define NFS_CLIENT(inode)              (NFS_SERVER(inode)->client)
-#define NFS_PROTO(inode)               (NFS_SERVER(inode)->nfs_client->rpc_ops)
-#define NFS_COOKIEVERF(inode)          (NFS_I(inode)->cookieverf)
-#define NFS_MINATTRTIMEO(inode) \
-       (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
-                              : NFS_SERVER(inode)->acregmin)
-#define NFS_MAXATTRTIMEO(inode) \
-       (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \
-                              : NFS_SERVER(inode)->acregmax)
+static inline struct nfs_server *NFS_SB(const struct super_block *s)
+{
+       return (struct nfs_server *)(s->s_fs_info);
+}
+
+static inline struct nfs_fh *NFS_FH(const struct inode *inode)
+{
+       return &NFS_I(inode)->fh;
+}
+
+static inline struct nfs_server *NFS_SERVER(const struct inode *inode)
+{
+       return NFS_SB(inode->i_sb);
+}
+
+static inline struct rpc_clnt *NFS_CLIENT(const struct inode *inode)
+{
+       return NFS_SERVER(inode)->client;
+}
+
+static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode)
+{
+       return NFS_SERVER(inode)->nfs_client->rpc_ops;
+}
+
+static inline __be32 *NFS_COOKIEVERF(const struct inode *inode)
+{
+       return NFS_I(inode)->cookieverf;
+}
+
+static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode)
+{
+       struct nfs_server *nfss = NFS_SERVER(inode);
+       return S_ISDIR(inode->i_mode) ? nfss->acdirmin : nfss->acregmin;
+}
 
-#define NFS_FLAGS(inode)               (NFS_I(inode)->flags)
-#define NFS_STALE(inode)               (test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
+static inline unsigned NFS_MAXATTRTIMEO(const struct inode *inode)
+{
+       struct nfs_server *nfss = NFS_SERVER(inode);
+       return S_ISDIR(inode->i_mode) ? nfss->acdirmax : nfss->acregmax;
+}
 
-#define NFS_FILEID(inode)              (NFS_I(inode)->fileid)
+static inline int NFS_STALE(const struct inode *inode)
+{
+       return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+}
+
+static inline __u64 NFS_FILEID(const struct inode *inode)
+{
+       return NFS_I(inode)->fileid;
+}
+
+static inline void set_nfs_fileid(struct inode *inode, __u64 fileid)
+{
+       NFS_I(inode)->fileid = fileid;
+}
 
 static inline void nfs_mark_for_revalidate(struct inode *inode)
 {
@@ -237,7 +276,7 @@ static inline int nfs_server_capable(struct inode *inode, int cap)
 
 static inline int NFS_USE_READDIRPLUS(struct inode *inode)
 {
-       return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+       return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 }
 
 static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf)
@@ -366,6 +405,7 @@ extern const struct inode_operations nfs3_dir_inode_operations;
 extern const struct file_operations nfs_dir_operations;
 extern struct dentry_operations nfs_dentry_operations;
 
+extern void nfs_force_lookup_revalidate(struct inode *dir);
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
 extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags);
 extern void nfs_access_zap_cache(struct inode *inode);
index 0cac49bc09553ef33b572593f038bb0e198fb17d..3423c6761bf7b561ad1ea4f4f7fce48f6541137e 100644 (file)
@@ -3,8 +3,12 @@
 
 #include <linux/list.h>
 #include <linux/backing-dev.h>
+#include <linux/wait.h>
+
+#include <asm/atomic.h>
 
 struct nfs_iostats;
+struct nlm_host;
 
 /*
  * The nfs_client identifies our client state to the server.
@@ -14,20 +18,19 @@ struct nfs_client {
        int                     cl_cons_state;  /* current construction state (-ve: init error) */
 #define NFS_CS_READY           0               /* ready to be used */
 #define NFS_CS_INITING         1               /* busy initialising */
-       int                     cl_nfsversion;  /* NFS protocol version */
        unsigned long           cl_res_state;   /* NFS resources state */
 #define NFS_CS_CALLBACK                1               /* - callback started */
 #define NFS_CS_IDMAP           2               /* - idmap started */
 #define NFS_CS_RENEWD          3               /* - renewd started */
-       struct sockaddr_in      cl_addr;        /* server identifier */
+       struct sockaddr_storage cl_addr;        /* server identifier */
+       size_t                  cl_addrlen;
        char *                  cl_hostname;    /* hostname of server */
        struct list_head        cl_share_link;  /* link in global client list */
        struct list_head        cl_superblocks; /* List of nfs_server structs */
 
        struct rpc_clnt *       cl_rpcclient;
        const struct nfs_rpc_ops *rpc_ops;      /* NFS protocol vector */
-       unsigned long           retrans_timeo;  /* retransmit timeout */
-       unsigned int            retrans_count;  /* number of retransmit tries */
+       int                     cl_proto;       /* Network transport protocol */
 
 #ifdef CONFIG_NFS_V4
        u64                     cl_clientid;    /* constant */
@@ -62,7 +65,7 @@ struct nfs_client {
        /* Our own IP address, as a null-terminated string.
         * This is used to generate the clientid, and the callback address.
         */
-       char                    cl_ipaddr[16];
+       char                    cl_ipaddr[48];
        unsigned char           cl_id_uniquifier;
 #endif
 };
@@ -78,6 +81,7 @@ struct nfs_server {
        struct list_head        master_link;    /* link in master servers list */
        struct rpc_clnt *       client;         /* RPC client handle */
        struct rpc_clnt *       client_acl;     /* ACL RPC client handle */
+       struct nlm_host         *nlm_host;      /* NLM client handle */
        struct nfs_iostats *    io_stats;       /* I/O statistics */
        struct backing_dev_info backing_dev_info;
        atomic_long_t           writeback;      /* number of writeback pages */
@@ -110,6 +114,9 @@ struct nfs_server {
                                                   filesystem */
 #endif
        void (*destroy)(struct nfs_server *);
+
+       atomic_t active; /* Keep track of any activity to this server */
+       wait_queue_head_t active_wq;  /* Wait for any activity to stop  */
 };
 
 /* Server capabilities */
index 30dbcc185e6972829967562693eb75eb101f53b6..a1676e19e49170dc41607ae824b5c911c83886c1 100644 (file)
@@ -83,6 +83,7 @@ extern        void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
 extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern void nfs_unlock_request(struct nfs_page *req);
+extern int nfs_set_page_tag_locked(struct nfs_page *req);
 extern  void nfs_clear_page_tag_locked(struct nfs_page *req);
 
 
@@ -95,18 +96,6 @@ nfs_lock_request_dontget(struct nfs_page *req)
        return !test_and_set_bit(PG_BUSY, &req->wb_flags);
 }
 
-/*
- * Lock the page of an asynchronous request and take a reference
- */
-static inline int
-nfs_lock_request(struct nfs_page *req)
-{
-       if (test_and_set_bit(PG_BUSY, &req->wb_flags))
-               return 0;
-       kref_get(&req->wb_kref);
-       return 1;
-}
-
 /**
  * nfs_list_add_request - Insert a request into a list
  * @req: request
index daab252f2e5cf653e41bb39721596bfc7aa89315..f301d0b8babc99a94485e089cdfbca401c1407d8 100644 (file)
@@ -666,16 +666,17 @@ struct nfs4_rename_res {
        struct nfs_fattr *              new_fattr;
 };
 
+#define NFS4_SETCLIENTID_NAMELEN       (56)
 struct nfs4_setclientid {
-       const nfs4_verifier *           sc_verifier;      /* request */
+       const nfs4_verifier *           sc_verifier;
        unsigned int                    sc_name_len;
-       char                            sc_name[48];      /* request */
-       u32                             sc_prog;          /* request */
+       char                            sc_name[NFS4_SETCLIENTID_NAMELEN];
+       u32                             sc_prog;
        unsigned int                    sc_netid_len;
-       char                            sc_netid[4];      /* request */
+       char                            sc_netid[RPCBIND_MAXNETIDLEN];
        unsigned int                    sc_uaddr_len;
-       char                            sc_uaddr[24];     /* request */
-       u32                             sc_cb_ident;      /* request */
+       char                            sc_uaddr[RPCBIND_MAXUADDRLEN];
+       u32                             sc_cb_ident;
 };
 
 struct nfs4_statfs_arg {
@@ -773,7 +774,7 @@ struct nfs_access_entry;
  * RPC procedure vector for NFSv2/NFSv3 demuxing
  */
 struct nfs_rpc_ops {
-       int     version;                /* Protocol version */
+       u32     version;                /* Protocol version */
        struct dentry_operations *dentry_ops;
        const struct inode_operations *dir_inode_ops;
        const struct inode_operations *file_inode_ops;
@@ -816,11 +817,11 @@ struct nfs_rpc_ops {
                             struct nfs_pathconf *);
        int     (*set_capabilities)(struct nfs_server *, struct nfs_fh *);
        __be32 *(*decode_dirent)(__be32 *, struct nfs_entry *, int plus);
-       void    (*read_setup)   (struct nfs_read_data *);
+       void    (*read_setup)   (struct nfs_read_data *, struct rpc_message *);
        int     (*read_done)  (struct rpc_task *, struct nfs_read_data *);
-       void    (*write_setup)  (struct nfs_write_data *, int how);
+       void    (*write_setup)  (struct nfs_write_data *, struct rpc_message *);
        int     (*write_done)  (struct rpc_task *, struct nfs_write_data *);
-       void    (*commit_setup) (struct nfs_write_data *, int how);
+       void    (*commit_setup) (struct nfs_write_data *, struct rpc_message *);
        int     (*commit_done) (struct rpc_task *, struct nfs_write_data *);
        int     (*file_open)   (struct inode *, struct file *);
        int     (*file_release) (struct inode *, struct file *);
index c69531348363dde5551164c50f4fac84e9180cf8..41f6f28690f63be67aa43a2a162a1d349dc347e3 100644 (file)
 #define PCI_VENDOR_ID_BELKIN           0x1799
 #define PCI_DEVICE_ID_BELKIN_F5D7010V7 0x701f
 
+#define PCI_VENDOR_ID_RDC              0x17f3
+#define PCI_DEVICE_ID_RDC_R6020                0x6020
+#define PCI_DEVICE_ID_RDC_R6030                0x6030
+#define PCI_DEVICE_ID_RDC_R6040                0x6040
+#define PCI_DEVICE_ID_RDC_R6060                0x6060
+#define PCI_DEVICE_ID_RDC_R6061                0x6061
+
 #define PCI_VENDOR_ID_LENOVO           0x17aa
 
 #define PCI_VENDOR_ID_ARECA            0x17d3
index 926adaae0f9610885d5d64e341410a9b96900312..00412bb494c40b100bc1e996983b72ba877ccf69 100644 (file)
@@ -9,6 +9,30 @@
 
 #include <asm/percpu.h>
 
+#ifndef PER_CPU_ATTRIBUTES
+#define PER_CPU_ATTRIBUTES
+#endif
+
+#ifdef CONFIG_SMP
+#define DEFINE_PER_CPU(type, name)                                     \
+       __attribute__((__section__(".data.percpu")))                    \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                      \
+       __attribute__((__section__(".data.percpu.shared_aligned")))     \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name             \
+       ____cacheline_aligned_in_smp
+#else
+#define DEFINE_PER_CPU(type, name)                                     \
+       PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)                    \
+       DEFINE_PER_CPU(type, name)
+#endif
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
 #ifdef CONFIG_MODULES
index 3ea5750a0f7e2ac74c1c90e1aecd4910b7a74bc7..515bff053de8a55900e223019452950d178d81b9 100644 (file)
@@ -129,6 +129,81 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data);
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+/*
+ * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
+ *
+ * These do-nothing inlines are used when the arch does not
+ * implement single-step.  The kerneldoc comments are here
+ * to document the interface for all arch definitions.
+ */
+
+#ifndef arch_has_single_step
+/**
+ * arch_has_single_step - does this CPU support user-mode single-step?
+ *
+ * If this is defined, then there must be function declarations or
+ * inlines for user_enable_single_step() and user_disable_single_step().
+ * arch_has_single_step() should evaluate to nonzero iff the machine
+ * supports instruction single-step for user mode.
+ * It can be a constant or it can test a CPU feature bit.
+ */
+#define arch_has_single_step()         (0)
+
+/**
+ * user_enable_single_step - single-step in user-mode task
+ * @task: either current or a task stopped in %TASK_TRACED
+ *
+ * This can only be called when arch_has_single_step() has returned nonzero.
+ * Set @task so that when it returns to user mode, it will trap after the
+ * next single instruction executes.  If arch_has_block_step() is defined,
+ * this must clear the effects of user_enable_block_step() too.
+ */
+static inline void user_enable_single_step(struct task_struct *task)
+{
+       BUG();                  /* This can never be called.  */
+}
+
+/**
+ * user_disable_single_step - cancel user-mode single-step
+ * @task: either current or a task stopped in %TASK_TRACED
+ *
+ * Clear @task of the effects of user_enable_single_step() and
+ * user_enable_block_step().  This can be called whether or not either
+ * of those was ever called on @task, and even if arch_has_single_step()
+ * returned zero.
+ */
+static inline void user_disable_single_step(struct task_struct *task)
+{
+}
+#endif /* arch_has_single_step */
+
+#ifndef arch_has_block_step
+/**
+ * arch_has_block_step - does this CPU support user-mode block-step?
+ *
+ * If this is defined, then there must be a function declaration or inline
+ * for user_enable_block_step(), and arch_has_single_step() must be defined
+ * too.  arch_has_block_step() should evaluate to nonzero iff the machine
+ * supports step-until-branch for user mode.  It can be a constant or it
+ * can test a CPU feature bit.
+ */
+#define arch_has_block_step()          (0)
+
+/**
+ * user_enable_block_step - step until branch in user-mode task
+ * @task: either current or a task stopped in %TASK_TRACED
+ *
+ * This can only be called when arch_has_block_step() has returned nonzero,
+ * and will never be called when single-instruction stepping is being used.
+ * Set @task so that when it returns to user mode, it will trap after the
+ * next branch or trap taken.
+ */
+static inline void user_enable_block_step(struct task_struct *task)
+{
+       BUG();                  /* This can never be called.  */
+}
+#endif /* arch_has_block_step */
+
 #endif
 
 #endif
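
The stubs above spell out the contract: arch_has_single_step() gates user_enable_single_step()/user_disable_single_step(), and arch_has_block_step() likewise gates user_enable_block_step(). A hedged sketch of a resume path using them; the surrounding ptrace request handling is elided:

/* Sketch only: choosing a stepping mode before resuming a traced child,
 * following the interface documented above. */
static int example_resume_child(struct task_struct *child, long request)
{
	if (request == PTRACE_SINGLESTEP) {
		if (!arch_has_single_step())
			return -EIO;
		user_enable_single_step(child);
	} else {
		user_disable_single_step(child);
	}
	/* ...wake the child here... */
	return 0;
}
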
diff --git a/include/linux/regset.h b/include/linux/regset.h
new file mode 100644 (file)
index 0000000..8abee65
--- /dev/null
@@ -0,0 +1,368 @@
+/*
+ * User-mode machine state access
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ */
+
+#ifndef _LINUX_REGSET_H
+#define _LINUX_REGSET_H        1
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+struct task_struct;
+struct user_regset;
+
+
+/**
+ * user_regset_active_fn - type of @active function in &struct user_regset
+ * @target:    thread being examined
+ * @regset:    regset being examined
+ *
+ * Return -%ENODEV if not available on the hardware found.
+ * Return %0 if no interesting state in this thread.
+ * Return >%0 number of @size units of interesting state.
+ * Any get call fetching state beyond that number will
+ * see the default initialization state for this data,
+ * so a caller that knows what the default state is need
+ * not copy it all out.
+ * This call is optional; the pointer is %NULL if there
+ * is no inexpensive check to yield a value < @n.
+ */
+typedef int user_regset_active_fn(struct task_struct *target,
+                                 const struct user_regset *regset);
+
+/**
+ * user_regset_get_fn - type of @get function in &struct user_regset
+ * @target:    thread being examined
+ * @regset:    regset being examined
+ * @pos:       offset into the regset data to access, in bytes
+ * @count:     amount of data to copy, in bytes
+ * @kbuf:      if not %NULL, a kernel-space pointer to copy into
+ * @ubuf:      if @kbuf is %NULL, a user-space pointer to copy into
+ *
+ * Fetch register values.  Return %0 on success; -%EIO or -%ENODEV
+ * are usual failure returns.  The @pos and @count values are in
+ * bytes, but must be properly aligned.  If @kbuf is non-null, that
+ * buffer is used and @ubuf is ignored.  If @kbuf is %NULL, then
+ * ubuf gives a userland pointer to access directly, and an -%EFAULT
+ * return value is possible.
+ */
+typedef int user_regset_get_fn(struct task_struct *target,
+                              const struct user_regset *regset,
+                              unsigned int pos, unsigned int count,
+                              void *kbuf, void __user *ubuf);
+
+/**
+ * user_regset_set_fn - type of @set function in &struct user_regset
+ * @target:    thread being examined
+ * @regset:    regset being examined
+ * @pos:       offset into the regset data to access, in bytes
+ * @count:     amount of data to copy, in bytes
+ * @kbuf:      if not %NULL, a kernel-space pointer to copy from
+ * @ubuf:      if @kbuf is %NULL, a user-space pointer to copy from
+ *
+ * Store register values.  Return %0 on success; -%EIO or -%ENODEV
+ * are usual failure returns.  The @pos and @count values are in
+ * bytes, but must be properly aligned.  If @kbuf is non-null, that
+ * buffer is used and @ubuf is ignored.  If @kbuf is %NULL, then
+ * ubuf gives a userland pointer to access directly, and an -%EFAULT
+ * return value is possible.
+ */
+typedef int user_regset_set_fn(struct task_struct *target,
+                              const struct user_regset *regset,
+                              unsigned int pos, unsigned int count,
+                              const void *kbuf, const void __user *ubuf);
+
+/**
+ * user_regset_writeback_fn - type of @writeback function in &struct user_regset
+ * @target:    thread being examined
+ * @regset:    regset being examined
+ * @immediate: zero if writeback at completion of next context switch is OK
+ *
+ * This call is optional; usually the pointer is %NULL.  When
+ * provided, there is some user memory associated with this regset's
+ * hardware, such as memory backing cached register data on register
+ * window machines; the regset's data controls what user memory is
+ * used (e.g. via the stack pointer value).
+ *
+ * Write register data back to user memory.  If the @immediate flag
+ * is nonzero, it must be written to the user memory so uaccess or
+ * access_process_vm() can see it when this call returns; if zero,
+ * then it must be written back by the time the task completes a
+ * context switch (as synchronized with wait_task_inactive()).
+ * Return %0 on success or if there was nothing to do, -%EFAULT for
+ * a memory problem (bad stack pointer or whatever), or -%EIO for a
+ * hardware problem.
+ */
+typedef int user_regset_writeback_fn(struct task_struct *target,
+                                    const struct user_regset *regset,
+                                    int immediate);
+
+/**
+ * struct user_regset - accessible thread CPU state
+ * @n:                 Number of slots (registers).
+ * @size:              Size in bytes of a slot (register).
+ * @align:             Required alignment, in bytes.
+ * @bias:              Bias from natural indexing.
+ * @core_note_type:    ELF note @n_type value used in core dumps.
+ * @get:               Function to fetch values.
+ * @set:               Function to store values.
+ * @active:            Function to report if regset is active, or %NULL.
+ * @writeback:         Function to write data back to user memory, or %NULL.
+ *
+ * This data structure describes a machine resource we call a register set.
+ * This is part of the state of an individual thread, not necessarily
+ * actual CPU registers per se.  A register set consists of a number of
+ * similar slots, given by @n.  Each slot is @size bytes, and aligned to
+ * @align bytes (which is at least @size).
+ *
+ * These functions must be called only on the current thread or on a
+ * thread that is in %TASK_STOPPED or %TASK_TRACED state, that we are
+ * guaranteed will not be woken up and return to user mode, and that we
+ * have called wait_task_inactive() on.  (The target thread always might
+ * wake up for SIGKILL while these functions are working, in which case
+ * that thread's user_regset state might be scrambled.)
+ *
+ * The @pos argument must be aligned according to @align; the @count
+ * argument must be a multiple of @size.  These functions are not
+ * responsible for checking for invalid arguments.
+ *
+ * When there is a natural value to use as an index, @bias gives the
+ * difference between the natural index and the slot index for the
+ * register set.  For example, x86 GDT segment descriptors form a regset;
+ * the segment selector produces a natural index, but only a subset of
+ * that index space is available as a regset (the TLS slots); subtracting
+ * @bias from a segment selector index value computes the regset slot.
+ *
+ * If nonzero, @core_note_type gives the n_type field (NT_* value)
+ * of the core file note in which this regset's data appears.
+ * NT_PRSTATUS is a special case in that the regset data starts at
+ * offsetof(struct elf_prstatus, pr_reg) into the note data; that is
+ * part of the per-machine ELF formats userland knows about.  In
+ * other cases, the core file note contains exactly the whole regset
+ * (@n * @size) and nothing else.  The core file note is normally
+ * omitted when there is an @active function and it returns zero.
+ */
+struct user_regset {
+       user_regset_get_fn              *get;
+       user_regset_set_fn              *set;
+       user_regset_active_fn           *active;
+       user_regset_writeback_fn        *writeback;
+       unsigned int                    n;
+       unsigned int                    size;
+       unsigned int                    align;
+       unsigned int                    bias;
+       unsigned int                    core_note_type;
+};
+
+/**
+ * struct user_regset_view - available regsets
+ * @name:      Identifier, e.g. UTS_MACHINE string.
+ * @regsets:   Array of @n regsets available in this view.
+ * @n:         Number of elements in @regsets.
+ * @e_machine: ELF header @e_machine %EM_* value written in core dumps.
+ * @e_flags:   ELF header @e_flags value written in core dumps.
+ * @ei_osabi:  ELF header @e_ident[%EI_OSABI] value written in core dumps.
+ *
+ * A regset view is a collection of regsets (&struct user_regset,
+ * above).  This describes all the state of a thread that can be seen
+ * from a given architecture/ABI environment.  More than one view might
+ * refer to the same &struct user_regset, or more than one regset
+ * might refer to the same machine-specific state in the thread.  For
+ * example, a 32-bit thread's state could be examined from the 32-bit
+ * view or from the 64-bit view.  Either method reaches the same thread
+ * register state, doing appropriate widening or truncation.
+ */
+struct user_regset_view {
+       const char *name;
+       const struct user_regset *regsets;
+       unsigned int n;
+       u32 e_flags;
+       u16 e_machine;
+       u8 ei_osabi;
+};
+
+/*
+ * This is documented here rather than at the definition sites because its
+ * implementation is machine-dependent but its interface is universal.
+ */
+/**
+ * task_user_regset_view - Return the process's native regset view.
+ * @tsk: a thread of the process in question
+ *
+ * Return the &struct user_regset_view that is native for the given process.
+ * For example, what it would access when it called ptrace().
+ * Throughout the life of the process, this only changes at exec.
+ */
+const struct user_regset_view *task_user_regset_view(struct task_struct *tsk);
+
+
+/*
+ * These are helpers for writing regset get/set functions in arch code.
+ * Because @start_pos and @end_pos are always compile-time constants,
+ * these are inlined into very little code though they look large.
+ *
+ * Use one or more calls sequentially for each chunk of regset data stored
+ * contiguously in memory.  Call with constants for @start_pos and @end_pos,
+ * giving the range of byte positions in the regset that data corresponds
+ * to; @end_pos can be -1 if this chunk is at the end of the regset layout.
+ * Each call updates the arguments to point past its chunk.
+ */
+
+static inline int user_regset_copyout(unsigned int *pos, unsigned int *count,
+                                     void **kbuf,
+                                     void __user **ubuf, const void *data,
+                                     const int start_pos, const int end_pos)
+{
+       if (*count == 0)
+               return 0;
+       BUG_ON(*pos < start_pos);
+       if (end_pos < 0 || *pos < end_pos) {
+               unsigned int copy = (end_pos < 0 ? *count
+                                    : min(*count, end_pos - *pos));
+               data += *pos - start_pos;
+               if (*kbuf) {
+                       memcpy(*kbuf, data, copy);
+                       *kbuf += copy;
+               } else if (__copy_to_user(*ubuf, data, copy))
+                       return -EFAULT;
+               else
+                       *ubuf += copy;
+               *pos += copy;
+               *count -= copy;
+       }
+       return 0;
+}
+
+static inline int user_regset_copyin(unsigned int *pos, unsigned int *count,
+                                    const void **kbuf,
+                                    const void __user **ubuf, void *data,
+                                    const int start_pos, const int end_pos)
+{
+       if (*count == 0)
+               return 0;
+       BUG_ON(*pos < start_pos);
+       if (end_pos < 0 || *pos < end_pos) {
+               unsigned int copy = (end_pos < 0 ? *count
+                                    : min(*count, end_pos - *pos));
+               data += *pos - start_pos;
+               if (*kbuf) {
+                       memcpy(data, *kbuf, copy);
+                       *kbuf += copy;
+               } else if (__copy_from_user(data, *ubuf, copy))
+                       return -EFAULT;
+               else
+                       *ubuf += copy;
+               *pos += copy;
+               *count -= copy;
+       }
+       return 0;
+}
+
+/*
+ * These two parallel the two above, but for portions of a regset layout
+ * that always read as all-zero or for which writes are ignored.
+ */
+static inline int user_regset_copyout_zero(unsigned int *pos,
+                                          unsigned int *count,
+                                          void **kbuf, void __user **ubuf,
+                                          const int start_pos,
+                                          const int end_pos)
+{
+       if (*count == 0)
+               return 0;
+       BUG_ON(*pos < start_pos);
+       if (end_pos < 0 || *pos < end_pos) {
+               unsigned int copy = (end_pos < 0 ? *count
+                                    : min(*count, end_pos - *pos));
+               if (*kbuf) {
+                       memset(*kbuf, 0, copy);
+                       *kbuf += copy;
+               } else if (__clear_user(*ubuf, copy))
+                       return -EFAULT;
+               else
+                       *ubuf += copy;
+               *pos += copy;
+               *count -= copy;
+       }
+       return 0;
+}
+
+static inline int user_regset_copyin_ignore(unsigned int *pos,
+                                           unsigned int *count,
+                                           const void **kbuf,
+                                           const void __user **ubuf,
+                                           const int start_pos,
+                                           const int end_pos)
+{
+       if (*count == 0)
+               return 0;
+       BUG_ON(*pos < start_pos);
+       if (end_pos < 0 || *pos < end_pos) {
+               unsigned int copy = (end_pos < 0 ? *count
+                                    : min(*count, end_pos - *pos));
+               if (*kbuf)
+                       *kbuf += copy;
+               else
+                       *ubuf += copy;
+               *pos += copy;
+               *count -= copy;
+       }
+       return 0;
+}
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target:    thread to be examined
+ * @view:      &struct user_regset_view describing user thread machine state
+ * @setno:     index in @view->regsets
+ * @offset:    offset into the regset data, in bytes
+ * @size:      amount of data to copy, in bytes
+ * @data:      user-mode pointer to copy into
+ */
+static inline int copy_regset_to_user(struct task_struct *target,
+                                     const struct user_regset_view *view,
+                                     unsigned int setno,
+                                     unsigned int offset, unsigned int size,
+                                     void __user *data)
+{
+       const struct user_regset *regset = &view->regsets[setno];
+
+       if (!access_ok(VERIFY_WRITE, data, size))
+               return -EIO;
+
+       return regset->get(target, regset, offset, size, NULL, data);
+}
+
+/**
+ * copy_regset_from_user - store into thread's user_regset data from user memory
+ * @target:    thread to be examined
+ * @view:      &struct user_regset_view describing user thread machine state
+ * @setno:     index in @view->regsets
+ * @offset:    offset into the regset data, in bytes
+ * @size:      amount of data to copy, in bytes
+ * @data:      user-mode pointer to copy from
+ */
+static inline int copy_regset_from_user(struct task_struct *target,
+                                       const struct user_regset_view *view,
+                                       unsigned int setno,
+                                       unsigned int offset, unsigned int size,
+                                       const void __user *data)
+{
+       const struct user_regset *regset = &view->regsets[setno];
+
+       if (!access_ok(VERIFY_READ, data, size))
+               return -EIO;
+
+       return regset->set(target, regset, offset, size, NULL, data);
+}
+
+
+#endif /* <linux/regset.h> */
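
The copyout/copyin helpers above are meant to be chained over consecutive chunks of a regset, with an @end_pos of -1 marking the final chunk. A hedged sketch of an arch ->get() handler for a regset backed by one contiguous structure; example_regs_of() is a made-up accessor standing in for the arch's real source of register data:

/* Sketch only: a user_regset_get_fn built from user_regset_copyout()
 * for a regset whose data is one contiguous block.
 * example_regs_of() is hypothetical. */
static int example_regset_get(struct task_struct *target,
			      const struct user_regset *regset,
			      unsigned int pos, unsigned int count,
			      void *kbuf, void __user *ubuf)
{
	const void *regs = example_regs_of(target);	/* hypothetical */

	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
				   regs, 0, -1);
}
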
index 2d0546e884ea0fe443f88a74a01f39bbd5f86cb4..9d4797609aa5e368439671d603da5f7e7bedb979 100644 (file)
@@ -1922,23 +1922,16 @@ extern int cond_resched_softirq(void);
 
 /*
  * Does a critical section need to be broken due to another
- * task waiting?:
+ * task waiting?: (technically does not depend on CONFIG_PREEMPT,
+ * but a general need for low latency)
  */
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
-# define need_lockbreak(lock) ((lock)->break_lock)
-#else
-# define need_lockbreak(lock) 0
-#endif
-
-/*
- * Does a critical section need to be broken due to another
- * task waiting or preemption being signalled:
- */
-static inline int lock_need_resched(spinlock_t *lock)
+static inline int spin_needbreak(spinlock_t *lock)
 {
-       if (need_lockbreak(lock) || need_resched())
-               return 1;
+#ifdef CONFIG_PREEMPT
+       return spin_is_contended(lock);
+#else
        return 0;
+#endif
 }
 
 /*
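
spin_needbreak() replaces the old need_lockbreak()/lock_need_resched() pair; callers now combine it with need_resched() themselves. A hedged sketch of the usual lock-break pattern inside a long loop, with hypothetical work helpers:

/* Sketch only: dropping a contended lock voluntarily inside a long loop,
 * using spin_needbreak() as introduced above.  example_more_work() and
 * example_do_one_item() are hypothetical. */
static void example_long_loop(spinlock_t *lock)
{
	spin_lock(lock);
	while (example_more_work()) {
		example_do_one_item();
		if (need_resched() || spin_needbreak(lock)) {
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
		}
	}
	spin_unlock(lock);
}
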
index 6080f73fc85f0b5555f973b606984ecc77ddefe2..8c2cc4c0252640f7c4d5705b25d845417db65d19 100644 (file)
@@ -120,16 +120,35 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
 int selinux_string_to_sid(char *str, u32 *sid);
 
 /**
- *     selinux_relabel_packet_permission - check permission to relabel a packet
- *     @sid: ID value to be applied to network packet (via SECMARK, most likely)
+ *     selinux_secmark_relabel_packet_permission - secmark permission check
+ *     @sid: SECMARK ID value to be applied to network packet
  *
- *     Returns 0 if the current task is allowed to label packets with the
- *     supplied security ID.  Note that it is implicit that the packet is always
- *     being relabeled from the default unlabled value, and that the access
- *     control decision is made in the AVC.
+ *     Returns 0 if the current task is allowed to set the SECMARK label of
+ *     packets with the supplied security ID.  Note that it is implicit that
+ *     the packet is always being relabeled from the default unlabeled value,
+ *     and that the access control decision is made in the AVC.
  */
-int selinux_relabel_packet_permission(u32 sid);
+int selinux_secmark_relabel_packet_permission(u32 sid);
 
+/**
+ *     selinux_secmark_refcount_inc - increments the secmark use counter
+ *
+ *     SELinux keeps track of the current SECMARK targets in use so it knows
+ *     when to apply SECMARK label access checks to network packets.  This
+ *     function increments this reference count to indicate that a new SECMARK
+ *     target has been configured.
+ */
+void selinux_secmark_refcount_inc(void);
+
+/**
+ *     selinux_secmark_refcount_dec - decrements the secmark use counter
+ *
+ *     SELinux keeps track of the current SECMARK targets in use so it knows
+ *     when to apply SECMARK label access checks to network packets.  This
+ *     function decrements this reference count to indicate that one of the
+ *     existing SECMARK targets has been removed/flushed.
+ */
+void selinux_secmark_refcount_dec(void);
 #else
 
 static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -184,11 +203,21 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid)
        return 0;
 }
 
-static inline int selinux_relabel_packet_permission(u32 sid)
+static inline int selinux_secmark_relabel_packet_permission(u32 sid)
 {
        return 0;
 }
 
+static inline void selinux_secmark_refcount_inc(void)
+{
+       return;
+}
+
+static inline void selinux_secmark_refcount_dec(void)
+{
+       return;
+}
+
 #endif /* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
index c25e66bcecf335cfead012f4fd8feeff8834e5aa..55232ccf9cfdbb020765e89f99b22c7fc9a35af2 100644 (file)
@@ -78,6 +78,8 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
  */
 void smp_prepare_boot_cpu(void);
 
+extern unsigned int setup_max_cpus;
+
 #else /* !SMP */
 
 /*
index c376f3b36c8980c9b434cabc693e7ef9b4ab43f3..124449733c55a64595198aa8dc8eaf6a682189ca 100644 (file)
@@ -120,6 +120,12 @@ do {                                                               \
 
 #define spin_is_locked(lock)   __raw_spin_is_locked(&(lock)->raw_lock)
 
+#ifdef CONFIG_GENERIC_LOCKBREAK
+#define spin_is_contended(lock) ((lock)->break_lock)
+#else
+#define spin_is_contended(lock)        __raw_spin_is_contended(&(lock)->raw_lock)
+#endif
+
 /**
  * spin_unlock_wait - wait until the spinlock gets unlocked
  * @lock: the spinlock in question.
index f6a3a951b79eda3fc409913d10819361fe41faa3..68d88f71f1a2049c2f8be89c47aab85978f21054 100644 (file)
@@ -19,7 +19,7 @@
 
 typedef struct {
        raw_spinlock_t raw_lock;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+#ifdef CONFIG_GENERIC_LOCKBREAK
        unsigned int break_lock;
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -35,7 +35,7 @@ typedef struct {
 
 typedef struct {
        raw_rwlock_t raw_lock;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+#ifdef CONFIG_GENERIC_LOCKBREAK
        unsigned int break_lock;
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
index ea54c4c9a4ecd2c678b6f8d717b0b8bc98d50cfa..938234c4a996ba6e78521ffc60ecc197034fed9d 100644 (file)
@@ -64,6 +64,8 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 # define __raw_spin_trylock(lock)      ({ (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
 
+#define __raw_spin_is_contended(lock)  (((void)(lock), 0))
+
 #define __raw_read_can_lock(lock)      (((void)(lock), 1))
 #define __raw_write_can_lock(lock)     (((void)(lock), 1))
 
index d9d5c5ad826c759c4c0853b5a53e1b8bbcb46366..3e9addc741c1ad8a57b41c9172d3114b53faee1b 100644 (file)
@@ -46,6 +46,7 @@ struct rpc_clnt {
                                cl_autobind : 1;/* use getport() */
 
        struct rpc_rtt *        cl_rtt;         /* RTO estimator data */
+       const struct rpc_timeout *cl_timeout;   /* Timeout strategy */
 
        int                     cl_nodelen;     /* nodename length */
        char                    cl_nodename[UNX_MAXNODENAME];
@@ -54,6 +55,7 @@ struct rpc_clnt {
        struct dentry *         cl_dentry;      /* inode */
        struct rpc_clnt *       cl_parent;      /* Points to parent of clones */
        struct rpc_rtt          cl_rtt_default;
+       struct rpc_timeout      cl_timeout_default;
        struct rpc_program *    cl_program;
        char                    cl_inline_name[32];
 };
@@ -99,7 +101,7 @@ struct rpc_create_args {
        struct sockaddr         *address;
        size_t                  addrsize;
        struct sockaddr         *saddress;
-       struct rpc_timeout      *timeout;
+       const struct rpc_timeout *timeout;
        char                    *servername;
        struct rpc_program      *program;
        u32                     version;
@@ -123,11 +125,10 @@ void              rpc_shutdown_client(struct rpc_clnt *);
 void           rpc_release_client(struct rpc_clnt *);
 
 int            rpcb_register(u32, u32, int, unsigned short, int *);
-int            rpcb_getport_sync(struct sockaddr_in *, __u32, __u32, int);
+int            rpcb_getport_sync(struct sockaddr_in *, u32, u32, int);
 void           rpcb_getport_async(struct rpc_task *);
 
-void           rpc_call_setup(struct rpc_task *, struct rpc_message *, int);
-
+void           rpc_call_start(struct rpc_task *);
 int            rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg,
                               int flags, const struct rpc_call_ops *tk_ops,
                               void *calldata);
@@ -142,7 +143,7 @@ void                rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t         rpc_max_payload(struct rpc_clnt *);
 void           rpc_force_rebind(struct rpc_clnt *);
 size_t         rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
-char *         rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
+const char     *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
index c4beb5775111c03a2c0f49fccb908d546ad8fb8f..70df4f1d884740f776af077d6b86eac280a931fb 100644 (file)
@@ -152,5 +152,44 @@ typedef __be32     rpc_fraghdr;
  */
 #define RPCBIND_MAXNETIDLEN    (4u)
 
+/*
+ * Universal addresses are introduced in RFC 1833 and further spelled
+ * out in RFC 3530.  RPCBIND_MAXUADDRLEN defines a maximum byte length
+ * of a universal address for use in allocating buffers and character
+ * arrays.
+ *
+ * Quoting RFC 3530, section 2.2:
+ *
+ * For TCP over IPv4 and for UDP over IPv4, the format of r_addr is the
+ * US-ASCII string:
+ *
+ *     h1.h2.h3.h4.p1.p2
+ *
+ * The prefix, "h1.h2.h3.h4", is the standard textual form for
+ * representing an IPv4 address, which is always four octets long.
+ * Assuming big-endian ordering, h1, h2, h3, and h4, are respectively,
+ * the first through fourth octets each converted to ASCII-decimal.
+ * Assuming big-endian ordering, p1 and p2 are, respectively, the first
+ * and second octets each converted to ASCII-decimal.  For example, if a
+ * host, in big-endian order, has an address of 0x0A010307 and there is
+ * a service listening on, in big endian order, port 0x020F (decimal
+ * 527), then the complete universal address is "10.1.3.7.2.15".
+ *
+ * ...
+ *
+ * For TCP over IPv6 and for UDP over IPv6, the format of r_addr is the
+ * US-ASCII string:
+ *
+ *     x1:x2:x3:x4:x5:x6:x7:x8.p1.p2
+ *
+ * The suffix "p1.p2" is the service port, and is computed the same way
+ * as with universal addresses for TCP and UDP over IPv4.  The prefix,
+ * "x1:x2:x3:x4:x5:x6:x7:x8", is the standard textual form for
+ * representing an IPv6 address as defined in Section 2.2 of [RFC2373].
+ * Additionally, the two alternative forms specified in Section 2.2 of
+ * [RFC2373] are also acceptable.
+ */
+#define RPCBIND_MAXUADDRLEN    (56u)
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_MSGPROT_H_ */
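
In the quoted example the port split is just 527 = 2 * 256 + 15, i.e. the high and low octets of the port number in decimal. A hedged sketch of formatting an IPv4 universal address into a RPCBIND_MAXUADDRLEN-sized buffer:

/* Sketch only: building the "h1.h2.h3.h4.p1.p2" form described above
 * into a buffer of at least RPCBIND_MAXUADDRLEN bytes. */
static void example_format_uaddr(char *buf, const struct sockaddr_in *sin)
{
	const unsigned char *a = (const unsigned char *)&sin->sin_addr.s_addr;
	unsigned short port = ntohs(sin->sin_port);

	snprintf(buf, RPCBIND_MAXUADDRLEN, "%u.%u.%u.%u.%u.%u",
		 a[0], a[1], a[2], a[3], port >> 8, port & 0xff);
}
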
index 8ea077db0099cc636a2c9053e7cd1cc17abec2a0..ce3d1b13272901a7ac3cfa07c9b3042bebb6043e 100644 (file)
@@ -56,8 +56,6 @@ struct rpc_task {
        __u8                    tk_garb_retry;
        __u8                    tk_cred_retry;
 
-       unsigned long           tk_cookie;      /* Cookie for batching tasks */
-
        /*
         * timeout_fn   to be executed by timer bottom half
         * callback     to be executed after waking up
@@ -78,7 +76,6 @@ struct rpc_task {
        struct timer_list       tk_timer;       /* kernel timer */
        unsigned long           tk_timeout;     /* timeout for rpc_sleep() */
        unsigned short          tk_flags;       /* misc flags */
-       unsigned char           tk_priority : 2;/* Task priority */
        unsigned long           tk_runstate;    /* Task run status */
        struct workqueue_struct *tk_workqueue;  /* Normally rpciod, but could
                                                 * be any workqueue
@@ -94,6 +91,9 @@ struct rpc_task {
        unsigned long           tk_start;       /* RPC task init timestamp */
        long                    tk_rtt;         /* round-trip time (jiffies) */
 
+       pid_t                   tk_owner;       /* Process id for batching tasks */
+       unsigned char           tk_priority : 2;/* Task priority */
+
 #ifdef RPC_DEBUG
        unsigned short          tk_pid;         /* debugging aid */
 #endif
@@ -117,6 +117,15 @@ struct rpc_call_ops {
        void (*rpc_release)(void *);
 };
 
+struct rpc_task_setup {
+       struct rpc_task *task;
+       struct rpc_clnt *rpc_client;
+       const struct rpc_message *rpc_message;
+       const struct rpc_call_ops *callback_ops;
+       void *callback_data;
+       unsigned short flags;
+       signed char priority;
+};
 
 /*
  * RPC task flags
@@ -180,10 +189,10 @@ struct rpc_call_ops {
  * Note: if you change these, you must also change
  * the task initialization definitions below.
  */
-#define RPC_PRIORITY_LOW       0
-#define RPC_PRIORITY_NORMAL    1
-#define RPC_PRIORITY_HIGH      2
-#define RPC_NR_PRIORITY                (RPC_PRIORITY_HIGH+1)
+#define RPC_PRIORITY_LOW       (-1)
+#define RPC_PRIORITY_NORMAL    (0)
+#define RPC_PRIORITY_HIGH      (1)
+#define RPC_NR_PRIORITY                (1 + RPC_PRIORITY_HIGH - RPC_PRIORITY_LOW)
 
 /*
  * RPC synchronization objects
@@ -191,7 +200,7 @@ struct rpc_call_ops {
 struct rpc_wait_queue {
        spinlock_t              lock;
        struct list_head        tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */
-       unsigned long           cookie;                 /* cookie of last task serviced */
+       pid_t                   owner;                  /* process id of last task serviced */
        unsigned char           maxpriority;            /* maximum priority (0 if queue is not a priority queue) */
        unsigned char           priority;               /* current priority */
        unsigned char           count;                  /* # task groups remaining serviced so far */
@@ -208,41 +217,13 @@ struct rpc_wait_queue {
  * performance of NFS operations such as read/write.
  */
 #define RPC_BATCH_COUNT                        16
-
-#ifndef RPC_DEBUG
-# define RPC_WAITQ_INIT(var,qname) { \
-               .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
-               .tasks = { \
-                       [0] = LIST_HEAD_INIT(var.tasks[0]), \
-                       [1] = LIST_HEAD_INIT(var.tasks[1]), \
-                       [2] = LIST_HEAD_INIT(var.tasks[2]), \
-               }, \
-       }
-#else
-# define RPC_WAITQ_INIT(var,qname) { \
-               .lock = __SPIN_LOCK_UNLOCKED(var.lock), \
-               .tasks = { \
-                       [0] = LIST_HEAD_INIT(var.tasks[0]), \
-                       [1] = LIST_HEAD_INIT(var.tasks[1]), \
-                       [2] = LIST_HEAD_INIT(var.tasks[2]), \
-               }, \
-               .name = qname, \
-       }
-#endif
-# define RPC_WAITQ(var,qname)      struct rpc_wait_queue var = RPC_WAITQ_INIT(var,qname)
-
 #define RPC_IS_PRIORITY(q)             ((q)->maxpriority > 0)
 
 /*
  * Function prototypes
  */
-struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags,
-                               const struct rpc_call_ops *ops, void *data);
-struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
-                               const struct rpc_call_ops *ops, void *data);
-void           rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
-                               int flags, const struct rpc_call_ops *ops,
-                               void *data);
+struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
+struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
 void           rpc_put_task(struct rpc_task *);
 void           rpc_exit_task(struct rpc_task *);
 void           rpc_release_calldata(const struct rpc_call_ops *, void *);
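
rpc_new_task()/rpc_run_task() now take a single struct rpc_task_setup descriptor instead of the old argument list. A hedged sketch of an asynchronous caller; the message, callback ops, and the assumption that rpc_run_task() returns an ERR_PTR on failure are illustrative:

/* Sketch only: starting an async RPC through the new setup structure.
 * 'msg' and 'ops' are caller-provided placeholders; the ERR_PTR error
 * convention is assumed here. */
static int example_start_call(struct rpc_clnt *clnt,
			      const struct rpc_message *msg,
			      const struct rpc_call_ops *ops, void *data)
{
	struct rpc_task_setup setup = {
		.rpc_client	= clnt,
		.rpc_message	= msg,
		.callback_ops	= ops,
		.callback_data	= data,
		.flags		= RPC_TASK_ASYNC,
	};
	struct rpc_task *task = rpc_run_task(&setup);

	if (IS_ERR(task))
		return PTR_ERR(task);
	rpc_put_task(task);
	return 0;
}
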
index 30b17b3bc1a9b9eea991874d83a9a8f24799e4b8..b3ff9a815e6fe5cf8627335ea9c65f4105115b33 100644 (file)
@@ -120,7 +120,7 @@ struct rpc_xprt {
        struct kref             kref;           /* Reference count */
        struct rpc_xprt_ops *   ops;            /* transport methods */
 
-       struct rpc_timeout      timeout;        /* timeout parms */
+       const struct rpc_timeout *timeout;      /* timeout parms */
        struct sockaddr_storage addr;           /* server address */
        size_t                  addrlen;        /* size of server address */
        int                     prot;           /* IP protocol */
@@ -183,7 +183,7 @@ struct rpc_xprt {
                                        bklog_u;        /* backlog queue utilization */
        } stat;
 
-       char *                  address_strings[RPC_DISPLAY_MAX];
+       const char              *address_strings[RPC_DISPLAY_MAX];
 };
 
 struct xprt_create {
@@ -191,7 +191,6 @@ struct xprt_create {
        struct sockaddr *       srcaddr;        /* optional local address */
        struct sockaddr *       dstaddr;        /* remote peer address */
        size_t                  addrlen;
-       struct rpc_timeout *    timeout;        /* optional timeout parameters */
 };
 
 struct xprt_class {
@@ -202,11 +201,6 @@ struct xprt_class {
        char                    name[32];
 };
 
-/*
- * Transport operations used by ULPs
- */
-void                   xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr);
-
 /*
  * Generic internal transport functions
  */
@@ -245,7 +239,8 @@ void                        xprt_adjust_cwnd(struct rpc_task *task, int result);
 struct rpc_rqst *      xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
 void                   xprt_complete_rqst(struct rpc_task *task, int copied);
 void                   xprt_release_rqst_cong(struct rpc_task *task);
-void                   xprt_disconnect(struct rpc_xprt *xprt);
+void                   xprt_disconnect_done(struct rpc_xprt *xprt);
+void                   xprt_force_disconnect(struct rpc_xprt *xprt);
 
 /*
  * Reserved bit positions in xprt->state
@@ -256,6 +251,7 @@ void                        xprt_disconnect(struct rpc_xprt *xprt);
 #define XPRT_CLOSE_WAIT                (3)
 #define XPRT_BOUND             (4)
 #define XPRT_BINDING           (5)
+#define XPRT_CLOSING           (6)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
index 4360e0816956e536d929d7fffd23fedaab14fd46..40280df2a3dbc5d91d61291f8793f3d98b989cbc 100644 (file)
@@ -211,9 +211,6 @@ static inline int hibernate(void) { return -ENOSYS; }
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
 void restore_processor_state(void);
-struct saved_context;
-void __save_processor_state(struct saved_context *ctxt);
-void __restore_processor_state(struct saved_context *ctxt);
 
 /* kernel/power/main.c */
 extern struct blocking_notifier_head pm_chain_head;
index 4f3838adbb30626ec0e4df84a145e95f091b2aa6..2c3ce4c69b25411323756ad078b3cc7db425dad8 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/pagemap.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
index 9c4ad755d7e57cd856409589617ed31917abe6d5..dfbdfb9836f469ff78184c9b99825a569953c1fd 100644 (file)
@@ -42,27 +42,27 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 
 static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
 {
-       set_bit(flag,&ti->flags);
+       set_bit(flag, (unsigned long *)&ti->flags);
 }
 
 static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
 {
-       clear_bit(flag,&ti->flags);
+       clear_bit(flag, (unsigned long *)&ti->flags);
 }
 
 static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
 {
-       return test_and_set_bit(flag,&ti->flags);
+       return test_and_set_bit(flag, (unsigned long *)&ti->flags);
 }
 
 static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
 {
-       return test_and_clear_bit(flag,&ti->flags);
+       return test_and_clear_bit(flag, (unsigned long *)&ti->flags);
 }
 
 static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 {
-       return test_bit(flag,&ti->flags);
+       return test_bit(flag, (unsigned long *)&ti->flags);
 }
 
 #define set_thread_flag(flag) \
index f4a1395e05ff368d7bd9b807e6ed05e3859f50f7..0fadf95debe1f3a1b6415623926fc884fbc719ac 100644 (file)
@@ -51,8 +51,10 @@ struct tick_sched {
        unsigned long                   idle_jiffies;
        unsigned long                   idle_calls;
        unsigned long                   idle_sleeps;
+       int                             idle_active;
        ktime_t                         idle_entrytime;
        ktime_t                         idle_sleeptime;
+       ktime_t                         idle_lastupdate;
        ktime_t                         sleep_length;
        unsigned long                   last_jiffies;
        unsigned long                   next_jiffies;
@@ -103,6 +105,8 @@ extern void tick_nohz_stop_sched_tick(void);
 extern void tick_nohz_restart_sched_tick(void);
 extern void tick_nohz_update_jiffies(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
+extern void tick_nohz_stop_idle(int cpu);
+extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(void) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
@@ -113,6 +117,8 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 
        return len;
 }
+static inline void tick_nohz_stop_idle(int cpu) { }
+static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; }
 # endif /* !NO_HZ */
 
 #endif
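
The new prototypes let code outside the tick layer read the accumulated idle time; on !NO_HZ builds the stubs above simply report zero. A small sketch of a hypothetical consumer (helper name and printout invented for illustration):

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/tick.h>

/* Hypothetical helper: report how long this CPU has sat idle, in microseconds.
 * Assumes the caller runs pinned to a CPU or with preemption disabled. */
static u64 example_sample_idle_us(void)
{
	u64 last_update;	/* time of the last idle-accounting update, in us */
	u64 idle_us;
	int cpu = smp_processor_id();

	idle_us = get_cpu_idle_time_us(cpu, &last_update);
	pr_info("cpu%d: %llu us idle (last update %llu us)\n",
		cpu, (unsigned long long)idle_us,
		(unsigned long long)last_update);
	return idle_us;
}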
index 78cf899b440969a44049f7c052e2f9e167b47bd5..de0e71359ede60eadb302df0988300c97c790e3e 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/ktime.h>
 #include <linux/stddef.h>
 
-struct tvec_t_base_s;
+struct tvec_base;
 
 struct timer_list {
        struct list_head entry;
@@ -14,7 +14,7 @@ struct timer_list {
        void (*function)(unsigned long);
        unsigned long data;
 
-       struct tvec_t_base_s *base;
+       struct tvec_base *base;
 #ifdef CONFIG_TIMER_STATS
        void *start_site;
        char start_comm[16];
@@ -22,7 +22,7 @@ struct timer_list {
 #endif
 };
 
-extern struct tvec_t_base_s boot_tvec_bases;
+extern struct tvec_base boot_tvec_bases;
 
 #define TIMER_INITIALIZER(_function, _expires, _data) {                \
                .function = (_function),                        \
index 2e5b2f6f9fa08467b1eb15a50b3f6448c50aac80..b3213c7c53096d97b45b5e51ca842ba01284fefc 100644 (file)
  * NetLabel NETLINK protocol
  */
 
-#define NETLBL_PROTO_VERSION            1
+/* NetLabel NETLINK protocol version
+ *  1: initial version
+ *  2: added static labels for unlabeled connections
+ */
+#define NETLBL_PROTO_VERSION            2
 
 /* NetLabel NETLINK types/families */
 #define NETLBL_NLTYPE_NONE              0
@@ -105,17 +109,49 @@ struct netlbl_dom_map;
 /* Domain mapping operations */
 int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
 
-/* LSM security attributes */
+/*
+ * LSM security attributes
+ */
+
+/**
+ * struct netlbl_lsm_cache - NetLabel LSM security attribute cache
+ * @refcount: atomic reference counter
+ * @free: LSM supplied function to free the cache data
+ * @data: LSM supplied cache data
+ *
+ * Description:
+ * This structure is provided for LSMs which wish to make use of the NetLabel
+ * caching mechanism to store LSM specific data/attributes in the NetLabel
+ * cache.  If the LSM has to perform a lot of translation from the NetLabel
+ * security attributes into its own internal representation then the cache
+ * mechanism can provide a way to eliminate some or all of that translation
+ * overhead on a cache hit.
+ *
+ */
 struct netlbl_lsm_cache {
        atomic_t refcount;
        void (*free) (const void *data);
        void *data;
 };
-/* The catmap bitmap field MUST be a power of two in length and large
+
+/**
+ * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap
+ * @startbit: the value of the lowest order bit in the bitmap
+ * @bitmap: the category bitmap
+ * @next: pointer to the next bitmap "node" or NULL
+ *
+ * Description:
+ * This structure is used to represent category bitmaps.  Due to the large
+ * number of categories supported by most labeling protocols it is not
+ * practical to transfer a full bitmap internally so NetLabel adopts a sparse
+ * bitmap structure modeled after SELinux's ebitmap structure.
+ * The catmap bitmap field MUST be a power of two in length and large
  * enough to hold at least 240 bits.  Special care (i.e. check the code!)
  * should be used when changing these values as the LSM implementation
  * probably has functions which rely on the sizes of these types to speed
- * processing. */
+ * processing.
+ *
+ */
 #define NETLBL_CATMAP_MAPTYPE           u64
 #define NETLBL_CATMAP_MAPCNT            4
 #define NETLBL_CATMAP_MAPSIZE           (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
@@ -127,22 +163,48 @@ struct netlbl_lsm_secattr_catmap {
        NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
        struct netlbl_lsm_secattr_catmap *next;
 };
+
+/**
+ * struct netlbl_lsm_secattr - NetLabel LSM security attributes
+ * @flags: indicate which attributes are contained in this structure
+ * @type: indicate the NLTYPE of the attributes
+ * @domain: the NetLabel LSM domain
+ * @cache: NetLabel LSM specific cache
+ * @attr.mls: MLS sensitivity label
+ * @attr.mls.cat: MLS category bitmap
+ * @attr.mls.lvl: MLS sensitivity level
+ * @attr.secid: LSM specific secid token
+ *
+ * Description:
+ * This structure is used to pass security attributes between NetLabel and the
+ * LSM modules.  The flags field is used to specify which fields within the
+ * struct are valid and valid values can be created by bitwise OR'ing the
+ * NETLBL_SECATTR_* defines.  The domain field is typically set by the LSM to
+ * specify domain specific configuration settings and is not usually used by
+ * NetLabel itself when returning security attributes to the LSM.
+ *
+ */
 #define NETLBL_SECATTR_NONE             0x00000000
 #define NETLBL_SECATTR_DOMAIN           0x00000001
 #define NETLBL_SECATTR_CACHE            0x00000002
 #define NETLBL_SECATTR_MLS_LVL          0x00000004
 #define NETLBL_SECATTR_MLS_CAT          0x00000008
+#define NETLBL_SECATTR_SECID            0x00000010
 #define NETLBL_SECATTR_CACHEABLE        (NETLBL_SECATTR_MLS_LVL | \
-                                        NETLBL_SECATTR_MLS_CAT)
+                                        NETLBL_SECATTR_MLS_CAT | \
+                                        NETLBL_SECATTR_SECID)
 struct netlbl_lsm_secattr {
        u32 flags;
-
+       u32 type;
        char *domain;
-
-       u32 mls_lvl;
-       struct netlbl_lsm_secattr_catmap *mls_cat;
-
        struct netlbl_lsm_cache *cache;
+       union {
+               struct {
+                       struct netlbl_lsm_secattr_catmap *cat;
+                       u32 lvl;
+               } mls;
+               u32 secid;
+       } attr;
 };
 
 /*
@@ -231,10 +293,7 @@ static inline void netlbl_secattr_catmap_free(
  */
 static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
 {
-       secattr->flags = 0;
-       secattr->domain = NULL;
-       secattr->mls_cat = NULL;
-       secattr->cache = NULL;
+       memset(secattr, 0, sizeof(*secattr));
 }
 
 /**
@@ -248,11 +307,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
  */
 static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
 {
-       if (secattr->cache)
-               netlbl_secattr_cache_free(secattr->cache);
        kfree(secattr->domain);
-       if (secattr->mls_cat)
-               netlbl_secattr_catmap_free(secattr->mls_cat);
+       if (secattr->flags & NETLBL_SECATTR_CACHE)
+               netlbl_secattr_cache_free(secattr->cache);
+       if (secattr->flags & NETLBL_SECATTR_MLS_CAT)
+               netlbl_secattr_catmap_free(secattr->attr.mls.cat);
 }
 
 /**
@@ -300,7 +359,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap,
                                 gfp_t flags);
 
 /*
- * LSM protocol operations
+ * LSM protocol operations (NetLabel LSM/kernel API)
  */
 int netlbl_enabled(void);
 int netlbl_sock_setattr(struct sock *sk,
@@ -308,6 +367,7 @@ int netlbl_sock_setattr(struct sock *sk,
 int netlbl_sock_getattr(struct sock *sk,
                        struct netlbl_lsm_secattr *secattr);
 int netlbl_skbuff_getattr(const struct sk_buff *skb,
+                         u16 family,
                          struct netlbl_lsm_secattr *secattr);
 void netlbl_skbuff_err(struct sk_buff *skb, int error);
 
@@ -360,6 +420,7 @@ static inline int netlbl_sock_getattr(struct sock *sk,
        return -ENOSYS;
 }
 static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
+                                       u16 family,
                                        struct netlbl_lsm_secattr *secattr)
 {
        return -ENOSYS;
index 702fcfeb37f19c1d4ef5f7c03692d4e900b574cd..82251575a9b45e1a4e24475dc71503deef0223a8 100644 (file)
 
 #include <linux/types.h>
 
+/*
+ * The maximum number of SG segments that we will put inside a
+ * scatterlist (unless chaining is used). Should ideally fit inside a
+ * single page, to avoid a higher order allocation.  We could define this
+ * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order.  The
+ * minimum value is 32
+ */
+#define SCSI_MAX_SG_SEGMENTS   128
+
+/*
+ * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit
+ * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
+ */
+#ifdef ARCH_HAS_SG_CHAIN
+#define SCSI_MAX_SG_CHAIN_SEGMENTS     2048
+#else
+#define SCSI_MAX_SG_CHAIN_SEGMENTS     SCSI_MAX_SG_SEGMENTS
+#endif
+
 /*
  *     SCSI command lengths
  */
@@ -83,6 +102,7 @@ extern const unsigned char scsi_command_size[8];
 #define READ_TOC              0x43
 #define LOG_SELECT            0x4c
 #define LOG_SENSE             0x4d
+#define XDWRITEREAD_10        0x53
 #define MODE_SELECT_10        0x55
 #define RESERVE_10            0x56
 #define RELEASE_10            0x57
index a457fca66f6156c4b3629c06d10a71e4c453c5ee..de28aab820b0cca9cdd7261c4000ee965f821e1d 100644 (file)
@@ -2,15 +2,20 @@
 #define _SCSI_SCSI_CMND_H
 
 #include <linux/dma-mapping.h>
+#include <linux/blkdev.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/timer.h>
 #include <linux/scatterlist.h>
 
-struct request;
 struct Scsi_Host;
 struct scsi_device;
 
+struct scsi_data_buffer {
+       struct sg_table table;
+       unsigned length;
+       int resid;
+};
 
 /* embedded in scsi_cmnd */
 struct scsi_pointer {
@@ -61,15 +66,11 @@ struct scsi_cmnd {
        /* These elements define the operation we are about to perform */
 #define MAX_COMMAND_SIZE       16
        unsigned char cmnd[MAX_COMMAND_SIZE];
-       unsigned request_bufflen;       /* Actual request size */
 
        struct timer_list eh_timeout;   /* Used to time out the command. */
-       void *request_buffer;           /* Actual requested buffer */
 
        /* These elements define the operation we ultimately want to perform */
-       struct sg_table sg_table;
-       unsigned short use_sg;  /* Number of pieces of scatter-gather */
-
+       struct scsi_data_buffer sdb;
        unsigned underflow;     /* Return error if less than
                                   this amount is transferred */
 
@@ -79,10 +80,6 @@ struct scsi_cmnd {
                                   reconnects.   Probably == sector
                                   size */
 
-       int resid;              /* Number of bytes requested to be
-                                  transferred less actual number
-                                  transferred (0 if not supported) */
-
        struct request *request;        /* The command we are
                                           working on */
 
@@ -127,27 +124,55 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
                                 size_t *offset, size_t *len);
 extern void scsi_kunmap_atomic_sg(void *virt);
 
-extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
-extern void scsi_free_sgtable(struct scsi_cmnd *);
+extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask);
+extern void scsi_release_buffers(struct scsi_cmnd *cmd);
 
 extern int scsi_dma_map(struct scsi_cmnd *cmd);
 extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
 
-#define scsi_sg_count(cmd) ((cmd)->use_sg)
-#define scsi_sglist(cmd) ((cmd)->sg_table.sgl)
-#define scsi_bufflen(cmd) ((cmd)->request_bufflen)
+static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
+{
+       return cmd->sdb.table.nents;
+}
+
+static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
+{
+       return cmd->sdb.table.sgl;
+}
+
+static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
+{
+       return cmd->sdb.length;
+}
 
 static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid)
 {
-       cmd->resid = resid;
+       cmd->sdb.resid = resid;
 }
 
 static inline int scsi_get_resid(struct scsi_cmnd *cmd)
 {
-       return cmd->resid;
+       return cmd->sdb.resid;
 }
 
 #define scsi_for_each_sg(cmd, sg, nseg, __i)                   \
        for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
 
+static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd)
+{
+       return blk_bidi_rq(cmd->request) &&
+               (cmd->request->next_rq->special != NULL);
+}
+
+static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd)
+{
+       return scsi_bidi_cmnd(cmd) ?
+               cmd->request->next_rq->special : &cmd->sdb;
+}
+
+static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd)
+{
+       return &cmd->sdb;
+}
+
 #endif /* _SCSI_SCSI_CMND_H */
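
For low-level drivers the accessors above replace direct use of request_buffer, request_bufflen and use_sg. A minimal sketch of how a driver helper might walk the command's data buffer with the new interface (the helper itself is invented for illustration):

#include <linux/kernel.h>
#include <linux/scatterlist.h>
#include <scsi/scsi_cmnd.h>

/* Hypothetical LLD helper: count the bytes described by the command's sglist. */
static unsigned int example_count_bytes(struct scsi_cmnd *cmd)
{
	struct scatterlist *sg;
	unsigned int total = 0;
	int i;

	scsi_for_each_sg(cmd, sg, scsi_sg_count(cmd), i)
		total += sg->length;

	WARN_ON(total != scsi_bufflen(cmd));
	scsi_set_resid(cmd, 0);		/* everything transferred */
	return total;
}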
index d21b8913ceb3d709f5c711d3b9380f930cc874b9..25071d5d9bf813e95b49efa567fbed23c16f2236 100644 (file)
@@ -68,16 +68,15 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
 extern int scsi_reset_provider(struct scsi_device *, int);
 
 struct scsi_eh_save {
+       /* saved state */
        int result;
        enum dma_data_direction data_direction;
        unsigned char cmd_len;
        unsigned char cmnd[MAX_COMMAND_SIZE];
+       struct scsi_data_buffer sdb;
+       struct request *next_rq;
 
-       void *buffer;
-       unsigned bufflen;
-       unsigned short use_sg;
-       int resid;
-
+       /* new command support */
        struct scatterlist sense_sgl;
 };
 
index 0fd4746ee39d4437e788c37b9a7f59d9b65dc11a..5c58d594126a93bcb697e4c61b9a61c31aaa30b3 100644 (file)
@@ -39,9 +39,6 @@ struct blk_queue_tags;
 #define DISABLE_CLUSTERING 0
 #define ENABLE_CLUSTERING 1
 
-#define DISABLE_SG_CHAINING 0
-#define ENABLE_SG_CHAINING 1
-
 enum scsi_eh_timer_return {
        EH_NOT_HANDLED,
        EH_HANDLED,
@@ -136,9 +133,9 @@ struct scsi_host_template {
         * the done callback is invoked.
         *
         * This is called to inform the LLD to transfer
-        * cmd->request_bufflen bytes. The cmd->use_sg speciefies the
+        * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) specifies the
         * number of scatterlist entries in the command and
-        * cmd->request_buffer contains the scatterlist.
+        * scsi_sglist(cmd) returns the scatterlist.
         *
         * return values: see queuecommand
         *
@@ -445,15 +442,6 @@ struct scsi_host_template {
         */
        unsigned ordered_tag:1;
 
-       /*
-        * true if the low-level driver can support sg chaining. this
-        * will be removed eventually when all the drivers are
-        * converted to support sg chaining.
-        *
-        * Status: OBSOLETE
-        */
-       unsigned use_sg_chaining:1;
-
        /*
         * Countdown for host blocking with no commands outstanding
         */
@@ -598,7 +586,6 @@ struct Scsi_Host {
        unsigned unchecked_isa_dma:1;
        unsigned use_clustering:1;
        unsigned use_blk_tcq:1;
-       unsigned use_sg_chaining:1;
 
        /*
         * Host has requested that no further requests come through for the
index c0c8fcb278999ff50b9f92ce141f0e4281f4ab86..031ef22a971e551be070a15e61c4ad6f10ca225e 100644 (file)
@@ -156,16 +156,16 @@ static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
 
 static inline unsigned long long pte_val_ma(pte_t x)
 {
-       return ((unsigned long long)x.pte_high << 32) | x.pte_low;
+       return x.pte;
 }
 #define pmd_val_ma(v) ((v).pmd)
 #define pud_val_ma(v) ((v).pgd.pgd)
-#define __pte_ma(x)    ((pte_t) { .pte_low = (x), .pte_high = (x)>>32 } )
+#define __pte_ma(x)    ((pte_t) { .pte = (x) })
 #define __pmd_ma(x)    ((pmd_t) { (x) } )
 #else  /* !X86_PAE */
 #define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
 #define mfn_pte(pfn, prot)     __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define pte_val_ma(x)  ((x).pte_low)
+#define pte_val_ma(x)  ((x).pte)
 #define pmd_val_ma(v)  ((v).pud.pgd.pgd)
 #define __pte_ma(x)    ((pte_t) { (x) } )
 #endif /* CONFIG_X86_PAE */
index f287ca5862b9c7e9a8d0f1b1b4ba689a4af7e4e4..cb81ed116f62b3cadbde986e508081295c0446d9 100644 (file)
@@ -128,7 +128,7 @@ static char *ramdisk_execute_command;
 
 #ifdef CONFIG_SMP
 /* Setup configured maximum number of CPUs to activate */
-static unsigned int __initdata max_cpus = NR_CPUS;
+unsigned int __initdata setup_max_cpus = NR_CPUS;
 
 /*
  * Setup routine for controlling SMP activation
@@ -146,7 +146,7 @@ static inline void disable_ioapic_setup(void) {};
 
 static int __init nosmp(char *str)
 {
-       max_cpus = 0;
+       setup_max_cpus = 0;
        disable_ioapic_setup();
        return 0;
 }
@@ -155,8 +155,8 @@ early_param("nosmp", nosmp);
 
 static int __init maxcpus(char *str)
 {
-       get_option(&str, &max_cpus);
-       if (max_cpus == 0)
+       get_option(&str, &setup_max_cpus);
+       if (setup_max_cpus == 0)
                disable_ioapic_setup();
 
        return 0;
@@ -164,7 +164,7 @@ static int __init maxcpus(char *str)
 
 early_param("maxcpus", maxcpus);
 #else
-#define max_cpus NR_CPUS
+#define setup_max_cpus NR_CPUS
 #endif
 
 /*
@@ -318,6 +318,10 @@ static int __init unknown_bootoption(char *param, char *val)
        return 0;
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+int __read_mostly debug_pagealloc_enabled = 0;
+#endif
+
 static int __init init_setup(char *str)
 {
        unsigned int i;
@@ -363,7 +367,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
 #else
 
-#ifdef __GENERIC_PER_CPU
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -384,7 +388,7 @@ static void __init setup_per_cpu_areas(void)
                ptr += size;
        }
 }
-#endif /* !__GENERIC_PER_CPU */
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
 
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
@@ -393,7 +397,7 @@ static void __init smp_init(void)
 
        /* FIXME: This should be done in userspace --RR */
        for_each_present_cpu(cpu) {
-               if (num_online_cpus() >= max_cpus)
+               if (num_online_cpus() >= setup_max_cpus)
                        break;
                if (!cpu_online(cpu))
                        cpu_up(cpu);
@@ -401,7 +405,7 @@ static void __init smp_init(void)
 
        /* Any cleanup work */
        printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
-       smp_cpus_done(max_cpus);
+       smp_cpus_done(setup_max_cpus);
 }
 
 #endif
@@ -552,6 +556,7 @@ asmlinkage void __init start_kernel(void)
        preempt_disable();
        build_all_zonelists();
        page_alloc_init();
+       enable_debug_pagealloc();
        printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
        parse_early_param();
        parse_args("Booting kernel", static_command_line, __start___param,
@@ -824,7 +829,7 @@ static int __init kernel_init(void * unused)
        __set_special_pids(1, 1);
        cad_pid = task_pid(current);
 
-       smp_prepare_cpus(max_cpus);
+       smp_prepare_cpus(setup_max_cpus);
 
        do_pre_smp_initcalls();
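
The rename does not change the user-visible knobs: booting with, for example,

    maxcpus=2

still caps the number of CPUs brought online, and maxcpus=0 or nosmp additionally disables IO-APIC setup, exactly as the early_param handlers in the hunks above show.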
 
index 390d421462672af511c09b95f6649590bd1f83c4..8885627ea02138d58901d6c7d5d1cbda07883ccb 100644 (file)
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644 (file)
index 0000000..d1a7605
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Simple stack backtrace regression test module
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+static struct timer_list backtrace_timer;
+
+static void backtrace_test_timer(unsigned long data)
+{
+       printk("Testing a backtrace from irq context.\n");
+       printk("The following trace is a kernel self test and not a bug!\n");
+       dump_stack();
+}
+static int backtrace_regression_test(void)
+{
+       printk("====[ backtrace testing ]===========\n");
+       printk("Testing a backtrace from process context.\n");
+       printk("The following trace is a kernel self test and not a bug!\n");
+       dump_stack();
+
+       init_timer(&backtrace_timer);
+       backtrace_timer.function = backtrace_test_timer;
+       mod_timer(&backtrace_timer, jiffies + 10);
+
+       msleep(10);
+       printk("====[ end of backtrace testing ]====\n");
+       return 0;
+}
+
+static void exitf(void)
+{
+}
+
+module_init(backtrace_regression_test);
+module_exit(exitf);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
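
The test runs once when the module is loaded (it is built when CONFIG_BACKTRACE_SELF_TEST is selected, per the Makefile hunk above): one trace is printed from process context and, a few jiffies later, one from the timer interrupt. The empty exit handler only exists so the module can be unloaded again afterwards.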
index 314f5101d2b0e5f864f22fce25c4255c7305f789..05e0b6f4365bfc5fca5ef1cabd400c3894c013e3 100644 (file)
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
        destroy_context(mm);
        free_mm(mm);
 }
+EXPORT_SYMBOL_GPL(__mmdrop);
 
 /*
  * Decrement the use count and release all resources for an mm.
index 1f314221d534be091d809628b39f12ccba4e997e..438a01464287fc391b8da015fbe7c80a666c4928 100644 (file)
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
                        return;
                }
                printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
+#ifdef CONFIG_DEBUG_SHIRQ
+               dump_stack();
+#endif
                spin_unlock_irqrestore(&desc->lock, flags);
                return;
        }
index 50b81b98046a9af5a6016376c72549ef7faf49f1..c2f2ccb0549a18a8552e0c8a018a198e4c70195e 100644 (file)
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 
 #endif
 
+static int irq_spurious_read(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+       struct irq_desc *d = &irq_desc[(long) data];
+       return sprintf(page, "count %u\n"
+                            "unhandled %u\n"
+                            "last_unhandled %u ms\n",
+                       d->irq_count,
+                       d->irqs_unhandled,
+                       jiffies_to_msecs(d->last_unhandled));
+}
+
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 void register_irq_proc(unsigned int irq)
 {
        char name [MAX_NAMELEN];
+       struct proc_dir_entry *entry;
 
        if (!root_irq_dir ||
                (irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
 
 #ifdef CONFIG_SMP
        {
-               struct proc_dir_entry *entry;
-
                /* create /proc/irq/<irq>/smp_affinity */
                entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
 
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
                }
        }
 #endif
+
+       entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
+       if (entry) {
+               entry->data = (void *)(long)irq;
+               entry->read_proc = irq_spurious_read;
+       }
 }
 
 #undef MAX_NAMELEN
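
Reading the new per-IRQ file returns the three counters in the format of the sprintf above; for a hypothetical /proc/irq/16/spurious the output would look like:

    count 5231
    unhandled 2
    last_unhandled 1284 ms

(IRQ number and counter values invented for illustration.)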
index 32b161972fad2cda220f3f4698e9fb565017b6a1..a6b2bc831dd05be5ee9cbff347566bf410870c26 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
+#include <linux/moduleparam.h>
 
 static int irqfixup __read_mostly;
 
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
 }
 
 __setup("noirqdebug", noirqdebug_setup);
+module_param(noirqdebug, bool, 0644);
+MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
 
 static int __init irqfixup_setup(char *str)
 {
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
 }
 
 __setup("irqfixup", irqfixup_setup);
+module_param(irqfixup, int, 0644);
+MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
 
 static int __init irqpoll_setup(char *str)
 {
index e3a5d817ac9b0f07b514587a7ec03453461ab655..d0493eafea3ec5d86ece12d8861421b6bea13ec7 100644 (file)
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
 
+       if (!err)
+               init_test_probes();
        return err;
 }
 
index f6a4e721fd4907339dfbbca63d5081244697973f..bd60278ee7035945d1db4dfaed0f26b3c4a457e8 100644 (file)
@@ -430,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
 
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
 static int percpu_modinit(void)
 {
        pcpu_num_used = 2;
index da4d6bac270e90dcdc0dfc6052dbb49f935cc981..d9e90cfe3298ca09947aca7b562479c5fcacb6f8 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/kexec.h>
 #include <linux/debug_locks.h>
 #include <linux/random.h>
+#include <linux/kallsyms.h>
 
 int panic_on_oops;
 int tainted;
@@ -280,6 +281,13 @@ static int init_oops_id(void)
 }
 late_initcall(init_oops_id);
 
+static void print_oops_end_marker(void)
+{
+       init_oops_id();
+       printk(KERN_WARNING "---[ end trace %016llx ]---\n",
+               (unsigned long long)oops_id);
+}
+
 /*
  * Called when the architecture exits its oops handler, after printing
  * everything.
@@ -287,11 +295,26 @@ late_initcall(init_oops_id);
 void oops_exit(void)
 {
        do_oops_enter_exit();
-       init_oops_id();
-       printk(KERN_WARNING "---[ end trace %016llx ]---\n",
-               (unsigned long long)oops_id);
+       print_oops_end_marker();
 }
 
+#ifdef WANT_WARN_ON_SLOWPATH
+void warn_on_slowpath(const char *file, int line)
+{
+       char function[KSYM_SYMBOL_LEN];
+       unsigned long caller = (unsigned long) __builtin_return_address(0);
+       sprint_symbol(function, caller);
+
+       printk(KERN_WARNING "------------[ cut here ]------------\n");
+       printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+               line, function);
+       print_modules();
+       dump_stack();
+       print_oops_end_marker();
+}
+EXPORT_SYMBOL(warn_on_slowpath);
+#endif
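
warn_on_slowpath() is the out-of-line half of the generic WARN_ON(); on architectures that define WANT_WARN_ON_SLOWPATH the macro reduces to roughly the following (a sketch of the asm-generic definition, not part of this file):

#define WARN_ON(condition) ({					\
	int __ret_warn_on = !!(condition);			\
	if (unlikely(__ret_warn_on))				\
		warn_on_slowpath(__FILE__, __LINE__);		\
	unlikely(__ret_warn_on);				\
})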
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
  * Called when gcc's -fstack-protector feature is used, and
index 3b7c968d0ef994e8be14ef780835d96634761881..58bbec6841193d977f0a928fef090aee5edafa9b 100644 (file)
 
 #include <asm/uaccess.h>
 
+/*
+ * Architectures can override it:
+ */
+void __attribute__((weak)) early_printk(const char *fmt, ...)
+{
+}
+
 #define __LOG_BUF_LEN  (1 << CONFIG_LOG_BUF_SHIFT)
 
 /* printk's without a loglevel use this.. */
index c719bb9d79ab0dddd7a651d37b107b4368dfdd4b..e6e9b8be4b053c5f0074bdeca6f1a4ac9517e8fb 100644 (file)
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
        return error;
 }
 
+
+#ifdef PTRACE_SINGLESTEP
+#define is_singlestep(request)         ((request) == PTRACE_SINGLESTEP)
+#else
+#define is_singlestep(request)         0
+#endif
+
+#ifdef PTRACE_SINGLEBLOCK
+#define is_singleblock(request)                ((request) == PTRACE_SINGLEBLOCK)
+#else
+#define is_singleblock(request)                0
+#endif
+
+#ifdef PTRACE_SYSEMU
+#define is_sysemu_singlestep(request)  ((request) == PTRACE_SYSEMU_SINGLESTEP)
+#else
+#define is_sysemu_singlestep(request)  0
+#endif
+
+static int ptrace_resume(struct task_struct *child, long request, long data)
+{
+       if (!valid_signal(data))
+               return -EIO;
+
+       if (request == PTRACE_SYSCALL)
+               set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+       else
+               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+#ifdef TIF_SYSCALL_EMU
+       if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
+               set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+       else
+               clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+
+       if (is_singleblock(request)) {
+               if (unlikely(!arch_has_block_step()))
+                       return -EIO;
+               user_enable_block_step(child);
+       } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
+               if (unlikely(!arch_has_single_step()))
+                       return -EIO;
+               user_enable_single_step(child);
+       }
+       else
+               user_disable_single_step(child);
+
+       child->exit_code = data;
+       wake_up_process(child);
+
+       return 0;
+}
+
 int ptrace_request(struct task_struct *child, long request,
                   long addr, long data)
 {
        int ret = -EIO;
 
        switch (request) {
+       case PTRACE_PEEKTEXT:
+       case PTRACE_PEEKDATA:
+               return generic_ptrace_peekdata(child, addr, data);
+       case PTRACE_POKETEXT:
+       case PTRACE_POKEDATA:
+               return generic_ptrace_pokedata(child, addr, data);
+
 #ifdef PTRACE_OLDSETOPTIONS
        case PTRACE_OLDSETOPTIONS:
 #endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
        case PTRACE_DETACH:      /* detach a process that was attached. */
                ret = ptrace_detach(child, data);
                break;
+
+#ifdef PTRACE_SINGLESTEP
+       case PTRACE_SINGLESTEP:
+#endif
+#ifdef PTRACE_SINGLEBLOCK
+       case PTRACE_SINGLEBLOCK:
+#endif
+#ifdef PTRACE_SYSEMU
+       case PTRACE_SYSEMU:
+       case PTRACE_SYSEMU_SINGLESTEP:
+#endif
+       case PTRACE_SYSCALL:
+       case PTRACE_CONT:
+               return ptrace_resume(child, request, data);
+
+       case PTRACE_KILL:
+               if (child->exit_state)  /* already dead */
+                       return 0;
+               return ptrace_resume(child, request, SIGKILL);
+
        default:
                break;
        }
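
Folding the resume-style requests into the generic code does not change the user-visible interface; a tracer still drives them through ptrace(2). A minimal userspace sketch of the requests this path now serves (error handling omitted):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Hypothetical tracer: single-step a stopped child once, then let it run. */
static void example_trace_child(pid_t child)
{
	int status;

	waitpid(child, &status, 0);		/* child stopped after PTRACE_TRACEME + exec */
	ptrace(PTRACE_SINGLESTEP, child, 0, 0);	/* one instruction, no signal injected */
	waitpid(child, &status, 0);
	ptrace(PTRACE_CONT, child, 0, 0);	/* resume until the next signal */
}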
@@ -526,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
        copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
        return (copied == sizeof(data)) ? 0 : -EIO;
 }
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+int compat_ptrace_request(struct task_struct *child, compat_long_t request,
+                         compat_ulong_t addr, compat_ulong_t data)
+{
+       compat_ulong_t __user *datap = compat_ptr(data);
+       compat_ulong_t word;
+       int ret;
+
+       switch (request) {
+       case PTRACE_PEEKTEXT:
+       case PTRACE_PEEKDATA:
+               ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+               if (ret != sizeof(word))
+                       ret = -EIO;
+               else
+                       ret = put_user(word, datap);
+               break;
+
+       case PTRACE_POKETEXT:
+       case PTRACE_POKEDATA:
+               ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+               ret = (ret != sizeof(data) ? -EIO : 0);
+               break;
+
+       case PTRACE_GETEVENTMSG:
+               ret = put_user((compat_ulong_t) child->ptrace_message, datap);
+               break;
+
+       default:
+               ret = ptrace_request(child, request, addr, data);
+       }
+
+       return ret;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
+asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
+                                 compat_long_t addr, compat_long_t data)
+{
+       struct task_struct *child;
+       long ret;
+
+       /*
+        * This lock_kernel fixes a subtle race with suid exec
+        */
+       lock_kernel();
+       if (request == PTRACE_TRACEME) {
+               ret = ptrace_traceme();
+               goto out;
+       }
+
+       child = ptrace_get_task_struct(pid);
+       if (IS_ERR(child)) {
+               ret = PTR_ERR(child);
+               goto out;
+       }
+
+       if (request == PTRACE_ATTACH) {
+               ret = ptrace_attach(child);
+               /*
+                * Some architectures need to do book-keeping after
+                * a ptrace attach.
+                */
+               if (!ret)
+                       arch_ptrace_attach(child);
+               goto out_put_task_struct;
+       }
+
+       ret = ptrace_check_attach(child, request == PTRACE_KILL);
+       if (!ret)
+               ret = compat_arch_ptrace(child, request, addr, data);
+
+ out_put_task_struct:
+       put_task_struct(child);
+ out:
+       unlock_kernel();
+       return ret;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
+
+#endif /* CONFIG_COMPAT */
index 524285e46fa788e7e0a04612a611965b7650a2d5..ba4c88088f62c53ab8ec414e3d77da65b1cd9f67 100644 (file)
@@ -4945,19 +4945,15 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int cond_resched_lock(spinlock_t *lock)
 {
+       int resched = need_resched() && system_state == SYSTEM_RUNNING;
        int ret = 0;
 
-       if (need_lockbreak(lock)) {
+       if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
-               cpu_relax();
-               ret = 1;
-               spin_lock(lock);
-       }
-       if (need_resched() && system_state == SYSTEM_RUNNING) {
-               spin_release(&lock->dep_map, 1, _THIS_IP_);
-               _raw_spin_unlock(lock);
-               preempt_enable_no_resched();
-               __cond_resched();
+               if (resched && need_resched())
+                       __cond_resched();
+               else
+                       cpu_relax();
                ret = 1;
                spin_lock(lock);
        }
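
The rewritten helper drops and retakes the lock at most once per call, but callers keep the familiar shape: a long loop under a spinlock that offers to reschedule between items. A minimal sketch of such a caller (the queue handling itself is invented for illustration):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Hypothetical: drain a long work queue under a lock, yielding when needed. */
static void example_drain_queue(spinlock_t *lock, struct list_head *queue)
{
	spin_lock(lock);
	while (!list_empty(queue)) {
		struct list_head *item = queue->next;

		list_del(item);
		/* ... handle the removed item, still under the lock ... */

		/* Drops the lock if a reschedule or lock contention is
		 * pending, then retakes it before the next iteration. */
		cond_resched_lock(lock);
	}
	spin_unlock(lock);
}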
index afa4f781f924999086c3fe2f34530e1ee84b6496..bf49ce6f016bee66cb89bdb39dd416d4d20635ba 100644 (file)
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
                current->comm, task_pid_nr(current), signr);
 
 #if defined(__i386__) && !defined(__arch_um__)
-       printk("code at %08lx: ", regs->eip);
+       printk("code at %08lx: ", regs->ip);
        {
                int i;
                for (i = 0; i < 16; i++) {
                        unsigned char insn;
 
-                       __get_user(insn, (unsigned char *)(regs->eip + i));
+                       __get_user(insn, (unsigned char *)(regs->ip + i));
                        printk("%02x ", insn);
                }
        }
index bd89bc4eb0b9c0baa8070678d3cb099a8b40bb1c..d7837d45419eabbbead3886b104a697b5bcaf29c 100644 (file)
@@ -3,7 +3,9 @@
  *
  *     Copyright (C) 1992 Linus Torvalds
  *
- * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *     Distribute under GPLv2.
+ *
+ *     Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
  */
 
 #include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
  */
 void irq_enter(void)
 {
+#ifdef CONFIG_NO_HZ
+       int cpu = smp_processor_id();
+       if (idle_cpu(cpu) && !in_interrupt())
+               tick_nohz_stop_idle(cpu);
+#endif
        __irq_enter();
 #ifdef CONFIG_NO_HZ
-       if (idle_cpu(smp_processor_id()))
+       if (idle_cpu(cpu))
                tick_nohz_update_jiffies();
 #endif
 }
index cd72424c26625765085e0b2306345d474cb9476d..ae28c82451237a7ee0b0d8653701e7eed5437c6b 100644 (file)
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
  * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
  * not re-enabled during lock-acquire (which the preempt-spin-ops do):
  */
-#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
-       defined(CONFIG_DEBUG_LOCK_ALLOC)
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
index 4bc8e48434a7f9a8879d8ffee7797dc06f8b8107..357b68ba23ecd7975f93e4273787cf1041ca8df8 100644 (file)
@@ -53,6 +53,7 @@
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
 #include <asm/stacktrace.h>
+#include <asm/io.h>
 #endif
 
 static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -727,6 +728,14 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "io_delay_type",
+               .data           = &io_delay_type,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
 #endif
 #if defined(CONFIG_MMU)
        {
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644 (file)
index 0000000..88cdb10
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * test_kprobes.c - simple sanity test for *probes
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+
+#define div_factor 3
+
+static u32 rand1, preh_val, posth_val, jph_val;
+static int errors, handler_errors, num_tests;
+
+static noinline u32 kprobe_target(u32 value)
+{
+       /*
+        * gcc ignores noinline on some architectures unless we stuff
+        * sufficient lard into the function. The get_kprobe() here is
+        * just for that.
+        *
+        * NOTE: We aren't concerned about the correctness of get_kprobe()
+        * here; hence, this call is neither under !preempt nor with the
+        * kprobe_mutex held. This is fine(tm)
+        */
+       if (get_kprobe((void *)0xdeadbeef))
+               printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
+
+       return (value / div_factor);
+}
+
+static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       preh_val = (rand1 / div_factor);
+       return 0;
+}
+
+static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
+               unsigned long flags)
+{
+       if (preh_val != (rand1 / div_factor)) {
+               handler_errors++;
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "incorrect value in post_handler\n");
+       }
+       posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp = {
+       .symbol_name = "kprobe_target",
+       .pre_handler = kp_pre_handler,
+       .post_handler = kp_post_handler
+};
+
+static int test_kprobe(void)
+{
+       int ret;
+
+       ret = register_kprobe(&kp);
+       if (ret < 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "register_kprobe returned %d\n", ret);
+               return ret;
+       }
+
+       ret = kprobe_target(rand1);
+       unregister_kprobe(&kp);
+
+       if (preh_val == 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "kprobe pre_handler not called\n");
+               handler_errors++;
+       }
+
+       if (posth_val == 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "kprobe post_handler not called\n");
+               handler_errors++;
+       }
+
+       return 0;
+}
+
+static u32 j_kprobe_target(u32 value)
+{
+       if (value != rand1) {
+               handler_errors++;
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "incorrect value in jprobe handler\n");
+       }
+
+       jph_val = rand1;
+       jprobe_return();
+       return 0;
+}
+
+static struct jprobe jp = {
+       .entry          = j_kprobe_target,
+       .kp.symbol_name = "kprobe_target"
+};
+
+static int test_jprobe(void)
+{
+       int ret;
+
+       ret = register_jprobe(&jp);
+       if (ret < 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "register_jprobe returned %d\n", ret);
+               return ret;
+       }
+
+       ret = kprobe_target(rand1);
+       unregister_jprobe(&jp);
+       if (jph_val == 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "jprobe handler not called\n");
+               handler_errors++;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static u32 krph_val;
+
+static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+       unsigned long ret = regs_return_value(regs);
+
+       if (ret != (rand1 / div_factor)) {
+               handler_errors++;
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "incorrect value in kretprobe handler\n");
+       }
+
+       krph_val = (rand1 / div_factor);
+       return 0;
+}
+
+static struct kretprobe rp = {
+       .handler        = return_handler,
+       .kp.symbol_name = "kprobe_target"
+};
+
+static int test_kretprobe(void)
+{
+       int ret;
+
+       ret = register_kretprobe(&rp);
+       if (ret < 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "register_kretprobe returned %d\n", ret);
+               return ret;
+       }
+
+       ret = kprobe_target(rand1);
+       unregister_kretprobe(&rp);
+       if (krph_val == 0) {
+               printk(KERN_ERR "Kprobe smoke test failed: "
+                               "kretprobe handler not called\n");
+               handler_errors++;
+       }
+
+       return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+int init_test_probes(void)
+{
+       int ret;
+
+       do {
+               rand1 = random32();
+       } while (rand1 <= div_factor);
+
+       printk(KERN_INFO "Kprobe smoke test started\n");
+       num_tests++;
+       ret = test_kprobe();
+       if (ret < 0)
+               errors++;
+
+       num_tests++;
+       ret = test_jprobe();
+       if (ret < 0)
+               errors++;
+
+#ifdef CONFIG_KRETPROBES
+       num_tests++;
+       ret = test_kretprobe();
+       if (ret < 0)
+               errors++;
+#endif /* CONFIG_KRETPROBES */
+
+       if (errors)
+               printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
+                               "%d tests failed\n", errors, num_tests);
+       else if (handler_errors)
+               printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
+                               "running handlers\n", handler_errors);
+       else
+               printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+
+       return 0;
+}
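
init_test_probes() is the entry point the kprobes hunk above now calls once the earlier initialisation steps have succeeded, so the smoke test runs a single time at boot whenever CONFIG_KPROBES_SANITY_TEST is enabled (see the kernel/Makefile hunk).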
index 5fb139fef9fa78821e0f85d640d211b1fbb0f5c7..3e59fce6dd432e5b2f85c660c4ebaf40f70fd921 100644 (file)
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
 {
        u64 clc = ((u64) latch << evt->shift);
 
+       if (unlikely(!evt->mult)) {
+               evt->mult = 1;
+               WARN_ON(1);
+       }
+
        do_div(clc, evt->mult);
        if (clc < 1000)
                clc = 1000;
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+       /*
+        * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+        * on it, so fix it up and emit a warning:
+        */
+       if (unlikely(!dev->mult)) {
+               dev->mult = 1;
+               WARN_ON(1);
+       }
 
        spin_lock(&clockevents_lock);
 
index 8d6125ad2cf0cf3c76368e582d9750066d176232..6e9259a5d5010ac7e869f4f567fd3c0618002943 100644 (file)
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
        }
 
        if (!list_empty(&watchdog_list)) {
-               __mod_timer(&watchdog_timer,
-                           watchdog_timer.expires + WATCHDOG_INTERVAL);
+               /* Cycle through CPUs to check if the CPUs stay synchronized to
+                * each other. */
+               int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+               if (next_cpu >= NR_CPUS)
+                       next_cpu = first_cpu(cpu_online_map);
+               watchdog_timer.expires += WATCHDOG_INTERVAL;
+               add_timer_on(&watchdog_timer, next_cpu);
        }
        spin_unlock(&watchdog_lock);
 }
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
                if (!started && watchdog) {
                        watchdog_last = watchdog->read();
                        watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-                       add_timer(&watchdog_timer);
+                       add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
                }
        } else {
                if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
                        if (watchdog)
                                del_timer(&watchdog_timer);
                        watchdog = cs;
-                       init_timer(&watchdog_timer);
+                       init_timer_deferrable(&watchdog_timer);
                        watchdog_timer.function = clocksource_watchdog;
 
                        /* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
                                watchdog_last = watchdog->read();
                                watchdog_timer.expires =
                                        jiffies + WATCHDOG_INTERVAL;
-                               add_timer(&watchdog_timer);
+                               add_timer_on(&watchdog_timer,
+                                               first_cpu(cpu_online_map));
                        }
                }
        }
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
        spin_unlock_irqrestore(&clocksource_lock, flags);
 }
 
+/**
+ * clocksource_unregister - remove a registered clocksource
+ */
+void clocksource_unregister(struct clocksource *cs)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&clocksource_lock, flags);
+       list_del(&cs->list);
+       if (clocksource_override == cs)
+               clocksource_override = NULL;
+       next_clocksource = select_clocksource();
+       spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
 #ifdef CONFIG_SYSFS
 /**
  * sysfs_show_current_clocksources - sysfs interface for current clocksource
index 5b86698faa0b08cc1411331c71aaf0ce466ee106..e1bd50cbbf5d232896b4ae470bf005adb84e543d 100644 (file)
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 /*
  * Broadcast the event to the cpus, which are set in the mask
  */
-int tick_do_broadcast(cpumask_t mask)
+static void tick_do_broadcast(cpumask_t mask)
 {
-       int ret = 0, cpu = smp_processor_id();
+       int cpu = smp_processor_id();
        struct tick_device *td;
 
        /*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
                cpu_clear(cpu, mask);
                td = &per_cpu(tick_cpu_device, cpu);
                td->evtdev->event_handler(td->evtdev);
-               ret = 1;
        }
 
        if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
                cpu = first_cpu(mask);
                td = &per_cpu(tick_cpu_device, cpu);
                td->evtdev->broadcast(mask);
-               ret = 1;
        }
-       return ret;
 }
 
 /*
index bb13f2724905de18829636053518cee424e1191d..f13f2b7f4fd46bda983d7138edce267b1a9e0c91 100644 (file)
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  * Broadcasting support
  */
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_do_broadcast(cpumask_t mask);
-
 extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
 extern int tick_check_broadcast_device(struct clock_event_device *dev);
 extern int tick_is_broadcast_device(struct clock_event_device *dev);
index 1a21b6fdb674b878d566789bac2c799da7593adf..63f24b55069551f4331f09c090657b66be139f7e 100644 (file)
@@ -9,7 +9,7 @@
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ *  Distribute under GPLv2.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
        local_irq_restore(flags);
 }
 
+void tick_nohz_stop_idle(int cpu)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+       if (ts->idle_active) {
+               ktime_t now, delta;
+               now = ktime_get();
+               delta = ktime_sub(now, ts->idle_entrytime);
+               ts->idle_lastupdate = now;
+               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+               ts->idle_active = 0;
+       }
+}
+
+static ktime_t tick_nohz_start_idle(int cpu)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+       ktime_t now, delta;
+
+       now = ktime_get();
+       if (ts->idle_active) {
+               delta = ktime_sub(now, ts->idle_entrytime);
+               ts->idle_lastupdate = now;
+               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+       }
+       ts->idle_entrytime = now;
+       ts->idle_active = 1;
+       return now;
+}
+
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+       *last_update_time = ktime_to_us(ts->idle_lastupdate);
+       return ktime_to_us(ts->idle_sleeptime);
+}
+
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
  *
@@ -155,13 +193,14 @@ void tick_nohz_stop_sched_tick(void)
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
        unsigned long rt_jiffies;
        struct tick_sched *ts;
-       ktime_t last_update, expires, now, delta;
+       ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
        int cpu;
 
        local_irq_save(flags);
 
        cpu = smp_processor_id();
+       now = tick_nohz_start_idle(cpu);
        ts = &per_cpu(tick_cpu_sched, cpu);
 
        /*
@@ -193,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
                }
        }
 
-       now = ktime_get();
-       /*
-        * When called from irq_exit we need to account the idle sleep time
-        * correctly.
-        */
-       if (ts->tick_stopped) {
-               delta = ktime_sub(now, ts->idle_entrytime);
-               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-       }
-
-       ts->idle_entrytime = now;
        ts->idle_calls++;
-
        /* Read jiffies and the time when jiffies were updated last */
        do {
                seq = read_seqbegin(&xtime_lock);
@@ -296,7 +323,7 @@ void tick_nohz_stop_sched_tick(void)
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                goto out;
-               } else if(!tick_program_event(expires, 0))
+               } else if (!tick_program_event(expires, 0))
                                goto out;
                /*
                 * We are past the event already. So we crossed a
@@ -337,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long ticks;
-       ktime_t now, delta;
+       ktime_t now;
 
-       if (!ts->tick_stopped)
+       local_irq_disable();
+       tick_nohz_stop_idle(cpu);
+
+       if (!ts->tick_stopped) {
+               local_irq_enable();
                return;
+       }
 
        /* Update jiffies first */
-       now = ktime_get();
-
-       local_irq_disable();
        select_nohz_load_balancer(0);
+       now = ktime_get();
        tick_do_update_jiffies64(now);
        cpu_clear(cpu, nohz_cpu_mask);
 
-       /* Account the idle time */
-       delta = ktime_sub(now, ts->idle_entrytime);
-       ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-
        /*
         * We stopped the tick in idle. Update process times would miss the
         * time we slept as update_process_times does only a 1 tick
@@ -507,7 +533,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
  */
 #ifdef CONFIG_HIGH_RES_TIMERS
 /*
- * We rearm the timer until we get disabled by the idle code
+ * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled and timer->base->cpu_base->lock held.
  */
 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
index ab46ae8c062b0b971aaae2fc28071d0cc01260ed..092a2366b5a905d2fb8347106c08176a7ed81114 100644 (file)
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
 }
 
 /**
- * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * getnstimeofday - Returns the time of day in a timespec
  * @ts:                pointer to the timespec to be set
  *
- * Returns the time of day in a timespec. Used by
- * do_gettimeofday() and get_realtime_clock_ts().
+ * Returns the time of day in a timespec.
  */
-static inline void __get_realtime_clock_ts(struct timespec *ts)
+void getnstimeofday(struct timespec *ts)
 {
        unsigned long seq;
        s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
        timespec_add_ns(ts, nsecs);
 }
 
-/**
- * getnstimeofday - Returns the time of day in a timespec
- * @ts:                pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void getnstimeofday(struct timespec *ts)
-{
-       __get_realtime_clock_ts(ts);
-}
-
 EXPORT_SYMBOL(getnstimeofday);
 
 /**
  * do_gettimeofday - Returns the time of day in a timeval
  * @tv:                pointer to the timeval to be set
  *
- * NOTE: Users should be converted to using get_realtime_clock_ts()
+ * NOTE: Users should be converted to using getnstimeofday()
  */
 void do_gettimeofday(struct timeval *tv)
 {
        struct timespec now;
 
-       __get_realtime_clock_ts(&now);
+       getnstimeofday(&now);
        tv->tv_sec = now.tv_sec;
        tv->tv_usec = now.tv_nsec/1000;
 }
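With __get_realtime_clock_ts() folded away, getnstimeofday() becomes the real implementation and do_gettimeofday() merely converts its nanosecond timespec into a microsecond timeval. A tiny user-space sketch of that conversion, where clock_gettime(CLOCK_REALTIME) stands in for the kernel's getnstimeofday():

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

/* Sketch of the timespec -> timeval step do_gettimeofday() performs after
 * this patch: take the nanosecond result and truncate to microseconds. */
static void timespec_to_timeval(const struct timespec *ts, struct timeval *tv)
{
	tv->tv_sec = ts->tv_sec;
	tv->tv_usec = ts->tv_nsec / 1000;
}

int main(void)
{
	struct timespec now;
	struct timeval tv;

	clock_gettime(CLOCK_REALTIME, &now);   /* user-space analogue of getnstimeofday() */
	timespec_to_timeval(&now, &tv);
	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}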
@@ -198,7 +186,8 @@ static void change_clocksource(void)
 
        clock->error = 0;
        clock->xtime_nsec = 0;
-       clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+       clocksource_calculate_interval(clock,
+               (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
 
        tick_clock_notify();
 
@@ -255,7 +244,8 @@ void __init timekeeping_init(void)
        ntp_clear();
 
        clock = clocksource_get_next();
-       clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+       clocksource_calculate_interval(clock,
+               (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
        clock->cycle_last = clocksource_read(clock);
 
        xtime.tv_sec = sec;
index c36bb7ed0301e198753dc29eef982ddf02ed7faf..417da8c5bc7248018d915f40d0aedfa579679e8a 100644 (file)
@@ -26,7 +26,7 @@
  * the pid and cmdline from the owner process if applicable.
  *
  * Start/stop data collection:
- * # echo 1[0] >/proc/timer_stats
+ * # echo [1|0] >/proc/timer_stats
  *
  * Display the information collected so far:
  * # cat /proc/timer_stats
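The corrected comment documents the /proc/timer_stats interface: write 1 or 0 to start or stop collection, then read the file to dump what was gathered. A small C driver of that workflow, assuming a kernel built with CONFIG_TIMER_STATS and sufficient privileges:

#include <stdio.h>

/* Toggle timer_stats collection and dump the result.
 * Requires CONFIG_TIMER_STATS and (typically) root privileges. */
int main(void)
{
	FILE *f = fopen("/proc/timer_stats", "w");
	char line[256];

	if (f == NULL) {
		perror("/proc/timer_stats");
		return 1;
	}
	fputs("1\n", f);                       /* start data collection */
	fclose(f);

	f = fopen("/proc/timer_stats", "r");   /* display what was collected */
	if (f == NULL) {
		perror("/proc/timer_stats");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}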
index f739dfb539cec7dd1f6c737b9b6fccfe017aebf8..23f7ead78faeae25b07ad78819f4cadefdd3b4b6 100644 (file)
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
-typedef struct tvec_s {
+struct tvec {
        struct list_head vec[TVN_SIZE];
-} tvec_t;
+};
 
-typedef struct tvec_root_s {
+struct tvec_root {
        struct list_head vec[TVR_SIZE];
-} tvec_root_t;
+};
 
-struct tvec_t_base_s {
+struct tvec_base {
        spinlock_t lock;
        struct timer_list *running_timer;
        unsigned long timer_jiffies;
-       tvec_root_t tv1;
-       tvec_t tv2;
-       tvec_t tv3;
-       tvec_t tv4;
-       tvec_t tv5;
+       struct tvec_root tv1;
+       struct tvec tv2;
+       struct tvec tv3;
+       struct tvec tv4;
+       struct tvec tv5;
 } ____cacheline_aligned;
 
-typedef struct tvec_t_base_s tvec_base_t;
-
-tvec_base_t boot_tvec_bases;
+struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
 /*
- * Note that all tvec_bases is 2 byte aligned and lower bit of
+ * Note that all tvec_bases are 2 byte aligned and lower bit of
  * base in timer_list is guaranteed to be zero. Use the LSB for
  * the new flag to indicate whether the timer is deferrable
  */
 #define TBASE_DEFERRABLE_FLAG          (0x1)
 
 /* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
+static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
        return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
 }
 
-static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
 {
-       return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+       return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
 }
 
 static inline void timer_set_deferrable(struct timer_list *timer)
 {
-       timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
+       timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
                                       TBASE_DEFERRABLE_FLAG));
 }
 
 static inline void
-timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
+timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
 {
-       timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+       timer->base = (struct tvec_base *)((unsigned long)(new_base) |
                                      tbase_get_deferrable(timer->base));
 }
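Beyond replacing the tvec_*_t typedefs with plain struct names, this hunk keeps the existing pointer-tagging trick described in the comment above: every struct tvec_base is at least 2-byte aligned, so the low bit of timer->base is always zero and can carry the deferrable flag. A stand-alone sketch of that idiom (the names here are illustrative, not the kernel's):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEFERRABLE_FLAG 0x1UL

struct base {
	int dummy;
} __attribute__((aligned(2)));     /* alignment guarantees a free low bit */

/* Mask the flag off to recover the real pointer (cf. tbase_get_base()). */
static struct base *get_base(struct base *tagged)
{
	return (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);
}

/* Test the flag (cf. tbase_get_deferrable()). */
static unsigned int get_deferrable(struct base *tagged)
{
	return (uintptr_t)tagged & DEFERRABLE_FLAG;
}

/* Set the flag without losing the pointer (cf. timer_set_deferrable()). */
static struct base *set_deferrable(struct base *b)
{
	return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
}

int main(void)
{
	struct base *b = malloc(sizeof(*b));
	struct base *tagged = set_deferrable(b);

	assert(get_deferrable(tagged));
	assert(get_base(tagged) == b);
	printf("flag=%u, pointer restored: %s\n",
	       get_deferrable(tagged), get_base(tagged) == b ? "yes" : "no");
	free(b);
	return 0;
}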
 
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
 EXPORT_SYMBOL_GPL(round_jiffies_relative);
 
 
-static inline void set_running_timer(tvec_base_t *base,
+static inline void set_running_timer(struct tvec_base *base,
                                        struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
 #endif
 }
 
-static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
        unsigned long expires = timer->expires;
        unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  */
-static tvec_base_t *lock_timer_base(struct timer_list *timer,
+static struct tvec_base *lock_timer_base(struct timer_list *timer,
                                        unsigned long *flags)
        __acquires(timer->base->lock)
 {
-       tvec_base_t *base;
+       struct tvec_base *base;
 
        for (;;) {
-               tvec_base_t *prelock_base = timer->base;
+               struct tvec_base *prelock_base = timer->base;
                base = tbase_get_base(prelock_base);
                if (likely(base != NULL)) {
                        spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
 
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-       tvec_base_t *base, *new_base;
+       struct tvec_base *base, *new_base;
        unsigned long flags;
        int ret = 0;
 
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-       tvec_base_t *base = per_cpu(tvec_bases, cpu);
+       struct tvec_base *base = per_cpu(tvec_bases, cpu);
        unsigned long flags;
 
        timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
-       tvec_base_t *base;
+       struct tvec_base *base;
        unsigned long flags;
        int ret = 0;
 
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-       tvec_base_t *base;
+       struct tvec_base *base;
        unsigned long flags;
        int ret = -1;
 
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(tvec_base_t *base, tvec_t *tv, int index)
+static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 {
        /* cascade all the timers from tv up one level */
        struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(tvec_base_t *base)
+static inline void __run_timers(struct tvec_base *base)
 {
        struct timer_list *timer;
 
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
                                int preempt_count = preempt_count();
                                fn(data);
                                if (preempt_count != preempt_count()) {
-                                       printk(KERN_WARNING "huh, entered %p "
+                                       printk(KERN_ERR "huh, entered %p "
                                               "with preempt_count %08x, exited"
                                               " with %08x?\n",
                                               fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
  * is used on S/390 to stop all activity when a cpu is idle.
  * This function needs to be called with interrupts disabled.
  */
-static unsigned long __next_timer_interrupt(tvec_base_t *base)
+static unsigned long __next_timer_interrupt(struct tvec_base *base)
 {
        unsigned long timer_jiffies = base->timer_jiffies;
        unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
        int index, slot, array, found = 0;
        struct timer_list *nte;
-       tvec_t *varray[4];
+       struct tvec *varray[4];
 
        /* Look for timer events in tv1. */
        index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
        varray[3] = &base->tv5;
 
        for (array = 0; array < 4; array++) {
-               tvec_t *varp = varray[array];
+               struct tvec *varp = varray[array];
 
                index = slot = timer_jiffies & TVN_MASK;
                do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
  */
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
-       tvec_base_t *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __get_cpu_var(tvec_bases);
        unsigned long expires;
 
        spin_lock(&base->lock);
@@ -894,7 +892,7 @@ static inline void calc_load(unsigned long ticks)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-       tvec_base_t *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __get_cpu_var(tvec_bases);
 
        hrtimer_run_pending();
 
@@ -1223,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS];
 static int __cpuinit init_timers_cpu(int cpu)
 {
        int j;
-       tvec_base_t *base;
+       struct tvec_base *base;
        static char __cpuinitdata tvec_base_done[NR_CPUS];
 
        if (!tvec_base_done[cpu]) {
@@ -1278,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
 {
        struct timer_list *timer;
 
@@ -1292,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 
 static void __cpuinit migrate_timers(int cpu)
 {
-       tvec_base_t *old_base;
-       tvec_base_t *new_base;
+       struct tvec_base *old_base;
+       struct tvec_base *new_base;
        int i;
 
        BUG_ON(cpu_online(cpu));
index c4ecb2994ba3855110cdb596b6500a99b2404b24..89f4035b526c147de3115794fa2cbd55b25db8ab 100644 (file)
@@ -494,6 +494,30 @@ config RCU_TORTURE_TEST
          Say M if you want the RCU torture tests to build as a module.
          Say N if you are unsure.
 
+config KPROBES_SANITY_TEST
+       bool "Kprobes sanity tests"
+       depends on DEBUG_KERNEL
+       depends on KPROBES
+       default n
+       help
+         This option provides for testing basic kprobes functionality on
+         boot. A sample kprobe, jprobe and kretprobe are inserted and
+         verified for functionality.
+
+         Say N if you are unsure.
+
+config BACKTRACE_SELF_TEST
+       tristate "Self test for the backtrace code"
+       depends on DEBUG_KERNEL
+       default n
+       help
+         This option provides a kernel module that can be used to test
+         the kernel stack backtrace code. This option is not useful
+         for distributions or general kernels, but only for kernel
+         developers working on architecture code.
+
+         Say N if you are unsure.
+
 config LKDTM
        tristate "Linux Kernel Dump Test Tool Module"
        depends on DEBUG_KERNEL
@@ -562,5 +586,33 @@ config LATENCYTOP
          Enable this option if you want to use the LatencyTOP tool
          to find out which userspace is blocking on what kernel operations.
 
+config PROVIDE_OHCI1394_DMA_INIT
+       bool "Provide code for enabling DMA over FireWire early on boot"
+       depends on PCI && X86
+       help
+         If you want to debug problems which hang or crash the kernel early
+         on boot and the crashing machine has a FireWire port, you can use
+         this feature to remotely access the memory of the crashed machine
+         over FireWire. This employs remote DMA as part of the OHCI1394
+         specification which is now the standard for FireWire controllers.
+
+         With remote DMA, you can monitor the printk buffer remotely using
+         firescope and access all memory below 4GB using fireproxy from gdb.
+         Even controlling a kernel debugger is possible using remote DMA.
+
+         Usage:
+
+         If ohci1394_dma=early is used as boot parameter, it will initialize
+         all OHCI1394 controllers which are found in the PCI config space.
+
+         As all changes to the FireWire bus such as enabling and disabling
+         devices cause a bus reset and thereby disable remote DMA for all
+         devices, be sure to have the cable plugged and FireWire enabled on
+         the debugging host before booting the debug target for debugging.
+
+         This code (~1k) is freed after boot. By then, the firewire stack
+         in charge of the OHCI-1394 controllers should be used instead.
+
+         See Documentation/debugging-via-ohci1394.txt for more information.
 
 source "samples/Kconfig"
index 7d02700a4b0e131ae0dfba2e6da6ecf75b6a8d2c..3e3365e5665eebf950803b34411f386681157533 100644 (file)
@@ -187,7 +187,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem,
 /*
  * wait for the read lock to be granted
  */
-struct rw_semaphore fastcall __sched *
+asmregparm struct rw_semaphore __sched *
 rwsem_down_read_failed(struct rw_semaphore *sem)
 {
        struct rwsem_waiter waiter;
@@ -201,7 +201,7 @@ rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-struct rw_semaphore fastcall __sched *
+asmregparm struct rw_semaphore __sched *
 rwsem_down_write_failed(struct rw_semaphore *sem)
 {
        struct rwsem_waiter waiter;
@@ -216,7 +216,7 @@ rwsem_down_write_failed(struct rw_semaphore *sem)
  * handle waking up a waiter on the semaphore
  * - up_read/up_write has decremented the active part of count if we come here
  */
-struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem)
+asmregparm struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
        unsigned long flags;
 
@@ -236,7 +236,7 @@ struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem)
  * - caller incremented waiting part of count and discovered it still negative
  * - just wake up any readers at the front of the queue
  */
-struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem)
+asmregparm struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 {
        unsigned long flags;
 
index 4b0144b24c123681dcd9e95e715ae56b009355d7..d902d0e25edc564862605d2c8c93209602e797cc 100644 (file)
@@ -513,8 +513,7 @@ again:
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
-                           need_lockbreak(src_ptl) ||
-                           need_lockbreak(dst_ptl))
+                           spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                if (pte_none(*src_pte)) {
@@ -853,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                        tlb_finish_mmu(*tlbp, tlb_start, start);
 
                        if (need_resched() ||
-                               (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+                               (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
                                if (i_mmap_lock) {
                                        *tlbp = NULL;
                                        goto out;
@@ -1768,8 +1767,7 @@ again:
 
        restart_addr = zap_page_range(vma, start_addr,
                                        end_addr - start_addr, details);
-       need_break = need_resched() ||
-                       need_lockbreak(details->i_mmap_lock);
+       need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
 
        if (restart_addr >= end_addr) {
                /* We have now completed this vma: mark it so */
@@ -2756,3 +2754,34 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 
        return buf - old_buf;
 }
+
+/*
+ * Print the name of a VMA.
+ */
+void print_vma_addr(char *prefix, unsigned long ip)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+
+       down_read(&mm->mmap_sem);
+       vma = find_vma(mm, ip);
+       if (vma && vma->vm_file) {
+               struct file *f = vma->vm_file;
+               char *buf = (char *)__get_free_page(GFP_KERNEL);
+               if (buf) {
+                       char *p, *s;
+
+                       p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
+                       if (IS_ERR(p))
+                               p = "?";
+                       s = strrchr(p, '/');
+                       if (s)
+                               p = s+1;
+                       printk("%s%s[%lx+%lx]", prefix, p,
+                                       vma->vm_start,
+                                       vma->vm_end - vma->vm_start);
+                       free_page((unsigned long)buf);
+               }
+       }
+       up_read(&current->mm->mmap_sem);
+}
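print_vma_addr() walks current->mm to find the VMA containing an instruction pointer and, when it is file-backed, prints the file's basename plus the mapping's start and length. A rough user-space analogue that derives the same information from /proc/self/maps (purely illustrative; the kernel helper uses find_vma() and d_path() as shown above):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* User-space analogue of print_vma_addr(): locate the mapping in
 * /proc/self/maps that contains ip and print "prefix name[start+size]". */
static void print_vma_addr_like(const char *prefix, unsigned long ip)
{
	FILE *maps = fopen("/proc/self/maps", "r");
	char line[512];

	if (maps == NULL)
		return;
	while (fgets(line, sizeof(line), maps) != NULL) {
		unsigned long start, end;
		char path[256] = "";
		const char *name;

		if (sscanf(line, "%lx-%lx %*s %*s %*s %*s %255s",
			   &start, &end, path) < 2)
			continue;
		if (ip < start || ip >= end)
			continue;
		name = strrchr(path, '/');   /* basename, like the kernel's strrchr(p, '/') */
		name = name ? name + 1 : path;
		printf("%s%s[%lx+%lx]\n", prefix, name, start, end - start);
		break;
	}
	fclose(maps);
}

int main(void)
{
	/* main() lives in the executable's text mapping. */
	print_vma_addr_like("main is in ", (unsigned long)(uintptr_t)&main);
	return 0;
}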
index bfa389fc6ded6a2a5892eec57ed6cd90bbd34d1e..d2b6d44962b7c7b28b89e20cdbf1af0b0c6ada31 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -251,7 +251,8 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
         * not page aligned -Ram Gupta
         */
        rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
-       if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
+       if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+                       (mm->end_data - mm->start_data) > rlim)
                goto out;
 
        newbrk = PAGE_ALIGN(brk);
index d4dc4eb48d95ab32c79b96297e7e1a5bb8c23dcc..a2241060113be3d47d9a3f88919e8b5b8c5a9f4a 100644 (file)
@@ -348,6 +348,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
                        atomic_inc(&entry->lsm_data->refcount);
                        secattr->cache = entry->lsm_data;
                        secattr->flags |= NETLBL_SECATTR_CACHE;
+                       secattr->type = NETLBL_NLTYPE_CIPSOV4;
                        if (prev_entry == NULL) {
                                spin_unlock_bh(&cipso_v4_cache[bkt].lock);
                                return 0;
@@ -865,7 +866,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
        }
 
        for (;;) {
-               host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat,
+               host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
                                                       host_spot + 1);
                if (host_spot < 0)
                        break;
@@ -948,7 +949,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
                                return -EPERM;
                        break;
                }
-               ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
+               ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
                                                       host_spot,
                                                       GFP_ATOMIC);
                if (ret_val != 0)
@@ -1014,7 +1015,8 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
        u32 cat_iter = 0;
 
        for (;;) {
-               cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1);
+               cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+                                                cat + 1);
                if (cat < 0)
                        break;
                if ((cat_iter + 2) > net_cat_len)
@@ -1049,7 +1051,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
        u32 iter;
 
        for (iter = 0; iter < net_cat_len; iter += 2) {
-               ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat,
+               ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
                                ntohs(get_unaligned((__be16 *)&net_cat[iter])),
                                GFP_ATOMIC);
                if (ret_val != 0)
@@ -1130,7 +1132,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
                return -ENOSPC;
 
        for (;;) {
-               iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1);
+               iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+                                                 iter + 1);
                if (iter < 0)
                        break;
                cat_size += (iter == 0 ? 0 : sizeof(u16));
@@ -1138,7 +1141,8 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
                        return -ENOSPC;
                array[array_cnt++] = iter;
 
-               iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter);
+               iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
+                                                     iter);
                if (iter < 0)
                        return -EFAULT;
                cat_size += sizeof(u16);
@@ -1191,7 +1195,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
                else
                        cat_low = 0;
 
-               ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat,
+               ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
                                                       cat_low,
                                                       cat_high,
                                                       GFP_ATOMIC);
@@ -1251,7 +1255,9 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
        if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
                return -EPERM;
 
-       ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+       ret_val = cipso_v4_map_lvl_hton(doi_def,
+                                       secattr->attr.mls.lvl,
+                                       &level);
        if (ret_val != 0)
                return ret_val;
 
@@ -1303,12 +1309,13 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
        ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
        if (ret_val != 0)
                return ret_val;
-       secattr->mls_lvl = level;
+       secattr->attr.mls.lvl = level;
        secattr->flags |= NETLBL_SECATTR_MLS_LVL;
 
        if (tag_len > 4) {
-               secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
-               if (secattr->mls_cat == NULL)
+               secattr->attr.mls.cat =
+                                      netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+               if (secattr->attr.mls.cat == NULL)
                        return -ENOMEM;
 
                ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
@@ -1316,7 +1323,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
                                                    tag_len - 4,
                                                    secattr);
                if (ret_val != 0) {
-                       netlbl_secattr_catmap_free(secattr->mls_cat);
+                       netlbl_secattr_catmap_free(secattr->attr.mls.cat);
                        return ret_val;
                }
 
@@ -1350,7 +1357,9 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
        if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
                return -EPERM;
 
-       ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+       ret_val = cipso_v4_map_lvl_hton(doi_def,
+                                       secattr->attr.mls.lvl,
+                                       &level);
        if (ret_val != 0)
                return ret_val;
 
@@ -1396,12 +1405,13 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
        ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
        if (ret_val != 0)
                return ret_val;
-       secattr->mls_lvl = level;
+       secattr->attr.mls.lvl = level;
        secattr->flags |= NETLBL_SECATTR_MLS_LVL;
 
        if (tag_len > 4) {
-               secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
-               if (secattr->mls_cat == NULL)
+               secattr->attr.mls.cat =
+                                      netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+               if (secattr->attr.mls.cat == NULL)
                        return -ENOMEM;
 
                ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
@@ -1409,7 +1419,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
                                                     tag_len - 4,
                                                     secattr);
                if (ret_val != 0) {
-                       netlbl_secattr_catmap_free(secattr->mls_cat);
+                       netlbl_secattr_catmap_free(secattr->attr.mls.cat);
                        return ret_val;
                }
 
@@ -1443,7 +1453,9 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
        if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
                return -EPERM;
 
-       ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+       ret_val = cipso_v4_map_lvl_hton(doi_def,
+                                       secattr->attr.mls.lvl,
+                                       &level);
        if (ret_val != 0)
                return ret_val;
 
@@ -1488,12 +1500,13 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
        ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
        if (ret_val != 0)
                return ret_val;
-       secattr->mls_lvl = level;
+       secattr->attr.mls.lvl = level;
        secattr->flags |= NETLBL_SECATTR_MLS_LVL;
 
        if (tag_len > 4) {
-               secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
-               if (secattr->mls_cat == NULL)
+               secattr->attr.mls.cat =
+                                      netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+               if (secattr->attr.mls.cat == NULL)
                        return -ENOMEM;
 
                ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
@@ -1501,7 +1514,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
                                                    tag_len - 4,
                                                    secattr);
                if (ret_val != 0) {
-                       netlbl_secattr_catmap_free(secattr->mls_cat);
+                       netlbl_secattr_catmap_free(secattr->attr.mls.cat);
                        return ret_val;
                }
 
@@ -1850,6 +1863,8 @@ static int cipso_v4_getattr(const unsigned char *cipso,
                ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
                break;
        }
+       if (ret_val == 0)
+               secattr->type = NETLBL_NLTYPE_CIPSOV4;
 
 getattr_return:
        rcu_read_unlock();
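The recurring substitution in this file moves the MLS fields from the flat secattr->mls_lvl / secattr->mls_cat members into secattr->attr.mls.lvl / secattr->attr.mls.cat and records the labeling protocol in secattr->type. A rough sketch of that layout (field and constant names are stand-ins, not the exact include/net/netlabel.h definitions):

#include <stdio.h>

/* Illustrative stand-ins; see include/net/netlabel.h for the real layout. */
#define NLTYPE_NONE    0
#define NLTYPE_CIPSOV4 1

struct mls_attr {
	unsigned int lvl;
	void *cat;          /* category bitmap, kept opaque here */
};

struct lsm_secattr {
	unsigned int type;  /* which member of the union below is valid */
	union {
		struct mls_attr mls;
		unsigned int secid;
	} attr;
};

int main(void)
{
	struct lsm_secattr secattr = { 0 };

	secattr.attr.mls.lvl = 3;        /* was: secattr.mls_lvl = 3      */
	secattr.type = NLTYPE_CIPSOV4;   /* new: record the label source  */

	if (secattr.type == NLTYPE_CIPSOV4)
		printf("MLS level %u via CIPSOv4\n", secattr.attr.mls.lvl);
	return 0;
}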
index 4fc0b023cfd703c5a0aa3b81bd8f8e8e8517303d..6cae5475737e530c94024c9b3a70f820b020f64a 100644 (file)
@@ -99,7 +99,7 @@ config IP6_NF_MATCH_HL
 config IP6_NF_MATCH_IPV6HEADER
        tristate '"ipv6header" IPv6 Extension Headers Match'
        depends on IP6_NF_IPTABLES
-       depends on NETFILTER_ADVANCED
+       default m if NETFILTER_ADVANCED=n
        help
          This module allows one to match packets based upon
          the ipv6 extension headers.
index b11b3ecbb39d70f37862ab34c0a2ad522103288b..7708e2084ce28efd6dcb8b70384ca23d2387c178 100644 (file)
@@ -72,12 +72,13 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info)
                return false;
        }
 
-       err = selinux_relabel_packet_permission(sel->selsid);
+       err = selinux_secmark_relabel_packet_permission(sel->selsid);
        if (err) {
                printk(KERN_INFO PFX "unable to obtain relabeling permission\n");
                return false;
        }
 
+       selinux_secmark_refcount_inc();
        return true;
 }
 
@@ -110,11 +111,20 @@ secmark_tg_check(const char *tablename, const void *entry,
        return true;
 }
 
+void secmark_tg_destroy(const struct xt_target *target, void *targinfo)
+{
+       switch (mode) {
+       case SECMARK_MODE_SEL:
+               selinux_secmark_refcount_dec();
+       }
+}
+
 static struct xt_target secmark_tg_reg[] __read_mostly = {
        {
                .name           = "SECMARK",
                .family         = AF_INET,
                .checkentry     = secmark_tg_check,
+               .destroy        = secmark_tg_destroy,
                .target         = secmark_tg,
                .targetsize     = sizeof(struct xt_secmark_target_info),
                .table          = "mangle",
@@ -124,6 +134,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = {
                .name           = "SECMARK",
                .family         = AF_INET6,
                .checkentry     = secmark_tg_check,
+               .destroy        = secmark_tg_destroy,
                .target         = secmark_tg,
                .targetsize     = sizeof(struct xt_secmark_target_info),
                .table          = "mangle",
index ba0ca8d3f77d49720358da39e1988797b487e9e1..becf91a952aeca035f0d060eac857850c854112e 100644 (file)
@@ -38,6 +38,7 @@
 #include <net/genetlink.h>
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
 
 #include "netlabel_user.h"
 #include "netlabel_cipso_v4.h"
@@ -421,7 +422,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
                break;
        }
        if (ret_val == 0)
-               netlbl_mgmt_protocount_inc();
+               atomic_inc(&netlabel_mgmt_protocount);
 
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD,
                                              &audit_info);
@@ -698,7 +699,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
                                      &audit_info,
                                      netlbl_cipsov4_doi_free);
        if (ret_val == 0)
-               netlbl_mgmt_protocount_dec();
+               atomic_dec(&netlabel_mgmt_protocount);
 
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL,
                                              &audit_info);
index b3675bd7db334af0e19ee9746c55775664104ec1..9a8ea0195c4fbc2f08bb8c55a5535ae43889c809 100644 (file)
@@ -54,9 +54,6 @@ struct netlbl_domhsh_tbl {
  * hash table should be okay */
 static DEFINE_SPINLOCK(netlbl_domhsh_lock);
 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
-
-/* Default domain mapping */
-static DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
 static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
 
 /*
@@ -109,17 +106,14 @@ static u32 netlbl_domhsh_hash(const char *key)
 /**
  * netlbl_domhsh_search - Search for a domain entry
  * @domain: the domain
- * @def: return default if no match is found
  *
  * Description:
  * Searches the domain hash table and returns a pointer to the hash table
- * entry if found, otherwise NULL is returned.  If @def is non-zero and a
- * match is not found in the domain hash table the default mapping is returned
- * if it exists.  The caller is responsibile for the rcu hash table locks
- * (i.e. the caller much call rcu_read_[un]lock()).
+ * entry if found, otherwise NULL is returned.  The caller is responsible for
+ * the rcu hash table locks (i.e. the caller must call rcu_read_[un]lock()).
  *
  */
-static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def)
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
 {
        u32 bkt;
        struct netlbl_dom_map *iter;
@@ -133,10 +127,31 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def)
                                return iter;
        }
 
-       if (def != 0) {
-               iter = rcu_dereference(netlbl_domhsh_def);
-               if (iter != NULL && iter->valid)
-                       return iter;
+       return NULL;
+}
+
+/**
+ * netlbl_domhsh_search_def - Search for a domain entry
+ * @domain: the domain
+ * @def: return default if no match is found
+ *
+ * Description:
+ * Searches the domain hash table and returns a pointer to the hash table
+ * entry if an exact match is found, if an exact match is not present in the
+ * hash table then the default entry is returned if valid otherwise NULL is
+ * returned.  The caller is responsible for the rcu hash table locks
+ * (i.e. the caller must call rcu_read_[un]lock()).
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+{
+       struct netlbl_dom_map *entry;
+
+       entry = netlbl_domhsh_search(domain);
+       if (entry == NULL) {
+               entry = rcu_dereference(netlbl_domhsh_def);
+               if (entry != NULL && entry->valid)
+                       return entry;
        }
 
        return NULL;
@@ -221,24 +236,22 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
        INIT_RCU_HEAD(&entry->rcu);
 
        rcu_read_lock();
+       spin_lock(&netlbl_domhsh_lock);
        if (entry->domain != NULL) {
                bkt = netlbl_domhsh_hash(entry->domain);
-               spin_lock(&netlbl_domhsh_lock);
-               if (netlbl_domhsh_search(entry->domain, 0) == NULL)
+               if (netlbl_domhsh_search(entry->domain) == NULL)
                        list_add_tail_rcu(&entry->list,
                                    &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
                else
                        ret_val = -EEXIST;
-               spin_unlock(&netlbl_domhsh_lock);
        } else {
                INIT_LIST_HEAD(&entry->list);
-               spin_lock(&netlbl_domhsh_def_lock);
                if (rcu_dereference(netlbl_domhsh_def) == NULL)
                        rcu_assign_pointer(netlbl_domhsh_def, entry);
                else
                        ret_val = -EEXIST;
-               spin_unlock(&netlbl_domhsh_def_lock);
        }
+       spin_unlock(&netlbl_domhsh_lock);
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
        if (audit_buf != NULL) {
                audit_log_format(audit_buf,
@@ -307,7 +320,10 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
        struct audit_buffer *audit_buf;
 
        rcu_read_lock();
-       entry = netlbl_domhsh_search(domain, (domain != NULL ? 0 : 1));
+       if (domain)
+               entry = netlbl_domhsh_search(domain);
+       else
+               entry = netlbl_domhsh_search_def(domain);
        if (entry == NULL)
                goto remove_return;
        switch (entry->type) {
@@ -316,23 +332,16 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
                                           entry->domain);
                break;
        }
-       if (entry != rcu_dereference(netlbl_domhsh_def)) {
-               spin_lock(&netlbl_domhsh_lock);
-               if (entry->valid) {
-                       entry->valid = 0;
+       spin_lock(&netlbl_domhsh_lock);
+       if (entry->valid) {
+               entry->valid = 0;
+               if (entry != rcu_dereference(netlbl_domhsh_def))
                        list_del_rcu(&entry->list);
-                       ret_val = 0;
-               }
-               spin_unlock(&netlbl_domhsh_lock);
-       } else {
-               spin_lock(&netlbl_domhsh_def_lock);
-               if (entry->valid) {
-                       entry->valid = 0;
+               else
                        rcu_assign_pointer(netlbl_domhsh_def, NULL);
-                       ret_val = 0;
-               }
-               spin_unlock(&netlbl_domhsh_def_lock);
+               ret_val = 0;
        }
+       spin_unlock(&netlbl_domhsh_lock);
 
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
        if (audit_buf != NULL) {
@@ -377,7 +386,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
  */
 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
 {
-       return netlbl_domhsh_search(domain, 1);
+       return netlbl_domhsh_search_def(domain);
 }
 
 /**
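The domain hash lookups are split in two: netlbl_domhsh_search() now returns only exact matches, while netlbl_domhsh_search_def() falls back to the single default entry, and both the hashed and default entries are guarded by one spinlock instead of two. A simplified, lock-free sketch of the exact-match-then-default pattern (names and data are illustrative; the kernel version is RCU-protected):

#include <stdio.h>
#include <string.h>

struct dom_map {
	const char *domain;
	int valid;
};

static struct dom_map table[] = {
	{ "example.com", 1 },
	{ "kernel.org",  1 },
};
static struct dom_map def_entry = { NULL, 1 };

/* Exact match only, like netlbl_domhsh_search(). */
static struct dom_map *search(const char *domain)
{
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].valid && strcmp(table[i].domain, domain) == 0)
			return &table[i];
	return NULL;
}

/* Exact match, else the default entry, like netlbl_domhsh_search_def(). */
static struct dom_map *search_def(const char *domain)
{
	struct dom_map *entry = search(domain);

	if (entry == NULL && def_entry.valid)
		return &def_entry;
	return entry;
}

int main(void)
{
	printf("exact:   %s\n", search("kernel.org") ? "hit" : "miss");
	printf("unknown: %s\n", search_def("nosuch.example") == &def_entry ?
	       "fell back to default" : "exact hit");
	return 0;
}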
index 4f50949722a95a6e082404601b98c98f39190821..c69e3e1f05c3957ab076a75bd84bc40a87a87578 100644 (file)
@@ -34,6 +34,7 @@
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <asm/bug.h>
+#include <asm/atomic.h>
 
 #include "netlabel_domainhash.h"
 #include "netlabel_unlabeled.h"
@@ -262,7 +263,7 @@ int netlbl_enabled(void)
        /* At some point we probably want to expose this mechanism to the user
         * as well so that admins can toggle NetLabel regardless of the
         * configuration */
-       return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0);
+       return (atomic_read(&netlabel_mgmt_protocount) > 0);
 }
 
 /**
@@ -311,7 +312,7 @@ socket_setattr_return:
  * @secattr: the security attributes
  *
  * Description:
- * Examines the given sock to see any NetLabel style labeling has been
+ * Examines the given sock to see if any NetLabel style labeling has been
  * applied to the sock, if so it parses the socket label and returns the
  * security attributes in @secattr.  Returns zero on success, negative values
  * on failure.
@@ -319,18 +320,13 @@ socket_setattr_return:
  */
 int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 {
-       int ret_val;
-
-       ret_val = cipso_v4_sock_getattr(sk, secattr);
-       if (ret_val == 0)
-               return 0;
-
-       return netlbl_unlabel_getattr(secattr);
+       return cipso_v4_sock_getattr(sk, secattr);
 }
 
 /**
  * netlbl_skbuff_getattr - Determine the security attributes of a packet
  * @skb: the packet
+ * @family: protocol family
  * @secattr: the security attributes
  *
  * Description:
@@ -341,13 +337,14 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
  *
  */
 int netlbl_skbuff_getattr(const struct sk_buff *skb,
+                         u16 family,
                          struct netlbl_lsm_secattr *secattr)
 {
        if (CIPSO_V4_OPTEXIST(skb) &&
            cipso_v4_skbuff_getattr(skb, secattr) == 0)
                return 0;
 
-       return netlbl_unlabel_getattr(secattr);
+       return netlbl_unlabel_getattr(skb, family, secattr);
 }
 
 /**
@@ -431,6 +428,10 @@ static int __init netlbl_init(void)
        if (ret_val != 0)
                goto init_failure;
 
+       ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
+       if (ret_val != 0)
+               goto init_failure;
+
        ret_val = netlbl_netlink_init();
        if (ret_val != 0)
                goto init_failure;
index 9c41464d58d16c7419c1bc8b057473678d64e10e..e2258dc3c84586e204754fe431f4b52732e7aee0 100644 (file)
 #include <net/genetlink.h>
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
 
 #include "netlabel_domainhash.h"
 #include "netlabel_user.h"
 #include "netlabel_mgmt.h"
 
-/* NetLabel configured protocol count */
-static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock);
-static u32 netlabel_mgmt_protocount = 0;
+/* NetLabel configured protocol counter */
+atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
 
 /* Argument struct for netlbl_domhsh_walk() */
 struct netlbl_domhsh_walk_arg {
@@ -70,63 +70,6 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
        [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
 };
 
-/*
- * NetLabel Misc Management Functions
- */
-
-/**
- * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count
- *
- * Description:
- * Increment the number of labeled protocol configurations in the current
- * NetLabel configuration.  Keep track of this for use in determining if
- * NetLabel label enforcement should be active/enabled or not in the LSM.
- *
- */
-void netlbl_mgmt_protocount_inc(void)
-{
-       spin_lock(&netlabel_mgmt_protocount_lock);
-       netlabel_mgmt_protocount++;
-       spin_unlock(&netlabel_mgmt_protocount_lock);
-}
-
-/**
- * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count
- *
- * Description:
- * Decrement the number of labeled protocol configurations in the current
- * NetLabel configuration.  Keep track of this for use in determining if
- * NetLabel label enforcement should be active/enabled or not in the LSM.
- *
- */
-void netlbl_mgmt_protocount_dec(void)
-{
-       spin_lock(&netlabel_mgmt_protocount_lock);
-       if (netlabel_mgmt_protocount > 0)
-               netlabel_mgmt_protocount--;
-       spin_unlock(&netlabel_mgmt_protocount_lock);
-}
-
-/**
- * netlbl_mgmt_protocount_value - Return the number of configured protocols
- *
- * Description:
- * Return the number of labeled protocols in the current NetLabel
- * configuration.  This value is useful in  determining if NetLabel label
- * enforcement should be active/enabled or not in the LSM.
- *
- */
-u32 netlbl_mgmt_protocount_value(void)
-{
-       u32 val;
-
-       rcu_read_lock();
-       val = netlabel_mgmt_protocount;
-       rcu_read_unlock();
-
-       return val;
-}
-
 /*
  * NetLabel Command Handlers
  */
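The spinlock-protected protocol counter and its three accessor functions are removed here; as the netlabel_mgmt.h hunk below shows, callers now manipulate an exported atomic_t directly with atomic_inc(), atomic_dec() and atomic_read(). A user-space sketch of the same simplification using C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

/* Counterpart of "atomic_t netlabel_mgmt_protocount": increments and
 * decrements need no lock, and readers just load the current value. */
static atomic_int protocount = 0;

int main(void)
{
	atomic_fetch_add(&protocount, 1);   /* protocol configuration added */
	printf("enabled: %d\n", atomic_load(&protocount) > 0);
	atomic_fetch_sub(&protocount, 1);   /* protocol configuration removed */
	printf("enabled: %d\n", atomic_load(&protocount) > 0);
	return 0;
}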
index ccb2b3923591c9e45afe30e73d141704b1659396..a43bff169d6b5f793b0550b6b4fa65b5a53b37c5 100644 (file)
@@ -32,6 +32,7 @@
 #define _NETLABEL_MGMT_H
 
 #include <net/netlabel.h>
+#include <asm/atomic.h>
 
 /*
  * The following NetLabel payloads are supported by the management interface.
@@ -168,9 +169,7 @@ enum {
 /* NetLabel protocol functions */
 int netlbl_mgmt_genl_init(void);
 
-/* NetLabel misc management functions */
-void netlbl_mgmt_protocount_inc(void);
-void netlbl_mgmt_protocount_dec(void);
-u32 netlbl_mgmt_protocount_value(void);
+/* NetLabel configured protocol reference counter */
+extern atomic_t netlabel_mgmt_protocount;
 
 #endif
index 348292450deb7ea7ccea3627160f4f4f457c98c1..42e81fd8cc491892fe371beda6de52a4e3ad775b 100644 (file)
@@ -10,7 +10,7 @@
  */
 
 /*
- * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007
  *
  * This program is free software;  you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
-
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
 #include <net/netlabel.h>
 #include <asm/bug.h>
+#include <asm/atomic.h>
 
 #include "netlabel_user.h"
 #include "netlabel_domainhash.h"
 #include "netlabel_unlabeled.h"
+#include "netlabel_mgmt.h"
+
+/* NOTE: at present we always use init's network namespace since we don't
+ *       presently support different namespaces even though the majority of
+ *       the functions in this file are "namespace safe" */
+
+/* The unlabeled connection hash table which we use to map network interfaces
+ * and addresses of unlabeled packets to a user specified secid value for the
+ * LSM.  The hash table is used to lookup the network interface entry
+ * (struct netlbl_unlhsh_iface) and then the interface entry is used to
+ * lookup an IP address match from an ordered list.  If a network interface
+ * match can not be found in the hash table then the default entry
+ * (netlbl_unlhsh_def) is used.  The IP address entry list
+ * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
+ * larger netmask come first.
+ */
+struct netlbl_unlhsh_tbl {
+       struct list_head *tbl;
+       u32 size;
+};
+struct netlbl_unlhsh_addr4 {
+       __be32 addr;
+       __be32 mask;
+       u32 secid;
+
+       u32 valid;
+       struct list_head list;
+       struct rcu_head rcu;
+};
+struct netlbl_unlhsh_addr6 {
+       struct in6_addr addr;
+       struct in6_addr mask;
+       u32 secid;
+
+       u32 valid;
+       struct list_head list;
+       struct rcu_head rcu;
+};
+struct netlbl_unlhsh_iface {
+       int ifindex;
+       struct list_head addr4_list;
+       struct list_head addr6_list;
+
+       u32 valid;
+       struct list_head list;
+       struct rcu_head rcu;
+};
+
+/* Argument struct for netlbl_unlhsh_walk() */
+struct netlbl_unlhsh_walk_arg {
+       struct netlink_callback *nl_cb;
+       struct sk_buff *skb;
+       u32 seq;
+};
+
+/* Unlabeled connection hash table */
+/* updates should be so rare that having one spinlock for the entire
+ * hash table should be okay */
+static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
 
 /* Accept unlabeled packets flag */
-static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock);
 static u8 netlabel_unlabel_acceptflg = 0;
 
-/* NetLabel Generic NETLINK CIPSOv4 family */
+/* NetLabel Generic NETLINK unlabeled family */
 static struct genl_family netlbl_unlabel_gnl_family = {
        .id = GENL_ID_GENERATE,
        .hdrsize = 0,
@@ -60,14 +130,844 @@ static struct genl_family netlbl_unlabel_gnl_family = {
        .maxattr = NLBL_UNLABEL_A_MAX,
 };
 
-/* NetLabel Netlink attribute policy */
-static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
-       [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
-};
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
+       [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
+       [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
+                                     .len = sizeof(struct in6_addr) },
+       [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
+                                     .len = sizeof(struct in6_addr) },
+       [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
+                                     .len = sizeof(struct in_addr) },
+       [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
+                                     .len = sizeof(struct in_addr) },
+       [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
+                                  .len = IFNAMSIZ - 1 },
+       [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
+};
+
+/*
+ * Audit Helper Functions
+ */
+
+/**
+ * netlbl_unlabel_audit_addr4 - Audit an IPv4 address
+ * @audit_buf: audit buffer
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv4 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf,
+                                    const char *dev,
+                                    __be32 addr, __be32 mask)
+{
+       u32 mask_val = ntohl(mask);
+
+       if (dev != NULL)
+               audit_log_format(audit_buf, " netif=%s", dev);
+       audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr));
+       if (mask_val != 0xffffffff) {
+               u32 mask_len = 0;
+               while (mask_val > 0) {
+                       mask_val <<= 1;
+                       mask_len++;
+               }
+               audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
+       }
+}
+
+/**
+ * netlbl_unlabel_audit_addr6 - Audit an IPv6 address
+ * @audit_buf: audit buffer
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv6 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf,
+                                    const char *dev,
+                                    const struct in6_addr *addr,
+                                    const struct in6_addr *mask)
+{
+       if (dev != NULL)
+               audit_log_format(audit_buf, " netif=%s", dev);
+       audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr));
+       if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
+               u32 mask_len = 0;
+               u32 mask_val;
+               int iter = -1;
+               while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
+                       mask_len += 32;
+               mask_val = ntohl(mask->s6_addr32[iter]);
+               while (mask_val > 0) {
+                       mask_val <<= 1;
+                       mask_len++;
+               }
+               audit_log_format(audit_buf, " src_prefixlen=%d", mask_len);
+       }
+}
+
+/*
+ * Unlabeled Connection Hash Table Functions
+ */
+
+/**
+ * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table address entry can be
+ * released safely.
+ *
+ */
+static void netlbl_unlhsh_free_addr4(struct rcu_head *entry)
+{
+       struct netlbl_unlhsh_addr4 *ptr;
+
+       ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu);
+       kfree(ptr);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table address entry can be
+ * released safely.
+ *
+ */
+static void netlbl_unlhsh_free_addr6(struct rcu_head *entry)
+{
+       struct netlbl_unlhsh_addr6 *ptr;
+
+       ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu);
+       kfree(ptr);
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely.  It is important to note that this function does not free
+ * the IPv4 and IPv6 address lists contained as part of an interface entry.  It
+ * is up to the rest of the code to make sure an interface entry is only freed
+ * once its address lists are empty.
+ *
+ */
+static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
+{
+       struct netlbl_unlhsh_iface *iface;
+       struct netlbl_unlhsh_addr4 *iter4;
+       struct netlbl_unlhsh_addr4 *tmp4;
+       struct netlbl_unlhsh_addr6 *iter6;
+       struct netlbl_unlhsh_addr6 *tmp6;
+
+       iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
+
+       /* no need for locks here since we are the only one with access to this
+        * structure */
+
+       list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list)
+               if (iter4->valid) {
+                       list_del_rcu(&iter4->list);
+                       kfree(iter4);
+               }
+       list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list)
+               if (iter6->valid) {
+                       list_del_rcu(&iter6->list);
+                       kfree(iter6);
+               }
+       kfree(iface);
+}
+
+/**
+ * netlbl_unlhsh_hash - Hashing function for the hash table
+ * @ifindex: the network interface/device to hash
+ *
+ * Description:
+ * This is the hashing function for the unlabeled hash table, it returns the
+ * bucket number for the given device/interface.  The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+static u32 netlbl_unlhsh_hash(int ifindex)
+{
+       /* this is taken _almost_ directly from
+        * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much
+        * the same thing */
+       return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1);
+}
+
+/**
+ * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv4 address list of the network interface specified by @iface.
+ * If a matching address entry is found it is returned, otherwise NULL is
+ * returned.  The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4(
+                                      __be32 addr,
+                                      const struct netlbl_unlhsh_iface *iface)
+{
+       struct netlbl_unlhsh_addr4 *iter;
+
+       list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+               if (iter->valid && (addr & iter->mask) == iter->addr)
+                       return iter;
+
+       return NULL;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @iface: the network interface entry
+ *
+ * Description:
+ * Searches the IPv6 address list of the network interface specified by @iface.
+ * If a matching address entry is found it is returned, otherwise NULL is
+ * returned.  The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6(
+                                      const struct in6_addr *addr,
+                                      const struct netlbl_unlhsh_iface *iface)
+{
+       struct netlbl_unlhsh_addr6 *iter;
+
+       list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+               if (iter->valid &&
+                   ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
+                       return iter;
+
+       return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_search_iface - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex, otherwise NULL is returned.  The
+ * caller is responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
+{
+       u32 bkt;
+       struct netlbl_unlhsh_iface *iter;
+
+       bkt = netlbl_unlhsh_hash(ifindex);
+       list_for_each_entry_rcu(iter,
+                               &rcu_dereference(netlbl_unlhsh)->tbl[bkt],
+                               list)
+               if (iter->valid && iter->ifindex == ifindex)
+                       return iter;
+
+       return NULL;
+}
+
+/**
+ * netlbl_unlhsh_search_iface_def - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex.  If an exact match can not be found
+ * and there is a valid default entry, the default entry is returned, otherwise
+ * NULL is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex)
+{
+       struct netlbl_unlhsh_iface *entry;
+
+       entry = netlbl_unlhsh_search_iface(ifindex);
+       if (entry != NULL)
+               return entry;
+
+       entry = rcu_dereference(netlbl_unlhsh_def);
+       if (entry != NULL && entry->valid)
+               return entry;
+
+       return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv4 address in network byte order
+ * @mask: IPv4 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface.  On success zero is returned, otherwise
+ * a negative value is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
+                                  const struct in_addr *addr,
+                                  const struct in_addr *mask,
+                                  u32 secid)
+{
+       struct netlbl_unlhsh_addr4 *entry;
+       struct netlbl_unlhsh_addr4 *iter;
+
+       entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+       if (entry == NULL)
+               return -ENOMEM;
+
+       entry->addr = addr->s_addr & mask->s_addr;
+       entry->mask = mask->s_addr;
+       entry->secid = secid;
+       entry->valid = 1;
+       INIT_RCU_HEAD(&entry->rcu);
+
+       spin_lock(&netlbl_unlhsh_lock);
+       iter = netlbl_unlhsh_search_addr4(entry->addr, iface);
+       if (iter != NULL &&
+           iter->addr == addr->s_addr && iter->mask == mask->s_addr) {
+               spin_unlock(&netlbl_unlhsh_lock);
+               kfree(entry);
+               return -EEXIST;
+       }
+       /* in order to speed up address searches through the list (the common
+        * case) we need to keep the list in order based on the size of the
+        * address mask such that the entry with the most specific mask (the
+        * largest numerical value) appears first in the list */
+       list_for_each_entry_rcu(iter, &iface->addr4_list, list)
+               if (iter->valid &&
+                   ntohl(entry->mask) > ntohl(iter->mask)) {
+                       __list_add_rcu(&entry->list,
+                                      iter->list.prev,
+                                      &iter->list);
+                       spin_unlock(&netlbl_unlhsh_lock);
+                       return 0;
+               }
+       list_add_tail_rcu(&entry->list, &iface->addr4_list);
+       spin_unlock(&netlbl_unlhsh_lock);
+       return 0;
+}
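Because the insert loop keeps the list sorted by decreasing numeric mask value, a search walks from the most specific entry to the least specific one, which is what gives the longest-prefix-match behaviour described in the comment. A small sketch of the comparison that drives the ordering, using hypothetical /32 and /24 masks:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
        /* Masks as they would appear in entry->mask (network byte order). */
        in_addr_t host_mask = inet_addr("255.255.255.255");    /* /32 */
        in_addr_t net_mask  = inet_addr("255.255.255.0");      /* /24 */

        /* The insert loop compares ntohl(entry->mask) > ntohl(iter->mask),
         * so a /32 entry ends up ahead of a /24 entry and a host-specific
         * match is found before the broader network match. */
        printf("/32 sorts before /24: %s\n",
               ntohl(host_mask) > ntohl(net_mask) ? "yes" : "no");
        return 0;
}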
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv6 address in network byte order
+ * @mask: IPv6 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface.  On success zero is returned, otherwise
+ * a negative value is returned.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
+                                  const struct in6_addr *addr,
+                                  const struct in6_addr *mask,
+                                  u32 secid)
+{
+       struct netlbl_unlhsh_addr6 *entry;
+       struct netlbl_unlhsh_addr6 *iter;
+
+       entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+       if (entry == NULL)
+               return -ENOMEM;
+
+       ipv6_addr_copy(&entry->addr, addr);
+       entry->addr.s6_addr32[0] &= mask->s6_addr32[0];
+       entry->addr.s6_addr32[1] &= mask->s6_addr32[1];
+       entry->addr.s6_addr32[2] &= mask->s6_addr32[2];
+       entry->addr.s6_addr32[3] &= mask->s6_addr32[3];
+       ipv6_addr_copy(&entry->mask, mask);
+       entry->secid = secid;
+       entry->valid = 1;
+       INIT_RCU_HEAD(&entry->rcu);
+
+       spin_lock(&netlbl_unlhsh_lock);
+       iter = netlbl_unlhsh_search_addr6(&entry->addr, iface);
+       if (iter != NULL &&
+           (ipv6_addr_equal(&iter->addr, addr) &&
+            ipv6_addr_equal(&iter->mask, mask))) {
+               spin_unlock(&netlbl_unlhsh_lock);
+               kfree(entry);
+               return -EEXIST;
+       }
+       /* in order to speed up address searches through the list (the common
+        * case) we need to keep the list in order based on the size of the
+        * address mask such that the entry with the most specific mask (the
+        * largest numerical value) appears first in the list */
+       list_for_each_entry_rcu(iter, &iface->addr6_list, list)
+               if (iter->valid &&
+                   ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+                       __list_add_rcu(&entry->list,
+                                      iter->list.prev,
+                                      &iter->list);
+                       spin_unlock(&netlbl_unlhsh_lock);
+                       return 0;
+               }
+       list_add_tail_rcu(&entry->list, &iface->addr6_list);
+       spin_unlock(&netlbl_unlhsh_lock);
+       return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
+ * @ifindex: network interface
+ *
+ * Description:
+ * Add a new, empty, interface entry into the unlabeled connection hash table.
+ * On success a pointer to the new interface entry is returned, on failure NULL
+ * is returned.  The caller is responsible for calling the rcu_read_[un]lock()
+ * functions.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
+{
+       u32 bkt;
+       struct netlbl_unlhsh_iface *iface;
+
+       iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
+       if (iface == NULL)
+               return NULL;
+
+       iface->ifindex = ifindex;
+       INIT_LIST_HEAD(&iface->addr4_list);
+       INIT_LIST_HEAD(&iface->addr6_list);
+       iface->valid = 1;
+       INIT_RCU_HEAD(&iface->rcu);
+
+       spin_lock(&netlbl_unlhsh_lock);
+       if (ifindex > 0) {
+               bkt = netlbl_unlhsh_hash(ifindex);
+               if (netlbl_unlhsh_search_iface(ifindex) != NULL)
+                       goto add_iface_failure;
+               list_add_tail_rcu(&iface->list,
+                                 &rcu_dereference(netlbl_unlhsh)->tbl[bkt]);
+       } else {
+               INIT_LIST_HEAD(&iface->list);
+               if (rcu_dereference(netlbl_unlhsh_def) != NULL)
+                       goto add_iface_failure;
+               rcu_assign_pointer(netlbl_unlhsh_def, iface);
+       }
+       spin_unlock(&netlbl_unlhsh_lock);
+
+       return iface;
+
+add_iface_failure:
+       spin_unlock(&netlbl_unlhsh_lock);
+       kfree(iface);
+       return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the unlabeled connection hash table.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_add(struct net *net,
+                            const char *dev_name,
+                            const void *addr,
+                            const void *mask,
+                            u32 addr_len,
+                            u32 secid,
+                            struct netlbl_audit *audit_info)
+{
+       int ret_val;
+       int ifindex;
+       struct net_device *dev;
+       struct netlbl_unlhsh_iface *iface;
+       struct in_addr *addr4, *mask4;
+       struct in6_addr *addr6, *mask6;
+       struct audit_buffer *audit_buf = NULL;
+       char *secctx = NULL;
+       u32 secctx_len;
+
+       if (addr_len != sizeof(struct in_addr) &&
+           addr_len != sizeof(struct in6_addr))
+               return -EINVAL;
+
+       rcu_read_lock();
+       if (dev_name != NULL) {
+               dev = dev_get_by_name(net, dev_name);
+               if (dev == NULL) {
+                       ret_val = -ENODEV;
+                       goto unlhsh_add_return;
+               }
+               ifindex = dev->ifindex;
+               dev_put(dev);
+               iface = netlbl_unlhsh_search_iface(ifindex);
+       } else {
+               ifindex = 0;
+               iface = rcu_dereference(netlbl_unlhsh_def);
+       }
+       if (iface == NULL) {
+               iface = netlbl_unlhsh_add_iface(ifindex);
+               if (iface == NULL) {
+                       ret_val = -ENOMEM;
+                       goto unlhsh_add_return;
+               }
+       }
+       audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
+                                             audit_info);
+       switch (addr_len) {
+       case sizeof(struct in_addr):
+               addr4 = (struct in_addr *)addr;
+               mask4 = (struct in_addr *)mask;
+               ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
+               if (audit_buf != NULL)
+                       netlbl_unlabel_audit_addr4(audit_buf,
+                                                  dev_name,
+                                                  addr4->s_addr,
+                                                  mask4->s_addr);
+               break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       case sizeof(struct in6_addr):
+               addr6 = (struct in6_addr *)addr;
+               mask6 = (struct in6_addr *)mask;
+               ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
+               if (audit_buf != NULL)
+                       netlbl_unlabel_audit_addr6(audit_buf,
+                                                  dev_name,
+                                                  addr6, mask6);
+               break;
+#endif /* IPv6 */
+       default:
+               ret_val = -EINVAL;
+       }
+       if (ret_val == 0)
+               atomic_inc(&netlabel_mgmt_protocount);
+
+unlhsh_add_return:
+       rcu_read_unlock();
+       if (audit_buf != NULL) {
+               if (security_secid_to_secctx(secid,
+                                            &secctx,
+                                            &secctx_len) == 0) {
+                       audit_log_format(audit_buf, " sec_obj=%s", secctx);
+                       security_release_secctx(secctx, secctx_len);
+               }
+               audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+               audit_log_end(audit_buf);
+       }
+       return ret_val;
+}
+
+/**
+ * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.  The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_remove_addr4(struct net *net,
+                                     struct netlbl_unlhsh_iface *iface,
+                                     const struct in_addr *addr,
+                                     const struct in_addr *mask,
+                                     struct netlbl_audit *audit_info)
+{
+       int ret_val = -ENOENT;
+       struct netlbl_unlhsh_addr4 *entry;
+       struct audit_buffer *audit_buf = NULL;
+       struct net_device *dev;
+       char *secctx = NULL;
+       u32 secctx_len;
+
+       spin_lock(&netlbl_unlhsh_lock);
+       entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface);
+       if (entry != NULL &&
+           entry->addr == addr->s_addr && entry->mask == mask->s_addr) {
+               entry->valid = 0;
+               list_del_rcu(&entry->list);
+               ret_val = 0;
+       }
+       spin_unlock(&netlbl_unlhsh_lock);
+
+       audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+                                             audit_info);
+       if (audit_buf != NULL) {
+               dev = dev_get_by_index(net, iface->ifindex);
+               netlbl_unlabel_audit_addr4(audit_buf,
+                                          (dev != NULL ? dev->name : NULL),
+                                          addr->s_addr & mask->s_addr,
+                                          mask->s_addr);
+               if (dev != NULL)
+                       dev_put(dev);
+               if (entry != NULL &&
+                   security_secid_to_secctx(entry->secid,
+                                            &secctx,
+                                            &secctx_len) == 0) {
+                       audit_log_format(audit_buf, " sec_obj=%s", secctx);
+                       security_release_secctx(secctx, secctx_len);
+               }
+               audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+               audit_log_end(audit_buf);
+       }
+
+       if (ret_val == 0)
+               call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4);
+       return ret_val;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.  The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+static int netlbl_unlhsh_remove_addr6(struct net *net,
+                                     struct netlbl_unlhsh_iface *iface,
+                                     const struct in6_addr *addr,
+                                     const struct in6_addr *mask,
+                                     struct netlbl_audit *audit_info)
+{
+       int ret_val = -ENOENT;
+       struct netlbl_unlhsh_addr6 *entry;
+       struct audit_buffer *audit_buf = NULL;
+       struct net_device *dev;
+       char *secctx = NULL;
+       u32 secctx_len;
+
+       spin_lock(&netlbl_unlhsh_lock);
+       entry = netlbl_unlhsh_search_addr6(addr, iface);
+       if (entry != NULL &&
+           (ipv6_addr_equal(&entry->addr, addr) &&
+            ipv6_addr_equal(&entry->mask, mask))) {
+               entry->valid = 0;
+               list_del_rcu(&entry->list);
+               ret_val = 0;
+       }
+       spin_unlock(&netlbl_unlhsh_lock);
+
+       audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+                                             audit_info);
+       if (audit_buf != NULL) {
+               dev = dev_get_by_index(net, iface->ifindex);
+               netlbl_unlabel_audit_addr6(audit_buf,
+                                          (dev != NULL ? dev->name : NULL),
+                                          addr, mask);
+               if (dev != NULL)
+                       dev_put(dev);
+               if (entry != NULL &&
+                   security_secid_to_secctx(entry->secid,
+                                            &secctx,
+                                            &secctx_len) == 0) {
+                       audit_log_format(audit_buf, " sec_obj=%s", secctx);
+                       security_release_secctx(secctx, secctx_len);
+               }
+               audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+               audit_log_end(audit_buf);
+       }
+
+       if (ret_val == 0)
+               call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6);
+       return ret_val;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_condremove_iface - Remove an interface entry
+ * @iface: the interface entry
+ *
+ * Description:
+ * Remove an interface entry from the unlabeled connection hash table if it is
+ * empty.  An interface entry is considered to be empty if there are no
+ * address entries assigned to it.
+ *
+ */
+static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
+{
+       struct netlbl_unlhsh_addr4 *iter4;
+       struct netlbl_unlhsh_addr6 *iter6;
+
+       spin_lock(&netlbl_unlhsh_lock);
+       list_for_each_entry_rcu(iter4, &iface->addr4_list, list)
+               if (iter4->valid)
+                       goto unlhsh_condremove_failure;
+       list_for_each_entry_rcu(iter6, &iface->addr6_list, list)
+               if (iter6->valid)
+                       goto unlhsh_condremove_failure;
+       iface->valid = 0;
+       if (iface->ifindex > 0)
+               list_del_rcu(&iface->list);
+       else
+               rcu_assign_pointer(netlbl_unlhsh_def, NULL);
+       spin_unlock(&netlbl_unlhsh_lock);
+
+       call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+       return;
+
+unlhsh_condremove_failure:
+       spin_unlock(&netlbl_unlhsh_lock);
+       return;
+}
+
+/**
+ * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_remove(struct net *net,
+                               const char *dev_name,
+                               const void *addr,
+                               const void *mask,
+                               u32 addr_len,
+                               struct netlbl_audit *audit_info)
+{
+       int ret_val;
+       struct net_device *dev;
+       struct netlbl_unlhsh_iface *iface;
+
+       if (addr_len != sizeof(struct in_addr) &&
+           addr_len != sizeof(struct in6_addr))
+               return -EINVAL;
+
+       rcu_read_lock();
+       if (dev_name != NULL) {
+               dev = dev_get_by_name(net, dev_name);
+               if (dev == NULL) {
+                       ret_val = -ENODEV;
+                       goto unlhsh_remove_return;
+               }
+               iface = netlbl_unlhsh_search_iface(dev->ifindex);
+               dev_put(dev);
+       } else
+               iface = rcu_dereference(netlbl_unlhsh_def);
+       if (iface == NULL) {
+               ret_val = -ENOENT;
+               goto unlhsh_remove_return;
+       }
+       switch (addr_len) {
+       case sizeof(struct in_addr):
+               ret_val = netlbl_unlhsh_remove_addr4(net,
+                                                    iface, addr, mask,
+                                                    audit_info);
+               break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       case sizeof(struct in6_addr):
+               ret_val = netlbl_unlhsh_remove_addr6(net,
+                                                    iface, addr, mask,
+                                                    audit_info);
+               break;
+#endif /* IPv6 */
+       default:
+               ret_val = -EINVAL;
+       }
+       if (ret_val == 0) {
+               netlbl_unlhsh_condremove_iface(iface);
+               atomic_dec(&netlabel_mgmt_protocount);
+       }
+
+unlhsh_remove_return:
+       rcu_read_unlock();
+       return ret_val;
+}
 
 /*
- * Helper Functions
+ * General Helper Functions
+ */
+
+/**
+ * netlbl_unlhsh_netdev_handler - Network device notification handler
+ * @this: notifier block
+ * @event: the event
+ * @ptr: the network device (cast to void)
+ *
+ * Description:
+ * Handle network device events, although at present all we care about is a
+ * network device going away.  In the case of a device going away we clear any
+ * related entries from the unlabeled connection hash table.
+ *
  */
+static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
+                                       unsigned long event,
+                                       void *ptr)
+{
+       struct net_device *dev = ptr;
+       struct netlbl_unlhsh_iface *iface = NULL;
+
+       if (dev->nd_net != &init_net)
+               return NOTIFY_DONE;
+
+       /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
+       if (event == NETDEV_DOWN) {
+               spin_lock(&netlbl_unlhsh_lock);
+               iface = netlbl_unlhsh_search_iface(dev->ifindex);
+               if (iface != NULL && iface->valid) {
+                       iface->valid = 0;
+                       list_del_rcu(&iface->list);
+               } else
+                       iface = NULL;
+               spin_unlock(&netlbl_unlhsh_lock);
+       }
+
+       if (iface != NULL)
+               call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+
+       return NOTIFY_DONE;
+}
 
 /**
  * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
@@ -84,11 +984,8 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
        struct audit_buffer *audit_buf;
        u8 old_val;
 
-       spin_lock(&netlabel_unlabel_acceptflg_lock);
        old_val = netlabel_unlabel_acceptflg;
        netlabel_unlabel_acceptflg = value;
-       spin_unlock(&netlabel_unlabel_acceptflg_lock);
-
        audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
                                              audit_info);
        if (audit_buf != NULL) {
@@ -98,6 +995,48 @@ static void netlbl_unlabel_acceptflg_set(u8 value,
        }
 }
 
+/**
+ * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
+ * @info: the Generic NETLINK info block
+ * @addr: the IP address
+ * @mask: the IP address mask
+ * @len: the address length
+ *
+ * Description:
+ * Examine the Generic NETLINK message and extract the IP address information.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
+                                      void **addr,
+                                      void **mask,
+                                      u32 *len)
+{
+       u32 addr_len;
+
+       if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+               addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+               if (addr_len != sizeof(struct in_addr) ||
+                   addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
+                       return -EINVAL;
+               *len = addr_len;
+               *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+               *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
+               return 0;
+       } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
+               addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+               if (addr_len != sizeof(struct in6_addr) ||
+                   addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
+                       return -EINVAL;
+               *len = addr_len;
+               *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+               *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
 /*
  * NetLabel Command Handlers
  */
@@ -155,11 +1094,9 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
                goto list_failure;
        }
 
-       rcu_read_lock();
        ret_val = nla_put_u8(ans_skb,
                             NLBL_UNLABEL_A_ACPTFLG,
                             netlabel_unlabel_acceptflg);
-       rcu_read_unlock();
        if (ret_val != 0)
                goto list_failure;
 
@@ -175,11 +1112,489 @@ list_failure:
        return ret_val;
 }
 
+/**
+ * netlbl_unlabel_staticadd - Handle a STATICADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADD message and add a new unlabeled
+ * connection entry to the hash table.  Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticadd(struct sk_buff *skb,
+                                   struct genl_info *info)
+{
+       int ret_val;
+       char *dev_name;
+       void *addr;
+       void *mask;
+       u32 addr_len;
+       u32 secid;
+       struct netlbl_audit audit_info;
+
+       /* Don't allow users to add both IPv4 and IPv6 addresses for a
+        * single entry.  However, allow users to create two entries, one each
+        * for IPv4 and IPv6, with the same LSM security context, which should
+        * achieve the same result. */
+       if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+           !info->attrs[NLBL_UNLABEL_A_IFACE] ||
+           !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+             (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+               return -EINVAL;
+
+       netlbl_netlink_auditinfo(skb, &audit_info);
+
+       ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+       if (ret_val != 0)
+               return ret_val;
+       dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+       ret_val = security_secctx_to_secid(
+                                 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+                                 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+                                 &secid);
+       if (ret_val != 0)
+               return ret_val;
+
+       return netlbl_unlhsh_add(&init_net,
+                                dev_name, addr, mask, addr_len, secid,
+                                &audit_info);
+}
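The attribute check at the top of netlbl_unlabel_staticadd() reads most naturally as an exclusive-or between "the IPv4 addr/mask pair is incomplete" and "the IPv6 addr/mask pair is incomplete", i.e. exactly one family must be fully specified. A minimal sketch of the same boolean expression with attribute presence modelled as plain ints (the function name and values are placeholders, not part of the patch):

#include <stdio.h>

/*
 * Models the attribute check in netlbl_unlabel_staticadd(): the request is
 * rejected unless "the IPv4 addr/mask pair is incomplete" XOR "the IPv6
 * addr/mask pair is incomplete" is true, i.e. exactly one family is fully
 * given.  The four ints stand for attribute presence (1 = supplied).
 */
static int addrs_valid(int v4addr, int v4mask, int v6addr, int v6mask)
{
        return (!v4addr || !v4mask) ^ (!v6addr || !v6mask);
}

int main(void)
{
        printf("IPv4 pair only : %d\n", addrs_valid(1, 1, 0, 0));       /* 1 */
        printf("IPv6 pair only : %d\n", addrs_valid(0, 0, 1, 1));       /* 1 */
        printf("both families  : %d\n", addrs_valid(1, 1, 1, 1));       /* 0 */
        printf("IPv4 addr only : %d\n", addrs_valid(1, 0, 0, 0));       /* 0 */
        return 0;
}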
+
+/**
+ * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADDDEF message and add a new default
+ * unlabeled connection entry.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
+                                      struct genl_info *info)
+{
+       int ret_val;
+       void *addr;
+       void *mask;
+       u32 addr_len;
+       u32 secid;
+       struct netlbl_audit audit_info;
+
+       /* Don't allow users to add both IPv4 and IPv6 addresses for a
+        * single entry.  However, allow users to create two entries, one each
+        * for IPv4 and IPv6, with the same LSM security context which should
+        * achieve the same result. */
+       if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+           !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+             (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+               return -EINVAL;
+
+       netlbl_netlink_auditinfo(skb, &audit_info);
+
+       ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+       if (ret_val != 0)
+               return ret_val;
+       ret_val = security_secctx_to_secid(
+                                 nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+                                 nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+                                 &secid);
+       if (ret_val != 0)
+               return ret_val;
+
+       return netlbl_unlhsh_add(&init_net,
+                                NULL, addr, mask, addr_len, secid,
+                                &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVE message and remove the specified
+ * unlabeled connection entry.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremove(struct sk_buff *skb,
+                                      struct genl_info *info)
+{
+       int ret_val;
+       char *dev_name;
+       void *addr;
+       void *mask;
+       u32 addr_len;
+       struct netlbl_audit audit_info;
+
+       /* See the note in netlbl_unlabel_staticadd() about not allowing both
+        * IPv4 and IPv6 in the same entry. */
+       if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
+           !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+             (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+               return -EINVAL;
+
+       netlbl_netlink_auditinfo(skb, &audit_info);
+
+       ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+       if (ret_val != 0)
+               return ret_val;
+       dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+
+       return netlbl_unlhsh_remove(&init_net,
+                                   dev_name, addr, mask, addr_len,
+                                   &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVEDEF message and remove the default
+ * unlabeled connection entry.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
+                                         struct genl_info *info)
+{
+       int ret_val;
+       void *addr;
+       void *mask;
+       u32 addr_len;
+       struct netlbl_audit audit_info;
+
+       /* See the note in netlbl_unlabel_staticadd() about not allowing both
+        * IPv4 and IPv6 in the same entry. */
+       if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+             (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+              !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+               return -EINVAL;
+
+       netlbl_netlink_auditinfo(skb, &audit_info);
+
+       ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+       if (ret_val != 0)
+               return ret_val;
+
+       return netlbl_unlhsh_remove(&init_net,
+                                   NULL, addr, mask, addr_len,
+                                   &audit_info);
+}
+
+
+/**
+ * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
+ * @cmd: command/message
+ * @iface: the interface entry
+ * @addr4: the IPv4 address entry
+ * @addr6: the IPv6 address entry
+ * @arg: the netlbl_unlhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used to generate a response for a
+ * STATICLIST or STATICLISTDEF message.  When called either @addr4 or @addr6
+ * can be specified, not both, the other unspecified entry should be set to
+ * NULL by the caller.  Returns the size of the message on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticlist_gen(u32 cmd,
+                                      const struct netlbl_unlhsh_iface *iface,
+                                      const struct netlbl_unlhsh_addr4 *addr4,
+                                      const struct netlbl_unlhsh_addr6 *addr6,
+                                      void *arg)
+{
+       int ret_val = -ENOMEM;
+       struct netlbl_unlhsh_walk_arg *cb_arg = arg;
+       struct net_device *dev;
+       void *data;
+       u32 secid;
+       char *secctx;
+       u32 secctx_len;
+
+       data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
+                          cb_arg->seq, &netlbl_unlabel_gnl_family,
+                          NLM_F_MULTI, cmd);
+       if (data == NULL)
+               goto list_cb_failure;
+
+       if (iface->ifindex > 0) {
+               dev = dev_get_by_index(&init_net, iface->ifindex);
+               if (dev == NULL)
+                       goto list_cb_failure;
+               ret_val = nla_put_string(cb_arg->skb,
+                                        NLBL_UNLABEL_A_IFACE, dev->name);
+               dev_put(dev);
+               if (ret_val != 0)
+                       goto list_cb_failure;
+       }
+
+       if (addr4) {
+               struct in_addr addr_struct;
+
+               addr_struct.s_addr = addr4->addr;
+               ret_val = nla_put(cb_arg->skb,
+                                 NLBL_UNLABEL_A_IPV4ADDR,
+                                 sizeof(struct in_addr),
+                                 &addr_struct);
+               if (ret_val != 0)
+                       goto list_cb_failure;
+
+               addr_struct.s_addr = addr4->mask;
+               ret_val = nla_put(cb_arg->skb,
+                                 NLBL_UNLABEL_A_IPV4MASK,
+                                 sizeof(struct in_addr),
+                                 &addr_struct);
+               if (ret_val != 0)
+                       goto list_cb_failure;
+
+               secid = addr4->secid;
+       } else {
+               ret_val = nla_put(cb_arg->skb,
+                                 NLBL_UNLABEL_A_IPV6ADDR,
+                                 sizeof(struct in6_addr),
+                                 &addr6->addr);
+               if (ret_val != 0)
+                       goto list_cb_failure;
+
+               ret_val = nla_put(cb_arg->skb,
+                                 NLBL_UNLABEL_A_IPV6MASK,
+                                 sizeof(struct in6_addr),
+                                 &addr6->mask);
+               if (ret_val != 0)
+                       goto list_cb_failure;
+
+               secid = addr6->secid;
+       }
+
+       ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
+       if (ret_val != 0)
+               goto list_cb_failure;
+       ret_val = nla_put(cb_arg->skb,
+                         NLBL_UNLABEL_A_SECCTX,
+                         secctx_len,
+                         secctx);
+       security_release_secctx(secctx, secctx_len);
+       if (ret_val != 0)
+               goto list_cb_failure;
+
+       cb_arg->seq++;
+       return genlmsg_end(cb_arg->skb, data);
+
+list_cb_failure:
+       genlmsg_cancel(cb_arg->skb, data);
+       return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticlist - Handle a STATICLIST message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLIST message and dump the unlabeled
+ * connection hash table in a form suitable for use in a kernel generated
+ * STATICLIST message.  Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlist(struct sk_buff *skb,
+                                    struct netlink_callback *cb)
+{
+       struct netlbl_unlhsh_walk_arg cb_arg;
+       u32 skip_bkt = cb->args[0];
+       u32 skip_chain = cb->args[1];
+       u32 skip_addr4 = cb->args[2];
+       u32 skip_addr6 = cb->args[3];
+       u32 iter_bkt;
+       u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+       struct netlbl_unlhsh_iface *iface;
+       struct netlbl_unlhsh_addr4 *addr4;
+       struct netlbl_unlhsh_addr6 *addr6;
+
+       cb_arg.nl_cb = cb;
+       cb_arg.skb = skb;
+       cb_arg.seq = cb->nlh->nlmsg_seq;
+
+       rcu_read_lock();
+       for (iter_bkt = skip_bkt;
+            iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
+            iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
+               list_for_each_entry_rcu(iface,
+                               &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt],
+                               list) {
+                       if (!iface->valid ||
+                           iter_chain++ < skip_chain)
+                               continue;
+                       list_for_each_entry_rcu(addr4,
+                                               &iface->addr4_list,
+                                               list) {
+                               if (!addr4->valid || iter_addr4++ < skip_addr4)
+                                       continue;
+                               if (netlbl_unlabel_staticlist_gen(
+                                                    NLBL_UNLABEL_C_STATICLIST,
+                                                    iface,
+                                                    addr4,
+                                                    NULL,
+                                                    &cb_arg) < 0) {
+                                       iter_addr4--;
+                                       iter_chain--;
+                                       goto unlabel_staticlist_return;
+                               }
+                       }
+                       list_for_each_entry_rcu(addr6,
+                                               &iface->addr6_list,
+                                               list) {
+                               if (!addr6->valid || iter_addr6++ < skip_addr6)
+                                       continue;
+                               if (netlbl_unlabel_staticlist_gen(
+                                                    NLBL_UNLABEL_C_STATICLIST,
+                                                    iface,
+                                                    NULL,
+                                                    addr6,
+                                                    &cb_arg) < 0) {
+                                       iter_addr6--;
+                                       iter_chain--;
+                                       goto unlabel_staticlist_return;
+                               }
+                       }
+               }
+       }
+
+unlabel_staticlist_return:
+       rcu_read_unlock();
+       cb->args[0] = skip_bkt;
+       cb->args[1] = skip_chain;
+       cb->args[2] = skip_addr4;
+       cb->args[3] = skip_addr6;
+       return skb->len;
+}
+
+/**
+ * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLISTDEF message and dump the default
+ * unlabeled connection entry in a form suitable for use in a kernel generated
+ * STATICLISTDEF message.  Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
+                                       struct netlink_callback *cb)
+{
+       struct netlbl_unlhsh_walk_arg cb_arg;
+       struct netlbl_unlhsh_iface *iface;
+       u32 skip_addr4 = cb->args[0];
+       u32 skip_addr6 = cb->args[1];
+       u32 iter_addr4 = 0, iter_addr6 = 0;
+       struct netlbl_unlhsh_addr4 *addr4;
+       struct netlbl_unlhsh_addr6 *addr6;
+
+       cb_arg.nl_cb = cb;
+       cb_arg.skb = skb;
+       cb_arg.seq = cb->nlh->nlmsg_seq;
+
+       rcu_read_lock();
+       iface = rcu_dereference(netlbl_unlhsh_def);
+       if (iface == NULL || !iface->valid)
+               goto unlabel_staticlistdef_return;
+
+       list_for_each_entry_rcu(addr4, &iface->addr4_list, list) {
+               if (!addr4->valid || iter_addr4++ < skip_addr4)
+                       continue;
+               if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+                                          iface,
+                                          addr4,
+                                          NULL,
+                                          &cb_arg) < 0) {
+                       iter_addr4--;
+                       goto unlabel_staticlistdef_return;
+               }
+       }
+       list_for_each_entry_rcu(addr6, &iface->addr6_list, list) {
+               if (!addr6->valid || iter_addr6++ < skip_addr6)
+                       continue;
+               if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+                                          iface,
+                                          NULL,
+                                          addr6,
+                                          &cb_arg) < 0) {
+                       iter_addr6--;
+                       goto unlabel_staticlistdef_return;
+               }
+       }
+
+unlabel_staticlistdef_return:
+       rcu_read_unlock();
+       cb->args[0] = skip_addr4;
+       cb->args[1] = skip_addr6;
+       return skb->len;
+}
 
 /*
  * NetLabel Generic NETLINK Command Definitions
  */
 
+static struct genl_ops netlbl_unlabel_genl_c_staticadd = {
+       .cmd = NLBL_UNLABEL_C_STATICADD,
+       .flags = GENL_ADMIN_PERM,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = netlbl_unlabel_staticadd,
+       .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremove = {
+       .cmd = NLBL_UNLABEL_C_STATICREMOVE,
+       .flags = GENL_ADMIN_PERM,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = netlbl_unlabel_staticremove,
+       .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlist = {
+       .cmd = NLBL_UNLABEL_C_STATICLIST,
+       .flags = 0,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = NULL,
+       .dumpit = netlbl_unlabel_staticlist,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticadddef = {
+       .cmd = NLBL_UNLABEL_C_STATICADDDEF,
+       .flags = GENL_ADMIN_PERM,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = netlbl_unlabel_staticadddef,
+       .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = {
+       .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
+       .flags = GENL_ADMIN_PERM,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = netlbl_unlabel_staticremovedef,
+       .dumpit = NULL,
+};
+
+static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = {
+       .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
+       .flags = 0,
+       .policy = netlbl_unlabel_genl_policy,
+       .doit = NULL,
+       .dumpit = netlbl_unlabel_staticlistdef,
+};
+
 static struct genl_ops netlbl_unlabel_genl_c_accept = {
        .cmd = NLBL_UNLABEL_C_ACCEPT,
        .flags = GENL_ADMIN_PERM,
@@ -196,7 +1611,6 @@ static struct genl_ops netlbl_unlabel_genl_c_list = {
        .dumpit = NULL,
 };
 
-
 /*
  * NetLabel Generic NETLINK Protocol Functions
  */
@@ -217,6 +1631,36 @@ int netlbl_unlabel_genl_init(void)
        if (ret_val != 0)
                return ret_val;
 
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticadd);
+       if (ret_val != 0)
+               return ret_val;
+
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticremove);
+       if (ret_val != 0)
+               return ret_val;
+
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticlist);
+       if (ret_val != 0)
+               return ret_val;
+
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticadddef);
+       if (ret_val != 0)
+               return ret_val;
+
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticremovedef);
+       if (ret_val != 0)
+               return ret_val;
+
+       ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
+                                   &netlbl_unlabel_genl_c_staticlistdef);
+       if (ret_val != 0)
+               return ret_val;
+
        ret_val = genl_register_ops(&netlbl_unlabel_gnl_family,
                                    &netlbl_unlabel_genl_c_accept);
        if (ret_val != 0)
@@ -234,8 +1678,58 @@ int netlbl_unlabel_genl_init(void)
  * NetLabel KAPI Hooks
  */
 
+static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+       .notifier_call = netlbl_unlhsh_netdev_handler,
+};
+
+/**
+ * netlbl_unlabel_init - Initialize the unlabeled connection hash table
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the unlabeled connection hash table and registers a network
+ * device notification handler.  This function should only be called by the
+ * NetLabel subsystem itself during initialization.  Returns zero on success,
+ * non-zero values on error.
+ *
+ */
+int netlbl_unlabel_init(u32 size)
+{
+       u32 iter;
+       struct netlbl_unlhsh_tbl *hsh_tbl;
+
+       if (size == 0)
+               return -EINVAL;
+
+       hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+       if (hsh_tbl == NULL)
+               return -ENOMEM;
+       hsh_tbl->size = 1 << size;
+       hsh_tbl->tbl = kcalloc(hsh_tbl->size,
+                              sizeof(struct list_head),
+                              GFP_KERNEL);
+       if (hsh_tbl->tbl == NULL) {
+               kfree(hsh_tbl);
+               return -ENOMEM;
+       }
+       for (iter = 0; iter < hsh_tbl->size; iter++)
+               INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+       rcu_read_lock();
+       spin_lock(&netlbl_unlhsh_lock);
+       rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
+       spin_unlock(&netlbl_unlhsh_lock);
+       rcu_read_unlock();
+
+       register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
+
+       return 0;
+}
+
 /**
  * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
+ * @skb: the packet
+ * @family: protocol family
  * @secattr: the security attributes
  *
  * Description:
@@ -243,19 +1737,52 @@ int netlbl_unlabel_genl_init(void)
  * them in @secattr.  Returns zero on success and negative values on failure.
  *
  */
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr)
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+                          u16 family,
+                          struct netlbl_lsm_secattr *secattr)
 {
-       int ret_val;
+       struct iphdr *hdr4;
+       struct ipv6hdr *hdr6;
+       struct netlbl_unlhsh_addr4 *addr4;
+       struct netlbl_unlhsh_addr6 *addr6;
+       struct netlbl_unlhsh_iface *iface;
 
        rcu_read_lock();
-       if (netlabel_unlabel_acceptflg == 1) {
-               netlbl_secattr_init(secattr);
-               ret_val = 0;
-       } else
-               ret_val = -ENOMSG;
+       iface = netlbl_unlhsh_search_iface_def(skb->iif);
+       if (iface == NULL)
+               goto unlabel_getattr_nolabel;
+       switch (family) {
+       case PF_INET:
+               hdr4 = ip_hdr(skb);
+               addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface);
+               if (addr4 == NULL)
+                       goto unlabel_getattr_nolabel;
+               secattr->attr.secid = addr4->secid;
+               break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       case PF_INET6:
+               hdr6 = ipv6_hdr(skb);
+               addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface);
+               if (addr6 == NULL)
+                       goto unlabel_getattr_nolabel;
+               secattr->attr.secid = addr6->secid;
+               break;
+#endif /* IPv6 */
+       default:
+               goto unlabel_getattr_nolabel;
+       }
        rcu_read_unlock();
 
-       return ret_val;
+       secattr->flags |= NETLBL_SECATTR_SECID;
+       secattr->type = NETLBL_NLTYPE_UNLABELED;
+       return 0;
+
+unlabel_getattr_nolabel:
+       rcu_read_unlock();
+       if (netlabel_unlabel_acceptflg == 0)
+               return -ENOMSG;
+       secattr->type = NETLBL_NLTYPE_UNLABELED;
+       return 0;
 }
 
 /**
index c2917fbb42cf95f5dcaff8791949808879bad871..06b1301ac072fdf63ebb9c156d14ed3be69b1941 100644 (file)
 /*
  * The following NetLabel payloads are supported by the Unlabeled subsystem.
  *
+ * o STATICADD
+ *   This message is sent from an application to add a new static label for
+ *   incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVE
+ *   This message is sent from an application to remove an existing static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLIST
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLIST message.  When sent by an
+ *   application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with a series of the following messages.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICADDDEF
+ *   This message is sent from an application to set the default static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVEDEF
+ *   This message is sent from an application to remove the existing default
+ *   static label for incoming unlabeled connections.
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLISTDEF
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLISTDEF message.  When sent by
+ *   an application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with the following message.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
  * o ACCEPT
  *   This message is sent from an application to specify if the kernel should
  *   allow unlabeled packets to pass if they do not match any of the static
@@ -62,6 +172,12 @@ enum {
        NLBL_UNLABEL_C_UNSPEC,
        NLBL_UNLABEL_C_ACCEPT,
        NLBL_UNLABEL_C_LIST,
+       NLBL_UNLABEL_C_STATICADD,
+       NLBL_UNLABEL_C_STATICREMOVE,
+       NLBL_UNLABEL_C_STATICLIST,
+       NLBL_UNLABEL_C_STATICADDDEF,
+       NLBL_UNLABEL_C_STATICREMOVEDEF,
+       NLBL_UNLABEL_C_STATICLISTDEF,
        __NLBL_UNLABEL_C_MAX,
 };
 #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
@@ -73,6 +189,24 @@ enum {
        /* (NLA_U8)
         * if true then unlabeled packets are allowed to pass, else unlabeled
         * packets are rejected */
+       NLBL_UNLABEL_A_IPV6ADDR,
+       /* (NLA_BINARY, struct in6_addr)
+        * an IPv6 address */
+       NLBL_UNLABEL_A_IPV6MASK,
+       /* (NLA_BINARY, struct in6_addr)
+        * an IPv6 address mask */
+       NLBL_UNLABEL_A_IPV4ADDR,
+       /* (NLA_BINARY, struct in_addr)
+        * an IPv4 address */
+       NLBL_UNLABEL_A_IPV4MASK,
+       /* (NLA_BINARY, struct in_addr)
+        * an IPv4 address mask */
+       NLBL_UNLABEL_A_IFACE,
+       /* (NLA_NULL_STRING)
+        * network interface */
+       NLBL_UNLABEL_A_SECCTX,
+       /* (NLA_BINARY)
+        * a LSM specific security context */
        __NLBL_UNLABEL_A_MAX,
 };
 #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
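A STATICADD request such as the one documented in the comment block above can be built from user space with any generic netlink library. The sketch below uses libnl-3; the family name "NLBL_UNLBL", the numeric command/attribute values (copied from the enums above, since the kernel header is not exported), the interface name, the network and the SELinux context string are all illustrative assumptions, and error handling is omitted:

/* Build against libnl-3, e.g.:
 *   cc staticadd.c $(pkg-config --cflags --libs libnl-genl-3.0)
 * Needs CAP_NET_ADMIN because STATICADD is registered with GENL_ADMIN_PERM. */
#include <string.h>
#include <arpa/inet.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

/* Values mirror the enums added to netlabel_unlabeled.h in this patch;
 * the header itself is kernel-internal and not visible to user space. */
#define NLBL_UNLABEL_C_STATICADD        3
#define NLBL_UNLABEL_A_IPV4ADDR         4
#define NLBL_UNLABEL_A_IPV4MASK         5
#define NLBL_UNLABEL_A_IFACE            6
#define NLBL_UNLABEL_A_SECCTX           7

int main(void)
{
        /* Interface, network and security context are made-up examples. */
        const char *secctx = "system_u:object_r:unlabeled_t:s0";
        struct in_addr addr = { .s_addr = inet_addr("192.0.2.0") };
        struct in_addr mask = { .s_addr = inet_addr("255.255.255.0") };
        struct nl_sock *sk;
        struct nl_msg *msg;
        int family;

        sk = nl_socket_alloc();
        genl_connect(sk);
        /* The unlabeled subsystem is assumed to register as "NLBL_UNLBL". */
        family = genl_ctrl_resolve(sk, "NLBL_UNLBL");

        msg = nlmsg_alloc();
        genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
                    NLBL_UNLABEL_C_STATICADD, 1 /* NetLabel protocol version */);
        nla_put_string(msg, NLBL_UNLABEL_A_IFACE, "eth0");
        nla_put(msg, NLBL_UNLABEL_A_IPV4ADDR, sizeof(addr), &addr);
        nla_put(msg, NLBL_UNLABEL_A_IPV4MASK, sizeof(mask), &mask);
        nla_put(msg, NLBL_UNLABEL_A_SECCTX, strlen(secctx) + 1, secctx);

        nl_send_auto(sk, msg);
        nl_wait_for_ack(sk);

        nlmsg_free(msg);
        nl_socket_free(sk);
        return 0;
}

The other commands described above (STATICREMOVE, STATICADDDEF, STATICREMOVEDEF and the two list dumps) differ only in the command number, in which attributes are required, and in whether NLM_F_DUMP is set.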
@@ -80,8 +214,17 @@ enum {
 /* NetLabel protocol functions */
 int netlbl_unlabel_genl_init(void);
 
+/* Unlabeled connection hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_UNLHSH_BITSIZE       7
+
+/* General Unlabeled init function */
+int netlbl_unlabel_init(u32 size);
+
 /* Process Unlabeled incoming network packets */
-int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr);
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+                          u16 family,
+                          struct netlbl_lsm_secattr *secattr);
 
 /* Set the default configuration to allow Unlabeled packets */
 int netlbl_unlabel_defconf(void);
index 1ea27559b1deb43dbeb9613fc3c9c501ef6aa942..bcd9abdb031c49a4473b75294d44ae2b2d6c02ac 100644 (file)
@@ -51,6 +51,7 @@ rpcauth_register(const struct rpc_authops *ops)
        spin_unlock(&rpc_authflavor_lock);
        return ret;
 }
+EXPORT_SYMBOL_GPL(rpcauth_register);
 
 int
 rpcauth_unregister(const struct rpc_authops *ops)
@@ -68,6 +69,7 @@ rpcauth_unregister(const struct rpc_authops *ops)
        spin_unlock(&rpc_authflavor_lock);
        return ret;
 }
+EXPORT_SYMBOL_GPL(rpcauth_unregister);
 
 struct rpc_auth *
 rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
@@ -102,6 +104,7 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 out:
        return auth;
 }
+EXPORT_SYMBOL_GPL(rpcauth_create);
 
 void
 rpcauth_release(struct rpc_auth *auth)
@@ -151,6 +154,7 @@ rpcauth_init_credcache(struct rpc_auth *auth)
        auth->au_credcache = new;
        return 0;
 }
+EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
 
 /*
  * Destroy a list of credentials
@@ -213,6 +217,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
                kfree(cache);
        }
 }
+EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
 
 /*
  * Remove stale credentials. Avoid sleeping inside the loop.
@@ -332,6 +337,7 @@ found:
 out:
        return cred;
 }
+EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache);
 
 struct rpc_cred *
 rpcauth_lookupcred(struct rpc_auth *auth, int flags)
@@ -350,6 +356,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
        put_group_info(acred.group_info);
        return ret;
 }
+EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
 
 void
 rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
@@ -366,7 +373,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 #endif
        cred->cr_uid = acred->uid;
 }
-EXPORT_SYMBOL(rpcauth_init_cred);
+EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
 struct rpc_cred *
 rpcauth_bindcred(struct rpc_task *task)
@@ -378,6 +385,7 @@ rpcauth_bindcred(struct rpc_task *task)
                .group_info = current->group_info,
        };
        struct rpc_cred *ret;
+       sigset_t oldset;
        int flags = 0;
 
        dprintk("RPC: %5u looking up %s cred\n",
@@ -385,7 +393,9 @@ rpcauth_bindcred(struct rpc_task *task)
        get_group_info(acred.group_info);
        if (task->tk_flags & RPC_TASK_ROOTCREDS)
                flags |= RPCAUTH_LOOKUP_ROOTCREDS;
+       rpc_clnt_sigmask(task->tk_client, &oldset);
        ret = auth->au_ops->lookup_cred(auth, &acred, flags);
+       rpc_clnt_sigunmask(task->tk_client, &oldset);
        if (!IS_ERR(ret))
                task->tk_msg.rpc_cred = ret;
        else
@@ -435,6 +445,7 @@ need_lock:
 out_destroy:
        cred->cr_ops->crdestroy(cred);
 }
+EXPORT_SYMBOL_GPL(put_rpccred);
 
 void
 rpcauth_unbindcred(struct rpc_task *task)
index 1f2d85e869c0a2760de7641f7d3c125d80d54adb..6dac387922888def8781b2f6d6d104acab9dc97c 100644 (file)
@@ -472,16 +472,15 @@ gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
                char __user *dst, size_t buflen)
 {
        char *data = (char *)msg->data + msg->copied;
-       ssize_t mlen = msg->len;
-       ssize_t left;
+       size_t mlen = min(msg->len, buflen);
+       unsigned long left;
 
-       if (mlen > buflen)
-               mlen = buflen;
        left = copy_to_user(dst, data, mlen);
-       if (left < 0) {
-               msg->errno = left;
-               return left;
+       if (left == mlen) {
+               msg->errno = -EFAULT;
+               return -EFAULT;
        }
+
        mlen -= left;
        msg->copied += mlen;
        msg->errno = 0;
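
The rewritten gss_pipe_upcall() relies on the standard copy_to_user() convention: the return value is the number of bytes that could not be copied, never a negative errno, so the old "if (left < 0)" branch was dead code. A completely failed copy is therefore detected by comparing the return value against the requested length. A small illustrative sketch of that pattern (the helper name is hypothetical, not from this commit):

#include <linux/kernel.h>
#include <linux/uaccess.h>

/*
 * Illustrative only: copy up to @buflen bytes of @src to user space.
 * copy_to_user() reports how many bytes were left uncopied, so a
 * return value equal to the requested length means nothing at all was
 * transferred and the caller should see -EFAULT.
 */
static ssize_t example_copy_chunk(char __user *dst, const char *src,
				  size_t avail, size_t buflen)
{
	size_t want = min(avail, buflen);
	unsigned long left;

	left = copy_to_user(dst, src, want);
	if (left == want)
		return -EFAULT;		/* complete failure */

	return want - left;		/* bytes actually copied */
}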
index 76be83ee4b04a59f90f99a3afc69ce8889a7ff39..924916ceaa435b531b043b190cd006268492d28b 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/smp_lock.h>
 #include <linux/utsname.h>
 #include <linux/workqueue.h>
+#include <linux/in6.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -121,8 +122,9 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
        }
 }
 
-static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, struct rpc_program *program, u32 vers, rpc_authflavor_t flavor)
+static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
 {
+       struct rpc_program      *program = args->program;
        struct rpc_version      *version;
        struct rpc_clnt         *clnt = NULL;
        struct rpc_auth         *auth;
@@ -131,13 +133,13 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
 
        /* sanity check the name before trying to print it */
        err = -EINVAL;
-       len = strlen(servname);
+       len = strlen(args->servername);
        if (len > RPC_MAXNETNAMELEN)
                goto out_no_rpciod;
        len++;
 
        dprintk("RPC:       creating %s client for %s (xprt %p)\n",
-                       program->name, servname, xprt);
+                       program->name, args->servername, xprt);
 
        err = rpciod_up();
        if (err)
@@ -145,7 +147,11 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        err = -EINVAL;
        if (!xprt)
                goto out_no_xprt;
-       if (vers >= program->nrvers || !(version = program->version[vers]))
+
+       if (args->version >= program->nrvers)
+               goto out_err;
+       version = program->version[args->version];
+       if (version == NULL)
                goto out_err;
 
        err = -ENOMEM;
@@ -157,12 +163,12 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        clnt->cl_server = clnt->cl_inline_name;
        if (len > sizeof(clnt->cl_inline_name)) {
                char *buf = kmalloc(len, GFP_KERNEL);
-               if (buf != 0)
+               if (buf != NULL)
                        clnt->cl_server = buf;
                else
                        len = sizeof(clnt->cl_inline_name);
        }
-       strlcpy(clnt->cl_server, servname, len);
+       strlcpy(clnt->cl_server, args->servername, len);
 
        clnt->cl_xprt     = xprt;
        clnt->cl_procinfo = version->procs;
@@ -182,8 +188,15 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        if (!xprt_bound(clnt->cl_xprt))
                clnt->cl_autobind = 1;
 
+       clnt->cl_timeout = xprt->timeout;
+       if (args->timeout != NULL) {
+               memcpy(&clnt->cl_timeout_default, args->timeout,
+                               sizeof(clnt->cl_timeout_default));
+               clnt->cl_timeout = &clnt->cl_timeout_default;
+       }
+
        clnt->cl_rtt = &clnt->cl_rtt_default;
-       rpc_init_rtt(&clnt->cl_rtt_default, xprt->timeout.to_initval);
+       rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
 
        kref_init(&clnt->cl_kref);
 
@@ -191,10 +204,10 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s
        if (err < 0)
                goto out_no_path;
 
-       auth = rpcauth_create(flavor, clnt);
+       auth = rpcauth_create(args->authflavor, clnt);
        if (IS_ERR(auth)) {
                printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
-                               flavor);
+                               args->authflavor);
                err = PTR_ERR(auth);
                goto out_no_auth;
        }
@@ -245,9 +258,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
                .srcaddr = args->saddress,
                .dstaddr = args->address,
                .addrlen = args->addrsize,
-               .timeout = args->timeout
        };
-       char servername[20];
+       char servername[48];
 
        xprt = xprt_create_transport(&xprtargs);
        if (IS_ERR(xprt))
@@ -258,13 +270,34 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
         * up a string representation of the passed-in address.
         */
        if (args->servername == NULL) {
-               struct sockaddr_in *addr =
-                                       (struct sockaddr_in *) args->address;
-               snprintf(servername, sizeof(servername), NIPQUAD_FMT,
-                       NIPQUAD(addr->sin_addr.s_addr));
+               servername[0] = '\0';
+               switch (args->address->sa_family) {
+               case AF_INET: {
+                       struct sockaddr_in *sin =
+                                       (struct sockaddr_in *)args->address;
+                       snprintf(servername, sizeof(servername), NIPQUAD_FMT,
+                                NIPQUAD(sin->sin_addr.s_addr));
+                       break;
+               }
+               case AF_INET6: {
+                       struct sockaddr_in6 *sin =
+                                       (struct sockaddr_in6 *)args->address;
+                       snprintf(servername, sizeof(servername), NIP6_FMT,
+                                NIP6(sin->sin6_addr));
+                       break;
+               }
+               default:
+                       /* caller wants default server name, but
+                        * address family isn't recognized. */
+                       return ERR_PTR(-EINVAL);
+               }
                args->servername = servername;
        }
 
+       xprt = xprt_create_transport(&xprtargs);
+       if (IS_ERR(xprt))
+               return (struct rpc_clnt *)xprt;
+
        /*
         * By default, kernel RPC client connects from a reserved port.
         * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters,
@@ -275,8 +308,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
        if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
                xprt->resvport = 0;
 
-       clnt = rpc_new_client(xprt, args->servername, args->program,
-                               args->version, args->authflavor);
+       clnt = rpc_new_client(args, xprt);
        if (IS_ERR(clnt))
                return clnt;
 
@@ -322,7 +354,7 @@ rpc_clone_client(struct rpc_clnt *clnt)
        new->cl_autobind = 0;
        INIT_LIST_HEAD(&new->cl_tasks);
        spin_lock_init(&new->cl_lock);
-       rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
+       rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval);
        new->cl_metrics = rpc_alloc_iostats(clnt);
        if (new->cl_metrics == NULL)
                goto out_no_stats;
@@ -345,6 +377,7 @@ out_no_clnt:
        dprintk("RPC:       %s: returned error %d\n", __FUNCTION__, err);
        return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(rpc_clone_client);
 
 /*
  * Properly shut down an RPC client, terminating all outstanding
@@ -363,6 +396,7 @@ void rpc_shutdown_client(struct rpc_clnt *clnt)
 
        rpc_release_client(clnt);
 }
+EXPORT_SYMBOL_GPL(rpc_shutdown_client);
 
 /*
  * Free an RPC client
@@ -467,6 +501,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
 out:
        return clnt;
 }
+EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
 /*
  * Default callback for async RPC calls
@@ -498,12 +533,12 @@ static void rpc_save_sigmask(sigset_t *oldset, int intr)
        sigprocmask(SIG_BLOCK, &sigmask, oldset);
 }
 
-static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
+static void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
 {
        rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
 }
 
-static inline void rpc_restore_sigmask(sigset_t *oldset)
+static void rpc_restore_sigmask(sigset_t *oldset)
 {
        sigprocmask(SIG_SETMASK, oldset, NULL);
 }
@@ -512,45 +547,49 @@ void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
        rpc_save_sigmask(oldset, clnt->cl_intr);
 }
+EXPORT_SYMBOL_GPL(rpc_clnt_sigmask);
 
 void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
        rpc_restore_sigmask(oldset);
 }
+EXPORT_SYMBOL_GPL(rpc_clnt_sigunmask);
 
-static
-struct rpc_task *rpc_do_run_task(struct rpc_clnt *clnt,
-               struct rpc_message *msg,
-               int flags,
-               const struct rpc_call_ops *ops,
-               void *data)
+/**
+ * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
+ * @task_setup_data: pointer to task initialisation data
+ */
+struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 {
        struct rpc_task *task, *ret;
        sigset_t oldset;
 
-       task = rpc_new_task(clnt, flags, ops, data);
+       task = rpc_new_task(task_setup_data);
        if (task == NULL) {
-               rpc_release_calldata(ops, data);
-               return ERR_PTR(-ENOMEM);
+               rpc_release_calldata(task_setup_data->callback_ops,
+                               task_setup_data->callback_data);
+               ret = ERR_PTR(-ENOMEM);
+               goto out;
        }
 
-       /* Mask signals on synchronous RPC calls and RPCSEC_GSS upcalls */
-       rpc_task_sigmask(task, &oldset);
-       if (msg != NULL) {
-               rpc_call_setup(task, msg, 0);
-               if (task->tk_status != 0) {
-                       ret = ERR_PTR(task->tk_status);
-                       rpc_put_task(task);
-                       goto out;
-               }
+       if (task->tk_status != 0) {
+               ret = ERR_PTR(task->tk_status);
+               rpc_put_task(task);
+               goto out;
        }
        atomic_inc(&task->tk_count);
-       rpc_execute(task);
+       /* Mask signals on synchronous RPC calls and RPCSEC_GSS upcalls */
+       if (!RPC_IS_ASYNC(task)) {
+               rpc_task_sigmask(task, &oldset);
+               rpc_execute(task);
+               rpc_restore_sigmask(&oldset);
+       } else
+               rpc_execute(task);
        ret = task;
 out:
-       rpc_restore_sigmask(&oldset);
        return ret;
 }
+EXPORT_SYMBOL_GPL(rpc_run_task);
 
 /**
  * rpc_call_sync - Perform a synchronous RPC call
@@ -561,17 +600,24 @@ out:
 int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 {
        struct rpc_task *task;
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clnt,
+               .rpc_message = msg,
+               .callback_ops = &rpc_default_ops,
+               .flags = flags,
+       };
        int status;
 
        BUG_ON(flags & RPC_TASK_ASYNC);
 
-       task = rpc_do_run_task(clnt, msg, flags, &rpc_default_ops, NULL);
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        status = task->tk_status;
        rpc_put_task(task);
        return status;
 }
+EXPORT_SYMBOL_GPL(rpc_call_sync);
 
 /**
  * rpc_call_async - Perform an asynchronous RPC call
@@ -586,45 +632,28 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
               const struct rpc_call_ops *tk_ops, void *data)
 {
        struct rpc_task *task;
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clnt,
+               .rpc_message = msg,
+               .callback_ops = tk_ops,
+               .callback_data = data,
+               .flags = flags|RPC_TASK_ASYNC,
+       };
 
-       task = rpc_do_run_task(clnt, msg, flags|RPC_TASK_ASYNC, tk_ops, data);
+       task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        rpc_put_task(task);
        return 0;
 }
-
-/**
- * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
- * @clnt: pointer to RPC client
- * @flags: RPC flags
- * @ops: RPC call ops
- * @data: user call data
- */
-struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
-                                       const struct rpc_call_ops *tk_ops,
-                                       void *data)
-{
-       return rpc_do_run_task(clnt, NULL, flags, tk_ops, data);
-}
-EXPORT_SYMBOL(rpc_run_task);
+EXPORT_SYMBOL_GPL(rpc_call_async);
 
 void
-rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags)
+rpc_call_start(struct rpc_task *task)
 {
-       task->tk_msg   = *msg;
-       task->tk_flags |= flags;
-       /* Bind the user cred */
-       if (task->tk_msg.rpc_cred != NULL)
-               rpcauth_holdcred(task);
-       else
-               rpcauth_bindcred(task);
-
-       if (task->tk_status == 0)
-               task->tk_action = call_start;
-       else
-               task->tk_action = rpc_exit_task;
+       task->tk_action = call_start;
 }
+EXPORT_SYMBOL_GPL(rpc_call_start);
 
 /**
  * rpc_peeraddr - extract remote peer address from clnt's xprt
@@ -653,7 +682,8 @@ EXPORT_SYMBOL_GPL(rpc_peeraddr);
  * @format: address format
  *
  */
-char *rpc_peeraddr2str(struct rpc_clnt *clnt, enum rpc_display_format_t format)
+const char *rpc_peeraddr2str(struct rpc_clnt *clnt,
+                            enum rpc_display_format_t format)
 {
        struct rpc_xprt *xprt = clnt->cl_xprt;
 
@@ -671,6 +701,7 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize
        if (xprt->ops->set_buffer_size)
                xprt->ops->set_buffer_size(xprt, sndsize, rcvsize);
 }
+EXPORT_SYMBOL_GPL(rpc_setbufsize);
 
 /*
  * Return size of largest payload RPC client can support, in bytes
@@ -710,6 +741,7 @@ rpc_restart_call(struct rpc_task *task)
 
        task->tk_action = call_start;
 }
+EXPORT_SYMBOL_GPL(rpc_restart_call);
 
 /*
  * 0.  Initial state
@@ -1137,7 +1169,7 @@ call_status(struct rpc_task *task)
        case -ETIMEDOUT:
                task->tk_action = call_timeout;
                if (task->tk_client->cl_discrtry)
-                       xprt_disconnect(task->tk_xprt);
+                       xprt_force_disconnect(task->tk_xprt);
                break;
        case -ECONNREFUSED:
        case -ENOTCONN:
@@ -1260,7 +1292,7 @@ out_retry:
        req->rq_received = req->rq_private_buf.len = 0;
        task->tk_status = 0;
        if (task->tk_client->cl_discrtry)
-               xprt_disconnect(task->tk_xprt);
+               xprt_force_disconnect(task->tk_xprt);
 }
 
 /*
@@ -1517,9 +1549,15 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int
                .rpc_proc = &rpcproc_null,
                .rpc_cred = cred,
        };
-       return rpc_do_run_task(clnt, &msg, flags, &rpc_default_ops, NULL);
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = clnt,
+               .rpc_message = &msg,
+               .callback_ops = &rpc_default_ops,
+               .flags = flags,
+       };
+       return rpc_run_task(&task_setup_data);
 }
-EXPORT_SYMBOL(rpc_call_null);
+EXPORT_SYMBOL_GPL(rpc_call_null);
 
 #ifdef RPC_DEBUG
 void rpc_show_tasks(void)
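
With this change, callers no longer pass (clnt, flags, ops, data) and then call rpc_call_setup(); everything is collected in a struct rpc_task_setup and handed to rpc_run_task(), which binds the credential and executes the task. A hedged caller-side sketch using only the setup fields that appear in this diff (the surrounding function, message and ops names are hypothetical):

#include <linux/err.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>

/*
 * Sketch of an asynchronous call through the new interface.  Mirrors
 * what rpc_call_async() now does above: fill in a setup structure,
 * run the task, drop the extra task reference.
 */
static int example_start_async_call(struct rpc_clnt *clnt,
				    struct rpc_message *msg,
				    const struct rpc_call_ops *ops,
				    void *data)
{
	struct rpc_task_setup setup = {
		.rpc_client	= clnt,
		.rpc_message	= msg,
		.callback_ops	= ops,
		.callback_data	= data,
		.flags		= RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	task = rpc_run_task(&setup);
	if (IS_ERR(task))
		return PTR_ERR(task);
	rpc_put_task(task);
	return 0;
}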
index c59f3ca2b41b02627a017354f35e319366571633..7e197168a245118e8ed9a2a567d9fe9646fb7e22 100644 (file)
@@ -76,6 +76,16 @@ rpc_timeout_upcall_queue(struct work_struct *work)
        rpc_purge_list(rpci, &free_list, destroy_msg, -ETIMEDOUT);
 }
 
+/**
+ * rpc_queue_upcall
+ * @inode: inode of upcall pipe on which to queue given message
+ * @msg: message to queue
+ *
+ * Call with an @inode created by rpc_mkpipe() to queue an upcall.
+ * A userspace process may then later read the upcall by performing a
+ * read on an open file for this inode.  It is up to the caller to
+ * initialize the fields of @msg (other than @msg->list) appropriately.
+ */
 int
 rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 {
@@ -103,6 +113,7 @@ out:
        wake_up(&rpci->waitq);
        return res;
 }
+EXPORT_SYMBOL(rpc_queue_upcall);
 
 static inline void
 rpc_inode_setowner(struct inode *inode, void *private)
@@ -512,8 +523,8 @@ rpc_get_inode(struct super_block *sb, int mode)
 /*
  * FIXME: This probably has races.
  */
-static void
-rpc_depopulate(struct dentry *parent, int start, int eof)
+static void rpc_depopulate(struct dentry *parent,
+                          unsigned long start, unsigned long eof)
 {
        struct inode *dir = parent->d_inode;
        struct list_head *pos, *next;
@@ -663,7 +674,16 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
        return dentry;
 }
 
-
+/**
+ * rpc_mkdir - Create a new directory in rpc_pipefs
+ * @path: path from the rpc_pipefs root to the new directory
+ * @rpc_clnt: rpc client to associate with this directory
+ *
+ * This creates a directory at the given @path associated with
+ * @rpc_clnt, which will contain a file named "info" with some basic
+ * information about the client, together with any "pipes" that may
+ * later be created using rpc_mkpipe().
+ */
 struct dentry *
 rpc_mkdir(char *path, struct rpc_clnt *rpc_client)
 {
@@ -699,6 +719,10 @@ err_dput:
        goto out;
 }
 
+/**
+ * rpc_rmdir - Remove a directory created with rpc_mkdir()
+ * @dentry: directory to remove
+ */
 int
 rpc_rmdir(struct dentry *dentry)
 {
@@ -717,6 +741,25 @@ rpc_rmdir(struct dentry *dentry)
        return error;
 }
 
+/**
+ * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication
+ * @parent: dentry of directory to create new "pipe" in
+ * @name: name of pipe
+ * @private: private data to associate with the pipe, for the caller's use
+ * @ops: operations defining the behavior of the pipe: upcall, downcall,
+ *     release_pipe, and destroy_msg.
+ *
+ * Data is made available for userspace to read by calls to
+ * rpc_queue_upcall().  The actual reads will result in calls to
+ * @ops->upcall, which will be called with the file pointer,
+ * message, and userspace buffer to copy to.
+ *
+ * Writes can come at any time, and do not necessarily have to be
+ * responses to upcalls.  They will result in calls to @msg->downcall.
+ *
+ * The @private argument passed here will be available to all these methods
+ * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private.
+ */
 struct dentry *
 rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pipe_ops *ops, int flags)
 {
@@ -763,7 +806,16 @@ err_dput:
                        -ENOMEM);
        goto out;
 }
+EXPORT_SYMBOL(rpc_mkpipe);
 
+/**
+ * rpc_unlink - remove a pipe
+ * @dentry: dentry for the pipe, as returned from rpc_mkpipe
+ *
+ * After this call, lookups will no longer find the pipe, and any
+ * attempts to read or write using preexisting opens of the pipe will
+ * return -EPIPE.
+ */
 int
 rpc_unlink(struct dentry *dentry)
 {
@@ -785,6 +837,7 @@ rpc_unlink(struct dentry *dentry)
        dput(parent);
        return error;
 }
+EXPORT_SYMBOL(rpc_unlink);
 
 /*
  * populate the filesystem
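
The new kernel-doc above spells out the rpc_pipefs lifecycle: rpc_mkpipe() creates the pipe, rpc_queue_upcall() makes data readable from user space, and rpc_unlink() removes the pipe again. A minimal sketch of that sequence under stated assumptions (the pipe name, ops table and payload are hypothetical, and the message must stay valid until ops->destroy_msg() runs):

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/sunrpc/rpc_pipe_fs.h>

/* Create a pipe named "example" under an existing rpc_pipefs directory. */
static struct dentry *example_create_pipe(struct dentry *parent,
					  struct rpc_pipe_ops *my_ops)
{
	return rpc_mkpipe(parent, "example", NULL, my_ops, 0);
}

/*
 * Queue one upcall on the pipe.  The caller owns @msg and @payload;
 * both must remain valid until ops->destroy_msg() is invoked for this
 * message.  Only msg->list is left for the pipe code to manage, as the
 * kernel-doc above requires.
 */
static int example_send_upcall(struct dentry *pipe, struct rpc_pipe_msg *msg,
			       void *payload, size_t payload_len)
{
	memset(msg, 0, sizeof(*msg));
	msg->data = payload;
	msg->len = payload_len;

	return rpc_queue_upcall(pipe->d_inode, msg);
}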
index a05493aedb6800b11c91b31f1587dd229ca5387f..fa5b8f202d5b3a358e090f484cff3475d4fc228e 100644 (file)
@@ -54,45 +54,6 @@ enum {
 #define RPCB_HIGHPROC_3                RPCBPROC_TADDR2UADDR
 #define RPCB_HIGHPROC_4                RPCBPROC_GETSTAT
 
-/*
- * r_addr
- *
- * Quoting RFC 3530, section 2.2:
- *
- * For TCP over IPv4 and for UDP over IPv4, the format of r_addr is the
- * US-ASCII string:
- *
- *     h1.h2.h3.h4.p1.p2
- *
- * The prefix, "h1.h2.h3.h4", is the standard textual form for
- * representing an IPv4 address, which is always four octets long.
- * Assuming big-endian ordering, h1, h2, h3, and h4, are respectively,
- * the first through fourth octets each converted to ASCII-decimal.
- * Assuming big-endian ordering, p1 and p2 are, respectively, the first
- * and second octets each converted to ASCII-decimal.  For example, if a
- * host, in big-endian order, has an address of 0x0A010307 and there is
- * a service listening on, in big endian order, port 0x020F (decimal
- * 527), then the complete universal address is "10.1.3.7.2.15".
- *
- * ...
- *
- * For TCP over IPv6 and for UDP over IPv6, the format of r_addr is the
- * US-ASCII string:
- *
- *     x1:x2:x3:x4:x5:x6:x7:x8.p1.p2
- *
- * The suffix "p1.p2" is the service port, and is computed the same way
- * as with universal addresses for TCP and UDP over IPv4.  The prefix,
- * "x1:x2:x3:x4:x5:x6:x7:x8", is the standard textual form for
- * representing an IPv6 address as defined in Section 2.2 of [RFC2373].
- * Additionally, the two alternative forms specified in Section 2.2 of
- * [RFC2373] are also acceptable.
- *
- * XXX: Currently this implementation does not explicitly convert the
- *      stored address to US-ASCII on non-ASCII systems.
- */
-#define RPCB_MAXADDRLEN                (128u)
-
 /*
  * r_owner
  *
@@ -112,9 +73,9 @@ struct rpcbind_args {
        u32                     r_vers;
        u32                     r_prot;
        unsigned short          r_port;
-       char *                  r_netid;
-       char                    r_addr[RPCB_MAXADDRLEN];
-       char *                  r_owner;
+       const char *            r_netid;
+       const char *            r_addr;
+       const char *            r_owner;
 };
 
 static struct rpc_procinfo rpcb_procedures2[];
@@ -128,19 +89,6 @@ struct rpcb_info {
 static struct rpcb_info rpcb_next_version[];
 static struct rpcb_info rpcb_next_version6[];
 
-static void rpcb_getport_prepare(struct rpc_task *task, void *calldata)
-{
-       struct rpcbind_args *map = calldata;
-       struct rpc_xprt *xprt = map->r_xprt;
-       struct rpc_message msg = {
-               .rpc_proc       = rpcb_next_version[xprt->bind_index].rpc_proc,
-               .rpc_argp       = map,
-               .rpc_resp       = &map->r_port,
-       };
-
-       rpc_call_setup(task, &msg, 0);
-}
-
 static void rpcb_map_release(void *data)
 {
        struct rpcbind_args *map = data;
@@ -150,7 +98,6 @@ static void rpcb_map_release(void *data)
 }
 
 static const struct rpc_call_ops rpcb_getport_ops = {
-       .rpc_call_prepare       = rpcb_getport_prepare,
        .rpc_call_done          = rpcb_getport_done,
        .rpc_release            = rpcb_map_release,
 };
@@ -162,12 +109,13 @@ static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status)
 }
 
 static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
-                                       int proto, int version, int privileged)
+                                   size_t salen, int proto, u32 version,
+                                   int privileged)
 {
        struct rpc_create_args args = {
                .protocol       = proto,
                .address        = srvaddr,
-               .addrsize       = sizeof(struct sockaddr_in),
+               .addrsize       = salen,
                .servername     = hostname,
                .program        = &rpcb_program,
                .version        = version,
@@ -230,7 +178,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
                        prog, vers, prot, port);
 
        rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin,
-                                       XPRT_TRANSPORT_UDP, 2, 1);
+                               sizeof(sin), XPRT_TRANSPORT_UDP, 2, 1);
        if (IS_ERR(rpcb_clnt))
                return PTR_ERR(rpcb_clnt);
 
@@ -252,13 +200,15 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
  * @vers: RPC version number to bind
  * @prot: transport protocol to use to make this request
  *
+ * Return value is the requested advertised port number,
+ * or a negative errno value.
+ *
  * Called from outside the RPC client in a synchronous task context.
  * Uses default timeout parameters specified by underlying transport.
  *
- * XXX: Needs to support IPv6, and rpcbind versions 3 and 4
+ * XXX: Needs to support IPv6
  */
-int rpcb_getport_sync(struct sockaddr_in *sin, __u32 prog,
-                     __u32 vers, int prot)
+int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot)
 {
        struct rpcbind_args map = {
                .r_prog         = prog,
@@ -272,14 +222,13 @@ int rpcb_getport_sync(struct sockaddr_in *sin, __u32 prog,
                .rpc_resp       = &map.r_port,
        };
        struct rpc_clnt *rpcb_clnt;
-       char hostname[40];
        int status;
 
        dprintk("RPC:       %s(" NIPQUAD_FMT ", %u, %u, %d)\n",
                __FUNCTION__, NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot);
 
-       sprintf(hostname, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
-       rpcb_clnt = rpcb_create(hostname, (struct sockaddr *)sin, prot, 2, 0);
+       rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin,
+                               sizeof(*sin), prot, 2, 0);
        if (IS_ERR(rpcb_clnt))
                return PTR_ERR(rpcb_clnt);
 
@@ -295,6 +244,24 @@ int rpcb_getport_sync(struct sockaddr_in *sin, __u32 prog,
 }
 EXPORT_SYMBOL_GPL(rpcb_getport_sync);
 
+static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, int version)
+{
+       struct rpc_message msg = {
+               .rpc_proc = rpcb_next_version[version].rpc_proc,
+               .rpc_argp = map,
+               .rpc_resp = &map->r_port,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = rpcb_clnt,
+               .rpc_message = &msg,
+               .callback_ops = &rpcb_getport_ops,
+               .callback_data = map,
+               .flags = RPC_TASK_ASYNC,
+       };
+
+       return rpc_run_task(&task_setup_data);
+}
+
 /**
  * rpcb_getport_async - obtain the port for a given RPC service on a given host
  * @task: task that is waiting for portmapper request
@@ -305,12 +272,14 @@ EXPORT_SYMBOL_GPL(rpcb_getport_sync);
 void rpcb_getport_async(struct rpc_task *task)
 {
        struct rpc_clnt *clnt = task->tk_client;
-       int bind_version;
+       u32 bind_version;
        struct rpc_xprt *xprt = task->tk_xprt;
        struct rpc_clnt *rpcb_clnt;
        static struct rpcbind_args *map;
        struct rpc_task *child;
-       struct sockaddr addr;
+       struct sockaddr_storage addr;
+       struct sockaddr *sap = (struct sockaddr *)&addr;
+       size_t salen;
        int status;
        struct rpcb_info *info;
 
@@ -340,10 +309,10 @@ void rpcb_getport_async(struct rpc_task *task)
                goto bailout_nofree;
        }
 
-       rpc_peeraddr(clnt, (void *)&addr, sizeof(addr));
+       salen = rpc_peeraddr(clnt, sap, sizeof(addr));
 
        /* Don't ever use rpcbind v2 for AF_INET6 requests */
-       switch (addr.sa_family) {
+       switch (sap->sa_family) {
        case AF_INET:
                info = rpcb_next_version;
                break;
@@ -368,7 +337,7 @@ void rpcb_getport_async(struct rpc_task *task)
        dprintk("RPC: %5u %s: trying rpcbind version %u\n",
                task->tk_pid, __FUNCTION__, bind_version);
 
-       rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot,
+       rpcb_clnt = rpcb_create(clnt->cl_server, sap, salen, xprt->prot,
                                bind_version, 0);
        if (IS_ERR(rpcb_clnt)) {
                status = PTR_ERR(rpcb_clnt);
@@ -390,12 +359,10 @@ void rpcb_getport_async(struct rpc_task *task)
        map->r_port = 0;
        map->r_xprt = xprt_get(xprt);
        map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
-       memcpy(map->r_addr,
-              rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR),
-              sizeof(map->r_addr));
+       map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR);
        map->r_owner = RPCB_OWNER_STRING;       /* ignored for GETADDR */
 
-       child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map);
+       child = rpcb_call_async(rpcb_clnt, map, xprt->bind_index);
        rpc_release_client(rpcb_clnt);
        if (IS_ERR(child)) {
                status = -EIO;
@@ -518,7 +485,7 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
         * Simple sanity check.  The smallest possible universal
         * address is an IPv4 address string containing 11 bytes.
         */
-       if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN)
+       if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN)
                goto out_err;
 
        /*
@@ -569,7 +536,7 @@ out_err:
 #define RPCB_boolean_sz                (1u)
 
 #define RPCB_netid_sz          (1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
-#define RPCB_addr_sz           (1+XDR_QUADLEN(RPCB_MAXADDRLEN))
+#define RPCB_addr_sz           (1+XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
 #define RPCB_ownerstring_sz    (1+XDR_QUADLEN(RPCB_MAXOWNERLEN))
 
 #define RPCB_mappingargs_sz    RPCB_program_sz+RPCB_version_sz+        \
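
rpcb_getport_sync() keeps its synchronous semantics but now documents its return value explicitly: the advertised port on success, or a negative errno. A hedged usage sketch; the program/version pair (100003/3) and the caller are illustrative only, not taken from this commit:

#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/sunrpc/clnt.h>

/*
 * Sketch only: ask the remote rpcbind (v2) service which port the
 * given RPC program is advertised on.
 */
static int example_lookup_port(struct sockaddr_in *server)
{
	int port;

	/* Returns the advertised port, or a negative errno on failure. */
	port = rpcb_getport_sync(server, 100003, 3, IPPROTO_TCP);
	if (port < 0)
		pr_debug("rpcbind lookup failed: %d\n", port);

	return port;
}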
index eed5dd9819cde8ddc48e147954852d515a77f7ef..40ce6f6672d6bb8b735d2ed379dff9add6ff087b 100644 (file)
@@ -45,7 +45,7 @@ static void                    rpc_release_task(struct rpc_task *task);
 /*
  * RPC tasks sit here while waiting for conditions to improve.
  */
-static RPC_WAITQ(delay_queue, "delayq");
+static struct rpc_wait_queue delay_queue;
 
 /*
  * rpciod-related stuff
@@ -135,7 +135,7 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct r
        if (unlikely(task->tk_priority > queue->maxpriority))
                q = &queue->tasks[queue->maxpriority];
        list_for_each_entry(t, q, u.tk_wait.list) {
-               if (t->tk_cookie == task->tk_cookie) {
+               if (t->tk_owner == task->tk_owner) {
                        list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
                        return;
                }
@@ -208,26 +208,26 @@ static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int
        queue->count = 1 << (priority * 2);
 }
 
-static inline void rpc_set_waitqueue_cookie(struct rpc_wait_queue *queue, unsigned long cookie)
+static inline void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
 {
-       queue->cookie = cookie;
+       queue->owner = pid;
        queue->nr = RPC_BATCH_COUNT;
 }
 
 static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
 {
        rpc_set_waitqueue_priority(queue, queue->maxpriority);
-       rpc_set_waitqueue_cookie(queue, 0);
+       rpc_set_waitqueue_owner(queue, 0);
 }
 
-static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, int maxprio)
+static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
 {
        int i;
 
        spin_lock_init(&queue->lock);
        for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)
                INIT_LIST_HEAD(&queue->tasks[i]);
-       queue->maxpriority = maxprio;
+       queue->maxpriority = nr_queues - 1;
        rpc_reset_waitqueue_priority(queue);
 #ifdef RPC_DEBUG
        queue->name = qname;
@@ -236,14 +236,14 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
 
 void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname)
 {
-       __rpc_init_priority_wait_queue(queue, qname, RPC_PRIORITY_HIGH);
+       __rpc_init_priority_wait_queue(queue, qname, RPC_NR_PRIORITY);
 }
 
 void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname)
 {
-       __rpc_init_priority_wait_queue(queue, qname, 0);
+       __rpc_init_priority_wait_queue(queue, qname, 1);
 }
-EXPORT_SYMBOL(rpc_init_wait_queue);
+EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
 
 static int rpc_wait_bit_interruptible(void *word)
 {
@@ -303,7 +303,7 @@ int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *))
        return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
                        action, TASK_INTERRUPTIBLE);
 }
-EXPORT_SYMBOL(__rpc_wait_for_completion_task);
+EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
 
 /*
  * Make an RPC task runnable.
@@ -373,6 +373,7 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
        __rpc_sleep_on(q, task, action, timer);
        spin_unlock_bh(&q->lock);
 }
+EXPORT_SYMBOL_GPL(rpc_sleep_on);
 
 /**
  * __rpc_do_wake_up_task - wake up a single rpc_task
@@ -444,6 +445,7 @@ void rpc_wake_up_task(struct rpc_task *task)
        }
        rcu_read_unlock_bh();
 }
+EXPORT_SYMBOL_GPL(rpc_wake_up_task);
 
 /*
  * Wake up the next task on a priority queue.
@@ -454,12 +456,12 @@ static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queu
        struct rpc_task *task;
 
        /*
-        * Service a batch of tasks from a single cookie.
+        * Service a batch of tasks from a single owner.
         */
        q = &queue->tasks[queue->priority];
        if (!list_empty(q)) {
                task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
-               if (queue->cookie == task->tk_cookie) {
+               if (queue->owner == task->tk_owner) {
                        if (--queue->nr)
                                goto out;
                        list_move_tail(&task->u.tk_wait.list, q);
@@ -468,7 +470,7 @@ static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queu
                 * Check if we need to switch queues.
                 */
                if (--queue->count)
-                       goto new_cookie;
+                       goto new_owner;
        }
 
        /*
@@ -490,8 +492,8 @@ static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queu
 
 new_queue:
        rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
-new_cookie:
-       rpc_set_waitqueue_cookie(queue, task->tk_cookie);
+new_owner:
+       rpc_set_waitqueue_owner(queue, task->tk_owner);
 out:
        __rpc_wake_up_task(task);
        return task;
@@ -519,6 +521,7 @@ struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue)
 
        return task;
 }
+EXPORT_SYMBOL_GPL(rpc_wake_up_next);
 
 /**
  * rpc_wake_up - wake up all rpc_tasks
@@ -544,6 +547,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
        spin_unlock(&queue->lock);
        rcu_read_unlock_bh();
 }
+EXPORT_SYMBOL_GPL(rpc_wake_up);
 
 /**
  * rpc_wake_up_status - wake up all rpc_tasks and set their status value.
@@ -572,6 +576,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
        spin_unlock(&queue->lock);
        rcu_read_unlock_bh();
 }
+EXPORT_SYMBOL_GPL(rpc_wake_up_status);
 
 static void __rpc_atrun(struct rpc_task *task)
 {
@@ -586,6 +591,7 @@ void rpc_delay(struct rpc_task *task, unsigned long delay)
        task->tk_timeout = delay;
        rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun);
 }
+EXPORT_SYMBOL_GPL(rpc_delay);
 
 /*
  * Helper to call task->tk_ops->rpc_call_prepare
@@ -614,7 +620,7 @@ void rpc_exit_task(struct rpc_task *task)
                }
        }
 }
-EXPORT_SYMBOL(rpc_exit_task);
+EXPORT_SYMBOL_GPL(rpc_exit_task);
 
 void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
 {
@@ -808,39 +814,49 @@ EXPORT_SYMBOL_GPL(rpc_free);
 /*
  * Creation and deletion of RPC task structures
  */
-void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata)
+static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
 {
        memset(task, 0, sizeof(*task));
        setup_timer(&task->tk_timer, (void (*)(unsigned long))rpc_run_timer,
                        (unsigned long)task);
        atomic_set(&task->tk_count, 1);
-       task->tk_client = clnt;
-       task->tk_flags  = flags;
-       task->tk_ops = tk_ops;
-       if (tk_ops->rpc_call_prepare != NULL)
-               task->tk_action = rpc_prepare_task;
-       task->tk_calldata = calldata;
+       task->tk_flags  = task_setup_data->flags;
+       task->tk_ops = task_setup_data->callback_ops;
+       task->tk_calldata = task_setup_data->callback_data;
        INIT_LIST_HEAD(&task->tk_task);
 
        /* Initialize retry counters */
        task->tk_garb_retry = 2;
        task->tk_cred_retry = 2;
 
-       task->tk_priority = RPC_PRIORITY_NORMAL;
-       task->tk_cookie = (unsigned long)current;
+       task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
+       task->tk_owner = current->tgid;
 
        /* Initialize workqueue for async tasks */
        task->tk_workqueue = rpciod_workqueue;
 
-       if (clnt) {
-               kref_get(&clnt->cl_kref);
-               if (clnt->cl_softrtry)
+       task->tk_client = task_setup_data->rpc_client;
+       if (task->tk_client != NULL) {
+               kref_get(&task->tk_client->cl_kref);
+               if (task->tk_client->cl_softrtry)
                        task->tk_flags |= RPC_TASK_SOFT;
-               if (!clnt->cl_intr)
+               if (!task->tk_client->cl_intr)
                        task->tk_flags |= RPC_TASK_NOINTR;
        }
 
-       BUG_ON(task->tk_ops == NULL);
+       if (task->tk_ops->rpc_call_prepare != NULL)
+               task->tk_action = rpc_prepare_task;
+
+       if (task_setup_data->rpc_message != NULL) {
+               memcpy(&task->tk_msg, task_setup_data->rpc_message, sizeof(task->tk_msg));
+               /* Bind the user cred */
+               if (task->tk_msg.rpc_cred != NULL)
+                       rpcauth_holdcred(task);
+               else
+                       rpcauth_bindcred(task);
+               if (task->tk_action == NULL)
+                       rpc_call_start(task);
+       }
 
        /* starting timestamp */
        task->tk_start = jiffies;
@@ -865,18 +881,22 @@ static void rpc_free_task(struct rcu_head *rcu)
 /*
  * Create a new task for the specified client.
  */
-struct rpc_task *rpc_new_task(struct rpc_clnt *clnt, int flags, const struct rpc_call_ops *tk_ops, void *calldata)
+struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
 {
-       struct rpc_task *task;
-
-       task = rpc_alloc_task();
-       if (!task)
-               goto out;
+       struct rpc_task *task = setup_data->task;
+       unsigned short flags = 0;
+
+       if (task == NULL) {
+               task = rpc_alloc_task();
+               if (task == NULL)
+                       goto out;
+               flags = RPC_TASK_DYNAMIC;
+       }
 
-       rpc_init_task(task, clnt, flags, tk_ops, calldata);
+       rpc_init_task(task, setup_data);
 
+       task->tk_flags |= flags;
        dprintk("RPC:       allocated task %p\n", task);
-       task->tk_flags |= RPC_TASK_DYNAMIC;
 out:
        return task;
 }
@@ -902,7 +922,7 @@ void rpc_put_task(struct rpc_task *task)
                call_rcu_bh(&task->u.tk_rcu, rpc_free_task);
        rpc_release_calldata(tk_ops, calldata);
 }
-EXPORT_SYMBOL(rpc_put_task);
+EXPORT_SYMBOL_GPL(rpc_put_task);
 
 static void rpc_release_task(struct rpc_task *task)
 {
@@ -959,6 +979,7 @@ void rpc_killall_tasks(struct rpc_clnt *clnt)
        }
        spin_unlock(&clnt->cl_lock);
 }
+EXPORT_SYMBOL_GPL(rpc_killall_tasks);
 
 int rpciod_up(void)
 {
@@ -1038,6 +1059,11 @@ rpc_init_mempool(void)
                goto err_nomem;
        if (!rpciod_start())
                goto err_nomem;
+       /*
+        * The following is not strictly a mempool initialisation,
+        * but there is no harm in doing it here
+        */
+       rpc_init_wait_queue(&delay_queue, "delayq");
        return 0;
 err_nomem:
        rpc_destroy_mempool();
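
delay_queue loses its static RPC_WAITQ() initialiser and is instead set up at run time with the rpc_init_wait_queue() call added above. A hedged sketch of the same declare-then-initialise shape for a hypothetical queue and module init function:

#include <linux/init.h>
#include <linux/sunrpc/sched.h>

/*
 * Sketch only: a wait queue declared uninitialised and set up at run
 * time, mirroring the delay_queue change above.  The queue name and
 * init function are assumptions.
 */
static struct rpc_wait_queue example_queue;

static int __init example_module_init(void)
{
	rpc_init_wait_queue(&example_queue, "exampleq");
	return 0;
}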
index 97ac45f034d6f35155537adc7face2cd47791a80..a661a3acb37e682dba442b98ec1c9729a75f7f5f 100644 (file)
@@ -72,7 +72,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
        struct page     **ppage = xdr->pages;
        unsigned int    len, pglen = xdr->page_len;
        ssize_t         copied = 0;
-       int             ret;
+       size_t          ret;
 
        len = xdr->head[0].iov_len;
        if (base < len) {
index 4d4f3738b6887837ec4327c73ed07e50bdc69784..74df2d358e61ba5eb77a806a2cb1d58e163dd7de 100644 (file)
@@ -118,7 +118,7 @@ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
        new = kcalloc(clnt->cl_maxproc, sizeof(struct rpc_iostats), GFP_KERNEL);
        return new;
 }
-EXPORT_SYMBOL(rpc_alloc_iostats);
+EXPORT_SYMBOL_GPL(rpc_alloc_iostats);
 
 /**
  * rpc_free_iostats - release an rpc_iostats structure
@@ -129,7 +129,7 @@ void rpc_free_iostats(struct rpc_iostats *stats)
 {
        kfree(stats);
 }
-EXPORT_SYMBOL(rpc_free_iostats);
+EXPORT_SYMBOL_GPL(rpc_free_iostats);
 
 /**
  * rpc_count_iostats - tally up per-task stats
@@ -215,7 +215,7 @@ void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
                                metrics->om_execute * MILLISECS_PER_JIFFY);
        }
 }
-EXPORT_SYMBOL(rpc_print_iostats);
+EXPORT_SYMBOL_GPL(rpc_print_iostats);
 
 /*
  * Register/unregister RPC proc files
@@ -241,12 +241,14 @@ rpc_proc_register(struct rpc_stat *statp)
 {
        return do_register(statp->program->name, statp, &rpc_proc_fops);
 }
+EXPORT_SYMBOL_GPL(rpc_proc_register);
 
 void
 rpc_proc_unregister(const char *name)
 {
        remove_proc_entry(name, proc_net_rpc);
 }
+EXPORT_SYMBOL_GPL(rpc_proc_unregister);
 
 struct proc_dir_entry *
 svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
index 33d89e842c8506ead293a7ac8460c71e8b392fb0..1a7e309d008bcd9624dff48195c8052c515a3791 100644 (file)
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/xprtsock.h>
 
-/* RPC scheduler */
-EXPORT_SYMBOL(rpc_execute);
-EXPORT_SYMBOL(rpc_init_task);
-EXPORT_SYMBOL(rpc_sleep_on);
-EXPORT_SYMBOL(rpc_wake_up_next);
-EXPORT_SYMBOL(rpc_wake_up_task);
-EXPORT_SYMBOL(rpc_wake_up_status);
-
-/* RPC client functions */
-EXPORT_SYMBOL(rpc_clone_client);
-EXPORT_SYMBOL(rpc_bind_new_program);
-EXPORT_SYMBOL(rpc_shutdown_client);
-EXPORT_SYMBOL(rpc_killall_tasks);
-EXPORT_SYMBOL(rpc_call_sync);
-EXPORT_SYMBOL(rpc_call_async);
-EXPORT_SYMBOL(rpc_call_setup);
-EXPORT_SYMBOL(rpc_clnt_sigmask);
-EXPORT_SYMBOL(rpc_clnt_sigunmask);
-EXPORT_SYMBOL(rpc_delay);
-EXPORT_SYMBOL(rpc_restart_call);
-EXPORT_SYMBOL(rpc_setbufsize);
-EXPORT_SYMBOL(rpc_unlink);
-EXPORT_SYMBOL(rpc_wake_up);
-EXPORT_SYMBOL(rpc_queue_upcall);
-EXPORT_SYMBOL(rpc_mkpipe);
-
-/* Client transport */
-EXPORT_SYMBOL(xprt_set_timeout);
-
-/* Client credential cache */
-EXPORT_SYMBOL(rpcauth_register);
-EXPORT_SYMBOL(rpcauth_unregister);
-EXPORT_SYMBOL(rpcauth_create);
-EXPORT_SYMBOL(rpcauth_lookupcred);
-EXPORT_SYMBOL(rpcauth_lookup_credcache);
-EXPORT_SYMBOL(rpcauth_destroy_credcache);
-EXPORT_SYMBOL(rpcauth_init_credcache);
-EXPORT_SYMBOL(put_rpccred);
-
 /* RPC server stuff */
 EXPORT_SYMBOL(svc_create);
 EXPORT_SYMBOL(svc_create_thread);
@@ -81,8 +42,6 @@ EXPORT_SYMBOL(svc_set_client);
 
 /* RPC statistics */
 #ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(rpc_proc_register);
-EXPORT_SYMBOL(rpc_proc_unregister);
 EXPORT_SYMBOL(svc_proc_register);
 EXPORT_SYMBOL(svc_proc_unregister);
 EXPORT_SYMBOL(svc_seq_show);
@@ -105,31 +64,6 @@ EXPORT_SYMBOL(qword_get);
 EXPORT_SYMBOL(svcauth_unix_purge);
 EXPORT_SYMBOL(unix_domain_find);
 
-/* Generic XDR */
-EXPORT_SYMBOL(xdr_encode_string);
-EXPORT_SYMBOL(xdr_decode_string_inplace);
-EXPORT_SYMBOL(xdr_decode_netobj);
-EXPORT_SYMBOL(xdr_encode_netobj);
-EXPORT_SYMBOL(xdr_encode_pages);
-EXPORT_SYMBOL(xdr_inline_pages);
-EXPORT_SYMBOL(xdr_shift_buf);
-EXPORT_SYMBOL(xdr_encode_word);
-EXPORT_SYMBOL(xdr_decode_word);
-EXPORT_SYMBOL(xdr_encode_array2);
-EXPORT_SYMBOL(xdr_decode_array2);
-EXPORT_SYMBOL(xdr_buf_from_iov);
-EXPORT_SYMBOL(xdr_buf_subsegment);
-EXPORT_SYMBOL(xdr_buf_read_netobj);
-EXPORT_SYMBOL(read_bytes_from_xdr_buf);
-
-/* Debugging symbols */
-#ifdef RPC_DEBUG
-EXPORT_SYMBOL(rpc_debug);
-EXPORT_SYMBOL(nfs_debug);
-EXPORT_SYMBOL(nfsd_debug);
-EXPORT_SYMBOL(nlm_debug);
-#endif
-
 extern struct cache_detail ip_map_cache, unix_gid_cache;
 
 static int __init
index a4a6bf7deaa494407cff505ffc00057b6f25c8bd..4ad5fbbb18b48df0d2ad9a0b457212a6c764330d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/xdr.h>
index 2be714e9b382742be19d7c628f292500d65f77ce..bada7de0c2fcd6a216f44817cb67e03f43b736b5 100644 (file)
  * Declare the debug flags here
  */
 unsigned int   rpc_debug;
+EXPORT_SYMBOL_GPL(rpc_debug);
+
 unsigned int   nfs_debug;
+EXPORT_SYMBOL_GPL(nfs_debug);
+
 unsigned int   nfsd_debug;
+EXPORT_SYMBOL_GPL(nfsd_debug);
+
 unsigned int   nlm_debug;
+EXPORT_SYMBOL_GPL(nlm_debug);
 
 #ifdef RPC_DEBUG
 
index fdc5e6d7562b72e6779308306feb7cedb14ba1bc..54264062ea695d59f85da1d712f53cd0ff45952a 100644 (file)
@@ -28,6 +28,7 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
        memcpy(p, obj->data, obj->len);
        return p + XDR_QUADLEN(obj->len);
 }
+EXPORT_SYMBOL(xdr_encode_netobj);
 
 __be32 *
 xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
@@ -40,6 +41,7 @@ xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
        obj->data = (u8 *) p;
        return p + XDR_QUADLEN(len);
 }
+EXPORT_SYMBOL(xdr_decode_netobj);
 
 /**
  * xdr_encode_opaque_fixed - Encode fixed length opaque data
@@ -91,6 +93,7 @@ xdr_encode_string(__be32 *p, const char *string)
 {
        return xdr_encode_array(p, string, strlen(string));
 }
+EXPORT_SYMBOL(xdr_encode_string);
 
 __be32 *
 xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen)
@@ -103,6 +106,7 @@ xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen)
        *sp = (char *) p;
        return p + XDR_QUADLEN(len);
 }
+EXPORT_SYMBOL(xdr_decode_string_inplace);
 
 void
 xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
@@ -130,6 +134,7 @@ xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
        xdr->buflen += len;
        xdr->len += len;
 }
+EXPORT_SYMBOL(xdr_encode_pages);
 
 void
 xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
@@ -151,7 +156,7 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 
        xdr->buflen += len;
 }
-
+EXPORT_SYMBOL(xdr_inline_pages);
 
 /*
  * Helper routines for doing 'memmove' like operations on a struct xdr_buf
@@ -418,6 +423,7 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
 {
        xdr_shrink_bufhead(buf, len);
 }
+EXPORT_SYMBOL(xdr_shift_buf);
 
 /**
  * xdr_init_encode - Initialize a struct xdr_stream for sending data.
@@ -639,6 +645,7 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
        buf->page_len = 0;
        buf->buflen = buf->len = iov->iov_len;
 }
+EXPORT_SYMBOL(xdr_buf_from_iov);
 
 /* Sets subbuf to the portion of buf of length len beginning base bytes
  * from the start of buf. Returns -1 if base of length are out of bounds. */
@@ -687,6 +694,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
                return -1;
        return 0;
 }
+EXPORT_SYMBOL(xdr_buf_subsegment);
 
 static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
 {
@@ -717,6 +725,7 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u
        __read_bytes_from_xdr_buf(&subbuf, obj, len);
        return 0;
 }
+EXPORT_SYMBOL(read_bytes_from_xdr_buf);
 
 static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
 {
@@ -760,6 +769,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
        *obj = ntohl(raw);
        return 0;
 }
+EXPORT_SYMBOL(xdr_decode_word);
 
 int
 xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
@@ -768,6 +778,7 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
 
        return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
 }
+EXPORT_SYMBOL(xdr_encode_word);
 
 /* If the netobj starting offset bytes from the start of xdr_buf is contained
  * entirely in the head or the tail, set object to point to it; otherwise
@@ -805,6 +816,7 @@ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned in
        __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
        return 0;
 }
+EXPORT_SYMBOL(xdr_buf_read_netobj);
 
 /* Returns 0 on success, or else a negative error code. */
 static int
@@ -1010,6 +1022,7 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
 
        return xdr_xcode_array2(buf, base, desc, 0);
 }
+EXPORT_SYMBOL(xdr_decode_array2);
 
 int
 xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
@@ -1021,6 +1034,7 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
 
        return xdr_xcode_array2(buf, base, desc, 1);
 }
+EXPORT_SYMBOL(xdr_encode_array2);
 
 int
 xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
index fb92f51405c5d5a2a9016eb980c4536fc455121b..cfcade906a56b7721330bf5929ae4fe60169e8ea 100644 (file)
@@ -501,9 +501,10 @@ EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
 void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
 {
        int timer = task->tk_msg.rpc_proc->p_timer;
-       struct rpc_rtt *rtt = task->tk_client->cl_rtt;
+       struct rpc_clnt *clnt = task->tk_client;
+       struct rpc_rtt *rtt = clnt->cl_rtt;
        struct rpc_rqst *req = task->tk_rqstp;
-       unsigned long max_timeout = req->rq_xprt->timeout.to_maxval;
+       unsigned long max_timeout = clnt->cl_timeout->to_maxval;
 
        task->tk_timeout = rpc_calc_rto(rtt, timer);
        task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
@@ -514,7 +515,7 @@ EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
 
 static void xprt_reset_majortimeo(struct rpc_rqst *req)
 {
-       struct rpc_timeout *to = &req->rq_xprt->timeout;
+       const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
 
        req->rq_majortimeo = req->rq_timeout;
        if (to->to_exponential)
@@ -534,7 +535,7 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req)
 int xprt_adjust_timeout(struct rpc_rqst *req)
 {
        struct rpc_xprt *xprt = req->rq_xprt;
-       struct rpc_timeout *to = &xprt->timeout;
+       const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
        int status = 0;
 
        if (time_before(jiffies, req->rq_majortimeo)) {
@@ -568,17 +569,17 @@ static void xprt_autoclose(struct work_struct *work)
        struct rpc_xprt *xprt =
                container_of(work, struct rpc_xprt, task_cleanup);
 
-       xprt_disconnect(xprt);
        xprt->ops->close(xprt);
+       clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
        xprt_release_write(xprt, NULL);
 }
 
 /**
- * xprt_disconnect - mark a transport as disconnected
+ * xprt_disconnect_done - mark a transport as disconnected
  * @xprt: transport to flag for disconnect
  *
  */
-void xprt_disconnect(struct rpc_xprt *xprt)
+void xprt_disconnect_done(struct rpc_xprt *xprt)
 {
        dprintk("RPC:       disconnected transport %p\n", xprt);
        spin_lock_bh(&xprt->transport_lock);
@@ -586,7 +587,26 @@ void xprt_disconnect(struct rpc_xprt *xprt)
        xprt_wake_pending_tasks(xprt, -ENOTCONN);
        spin_unlock_bh(&xprt->transport_lock);
 }
-EXPORT_SYMBOL_GPL(xprt_disconnect);
+EXPORT_SYMBOL_GPL(xprt_disconnect_done);
+
+/**
+ * xprt_force_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ *
+ */
+void xprt_force_disconnect(struct rpc_xprt *xprt)
+{
+       /* Don't race with the test_bit() in xprt_clear_locked() */
+       spin_lock_bh(&xprt->transport_lock);
+       set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+       /* Try to schedule an autoclose RPC call */
+       if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+               queue_work(rpciod_workqueue, &xprt->task_cleanup);
+       else if (xprt->snd_task != NULL)
+               rpc_wake_up_task(xprt->snd_task);
+       spin_unlock_bh(&xprt->transport_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_force_disconnect);
 
 static void
 xprt_init_autodisconnect(unsigned long data)
@@ -909,7 +929,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
 {
        struct rpc_rqst *req = task->tk_rqstp;
 
-       req->rq_timeout = xprt->timeout.to_initval;
+       req->rq_timeout = task->tk_client->cl_timeout->to_initval;
        req->rq_task    = task;
        req->rq_xprt    = xprt;
        req->rq_buffer  = NULL;
@@ -958,22 +978,6 @@ void xprt_release(struct rpc_task *task)
        spin_unlock(&xprt->reserve_lock);
 }
 
-/**
- * xprt_set_timeout - set constant RPC timeout
- * @to: RPC timeout parameters to set up
- * @retr: number of retries
- * @incr: amount of increase after each retry
- *
- */
-void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr)
-{
-       to->to_initval   =
-       to->to_increment = incr;
-       to->to_maxval    = to->to_initval + (incr * retr);
-       to->to_retries   = retr;
-       to->to_exponential = 0;
-}
-
 /**
  * xprt_create_transport - create an RPC transport
  * @args: rpc transport creation arguments
index 1aa1580cda6d82c247d202596ddfa9676d8f1e08..e55427f73dfe4c957c80bd1646e568e1681bc588 100644 (file)
@@ -83,7 +83,7 @@ static const char transfertypes[][12] = {
  */
 
 static int
-rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos,
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
 {
        int len, n = 0, p;
@@ -169,7 +169,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
        int nsegs, nchunks = 0;
-       int pos;
+       unsigned int pos;
        struct rpcrdma_mr_seg *seg = req->rl_segments;
        struct rpcrdma_read_chunk *cur_rchunk = NULL;
        struct rpcrdma_write_array *warray = NULL;
@@ -213,7 +213,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
                                        (__be32 *)&cur_rchunk->rc_target.rs_offset,
                                        seg->mr_base);
                        dprintk("RPC:       %s: read chunk "
-                               "elem %d@0x%llx:0x%x pos %d (%s)\n", __func__,
+                               "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
                                seg->mr_len, (unsigned long long)seg->mr_base,
                                seg->mr_rkey, pos, n < nsegs ? "more" : "last");
                        cur_rchunk++;
@@ -552,7 +552,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
  * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
  */
 static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
 {
        unsigned int i, total_len;
        struct rpcrdma_write_chunk *cur_wchunk;
index 6f2112dd9f786953c24eadc707fb208a946da4f0..02c522c17de599f5ea4d776038ff1e0e5d268642 100644 (file)
@@ -212,12 +212,16 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 static void
 xprt_rdma_free_addresses(struct rpc_xprt *xprt)
 {
-       kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
-       kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
-       kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
-       kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]);
-       kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
-       kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]);
+       unsigned int i;
+
+       for (i = 0; i < RPC_DISPLAY_MAX; i++)
+               switch (i) {
+               case RPC_DISPLAY_PROTO:
+               case RPC_DISPLAY_NETID:
+                       continue;
+               default:
+                       kfree(xprt->address_strings[i]);
+               }
 }
 
 static void
@@ -289,6 +293,11 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
        module_put(THIS_MODULE);
 }
 
+static const struct rpc_timeout xprt_rdma_default_timeout = {
+       .to_initval = 60 * HZ,
+       .to_maxval = 60 * HZ,
+};
+
 /**
  * xprt_setup_rdma - Set up transport to use RDMA
  *
@@ -327,7 +336,7 @@ xprt_setup_rdma(struct xprt_create *args)
        }
 
        /* 60 second timeout, no retries */
-       xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ);
+       xprt->timeout = &xprt_rdma_default_timeout;
        xprt->bind_timeout = (60U * HZ);
        xprt->connect_timeout = (60U * HZ);
        xprt->reestablish_timeout = (5U * HZ);
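The hunk above, together with the removal of xprt_set_timeout() at the start of this section, converts RPC timeouts from a per-transport copy into a pointer at shared read-only data. A minimal sketch of the resulting pattern (illustrative only; example_default_timeout, example_setup() and example_alloc() are hypothetical names, not part of this patch):

static const struct rpc_timeout example_default_timeout = {
	.to_initval	= 15 * HZ,	/* timeout before the first retransmit */
	.to_maxval	= 60 * HZ,	/* ceiling for the backed-off timeout */
	.to_increment	= 15 * HZ,	/* per-retry increment when not exponential */
	.to_retries	= 3,		/* number of retransmissions */
};

static struct rpc_xprt *example_setup(struct xprt_create *args)
{
	struct rpc_xprt *xprt = example_alloc(args);	/* hypothetical allocator */

	/* Point at shared static data rather than filling a private copy
	 * with the removed xprt_set_timeout() helper; nothing to free later. */
	xprt->timeout = &example_default_timeout;
	return xprt;
}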
@@ -449,7 +458,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
        dprintk("RPC:       %s: closing\n", __func__);
-       xprt_disconnect(xprt);
+       xprt_disconnect_done(xprt);
        (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
 }
 
@@ -682,7 +691,7 @@ xprt_rdma_send_request(struct rpc_task *task)
        }
 
        if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) {
-               xprt_disconnect(xprt);
+               xprt_disconnect_done(xprt);
                return -ENOTCONN;       /* implies disconnect */
        }
 
index 44b0fb942e8db5834ddaebf81fe4fc1e3fbd5530..ffbf22a1d2ca0e1a24c76f9e7231732ff7ce8dfd 100644 (file)
@@ -522,7 +522,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
 {
        struct ib_device_attr devattr;
-       int rc;
+       int rc, err;
 
        rc = ib_query_device(ia->ri_id->device, &devattr);
        if (rc) {
@@ -648,8 +648,10 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        return 0;
 
 out2:
-       if (ib_destroy_cq(ep->rep_cq))
-               ;
+       err = ib_destroy_cq(ep->rep_cq);
+       if (err)
+               dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+                       __func__, err);
 out1:
        return rc;
 }
index 6fa52f44de0fd1b43f8630ae7d50ef0805be06a1..30e7ac243a90bf2347fc36bdac30004f21e84d40 100644 (file)
@@ -280,7 +280,9 @@ static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
        return (struct sockaddr_in6 *) &xprt->addr;
 }
 
-static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
+static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt,
+                                         const char *protocol,
+                                         const char *netid)
 {
        struct sockaddr_in *addr = xs_addr_in(xprt);
        char *buf;
@@ -299,21 +301,14 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
        }
        xprt->address_strings[RPC_DISPLAY_PORT] = buf;
 
-       buf = kzalloc(8, GFP_KERNEL);
-       if (buf) {
-               if (xprt->prot == IPPROTO_UDP)
-                       snprintf(buf, 8, "udp");
-               else
-                       snprintf(buf, 8, "tcp");
-       }
-       xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
+       xprt->address_strings[RPC_DISPLAY_PROTO] = protocol;
 
        buf = kzalloc(48, GFP_KERNEL);
        if (buf) {
                snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s",
                        NIPQUAD(addr->sin_addr.s_addr),
                        ntohs(addr->sin_port),
-                       xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
+                       protocol);
        }
        xprt->address_strings[RPC_DISPLAY_ALL] = buf;
 
@@ -340,12 +335,12 @@ static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt)
        }
        xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
 
-       xprt->address_strings[RPC_DISPLAY_NETID] =
-               kstrdup(xprt->prot == IPPROTO_UDP ?
-                       RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL);
+       xprt->address_strings[RPC_DISPLAY_NETID] = netid;
 }
 
-static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
+static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt,
+                                         const char *protocol,
+                                         const char *netid)
 {
        struct sockaddr_in6 *addr = xs_addr_in6(xprt);
        char *buf;
@@ -364,21 +359,14 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
        }
        xprt->address_strings[RPC_DISPLAY_PORT] = buf;
 
-       buf = kzalloc(8, GFP_KERNEL);
-       if (buf) {
-               if (xprt->prot == IPPROTO_UDP)
-                       snprintf(buf, 8, "udp");
-               else
-                       snprintf(buf, 8, "tcp");
-       }
-       xprt->address_strings[RPC_DISPLAY_PROTO] = buf;
+       xprt->address_strings[RPC_DISPLAY_PROTO] = protocol;
 
        buf = kzalloc(64, GFP_KERNEL);
        if (buf) {
                snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s",
                                NIP6(addr->sin6_addr),
                                ntohs(addr->sin6_port),
-                               xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
+                               protocol);
        }
        xprt->address_strings[RPC_DISPLAY_ALL] = buf;
 
@@ -405,17 +393,21 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt)
        }
        xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf;
 
-       xprt->address_strings[RPC_DISPLAY_NETID] =
-               kstrdup(xprt->prot == IPPROTO_UDP ?
-                       RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL);
+       xprt->address_strings[RPC_DISPLAY_NETID] = netid;
 }
 
 static void xs_free_peer_addresses(struct rpc_xprt *xprt)
 {
-       int i;
+       unsigned int i;
 
        for (i = 0; i < RPC_DISPLAY_MAX; i++)
-               kfree(xprt->address_strings[i]);
+               switch (i) {
+               case RPC_DISPLAY_PROTO:
+               case RPC_DISPLAY_NETID:
+                       continue;
+               default:
+                       kfree(xprt->address_strings[i]);
+               }
 }
 
 #define XS_SENDMSG_FLAGS       (MSG_DONTWAIT | MSG_NOSIGNAL)
@@ -614,6 +606,22 @@ static int xs_udp_send_request(struct rpc_task *task)
        return status;
 }
 
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_WR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+       struct socket *sock = transport->sock;
+
+       if (sock != NULL)
+               kernel_sock_shutdown(sock, SHUT_WR);
+}
+
 static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf)
 {
        u32 reclen = buf->len - sizeof(rpc_fraghdr);
@@ -691,7 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
        default:
                dprintk("RPC:       sendmsg returned unrecognized error %d\n",
                        -status);
-               xprt_disconnect(xprt);
+               xs_tcp_shutdown(xprt);
                break;
        }
 
@@ -759,7 +767,9 @@ static void xs_close(struct rpc_xprt *xprt)
 clear_close_wait:
        smp_mb__before_clear_bit();
        clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+       clear_bit(XPRT_CLOSING, &xprt->state);
        smp_mb__after_clear_bit();
+       xprt_disconnect_done(xprt);
 }
 
 /**
@@ -775,7 +785,6 @@ static void xs_destroy(struct rpc_xprt *xprt)
 
        cancel_rearming_delayed_work(&transport->connect_worker);
 
-       xprt_disconnect(xprt);
        xs_close(xprt);
        xs_free_peer_addresses(xprt);
        kfree(xprt->slot);
@@ -886,7 +895,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea
        /* Sanity check of the record length */
        if (unlikely(transport->tcp_reclen < 4)) {
                dprintk("RPC:       invalid TCP record fragment length\n");
-               xprt_disconnect(xprt);
+               xprt_force_disconnect(xprt);
                return;
        }
        dprintk("RPC:       reading TCP record fragment of length %d\n",
@@ -1113,21 +1122,44 @@ static void xs_tcp_state_change(struct sock *sk)
                        transport->tcp_flags =
                                TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
 
-                       xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
                        xprt_wake_pending_tasks(xprt, 0);
                }
                spin_unlock_bh(&xprt->transport_lock);
                break;
-       case TCP_SYN_SENT:
-       case TCP_SYN_RECV:
+       case TCP_FIN_WAIT1:
+               /* The client initiated a shutdown of the socket */
+               xprt->reestablish_timeout = 0;
+               set_bit(XPRT_CLOSING, &xprt->state);
+               smp_mb__before_clear_bit();
+               clear_bit(XPRT_CONNECTED, &xprt->state);
+               clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+               smp_mb__after_clear_bit();
                break;
        case TCP_CLOSE_WAIT:
-               /* Try to schedule an autoclose RPC calls */
-               set_bit(XPRT_CLOSE_WAIT, &xprt->state);
-               if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
-                       queue_work(rpciod_workqueue, &xprt->task_cleanup);
-       default:
-               xprt_disconnect(xprt);
+               /* The server initiated a shutdown of the socket */
+               set_bit(XPRT_CLOSING, &xprt->state);
+               xprt_force_disconnect(xprt);
+       case TCP_SYN_SENT:
+       case TCP_CLOSING:
+               /*
+                * If the server closed down the connection, make sure that
+                * we back off before reconnecting
+                */
+               if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+                       xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+               break;
+       case TCP_LAST_ACK:
+               smp_mb__before_clear_bit();
+               clear_bit(XPRT_CONNECTED, &xprt->state);
+               smp_mb__after_clear_bit();
+               break;
+       case TCP_CLOSE:
+               smp_mb__before_clear_bit();
+               clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+               clear_bit(XPRT_CLOSING, &xprt->state);
+               smp_mb__after_clear_bit();
+               /* Mark transport as closed and wake up all pending tasks */
+               xprt_disconnect_done(xprt);
        }
  out:
        read_unlock(&sk->sk_callback_lock);
@@ -1279,34 +1311,53 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
        }
 }
 
+static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket *sock)
+{
+       unsigned short port = transport->port;
+
+       if (port == 0 && transport->xprt.resvport)
+               port = xs_get_random_port();
+       return port;
+}
+
+static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket *sock, unsigned short port)
+{
+       if (transport->port != 0)
+               transport->port = 0;
+       if (!transport->xprt.resvport)
+               return 0;
+       if (port <= xprt_min_resvport || port > xprt_max_resvport)
+               return xprt_max_resvport;
+       return --port;
+}
+
 static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
 {
        struct sockaddr_in myaddr = {
                .sin_family = AF_INET,
        };
        struct sockaddr_in *sa;
-       int err;
-       unsigned short port = transport->port;
+       int err, nloop = 0;
+       unsigned short port = xs_get_srcport(transport, sock);
+       unsigned short last;
 
-       if (!transport->xprt.resvport)
-               port = 0;
        sa = (struct sockaddr_in *)&transport->addr;
        myaddr.sin_addr = sa->sin_addr;
        do {
                myaddr.sin_port = htons(port);
                err = kernel_bind(sock, (struct sockaddr *) &myaddr,
                                                sizeof(myaddr));
-               if (!transport->xprt.resvport)
+               if (port == 0)
                        break;
                if (err == 0) {
                        transport->port = port;
                        break;
                }
-               if (port <= xprt_min_resvport)
-                       port = xprt_max_resvport;
-               else
-                       port--;
-       } while (err == -EADDRINUSE && port != transport->port);
+               last = port;
+               port = xs_next_srcport(transport, sock, port);
+               if (port > last)
+                       nloop++;
+       } while (err == -EADDRINUSE && nloop != 2);
        dprintk("RPC:       %s "NIPQUAD_FMT":%u: %s (%d)\n",
                        __FUNCTION__, NIPQUAD(myaddr.sin_addr),
                        port, err ? "failed" : "ok", err);
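The new xs_get_srcport()/xs_next_srcport() helpers above turn the source-port hunt into a bounded search: start from the cached (or randomly chosen) reserved port, walk the reserved range downward, wrap around at most once, and give up after the second wrap (nloop == 2). A self-contained sketch of that search, ignoring the unreserved case where port 0 is bound directly; try_bind() is a hypothetical stand-in for kernel_bind():

#include <stdbool.h>

static unsigned short next_port(unsigned short port,
				unsigned short min_resvport,
				unsigned short max_resvport)
{
	if (port <= min_resvport || port > max_resvport)
		return max_resvport;		/* wrap to the top of the range */
	return --port;
}

/* Returns the bound port, or 0 if the whole range was tried twice. */
static unsigned short pick_srcport(unsigned short start,
				   unsigned short min_resvport,
				   unsigned short max_resvport,
				   bool (*try_bind)(unsigned short port))
{
	unsigned short port = start, last;
	int nloop = 0;

	do {
		if (try_bind(port))
			return port;		/* bind succeeded */
		last = port;
		port = next_port(port, min_resvport, max_resvport);
		if (port > last)		/* wrapped around once more */
			nloop++;
	} while (nloop != 2);
	return 0;				/* every reserved port was busy */
}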
@@ -1319,28 +1370,27 @@ static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
                .sin6_family = AF_INET6,
        };
        struct sockaddr_in6 *sa;
-       int err;
-       unsigned short port = transport->port;
+       int err, nloop = 0;
+       unsigned short port = xs_get_srcport(transport, sock);
+       unsigned short last;
 
-       if (!transport->xprt.resvport)
-               port = 0;
        sa = (struct sockaddr_in6 *)&transport->addr;
        myaddr.sin6_addr = sa->sin6_addr;
        do {
                myaddr.sin6_port = htons(port);
                err = kernel_bind(sock, (struct sockaddr *) &myaddr,
                                                sizeof(myaddr));
-               if (!transport->xprt.resvport)
+               if (port == 0)
                        break;
                if (err == 0) {
                        transport->port = port;
                        break;
                }
-               if (port <= xprt_min_resvport)
-                       port = xprt_max_resvport;
-               else
-                       port--;
-       } while (err == -EADDRINUSE && port != transport->port);
+               last = port;
+               port = xs_next_srcport(transport, sock, port);
+               if (port > last)
+                       nloop++;
+       } while (err == -EADDRINUSE && nloop != 2);
        dprintk("RPC:       xs_bind6 "NIP6_FMT":%u: %s (%d)\n",
                NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err);
        return err;
@@ -1602,8 +1652,7 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
                                break;
                        default:
                                /* get rid of existing socket, and retry */
-                               xs_close(xprt);
-                               break;
+                               xs_tcp_shutdown(xprt);
                }
        }
 out:
@@ -1662,8 +1711,7 @@ static void xs_tcp_connect_worker6(struct work_struct *work)
                                break;
                        default:
                                /* get rid of existing socket, and retry */
-                               xs_close(xprt);
-                               break;
+                               xs_tcp_shutdown(xprt);
                }
        }
 out:
@@ -1710,6 +1758,19 @@ static void xs_connect(struct rpc_task *task)
        }
 }
 
+static void xs_tcp_connect(struct rpc_task *task)
+{
+       struct rpc_xprt *xprt = task->tk_xprt;
+
+       /* Initiate graceful shutdown of the socket if not already done */
+       if (test_bit(XPRT_CONNECTED, &xprt->state))
+               xs_tcp_shutdown(xprt);
+       /* Exit if we need to wait for socket shutdown to complete */
+       if (test_bit(XPRT_CLOSING, &xprt->state))
+               return;
+       xs_connect(task);
+}
+
 /**
  * xs_udp_print_stats - display UDP socket-specific stats
  * @xprt: rpc_xprt struct containing statistics
@@ -1780,12 +1841,12 @@ static struct rpc_xprt_ops xs_tcp_ops = {
        .release_xprt           = xs_tcp_release_xprt,
        .rpcbind                = rpcb_getport_async,
        .set_port               = xs_set_port,
-       .connect                = xs_connect,
+       .connect                = xs_tcp_connect,
        .buf_alloc              = rpc_malloc,
        .buf_free               = rpc_free,
        .send_request           = xs_tcp_send_request,
        .set_retrans_timeout    = xprt_set_retrans_timeout_def,
-       .close                  = xs_close,
+       .close                  = xs_tcp_shutdown,
        .destroy                = xs_destroy,
        .print_stats            = xs_tcp_print_stats,
 };
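With .connect now pointing at xs_tcp_connect() and .close at xs_tcp_shutdown(), a TCP transport is torn down in two phases instead of an immediate xs_close(). A rough control-flow sketch of the sequence implied by the hunks above (reconstructed commentary, not literal kernel code; example_tcp_reconnect() is a made-up name):

static void example_tcp_reconnect(struct rpc_xprt *xprt)
{
	/* 1. If still connected, send a FIN: xs_tcp_shutdown() calls
	 *    kernel_sock_shutdown(sock, SHUT_WR).                        */
	xs_tcp_shutdown(xprt);

	/* 2. xs_tcp_state_change() then tracks FIN_WAIT1 ... TCP_CLOSE,
	 *    setting XPRT_CLOSING on the way and clearing XPRT_CONNECTED
	 *    and XPRT_CLOSE_WAIT as the close handshake progresses.      */

	/* 3. Only when TCP_CLOSE is reached does xprt_disconnect_done()
	 *    wake the pending RPC tasks; xs_tcp_connect() refuses to open
	 *    a new socket while XPRT_CLOSING is still set.               */
}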
@@ -1822,11 +1883,17 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
        xprt->addrlen = args->addrlen;
        if (args->srcaddr)
                memcpy(&new->addr, args->srcaddr, args->addrlen);
-       new->port = xs_get_random_port();
 
        return xprt;
 }
 
+static const struct rpc_timeout xs_udp_default_timeout = {
+       .to_initval = 5 * HZ,
+       .to_maxval = 30 * HZ,
+       .to_increment = 5 * HZ,
+       .to_retries = 5,
+};
+
 /**
  * xs_setup_udp - Set up transport to use a UDP socket
  * @args: rpc transport creation arguments
@@ -1855,10 +1922,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 
        xprt->ops = &xs_udp_ops;
 
-       if (args->timeout)
-               xprt->timeout = *args->timeout;
-       else
-               xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
+       xprt->timeout = &xs_udp_default_timeout;
 
        switch (addr->sa_family) {
        case AF_INET:
@@ -1867,7 +1931,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 
                INIT_DELAYED_WORK(&transport->connect_worker,
                                        xs_udp_connect_worker4);
-               xs_format_ipv4_peer_addresses(xprt);
+               xs_format_ipv4_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
                break;
        case AF_INET6:
                if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
@@ -1875,7 +1939,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 
                INIT_DELAYED_WORK(&transport->connect_worker,
                                        xs_udp_connect_worker6);
-               xs_format_ipv6_peer_addresses(xprt);
+               xs_format_ipv6_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
                break;
        default:
                kfree(xprt);
@@ -1893,6 +1957,12 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
        return ERR_PTR(-EINVAL);
 }
 
+static const struct rpc_timeout xs_tcp_default_timeout = {
+       .to_initval = 60 * HZ,
+       .to_maxval = 60 * HZ,
+       .to_retries = 2,
+};
+
 /**
  * xs_setup_tcp - Set up transport to use a TCP socket
  * @args: rpc transport creation arguments
@@ -1919,11 +1989,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        xprt->idle_timeout = XS_IDLE_DISC_TO;
 
        xprt->ops = &xs_tcp_ops;
-
-       if (args->timeout)
-               xprt->timeout = *args->timeout;
-       else
-               xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
+       xprt->timeout = &xs_tcp_default_timeout;
 
        switch (addr->sa_family) {
        case AF_INET:
@@ -1931,14 +1997,14 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
                        xprt_set_bound(xprt);
 
                INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4);
-               xs_format_ipv4_peer_addresses(xprt);
+               xs_format_ipv4_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
                break;
        case AF_INET6:
                if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
                        xprt_set_bound(xprt);
 
                INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6);
-               xs_format_ipv6_peer_addresses(xprt);
+               xs_format_ipv6_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
                break;
        default:
                kfree(xprt);
index 8086e61058e373e68054dd166be2e942f4db2e4c..389e151e3b68e832d7b63b9623e10098d6d2e03b 100644 (file)
@@ -76,6 +76,7 @@ config SECURITY_NETWORK_XFRM
 config SECURITY_CAPABILITIES
        bool "Default Linux Capabilities"
        depends on SECURITY
+       default y
        help
          This enables the "default" Linux capabilities functionality.
          If you are unsure how to answer this question, answer Y.
index b32a459c0683c720271b6ab15d0829b11ef3e125..2b517d6186729c05608d56dd1123926a242c61d2 100644 (file)
@@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX
 config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
        int "NSA SELinux maximum supported policy format version value"
        depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
-       range 15 21
+       range 15 22
        default 19
        help
          This option sets the value for the maximum policy format version
index dc3502e30b19f0a4d729aa5b1a0450e8f9e34bf3..00afd85f1edb903544ae9d3f365f0e6df7ac397b 100644 (file)
@@ -4,7 +4,14 @@
 
 obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
 
-selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o
+selinux-y := avc.o \
+            hooks.o \
+            selinuxfs.o \
+            netlink.o \
+            nlmsgtab.o \
+            netif.o \
+            netnode.o \
+            exports.o
 
 selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
 
index 81b3dff3cbf0597a4b7464ee9d6f3fdcd121d2b8..e8529e2f51e5ba24042ccd88b77e481c9457a5aa 100644 (file)
@@ -661,9 +661,18 @@ void avc_audit(u32 ssid, u32 tsid,
                                                    "daddr", "dest");
                                break;
                        }
-                       if (a->u.net.netif)
-                               audit_log_format(ab, " netif=%s",
-                                       a->u.net.netif);
+                       if (a->u.net.netif > 0) {
+                               struct net_device *dev;
+
+                               /* NOTE: we always use init's namespace */
+                               dev = dev_get_by_index(&init_net,
+                                                      a->u.net.netif);
+                               if (dev) {
+                                       audit_log_format(ab, " netif=%s",
+                                                        dev->name);
+                                       dev_put(dev);
+                               }
+                       }
                        break;
                }
        }
index b6f96943be1fb456d5a998952abdd54d0ca3587b..87d2bb3ea35574596536a9b67305f8f625042dea 100644 (file)
 #include <linux/selinux.h>
 #include <linux/fs.h>
 #include <linux/ipc.h>
+#include <asm/atomic.h>
 
 #include "security.h"
 #include "objsec.h"
 
+/* SECMARK reference count */
+extern atomic_t selinux_secmark_refcount;
+
 int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
 {
        if (selinux_enabled)
@@ -74,7 +78,7 @@ int selinux_string_to_sid(char *str, u32 *sid)
 }
 EXPORT_SYMBOL_GPL(selinux_string_to_sid);
 
-int selinux_relabel_packet_permission(u32 sid)
+int selinux_secmark_relabel_packet_permission(u32 sid)
 {
        if (selinux_enabled) {
                struct task_security_struct *tsec = current->security;
@@ -84,4 +88,16 @@ int selinux_relabel_packet_permission(u32 sid)
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission);
+EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
+
+void selinux_secmark_refcount_inc(void)
+{
+       atomic_inc(&selinux_secmark_refcount);
+}
+EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
+
+void selinux_secmark_refcount_dec(void)
+{
+       atomic_dec(&selinux_secmark_refcount);
+}
+EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
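The two exports above let a packet-marking consumer tell SELinux when SECMARK rules exist, so that the selinux_secmark_enabled() check added later in this patch only costs an atomic read. A hypothetical consumer might pair them like this (sketch only; my_mark_rule_add()/my_mark_rule_del() are made-up hook names, assuming the declarations are published through linux/selinux.h as the include above suggests):

#include <linux/selinux.h>

/* Called when a marking rule is installed. */
static int my_mark_rule_add(void)
{
	selinux_secmark_refcount_inc();	/* SECMARK now counts as enabled */
	return 0;
}

/* Called when a marking rule is removed. */
static void my_mark_rule_del(void)
{
	selinux_secmark_refcount_dec();	/* may drop the count back to zero */
}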
index 64d414efb404e67d614d5279c8b44fac9431d1f4..be6de0b8734fd9eb950d8f8ca9e2306621764420 100644 (file)
@@ -12,8 +12,8 @@
  *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
  *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
  *                          <dgoeddel@trustedcs.com>
- *  Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
- *                     Paul Moore, <paul.moore@hp.com>
+ *  Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
+ *                Paul Moore <paul.moore@hp.com>
  *  Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
  *                     Yuichi Nakamura <ynakam@hitachisoft.jp>
  *
 #include <net/icmp.h>
 #include <net/ip.h>            /* for local_port_range[] */
 #include <net/tcp.h>           /* struct or_callable used in sock_rcv_skb */
+#include <net/net_namespace.h>
+#include <net/netlabel.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
 #include <linux/netdevice.h>   /* for network interface checks */
@@ -76,6 +79,7 @@
 #include "avc.h"
 #include "objsec.h"
 #include "netif.h"
+#include "netnode.h"
 #include "xfrm.h"
 #include "netlabel.h"
 
@@ -89,6 +93,9 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm);
 extern int selinux_compat_net;
 extern struct security_operations *security_ops;
 
+/* SECMARK reference count */
+atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);
+
 #ifdef CONFIG_SECURITY_SELINUX_DEVELOP
 int selinux_enforcing = 0;
 
@@ -155,6 +162,21 @@ getsecurity_exit:
        return len;
 }
 
+/**
+ * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
+ *
+ * Description:
+ * This function checks the SECMARK reference counter to see if any SECMARK
+ * targets are currently configured, if the reference counter is greater than
+ * zero SECMARK is considered to be enabled.  Returns true (1) if SECMARK is
+ * enabled, false (0) if SECMARK is disabled.
+ *
+ */
+static int selinux_secmark_enabled(void)
+{
+       return (atomic_read(&selinux_secmark_refcount) > 0);
+}
+
 /* Allocate and free functions for each kind of security blob. */
 
 static int task_alloc_security(struct task_struct *task)
@@ -561,8 +583,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag,
  * Allow filesystems with binary mount data to explicitly set mount point
  * labeling information.
  */
-int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
-                                int *flags, int num_opts)
+static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options,
+                               int *flags, int num_opts)
 {
        int rc = 0, i;
        struct task_security_struct *tsec = current->security;
@@ -3395,7 +3417,7 @@ out:
 #endif /* IPV6 */
 
 static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
-                            char **addrp, int *len, int src, u8 *proto)
+                            char **addrp, int src, u8 *proto)
 {
        int ret = 0;
 
@@ -3404,7 +3426,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
                ret = selinux_parse_skb_ipv4(skb, ad, proto);
                if (ret || !addrp)
                        break;
-               *len = 4;
                *addrp = (char *)(src ? &ad->u.net.v4info.saddr :
                                        &ad->u.net.v4info.daddr);
                break;
@@ -3414,7 +3435,6 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
                ret = selinux_parse_skb_ipv6(skb, ad, proto);
                if (ret || !addrp)
                        break;
-               *len = 16;
                *addrp = (char *)(src ? &ad->u.net.v6info.saddr :
                                        &ad->u.net.v6info.daddr);
                break;
@@ -3423,36 +3443,48 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
                break;
        }
 
+       if (unlikely(ret))
+               printk(KERN_WARNING
+                      "SELinux: failure in selinux_parse_skb(),"
+                      " unable to parse packet\n");
+
        return ret;
 }
 
 /**
- * selinux_skb_extlbl_sid - Determine the external label of a packet
+ * selinux_skb_peerlbl_sid - Determine the peer label of a packet
  * @skb: the packet
- * @sid: the packet's SID
+ * @family: protocol family
+ * @sid: the packet's peer label SID
  *
  * Description:
- * Check the various different forms of external packet labeling and determine
- * the external SID for the packet.  If only one form of external labeling is
- * present then it is used, if both labeled IPsec and NetLabel labels are
- * present then the SELinux type information is taken from the labeled IPsec
- * SA and the MLS sensitivity label information is taken from the NetLabel
- * security attributes.  This bit of "magic" is done in the call to
- * selinux_netlbl_skbuff_getsid().
+ * Check the various different forms of network peer labeling and determine
+ * the peer label/SID for the packet; most of the magic actually occurs in
+ * the security server function security_net_peersid_cmp().  The function
+ * returns zero if the value in @sid is valid (although it may be SECSID_NULL)
+ * or -EACCES if @sid is invalid due to inconsistencies with the different
+ * peer labels.
  *
  */
-static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid)
+static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
 {
+       int err;
        u32 xfrm_sid;
        u32 nlbl_sid;
+       u32 nlbl_type;
 
        selinux_skb_xfrm_sid(skb, &xfrm_sid);
-       if (selinux_netlbl_skbuff_getsid(skb,
-                                        (xfrm_sid == SECSID_NULL ?
-                                         SECINITSID_NETMSG : xfrm_sid),
-                                        &nlbl_sid) != 0)
-               nlbl_sid = SECSID_NULL;
-       *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid);
+       selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
+
+       err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
+       if (unlikely(err)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in selinux_skb_peerlbl_sid(),"
+                      " unable to determine packet's peer label\n");
+               return -EACCES;
+       }
+
+       return 0;
 }
 
 /* socket security operations */
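Paraphrasing the old and new kernel-doc comments above, the peer-label resolution that security_net_peersid_resolve() performs can be summarized roughly as follows (a reading of the comments, not a verified trace of the security-server code):

/*
 *   NetLabel SID   IPsec/xfrm SID   resulting *sid
 *   ------------   --------------   ---------------------------------
 *   none           none             SECSID_NULL (unlabeled peer)
 *   present        none             the NetLabel-derived SID
 *   none           present          the xfrm SID
 *   present        present          a consistent combination of both,
 *                                   otherwise the caller sees -EACCES
 */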
@@ -3518,6 +3550,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
        if (sock->sk) {
                sksec = sock->sk->sk_security;
                sksec->sid = isec->sid;
+               sksec->sclass = isec->sclass;
                err = selinux_netlbl_socket_post_create(sock);
        }
 
@@ -3610,7 +3643,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
                        break;
                }
                
-               err = security_node_sid(family, addrp, addrlen, &sid);
+               err = sel_netnode_sid(addrp, family, &sid);
                if (err)
                        goto out;
                
@@ -3821,131 +3854,182 @@ static int selinux_socket_unix_may_send(struct socket *sock,
        return 0;
 }
 
-static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
-               struct avc_audit_data *ad, u16 family, char *addrp, int len)
+static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family,
+                                   u32 peer_sid,
+                                   struct avc_audit_data *ad)
 {
-       int err = 0;
-       u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0;
-       struct socket *sock;
-       u16 sock_class = 0;
-       u32 sock_sid = 0;
-
-       read_lock_bh(&sk->sk_callback_lock);
-       sock = sk->sk_socket;
-       if (sock) {
-               struct inode *inode;
-               inode = SOCK_INODE(sock);
-               if (inode) {
-                       struct inode_security_struct *isec;
-                       isec = inode->i_security;
-                       sock_sid = isec->sid;
-                       sock_class = isec->sclass;
-               }
-       }
-       read_unlock_bh(&sk->sk_callback_lock);
-       if (!sock_sid)
-               goto out;
+       int err;
+       u32 if_sid;
+       u32 node_sid;
 
-       if (!skb->dev)
-               goto out;
+       err = sel_netif_sid(ifindex, &if_sid);
+       if (err)
+               return err;
+       err = avc_has_perm(peer_sid, if_sid,
+                          SECCLASS_NETIF, NETIF__INGRESS, ad);
+       if (err)
+               return err;
 
-       err = sel_netif_sids(skb->dev, &if_sid, NULL);
+       err = sel_netnode_sid(addrp, family, &node_sid);
        if (err)
-               goto out;
+               return err;
+       return avc_has_perm(peer_sid, node_sid,
+                           SECCLASS_NODE, NODE__RECVFROM, ad);
+}
+
+static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk,
+                                               struct sk_buff *skb,
+                                               struct avc_audit_data *ad,
+                                               u16 family,
+                                               char *addrp)
+{
+       int err;
+       struct sk_security_struct *sksec = sk->sk_security;
+       u16 sk_class;
+       u32 netif_perm, node_perm, recv_perm;
+       u32 port_sid, node_sid, if_sid, sk_sid;
 
-       switch (sock_class) {
+       sk_sid = sksec->sid;
+       sk_class = sksec->sclass;
+
+       switch (sk_class) {
        case SECCLASS_UDP_SOCKET:
                netif_perm = NETIF__UDP_RECV;
                node_perm = NODE__UDP_RECV;
                recv_perm = UDP_SOCKET__RECV_MSG;
                break;
-       
        case SECCLASS_TCP_SOCKET:
                netif_perm = NETIF__TCP_RECV;
                node_perm = NODE__TCP_RECV;
                recv_perm = TCP_SOCKET__RECV_MSG;
                break;
-
        case SECCLASS_DCCP_SOCKET:
                netif_perm = NETIF__DCCP_RECV;
                node_perm = NODE__DCCP_RECV;
                recv_perm = DCCP_SOCKET__RECV_MSG;
                break;
-
        default:
                netif_perm = NETIF__RAWIP_RECV;
                node_perm = NODE__RAWIP_RECV;
+               recv_perm = 0;
                break;
        }
 
-       err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+       err = sel_netif_sid(skb->iif, &if_sid);
        if (err)
-               goto out;
-       
-       err = security_node_sid(family, addrp, len, &node_sid);
+               return err;
+       err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
        if (err)
-               goto out;
+               return err;
        
-       err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad);
+       err = sel_netnode_sid(addrp, family, &node_sid);
        if (err)
-               goto out;
+               return err;
+       err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
+       if (err)
+               return err;
 
-       if (recv_perm) {
-               u32 port_sid;
+       if (!recv_perm)
+               return 0;
+       err = security_port_sid(sk->sk_family, sk->sk_type,
+                               sk->sk_protocol, ntohs(ad->u.net.sport),
+                               &port_sid);
+       if (unlikely(err)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in"
+                      " selinux_sock_rcv_skb_iptables_compat(),"
+                      " network port label not found\n");
+               return err;
+       }
+       return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad);
+}
 
-               err = security_port_sid(sk->sk_family, sk->sk_type,
-                                       sk->sk_protocol, ntohs(ad->u.net.sport),
-                                       &port_sid);
-               if (err)
-                       goto out;
+static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
+                                      struct avc_audit_data *ad,
+                                      u16 family, char *addrp)
+{
+       int err;
+       struct sk_security_struct *sksec = sk->sk_security;
+       u32 peer_sid;
+       u32 sk_sid = sksec->sid;
 
-               err = avc_has_perm(sock_sid, port_sid,
-                                  sock_class, recv_perm, ad);
+       if (selinux_compat_net)
+               err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
+                                                          family, addrp);
+       else
+               err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
+                                  PACKET__RECV, ad);
+       if (err)
+               return err;
+
+       if (selinux_policycap_netpeer) {
+               err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
+               if (err)
+                       return err;
+               err = avc_has_perm(sk_sid, peer_sid,
+                                  SECCLASS_PEER, PEER__RECV, ad);
+       } else {
+               err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
+               if (err)
+                       return err;
+               err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
        }
 
-out:
        return err;
 }
 
 static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-       u16 family;
-       char *addrp;
-       int len, err = 0;
-       struct avc_audit_data ad;
+       int err;
        struct sk_security_struct *sksec = sk->sk_security;
+       u16 family = sk->sk_family;
+       u32 sk_sid = sksec->sid;
+       struct avc_audit_data ad;
+       char *addrp;
 
-       family = sk->sk_family;
        if (family != PF_INET && family != PF_INET6)
-               goto out;
+               return 0;
 
        /* Handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;
 
        AVC_AUDIT_DATA_INIT(&ad, NET);
-       ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]";
+       ad.u.net.netif = skb->iif;
        ad.u.net.family = family;
-
-       err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL);
+       err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (err)
-               goto out;
+               return err;
 
-       if (selinux_compat_net)
-               err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family,
-                                                 addrp, len);
-       else
-               err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
-                                  PACKET__RECV, &ad);
-       if (err)
-               goto out;
+       /* If any sort of compatibility mode is enabled then handoff processing
+        * to the selinux_sock_rcv_skb_compat() function to deal with the
+        * special handling.  We do this in an attempt to keep this function
+        * as fast and as clean as possible. */
+       if (selinux_compat_net || !selinux_policycap_netpeer)
+               return selinux_sock_rcv_skb_compat(sk, skb, &ad,
+                                                  family, addrp);
 
-       err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad);
-       if (err)
-               goto out;
+       if (netlbl_enabled() || selinux_xfrm_enabled()) {
+               u32 peer_sid;
+
+               err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
+               if (err)
+                       return err;
+               err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
+                                              peer_sid, &ad);
+               if (err)
+                       return err;
+               err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
+                                  PEER__RECV, &ad);
+       }
+
+       if (selinux_secmark_enabled()) {
+               err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
+                                  PACKET__RECV, &ad);
+               if (err)
+                       return err;
+       }
 
-       err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
-out:   
        return err;
 }
 
@@ -3996,18 +4080,25 @@ out:
 static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
 {
        u32 peer_secid = SECSID_NULL;
-       int err = 0;
+       u16 family;
+
+       if (sock)
+               family = sock->sk->sk_family;
+       else if (skb && skb->sk)
+               family = skb->sk->sk_family;
+       else
+               goto out;
 
-       if (sock && sock->sk->sk_family == PF_UNIX)
+       if (sock && family == PF_UNIX)
                selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
        else if (skb)
-               selinux_skb_extlbl_sid(skb, &peer_secid);
+               selinux_skb_peerlbl_sid(skb, family, &peer_secid);
 
-       if (peer_secid == SECSID_NULL)
-               err = -EINVAL;
+out:
        *secid = peer_secid;
-
-       return err;
+       if (peer_secid == SECSID_NULL)
+               return -EINVAL;
+       return 0;
 }
 
 static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
@@ -4027,6 +4118,7 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
 
        newssec->sid = ssec->sid;
        newssec->peer_sid = ssec->peer_sid;
+       newssec->sclass = ssec->sclass;
 
        selinux_netlbl_sk_security_clone(ssec, newssec);
 }
@@ -4050,6 +4142,7 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent)
        if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 ||
            sk->sk_family == PF_UNIX)
                isec->sid = sksec->sid;
+       sksec->sclass = isec->sclass;
 
        selinux_netlbl_sock_graft(sk, parent);
 }
@@ -4062,7 +4155,9 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
        u32 newsid;
        u32 peersid;
 
-       selinux_skb_extlbl_sid(skb, &peersid);
+       err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
+       if (err)
+               return err;
        if (peersid == SECSID_NULL) {
                req->secid = sksec->sid;
                req->peer_secid = SECSID_NULL;
@@ -4100,7 +4195,7 @@ static void selinux_inet_conn_established(struct sock *sk,
 {
        struct sk_security_struct *sksec = sk->sk_security;
 
-       selinux_skb_extlbl_sid(skb, &sksec->peer_sid);
+       selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
 }
 
 static void selinux_req_classify_flow(const struct request_sock *req,
@@ -4147,149 +4242,260 @@ out:
 
 #ifdef CONFIG_NETFILTER
 
-static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev,
-                                           struct avc_audit_data *ad,
-                                           u16 family, char *addrp, int len)
+static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
+                                      u16 family)
 {
-       int err = 0;
-       u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0;
-       struct socket *sock;
-       struct inode *inode;
-       struct inode_security_struct *isec;
+       char *addrp;
+       u32 peer_sid;
+       struct avc_audit_data ad;
+       u8 secmark_active;
+       u8 peerlbl_active;
 
-       sock = sk->sk_socket;
-       if (!sock)
-               goto out;
+       if (!selinux_policycap_netpeer)
+               return NF_ACCEPT;
 
-       inode = SOCK_INODE(sock);
-       if (!inode)
-               goto out;
+       secmark_active = selinux_secmark_enabled();
+       peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+       if (!secmark_active && !peerlbl_active)
+               return NF_ACCEPT;
 
-       isec = inode->i_security;
-       
-       err = sel_netif_sids(dev, &if_sid, NULL);
-       if (err)
-               goto out;
+       AVC_AUDIT_DATA_INIT(&ad, NET);
+       ad.u.net.netif = ifindex;
+       ad.u.net.family = family;
+       if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
+               return NF_DROP;
+
+       if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
+               return NF_DROP;
+
+       if (peerlbl_active)
+               if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
+                                            peer_sid, &ad) != 0)
+                       return NF_DROP;
+
+       if (secmark_active)
+               if (avc_has_perm(peer_sid, skb->secmark,
+                                SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
+                       return NF_DROP;
+
+       return NF_ACCEPT;
+}
+
+static unsigned int selinux_ipv4_forward(unsigned int hooknum,
+                                        struct sk_buff *skb,
+                                        const struct net_device *in,
+                                        const struct net_device *out,
+                                        int (*okfn)(struct sk_buff *))
+{
+       return selinux_ip_forward(skb, in->ifindex, PF_INET);
+}
 
-       switch (isec->sclass) {
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int selinux_ipv6_forward(unsigned int hooknum,
+                                        struct sk_buff *skb,
+                                        const struct net_device *in,
+                                        const struct net_device *out,
+                                        int (*okfn)(struct sk_buff *))
+{
+       return selinux_ip_forward(skb, in->ifindex, PF_INET6);
+}
+#endif /* IPV6 */
+
+static int selinux_ip_postroute_iptables_compat(struct sock *sk,
+                                               int ifindex,
+                                               struct avc_audit_data *ad,
+                                               u16 family, char *addrp)
+{
+       int err;
+       struct sk_security_struct *sksec = sk->sk_security;
+       u16 sk_class;
+       u32 netif_perm, node_perm, send_perm;
+       u32 port_sid, node_sid, if_sid, sk_sid;
+
+       sk_sid = sksec->sid;
+       sk_class = sksec->sclass;
+
+       switch (sk_class) {
        case SECCLASS_UDP_SOCKET:
                netif_perm = NETIF__UDP_SEND;
                node_perm = NODE__UDP_SEND;
                send_perm = UDP_SOCKET__SEND_MSG;
                break;
-       
        case SECCLASS_TCP_SOCKET:
                netif_perm = NETIF__TCP_SEND;
                node_perm = NODE__TCP_SEND;
                send_perm = TCP_SOCKET__SEND_MSG;
                break;
-
        case SECCLASS_DCCP_SOCKET:
                netif_perm = NETIF__DCCP_SEND;
                node_perm = NODE__DCCP_SEND;
                send_perm = DCCP_SOCKET__SEND_MSG;
                break;
-
        default:
                netif_perm = NETIF__RAWIP_SEND;
                node_perm = NODE__RAWIP_SEND;
+               send_perm = 0;
                break;
        }
 
-       err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+       err = sel_netif_sid(ifindex, &if_sid);
        if (err)
-               goto out;
+               return err;
+       err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad);
+       if (err)
+               return err;
                
-       err = security_node_sid(family, addrp, len, &node_sid);
+       err = sel_netnode_sid(addrp, family, &node_sid);
        if (err)
-               goto out;
-       
-       err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad);
+               return err;
+       err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad);
        if (err)
-               goto out;
+               return err;
 
-       if (send_perm) {
-               u32 port_sid;
-               
-               err = security_port_sid(sk->sk_family,
-                                       sk->sk_type,
-                                       sk->sk_protocol,
-                                       ntohs(ad->u.net.dport),
-                                       &port_sid);
-               if (err)
-                       goto out;
+       if (!send_perm)
+               return 0;
 
-               err = avc_has_perm(isec->sid, port_sid, isec->sclass,
-                                  send_perm, ad);
+       err = security_port_sid(sk->sk_family, sk->sk_type,
+                               sk->sk_protocol, ntohs(ad->u.net.dport),
+                               &port_sid);
+       if (unlikely(err)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in"
+                      " selinux_ip_postroute_iptables_compat(),"
+                      " network port label not found\n");
+               return err;
        }
-out:
-       return err;
+       return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad);
 }
 
-static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
-                                              struct sk_buff *skb,
-                                              const struct net_device *in,
-                                              const struct net_device *out,
-                                              int (*okfn)(struct sk_buff *),
-                                              u16 family)
+static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
+                                               int ifindex,
+                                               struct avc_audit_data *ad,
+                                               u16 family,
+                                               char *addrp,
+                                               u8 proto)
 {
-       char *addrp;
-       int len, err = 0;
-       struct sock *sk;
-       struct avc_audit_data ad;
-       struct net_device *dev = (struct net_device *)out;
+       struct sock *sk = skb->sk;
        struct sk_security_struct *sksec;
-       u8 proto;
-
-       sk = skb->sk;
-       if (!sk)
-               goto out;
 
+       if (sk == NULL)
+               return NF_ACCEPT;
        sksec = sk->sk_security;
 
-       AVC_AUDIT_DATA_INIT(&ad, NET);
-       ad.u.net.netif = dev->name;
-       ad.u.net.family = family;
+       if (selinux_compat_net) {
+               if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
+                                                        ad, family, addrp))
+                       return NF_DROP;
+       } else {
+               if (avc_has_perm(sksec->sid, skb->secmark,
+                                SECCLASS_PACKET, PACKET__SEND, ad))
+                       return NF_DROP;
+       }
 
-       err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto);
-       if (err)
-               goto out;
+       if (selinux_policycap_netpeer)
+               if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
+                       return NF_DROP;
 
-       if (selinux_compat_net)
-               err = selinux_ip_postroute_last_compat(sk, dev, &ad,
-                                                      family, addrp, len);
-       else
-               err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
-                                  PACKET__SEND, &ad);
+       return NF_ACCEPT;
+}
 
-       if (err)
-               goto out;
+static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
+                                        u16 family)
+{
+       u32 secmark_perm;
+       u32 peer_sid;
+       struct sock *sk;
+       struct avc_audit_data ad;
+       char *addrp;
+       u8 proto;
+       u8 secmark_active;
+       u8 peerlbl_active;
 
-       err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto);
-out:
-       return err ? NF_DROP : NF_ACCEPT;
+       AVC_AUDIT_DATA_INIT(&ad, NET);
+       ad.u.net.netif = ifindex;
+       ad.u.net.family = family;
+       if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
+               return NF_DROP;
+
+       /* If any sort of compatibility mode is enabled then handoff processing
+        * to the selinux_ip_postroute_compat() function to deal with the
+        * special handling.  We do this in an attempt to keep this function
+        * as fast and as clean as possible. */
+       if (selinux_compat_net || !selinux_policycap_netpeer)
+               return selinux_ip_postroute_compat(skb, ifindex, &ad,
+                                                  family, addrp, proto);
+
+       /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
+        * packet transformation so allow the packet to pass without any checks
+        * since we'll have another chance to perform access control checks
+        * when the packet is on its final way out.
+        * NOTE: there appear to be some IPv6 multicast cases where skb->dst
+        *       is NULL; in this case go ahead and apply access control. */
+       if (skb->dst != NULL && skb->dst->xfrm != NULL)
+               return NF_ACCEPT;
+
+       secmark_active = selinux_secmark_enabled();
+       peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+       if (!secmark_active && !peerlbl_active)
+               return NF_ACCEPT;
+
+       /* if the packet is locally generated (skb->sk != NULL) then use the
+        * socket's label as the peer label, otherwise the packet is being
+        * forwarded through this system and we need to fetch the peer label
+        * directly from the packet */
+       sk = skb->sk;
+       if (sk) {
+               struct sk_security_struct *sksec = sk->sk_security;
+               peer_sid = sksec->sid;
+               secmark_perm = PACKET__SEND;
+       } else {
+               if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
+                       return NF_DROP;
+               secmark_perm = PACKET__FORWARD_OUT;
+       }
+
+       if (secmark_active)
+               if (avc_has_perm(peer_sid, skb->secmark,
+                                SECCLASS_PACKET, secmark_perm, &ad))
+                       return NF_DROP;
+
+       if (peerlbl_active) {
+               u32 if_sid;
+               u32 node_sid;
+
+               if (sel_netif_sid(ifindex, &if_sid))
+                       return NF_DROP;
+               if (avc_has_perm(peer_sid, if_sid,
+                                SECCLASS_NETIF, NETIF__EGRESS, &ad))
+                       return NF_DROP;
+
+               if (sel_netnode_sid(addrp, family, &node_sid))
+                       return NF_DROP;
+               if (avc_has_perm(peer_sid, node_sid,
+                                SECCLASS_NODE, NODE__SENDTO, &ad))
+                       return NF_DROP;
+       }
+
+       return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum,
-                                               struct sk_buff *skb,
-                                               const struct net_device *in,
-                                               const struct net_device *out,
-                                               int (*okfn)(struct sk_buff *))
+static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
+                                          struct sk_buff *skb,
+                                          const struct net_device *in,
+                                          const struct net_device *out,
+                                          int (*okfn)(struct sk_buff *))
 {
-       return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET);
+       return selinux_ip_postroute(skb, out->ifindex, PF_INET);
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-
-static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum,
-                                               struct sk_buff *skb,
-                                               const struct net_device *in,
-                                               const struct net_device *out,
-                                               int (*okfn)(struct sk_buff *))
+static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
+                                          struct sk_buff *skb,
+                                          const struct net_device *in,
+                                          const struct net_device *out,
+                                          int (*okfn)(struct sk_buff *))
 {
-       return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6);
+       return selinux_ip_postroute(skb, out->ifindex, PF_INET6);
 }
-
 #endif /* IPV6 */
 
 #endif /* CONFIG_NETFILTER */
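Taken together, the hooks in this file now apply the following checks when the netpeer policy capability is enabled (summarized from the hunks above; peer_sid comes from selinux_skb_peerlbl_sid(), sk_sid is the local socket's SID, and each line applies only while the corresponding mechanism, SECMARK or NetLabel/labeled-IPsec peer labeling, is active):

/*
 *   Local receive (selinux_socket_sock_rcv_skb):
 *       peer_sid -> interface  NETIF__INGRESS
 *       peer_sid -> node       NODE__RECVFROM
 *       sk_sid   -> peer_sid   PEER__RECV
 *       sk_sid   -> secmark    PACKET__RECV
 *
 *   Forwarded, inbound (NF_INET_FORWARD):
 *       peer_sid -> interface  NETIF__INGRESS
 *       peer_sid -> node       NODE__RECVFROM
 *       peer_sid -> secmark    PACKET__FORWARD_IN
 *
 *   Outbound (NF_INET_POST_ROUTING), locally generated or forwarded:
 *       peer_sid -> secmark    PACKET__SEND or PACKET__FORWARD_OUT
 *       peer_sid -> interface  NETIF__EGRESS
 *       peer_sid -> node       NODE__SENDTO
 */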
@@ -5277,22 +5483,40 @@ security_initcall(selinux_init);
 
 #if defined(CONFIG_NETFILTER)
 
-static struct nf_hook_ops selinux_ipv4_op = {
-       .hook =         selinux_ipv4_postroute_last,
-       .owner =        THIS_MODULE,
-       .pf =           PF_INET,
-       .hooknum =      NF_INET_POST_ROUTING,
-       .priority =     NF_IP_PRI_SELINUX_LAST,
+static struct nf_hook_ops selinux_ipv4_ops[] = {
+       {
+               .hook =         selinux_ipv4_postroute,
+               .owner =        THIS_MODULE,
+               .pf =           PF_INET,
+               .hooknum =      NF_INET_POST_ROUTING,
+               .priority =     NF_IP_PRI_SELINUX_LAST,
+       },
+       {
+               .hook =         selinux_ipv4_forward,
+               .owner =        THIS_MODULE,
+               .pf =           PF_INET,
+               .hooknum =      NF_INET_FORWARD,
+               .priority =     NF_IP_PRI_SELINUX_FIRST,
+       }
 };
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 
-static struct nf_hook_ops selinux_ipv6_op = {
-       .hook =         selinux_ipv6_postroute_last,
-       .owner =        THIS_MODULE,
-       .pf =           PF_INET6,
-       .hooknum =      NF_INET_POST_ROUTING,
-       .priority =     NF_IP6_PRI_SELINUX_LAST,
+static struct nf_hook_ops selinux_ipv6_ops[] = {
+       {
+               .hook =         selinux_ipv6_postroute,
+               .owner =        THIS_MODULE,
+               .pf =           PF_INET6,
+               .hooknum =      NF_INET_POST_ROUTING,
+               .priority =     NF_IP6_PRI_SELINUX_LAST,
+       },
+       {
+               .hook =         selinux_ipv6_forward,
+               .owner =        THIS_MODULE,
+               .pf =           PF_INET6,
+               .hooknum =      NF_INET_FORWARD,
+               .priority =     NF_IP6_PRI_SELINUX_FIRST,
+       }
 };
 
 #endif /* IPV6 */
@@ -5300,22 +5524,27 @@ static struct nf_hook_ops selinux_ipv6_op = {
 static int __init selinux_nf_ip_init(void)
 {
        int err = 0;
+       u32 iter;
 
        if (!selinux_enabled)
                goto out;
 
        printk(KERN_DEBUG "SELinux:  Registering netfilter hooks\n");
 
-       err = nf_register_hook(&selinux_ipv4_op);
-       if (err)
-               panic("SELinux: nf_register_hook for IPv4: error %d\n", err);
+       for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) {
+               err = nf_register_hook(&selinux_ipv4_ops[iter]);
+               if (err)
+                       panic("SELinux: nf_register_hook for IPv4: error %d\n",
+                             err);
+       }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-
-       err = nf_register_hook(&selinux_ipv6_op);
-       if (err)
-               panic("SELinux: nf_register_hook for IPv6: error %d\n", err);
-
+       for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) {
+               err = nf_register_hook(&selinux_ipv6_ops[iter]);
+               if (err)
+                       panic("SELinux: nf_register_hook for IPv6: error %d\n",
+                             err);
+       }
 #endif /* IPV6 */
 
 out:
@@ -5327,11 +5556,15 @@ __initcall(selinux_nf_ip_init);
 #ifdef CONFIG_SECURITY_SELINUX_DISABLE
 static void selinux_nf_ip_exit(void)
 {
+       u32 iter;
+
        printk(KERN_DEBUG "SELinux:  Unregistering netfilter hooks\n");
 
-       nf_unregister_hook(&selinux_ipv4_op);
+       for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++)
+               nf_unregister_hook(&selinux_ipv4_ops[iter]);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-       nf_unregister_hook(&selinux_ipv6_op);
+       for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++)
+               nf_unregister_hook(&selinux_ipv6_ops[iter]);
 #endif /* IPV6 */
 }
 #endif
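
Editor's note: the two FORWARD entries above point at selinux_ipv4_forward() and selinux_ipv6_forward(), whose bodies are not visible in this hunk. A hedged sketch of the ingress-side checks they are expected to perform, mirroring the egress checks in selinux_ip_postroute() shown earlier in this patch (variable names such as if_sid, node_sid, peer_sid and ad are illustrative, not quoted from the missing hunk):

        /* sketch only: ingress counterpart of the postroute egress checks */
        if (sel_netif_sid(ifindex, &if_sid))
                return NF_DROP;
        if (avc_has_perm(peer_sid, if_sid,
                         SECCLASS_NETIF, NETIF__INGRESS, &ad))
                return NF_DROP;

        if (sel_netnode_sid(addrp, family, &node_sid))
                return NF_DROP;
        if (avc_has_perm(peer_sid, node_sid,
                         SECCLASS_NODE, NODE__RECVFROM, &ad))
                return NF_DROP;
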
index 049bf69429b66d2566abc5bb063c19fc41647095..399f868c5c8fb945f430e1f3206c73036bb38045 100644 (file)
@@ -37,6 +37,8 @@
    S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest")
    S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv")
    S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send")
+   S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom")
+   S_(SECCLASS_NODE, NODE__SENDTO, "sendto")
    S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv")
    S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send")
    S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv")
@@ -45,6 +47,8 @@
    S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send")
    S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv")
    S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send")
+   S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress")
+   S_(SECCLASS_NETIF, NETIF__EGRESS, "egress")
    S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto")
    S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn")
    S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom")
    S_(SECCLASS_PACKET, PACKET__SEND, "send")
    S_(SECCLASS_PACKET, PACKET__RECV, "recv")
    S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto")
+   S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in")
+   S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out")
+   S_(SECCLASS_PACKET, PACKET__FORWARD_IN, "forward_in")
+   S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out")
    S_(SECCLASS_KEY, KEY__VIEW, "view")
    S_(SECCLASS_KEY, KEY__READ, "read")
    S_(SECCLASS_KEY, KEY__WRITE, "write")
    S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind")
    S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect")
    S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero")
+   S_(SECCLASS_PEER, PEER__RECV, "recv")
index eda89a2ec635db3ec67e546af97cd5d76dedc565..84c9abc809787026b077d2f0a808ee9f1efe71ca 100644 (file)
 #define NODE__ENFORCE_DEST                        0x00000040UL
 #define NODE__DCCP_RECV                           0x00000080UL
 #define NODE__DCCP_SEND                           0x00000100UL
+#define NODE__RECVFROM                            0x00000200UL
+#define NODE__SENDTO                              0x00000400UL
 #define NETIF__TCP_RECV                           0x00000001UL
 #define NETIF__TCP_SEND                           0x00000002UL
 #define NETIF__UDP_RECV                           0x00000004UL
 #define NETIF__RAWIP_SEND                         0x00000020UL
 #define NETIF__DCCP_RECV                          0x00000040UL
 #define NETIF__DCCP_SEND                          0x00000080UL
+#define NETIF__INGRESS                            0x00000100UL
+#define NETIF__EGRESS                             0x00000200UL
 #define NETLINK_SOCKET__IOCTL                     0x00000001UL
 #define NETLINK_SOCKET__READ                      0x00000002UL
 #define NETLINK_SOCKET__WRITE                     0x00000004UL
 #define PACKET__SEND                              0x00000001UL
 #define PACKET__RECV                              0x00000002UL
 #define PACKET__RELABELTO                         0x00000004UL
+#define PACKET__FLOW_IN                           0x00000008UL
+#define PACKET__FLOW_OUT                          0x00000010UL
+#define PACKET__FORWARD_IN                        0x00000020UL
+#define PACKET__FORWARD_OUT                       0x00000040UL
 #define KEY__VIEW                                 0x00000001UL
 #define KEY__READ                                 0x00000002UL
 #define KEY__WRITE                                0x00000004UL
 #define DCCP_SOCKET__NODE_BIND                    0x00400000UL
 #define DCCP_SOCKET__NAME_CONNECT                 0x00800000UL
 #define MEMPROTECT__MMAP_ZERO                     0x00000001UL
+#define PEER__RECV                                0x00000001UL
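
Editor's note: the new peer class carries a single permission; once a packet's peer SID has been resolved it can be checked against the receiving socket with the usual AVC call. A minimal sketch (sk_sid, peer_sid and ad are illustrative names, not taken verbatim from this patch):

        /* sketch: check that the socket may receive from this network peer */
        err = avc_has_perm(sk_sid, peer_sid,
                           SECCLASS_PEER, PEER__RECV, &ad);
        if (err)
                return err;     /* e.g. drop the packet on the input path */
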
index 553607a19e92f7e957cb4f5f05fd4f61465fd3d1..80c28fa6621c78527a23ee3fa8f9aa3e20ec6d55 100644 (file)
@@ -51,7 +51,7 @@ struct avc_audit_data {
                        struct inode *inode;
                } fs;
                struct {
-                       char *netif;
+                       int netif;
                        struct sock *sk;
                        u16 family;
                        __be16 dport;
index e77de0e62ea0e00deae7c92822d61b5ba9e82ce0..b1b0d1d8f9503d11004ab32581a8d17e647ea16a 100644 (file)
     S_(NULL)
     S_("dccp_socket")
     S_("memprotect")
+    S_(NULL)
+    S_(NULL)
+    S_(NULL)
+    S_(NULL)
+    S_(NULL)
+    S_(NULL)
+    S_("peer")
index a9c2b20f14b5c0584085180c272875453b9796c9..09e9dd23ee1a5f54345122b1f286852772e25227 100644 (file)
@@ -50,6 +50,7 @@
 #define SECCLASS_KEY                                     58
 #define SECCLASS_DCCP_SOCKET                             60
 #define SECCLASS_MEMPROTECT                              61
+#define SECCLASS_PEER                                    68
 
 /*
  * Security identifier indices for initial entities
index 8bd6f9992d2b3fe01bfe53bf807fa52c42c5d509..ce23edd128b3893b7649c374ea52ee7a7a8bfc29 100644 (file)
@@ -7,6 +7,8 @@
  * Author: James Morris <jmorris@redhat.com>
  *
  * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ *                    Paul Moore, <paul.moore@hp.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
@@ -15,7 +17,7 @@
 #ifndef _SELINUX_NETIF_H_
 #define _SELINUX_NETIF_H_
 
-int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid);
+int sel_netif_sid(int ifindex, u32 *sid);
 
 #endif /* _SELINUX_NETIF_H_ */
 
index 218e3f77c35096f1dd5d37a992191242517e6691..00a2809c85064b5e6a06e754e97a56208dd9eab7 100644 (file)
@@ -46,13 +46,17 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec,
 void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
                                      struct sk_security_struct *newssec);
 
-int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid);
+int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
+                                u16 family,
+                                u32 *type,
+                                u32 *sid);
 
 void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
 int selinux_netlbl_socket_post_create(struct socket *sock);
 int selinux_netlbl_inode_permission(struct inode *inode, int mask);
 int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
                                struct sk_buff *skb,
+                               u16 family,
                                struct avc_audit_data *ad);
 int selinux_netlbl_socket_setsockopt(struct socket *sock,
                                     int level,
@@ -83,9 +87,11 @@ static inline void selinux_netlbl_sk_security_clone(
 }
 
 static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
-                                              u32 base_sid,
+                                              u16 family,
+                                              u32 *type,
                                               u32 *sid)
 {
+       *type = NETLBL_NLTYPE_NONE;
        *sid = SECSID_NULL;
        return 0;
 }
@@ -106,6 +112,7 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode,
 }
 static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
                                              struct sk_buff *skb,
+                                             u16 family,
                                              struct avc_audit_data *ad)
 {
        return 0;
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h
new file mode 100644 (file)
index 0000000..1b94450
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ * Network node table
+ *
+ * SELinux must keep a mapping of network nodes to labels/SIDs.  This
+ * mapping is maintained as part of the normal policy but a fast cache is
+ * needed to reduce the lookup overhead since most of these queries happen on
+ * a per-packet basis.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _SELINUX_NETNODE_H
+#define _SELINUX_NETNODE_H
+
+int sel_netnode_sid(void *addr, u16 family, u32 *sid);
+
+#endif
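
Editor's note: sel_netnode_sid() interprets @addr according to @family, so callers hand it a pointer to either a __be32 (IPv4) or a struct in6_addr (IPv6). A hedged usage sketch for the IPv4 case, assuming the caller already has the skb and is returning a netfilter verdict:

        u32 node_sid;
        __be32 daddr = ip_hdr(skb)->daddr;

        if (sel_netnode_sid(&daddr, PF_INET, &node_sid))
                return NF_DROP;
        /* node_sid now labels the destination address for AVC checks */
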
index 4138a80f8e2706061666f5a3b83d731ff9204afb..c6c2bb4ebacc731d548febecaac36c6380ceab45 100644 (file)
@@ -96,17 +96,25 @@ struct bprm_security_struct {
 };
 
 struct netif_security_struct {
-       struct net_device *dev;         /* back pointer */
-       u32 if_sid;                     /* SID for this interface */
-       u32 msg_sid;                    /* default SID for messages received on this interface */
+       int ifindex;                    /* device index */
+       u32 sid;                        /* SID for this interface */
+};
+
+struct netnode_security_struct {
+       union {
+               __be32 ipv4;            /* IPv4 node address */
+               struct in6_addr ipv6;   /* IPv6 node address */
+       } addr;
+       u32 sid;                        /* SID for this node */
+       u16 family;                     /* address family */
 };
 
 struct sk_security_struct {
        struct sock *sk;                /* back pointer to sk object */
        u32 sid;                        /* SID of this object */
        u32 peer_sid;                   /* SID of peer */
-#ifdef CONFIG_NETLABEL
        u16 sclass;                     /* sock security class */
+#ifdef CONFIG_NETLABEL
        enum {                          /* NetLabel state */
                NLBL_UNSET = 0,
                NLBL_REQUIRE,
index 39337afffec2cf4cbc571fffaea33e6e530061dc..23137c17f917a0727dd79a99bcad900c308c387a 100644 (file)
 #define POLICYDB_VERSION_MLS           19
 #define POLICYDB_VERSION_AVTAB         20
 #define POLICYDB_VERSION_RANGETRANS    21
+#define POLICYDB_VERSION_POLCAP                22
 
 /* Range of policy versions we understand*/
 #define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
 #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
 #define POLICYDB_VERSION_MAX   CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
 #else
-#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_RANGETRANS
+#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_POLCAP
 #endif
 
 struct netlbl_lsm_secattr;
@@ -39,8 +40,19 @@ struct netlbl_lsm_secattr;
 extern int selinux_enabled;
 extern int selinux_mls_enabled;
 
+/* Policy capabilities */
+enum {
+       POLICYDB_CAPABILITY_NETPEER,
+       __POLICYDB_CAPABILITY_MAX
+};
+#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
+
+extern int selinux_policycap_netpeer;
+
 int security_load_policy(void * data, size_t len);
 
+int security_policycap_supported(unsigned int req_cap);
+
 #define SEL_VEC_MAX 32
 struct av_decision {
        u32 allowed;
@@ -77,8 +89,7 @@ int security_get_user_sids(u32 callsid, char *username,
 int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port,
        u32 *out_sid);
 
-int security_netif_sid(char *name, u32 *if_sid,
-       u32 *msg_sid);
+int security_netif_sid(char *name, u32 *if_sid);
 
 int security_node_sid(u16 domain, void *addr, u32 addrlen,
        u32 *out_sid);
@@ -88,10 +99,15 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
 
 int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid);
 
+int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
+                                u32 xfrm_sid,
+                                u32 *peer_sid);
+
 int security_get_classes(char ***classes, int *nclasses);
 int security_get_permissions(char *class, char ***perms, int *nperms);
 int security_get_reject_unknown(void);
 int security_get_allow_unknown(void);
+int security_get_policycaps(int *len, int **values);
 
 #define SECURITY_FS_USE_XATTR          1 /* use xattr */
 #define SECURITY_FS_USE_TRANS          2 /* use transition SIDs, e.g. devpts/tmpfs */
@@ -108,7 +124,6 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass,
 
 #ifdef CONFIG_NETLABEL
 int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
-                                  u32 base_sid,
                                   u32 *sid);
 
 int security_netlbl_sid_to_secattr(u32 sid,
@@ -116,7 +131,6 @@ int security_netlbl_sid_to_secattr(u32 sid,
 #else
 static inline int security_netlbl_secattr_to_sid(
                                            struct netlbl_lsm_secattr *secattr,
-                                           u32 base_sid,
                                            u32 *sid)
 {
        return -EIDRM;
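
Editor's note: selinux_policycap_netpeer is the switch that lets the rest of the module choose between the old per-message network controls and the new peer controls. How that gate is written is not visible in this section; one plausible sketch, combining the flag with the selinux_xfrm_enabled() helper added in the xfrm.h hunk below and assuming NetLabel exposes an "is enabled" check such as netlbl_enabled() (the helper name here is an assumption, not part of the patch as shown):

        static inline int selinux_peerlbl_enabled(void)
        {
                /* use the new peer controls only when the loaded policy
                 * opts in and at least one peer labeling mechanism
                 * (NetLabel or labeled IPsec) is active */
                return (selinux_policycap_netpeer &&
                        (netlbl_enabled() || selinux_xfrm_enabled()));
        }
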
index 31929e39f5ca9369e34e95715f9744fcb9a8f85f..36b0510efa7b9b1688360cdde9f14849187a0677 100644 (file)
@@ -32,6 +32,13 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
 }
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
+extern atomic_t selinux_xfrm_refcount;
+
+static inline int selinux_xfrm_enabled(void)
+{
+       return (atomic_read(&selinux_xfrm_refcount) > 0);
+}
+
 int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
                        struct avc_audit_data *ad);
 int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
@@ -43,6 +50,11 @@ static inline void selinux_xfrm_notify_policyload(void)
        atomic_inc(&flow_cache_genid);
 }
 #else
+static inline int selinux_xfrm_enabled(void)
+{
+       return 0;
+}
+
 static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
                        struct avc_audit_data *ad)
 {
index e87ab948104c05a263163752dbbe173c9403f144..013d3117a86b78ef57a159bf41de4d360c499029 100644 (file)
@@ -7,6 +7,8 @@
  * Author: James Morris <jmorris@redhat.com>
  *
  * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ *                    Paul Moore <paul.moore@hp.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
 #define SEL_NETIF_HASH_SIZE    64
 #define SEL_NETIF_HASH_MAX     1024
 
-#undef DEBUG
-
-#ifdef DEBUG
-#define DEBUGP printk
-#else
-#define DEBUGP(format, args...)
-#endif
-
 struct sel_netif
 {
        struct list_head list;
@@ -49,174 +43,226 @@ static LIST_HEAD(sel_netif_list);
 static DEFINE_SPINLOCK(sel_netif_lock);
 static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];
 
-static inline u32 sel_netif_hasfn(struct net_device *dev)
+/**
+ * sel_netif_hashfn - Hashing function for the interface table
+ * @ifindex: the network interface
+ *
+ * Description:
+ * This is the hashing function for the network interface table; it returns the
+ * bucket number for the given interface.
+ *
+ */
+static inline u32 sel_netif_hashfn(int ifindex)
 {
-       return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1));
+       return (ifindex & (SEL_NETIF_HASH_SIZE - 1));
 }
 
-/*
- * All of the devices should normally fit in the hash, so we optimize
- * for that case.
+/**
+ * sel_netif_find - Search for an interface record
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Search the network interface table and return the record matching @ifindex.
+ * If an entry cannot be found in the table, return NULL.
+ *
  */
-static inline struct sel_netif *sel_netif_find(struct net_device *dev)
+static inline struct sel_netif *sel_netif_find(int ifindex)
 {
-       struct list_head *pos;
-       int idx = sel_netif_hasfn(dev);
+       int idx = sel_netif_hashfn(ifindex);
+       struct sel_netif *netif;
 
-       __list_for_each_rcu(pos, &sel_netif_hash[idx]) {
-               struct sel_netif *netif = list_entry(pos,
-                                                    struct sel_netif, list);
-               if (likely(netif->nsec.dev == dev))
+       list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list)
+               /* all of the devices should normally fit in the hash, so we
+                * optimize for that case */
+               if (likely(netif->nsec.ifindex == ifindex))
                        return netif;
-       }
+
        return NULL;
 }
 
+/**
+ * sel_netif_insert - Insert a new interface into the table
+ * @netif: the new interface record
+ *
+ * Description:
+ * Add a new interface record to the network interface hash table.  Returns
+ * zero on success, negative values on failure.
+ *
+ */
 static int sel_netif_insert(struct sel_netif *netif)
 {
-       int idx, ret = 0;
+       int idx;
        
-       if (sel_netif_total >= SEL_NETIF_HASH_MAX) {
-               ret = -ENOSPC;
-               goto out;
-       }
+       if (sel_netif_total >= SEL_NETIF_HASH_MAX)
+               return -ENOSPC;
        
-       idx = sel_netif_hasfn(netif->nsec.dev);
+       idx = sel_netif_hashfn(netif->nsec.ifindex);
        list_add_rcu(&netif->list, &sel_netif_hash[idx]);
        sel_netif_total++;
-out:
-       return ret;
+
+       return 0;
 }
 
+/**
+ * sel_netif_free - Frees an interface entry
+ * @p: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely.
+ *
+ */
 static void sel_netif_free(struct rcu_head *p)
 {
        struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head);
-
-       DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
        kfree(netif);
 }
 
+/**
+ * sel_netif_destroy - Remove an interface record from the table
+ * @netif: the existing interface record
+ *
+ * Description:
+ * Remove an existing interface record from the network interface table.
+ *
+ */
 static void sel_netif_destroy(struct sel_netif *netif)
 {
-       DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name);
-
        list_del_rcu(&netif->list);
        sel_netif_total--;
        call_rcu(&netif->rcu_head, sel_netif_free);
 }
 
-static struct sel_netif *sel_netif_lookup(struct net_device *dev)
+/**
+ * sel_netif_sid_slow - Lookup the SID of a network interface using the policy
+ * @ifindex: the network interface
+ * @sid: interface SID
+ *
+ * Description:
+ * This function determines the SID of a network interface by querying the
+ * security policy.  The result is added to the network interface table to
+ * speed up future queries.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int sel_netif_sid_slow(int ifindex, u32 *sid)
 {
        int ret;
-       struct sel_netif *netif, *new;
-       struct netif_security_struct *nsec;
-
-       netif = sel_netif_find(dev);
-       if (likely(netif != NULL))
-               goto out;
-       
-       new = kzalloc(sizeof(*new), GFP_ATOMIC);
-       if (!new) {
-               netif = ERR_PTR(-ENOMEM);
-               goto out;
+       struct sel_netif *netif;
+       struct sel_netif *new = NULL;
+       struct net_device *dev;
+
+       /* NOTE: we always use init's network namespace since we don't
+        * currently support containers */
+
+       dev = dev_get_by_index(&init_net, ifindex);
+       if (unlikely(dev == NULL)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in sel_netif_sid_slow(),"
+                      " invalid network interface (%d)\n", ifindex);
+               return -ENOENT;
        }
-       
-       nsec = &new->nsec;
 
-       ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid);
-       if (ret < 0) {
-               kfree(new);
-               netif = ERR_PTR(ret);
+       spin_lock_bh(&sel_netif_lock);
+       netif = sel_netif_find(ifindex);
+       if (netif != NULL) {
+               *sid = netif->nsec.sid;
+               ret = 0;
                goto out;
        }
-
-       nsec->dev = dev;
-       
-       spin_lock_bh(&sel_netif_lock);
-       
-       netif = sel_netif_find(dev);
-       if (netif) {
-               spin_unlock_bh(&sel_netif_lock);
-               kfree(new);
+       new = kzalloc(sizeof(*new), GFP_ATOMIC);
+       if (new == NULL) {
+               ret = -ENOMEM;
                goto out;
        }
-       
+       ret = security_netif_sid(dev->name, &new->nsec.sid);
+       if (ret != 0)
+               goto out;
+       new->nsec.ifindex = ifindex;
        ret = sel_netif_insert(new);
-       spin_unlock_bh(&sel_netif_lock);
-       
-       if (ret) {
-               kfree(new);
-               netif = ERR_PTR(ret);
+       if (ret != 0)
                goto out;
-       }
+       *sid = new->nsec.sid;
 
-       netif = new;
-       
-       DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name,
-               nsec->if_sid, nsec->msg_sid);
 out:
-       return netif;
-}
-
-static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out)
-{
-       if (if_sid_out)
-               *if_sid_out = if_sid_in;
-       if (msg_sid_out)
-               *msg_sid_out = msg_sid_in;
-}
-
-static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
-{
-       int ret = 0;
-       u32 tmp_if_sid, tmp_msg_sid;
-       
-       ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid);
-       if (!ret)
-               sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid);
+       spin_unlock_bh(&sel_netif_lock);
+       dev_put(dev);
+       if (unlikely(ret)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in sel_netif_sid_slow(),"
+                      " unable to determine network interface label (%d)\n",
+                      ifindex);
+               kfree(new);
+       }
        return ret;
 }
 
-int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid)
+/**
+ * sel_netif_sid - Lookup the SID of a network interface
+ * @ifindex: the network interface
+ * @sid: interface SID
+ *
+ * Description:
+ * This function determines the SID of a network interface using the fastest
+ * method possible.  First the interface table is queried, but if an entry
+ * can't be found then the policy is queried and the result is added to the
+ * table to speed up future queries.  Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int sel_netif_sid(int ifindex, u32 *sid)
 {
-       int ret = 0;
        struct sel_netif *netif;
 
        rcu_read_lock();
-       netif = sel_netif_lookup(dev);
-       if (IS_ERR(netif)) {
+       netif = sel_netif_find(ifindex);
+       if (likely(netif != NULL)) {
+               *sid = netif->nsec.sid;
                rcu_read_unlock();
-               ret = sel_netif_sids_slow(dev, if_sid, msg_sid);
-               goto out;
+               return 0;
        }
-       sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid);
        rcu_read_unlock();
-out:
-       return ret;
+
+       return sel_netif_sid_slow(ifindex, sid);
 }
 
-static void sel_netif_kill(struct net_device *dev)
+/**
+ * sel_netif_kill - Remove an entry from the network interface table
+ * @ifindex: the network interface
+ *
+ * Description:
+ * This function removes the entry matching @ifindex from the network interface
+ * table if it exists.
+ *
+ */
+static void sel_netif_kill(int ifindex)
 {
        struct sel_netif *netif;
 
        spin_lock_bh(&sel_netif_lock);
-       netif = sel_netif_find(dev);
+       netif = sel_netif_find(ifindex);
        if (netif)
                sel_netif_destroy(netif);
        spin_unlock_bh(&sel_netif_lock);
 }
 
+/**
+ * sel_netif_flush - Flush the entire network interface table
+ *
+ * Description:
+ * Remove all entries from the network interface table.
+ *
+ */
 static void sel_netif_flush(void)
 {
        int idx;
+       struct sel_netif *netif;
 
        spin_lock_bh(&sel_netif_lock);
-       for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) {
-               struct sel_netif *netif;
-               
+       for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++)
                list_for_each_entry(netif, &sel_netif_hash[idx], list)
                        sel_netif_destroy(netif);
-       }
        spin_unlock_bh(&sel_netif_lock);
 }
 
@@ -239,7 +285,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this,
                return NOTIFY_DONE;
 
        if (event == NETDEV_DOWN)
-               sel_netif_kill(dev);
+               sel_netif_kill(dev->ifindex);
 
        return NOTIFY_DONE;
 }
@@ -250,10 +296,10 @@ static struct notifier_block sel_netif_netdev_notifier = {
 
 static __init int sel_netif_init(void)
 {
-       int i, err = 0;
+       int i, err;
        
        if (!selinux_enabled)
-               goto out;
+               return 0;
 
        for (i = 0; i < SEL_NETIF_HASH_SIZE; i++)
                INIT_LIST_HEAD(&sel_netif_hash[i]);
@@ -265,7 +311,6 @@ static __init int sel_netif_init(void)
        if (err)
                panic("avc_add_callback() failed, error %d\n", err);
 
-out:
        return err;
 }
 
index 66e013d6f6f6f081665a62c7b3a733b14b4d2cb7..0fa2be4149e80db80741eb633d9c7b27f1b87d88 100644 (file)
 #include "objsec.h"
 #include "security.h"
 
+/**
+ * selinux_netlbl_sidlookup_cached - Cache a SID lookup
+ * @skb: the packet
+ * @secattr: the NetLabel security attributes
+ * @sid: the SID
+ *
+ * Description:
+ * Query the SELinux security server to lookup the correct SID for the given
+ * security attributes.  If the query is successful, cache the result to speed
+ * up future lookups.  Returns zero on success, negative values on failure.
+ *
+ */
+static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
+                                          struct netlbl_lsm_secattr *secattr,
+                                          u32 *sid)
+{
+       int rc;
+
+       rc = security_netlbl_secattr_to_sid(secattr, sid);
+       if (rc == 0 &&
+           (secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
+           (secattr->flags & NETLBL_SECATTR_CACHE))
+               netlbl_cache_add(skb, secattr);
+
+       return rc;
+}
+
 /**
  * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism
  * @sk: the socket to label
@@ -137,14 +164,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
         * lock as other threads could have access to ssec */
        rcu_read_lock();
        selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family);
-       newssec->sclass = ssec->sclass;
        rcu_read_unlock();
 }
 
 /**
  * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
  * @skb: the packet
- * @base_sid: the SELinux SID to use as a context for MLS only attributes
+ * @family: protocol family
+ * @type: NetLabel labeling protocol type
  * @sid: the SID
  *
  * Description:
@@ -153,7 +180,10 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec,
  * assign to the packet.  Returns zero on success, negative values on failure.
  *
  */
-int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
+int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
+                                u16 family,
+                                u32 *type,
+                                u32 *sid)
 {
        int rc;
        struct netlbl_lsm_secattr secattr;
@@ -164,15 +194,12 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
        }
 
        netlbl_secattr_init(&secattr);
-       rc = netlbl_skbuff_getattr(skb, &secattr);
-       if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) {
-               rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid);
-               if (rc == 0 &&
-                   (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
-                   (secattr.flags & NETLBL_SECATTR_CACHE))
-                       netlbl_cache_add(skb, &secattr);
-       } else
+       rc = netlbl_skbuff_getattr(skb, family, &secattr);
+       if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
+               rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid);
+       else
                *sid = SECSID_NULL;
+       *type = secattr.type;
        netlbl_secattr_destroy(&secattr);
 
        return rc;
@@ -190,13 +217,10 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid)
  */
 void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
 {
-       struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
        struct sk_security_struct *sksec = sk->sk_security;
        struct netlbl_lsm_secattr secattr;
        u32 nlbl_peer_sid;
 
-       sksec->sclass = isec->sclass;
-
        rcu_read_lock();
 
        if (sksec->nlbl_state != NLBL_REQUIRE) {
@@ -207,9 +231,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
        netlbl_secattr_init(&secattr);
        if (netlbl_sock_getattr(sk, &secattr) == 0 &&
            secattr.flags != NETLBL_SECATTR_NONE &&
-           security_netlbl_secattr_to_sid(&secattr,
-                                          SECINITSID_NETMSG,
-                                          &nlbl_peer_sid) == 0)
+           security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0)
                sksec->peer_sid = nlbl_peer_sid;
        netlbl_secattr_destroy(&secattr);
 
@@ -234,11 +256,8 @@ int selinux_netlbl_socket_post_create(struct socket *sock)
 {
        int rc = 0;
        struct sock *sk = sock->sk;
-       struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
        struct sk_security_struct *sksec = sk->sk_security;
 
-       sksec->sclass = isec->sclass;
-
        rcu_read_lock();
        if (sksec->nlbl_state == NLBL_REQUIRE)
                rc = selinux_netlbl_sock_setsid(sk, sksec->sid);
@@ -292,6 +311,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
  * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
  * @sksec: the sock's sk_security_struct
  * @skb: the packet
+ * @family: protocol family
  * @ad: the audit data
  *
  * Description:
@@ -302,6 +322,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
  */
 int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
                                struct sk_buff *skb,
+                               u16 family,
                                struct avc_audit_data *ad)
 {
        int rc;
@@ -313,16 +334,10 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
                return 0;
 
        netlbl_secattr_init(&secattr);
-       rc = netlbl_skbuff_getattr(skb, &secattr);
-       if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) {
-               rc = security_netlbl_secattr_to_sid(&secattr,
-                                                   SECINITSID_NETMSG,
-                                                   &nlbl_sid);
-               if (rc == 0 &&
-                   (secattr.flags & NETLBL_SECATTR_CACHEABLE) &&
-                   (secattr.flags & NETLBL_SECATTR_CACHE))
-                       netlbl_cache_add(skb, &secattr);
-       } else
+       rc = netlbl_skbuff_getattr(skb, family, &secattr);
+       if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
+               rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid);
+       else
                nlbl_sid = SECINITSID_UNLABELED;
        netlbl_secattr_destroy(&secattr);
        if (rc != 0)
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
new file mode 100644 (file)
index 0000000..f3c526f
--- /dev/null
@@ -0,0 +1,354 @@
+/*
+ * Network node table
+ *
+ * SELinux must keep a mapping of network nodes to labels/SIDs.  This
+ * mapping is maintained as part of the normal policy but a fast cache is
+ * needed to reduce the lookup overhead since most of these queries happen on
+ * a per-packet basis.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ * This code is heavily based on the "netif" concept originally developed by
+ * James Morris <jmorris@redhat.com>
+ *   (see security/selinux/netif.c for more information)
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2007
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <asm/bug.h>
+
+#include "objsec.h"
+
+#define SEL_NETNODE_HASH_SIZE       256
+#define SEL_NETNODE_HASH_BKT_LIMIT   16
+
+struct sel_netnode {
+       struct netnode_security_struct nsec;
+
+       struct list_head list;
+       struct rcu_head rcu;
+};
+
+/* NOTE: we are using a combined hash table for both IPv4 and IPv6; the reason
+ * for this is that I suspect most users will not make heavy use of both
+ * address families at the same time, so one table will usually end up wasted.
+ * If this becomes a problem we can always add a hash table for each address
+ * family later. */
+
+static LIST_HEAD(sel_netnode_list);
+static DEFINE_SPINLOCK(sel_netnode_lock);
+static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE];
+
+/**
+ * sel_netnode_free - Frees a node entry
+ * @p: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table node entry can be
+ * released safely.
+ *
+ */
+static void sel_netnode_free(struct rcu_head *p)
+{
+       struct sel_netnode *node = container_of(p, struct sel_netnode, rcu);
+       kfree(node);
+}
+
+/**
+ * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table
+ * @addr: IPv4 address
+ *
+ * Description:
+ * This is the IPv4 hashing function for the network node table; it returns
+ * the bucket number for the given IP address.
+ *
+ */
+static u32 sel_netnode_hashfn_ipv4(__be32 addr)
+{
+       /* at some point we should determine if the mismatch in byte order
+        * affects the hash function dramatically */
+       return (addr & (SEL_NETNODE_HASH_SIZE - 1));
+}
+
+/**
+ * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table
+ * @addr: IPv6 address
+ *
+ * Description:
+ * This is the IPv6 hashing function for the network node table; it returns
+ * the bucket number for the given IP address.
+ *
+ */
+static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr)
+{
+       /* just hash the least significant 32 bits to keep things fast (they
+        * are the most likely to be different anyway), we can revisit this
+        * later if needed */
+       return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1));
+}
+
+/**
+ * sel_netnode_find - Search for a node record
+ * @addr: IP address
+ * @family: address family
+ *
+ * Description:
+ * Search the network node table and return the record matching @addr.  If an
+ * entry cannot be found in the table, return NULL.
+ *
+ */
+static struct sel_netnode *sel_netnode_find(const void *addr, u16 family)
+{
+       u32 idx;
+       struct sel_netnode *node;
+
+       switch (family) {
+       case PF_INET:
+               idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr);
+               break;
+       case PF_INET6:
+               idx = sel_netnode_hashfn_ipv6(addr);
+               break;
+       default:
+               BUG();
+       }
+
+       list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list)
+               if (node->nsec.family == family)
+                       switch (family) {
+                       case PF_INET:
+                               if (node->nsec.addr.ipv4 == *(__be32 *)addr)
+                                       return node;
+                               break;
+                       case PF_INET6:
+                               if (ipv6_addr_equal(&node->nsec.addr.ipv6,
+                                                   addr))
+                                       return node;
+                               break;
+                       }
+
+       return NULL;
+}
+
+/**
+ * sel_netnode_insert - Insert a new node into the table
+ * @node: the new node record
+ *
+ * Description:
+ * Add a new node record to the network address hash table.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int sel_netnode_insert(struct sel_netnode *node)
+{
+       u32 idx;
+       u32 count = 0;
+       struct sel_netnode *iter;
+
+       switch (node->nsec.family) {
+       case PF_INET:
+               idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4);
+               break;
+       case PF_INET6:
+               idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6);
+               break;
+       default:
+               BUG();
+       }
+       list_add_rcu(&node->list, &sel_netnode_hash[idx]);
+
+       /* we need to impose a limit on the growth of the hash table so check
+        * this bucket to make sure it is within the specified bounds */
+       list_for_each_entry(iter, &sel_netnode_hash[idx], list)
+               if (++count > SEL_NETNODE_HASH_BKT_LIMIT) {
+                       list_del_rcu(&iter->list);
+                       call_rcu(&iter->rcu, sel_netnode_free);
+                       break;
+               }
+
+       return 0;
+}
+
+/**
+ * sel_netnode_destroy - Remove a node record from the table
+ * @node: the existing node record
+ *
+ * Description:
+ * Remove an existing node record from the network address table.
+ *
+ */
+static void sel_netnode_destroy(struct sel_netnode *node)
+{
+       list_del_rcu(&node->list);
+       call_rcu(&node->rcu, sel_netnode_free);
+}
+
+/**
+ * sel_netnode_sid_slow - Lookup the SID of a network address using the policy
+ * @addr: the IP address
+ * @family: the address family
+ * @sid: node SID
+ *
+ * Description:
+ * This function determines the SID of a network address by querying the
+ * security policy.  The result is added to the network address table to
+ * speed up future queries.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
+{
+       int ret;
+       struct sel_netnode *node;
+       struct sel_netnode *new = NULL;
+
+       spin_lock_bh(&sel_netnode_lock);
+       node = sel_netnode_find(addr, family);
+       if (node != NULL) {
+               *sid = node->nsec.sid;
+               ret = 0;
+               goto out;
+       }
+       new = kzalloc(sizeof(*new), GFP_ATOMIC);
+       if (new == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       switch (family) {
+       case PF_INET:
+               ret = security_node_sid(PF_INET,
+                                       addr, sizeof(struct in_addr),
+                                       &new->nsec.sid);
+               new->nsec.addr.ipv4 = *(__be32 *)addr;
+               break;
+       case PF_INET6:
+               ret = security_node_sid(PF_INET6,
+                                       addr, sizeof(struct in6_addr),
+                                       &new->nsec.sid);
+               ipv6_addr_copy(&new->nsec.addr.ipv6, addr);
+               break;
+       default:
+               BUG();
+       }
+       if (ret != 0)
+               goto out;
+       new->nsec.family = family;
+       ret = sel_netnode_insert(new);
+       if (ret != 0)
+               goto out;
+       *sid = new->nsec.sid;
+
+out:
+       spin_unlock_bh(&sel_netnode_lock);
+       if (unlikely(ret)) {
+               printk(KERN_WARNING
+                      "SELinux: failure in sel_netnode_sid_slow(),"
+                      " unable to determine network node label\n");
+               kfree(new);
+       }
+       return ret;
+}
+
+/**
+ * sel_netnode_sid - Lookup the SID of a network address
+ * @addr: the IP address
+ * @family: the address family
+ * @sid: node SID
+ *
+ * Description:
+ * This function determines the SID of a network address using the fastest
+ * method possible.  First the address table is queried, but if an entry
+ * can't be found then the policy is queried and the result is added to the
+ * table to speed up future queries.  Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int sel_netnode_sid(void *addr, u16 family, u32 *sid)
+{
+       struct sel_netnode *node;
+
+       rcu_read_lock();
+       node = sel_netnode_find(addr, family);
+       if (node != NULL) {
+               *sid = node->nsec.sid;
+               rcu_read_unlock();
+               return 0;
+       }
+       rcu_read_unlock();
+
+       return sel_netnode_sid_slow(addr, family, sid);
+}
+
+/**
+ * sel_netnode_flush - Flush the entire network address table
+ *
+ * Description:
+ * Remove all entries from the network address table.
+ *
+ */
+static void sel_netnode_flush(void)
+{
+       u32 idx;
+       struct sel_netnode *node;
+
+       spin_lock_bh(&sel_netnode_lock);
+       for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++)
+               list_for_each_entry(node, &sel_netnode_hash[idx], list)
+                       sel_netnode_destroy(node);
+       spin_unlock_bh(&sel_netnode_lock);
+}
+
+static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid,
+                                   u16 class, u32 perms, u32 *retained)
+{
+       if (event == AVC_CALLBACK_RESET) {
+               sel_netnode_flush();
+               synchronize_net();
+       }
+       return 0;
+}
+
+static __init int sel_netnode_init(void)
+{
+       int iter;
+       int ret;
+
+       if (!selinux_enabled)
+               return 0;
+
+       for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++)
+               INIT_LIST_HEAD(&sel_netnode_hash[iter]);
+
+       ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET,
+                              SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0);
+       if (ret != 0)
+               panic("avc_add_callback() failed, error %d\n", ret);
+
+       return ret;
+}
+
+__initcall(sel_netnode_init);
index 397fd4955fe1fb9dc9e57b95189084435f648294..a85740530afc676fdb777c88a8cb4136d90370cd 100644 (file)
@@ -2,6 +2,11 @@
  *
  *     Added conditional policy language extensions
  *
+ *  Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added support for the policy capability bitmap
+ *
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2003 - 2004 Tresys Technology, LLC
  * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  *     This program is free software; you can redistribute it and/or modify
 #include "objsec.h"
 #include "conditional.h"
 
+/* Policy capability filenames */
+static char *policycap_names[] = {
+       "network_peer_controls"
+};
+
 unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
 
 #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT
@@ -72,6 +82,9 @@ static int *bool_pending_values = NULL;
 static struct dentry *class_dir = NULL;
 static unsigned long last_class_ino;
 
+/* global data for policy capabilities */
+static struct dentry *policycap_dir = NULL;
+
 extern void selnl_notify_setenforce(int val);
 
 /* Check whether a task is allowed to use a security operation. */
@@ -111,10 +124,11 @@ enum sel_inos {
 
 static unsigned long sel_last_ino = SEL_INO_NEXT - 1;
 
-#define SEL_INITCON_INO_OFFSET         0x01000000
-#define SEL_BOOL_INO_OFFSET    0x02000000
-#define SEL_CLASS_INO_OFFSET   0x04000000
-#define SEL_INO_MASK           0x00ffffff
+#define SEL_INITCON_INO_OFFSET         0x01000000
+#define SEL_BOOL_INO_OFFSET            0x02000000
+#define SEL_CLASS_INO_OFFSET           0x04000000
+#define SEL_POLICYCAP_INO_OFFSET       0x08000000
+#define SEL_INO_MASK                   0x00ffffff
 
 #define TMPBUFLEN      12
 static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
@@ -263,6 +277,7 @@ static const struct file_operations sel_policyvers_ops = {
 /* declaration for sel_write_load */
 static int sel_make_bools(void);
 static int sel_make_classes(void);
+static int sel_make_policycap(void);
 
 /* declaration for sel_make_class_dirs */
 static int sel_make_dir(struct inode *dir, struct dentry *dentry,
@@ -323,6 +338,12 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
        }
 
        ret = sel_make_classes();
+       if (ret) {
+               length = ret;
+               goto out1;
+       }
+
+       ret = sel_make_policycap();
        if (ret)
                length = ret;
        else
@@ -1399,6 +1420,24 @@ static const struct file_operations sel_perm_ops = {
        .read           = sel_read_perm,
 };
 
+static ssize_t sel_read_policycap(struct file *file, char __user *buf,
+                                 size_t count, loff_t *ppos)
+{
+       int value;
+       char tmpbuf[TMPBUFLEN];
+       ssize_t length;
+       unsigned long i_ino = file->f_path.dentry->d_inode->i_ino;
+
+       value = security_policycap_supported(i_ino & SEL_INO_MASK);
+       length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value);
+
+       return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static const struct file_operations sel_policycap_ops = {
+       .read           = sel_read_policycap,
+};
+
 static int sel_make_perm_files(char *objclass, int classvalue,
                                struct dentry *dir)
 {
@@ -1545,6 +1584,36 @@ out:
        return rc;
 }
 
+static int sel_make_policycap(void)
+{
+       unsigned int iter;
+       struct dentry *dentry = NULL;
+       struct inode *inode = NULL;
+
+       sel_remove_entries(policycap_dir);
+
+       for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) {
+               if (iter < ARRAY_SIZE(policycap_names))
+                       dentry = d_alloc_name(policycap_dir,
+                                             policycap_names[iter]);
+               else
+                       dentry = d_alloc_name(policycap_dir, "unknown");
+
+               if (dentry == NULL)
+                       return -ENOMEM;
+
+               inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO);
+               if (inode == NULL)
+                       return -ENOMEM;
+
+               inode->i_fop = &sel_policycap_ops;
+               inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET;
+               d_add(dentry, inode);
+       }
+
+       return 0;
+}
+
 static int sel_make_dir(struct inode *dir, struct dentry *dentry,
                        unsigned long *ino)
 {
@@ -1673,6 +1742,18 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent)
 
        class_dir = dentry;
 
+       dentry = d_alloc_name(sb->s_root, "policy_capabilities");
+       if (!dentry) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = sel_make_dir(root_inode, dentry, &sel_last_ino);
+       if (ret)
+               goto err;
+
+       policycap_dir = dentry;
+
 out:
        return ret;
 err:
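
Editor's note: each capability becomes a read-only file under the new policy_capabilities directory, so userspace can probe whether the running policy enables it. A minimal userspace sketch, assuming selinuxfs is mounted at /selinux (the helper name is illustrative):

        #include <fcntl.h>
        #include <unistd.h>

        static int netpeer_capability_enabled(void)
        {
                char buf[2] = "0";
                int fd = open("/selinux/policy_capabilities/network_peer_controls",
                              O_RDONLY);

                if (fd < 0)
                        return 0;       /* file absent: capability unsupported */
                if (read(fd, buf, 1) < 0)
                        buf[0] = '0';
                close(fd);
                return buf[0] == '1';   /* sel_read_policycap() prints 0 or 1 */
        }
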
index 3bbcb5369af9eb6ae2f230a2b0d0ea5099a55273..feaf0a5b828fc19cb80c1af3c48df232d20c0bfe 100644 (file)
@@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context,
        if (!selinux_mls_enabled)
                return;
 
-       secattr->mls_lvl = context->range.level[0].sens - 1;
+       secattr->attr.mls.lvl = context->range.level[0].sens - 1;
        secattr->flags |= NETLBL_SECATTR_MLS_LVL;
 }
 
@@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context,
        if (!selinux_mls_enabled)
                return;
 
-       context->range.level[0].sens = secattr->mls_lvl + 1;
+       context->range.level[0].sens = secattr->attr.mls.lvl + 1;
        context->range.level[1].sens = context->range.level[0].sens;
 }
 
@@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context,
                return 0;
 
        rc = ebitmap_netlbl_export(&context->range.level[0].cat,
-                                  &secattr->mls_cat);
-       if (rc == 0 && secattr->mls_cat != NULL)
+                                  &secattr->attr.mls.cat);
+       if (rc == 0 && secattr->attr.mls.cat != NULL)
                secattr->flags |= NETLBL_SECATTR_MLS_CAT;
 
        return rc;
@@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context,
                return 0;
 
        rc = ebitmap_netlbl_import(&context->range.level[0].cat,
-                                  secattr->mls_cat);
+                                  secattr->attr.mls.cat);
        if (rc != 0)
                goto import_netlbl_cat_failure;
 
index b582aae3c62c339f320a11d067a14f53fdea3038..bd7d6a00342daa1a36ffcd244e0ac2eb02fdab79 100644 (file)
  *
  *     Added conditional policy language extensions
  *
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added support for the policy capability bitmap
+ *
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
  * Copyright (C) 2003 - 2004 Tresys Technology, LLC
  *     This program is free software; you can redistribute it and/or modify
@@ -102,6 +107,11 @@ static struct policydb_compat_info policydb_compat[] = {
                .sym_num        = SYM_NUM,
                .ocon_num       = OCON_NUM,
        },
+       {
+               .version        = POLICYDB_VERSION_POLCAP,
+               .sym_num        = SYM_NUM,
+               .ocon_num       = OCON_NUM,
+       }
 };
 
 static struct policydb_compat_info *policydb_lookup_compat(int version)
@@ -183,6 +193,8 @@ static int policydb_init(struct policydb *p)
        if (rc)
                goto out_free_symtab;
 
+       ebitmap_init(&p->policycaps);
+
 out:
        return rc;
 
@@ -673,8 +685,8 @@ void policydb_destroy(struct policydb *p)
                        ebitmap_destroy(&p->type_attr_map[i]);
        }
        kfree(p->type_attr_map);
-
        kfree(p->undefined_perms);
+       ebitmap_destroy(&p->policycaps);
 
        return;
 }
@@ -1554,6 +1566,10 @@ int policydb_read(struct policydb *p, void *fp)
        p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
        p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);
 
+       if (p->policyvers >= POLICYDB_VERSION_POLCAP &&
+           ebitmap_read(&p->policycaps, fp) != 0)
+               goto bad;
+
        info = policydb_lookup_compat(p->policyvers);
        if (!info) {
                printk(KERN_ERR "security:  unable to find policy compat info "
index ed6fc687c66fd44107afe0e475ab233399e2d0d2..c4ce996e202c1c62357bdaf7bcd02d5988bb4fff 100644 (file)
@@ -241,6 +241,8 @@ struct policydb {
        /* type -> attribute reverse mapping */
        struct ebitmap *type_attr_map;
 
+       struct ebitmap policycaps;
+
        unsigned int policyvers;
 
        unsigned int reject_unknown : 1;
index 4bf715d4cf29e2090ab2d7d97b346a5e4087f9dd..f96dec1f9258f31c88525543c59d586354619649 100644 (file)
  * Updated: Hewlett-Packard <paul.moore@hp.com>
  *
  *      Added support for NetLabel
+ *      Added support for the policy capability bitmap
  *
  * Updated: Chad Sellers <csellers@tresys.com>
  *
  *  Added validation of kernel classes and permissions
  *
- * Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
  * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -59,6 +60,8 @@
 extern void selnl_notify_policyload(u32 seqno);
 unsigned int policydb_loaded_version;
 
+int selinux_policycap_netpeer;
+
 /*
  * This is declared in avc.c
  */
@@ -1299,6 +1302,12 @@ bad:
        goto out;
 }
 
+static void security_load_policycaps(void)
+{
+       selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps,
+                                                 POLICYDB_CAPABILITY_NETPEER);
+}
+
 extern void selinux_complete_init(void);
 static int security_preserve_bools(struct policydb *p);
 
@@ -1346,6 +1355,7 @@ int security_load_policy(void *data, size_t len)
                        avtab_cache_destroy();
                        return -EINVAL;
                }
+               security_load_policycaps();
                policydb_loaded_version = policydb.policyvers;
                ss_initialized = 1;
                seqno = ++latest_granting;
@@ -1404,6 +1414,7 @@ int security_load_policy(void *data, size_t len)
        POLICY_WRLOCK;
        memcpy(&policydb, &newpolicydb, sizeof policydb);
        sidtab_set(&sidtab, &newsidtab);
+       security_load_policycaps();
        seqno = ++latest_granting;
        policydb_loaded_version = policydb.policyvers;
        POLICY_WRUNLOCK;
@@ -1478,11 +1489,8 @@ out:
  * security_netif_sid - Obtain the SID for a network interface.
  * @name: interface name
  * @if_sid: interface SID
- * @msg_sid: default SID for received packets
  */
-int security_netif_sid(char *name,
-                      u32 *if_sid,
-                      u32 *msg_sid)
+int security_netif_sid(char *name, u32 *if_sid)
 {
        int rc = 0;
        struct ocontext *c;
@@ -1510,11 +1518,8 @@ int security_netif_sid(char *name,
                                goto out;
                }
                *if_sid = c->sid[0];
-               *msg_sid = c->sid[1];
-       } else {
+       } else
                *if_sid = SECINITSID_NETIF;
-               *msg_sid = SECINITSID_NETMSG;
-       }
 
 out:
        POLICY_RDUNLOCK;
@@ -2049,6 +2054,91 @@ out:
        return rc;
 }
 
+/**
+ * security_net_peersid_resolve - Compare and resolve two network peer SIDs
+ * @nlbl_sid: NetLabel SID
+ * @nlbl_type: NetLabel labeling protocol type
+ * @xfrm_sid: XFRM SID
+ * @peer_sid: network peer SID
+ *
+ * Description:
+ * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
+ * resolved into a single SID it is returned via @peer_sid and the function
+ * returns zero.  Otherwise @peer_sid is set to SECSID_NULL and the function
+ * returns a negative value.  A table summarizing the behavior is below:
+ *
+ *                                 | function return |    @peer_sid
+ *   ------------------------------+-----------------+-----------------
+ *   no peer labels                |        0        |    SECSID_NULL
+ *   single peer label             |        0        |    <peer_label>
+ *   multiple, consistent labels   |        0        |    <peer_label>
+ *   multiple, inconsistent labels |    -<errno>     |    SECSID_NULL
+ *
+ */
+int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
+                                u32 xfrm_sid,
+                                u32 *peer_sid)
+{
+       int rc;
+       struct context *nlbl_ctx;
+       struct context *xfrm_ctx;
+
+       /* handle the common (which also happens to be the set of easy) cases
+        * right away, these two if statements catch everything involving a
+        * single or absent peer SID/label */
+       if (xfrm_sid == SECSID_NULL) {
+               *peer_sid = nlbl_sid;
+               return 0;
+       }
+       /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
+        * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
+        * is present */
+       if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
+               *peer_sid = xfrm_sid;
+               return 0;
+       }
+
+       /* we don't need to check ss_initialized here since the only way both
+        * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the
+        * security server was initialized and ss_initialized was true */
+       if (!selinux_mls_enabled) {
+               *peer_sid = SECSID_NULL;
+               return 0;
+       }
+
+       POLICY_RDLOCK;
+
+       nlbl_ctx = sidtab_search(&sidtab, nlbl_sid);
+       if (!nlbl_ctx) {
+               printk(KERN_ERR
+                      "security_net_peersid_resolve:  unrecognized SID %d\n",
+                      nlbl_sid);
+               rc = -EINVAL;
+               goto out_slowpath;
+       }
+       xfrm_ctx = sidtab_search(&sidtab, xfrm_sid);
+       if (!xfrm_ctx) {
+               printk(KERN_ERR
+                      "security_net_peersid_resolve:  unrecognized SID %d\n",
+                      xfrm_sid);
+               rc = -EINVAL;
+               goto out_slowpath;
+       }
+       rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
+
+out_slowpath:
+       POLICY_RDUNLOCK;
+       if (rc == 0)
+               /* at present NetLabel SIDs/labels really only carry MLS
+                * information so if the MLS portion of the NetLabel SID
+                * matches the MLS portion of the labeled XFRM SID/label
+                * then pass along the XFRM SID as it is the most
+                * expressive */
+               *peer_sid = xfrm_sid;
+       else
+               *peer_sid = SECSID_NULL;
+       return rc;
+}
+
 static int get_classes_callback(void *k, void *d, void *args)
 {
        struct class_datum *datum = d;
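/*
 * Illustrative sketch -- not part of the commit above.  How a caller is
 * expected to use security_net_peersid_resolve() when a packet carried both
 * a NetLabel and a labeled-IPsec (XFRM) label.  The wrapper function is
 * hypothetical; NETLBL_NLTYPE_CIPSOV4 merely stands in for whichever
 * labeling protocol tagged the packet.
 */
static int example_resolve_peer(u32 nlbl_sid, u32 xfrm_sid, u32 *peer_sid)
{
	int rc;

	rc = security_net_peersid_resolve(nlbl_sid, NETLBL_NLTYPE_CIPSOV4,
					  xfrm_sid, peer_sid);
	if (rc)
		return rc;	/* labels disagree: *peer_sid == SECSID_NULL */

	/* *peer_sid now holds the single, authoritative peer label */
	return 0;
}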
@@ -2154,6 +2244,60 @@ int security_get_allow_unknown(void)
        return policydb.allow_unknown;
 }
 
+/**
+ * security_get_policycaps - Query the loaded policy for its capabilities
+ * @len: the number of capability bits
+ * @values: the capability bit array
+ *
+ * Description:
+ * Get an array of the policy capabilities in @values where each entry in
+ * @values is either true (1) or false (0) depending on the policy's support of
+ * that feature.  The policy capabilities are defined by the
+ * POLICYDB_CAPABILITY_* enums.  The size of the array is stored in @len and it
+ * is up to the caller to free the array in @values.  Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int security_get_policycaps(int *len, int **values)
+{
+       int rc = -ENOMEM;
+       unsigned int iter;
+
+       POLICY_RDLOCK;
+
+       *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC);
+       if (*values == NULL)
+               goto out;
+       for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++)
+               (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter);
+       *len = POLICYDB_CAPABILITY_MAX;
+       rc = 0;
+
+out:
+       POLICY_RDUNLOCK;
+       return rc;
+}
+
+/**
+ * security_policycap_supported - Check for a specific policy capability
+ * @req_cap: capability
+ *
+ * Description:
+ * This function queries the currently loaded policy to see if it supports the
+ * capability specified by @req_cap.  Returns true (1) if the capability is
+ * supported, false (0) if it isn't supported.
+ *
+ */
+int security_policycap_supported(unsigned int req_cap)
+{
+       int rc;
+
+       POLICY_RDLOCK;
+       rc = ebitmap_get_bit(&policydb.policycaps, req_cap);
+       POLICY_RDUNLOCK;
+
+       return rc;
+}
+
 struct selinux_audit_rule {
        u32 au_seqno;
        struct context au_ctxt;
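/*
 * Illustrative sketch -- not part of the commit above.  The bit read by
 * security_load_policycaps() is cached in selinux_policycap_netpeer so that
 * hot paths can branch on it without taking the policy lock.  The hook shape
 * and the SECCLASS_PEER/PEER__RECV names below are assumptions made for the
 * sake of the example.
 */
static int example_rcv_check(u32 sk_sid, u32 peer_sid)
{
	if (!selinux_policycap_netpeer)
		return 0;	/* old policy: keep the legacy compat checks */

	/* new-style check against the resolved network peer SID */
	return avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, NULL);
}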
@@ -2403,50 +2547,10 @@ void selinux_audit_set_callback(int (*callback)(void))
 }
 
 #ifdef CONFIG_NETLABEL
-/*
- * NetLabel cache structure
- */
-#define NETLBL_CACHE(x)           ((struct selinux_netlbl_cache *)(x))
-#define NETLBL_CACHE_T_NONE       0
-#define NETLBL_CACHE_T_SID        1
-#define NETLBL_CACHE_T_MLS        2
-struct selinux_netlbl_cache {
-       u32 type;
-       union {
-               u32 sid;
-               struct mls_range mls_label;
-       } data;
-};
-
-/**
- * security_netlbl_cache_free - Free the NetLabel cached data
- * @data: the data to free
- *
- * Description:
- * This function is intended to be used as the free() callback inside the
- * netlbl_lsm_cache structure.
- *
- */
-static void security_netlbl_cache_free(const void *data)
-{
-       struct selinux_netlbl_cache *cache;
-
-       if (data == NULL)
-               return;
-
-       cache = NETLBL_CACHE(data);
-       switch (cache->type) {
-       case NETLBL_CACHE_T_MLS:
-               ebitmap_destroy(&cache->data.mls_label.level[0].cat);
-               break;
-       }
-       kfree(data);
-}
-
 /**
  * security_netlbl_cache_add - Add an entry to the NetLabel cache
  * @secattr: the NetLabel packet security attributes
- * @ctx: the SELinux context
+ * @sid: the SELinux SID
  *
  * Description:
  * Attempt to cache the context in @ctx, which was derived from the packet in
@@ -2455,60 +2559,46 @@ static void security_netlbl_cache_free(const void *data)
  *
  */
 static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
-                                     struct context *ctx)
+                                     u32 sid)
 {
-       struct selinux_netlbl_cache *cache = NULL;
+       u32 *sid_cache;
 
-       secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
-       if (secattr->cache == NULL)
-               return;
-
-       cache = kzalloc(sizeof(*cache), GFP_ATOMIC);
-       if (cache == NULL)
+       sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC);
+       if (sid_cache == NULL)
                return;
-
-       cache->type = NETLBL_CACHE_T_MLS;
-       if (ebitmap_cpy(&cache->data.mls_label.level[0].cat,
-                       &ctx->range.level[0].cat) != 0) {
-               kfree(cache);
+       secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
+       if (secattr->cache == NULL) {
+               kfree(sid_cache);
                return;
        }
-       cache->data.mls_label.level[1].cat.highbit =
-               cache->data.mls_label.level[0].cat.highbit;
-       cache->data.mls_label.level[1].cat.node =
-               cache->data.mls_label.level[0].cat.node;
-       cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
-       cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
 
-       secattr->cache->free = security_netlbl_cache_free;
-       secattr->cache->data = (void *)cache;
+       *sid_cache = sid;
+       secattr->cache->free = kfree;
+       secattr->cache->data = sid_cache;
        secattr->flags |= NETLBL_SECATTR_CACHE;
 }
 
 /**
  * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
  * @secattr: the NetLabel packet security attributes
- * @base_sid: the SELinux SID to use as a context for MLS only attributes
  * @sid: the SELinux SID
  *
  * Description:
  * Convert the given NetLabel security attributes in @secattr into a
  * SELinux SID.  If the @secattr field does not contain a full SELinux
- * SID/context then use the context in @base_sid as the foundation.  If
- * possibile the 'cache' field of @secattr is set and the CACHE flag is set;
- * this is to allow the @secattr to be used by NetLabel to cache the secattr to
- * SID conversion for future lookups.  Returns zero on success, negative
- * values on failure.
+ * SID/context then use SECINITSID_NETMSG as the foundation.  If possible, the
+ * 'cache' field of @secattr is set and the CACHE flag is set; this is to
+ * allow the @secattr to be used by NetLabel to cache the secattr to SID
+ * conversion for future lookups.  Returns zero on success, negative values on
+ * failure.
  *
  */
 int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
-                                  u32 base_sid,
                                   u32 *sid)
 {
        int rc = -EIDRM;
        struct context *ctx;
        struct context ctx_new;
-       struct selinux_netlbl_cache *cache;
 
        if (!ss_initialized) {
                *sid = SECSID_NULL;
@@ -2518,40 +2608,13 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
        POLICY_RDLOCK;
 
        if (secattr->flags & NETLBL_SECATTR_CACHE) {
-               cache = NETLBL_CACHE(secattr->cache->data);
-               switch (cache->type) {
-               case NETLBL_CACHE_T_SID:
-                       *sid = cache->data.sid;
-                       rc = 0;
-                       break;
-               case NETLBL_CACHE_T_MLS:
-                       ctx = sidtab_search(&sidtab, base_sid);
-                       if (ctx == NULL)
-                               goto netlbl_secattr_to_sid_return;
-
-                       ctx_new.user = ctx->user;
-                       ctx_new.role = ctx->role;
-                       ctx_new.type = ctx->type;
-                       ctx_new.range.level[0].sens =
-                               cache->data.mls_label.level[0].sens;
-                       ctx_new.range.level[0].cat.highbit =
-                               cache->data.mls_label.level[0].cat.highbit;
-                       ctx_new.range.level[0].cat.node =
-                               cache->data.mls_label.level[0].cat.node;
-                       ctx_new.range.level[1].sens =
-                               cache->data.mls_label.level[1].sens;
-                       ctx_new.range.level[1].cat.highbit =
-                               cache->data.mls_label.level[1].cat.highbit;
-                       ctx_new.range.level[1].cat.node =
-                               cache->data.mls_label.level[1].cat.node;
-
-                       rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
-                       break;
-               default:
-                       goto netlbl_secattr_to_sid_return;
-               }
+               *sid = *(u32 *)secattr->cache->data;
+               rc = 0;
+       } else if (secattr->flags & NETLBL_SECATTR_SECID) {
+               *sid = secattr->attr.secid;
+               rc = 0;
        } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
-               ctx = sidtab_search(&sidtab, base_sid);
+               ctx = sidtab_search(&sidtab, SECINITSID_NETMSG);
                if (ctx == NULL)
                        goto netlbl_secattr_to_sid_return;
 
@@ -2561,7 +2624,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
                mls_import_netlbl_lvl(&ctx_new, secattr);
                if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
                        if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat,
-                                                 secattr->mls_cat) != 0)
+                                                 secattr->attr.mls.cat) != 0)
                                goto netlbl_secattr_to_sid_return;
                        ctx_new.range.level[1].cat.highbit =
                                ctx_new.range.level[0].cat.highbit;
@@ -2578,7 +2641,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
                if (rc != 0)
                        goto netlbl_secattr_to_sid_return_cleanup;
 
-               security_netlbl_cache_add(secattr, &ctx_new);
+               security_netlbl_cache_add(secattr, *sid);
 
                ebitmap_destroy(&ctx_new.range.level[0].cat);
        } else {
index e076039690330675f6679edd39b18f02ab8e852d..7e158205d0810894ebe783320acdfe11e5f2da38 100644 (file)
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <asm/semaphore.h>
+#include <asm/atomic.h>
 
 #include "avc.h"
 #include "objsec.h"
 #include "xfrm.h"
 
+/* Labeled XFRM instance counter */
+atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0);
 
 /*
  * Returns true if an LSM/SELinux context
@@ -293,6 +296,9 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp,
        BUG_ON(!uctx);
 
        err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0);
+       if (err == 0)
+               atomic_inc(&selinux_xfrm_refcount);
+
        return err;
 }
 
@@ -340,10 +346,13 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
        struct xfrm_sec_ctx *ctx = xp->security;
        int rc = 0;
 
-       if (ctx)
+       if (ctx) {
                rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
                                  SECCLASS_ASSOCIATION,
                                  ASSOCIATION__SETCONTEXT, NULL);
+               if (rc == 0)
+                       atomic_dec(&selinux_xfrm_refcount);
+       }
 
        return rc;
 }
@@ -360,6 +369,8 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct
        BUG_ON(!x);
 
        err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid);
+       if (err == 0)
+               atomic_inc(&selinux_xfrm_refcount);
        return err;
 }
 
@@ -382,10 +393,13 @@ int selinux_xfrm_state_delete(struct xfrm_state *x)
        struct xfrm_sec_ctx *ctx = x->security;
        int rc = 0;
 
-       if (ctx)
+       if (ctx) {
                rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
                                  SECCLASS_ASSOCIATION,
                                  ASSOCIATION__SETCONTEXT, NULL);
+               if (rc == 0)
+                       atomic_dec(&selinux_xfrm_refcount);
+       }
 
        return rc;
 }
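/*
 * Illustrative sketch -- not part of the commit above.  The counter lets
 * other SELinux code ask "is labeled IPsec configured at all?" without
 * walking the XFRM policy/state databases; a helper along these lines (the
 * name is an assumption) is its natural consumer.
 */
static inline int selinux_xfrm_enabled(void)
{
	return (atomic_read(&selinux_xfrm_refcount) > 0);
}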
index b4a38a3d855bc89d46ad11ec2e7f80f030ad7c3f..4bb97646a67abebd031206bcfef3789b870f9f39 100644 (file)
@@ -711,11 +711,13 @@ static void snd_intel8x0_setup_periods(struct intel8x0 *chip, struct ichdev *ich
 static void fill_nocache(void *buf, int size, int nocache)
 {
        size = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       change_page_attr(virt_to_page(buf), size, nocache ? PAGE_KERNEL_NOCACHE : PAGE_KERNEL);
-       global_flush_tlb();
+       if (nocache)
+               set_pages_uc(virt_to_page(buf), size);
+       else
+               set_pages_wb(virt_to_page(buf), size);
 }
 #else
-#define fill_nocache(buf,size,nocache)
+#define fill_nocache(buf, size, nocache) do { ; } while (0)
 #endif
 
 /*
similarity index 83%
rename from drivers/kvm/ioapic.c
rename to virt/kvm/ioapic.c
index c7992e667fdbed4ce2bae533f7c97d5e72e16d6a..317f8e211cd2a136f6a07c8333731dadd0389cbe 100644 (file)
@@ -26,7 +26,7 @@
  *  Based on Xen 3.1 code.
  */
 
-#include "kvm.h"
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
 #include <linux/io.h>
 #include <asm/processor.h>
-#include <asm/msr.h>
 #include <asm/page.h>
 #include <asm/current.h>
-#include <asm/apicdef.h>
-#include <asm/io_apic.h>
-#include "irq.h"
-/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
 #define ioapic_debug(fmt, arg...)
+#endif
 static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
 
 static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
        default:
                index = (ioapic->ioregsel - 0x10) >> 1;
 
-               ioapic_debug("change redir index %x val %x", index, val);
+               ioapic_debug("change redir index %x val %x\n", index, val);
                if (index >= IOAPIC_NUM_PINS)
                        return;
                if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 }
 
 static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
-                          struct kvm_lapic *target,
+                          struct kvm_vcpu *vcpu,
                           u8 vector, u8 trig_mode, u8 delivery_mode)
 {
-       ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
+       ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
                     delivery_mode);
 
-       ASSERT((delivery_mode == dest_Fixed) ||
-              (delivery_mode == dest_LowestPrio));
+       ASSERT((delivery_mode == IOAPIC_FIXED) ||
+              (delivery_mode == IOAPIC_LOWEST_PRIORITY));
 
-       kvm_apic_set_irq(target, vector, trig_mode);
+       kvm_apic_set_irq(vcpu, vector, trig_mode);
 }
 
 static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
        struct kvm *kvm = ioapic->kvm;
        struct kvm_vcpu *vcpu;
 
-       ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
+       ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
 
        if (dest_mode == 0) {   /* Physical mode. */
                if (dest == 0xFF) {     /* Broadcast. */
                        for (i = 0; i < KVM_MAX_VCPUS; ++i)
-                               if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
+                               if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
                                        mask |= 1 << i;
                        return mask;
                }
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
                        vcpu = kvm->vcpus[i];
                        if (!vcpu)
                                continue;
-                       if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
-                               if (vcpu->apic)
+                       if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
+                               if (vcpu->arch.apic)
                                        mask = 1 << i;
                                break;
                        }
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
                        vcpu = kvm->vcpus[i];
                        if (!vcpu)
                                continue;
-                       if (vcpu->apic &&
-                           kvm_apic_match_logical_addr(vcpu->apic, dest))
+                       if (vcpu->arch.apic &&
+                           kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
                                mask |= 1 << vcpu->vcpu_id;
                }
-       ioapic_debug("mask %x", mask);
+       ioapic_debug("mask %x\n", mask);
        return mask;
 }
 
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
        u8 vector = ioapic->redirtbl[irq].fields.vector;
        u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
        u32 deliver_bitmask;
-       struct kvm_lapic *target;
        struct kvm_vcpu *vcpu;
        int vcpu_id;
 
        ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
-                    "vector=%x trig_mode=%x",
+                    "vector=%x trig_mode=%x\n",
                     dest, dest_mode, delivery_mode, vector, trig_mode);
 
        deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
        if (!deliver_bitmask) {
-               ioapic_debug("no target on destination");
+               ioapic_debug("no target on destination\n");
                return;
        }
 
        switch (delivery_mode) {
-       case dest_LowestPrio:
-               target =
-                   kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
-               if (target != NULL)
-                       ioapic_inj_irq(ioapic, target, vector,
+       case IOAPIC_LOWEST_PRIORITY:
+               vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+                               deliver_bitmask);
+               if (vcpu != NULL)
+                       ioapic_inj_irq(ioapic, vcpu, vector,
                                       trig_mode, delivery_mode);
                else
-                       ioapic_debug("null round robin: "
-                                    "mask=%x vector=%x delivery_mode=%x",
-                                    deliver_bitmask, vector, dest_LowestPrio);
+                       ioapic_debug("null lowest prio vcpu: "
+                                    "mask=%x vector=%x delivery_mode=%x\n",
+                                    deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
                break;
-       case dest_Fixed:
+       case IOAPIC_FIXED:
                for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
                        if (!(deliver_bitmask & (1 << vcpu_id)))
                                continue;
                        deliver_bitmask &= ~(1 << vcpu_id);
                        vcpu = ioapic->kvm->vcpus[vcpu_id];
                        if (vcpu) {
-                               target = vcpu->apic;
-                               ioapic_inj_irq(ioapic, target, vector,
+                               ioapic_inj_irq(ioapic, vcpu, vector,
                                               trig_mode, delivery_mode);
                        }
                }
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
 
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
 {
-       struct kvm_ioapic *ioapic = kvm->vioapic;
+       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
        union ioapic_redir_entry *ent;
        int gsi;
 
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
        struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
        u32 result;
 
-       ioapic_debug("addr %lx", (unsigned long)addr);
+       ioapic_debug("addr %lx\n", (unsigned long)addr);
        ASSERT(!(addr & 0xf));  /* check alignment */
 
        addr &= 0xff;
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
        struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
        u32 data;
 
-       ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
-                    addr, len, val);
+       ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+                    (void*)addr, len, val);
        ASSERT(!(addr & 0xf));  /* check alignment */
        if (len == 4 || len == 8)
                data = *(u32 *) val;
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
        case IOAPIC_REG_WINDOW:
                ioapic_write_indirect(ioapic, data);
                break;
+#ifdef CONFIG_IA64
+       case IOAPIC_REG_EOI:
+               kvm_ioapic_update_eoi(ioapic->kvm, data);
+               break;
+#endif
 
        default:
                break;
        }
 }
 
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+       int i;
+
+       for (i = 0; i < IOAPIC_NUM_PINS; i++)
+               ioapic->redirtbl[i].fields.mask = 1;
+       ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+       ioapic->ioregsel = 0;
+       ioapic->irr = 0;
+       ioapic->id = 0;
+}
+
 int kvm_ioapic_init(struct kvm *kvm)
 {
        struct kvm_ioapic *ioapic;
-       int i;
 
        ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
        if (!ioapic)
                return -ENOMEM;
-       kvm->vioapic = ioapic;
-       for (i = 0; i < IOAPIC_NUM_PINS; i++)
-               ioapic->redirtbl[i].fields.mask = 1;
-       ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+       kvm->arch.vioapic = ioapic;
+       kvm_ioapic_reset(ioapic);
        ioapic->dev.read = ioapic_mmio_read;
        ioapic->dev.write = ioapic_mmio_write;
        ioapic->dev.in_range = ioapic_in_range;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644 (file)
index 0000000..7f16675
--- /dev/null
@@ -0,0 +1,95 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG  0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS  0xfec00000
+#define IOAPIC_MEM_LENGTH            0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT  0x00
+#define IOAPIC_REG_WINDOW  0x10
+#define IOAPIC_REG_EOI     0x40        /* IA64 IOSAPIC only */
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00        /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID  0x02        /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define        IOAPIC_FIXED                    0x0
+#define        IOAPIC_LOWEST_PRIORITY          0x1
+#define        IOAPIC_PMI                      0x2
+#define        IOAPIC_NMI                      0x4
+#define        IOAPIC_INIT                     0x5
+#define        IOAPIC_EXTINT                   0x7
+
+struct kvm_ioapic {
+       u64 base_address;
+       u32 ioregsel;
+       u32 id;
+       u32 irr;
+       u32 pad;
+       union ioapic_redir_entry {
+               u64 bits;
+               struct {
+                       u8 vector;
+                       u8 delivery_mode:3;
+                       u8 dest_mode:1;
+                       u8 delivery_status:1;
+                       u8 polarity:1;
+                       u8 remote_irr:1;
+                       u8 trig_mode:1;
+                       u8 mask:1;
+                       u8 reserve:7;
+                       u8 reserved[4];
+                       u8 dest_id;
+               } fields;
+       } redirtbl[IOAPIC_NUM_PINS];
+       struct kvm_io_device dev;
+       struct kvm *kvm;
+};
+
+#ifdef DEBUG
+#define ASSERT(x)                                                      \
+do {                                                                   \
+       if (!(x)) {                                                     \
+               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
+                      __FILE__, __LINE__, #x);                         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+       return kvm->arch.vioapic;
+}
+
+#ifdef CONFIG_IA64
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return 1;
+}
+#endif
+
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+                                      unsigned long bitmap);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+
+#endif
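/*
 * Illustrative sketch -- not part of the new header above.  How the
 * ioapic_redir_entry union is meant to be used: the same 64-bit redirection
 * register can be filled as raw bits (the two 32-bit window writes in
 * ioapic_write_indirect) or through the named bitfields.  The values below
 * are arbitrary.
 */
static void example_fill_entry(union ioapic_redir_entry *e)
{
	e->bits = 0;
	e->fields.vector        = 0x31;              /* interrupt vector    */
	e->fields.delivery_mode = IOAPIC_FIXED;      /* fixed delivery      */
	e->fields.trig_mode     = IOAPIC_LEVEL_TRIG; /* level triggered     */
	e->fields.mask          = 1;                 /* start masked        */
	e->fields.dest_id       = 0;                 /* destination APIC ID */
}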
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644 (file)
index 0000000..c14e642
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+
+struct kvm_io_device {
+       void (*read)(struct kvm_io_device *this,
+                    gpa_t addr,
+                    int len,
+                    void *val);
+       void (*write)(struct kvm_io_device *this,
+                     gpa_t addr,
+                     int len,
+                     const void *val);
+       int (*in_range)(struct kvm_io_device *this, gpa_t addr);
+       void (*destructor)(struct kvm_io_device *this);
+
+       void             *private;
+};
+
+static inline void kvm_iodevice_read(struct kvm_io_device *dev,
+                                    gpa_t addr,
+                                    int len,
+                                    void *val)
+{
+       dev->read(dev, addr, len, val);
+}
+
+static inline void kvm_iodevice_write(struct kvm_io_device *dev,
+                                     gpa_t addr,
+                                     int len,
+                                     const void *val)
+{
+       dev->write(dev, addr, len, val);
+}
+
+static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
+{
+       return dev->in_range(dev, addr);
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+       if (dev->destructor)
+               dev->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
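/*
 * Illustrative sketch -- not part of the new header above.  A minimal
 * in-kernel MMIO device written against the callback interface, in the same
 * style as the ioapic wiring earlier in this diff (dev.read/dev.write/
 * dev.in_range).  The device, its base address and length are invented for
 * the example, and hooking it onto the VM's mmio_bus is not shown.
 */
#define DEMO_DEV_BASE 0xfed00000UL
#define DEMO_DEV_LEN  0x10

static u32 demo_dev_reg;

static void demo_dev_read(struct kvm_io_device *this, gpa_t addr, int len,
			  void *val)
{
	*(u32 *)val = demo_dev_reg;	/* addr/len ignored for brevity */
}

static void demo_dev_write(struct kvm_io_device *this, gpa_t addr, int len,
			   const void *val)
{
	demo_dev_reg = *(const u32 *)val;
}

static int demo_dev_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr >= DEMO_DEV_BASE && addr < DEMO_DEV_BASE + DEMO_DEV_LEN;
}

static struct kvm_io_device demo_dev = {
	.read     = demo_dev_read,
	.write    = demo_dev_write,
	.in_range = demo_dev_in_range,
};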
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644 (file)
index 0000000..3c4fe26
--- /dev/null
@@ -0,0 +1,1400 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/smp.h>
+#include <linux/anon_inodes.h>
+#include <linux/profile.h>
+#include <linux/kvm_para.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+DEFINE_SPINLOCK(kvm_lock);
+LIST_HEAD(vm_list);
+
+static cpumask_t cpus_hardware_enabled;
+
+struct kmem_cache *kvm_vcpu_cache;
+EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+
+static __read_mostly struct preempt_ops kvm_preempt_ops;
+
+static struct dentry *debugfs_dir;
+
+static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+                          unsigned long arg);
+
+static inline int valid_vcpu(int n)
+{
+       return likely(n >= 0 && n < KVM_MAX_VCPUS);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put()
+ */
+void vcpu_load(struct kvm_vcpu *vcpu)
+{
+       int cpu;
+
+       mutex_lock(&vcpu->mutex);
+       cpu = get_cpu();
+       preempt_notifier_register(&vcpu->preempt_notifier);
+       kvm_arch_vcpu_load(vcpu, cpu);
+       put_cpu();
+}
+
+void vcpu_put(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       kvm_arch_vcpu_put(vcpu);
+       preempt_notifier_unregister(&vcpu->preempt_notifier);
+       preempt_enable();
+       mutex_unlock(&vcpu->mutex);
+}
+
+static void ack_flush(void *_completed)
+{
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+       int i, cpu;
+       cpumask_t cpus;
+       struct kvm_vcpu *vcpu;
+
+       cpus_clear(cpus);
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               vcpu = kvm->vcpus[i];
+               if (!vcpu)
+                       continue;
+               if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                       continue;
+               cpu = vcpu->cpu;
+               if (cpu != -1 && cpu != raw_smp_processor_id())
+                       cpu_set(cpu, cpus);
+       }
+       if (cpus_empty(cpus))
+               return;
+       ++kvm->stat.remote_tlb_flush;
+       smp_call_function_mask(cpus, ack_flush, NULL, 1);
+}
+
+int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
+       struct page *page;
+       int r;
+
+       mutex_init(&vcpu->mutex);
+       vcpu->cpu = -1;
+       vcpu->kvm = kvm;
+       vcpu->vcpu_id = id;
+       init_waitqueue_head(&vcpu->wq);
+
+       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page) {
+               r = -ENOMEM;
+               goto fail;
+       }
+       vcpu->run = page_address(page);
+
+       r = kvm_arch_vcpu_init(vcpu);
+       if (r < 0)
+               goto fail_free_run;
+       return 0;
+
+fail_free_run:
+       free_page((unsigned long)vcpu->run);
+fail:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+
+void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+       kvm_arch_vcpu_uninit(vcpu);
+       free_page((unsigned long)vcpu->run);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+static struct kvm *kvm_create_vm(void)
+{
+       struct kvm *kvm = kvm_arch_create_vm();
+
+       if (IS_ERR(kvm))
+               goto out;
+
+       kvm->mm = current->mm;
+       atomic_inc(&kvm->mm->mm_count);
+       spin_lock_init(&kvm->mmu_lock);
+       kvm_io_bus_init(&kvm->pio_bus);
+       mutex_init(&kvm->lock);
+       kvm_io_bus_init(&kvm->mmio_bus);
+       spin_lock(&kvm_lock);
+       list_add(&kvm->vm_list, &vm_list);
+       spin_unlock(&kvm_lock);
+out:
+       return kvm;
+}
+
+/*
+ * Free any memory in @free but not in @dont.
+ */
+static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
+                                 struct kvm_memory_slot *dont)
+{
+       if (!dont || free->rmap != dont->rmap)
+               vfree(free->rmap);
+
+       if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
+               vfree(free->dirty_bitmap);
+
+       free->npages = 0;
+       free->dirty_bitmap = NULL;
+       free->rmap = NULL;
+}
+
+void kvm_free_physmem(struct kvm *kvm)
+{
+       int i;
+
+       for (i = 0; i < kvm->nmemslots; ++i)
+               kvm_free_physmem_slot(&kvm->memslots[i], NULL);
+}
+
+static void kvm_destroy_vm(struct kvm *kvm)
+{
+       struct mm_struct *mm = kvm->mm;
+
+       spin_lock(&kvm_lock);
+       list_del(&kvm->vm_list);
+       spin_unlock(&kvm_lock);
+       kvm_io_bus_destroy(&kvm->pio_bus);
+       kvm_io_bus_destroy(&kvm->mmio_bus);
+       kvm_arch_destroy_vm(kvm);
+       mmdrop(mm);
+}
+
+static int kvm_vm_release(struct inode *inode, struct file *filp)
+{
+       struct kvm *kvm = filp->private_data;
+
+       kvm_destroy_vm(kvm);
+       return 0;
+}
+
+/*
+ * Allocate some memory and give it an address in the guest physical address
+ * space.
+ *
+ * Discontiguous memory is allowed, mostly for framebuffers.
+ *
+ * Must be called holding mmap_sem for write.
+ */
+int __kvm_set_memory_region(struct kvm *kvm,
+                           struct kvm_userspace_memory_region *mem,
+                           int user_alloc)
+{
+       int r;
+       gfn_t base_gfn;
+       unsigned long npages;
+       unsigned long i;
+       struct kvm_memory_slot *memslot;
+       struct kvm_memory_slot old, new;
+
+       r = -EINVAL;
+       /* General sanity checks */
+       if (mem->memory_size & (PAGE_SIZE - 1))
+               goto out;
+       if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+               goto out;
+       if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+               goto out;
+       if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+               goto out;
+
+       memslot = &kvm->memslots[mem->slot];
+       base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+       npages = mem->memory_size >> PAGE_SHIFT;
+
+       if (!npages)
+               mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+       new = old = *memslot;
+
+       new.base_gfn = base_gfn;
+       new.npages = npages;
+       new.flags = mem->flags;
+
+       /* Disallow changing a memory slot's size. */
+       r = -EINVAL;
+       if (npages && old.npages && npages != old.npages)
+               goto out_free;
+
+       /* Check for overlaps */
+       r = -EEXIST;
+       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+               struct kvm_memory_slot *s = &kvm->memslots[i];
+
+               if (s == memslot)
+                       continue;
+               if (!((base_gfn + npages <= s->base_gfn) ||
+                     (base_gfn >= s->base_gfn + s->npages)))
+                       goto out_free;
+       }
+
+       /* Free page dirty bitmap if unneeded */
+       if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+               new.dirty_bitmap = NULL;
+
+       r = -ENOMEM;
+
+       /* Allocate if a slot is being created */
+       if (npages && !new.rmap) {
+               new.rmap = vmalloc(npages * sizeof(struct page *));
+
+               if (!new.rmap)
+                       goto out_free;
+
+               memset(new.rmap, 0, npages * sizeof(*new.rmap));
+
+               new.user_alloc = user_alloc;
+               new.userspace_addr = mem->userspace_addr;
+       }
+
+       /* Allocate page dirty bitmap if needed */
+       if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+               unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
+
+               new.dirty_bitmap = vmalloc(dirty_bytes);
+               if (!new.dirty_bitmap)
+                       goto out_free;
+               memset(new.dirty_bitmap, 0, dirty_bytes);
+       }
+
+       if (mem->slot >= kvm->nmemslots)
+               kvm->nmemslots = mem->slot + 1;
+
+       *memslot = new;
+
+       r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
+       if (r) {
+               *memslot = old;
+               goto out_free;
+       }
+
+       kvm_free_physmem_slot(&old, &new);
+       return 0;
+
+out_free:
+       kvm_free_physmem_slot(&new, &old);
+out:
+       return r;
+
+}
+EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+
+int kvm_set_memory_region(struct kvm *kvm,
+                         struct kvm_userspace_memory_region *mem,
+                         int user_alloc)
+{
+       int r;
+
+       down_write(&current->mm->mmap_sem);
+       r = __kvm_set_memory_region(kvm, mem, user_alloc);
+       up_write(&current->mm->mmap_sem);
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+
+int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+                                  struct kvm_userspace_memory_region *mem,
+                                  int user_alloc)
+{
+       if (mem->slot >= KVM_MEMORY_SLOTS)
+               return -EINVAL;
+       return kvm_set_memory_region(kvm, mem, user_alloc);
+}
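/*
 * Illustrative sketch -- not part of the commit above.  The userspace side
 * of the slot machinery: a VMM backs guest physical memory with an anonymous
 * mapping and hands it to the kernel through KVM_SET_USER_MEMORY_REGION on
 * the VM fd.  Error handling is omitted; the 16 MiB size and the addresses
 * are arbitrary.
 */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static void demo_set_region(int vm_fd)
{
	size_t size = 16 << 20;
	void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0x100000,
		.memory_size     = size,
		.userspace_addr  = (unsigned long)mem,
	};

	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}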
+
+int kvm_get_dirty_log(struct kvm *kvm,
+                       struct kvm_dirty_log *log, int *is_dirty)
+{
+       struct kvm_memory_slot *memslot;
+       int r, i;
+       int n;
+       unsigned long any = 0;
+
+       r = -EINVAL;
+       if (log->slot >= KVM_MEMORY_SLOTS)
+               goto out;
+
+       memslot = &kvm->memslots[log->slot];
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
+               goto out;
+
+       n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+
+       for (i = 0; !any && i < n/sizeof(long); ++i)
+               any = memslot->dirty_bitmap[i];
+
+       r = -EFAULT;
+       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+               goto out;
+
+       if (any)
+               *is_dirty = 1;
+
+       r = 0;
+out:
+       return r;
+}
+
+int is_error_page(struct page *page)
+{
+       return page == bad_page;
+}
+EXPORT_SYMBOL_GPL(is_error_page);
+
+static inline unsigned long bad_hva(void)
+{
+       return PAGE_OFFSET;
+}
+
+int kvm_is_error_hva(unsigned long addr)
+{
+       return addr == bad_hva();
+}
+EXPORT_SYMBOL_GPL(kvm_is_error_hva);
+
+static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+       int i;
+
+       for (i = 0; i < kvm->nmemslots; ++i) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+
+               if (gfn >= memslot->base_gfn
+                   && gfn < memslot->base_gfn + memslot->npages)
+                       return memslot;
+       }
+       return NULL;
+}
+
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+{
+       gfn = unalias_gfn(kvm, gfn);
+       return __gfn_to_memslot(kvm, gfn);
+}
+
+int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+{
+       int i;
+
+       gfn = unalias_gfn(kvm, gfn);
+       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+
+               if (gfn >= memslot->base_gfn
+                   && gfn < memslot->base_gfn + memslot->npages)
+                       return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+
+static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *slot;
+
+       gfn = unalias_gfn(kvm, gfn);
+       slot = __gfn_to_memslot(kvm, gfn);
+       if (!slot)
+               return bad_hva();
+       return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
+}
+
+/*
+ * Requires current->mm->mmap_sem to be held
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+{
+       struct page *page[1];
+       unsigned long addr;
+       int npages;
+
+       might_sleep();
+
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr)) {
+               get_page(bad_page);
+               return bad_page;
+       }
+
+       npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
+                               NULL);
+
+       if (npages != 1) {
+               get_page(bad_page);
+               return bad_page;
+       }
+
+       return page[0];
+}
+
+EXPORT_SYMBOL_GPL(gfn_to_page);
+
+void kvm_release_page_clean(struct page *page)
+{
+       put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+
+void kvm_release_page_dirty(struct page *page)
+{
+       if (!PageReserved(page))
+               SetPageDirty(page);
+       put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+
+static int next_segment(unsigned long len, int offset)
+{
+       if (len > PAGE_SIZE - offset)
+               return PAGE_SIZE - offset;
+       else
+               return len;
+}
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+                       int len)
+{
+       int r;
+       unsigned long addr;
+
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
+               return -EFAULT;
+       r = copy_from_user(data, (void __user *)addr + offset, len);
+       if (r)
+               return -EFAULT;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+
+int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int seg;
+       int offset = offset_in_page(gpa);
+       int ret;
+
+       while ((seg = next_segment(len, offset)) != 0) {
+               ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
+               if (ret < 0)
+                       return ret;
+               offset = 0;
+               len -= seg;
+               data += seg;
+               ++gfn;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest);
+
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+                         unsigned long len)
+{
+       int r;
+       unsigned long addr;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int offset = offset_in_page(gpa);
+
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
+               return -EFAULT;
+       r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+       if (r)
+               return -EFAULT;
+       return 0;
+}
+EXPORT_SYMBOL(kvm_read_guest_atomic);
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
+                        int offset, int len)
+{
+       int r;
+       unsigned long addr;
+
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
+               return -EFAULT;
+       r = copy_to_user((void __user *)addr + offset, data, len);
+       if (r)
+               return -EFAULT;
+       mark_page_dirty(kvm, gfn);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+
+int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+                   unsigned long len)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int seg;
+       int offset = offset_in_page(gpa);
+       int ret;
+
+       while ((seg = next_segment(len, offset)) != 0) {
+               ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
+               if (ret < 0)
+                       return ret;
+               offset = 0;
+               len -= seg;
+               data += seg;
+               ++gfn;
+       }
+       return 0;
+}
+
+int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+{
+       return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+
+int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+{
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       int seg;
+       int offset = offset_in_page(gpa);
+       int ret;
+
+       while ((seg = next_segment(len, offset)) != 0) {
+               ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+               if (ret < 0)
+                       return ret;
+               offset = 0;
+               len -= seg;
+               ++gfn;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_guest);
+
+void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+{
+       struct kvm_memory_slot *memslot;
+
+       gfn = unalias_gfn(kvm, gfn);
+       memslot = __gfn_to_memslot(kvm, gfn);
+       if (memslot && memslot->dirty_bitmap) {
+               unsigned long rel_gfn = gfn - memslot->base_gfn;
+
+               /* avoid RMW */
+               if (!test_bit(rel_gfn, memslot->dirty_bitmap))
+                       set_bit(rel_gfn, memslot->dirty_bitmap);
+       }
+}
+
+/*
+ * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ */
+void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       add_wait_queue(&vcpu->wq, &wait);
+
+       /*
+        * We will block until either an interrupt or a signal wakes us up
+        */
+       while (!kvm_cpu_has_interrupt(vcpu)
+              && !signal_pending(current)
+              && !kvm_arch_vcpu_runnable(vcpu)) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               vcpu_put(vcpu);
+               schedule();
+               vcpu_load(vcpu);
+       }
+
+       __set_current_state(TASK_RUNNING);
+       remove_wait_queue(&vcpu->wq, &wait);
+}
+
+void kvm_resched(struct kvm_vcpu *vcpu)
+{
+       if (!need_resched())
+               return;
+       cond_resched();
+}
+EXPORT_SYMBOL_GPL(kvm_resched);
+
+static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvm_vcpu *vcpu = vma->vm_file->private_data;
+       struct page *page;
+
+       if (vmf->pgoff == 0)
+               page = virt_to_page(vcpu->run);
+       else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+               page = virt_to_page(vcpu->arch.pio_data);
+       else
+               return VM_FAULT_SIGBUS;
+       get_page(page);
+       vmf->page = page;
+       return 0;
+}
+
+static struct vm_operations_struct kvm_vcpu_vm_ops = {
+       .fault = kvm_vcpu_fault,
+};
+
+static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_ops = &kvm_vcpu_vm_ops;
+       return 0;
+}
+
+static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+
+       fput(vcpu->kvm->filp);
+       return 0;
+}
+
+static struct file_operations kvm_vcpu_fops = {
+       .release        = kvm_vcpu_release,
+       .unlocked_ioctl = kvm_vcpu_ioctl,
+       .compat_ioctl   = kvm_vcpu_ioctl,
+       .mmap           = kvm_vcpu_mmap,
+};
+
+/*
+ * Allocates an inode for the vcpu.
+ */
+static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+{
+       int fd, r;
+       struct inode *inode;
+       struct file *file;
+
+       r = anon_inode_getfd(&fd, &inode, &file,
+                            "kvm-vcpu", &kvm_vcpu_fops, vcpu);
+       if (r)
+               return r;
+       atomic_inc(&vcpu->kvm->filp->f_count);
+       return fd;
+}
+
+/*
+ * Creates some virtual cpus.  Good luck creating more than one.
+ */
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
+{
+       int r;
+       struct kvm_vcpu *vcpu;
+
+       if (!valid_vcpu(n))
+               return -EINVAL;
+
+       vcpu = kvm_arch_vcpu_create(kvm, n);
+       if (IS_ERR(vcpu))
+               return PTR_ERR(vcpu);
+
+       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+
+       r = kvm_arch_vcpu_setup(vcpu);
+       if (r)
+               goto vcpu_destroy;
+
+       mutex_lock(&kvm->lock);
+       if (kvm->vcpus[n]) {
+               r = -EEXIST;
+               mutex_unlock(&kvm->lock);
+               goto vcpu_destroy;
+       }
+       kvm->vcpus[n] = vcpu;
+       mutex_unlock(&kvm->lock);
+
+       /* Now it's all set up, let userspace reach it */
+       r = create_vcpu_fd(vcpu);
+       if (r < 0)
+               goto unlink;
+       return r;
+
+unlink:
+       mutex_lock(&kvm->lock);
+       kvm->vcpus[n] = NULL;
+       mutex_unlock(&kvm->lock);
+vcpu_destroy:
+       kvm_arch_vcpu_destroy(vcpu);
+       return r;
+}
+
+static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+{
+       if (sigset) {
+               sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+               vcpu->sigset_active = 1;
+               vcpu->sigset = *sigset;
+       } else
+               vcpu->sigset_active = 0;
+       return 0;
+}
+
+static long kvm_vcpu_ioctl(struct file *filp,
+                          unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r;
+
+       if (vcpu->kvm->mm != current->mm)
+               return -EIO;
+       switch (ioctl) {
+       case KVM_RUN:
+               r = -EINVAL;
+               if (arg)
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+               break;
+       case KVM_GET_REGS: {
+               struct kvm_regs kvm_regs;
+
+               memset(&kvm_regs, 0, sizeof kvm_regs);
+               r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_REGS: {
+               struct kvm_regs kvm_regs;
+
+               r = -EFAULT;
+               if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_GET_SREGS: {
+               struct kvm_sregs kvm_sregs;
+
+               memset(&kvm_sregs, 0, sizeof kvm_sregs);
+               r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SREGS: {
+               struct kvm_sregs kvm_sregs;
+
+               r = -EFAULT;
+               if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_TRANSLATE: {
+               struct kvm_translation tr;
+
+               r = -EFAULT;
+               if (copy_from_user(&tr, argp, sizeof tr))
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &tr, sizeof tr))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_DEBUG_GUEST: {
+               struct kvm_debug_guest dbg;
+
+               r = -EFAULT;
+               if (copy_from_user(&dbg, argp, sizeof dbg))
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_SIGNAL_MASK: {
+               struct kvm_signal_mask __user *sigmask_arg = argp;
+               struct kvm_signal_mask kvm_sigmask;
+               sigset_t sigset, *p;
+
+               p = NULL;
+               if (argp) {
+                       r = -EFAULT;
+                       if (copy_from_user(&kvm_sigmask, argp,
+                                          sizeof kvm_sigmask))
+                               goto out;
+                       r = -EINVAL;
+                       if (kvm_sigmask.len != sizeof sigset)
+                               goto out;
+                       r = -EFAULT;
+                       if (copy_from_user(&sigset, sigmask_arg->sigset,
+                                          sizeof sigset))
+                               goto out;
+                       p = &sigset;
+               }
+               r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+               break;
+       }
+       case KVM_GET_FPU: {
+               struct kvm_fpu fpu;
+
+               memset(&fpu, 0, sizeof fpu);
+               r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &fpu, sizeof fpu))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_FPU: {
+               struct kvm_fpu fpu;
+
+               r = -EFAULT;
+               if (copy_from_user(&fpu, argp, sizeof fpu))
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
+       default:
+               r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+       }
+out:
+       return r;
+}
+
+static long kvm_vm_ioctl(struct file *filp,
+                          unsigned int ioctl, unsigned long arg)
+{
+       struct kvm *kvm = filp->private_data;
+       void __user *argp = (void __user *)arg;
+       int r;
+
+       if (kvm->mm != current->mm)
+               return -EIO;
+       switch (ioctl) {
+       case KVM_CREATE_VCPU:
+               r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+               if (r < 0)
+                       goto out;
+               break;
+       case KVM_SET_USER_MEMORY_REGION: {
+               struct kvm_userspace_memory_region kvm_userspace_mem;
+
+               r = -EFAULT;
+               if (copy_from_user(&kvm_userspace_mem, argp,
+                                               sizeof kvm_userspace_mem))
+                       goto out;
+
+               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_GET_DIRTY_LOG: {
+               struct kvm_dirty_log log;
+
+               r = -EFAULT;
+               if (copy_from_user(&log, argp, sizeof log))
+                       goto out;
+               r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+               if (r)
+                       goto out;
+               break;
+       }
+       default:
+               r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+       }
+out:
+       return r;
+}
+
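+/*
+ * Fault handler for mmap() of a VM fd: the page offset is interpreted as
+ * a guest frame number and the backing page is handed to the VMA, or
+ * SIGBUS if the gfn is not covered by a visible memory slot.
+ */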
+static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvm *kvm = vma->vm_file->private_data;
+       struct page *page;
+
+       if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+               return VM_FAULT_SIGBUS;
+       page = gfn_to_page(kvm, vmf->pgoff);
+       if (is_error_page(page)) {
+               kvm_release_page_clean(page);
+               return VM_FAULT_SIGBUS;
+       }
+       vmf->page = page;
+       return 0;
+}
+
+static struct vm_operations_struct kvm_vm_vm_ops = {
+       .fault = kvm_vm_fault,
+};
+
+static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_ops = &kvm_vm_vm_ops;
+       return 0;
+}
+
+static struct file_operations kvm_vm_fops = {
+       .release        = kvm_vm_release,
+       .unlocked_ioctl = kvm_vm_ioctl,
+       .compat_ioctl   = kvm_vm_ioctl,
+       .mmap           = kvm_vm_mmap,
+};
+
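+/*
+ * Create a VM and bind it to a new anonymous-inode file backed by
+ * kvm_vm_fops; on success the new fd is returned to userspace.
+ */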
+static int kvm_dev_ioctl_create_vm(void)
+{
+       int fd, r;
+       struct inode *inode;
+       struct file *file;
+       struct kvm *kvm;
+
+       kvm = kvm_create_vm();
+       if (IS_ERR(kvm))
+               return PTR_ERR(kvm);
+       r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
+       if (r) {
+               kvm_destroy_vm(kvm);
+               return r;
+       }
+
+       kvm->filp = file;
+
+       return fd;
+}
+
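+/*
+ * ioctls on /dev/kvm itself: API version, extension queries, vcpu mmap
+ * size and VM creation.  Everything else goes to kvm_arch_dev_ioctl().
+ */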
+static long kvm_dev_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       long r = -EINVAL;
+
+       switch (ioctl) {
+       case KVM_GET_API_VERSION:
+               r = -EINVAL;
+               if (arg)
+                       goto out;
+               r = KVM_API_VERSION;
+               break;
+       case KVM_CREATE_VM:
+               r = -EINVAL;
+               if (arg)
+                       goto out;
+               r = kvm_dev_ioctl_create_vm();
+               break;
+       case KVM_CHECK_EXTENSION:
+               r = kvm_dev_ioctl_check_extension((long)argp);
+               break;
+       case KVM_GET_VCPU_MMAP_SIZE:
+               r = -EINVAL;
+               if (arg)
+                       goto out;
+               r = 2 * PAGE_SIZE;
+               break;
+       default:
+               return kvm_arch_dev_ioctl(filp, ioctl, arg);
+       }
+out:
+       return r;
+}
+
+static struct file_operations kvm_chardev_ops = {
+       .unlocked_ioctl = kvm_dev_ioctl,
+       .compat_ioctl   = kvm_dev_ioctl,
+};
+
+static struct miscdevice kvm_dev = {
+       KVM_MINOR,
+       "kvm",
+       &kvm_chardev_ops,
+};
+
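+/*
+ * Enable/disable the virtualization extensions on the current CPU.
+ * cpus_hardware_enabled tracks which CPUs are already set up so that the
+ * operations stay idempotent.
+ */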
+static void hardware_enable(void *junk)
+{
+       int cpu = raw_smp_processor_id();
+
+       if (cpu_isset(cpu, cpus_hardware_enabled))
+               return;
+       cpu_set(cpu, cpus_hardware_enabled);
+       kvm_arch_hardware_enable(NULL);
+}
+
+static void hardware_disable(void *junk)
+{
+       int cpu = raw_smp_processor_id();
+
+       if (!cpu_isset(cpu, cpus_hardware_enabled))
+               return;
+       cpu_clear(cpu, cpus_hardware_enabled);
+       decache_vcpus_on_cpu(cpu);
+       kvm_arch_hardware_disable(NULL);
+}
+
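+/*
+ * CPU hotplug notifier: keep virtualization enabled only on online CPUs,
+ * disabling it before a CPU goes down and re-enabling it when one comes up.
+ */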
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+                          void *v)
+{
+       int cpu = (long)v;
+
+       val &= ~CPU_TASKS_FROZEN;
+       switch (val) {
+       case CPU_DYING:
+               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+                      cpu);
+               hardware_disable(NULL);
+               break;
+       case CPU_UP_CANCELED:
+               printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
+                      cpu);
+               smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
+               break;
+       case CPU_ONLINE:
+               printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
+                      cpu);
+               smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+                     void *v)
+{
+       if (val == SYS_RESTART) {
+               /*
+                * Some (well, at least mine) BIOSes hang on reboot if
+                * in vmx root mode.
+                */
+               printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+               on_each_cpu(hardware_disable, NULL, 0, 1);
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_reboot_notifier = {
+       .notifier_call = kvm_reboot,
+       .priority = 0,
+};
+
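+/*
+ * kvm_io_bus: a small table of in-kernel emulated I/O devices.  Lookup is
+ * a linear scan that asks each device whether the address is in its range.
+ */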
+void kvm_io_bus_init(struct kvm_io_bus *bus)
+{
+       memset(bus, 0, sizeof(*bus));
+}
+
+void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+{
+       int i;
+
+       for (i = 0; i < bus->dev_count; i++) {
+               struct kvm_io_device *pos = bus->devs[i];
+
+               kvm_iodevice_destructor(pos);
+       }
+}
+
+struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
+{
+       int i;
+
+       for (i = 0; i < bus->dev_count; i++) {
+               struct kvm_io_device *pos = bus->devs[i];
+
+               if (pos->in_range(pos, addr))
+                       return pos;
+       }
+
+       return NULL;
+}
+
+void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
+{
+       BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
+
+       bus->devs[bus->dev_count++] = dev;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+       .notifier_call = kvm_cpu_hotplug,
+       .priority = 20, /* must be > scheduler priority */
+};
+
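+/*
+ * debugfs statistics: each file sums a 32-bit counter at a fixed offset
+ * across all VMs (or across every vcpu of every VM) under kvm_lock.
+ */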
+static u64 vm_stat_get(void *_offset)
+{
+       unsigned offset = (long)_offset;
+       u64 total = 0;
+       struct kvm *kvm;
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(kvm, &vm_list, vm_list)
+               total += *(u32 *)((void *)kvm + offset);
+       spin_unlock(&kvm_lock);
+       return total;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
+
+static u64 vcpu_stat_get(void *_offset)
+{
+       unsigned offset = (long)_offset;
+       u64 total = 0;
+       struct kvm *kvm;
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       spin_lock(&kvm_lock);
+       list_for_each_entry(kvm, &vm_list, vm_list)
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = kvm->vcpus[i];
+                       if (vcpu)
+                               total += *(u32 *)((void *)vcpu + offset);
+               }
+       spin_unlock(&kvm_lock);
+       return total;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
+
+static struct file_operations *stat_fops[] = {
+       [KVM_STAT_VCPU] = &vcpu_stat_fops,
+       [KVM_STAT_VM]   = &vm_stat_fops,
+};
+
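+/* Create/remove the "kvm" debugfs directory and one file per statistic. */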
+static void kvm_init_debug(void)
+{
+       struct kvm_stats_debugfs_item *p;
+
+       debugfs_dir = debugfs_create_dir("kvm", NULL);
+       for (p = debugfs_entries; p->name; ++p)
+               p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
+                                               (void *)(long)p->offset,
+                                               stat_fops[p->kind]);
+}
+
+static void kvm_exit_debug(void)
+{
+       struct kvm_stats_debugfs_item *p;
+
+       for (p = debugfs_entries; p->name; ++p)
+               debugfs_remove(p->dentry);
+       debugfs_remove(debugfs_dir);
+}
+
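+/*
+ * Suspend/resume hooks: virtualization is disabled across a system
+ * suspend and re-enabled on the resuming CPU.
+ */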
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+       hardware_disable(NULL);
+       return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+       hardware_enable(NULL);
+       return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+       .name = "kvm",
+       .suspend = kvm_suspend,
+       .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = {
+       .id = 0,
+       .cls = &kvm_sysdev_class,
+};
+
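+/* Page returned by gfn_to_page() for guest frames without a backing slot. */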
+struct page *bad_page;
+
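+/*
+ * Preempt notifiers: when a vcpu thread is scheduled out its state is
+ * saved with kvm_arch_vcpu_put(), and reloaded with kvm_arch_vcpu_load()
+ * when the thread runs again.
+ */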
+static inline
+struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+{
+       return container_of(pn, struct kvm_vcpu, preempt_notifier);
+}
+
+static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+{
+       struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+       kvm_arch_vcpu_load(vcpu, cpu);
+}
+
+static void kvm_sched_out(struct preempt_notifier *pn,
+                         struct task_struct *next)
+{
+       struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+
+       kvm_arch_vcpu_put(vcpu);
+}
+
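+/*
+ * Common initialization, called by the arch-specific module: sets up
+ * debugfs, arch state, hardware enabling on every online CPU, the
+ * hotplug/reboot/suspend hooks, the vcpu slab cache and /dev/kvm,
+ * unwinding in reverse order on failure.
+ */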
+int kvm_init(void *opaque, unsigned int vcpu_size,
+                 struct module *module)
+{
+       int r;
+       int cpu;
+
+       kvm_init_debug();
+
+       r = kvm_arch_init(opaque);
+       if (r)
+               goto out_fail;
+
+       bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+       if (bad_page == NULL) {
+               r = -ENOMEM;
+               goto out;
+       }
+
+       r = kvm_arch_hardware_setup();
+       if (r < 0)
+               goto out_free_0;
+
+       for_each_online_cpu(cpu) {
+               smp_call_function_single(cpu,
+                               kvm_arch_check_processor_compat,
+                               &r, 0, 1);
+               if (r < 0)
+                       goto out_free_1;
+       }
+
+       on_each_cpu(hardware_enable, NULL, 0, 1);
+       r = register_cpu_notifier(&kvm_cpu_notifier);
+       if (r)
+               goto out_free_2;
+       register_reboot_notifier(&kvm_reboot_notifier);
+
+       r = sysdev_class_register(&kvm_sysdev_class);
+       if (r)
+               goto out_free_3;
+
+       r = sysdev_register(&kvm_sysdev);
+       if (r)
+               goto out_free_4;
+
+       /* A kmem cache lets us meet the alignment requirements of fx_save. */
+       kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
+                                          __alignof__(struct kvm_vcpu),
+                                          0, NULL);
+       if (!kvm_vcpu_cache) {
+               r = -ENOMEM;
+               goto out_free_5;
+       }
+
+       kvm_chardev_ops.owner = module;
+
+       r = misc_register(&kvm_dev);
+       if (r) {
+               printk(KERN_ERR "kvm: misc device register failed\n");
+               goto out_free;
+       }
+
+       kvm_preempt_ops.sched_in = kvm_sched_in;
+       kvm_preempt_ops.sched_out = kvm_sched_out;
+
+       return 0;
+
+out_free:
+       kmem_cache_destroy(kvm_vcpu_cache);
+out_free_5:
+       sysdev_unregister(&kvm_sysdev);
+out_free_4:
+       sysdev_class_unregister(&kvm_sysdev_class);
+out_free_3:
+       unregister_reboot_notifier(&kvm_reboot_notifier);
+       unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_2:
+       on_each_cpu(hardware_disable, NULL, 0, 1);
+out_free_1:
+       kvm_arch_hardware_unsetup();
+out_free_0:
+       __free_page(bad_page);
+out:
+       kvm_arch_exit();
+       kvm_exit_debug();
+out_fail:
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_init);
+
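+/* Tear down everything set up by kvm_init(), in reverse order. */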
+void kvm_exit(void)
+{
+       misc_deregister(&kvm_dev);
+       kmem_cache_destroy(kvm_vcpu_cache);
+       sysdev_unregister(&kvm_sysdev);
+       sysdev_class_unregister(&kvm_sysdev_class);
+       unregister_reboot_notifier(&kvm_reboot_notifier);
+       unregister_cpu_notifier(&kvm_cpu_notifier);
+       on_each_cpu(hardware_disable, NULL, 0, 1);
+       kvm_arch_hardware_unsetup();
+       kvm_arch_exit();
+       kvm_exit_debug();
+       __free_page(bad_page);
+}
+EXPORT_SYMBOL_GPL(kvm_exit);