Merge branch 'akpm/master'
author Stephen Rothwell <sfr@canb.auug.org.au>
Wed, 19 Sep 2012 05:31:59 +0000 (15:31 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Wed, 19 Sep 2012 05:31:59 +0000 (15:31 +1000)
400 files changed:
Documentation/00-INDEX
Documentation/ABI/obsolete/proc-pid-oom_adj [deleted file]
Documentation/CodingStyle
Documentation/aoe/aoe.txt
Documentation/devicetree/bindings/crypto/fsl-sec4.txt
Documentation/devicetree/bindings/misc/lis302.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/snvs-rtc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/video/backlight/pwm-backlight.txt
Documentation/feature-removal-schedule.txt
Documentation/filesystems/proc.txt
Documentation/printk-formats.txt
Documentation/prio_tree.txt [deleted file]
Documentation/rbtree.txt
Documentation/rtc.txt
Documentation/vm/unevictable-lru.txt
MAINTAINERS
arch/Kconfig
arch/alpha/kernel/pci-sysfs.c
arch/arm/mm/fault-armv.c
arch/arm/mm/fault.c
arch/arm/mm/flush.c
arch/avr32/include/asm/elf.h
arch/avr32/mm/fault.c
arch/blackfin/include/asm/elf.h
arch/c6x/include/asm/elf.h
arch/cris/include/asm/elf.h
arch/cris/include/asm/io.h
arch/cris/mm/fault.c
arch/frv/include/asm/elf.h
arch/frv/kernel/pm.c
arch/h8300/Kconfig
arch/h8300/include/asm/elf.h
arch/hexagon/include/asm/elf.h
arch/hexagon/mm/vm_fault.c
arch/ia64/include/asm/hugetlb.h
arch/ia64/kernel/perfmon.c
arch/ia64/mm/fault.c
arch/ia64/mm/init.c
arch/m32r/include/asm/elf.h
arch/m68k/include/asm/elf.h
arch/m68k/mm/fault.c
arch/microblaze/include/asm/atomic.h
arch/microblaze/include/asm/elf.h
arch/microblaze/mm/fault.c
arch/mips/include/asm/hugetlb.h
arch/mips/mm/fault.c
arch/mn10300/Makefile
arch/mn10300/include/asm/elf.h
arch/openrisc/include/asm/elf.h
arch/openrisc/mm/fault.c
arch/parisc/kernel/cache.c
arch/powerpc/include/asm/atomic.h
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/mm/fault.c
arch/powerpc/oprofile/cell/spu_task_sync.c
arch/s390/include/asm/hugetlb.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/setup.h
arch/s390/include/asm/tlb.h
arch/s390/kernel/early.c
arch/s390/mm/fault.c
arch/s390/mm/gup.c
arch/s390/mm/pgtable.c
arch/score/Kconfig
arch/score/include/asm/elf.h
arch/sh/include/asm/elf.h
arch/sh/include/asm/hugetlb.h
arch/sh/mm/fault.c
arch/sparc/include/asm/elf_32.h
arch/sparc/include/asm/hugetlb.h
arch/sparc/kernel/pci.c
arch/sparc/mm/fault_32.c
arch/sparc/mm/fault_64.c
arch/tile/include/asm/elf.h
arch/tile/include/asm/hugetlb.h
arch/tile/mm/elf.c
arch/tile/mm/fault.c
arch/um/kernel/trap.c
arch/unicore32/Kconfig
arch/unicore32/kernel/process.c
arch/x86/Kconfig
arch/x86/include/asm/atomic.h
arch/x86/include/asm/hugetlb.h
arch/x86/kernel/acpi/boot.c
arch/x86/mm/fault.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/numa.c
arch/x86/mm/pat.c
arch/x86/mm/pat_rbtree.c
arch/x86/platform/iris/iris.c
arch/x86/xen/mmu.c
arch/xtensa/include/asm/elf.h
arch/xtensa/mm/fault.c
block/genhd.c
block/partitions/efi.c
block/partitions/msdos.c
drivers/acpi/acpi_memhotplug.c
drivers/block/aoe/aoe.h
drivers/block/aoe/aoeblk.c
drivers/block/aoe/aoechr.c
drivers/block/aoe/aoecmd.c
drivers/block/aoe/aoedev.c
drivers/block/aoe/aoemain.c
drivers/block/aoe/aoenet.c
drivers/block/cciss.c
drivers/block/nbd.c
drivers/char/mbcs.c
drivers/char/mem.c
drivers/char/mspec.c
drivers/dma/dmaengine.c
drivers/firmware/dmi_scan.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/drm_vm.c
drivers/gpu/drm/exynos/exynos_drm_gem.c
drivers/gpu/drm/gma500/framebuffer.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/ttm/ttm_bo_vm.c
drivers/gpu/drm/udl/udl_fb.c
drivers/i2c/i2c-core.c
drivers/infiniband/core/cm.c
drivers/infiniband/hw/ehca/ehca_uverbs.c
drivers/infiniband/hw/ipath/ipath_file_ops.c
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/media/pci/meye/meye.c
drivers/media/platform/omap/omap_vout.c
drivers/media/platform/vino.c
drivers/media/usb/sn9c102/sn9c102_core.c
drivers/media/usb/usbvision/usbvision-video.c
drivers/media/v4l2-core/videobuf-dma-sg.c
drivers/media/v4l2-core/videobuf-vmalloc.c
drivers/media/v4l2-core/videobuf2-memops.c
drivers/misc/carma/carma-fpga.c
drivers/misc/lis3lv02d/lis3lv02d.c
drivers/misc/lis3lv02d/lis3lv02d.h
drivers/misc/lis3lv02d/lis3lv02d_spi.c
drivers/misc/sgi-gru/grufile.c
drivers/mtd/mtdchar.c
drivers/mtd/mtdcore.c
drivers/oprofile/buffer_sync.c
drivers/pcmcia/cs.c
drivers/pps/pps.c
drivers/rapidio/devices/tsi721.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/class.c
drivers/rtc/hctosys.c
drivers/rtc/rtc-at91sam9.c
drivers/rtc/rtc-coh901331.c
drivers/rtc/rtc-ds2404.c [new file with mode: 0644]
drivers/rtc/rtc-isl1208.c
drivers/rtc/rtc-jz4740.c
drivers/rtc/rtc-max8907.c [new file with mode: 0644]
drivers/rtc/rtc-mxc.c
drivers/rtc/rtc-proc.c
drivers/rtc/rtc-rc5t583.c [new file with mode: 0644]
drivers/rtc/rtc-snvs.c [new file with mode: 0644]
drivers/rtc/rtc-spear.c
drivers/rtc/rtc-sysfs.c
drivers/rtc/rtc-tps65910.c [new file with mode: 0644]
drivers/scsi/atp870u.c
drivers/scsi/sg.c
drivers/staging/android/ashmem.c
drivers/staging/omapdrm/omap_gem_dmabuf.c
drivers/staging/tidspbridge/rmgr/drv_interface.c
drivers/thermal/cpu_cooling.c
drivers/thermal/thermal_sys.c
drivers/uio/uio.c
drivers/usb/mon/mon_bin.c
drivers/vfio/pci/vfio_pci.c
drivers/video/68328fb.c
drivers/video/aty/atyfb_base.c
drivers/video/backlight/Kconfig
drivers/video/backlight/Makefile
drivers/video/backlight/da9052_bl.c
drivers/video/backlight/kb3886_bl.c
drivers/video/backlight/lm3630_bl.c [new file with mode: 0644]
drivers/video/backlight/lm3639_bl.c [new file with mode: 0644]
drivers/video/backlight/ltv350qv.c
drivers/video/backlight/progear_bl.c [deleted file]
drivers/video/backlight/pwm_bl.c
drivers/video/fb-puv3.c
drivers/video/fb_defio.c
drivers/video/fbmem.c
drivers/video/gbefb.c
drivers/video/omap2/omapfb/omapfb-main.c
drivers/video/sbuslib.c
drivers/video/smscufx.c
drivers/video/udlfb.c
drivers/video/vermilion/vermilion.c
drivers/video/vfb.c
drivers/xen/gntalloc.c
drivers/xen/gntdev.c
drivers/xen/privcmd.c
fs/9p/v9fs.c
fs/9p/vfs_file.c
fs/Kconfig.binfmt
fs/Makefile
fs/adfs/super.c
fs/affs/super.c
fs/afs/super.c
fs/attr.c
fs/befs/linuxvfs.c
fs/bfs/inode.c
fs/binfmt_aout.c
fs/binfmt_elf.c
fs/binfmt_elf_fdpic.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/ceph/addr.c
fs/ceph/super.c
fs/cifs/cifsfs.c
fs/cifs/file.c
fs/coda/inode.c
fs/compat_ioctl.c
fs/coredump.c [new file with mode: 0644]
fs/coredump.h [new file with mode: 0644]
fs/dcache.c
fs/ecryptfs/main.c
fs/efs/super.c
fs/eventpoll.c
fs/exec.c
fs/exofs/super.c
fs/ext2/super.c
fs/ext3/super.c
fs/ext4/file.c
fs/ext4/super.c
fs/fat/Makefile
fs/fat/cache.c
fs/fat/dir.c
fs/fat/fat.h
fs/fat/fatent.c
fs/fat/inode.c
fs/fat/namei_msdos.c
fs/fat/namei_vfat.c
fs/fat/nfs.c [new file with mode: 0644]
fs/freevxfs/vxfs_super.c
fs/fuse/file.c
fs/fuse/inode.c
fs/gfs2/file.c
fs/hfs/super.c
fs/hfsplus/super.c
fs/hpfs/super.c
fs/hugetlbfs/inode.c
fs/inode.c
fs/isofs/inode.c
fs/jffs2/readinode.c
fs/jffs2/super.c
fs/jfs/super.c
fs/logfs/inode.c
fs/minix/inode.c
fs/ncpfs/inode.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nilfs2/file.c
fs/nilfs2/super.c
fs/ntfs/super.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/mmap.c
fs/ocfs2/super.c
fs/openpromfs/inode.c
fs/proc/base.c
fs/proc/generic.c
fs/proc/inode.c
fs/proc/proc_sysctl.c
fs/proc/root.c
fs/proc/task_mmu.c
fs/qnx4/inode.c
fs/qnx6/inode.c
fs/reiserfs/super.c
fs/romfs/super.c
fs/squashfs/super.c
fs/super.c
fs/sysv/inode.c
fs/ubifs/file.c
fs/ubifs/super.c
fs/udf/super.c
fs/ufs/super.c
fs/xfs/xfs_file.c
fs/xfs/xfs_super.c
include/asm-generic/pgtable.h
include/linux/atomic.h
include/linux/audit.h
include/linux/binfmts.h
include/linux/compaction.h
include/linux/coredump.h
include/linux/eventpoll.h
include/linux/fs.h
include/linux/genhd.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/idr.h
include/linux/interval_tree.h [new file with mode: 0644]
include/linux/interval_tree_generic.h [new file with mode: 0644]
include/linux/memblock.h
include/linux/mempolicy.h
include/linux/mfd/rc5t583.h
include/linux/mfd/tps65910.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mman.h
include/linux/mmu_notifier.h
include/linux/nbd.h
include/linux/oom.h
include/linux/page-isolation.h
include/linux/platform_data/lm3630_bl.h [new file with mode: 0644]
include/linux/platform_data/lm3639_bl.h [new file with mode: 0644]
include/linux/platform_data/lp855x.h
include/linux/prio_tree.h [deleted file]
include/linux/rbtree.h
include/linux/rbtree_augmented.h [new file with mode: 0644]
include/linux/rio.h
include/linux/rmap.h
include/linux/rtc-ds2404.h [new file with mode: 0644]
include/linux/rtc.h
include/linux/sched.h
include/linux/timerqueue.h
include/trace/events/gfpflags.h
init/Kconfig
init/do_mounts.c
init/main.c
ipc/mqueue.c
ipc/sem.c
kernel/auditsc.c
kernel/events/core.c
kernel/events/uprobes.c
kernel/fork.c
kernel/kexec.c
kernel/resource.c
kernel/signal.c
kernel/sys.c
kernel/sysctl.c
kernel/taskstats.c
kernel/time.c
kernel/timeconst.pl
lib/Kconfig.debug
lib/Makefile
lib/decompress.c
lib/gcd.c
lib/idr.c
lib/interval_tree.c [new file with mode: 0644]
lib/interval_tree_test_main.c [new file with mode: 0644]
lib/parser.c
lib/prio_tree.c [deleted file]
lib/rbtree.c
lib/rbtree_test.c [new file with mode: 0644]
lib/vsprintf.c
mm/Kconfig
mm/Makefile
mm/compaction.c
mm/filemap.c
mm/filemap_xip.c
mm/fremap.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/interval_tree.c [new file with mode: 0644]
mm/kmemleak.c
mm/ksm.c
mm/madvise.c
mm/memblock.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mlock.c
mm/mmap.c
mm/mmu_notifier.c
mm/mremap.c
mm/nobootmem.c
mm/nommu.c
mm/oom_kill.c
mm/page_alloc.c
mm/page_isolation.c
mm/pgtable-generic.c
mm/prio_tree.c [deleted file]
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/swap.c
mm/swap_state.c
mm/vmalloc.c
mm/vmscan.c
net/ceph/osd_client.c
scripts/Kbuild.include
scripts/checkpatch.pl
scripts/checksyscalls.sh
security/device_cgroup.c
security/selinux/netnode.c
security/selinux/selinuxfs.c
security/tomoyo/util.c
sound/core/pcm_native.c
sound/usb/usx2y/us122l.c
sound/usb/usx2y/usX2Yhwdep.c
sound/usb/usx2y/usx2yhwdeppcm.c
tools/perf/util/include/linux/rbtree.h
tools/testing/selftests/Makefile
tools/testing/selftests/epoll/Makefile [new file with mode: 0644]
tools/testing/selftests/epoll/test_epoll.c [new file with mode: 0644]

index 49c051380daf1973c1dbbf494b5ee58b11ff44a7..f54273e2ac979a386e27c0174e4a9524e60eae5e 100644 (file)
@@ -270,8 +270,6 @@ preempt-locking.txt
        - info on locking under a preemptive kernel.
 printk-formats.txt
        - how to get printk format specifiers right
-prio_tree.txt
-       - info on radix-priority-search-tree use for indexing vmas.
 ramoops.txt
        - documentation of the ramoops oops/panic logging module.
 rbtree.txt
diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj
deleted file mode 100644 (file)
index 9a3cb88..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-What:  /proc/<pid>/oom_adj
-When:  August 2012
-Why:   /proc/<pid>/oom_adj allows userspace to influence the oom killer's
-       badness heuristic used to determine which task to kill when the kernel
-       is out of memory.
-
-       The badness heuristic has since been rewritten since the introduction of
-       this tunable such that its meaning is deprecated.  The value was
-       implemented as a bitshift on a score generated by the badness()
-       function that did not have any precise units of measure.  With the
-       rewrite, the score is given as a proportion of available memory to the
-       task allocating pages, so using a bitshift which grows the score
-       exponentially is, thus, impossible to tune with fine granularity.
-
-       A much more powerful interface, /proc/<pid>/oom_score_adj, was
-       introduced with the oom killer rewrite that allows users to increase or
-       decrease the badness score linearly.  This interface will replace
-       /proc/<pid>/oom_adj.
-
-       A warning will be emitted to the kernel log if an application uses this
-       deprecated interface.  After it is printed once, future warnings will be
-       suppressed until the kernel is rebooted.
index cb9258b8fd35b25b8ac750b18b4237204213fbd4..495e5ba1634cac59350326485b07b7e902242ab3 100644 (file)
@@ -454,6 +454,16 @@ The preferred style for long (multi-line) comments is:
         * with beginning and ending almost-blank lines.
         */
 
+For files in net/ and drivers/net/ the preferred style for long (multi-line)
+comments is a little different.
+
+       /* The preferred comment style for files in net/ and drivers/net
+        * looks like this.
+        *
+        * It is nearly the same as the generally preferred comment style,
+        * but there is no initial almost-blank line.
+        */
+
 It's also important to comment data, whether they are basic types or derived
 types.  To this end, use just one data declaration per line (no commas for
 multiple data declarations).  This leaves you room for a small comment on each
index 5f5aa16047ff4f8f5a9680101572c8afc34e799c..b3e47560346a9074aed5ea2e684ace2c323afbac 100644 (file)
@@ -1,8 +1,11 @@
-The EtherDrive (R) HOWTO for users of 2.6 kernels is found at ...
+The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ...
 
-  http://www.coraid.com/SUPPORT/EtherDrive-HBA  
+  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
 
-  It has many tips and hints!
+It has many tips and hints!  Please see, especially, recommended
+tunings for virtual memory:
+
+  http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19
 
 The aoetools are userland programs that are designed to work with this
 driver.  The aoetools are on sourceforge.
index bf57ecd5d73a6218d1310da2695f4bb977710e57..bd7ce120bc135e9eccb9f2cbdb3a38fdc15f9702 100644 (file)
@@ -9,6 +9,7 @@ Copyright (C) 2008-2011 Freescale Semiconductor Inc.
    -Run Time Integrity Check (RTIC) Node
    -Run Time Integrity Check (RTIC) Memory Node
    -Secure Non-Volatile Storage (SNVS) Node
+   -Secure Non-Volatile Storage (SNVS) Low Power (LP) RTC Node
    -Full Example
 
 NOTE: the SEC 4 is also known as Freescale's Cryptographic Accelerator
@@ -294,6 +295,27 @@ Secure Non-Volatile Storage (SNVS) Node
           address and length of the SEC4 configuration
           registers.
 
+   - #address-cells
+       Usage: required
+       Value type: <u32>
+       Definition: A standard property.  Defines the number of cells
+           for representing physical addresses in child nodes.  Must
+           have a value of 1.
+
+   - #size-cells
+       Usage: required
+       Value type: <u32>
+       Definition: A standard property.  Defines the number of cells
+           for representing the size of physical addresses in
+           child nodes.  Must have a value of 1.
+
+   - ranges
+       Usage: required
+       Value type: <prop-encoded-array>
+       Definition: A standard property.  Specifies the physical address
+           range of the SNVS register space.  A triplet that includes
+           the child address, parent address, & length.
+
    - interrupts
       Usage: required
       Value type: <prop_encoded-array>
@@ -314,10 +336,33 @@ EXAMPLE
        sec_mon@314000 {
                compatible = "fsl,sec-v4.0-mon";
                reg = <0x314000 0x1000>;
+               ranges = <0 0x314000 0x1000>;
                interrupt-parent = <&mpic>;
                interrupts = <93 2>;
        };
 
+=====================================================================
+Secure Non-Volatile Storage (SNVS) Low Power (LP) RTC Node
+
+  A SNVS child node that defines SNVS LP RTC.
+
+  - compatible
+      Usage: required
+      Value type: <string>
+      Definition: Must include "fsl,sec-v4.0-mon-rtc-lp".
+
+  - reg
+      Usage: required
+      Value type: <prop-encoded-array>
+      Definition: A standard property.  Specifies the physical
+          address and length of the SNVS LP configuration registers.
+
+EXAMPLE
+       sec_mon_rtc_lp@34 {
+               compatible = "fsl,sec-v4.0-mon-rtc-lp";
+               reg = <0x34 0x58>;
+       };
+
 =====================================================================
 FULL EXAMPLE
 
@@ -390,8 +435,14 @@ FULL EXAMPLE
        sec_mon: sec_mon@314000 {
                compatible = "fsl,sec-v4.0-mon";
                reg = <0x314000 0x1000>;
+               ranges = <0 0x314000 0x1000>;
                interrupt-parent = <&mpic>;
                interrupts = <93 2>;
+
+               sec_mon_rtc_lp@34 {
+                       compatible = "fsl,sec-v4.0-mon-rtc-lp";
+                       reg = <0x34 0x58>;
+               };
        };
 
 =====================================================================
diff --git a/Documentation/devicetree/bindings/misc/lis302.txt b/Documentation/devicetree/bindings/misc/lis302.txt
new file mode 100644 (file)
index 0000000..e18af9d
--- /dev/null
@@ -0,0 +1,76 @@
+LIS302 accelerometer devicetree bindings
+
+This device is matched via its bus drivers, and has a number of properties
+that apply to the generic device (independent of the bus).
+
+
+Required properties for the SPI bindings:
+ - compatible:                 should be set to "st,lis3lv02d_spi"
+ - reg:                        the chipselect index
+ - spi-max-frequency:  maximal bus speed, should be set to 1000000 unless
+                       constrained by external circuitry
+ - interrupts:         the interrupt generated by the device
+
+
+Optional properties for all bus drivers:
+
+ - st,click-single-{x,y,z}:    if present, tells the device to issue an
+                               interrupt on single click events on the
+                               x/y/z axis.
+ - st,click-double-{x,y,z}:    if present, tells the device to issue an
+                               interrupt on double click events on the
+                               x/y/z axis.
+ - st,click-thresh-{x,y,z}:    set the x/y/z axis threshold
+ - st,click-click-time-limit:  click time limit, from 0 to 127.5 msec
+                               with step of 0.5 msec
+ - st,click-latency:           click latency, from 0 to 255 msec with
+                               step of 1 msec.
+ - st,click-window:            click window, from 0 to 255 msec with
+                               step of 1 msec.
+ - st,irq{1,2}-disable:                disable IRQ 1/2
+ - st,irq{1,2}-ff-wu-1:                raise IRQ 1/2 on FF_WU_1 condition
+ - st,irq{1,2}-ff-wu-2:                raise IRQ 1/2 on FF_WU_2 condition
+ - st,irq{1,2}-data-ready:     raise IRQ 1/2 on data ready condition
+ - st,irq{1,2}-click:          raise IRQ 1/2 on click condition
+ - st,irq-open-drain:          consider IRQ lines open-drain
+ - st,irq-active-low:          make IRQ lines active low
+ - st,wu-duration-1:           duration register for Free-Fall/Wake-Up
+                               interrupt 1
+ - st,wu-duration-2:           duration register for Free-Fall/Wake-Up
+                               interrupt 2
+ - st,wakeup-{x,y,z}-{lo,hi}:  set wakeup condition on x/y/z axis for
+                               upper/lower limit
+ - st,highpass-cutoff-hz=:     1, 2, 4 or 8 for 1Hz, 2Hz, 4Hz or 8Hz of
+                               highpass cut-off frequency
+ - st,hipass{1,2}-disable:     disable highpass 1/2.
+ - st,default-rate=:           set the default rate
+ - st,axis-{x,y,z}=:           set the axis to map to the three coordinates
+ - st,{min,max}-limit-{x,y,z}: set the min/max limits for x/y/z axis
+                               (used by self-test)
+
+
+Example for a SPI device node:
+
+       lis302@0 {
+               compatible = "st,lis302dl-spi";
+               reg = <0>;
+               spi-max-frequency = <1000000>;
+               interrupt-parent = <&gpio>;
+               interrupts = <104 0>;
+
+               st,click-single-x;
+               st,click-single-y;
+               st,click-single-z;
+               st,click-thresh-x = <10>;
+               st,click-thresh-y = <10>;
+               st,click-thresh-z = <10>;
+               st,irq1-click;
+               st,irq2-click;
+               st,wakeup-x-lo;
+               st,wakeup-x-hi;
+               st,wakeup-y-lo;
+               st,wakeup-y-hi;
+               st,wakeup-z-lo;
+               st,wakeup-z-hi;
+       };
+
diff --git a/Documentation/devicetree/bindings/rtc/snvs-rtc.txt b/Documentation/devicetree/bindings/rtc/snvs-rtc.txt
new file mode 100644 (file)
index 0000000..fb61ed7
--- /dev/null
@@ -0,0 +1 @@
+See Documentation/devicetree/bindings/crypto/fsl-sec4.txt for details.
index 1e4fc727f3b180f84242a3d77b3854ad113b4a60..689c7d25f750de5bdb124f754484aac23d614767 100644 (file)
@@ -14,6 +14,8 @@ Required properties:
 Optional properties:
   - pwm-names: a list of names for the PWM devices specified in the
                "pwms" property (see PWM binding[0])
+  - low_threshold_brightness: low-level brightness threshold (gives a
+                linear brightness scale at the low end of the brightness
+                levels)
 
 [0]: Documentation/devicetree/bindings/pwm/pwm.txt
 
@@ -26,3 +28,22 @@ Example:
                brightness-levels = <0 4 8 16 32 64 128 255>;
                default-brightness-level = <6>;
        };
+
+Example for low_threshold_brightness:
+
+       backlight {
+               compatible      = "pwm-backlight";
+               pwms = <&pwm 0 50000>;
+
+               brightness-levels = <0 4 8 16 32 64 128 255>;
+               default-brightness-level = <6>;
+               low_threshold_brightness = <50>;
+       };
+Note:
+Low threshold support provides a linear brightness scale from the
+threshold up to the maximum. On some panels the backlight stays off at
+the low end of the brightness scale, so a low threshold is required;
+brightness then scales from the threshold to the maximum across the
+steps defined in brightness-levels. In this example, about 20% of the
+maximum brightness is needed to turn the panel backlight on.
index 2dd834f2c5cd6e9ce6c4cea6e6f7d53d22471de0..008503e6972c7efbfb236ab6f41e39285e0fa755 100644 (file)
@@ -124,31 +124,6 @@ Who:       Pavel Machek <pavel@ucw.cz>
 
 ---------------------------
 
-What:  /proc/<pid>/oom_adj
-When:  August 2012
-Why:   /proc/<pid>/oom_adj allows userspace to influence the oom killer's
-       badness heuristic used to determine which task to kill when the kernel
-       is out of memory.
-
-       The badness heuristic has since been rewritten since the introduction of
-       this tunable such that its meaning is deprecated.  The value was
-       implemented as a bitshift on a score generated by the badness()
-       function that did not have any precise units of measure.  With the
-       rewrite, the score is given as a proportion of available memory to the
-       task allocating pages, so using a bitshift which grows the score
-       exponentially is, thus, impossible to tune with fine granularity.
-
-       A much more powerful interface, /proc/<pid>/oom_score_adj, was
-       introduced with the oom killer rewrite that allows users to increase or
-       decrease the badness score linearly.  This interface will replace
-       /proc/<pid>/oom_adj.
-
-       A warning will be emitted to the kernel log if an application uses this
-       deprecated interface.  After it is printed once, future warnings will be
-       suppressed until the kernel is rebooted.
-
----------------------------
-
 What:  remove EXPORT_SYMBOL(kernel_thread)
 When:  August 2006
 Files: arch/*/kernel/*_ksyms.c
index fb0a6aeb936c86237fe19bcdf630339fc43ad348..a1793d670cd01bd374eddf54ffdfc768504291ff 100644 (file)
@@ -33,7 +33,7 @@ Table of Contents
   2    Modifying System Parameters
 
   3    Per-Process Parameters
-  3.1  /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer
+  3.1  /proc/<pid>/oom_score_adj - Adjust the oom-killer
                                                                score
   3.2  /proc/<pid>/oom_score - Display current oom-killer score
   3.3  /proc/<pid>/io - Display the IO accounting fields
@@ -1320,10 +1320,10 @@ of the kernel.
 CHAPTER 3: PER-PROCESS PARAMETERS
 ------------------------------------------------------------------------------
 
-3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score
+3.1 /proc/<pid>/oom_score_adj - Adjust the oom-killer score
 --------------------------------------------------------------------------------
 
-These file can be used to adjust the badness heuristic used to select which
+This file can be used to adjust the badness heuristic used to select which
 process gets killed in out of memory conditions.
 
 The badness heuristic assigns a value to each candidate task ranging from 0
@@ -1361,22 +1361,10 @@ same system, cpuset, mempolicy, or memory controller resources to use at least
 equivalent to discounting 50% of the task's allowed memory from being considered
 as scoring against the task.
 
-For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also
-be used to tune the badness score.  Its acceptable values range from -16
-(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17
-(OOM_DISABLE) to disable oom killing entirely for that task.  Its value is
-scaled linearly with /proc/<pid>/oom_score_adj.
-
-Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
-other with its scaled value.
-
 The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
 value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
 requires CAP_SYS_RESOURCE.
 
-NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
-Documentation/feature-removal-schedule.txt.
-
 Caveat: when a parent task is selected, the oom killer will sacrifice any first
 generation children with separate address spaces instead, if possible.  This
 avoids servers and important system daemons from being killed and loses the
@@ -1387,9 +1375,7 @@ minimal amount of work.
 -------------------------------------------------------------
 
 This file can be used to check the current score used by the oom-killer is for
-any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which
-process should be killed in an out-of-memory situation.
-
+any given <pid>.
 
 3.3  /proc/<pid>/io - Display the IO accounting fields
 -------------------------------------------------------
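
As an aside, a minimal userspace sketch of the oom_score_adj interface
described above (not part of this patch; the -1000 value is
OOM_SCORE_ADJ_MIN, which disables oom killing for the task, and the
error handling is illustrative):

	#include <stdio.h>

	int main(void)
	{
		/* Disable oom killing for the current process by writing
		 * the minimum score adjustment (-1000).  Lowering the value
		 * below the last CAP_SYS_RESOURCE-set value requires
		 * CAP_SYS_RESOURCE; raising it back does not. */
		FILE *f = fopen("/proc/self/oom_score_adj", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%d\n", -1000);
		return fclose(f) == 0 ? 0 : 1;
	}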
index 7561d7ed8e11ef25a9a1fb479492148b6f038d5d..8ffb274367c7a4867b1930c9c5bd60ff475bfffb 100644 (file)
@@ -69,6 +69,7 @@ MAC/FDDI addresses:
        %pMR    05:04:03:02:01:00
        %pMF    00-01-02-03-04-05
        %pm     000102030405
+       %pmR    050403020100
 
        For printing 6-byte MAC/FDDI addresses in hex notation. The 'M' and 'm'
        specifiers result in a printed address with ('M') or without ('m') byte
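
For illustration, a small sketch of how these specifiers render a
six-byte address (kernel-module context assumed; the mac[] array and
function name below are hypothetical):

	#include <linux/printk.h>
	#include <linux/types.h>

	static const u8 mac[6] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05 };

	static void show_mac_formats(void)
	{
		pr_info("%pM\n",  mac);		/* 00:01:02:03:04:05 */
		pr_info("%pMR\n", mac);		/* 05:04:03:02:01:00 */
		pr_info("%pMF\n", mac);		/* 00-01-02-03-04-05 */
		pr_info("%pm\n",  mac);		/* 000102030405 */
		pr_info("%pmR\n", mac);		/* 050403020100 */
	}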
diff --git a/Documentation/prio_tree.txt b/Documentation/prio_tree.txt
deleted file mode 100644 (file)
index 3aa68f9..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-The prio_tree.c code indexes vmas using 3 different indexes:
-       * heap_index  = vm_pgoff + vm_size_in_pages : end_vm_pgoff
-       * radix_index = vm_pgoff : start_vm_pgoff
-       * size_index = vm_size_in_pages
-
-A regular radix-priority-search-tree indexes vmas using only heap_index and
-radix_index. The conditions for indexing are:
-       * ->heap_index >= ->left->heap_index &&
-               ->heap_index >= ->right->heap_index
-       * if (->heap_index == ->left->heap_index)
-               then ->radix_index < ->left->radix_index;
-       * if (->heap_index == ->right->heap_index)
-               then ->radix_index < ->right->radix_index;
-       * nodes are hashed to left or right subtree using radix_index
-         similar to a pure binary radix tree.
-
-A regular radix-priority-search-tree helps to store and query
-intervals (vmas). However, a regular radix-priority-search-tree is only
-suitable for storing vmas with different radix indices (vm_pgoff).
-
-Therefore, the prio_tree.c extends the regular radix-priority-search-tree
-to handle many vmas with the same vm_pgoff. Such vmas are handled in
-2 different ways: 1) All vmas with the same radix _and_ heap indices are
-linked using vm_set.list, 2) if there are many vmas with the same radix
-index, but different heap indices and if the regular radix-priority-search
-tree cannot index them all, we build an overflow-sub-tree that indexes such
-vmas using heap and size indices instead of heap and radix indices. For
-example, in the figure below some vmas with vm_pgoff = 0 (zero) are
-indexed by regular radix-priority-search-tree whereas others are pushed
-into an overflow-subtree. Note that all vmas in an overflow-sub-tree have
-the same vm_pgoff (radix_index) and if necessary we build different
-overflow-sub-trees to handle each possible radix_index. For example,
-in figure we have 3 overflow-sub-trees corresponding to radix indices
-0, 2, and 4.
-
-In the final tree the first few (prio_tree_root->index_bits) levels
-are indexed using heap and radix indices whereas the overflow-sub-trees below
-those levels (i.e. levels prio_tree_root->index_bits + 1 and higher) are
-indexed using heap and size indices. In overflow-sub-trees the size_index
-is used for hashing the nodes to appropriate places.
-
-Now, an example prio_tree:
-
-  vmas are represented [radix_index, size_index, heap_index]
-                 i.e., [start_vm_pgoff, vm_size_in_pages, end_vm_pgoff]
-
-level  prio_tree_root->index_bits = 3
------
-                                                                                               _
-  0                                                    [0,7,7]                                  |
-                                                       /     \                                  |
-                                     ------------------       ------------                      |     Regular
-                                    /                                     \                     |  radix priority
-  1                            [1,6,7]                                   [4,3,7]                |   search tree
-                               /     \                                   /     \                |
-                        -------       -----                        ------       -----           |  heap-and-radix
-                       /                   \                      /                  \          |      indexed
-  2                [0,6,6]                [2,5,7]              [5,2,7]             [6,1,7]      |
-                   /     \                /     \              /     \             /     \      |
-  3            [0,5,5] [1,5,6]         [2,4,6] [3,4,7]     [4,2,6] [5,1,6]     [6,0,6] [7,0,7]  |
-                  /                       /                   /                                _
-                  /                      /                   /                                 _
-  4          [0,4,4]                 [2,3,5]              [4,1,5]                               |
-                /                       /                    /                                  |
-  5         [0,3,3]                 [2,2,4]              [4,0,4]                                |  Overflow-sub-trees
-               /                       /                                                        |
-  6        [0,2,2]                 [2,1,3]                                                      |    heap-and-size
-              /                       /                                                         |       indexed
-  7       [0,1,1]                 [2,0,2]                                                       |
-             /                                                                                  |
-  8      [0,0,0]                                                                                |
-                                                                                               _
-
-Note that we use prio_tree_root->index_bits to optimize the height
-of the heap-and-radix indexed tree. Since prio_tree_root->index_bits is
-set according to the maximum end_vm_pgoff mapped, we are sure that all
-bits (in vm_pgoff) above prio_tree_root->index_bits are 0 (zero). Therefore,
-we only use the first prio_tree_root->index_bits as radix_index.
-Whenever index_bits is increased in prio_tree_expand, we shuffle the tree
-to make sure that the first prio_tree_root->index_bits levels of the tree
-is indexed properly using heap and radix indices.
-
-We do not optimize the height of overflow-sub-trees using index_bits.
-The reason is: there can be many such overflow-sub-trees and all of
-them have to be suffled whenever the index_bits increases. This may involve
-walking the whole prio_tree in prio_tree_insert->prio_tree_expand code
-path which is not desirable. Hence, we do not optimize the height of the
-heap-and-size indexed overflow-sub-trees using prio_tree->index_bits.
-Instead the overflow sub-trees are indexed using full BITS_PER_LONG bits
-of size_index. This may lead to skewed sub-trees because most of the
-higher significant bits of the size_index are likely to be 0 (zero). In
-the example above, all 3 overflow-sub-trees are skewed. This may marginally
-affect the performance. However, processes rarely map many vmas with the
-same start_vm_pgoff but different end_vm_pgoffs. Therefore, we normally
-do not require overflow-sub-trees to index all vmas.
-
-From the above discussion it is clear that the maximum height of
-a prio_tree can be prio_tree_root->index_bits + BITS_PER_LONG.
-However, in most of the common cases we do not need overflow-sub-trees,
-so the tree height in the common cases will be prio_tree_root->index_bits.
-
-It is fair to mention here that the prio_tree_root->index_bits
-is increased on demand, however, the index_bits is not decreased when
-vmas are removed from the prio_tree. That's tricky to do. Hence, it's
-left as a home work problem.
-
-
index 8d32d85a523439381273879f363163370bab7a25..61b6c48871a0163a12e4faec50064f61c819b5c6 100644 (file)
@@ -193,24 +193,55 @@ Example:
 Support for Augmented rbtrees
 -----------------------------
 
-Augmented rbtree is an rbtree with "some" additional data stored in each node.
-This data can be used to augment some new functionality to rbtree.
-Augmented rbtree is an optional feature built on top of basic rbtree
-infrastructure. An rbtree user who wants this feature will have to call the
-augmentation functions with the user provided augmentation callback
-when inserting and erasing nodes.
-
-On insertion, the user must call rb_augment_insert() once the new node is in
-place. This will cause the augmentation function callback to be called for
-each node between the new node and the root which has been affected by the
-insertion.
-
-When erasing a node, the user must call rb_augment_erase_begin() first to
-retrieve the deepest node on the rebalance path. Then, after erasing the
-original node, the user must call rb_augment_erase_end() with the deepest
-node found earlier. This will cause the augmentation function to be called
-for each affected node between the deepest node and the root.
-
+Augmented rbtree is an rbtree with "some" additional data stored in
+each node, where the additional data for node N must be a function of
+the contents of all nodes in the subtree rooted at N. This data can
+be used to augment some new functionality to rbtree. Augmented rbtree
+is an optional feature built on top of basic rbtree infrastructure.
+An rbtree user who wants this feature will have to call the augmentation
+functions with the user provided augmentation callback when inserting
+and erasing nodes.
+
+C files implementing augmented rbtree manipulation must include
+<linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that
+linux/rbtree_augmented.h exposes some rbtree implementation details
+you are not expected to rely on; please stick to the documented APIs
+there and do not include <linux/rbtree_augmented.h> from header files
+either, so as to minimize the chances of your users accidentally
+relying on such implementation details.
+
+On insertion, the user must update the augmented information on the path
+leading to the inserted node, then call rb_link_node() as usual and
+rb_insert_augmented() instead of the usual rb_insert_color() call
+(consistent with the sample code below). If rb_insert_augmented()
+rebalances the rbtree, it will call back into a user provided function
+to update the augmented information on the affected subtrees.
+
+When erasing a node, the user must call rb_erase_augmented() instead of
+rb_erase(). rb_erase_augmented() calls back into user provided functions
+to update the augmented information on affected subtrees.
+
+In both cases, the callbacks are provided through struct rb_augment_callbacks.
+3 callbacks must be defined:
+
+- A propagation callback, which updates the augmented value for a given
+  node and its ancestors, up to a given stop point (or NULL to update
+  all the way to the root).
+
+- A copy callback, which copies the augmented value for a given subtree
+  to a newly assigned subtree root.
+
+- A tree rotation callback, which copies the augmented value for a given
+  subtree to a newly assigned subtree root AND recomputes the augmented
+  information for the former subtree root.
+
+The compiled code for rb_erase_augmented() may inline the propagation and
+copy callbacks, which results in a large function, so each augmented rbtree
+user should have a single rb_erase_augmented() call site in order to limit
+compiled code size.
+
+
+Sample usage:
 
 Interval tree is an example of augmented rb tree. Reference -
 "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
@@ -230,26 +261,132 @@ and its immediate children. And this will be used in O(log n) lookup
 for lowest match (lowest start address among all possible matches)
 with something like:
 
-find_lowest_match(lo, hi, node)
+struct interval_tree_node *
+interval_tree_first_match(struct rb_root *root,
+                         unsigned long start, unsigned long last)
 {
-       lowest_match = NULL;
-       while (node) {
-               if (max_hi(node->left) > lo) {
-                       // Lowest overlap if any must be on left side
-                       node = node->left;
-               } else if (overlap(lo, hi, node)) {
-                       lowest_match = node;
-                       break;
-               } else if (lo > node->lo) {
-                       // Lowest overlap if any must be on right side
-                       node = node->right;
-               } else {
-                       break;
+       struct interval_tree_node *node;
+
+       if (!root->rb_node)
+               return NULL;
+       node = rb_entry(root->rb_node, struct interval_tree_node, rb);
+
+       while (true) {
+               if (node->rb.rb_left) {
+                       struct interval_tree_node *left =
+                               rb_entry(node->rb.rb_left,
+                                        struct interval_tree_node, rb);
+                       if (left->__subtree_last >= start) {
+                               /*
+                                * Some nodes in left subtree satisfy Cond2.
+                                * Iterate to find the leftmost such node N.
+                                * If it also satisfies Cond1, that's the match
+                                * we are looking for. Otherwise, there is no
+                                * matching interval as nodes to the right of N
+                                * can't satisfy Cond1 either.
+                                */
+                               node = left;
+                               continue;
+                       }
                }
+               if (node->start <= last) {              /* Cond1 */
+                       if (node->last >= start)        /* Cond2 */
+                               return node;    /* node is leftmost match */
+                       if (node->rb.rb_right) {
+                               node = rb_entry(node->rb.rb_right,
+                                       struct interval_tree_node, rb);
+                               if (node->__subtree_last >= start)
+                                       continue;
+                       }
+               }
+               return NULL;    /* No match */
+       }
+}
+
+Insertion/removal are defined using the following augmented callbacks:
+
+static inline unsigned long
+compute_subtree_last(struct interval_tree_node *node)
+{
+       unsigned long max = node->last, subtree_last;
+       if (node->rb.rb_left) {
+               subtree_last = rb_entry(node->rb.rb_left,
+                       struct interval_tree_node, rb)->__subtree_last;
+               if (max < subtree_last)
+                       max = subtree_last;
+       }
+       if (node->rb.rb_right) {
+               subtree_last = rb_entry(node->rb.rb_right,
+                       struct interval_tree_node, rb)->__subtree_last;
+               if (max < subtree_last)
+                       max = subtree_last;
+       }
+       return max;
+}
+
+static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+       while (rb != stop) {
+               struct interval_tree_node *node =
+                       rb_entry(rb, struct interval_tree_node, rb);
+               unsigned long subtree_last = compute_subtree_last(node);
+               if (node->__subtree_last == subtree_last)
+                       break;
+               node->__subtree_last = subtree_last;
+               rb = rb_parent(&node->rb);
        }
-       return lowest_match;
 }
 
-Finding exact match will be to first find lowest match and then to follow
-successor nodes looking for exact match, until the start of a node is beyond
-the hi value we are looking for.
+static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+       struct interval_tree_node *old =
+               rb_entry(rb_old, struct interval_tree_node, rb);
+       struct interval_tree_node *new =
+               rb_entry(rb_new, struct interval_tree_node, rb);
+
+       new->__subtree_last = old->__subtree_last;
+}
+
+static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+       struct interval_tree_node *old =
+               rb_entry(rb_old, struct interval_tree_node, rb);
+       struct interval_tree_node *new =
+               rb_entry(rb_new, struct interval_tree_node, rb);
+
+       new->__subtree_last = old->__subtree_last;
+       old->__subtree_last = compute_subtree_last(old);
+}
+
+static const struct rb_augment_callbacks augment_callbacks = {
+       augment_propagate, augment_copy, augment_rotate
+};
+
+void interval_tree_insert(struct interval_tree_node *node,
+                         struct rb_root *root)
+{
+       struct rb_node **link = &root->rb_node, *rb_parent = NULL;
+       unsigned long start = node->start, last = node->last;
+       struct interval_tree_node *parent;
+
+       while (*link) {
+               rb_parent = *link;
+               parent = rb_entry(rb_parent, struct interval_tree_node, rb);
+               if (parent->__subtree_last < last)
+                       parent->__subtree_last = last;
+               if (start < parent->start)
+                       link = &parent->rb.rb_left;
+               else
+                       link = &parent->rb.rb_right;
+       }
+
+       node->__subtree_last = last;
+       rb_link_node(&node->rb, rb_parent, link);
+       rb_insert_augmented(&node->rb, root, &augment_callbacks);
+}
+
+void interval_tree_remove(struct interval_tree_node *node,
+                         struct rb_root *root)
+{
+       rb_erase_augmented(&node->rb, root, &augment_callbacks);
+}
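
A short usage sketch for the sample API above (hypothetical values;
assumes the interval_tree_node layout shown earlier, with closed
endpoints start and last):

	struct rb_root root = RB_ROOT;
	struct interval_tree_node a = { .start = 10, .last = 20 };
	struct interval_tree_node b = { .start = 25, .last = 30 };
	struct interval_tree_node *hit;

	interval_tree_insert(&a, &root);
	interval_tree_insert(&b, &root);

	/* Leftmost node overlapping [15, 27] -- returns &a here,
	 * since both a and b overlap but a is leftmost. */
	hit = interval_tree_first_match(&root, 15, 27);

	interval_tree_remove(&a, &root);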
index 250160469d83e65c3a5235ee7b00b83b40abb68d..32aa4002de4a9fbd9fe64045119f811df47886ae 100644 (file)
@@ -119,8 +119,9 @@ three different userspace interfaces:
     *  /sys/class/rtc/rtcN ... sysfs attributes support readonly
        access to some RTC attributes.
 
-    *  /proc/driver/rtc ... the first RTC (rtc0) may expose itself
-       using a procfs interface.  More information is (currently) shown
+    *  /proc/driver/rtc ... the system clock RTC may expose itself
+       using a procfs interface. If there is no RTC for the system clock,
+       rtc0 is used by default. More information is (currently) shown
        here than through sysfs.
 
 The RTC Class framework supports a wide variety of RTCs, ranging from those
index fa206cccf89f8577a8a44ff84d0836b06b712cb5..323ff5dba1cce4dcecd75db001274c08827cbe62 100644 (file)
@@ -371,8 +371,8 @@ mlock_fixup() filters several classes of "special" VMAs:
    mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
    allocate the huge pages and populate the ptes.
 
-3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of
-   kernel pages, such as the VDSO page, relay channel pages, etc.  These pages
+3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
+   such as the VDSO page, relay channel pages, etc. These pages
    are inherently unevictable and are not managed on the LRU lists.
    mlock_fixup() treats these VMAs the same as hugetlbfs VMAs.  It calls
    make_pages_present() to populate the ptes.
index ad25ab398bb87efa728e92339f730fe9dfd05e31..2206a64337803cb74a136e6d7a3e90c09a26fc3d 100644 (file)
@@ -776,6 +776,7 @@ S:  Maintained
 T:     git git://git.pengutronix.de/git/imx/linux-2.6.git
 F:     arch/arm/mach-imx/
 F:     arch/arm/plat-mxc/
+F:     arch/arm/configs/imx*_defconfig
 
 ARM/FREESCALE IMX6
 M:     Shawn Guo <shawn.guo@linaro.org>
@@ -1261,7 +1262,7 @@ F:        include/linux/i2c/at24.h
 
 ATA OVER ETHERNET (AOE) DRIVER
 M:     "Ed L. Cashin" <ecashin@coraid.com>
-W:     http://www.coraid.com/support/linux
+W:     http://support.coraid.com/support/linux
 S:     Supported
 F:     Documentation/aoe/
 F:     drivers/block/aoe/
@@ -3112,6 +3113,7 @@ T:        git git://git.secretlab.ca/git/linux-2.6.git
 F:     Documentation/gpio.txt
 F:     drivers/gpio/
 F:     include/linux/gpio*
+F:     include/asm-generic/gpio.h
 
 GRE DEMULTIPLEXER DRIVER
 M:     Dmitry Kozlov <xeb@mail.ru>
@@ -5562,7 +5564,7 @@ S:        Maintained
 W:     http://linuxptp.sourceforge.net/
 F:     Documentation/ABI/testing/sysfs-ptp
 F:     Documentation/ptp/*
-F:     drivers/net/gianfar_ptp.c
+F:     drivers/net/ethernet/freescale/gianfar_ptp.c
 F:     drivers/net/phy/dp83640*
 F:     drivers/ptp/*
 F:     include/linux/ptp_cl*
index d5176b861e32c4278bbfa09b1ddfcc0fa090fe41..3210fc8835d4698f0f86c1c85c31a01a8681513d 100644 (file)
@@ -326,4 +326,7 @@ config HAVE_RCU_USER_QS
          are already protected inside rcu_irq_enter/rcu_irq_exit() but
          preemption or signal handling on irq exit still need to be protected.
 
+config HAVE_ARCH_TRANSPARENT_HUGEPAGE
+       bool
+
 source "kernel/gcov/Kconfig"
index 53649c7d0068f19e4b66b7cf8a6012ad5519f7b3..b51f7b4818cd07117308d645abfae4a91eeb0aa3 100644 (file)
@@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose,
                base = sparse ? hose->sparse_io_base : hose->dense_io_base;
 
        vma->vm_pgoff += base >> PAGE_SHIFT;
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
        return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                                  vma->vm_end - vma->vm_start,
index 7599e2625c7d021a9db22afbd5b4904e6b701b1d..2a5907b5c8d2272bcc366d01f6da6e5312d7d718 100644 (file)
@@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *mpnt;
-       struct prio_tree_iter iter;
        unsigned long offset;
        pgoff_t pgoff;
        int aliases = 0;
@@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
         * cache coherency.
         */
        flush_dcache_mmap_lock(mapping);
-       vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
                /*
                 * If this VMA is not in our MM, we can ignore it.
                 * Note that we intentionally mask out the VMA
index c3bd8345022780e45e3135228329a56dda91cee4..5dbf13f954f6f493aaae525d4f3b93282ac88f75 100644 (file)
@@ -336,6 +336,7 @@ retry:
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                        * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
                        goto retry;
                }
        }
index 40ca11ed6e5fbae9c54914db0123bf136e788481..1c8f7f56417598303cac08ca3baa8353361366ff 100644 (file)
@@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
 {
        struct mm_struct *mm = current->active_mm;
        struct vm_area_struct *mpnt;
-       struct prio_tree_iter iter;
        pgoff_t pgoff;
 
        /*
@@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
        pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
        flush_dcache_mmap_lock(mapping);
-       vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long offset;
 
                /*
index 3b3159b710d42de528b487bc9758cf04dfb9a7fa..e2c328739808f6d47651e09c226c6a709e929d96 100644 (file)
@@ -102,6 +102,7 @@ typedef struct user_fpu_struct elf_fpregset_t;
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX_32BIT)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX_32BIT | (current->personality & (~PER_MASK)))
 
 #endif /* __ASM_AVR32_ELF_H */
index b92e60958617eb25ff90481989f494c3625f5a98..b2f2d2d668491905dbbc37c385449852a9e7fdde 100644 (file)
@@ -152,6 +152,7 @@ good_area:
                        tsk->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would have
index e6c6812a9abd7a48311dd87122a90be43b709ce1..14bc98ff668fb37b48b6ef80855f5a6f057e35f7 100644 (file)
@@ -132,6 +132,7 @@ do {                                                                                        \
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif
index f4552db20b4a64f23733985c8bc98338270e5e54..32b997126adf76f00d5ab0fe9e08106d0856e6be 100644 (file)
@@ -77,7 +77,8 @@ do {                                                          \
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 /* C6X specific section types */
 #define SHT_C6000_UNWIND       0x70000001
index 8a3d8e2b33c1e41c384255ecb764cee17af4a3c4..8182f2dc89d04077c4a05fd302cf7fe46bf80996 100644 (file)
@@ -86,6 +86,7 @@ typedef unsigned long elf_fpregset_t;
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif
index 32567bc2a42145dead29f5778b9b0b88acd9bf36..ac12ae2b92863ad3388457c7dffbfb10191b3a8f 100644 (file)
@@ -133,12 +133,39 @@ static inline void writel(unsigned int b, volatile void __iomem *addr)
 #define insb(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,1,count) : 0)
 #define insw(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,2,count) : 0)
 #define insl(port,addr,count) (cris_iops ? cris_iops->read_io(port,addr,4,count) : 0)
-#define outb(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,1,1)
-#define outw(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,2,1)
-#define outl(data,port) if (cris_iops) cris_iops->write_io(port,(void*)(unsigned)data,4,1)
-#define outsb(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,1,count)
-#define outsw(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,2,count)
-#define outsl(port,addr,count) if(cris_iops) cris_iops->write_io(port,(void*)addr,3,count)
+static inline void outb(unsigned char data, unsigned int port)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *) &data, 1, 1);
+}
+static inline void outw(unsigned short data, unsigned int port)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *) &data, 2, 1);
+}
+static inline void outl(unsigned int data, unsigned int port)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *) &data, 4, 1);
+}
+static inline void outsb(unsigned int port, const void *addr,
+                        unsigned long count)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *)addr, 1, count);
+}
+static inline void outsw(unsigned int port, const void *addr,
+                        unsigned long count)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *)addr, 2, count);
+}
+static inline void outsl(unsigned int port, const void *addr,
+                        unsigned long count)
+{
+       if (cris_iops)
+               cris_iops->write_io(port, (void *)addr, 4, count);
+}
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
index 45fd542cf173df075c556d3fb1586d740298b426..73312ab6c696c160f7fd58df02a50890390ad650 100644 (file)
@@ -186,6 +186,7 @@ retry:
                        tsk->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would
index c3819804a74b104765c60a16051ffeea4063cbb1..9ccbc80f0b11fbcefcc0f2cdbfeb61eb88fbe81f 100644 (file)
@@ -137,6 +137,7 @@ do {                                                                                        \
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif
index 5fa3889d858bcbb5bc69b97fb6bd85ef3ca0471e..0b579927439d30e1fffc22cc37443df5da474736 100644 (file)
@@ -153,23 +153,22 @@ static int user_atoi(char __user *ubuf, size_t len)
 static int sysctl_pm_do_suspend(ctl_table *ctl, int write,
                                void __user *buffer, size_t *lenp, loff_t *fpos)
 {
-       int retval, mode;
+       int mode;
 
        if (*lenp <= 0)
                return -EIO;
 
        mode = user_atoi(buffer, *lenp);
-       if ((mode != 1) && (mode != 5))
-               return -EINVAL;
+       switch (mode) {
+       case 1:
+           return pm_do_suspend();
 
-       if (retval == 0) {
-               if (mode == 5)
-                   retval = pm_do_bus_sleep();
-               else
-                   retval = pm_do_suspend();
-       }
+       case 5:
+           return pm_do_bus_sleep();
 
-       return retval;
+       default:
+           return -EINVAL;
+       }
 }
 
 static int try_set_cmode(int new_cmode)
index c149d3b29eb6a1b990c44025137e2632574a3d88..23fff563fb987104c7429fdcea68be8f73ae3726 100644 (file)
@@ -3,6 +3,7 @@ config H8300
        default y
        select HAVE_IDE
        select HAVE_GENERIC_HARDIRQS
+       select GENERIC_ATOMIC64
        select ARCH_WANT_IPC_PARSE_VERSION
        select GENERIC_IRQ_SHOW
        select GENERIC_CPU_DEVICES
index c24fa250d6533cdb3a5429c102323ca19d356e0d..41193c396bffecd20b353ff558e40f4e9dff03ac 100644 (file)
@@ -54,7 +54,8 @@ typedef unsigned long elf_fpregset_t;
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #define R_H8_NONE       0
 #define R_H8_DIR32      1
index 37976a0d3650c60b6106b1ce8f0b8017ee93a3bd..82b499621e052f2a272cd137c81f23c84fe2ccf3 100644 (file)
@@ -217,7 +217,8 @@ do {                                        \
 #define ELF_PLATFORM  (NULL)
 
 #ifdef __KERNEL__
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 #endif
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
index 06695cc4fe58f09f61d73c0e28803de8b9dfbda3..513b74cb397eb4dffab84d240367f2e71d8a4263 100644 (file)
@@ -113,6 +113,7 @@ good_area:
                                current->min_flt++;
                        if (fault & VM_FAULT_RETRY) {
                                flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                               flags |= FAULT_FLAG_TRIED;
                                goto retry;
                        }
                }
index da55c63728e0e3f5d2b269d8c6bcb1607bddee57..94eaa5bd5d0c4fb33f4240676765d4e5fb92ec58 100644 (file)
@@ -77,4 +77,8 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #endif /* _ASM_IA64_HUGETLB_H */
index 3fa4bc536953c9494eabc776b1c77a980cb2e148..2eae90e0f272cc82c37b11dee7fe9ca7576f4dd7 100644 (file)
@@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
         */
        vma->vm_mm           = mm;
        vma->vm_file         = filp;
-       vma->vm_flags        = VM_READ| VM_MAYREAD |VM_RESERVED;
+       vma->vm_flags        = VM_READ | VM_MAYREAD | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 
        /*
index 8443daf4f515777ca3cd6d710bef4a92b6373608..6cf0341f978e59ddf235c44e70dcf615799390ed 100644 (file)
@@ -184,6 +184,7 @@ retry:
                        current->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                         /* No need to up_read(&mm->mmap_sem) as we would
                         * have already released it in __lock_page_or_retry
index 0eab454867a23d799cd2fc5798f616cf2131104f..082e383c1b6f22dc8370fdac789489acbb04ffba 100644 (file)
@@ -138,7 +138,8 @@ ia64_init_addr_space (void)
                        vma->vm_mm = current->mm;
                        vma->vm_end = PAGE_SIZE;
                        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
-                       vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
+                       vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO |
+                                       VM_DONTEXPAND | VM_DONTDUMP;
                        down_write(&current->mm->mmap_sem);
                        if (insert_vm_struct(current->mm, vma)) {
                                up_write(&current->mm->mmap_sem);
index b8da7d0574d20635f489315ea8caf957d063be08..70896161c636e50fbe299504f9a41be3fc8ae47d 100644 (file)
@@ -128,6 +128,7 @@ typedef elf_fpreg_t elf_fpregset_t;
    intent than poking at uname or /proc/cpuinfo.  */
 #define ELF_PLATFORM   (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif  /* _ASM_M32R__ELF_H */
index e9b7cda597440b5696360307cf0124ab569645c5..f83c1d0a87cf046c42e546bdc86827f8747d421e 100644 (file)
@@ -113,6 +113,7 @@ typedef struct user_m68kfp_struct elf_fpregset_t;
 
 #define ELF_PLATFORM  (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif
index aeebbb7b30f0b6ca7348740a6a374ef475066a82..a563727806bf922b5b3315559b9b46b5bbb9d7fc 100644 (file)
@@ -170,6 +170,7 @@ good_area:
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would
index 472d8bf726df41ae4e82d2b4ba320c3de3172ee9..42ac382a09daf3109fbcad4a2f7ba96ffd1a8de5 100644 (file)
@@ -22,5 +22,6 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 
        return res;
 }
+#define atomic_dec_if_positive atomic_dec_if_positive
 
 #endif /* _ASM_MICROBLAZE_ATOMIC_H */
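
Defining atomic_dec_if_positive to its own name looks like a no-op, but it lets generic code probe with #ifdef whether an architecture supplied its own implementation before providing a fallback (the same hunk appears for powerpc below, and x86's private copy is deleted further down in favor of a generic one). A non-atomic userspace stand-in showing only the header mechanics:

#include <stdio.h>

/* "Arch" version; the define-to-self marks it as present. */
static inline int atomic_dec_if_positive(int *v)
{
        if (*v > 0)
                return --(*v);
        return *v - 1;          /* old value minus one, store unchanged */
}
#define atomic_dec_if_positive atomic_dec_if_positive

/* "Generic" fallback; skipped here because the macro is defined. */
#ifndef atomic_dec_if_positive
static inline int atomic_dec_if_positive(int *v)
{
        return *v > 0 ? --(*v) : *v - 1;
}
#endif

int main(void)
{
        int v = 1, r;

        r = atomic_dec_if_positive(&v);
        printf("%d (v=%d)\n", r, v);    /* 0 (v=0): decremented */
        r = atomic_dec_if_positive(&v);
        printf("%d (v=%d)\n", r, v);    /* -1 (v=0): left alone */
        return 0;
}
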
index 834849f59ae8a5474e80d2e13cd4a3f0a9e2976f..640ddd4b6a9b2cf6bc58e159a46bf9490069e188 100644 (file)
@@ -116,7 +116,8 @@ do {                                                        \
 } while (0)
 
 #ifdef __KERNEL__
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX_32BIT)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX_32BIT | (current->personality & (~PER_MASK)))
 #endif
 
 #endif /* __uClinux__ */
index eb365d6795fa80448fbe4fe391b8b3211c0f4156..714b35a9c4f7b156ac59476debad57426988f849 100644 (file)
@@ -233,6 +233,7 @@ good_area:
                        current->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would
index 58d36889f09b3c7e6af95106d1d6fd22c19f4f0d..bd94946a18f343da04b48aba1a13b83f6e724c18 100644 (file)
@@ -112,4 +112,8 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #endif /* __ASM_HUGETLB_H */
index 7a19957735e96056732b00f5ef77f9e658a1d333..ddcec1e1a0cd256df9db44404686a42a31e58947 100644 (file)
@@ -171,6 +171,7 @@ good_area:
                }
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would
index 33188b6e81e4509d060aab524de1d9ac5a54d67d..a3d0fef3b126ba859e5fe0459b1e12658cbebe69 100644 (file)
@@ -26,7 +26,7 @@ CHECKFLAGS    +=
 PROCESSOR      := unset
 UNIT           := unset
 
-KBUILD_CFLAGS  += -mam33 -mmem-funcs -DCPU=AM33
+KBUILD_CFLAGS  += -mam33 -DCPU=AM33 $(call cc-option,-mmem-funcs,)
 KBUILD_AFLAGS  += -mam33 -DCPU=AM33
 
 ifeq ($(CONFIG_MN10300_CURRENT_IN_E2),y)
index 8157c9267f426ac7dabdd6f0dcef9a67bd2ff5a3..4ebd6b3a0a1ebf7bd5aebdd8dc42dac5460fc64a 100644 (file)
@@ -151,7 +151,8 @@ do {                                                \
 #define ELF_PLATFORM  (NULL)
 
 #ifdef __KERNEL__
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 #endif
 
 #endif /* _ASM_ELF_H */
index a8fe2c513070389cdf46c03155123a4ea5531fda..225a7ff320ad565329b7e399c8ce80c9f8cf963c 100644 (file)
@@ -110,7 +110,8 @@ extern void dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt);
 
 #define ELF_PLATFORM   (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif /* __KERNEL__ */
 #endif
index 40f850e9766c78a0d790a040937b2d053b8ac4bd..e2bfafce66c53661064e2cf5e4b0d5d036e2c6a7 100644 (file)
@@ -183,6 +183,7 @@ good_area:
                        tsk->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                         /* No need to up_read(&mm->mmap_sem) as we would
                         * have already released it in __lock_page_or_retry
index 9d181890a7e3cbe9b28238be7bee8e7c1c2e3e36..48e16dc20102ddb4b754850af53b4968ff991162 100644 (file)
@@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
        struct vm_area_struct *mpnt;
-       struct prio_tree_iter iter;
        unsigned long offset;
        unsigned long addr, old_addr = 0;
        pgoff_t pgoff;
@@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page)
         * to flush one address here for them all to become coherent */
 
        flush_dcache_mmap_lock(mapping);
-       vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
                offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
                addr = mpnt->vm_start + offset;
 
index da29032ae38fd312a8d9bc744a5450a3363c4763..e3b1d41c89be73425595b364370c096206a6a843 100644 (file)
@@ -268,6 +268,7 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v)
 
        return t;
 }
+#define atomic_dec_if_positive atomic_dec_if_positive
 
 #define smp_mb__before_atomic_dec()     smp_mb()
 #define smp_mb__after_atomic_dec()      smp_mb()
index dfdb95bc59a50a9f89267112dfec3c1b0c10abb2..62e11a32c4c2bfcf4bcbff390a29171b68572e99 100644 (file)
@@ -151,6 +151,10 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #else /* ! CONFIG_HUGETLB_PAGE */
 static inline void flush_hugetlb_page(struct vm_area_struct *vma,
                                      unsigned long vmaddr)
index 38c7f1bc3495c7b7d19f92e8d5b04434ea07131b..4ce9ed87ab5361361944e001f377730fa0b9d832 100644 (file)
@@ -1198,7 +1198,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = {
 
 static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
 {
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &kvm_rma_vm_ops;
        return 0;
 }
index 995f924e007f8f01fa4fe6eaba2b83006d66d81d..183c787e862ddb29769605d9e883463060ec7f07 100644 (file)
@@ -451,6 +451,7 @@ good_area:
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
                        goto retry;
                }
        }
index 642fca137ccb1489c593699b21115ed37c542883..28f1af2db1f564525dc374f3b7cf9eee0b5a1175 100644 (file)
@@ -304,7 +304,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
        return cookie;
 }
 
-/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+/* Look up the dcookie for the task's mm->exe_file,
  * which corresponds loosely to "application name". Also, determine
  * the offset for the SPU ELF object.  If computed offset is
  * non-zero, it implies an embedded SPU object; otherwise, it's a
@@ -321,7 +321,6 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
 {
        unsigned long app_cookie = 0;
        unsigned int my_offset = 0;
-       struct file *app = NULL;
        struct vm_area_struct *vma;
        struct mm_struct *mm = spu->mm;
 
@@ -330,16 +329,10 @@ get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
 
        down_read(&mm->mmap_sem);
 
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (!vma->vm_file)
-                       continue;
-               if (!(vma->vm_flags & VM_EXECUTABLE))
-                       continue;
-               app_cookie = fast_get_dcookie(&vma->vm_file->f_path);
+       if (mm->exe_file) {
+               app_cookie = fast_get_dcookie(&mm->exe_file->f_path);
                pr_debug("got dcookie for %s\n",
-                        vma->vm_file->f_dentry->d_name.name);
-               app = vma->vm_file;
-               break;
+                        mm->exe_file->f_dentry->d_name.name);
        }
 
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
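
This hunk, like the analogous one in tile's arch_setup_additional_pages() below, replaces a linear scan for the VM_EXECUTABLE vma with the mm->exe_file pointer that exec now maintains directly. A userspace sketch of the simplification, with stand-in struct fields:

#include <stdio.h>
#include <stddef.h>

struct vma { int is_exe; const char *name; struct vma *next; };
struct mm  { struct vma *mmap; const char *exe_file; };

static const char *find_by_scan(struct mm *mm)
{
        struct vma *v;

        for (v = mm->mmap; v; v = v->next)
                if (v->is_exe)
                        return v->name;
        return NULL;
}

int main(void)
{
        struct vma text = { 1, "/bin/app", NULL };
        struct mm mm = { &text, "/bin/app" };

        /* Old way walks the list; new way just reads the cached field. */
        printf("%s %s\n", find_by_scan(&mm), mm.exe_file);
        return 0;
}
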
index 2d6e6e3805641bbc79ce1ee11aa639f48f5e2933..593753ee07f3c55126f5910f4fa4d4992d6a14de 100644 (file)
@@ -33,6 +33,7 @@ static inline int prepare_hugepage_range(struct file *file,
 }
 
 #define hugetlb_prefault_arch_hook(mm)         do { } while (0)
+#define arch_clear_hugepage_flags(page)                do { } while (0)
 
 int arch_prepare_hugepage(struct page *page);
 void arch_release_hugepage(struct page *page);
@@ -77,23 +78,6 @@ static inline void __pmd_csp(pmd_t *pmdp)
                "       csp %1,%3"
                : "=m" (*pmdp)
                : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
-       pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
-}
-
-static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
-{
-       unsigned long sto = (unsigned long) pmdp -
-                               pmd_index(address) * sizeof(pmd_t);
-
-       if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
-               asm volatile(
-                       "       .insn   rrf,0xb98e0000,%2,%3,0,0"
-                       : "=m" (*pmdp)
-                       : "m" (*pmdp), "a" (sto),
-                         "a" ((address & HPAGE_MASK))
-               );
-       }
-       pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
 }
 
 static inline void huge_ptep_invalidate(struct mm_struct *mm,
@@ -105,6 +89,7 @@ static inline void huge_ptep_invalidate(struct mm_struct *mm,
                __pmd_idte(address, pmdp);
        else
                __pmd_csp(pmdp);
+       pmd_val(*pmdp) = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY;
 }
 
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
index 6bd7d74830171096c82ac74f5dc671dbb7098090..2fdab82e70d9591202f2b8024f9c6e2a700f990e 100644 (file)
@@ -347,6 +347,12 @@ extern struct page *vmemmap;
 
 #define _SEGMENT_ENTRY_LARGE   0x400   /* STE-format control, large page   */
 #define _SEGMENT_ENTRY_CO      0x100   /* change-recording override   */
+#define _SEGMENT_ENTRY_SPLIT_BIT 0     /* THP splitting bit number */
+#define _SEGMENT_ENTRY_SPLIT   (1UL << _SEGMENT_ENTRY_SPLIT_BIT)
+
+/* Set of bits not changed in pmd_modify */
+#define _SEGMENT_CHG_MASK      (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \
+                                | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
 
 /* Page status table bits for virtualization */
 #define RCP_ACC_BITS   0xf000000000000000UL
@@ -506,6 +512,30 @@ static inline int pmd_bad(pmd_t pmd)
        return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY;
 }
 
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+                                unsigned long addr, pmd_t *pmdp);
+
+#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp,
+                                pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+       return (pmd_val(pmd) & _SEGMENT_ENTRY_RO) == 0;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+       return 0;
+}
+
 static inline int pte_none(pte_t pte)
 {
        return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT);
@@ -1159,6 +1189,184 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
 #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_unmap(pte) do { } while (0)
 
+static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
+{
+       unsigned long sto = (unsigned long) pmdp -
+                           pmd_index(address) * sizeof(pmd_t);
+
+       if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
+               asm volatile(
+                       "       .insn   rrf,0xb98e0000,%2,%3,0,0"
+                       : "=m" (*pmdp)
+                       : "m" (*pmdp), "a" (sto),
+                         "a" ((address & HPAGE_MASK))
+                       : "cc"
+               );
+       }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+       return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT;
+}
+
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                             pmd_t *pmdp, pmd_t entry)
+{
+       *pmdp = entry;
+}
+
+static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
+{
+       unsigned long pgprot_pmd = 0;
+
+       if (pgprot_val(pgprot) & _PAGE_INVALID) {
+               if (pgprot_val(pgprot) & _PAGE_SWT)
+                       pgprot_pmd |= _HPAGE_TYPE_NONE;
+               pgprot_pmd |= _SEGMENT_ENTRY_INV;
+       }
+       if (pgprot_val(pgprot) & _PAGE_RO)
+               pgprot_pmd |= _SEGMENT_ENTRY_RO;
+       return pgprot_pmd;
+}
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       pmd_val(pmd) &= _SEGMENT_CHG_MASK;
+       pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+       return pmd;
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+       pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+       return pmd;
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+       pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
+       return pmd;
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+       pmd_val(pmd) |= _SEGMENT_ENTRY_RO;
+       return pmd;
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+       /* No dirty bit in the segment table entry. */
+       return pmd;
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+       /* No referenced bit in the segment table entry. */
+       return pmd;
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+       /* No referenced bit in the segment table entry. */
+       return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                           unsigned long address, pmd_t *pmdp)
+{
+       unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK;
+       long tmp, rc;
+       int counter;
+
+       rc = 0;
+       if (MACHINE_HAS_RRBM) {
+               counter = PTRS_PER_PTE >> 6;
+               asm volatile(
+                       "0:     .insn   rre,0xb9ae0000,%0,%3\n" /* rrbm */
+                       "       ogr     %1,%0\n"
+                       "       la      %3,0(%4,%3)\n"
+                       "       brct    %2,0b\n"
+                       : "=d" (tmp), "+d" (rc), "+d" (counter), "+a" (pmd_addr)
+                       : "a" (64 * 4096UL) : "cc");
+               rc = !!rc;
+       } else {
+               counter = PTRS_PER_PTE;
+               asm volatile(
+                       "0:     rrbe    0,%2\n"
+                       "       la      %2,0(%3,%2)\n"
+                       "       brc     12,1f\n"
+                       "       lhi     %0,1\n"
+                       "1:     brct    %1,0b\n"
+                       : "+d" (rc), "+d" (counter), "+a" (pmd_addr)
+                       : "a" (4096UL) : "cc");
+       }
+       return rc;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                                      unsigned long address, pmd_t *pmdp)
+{
+       pmd_t pmd = *pmdp;
+
+       __pmd_idte(address, pmdp);
+       pmd_clear(pmdp);
+       return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
+static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+                                    unsigned long address, pmd_t *pmdp)
+{
+       return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+}
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+static inline void pmdp_invalidate(struct vm_area_struct *vma,
+                                  unsigned long address, pmd_t *pmdp)
+{
+       __pmd_idte(address, pmdp);
+}
+
+static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
+{
+       pmd_t __pmd;
+       pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
+       return __pmd;
+}
+
+#define pfn_pmd(pfn, pgprot)   mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
+#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+       return MACHINE_HAS_HPAGE ? 1 : 0;
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+       if (pmd_trans_huge(pmd))
+               return pmd_val(pmd) >> HPAGE_SHIFT;
+       else
+               return pmd_val(pmd) >> PAGE_SHIFT;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * 31 bit swap entry format:
  * A page-table entry has some bits we have to treat in a special way.
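
s390 segment-table entries carry no dirty or referenced bits, which is why pmd_mkdirty(), pmd_mkold() and pmd_mkyoung() above are no-ops and write protection rides entirely on the RO bit. A sketch of that protocol with a stand-in bit value:

#include <stdio.h>

#define SEG_RO 0x200UL          /* stand-in for _SEGMENT_ENTRY_RO */

typedef struct { unsigned long val; } pmd_t;

static int   pmd_write(pmd_t p)     { return (p.val & SEG_RO) == 0; }
static pmd_t pmd_wrprotect(pmd_t p) { p.val |= SEG_RO;  return p; }
static pmd_t pmd_mkwrite(pmd_t p)   { p.val &= ~SEG_RO; return p; }

int main(void)
{
        pmd_t pmd = { 0 };

        pmd = pmd_wrprotect(pmd);
        printf("writable after wrprotect: %d\n", pmd_write(pmd)); /* 0 */
        pmd = pmd_mkwrite(pmd);
        printf("writable after mkwrite:   %d\n", pmd_write(pmd)); /* 1 */
        return 0;
}
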
index 87b47ca954f1c1ceec09ee083427eae697166242..8cfd731a18d86260dfbf0cb025cc24b19ff997a4 100644 (file)
@@ -81,6 +81,7 @@ extern unsigned int s390_user_mode;
 #define MACHINE_FLAG_SPP       (1UL << 13)
 #define MACHINE_FLAG_TOPOLOGY  (1UL << 14)
 #define MACHINE_FLAG_TE                (1UL << 15)
+#define MACHINE_FLAG_RRBM      (1UL << 16)
 
 #define MACHINE_IS_VM          (S390_lowcore.machine_flags & MACHINE_FLAG_VM)
 #define MACHINE_IS_KVM         (S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
@@ -99,7 +100,8 @@ extern unsigned int s390_user_mode;
 #define MACHINE_HAS_PFMF       (0)
 #define MACHINE_HAS_SPP                (0)
 #define MACHINE_HAS_TOPOLOGY   (0)
-#define MACHINE_HAS_TE               (0)
+#define MACHINE_HAS_TE         (0)
+#define MACHINE_HAS_RRBM       (0)
 #else /* CONFIG_64BIT */
 #define MACHINE_HAS_IEEE       (1)
 #define MACHINE_HAS_CSP                (1)
@@ -112,6 +114,7 @@ extern unsigned int s390_user_mode;
 #define MACHINE_HAS_SPP                (S390_lowcore.machine_flags & MACHINE_FLAG_SPP)
 #define MACHINE_HAS_TOPOLOGY   (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY)
 #define MACHINE_HAS_TE         (S390_lowcore.machine_flags & MACHINE_FLAG_TE)
+#define MACHINE_HAS_RRBM       (S390_lowcore.machine_flags & MACHINE_FLAG_RRBM)
 #endif /* CONFIG_64BIT */
 
 #define ZFCPDUMP_HSA_SIZE      (32UL<<20)
index 06e5acbc84bd50ef4917eabb5f6fc8cd2d6f6b22..b75d7d686684975278e1881a9aaf74590a6949ed 100644 (file)
@@ -137,6 +137,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #define tlb_start_vma(tlb, vma)                        do { } while (0)
 #define tlb_end_vma(tlb, vma)                  do { } while (0)
 #define tlb_remove_tlb_entry(tlb, ptep, addr)  do { } while (0)
+#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr)      do { } while (0)
 #define tlb_migrate_finish(mm)                 do { } while (0)
 
 #endif /* _S390_TLB_H */
index 7f4717675c199ce53ad0a1580bcc25c83dd9442c..00d11444506882d9cb771dd0cd2874902c549727 100644 (file)
@@ -388,6 +388,8 @@ static __init void detect_machine_facilities(void)
                S390_lowcore.machine_flags |= MACHINE_FLAG_SPP;
        if (test_facility(50) && test_facility(73))
                S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
+       if (test_facility(66))
+               S390_lowcore.machine_flags |= MACHINE_FLAG_RRBM;
 #endif
 }
 
index ac9122ca11529217e3843483042306a7a09a74cb..04ad4001a289e05c89d30b5d6aaad9573b2ca388 100644 (file)
@@ -367,6 +367,7 @@ retry:
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
                        down_read(&mm->mmap_sem);
                        goto retry;
                }
index eeaf8023851f41e5c3516a7194cc3b7dbf3cba4a..60acb93a46809065b0d454e65f3837aa37e70aee 100644 (file)
@@ -115,7 +115,16 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                pmd = *pmdp;
                barrier();
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd))
+               /*
+                * The pmd_trans_splitting() check below explains why
+                * pmdp_splitting_flush() has to serialize with
+                * smp_call_function() against our disabled IRQs, to stop
+                * this gup-fast code from running while we set the
+                * splitting bit in the pmd. Returning zero will take
+                * the slow path that will call wait_split_huge_page()
+                * if the pmd is still in splitting state.
+                */
+               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;
                if (unlikely(pmd_huge(pmd))) {
                        if (!gup_huge_pmd(pmdp, pmd, addr, next,
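
Returning 0 is safe here because gup-fast is only an opportunistic front end: the caller falls back to the sleeping slow path, which takes the locks and, via wait_split_huge_page(), does not complete until the split is over. A trivial sketch of that fast/slow contract (function names are placeholders):

#include <stdio.h>

/* Fast path: IRQs disabled, no locks; bail out on anything ambiguous. */
static int fast_lookup(int pmd_splitting)
{
        if (pmd_splitting)
                return 0;       /* punt: can't inspect a splitting pmd */
        return 1;
}

/* Slow path: may sleep until the split has finished. */
static int slow_lookup(void)
{
        return 1;
}

int main(void)
{
        int got = fast_lookup(1);

        if (!got)               /* fast path declined */
                got = slow_lookup();
        printf("pages pinned: %d\n", got);
        return 0;
}
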
index b402991e43d71614cc111918a6b03a4af8525b2e..c8188a18af05b565e57304b02e153ed4629940d5 100644 (file)
@@ -787,6 +787,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
                tlb_table_flush(tlb);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void thp_split_vma(struct vm_area_struct *vma)
+{
+       unsigned long addr;
+       struct page *page;
+
+       for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+               /* The returned page is unused: FOLL_SPLIT makes
+                * follow_page() split any huge page as a side effect. */
+               page = follow_page(vma, addr, FOLL_SPLIT);
+       }
+}
+
+void thp_split_mm(struct mm_struct *mm)
+{
+       struct vm_area_struct *vma = mm->mmap;
+
+       while (vma != NULL) {
+               thp_split_vma(vma);
+               vma->vm_flags &= ~VM_HUGEPAGE;
+               vma->vm_flags |= VM_NOHUGEPAGE;
+               vma = vma->vm_next;
+       }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
@@ -824,6 +848,12 @@ int s390_enable_sie(void)
        if (!mm)
                return -ENOMEM;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* split thp mappings and disable thp for future mappings */
+       thp_split_mm(mm);
+       mm->def_flags |= VM_NOHUGEPAGE;
+#endif
+
        /* Now lets check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
@@ -866,3 +896,81 @@ bool kernel_page_present(struct page *page)
        return cc == 0;
 }
 #endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp)
+{
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       /* No need to flush the TLB: on s390 the referenced bits live
+        * in the storage key, never in the TLB. */
+       return pmdp_test_and_clear_young(vma, address, pmdp);
+}
+
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp,
+                         pmd_t entry, int dirty)
+{
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       if (pmd_same(*pmdp, entry))
+               return 0;
+       pmdp_invalidate(vma, address, pmdp);
+       set_pmd_at(vma->vm_mm, address, pmdp, entry);
+       return 1;
+}
+
+static void pmdp_splitting_flush_sync(void *arg)
+{
+       /* Simply deliver the interrupt */
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+                         pmd_t *pmdp)
+{
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
+                             (unsigned long *) pmdp)) {
+               /* need to serialize against gup-fast (IRQ disabled) */
+               smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
+       }
+}
+
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+{
+       struct list_head *lh = (struct list_head *) pgtable;
+
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       if (!mm->pmd_huge_pte)
+               INIT_LIST_HEAD(lh);
+       else
+               list_add(lh, (struct list_head *) mm->pmd_huge_pte);
+       mm->pmd_huge_pte = pgtable;
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+{
+       struct list_head *lh;
+       pgtable_t pgtable;
+       pte_t *ptep;
+
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       pgtable = mm->pmd_huge_pte;
+       lh = (struct list_head *) pgtable;
+       if (list_empty(lh))
+               mm->pmd_huge_pte = NULL;
+       else {
+               mm->pmd_huge_pte = (pgtable_t) lh->next;
+               list_del(lh);
+       }
+       ptep = (pte_t *) pgtable;
+       pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+       ptep++;
+       pte_val(*ptep) = _PAGE_TYPE_EMPTY;
+       return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
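
pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw() above stash preallocated page tables by embedding a list head in the currently unused page-table memory itself: the most recently deposited table is the anchor, later deposits chain through it, and withdraw unhooks the current anchor and advances. A self-contained userspace sketch of those list mechanics (simplified list primitives, not <linux/list.h>):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void init_list(struct list_head *h) { h->next = h->prev = h; }

static void list_add_(struct list_head *n, struct list_head *h)
{
        n->next = h->next; n->prev = h;
        h->next->prev = n; h->next = n;
}

static void list_del_(struct list_head *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
}

static int list_empty_(const struct list_head *h) { return h->next == h; }

static void *pmd_huge_pte;      /* per-mm anchor in the kernel */

static void deposit(void *pgtable)
{
        struct list_head *lh = pgtable;

        if (!pmd_huge_pte)
                init_list(lh);          /* first table anchors the list */
        else
                list_add_(lh, pmd_huge_pte);
        pmd_huge_pte = pgtable;
}

static void *withdraw(void)
{
        struct list_head *lh = pmd_huge_pte;

        if (!lh)
                return NULL;
        if (list_empty_(lh))
                pmd_huge_pte = NULL;    /* that was the last one */
        else {
                pmd_huge_pte = lh->next;
                list_del_(lh);
        }
        return lh;
}

int main(void)
{
        static struct list_head tables[2];  /* stand-ins for page tables */
        void *a, *b, *c;

        deposit(&tables[0]);
        deposit(&tables[1]);
        a = withdraw();
        b = withdraw();
        c = withdraw();
        printf("%d %d %d\n", a == (void *)&tables[1],
               b == (void *)&tables[0], c == NULL);     /* 1 1 1 */
        return 0;
}
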
index e2c8db4533dc3be6cf1d88ba6a7c3863460955c7..fbf9db5e60f90bbbdf43130c5d9f641d0f870698 100644 (file)
@@ -6,12 +6,13 @@ config SCORE
        select GENERIC_IRQ_SHOW
        select GENERIC_IOMAP
        select HAVE_MEMBLOCK
+       select GENERIC_ATOMIC64
        select HAVE_MEMBLOCK_NODE_MAP
        select ARCH_DISCARD_MEMBLOCK
        select GENERIC_CPU_DEVICES
        select GENERIC_CLOCKEVENTS
        select HAVE_MOD_ARCH_SPECIFIC
-       select MODULES_USE_ELF_REL
+       select MODULES_USE_ELF_REL
 
 choice
        prompt "System type"
index f478ce94181fb349324bf836e9fe5a01abfaa13d..5d566c7a0af26c5cf65f7b5d37fc57c0f6f93e8a 100644 (file)
@@ -54,7 +54,7 @@ typedef elf_fpreg_t   elf_fpregset_t;
 
 #define SET_PERSONALITY(ex)                                    \
 do {                                                           \
-       set_personality(PER_LINUX);                             \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK))); \
 } while (0)
 
 struct task_struct;
index f38112be67d29555eb76042c57598e34c2b2e8a4..37924afa8d8a26781a2e8ebf4952144b141f9295 100644 (file)
@@ -183,7 +183,8 @@ do {                                                                        \
 } while (0)
 #endif
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX_32BIT)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX_32BIT | (current->personality & (~PER_MASK)))
 
 #ifdef CONFIG_VSYSCALL
 /* vDSO has arch_setup_additional_pages */
index 967068fb79ac483959ccd13436e1c14601133308..b3808c7d67b23622b6ccaf5729c0d8603b7bd523 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _ASM_SH_HUGETLB_H
 #define _ASM_SH_HUGETLB_H
 
+#include <asm/cacheflush.h>
 #include <asm/page.h>
 
 
@@ -89,4 +90,9 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+       clear_bit(PG_dcache_clean, &page->flags);
+}
+
 #endif /* _ASM_SH_HUGETLB_H */
index 3bdc1ad9a341f4418a1724dfcd8cd3ac4388a011..cbbdcad8fcb357cc7fc383820a6f0c7dc1537be9 100644 (file)
@@ -504,6 +504,7 @@ good_area:
                }
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /*
                         * No need to up_read(&mm->mmap_sem) as we would
index 2d4d755cba9ebcc67d8420cb927f66a98e00de74..ac74a2c98e6dde55417db847f9f103b2e8771457 100644 (file)
@@ -128,6 +128,7 @@ typedef struct {
 
 #define ELF_PLATFORM   (NULL)
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK)))
 
 #endif /* !(__ASMSPARC_ELF_H) */
index 177061064ee602ed4dc0e8e956a4b682695c79f0..e7927c9758a19eb6bb27fb08944ac92bdf096f2b 100644 (file)
@@ -82,4 +82,8 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #endif /* _ASM_SPARC64_HUGETLB_H */
index 065b88c4f868a67da15d6025f24ff3df453372dc..1afda827d1ba5282b3c29030e6e3374ccecb57b4 100644 (file)
@@ -783,7 +783,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev,
 static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma,
                                            enum pci_mmap_state mmap_state)
 {
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 }
 
 /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
index 77ac917be15295f787977715ea8f63c43bbb1697..e98bfda205a2beb97bcaa49a27d5a9412bf03dc3 100644 (file)
@@ -265,6 +265,7 @@ good_area:
                }
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /* No need to up_read(&mm->mmap_sem) as we would
                         * have already released it in __lock_page_or_retry
index 1fe0429b6314257faa40d80b07f7d4e3b77538c6..413d292633046a62d51b342b6de85bc04125affd 100644 (file)
@@ -452,6 +452,7 @@ good_area:
                }
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                        /* No need to up_read(&mm->mmap_sem) as we would
                         * have already released it in __lock_page_or_retry
index d16d006d660e20307783eb63695c1faf5b46af46..f8ccf08f6934704924f6081476d61d85e28b181f 100644 (file)
@@ -156,12 +156,12 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #undef SET_PERSONALITY
 #define SET_PERSONALITY(ex) \
 do { \
-       current->personality = PER_LINUX; \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK))); \
        current_thread_info()->status &= ~TS_COMPAT; \
 } while (0)
 #define COMPAT_SET_PERSONALITY(ex) \
 do { \
-       current->personality = PER_LINUX_32BIT; \
+       set_personality(PER_LINUX | (current->personality & (~PER_MASK))); \
        current_thread_info()->status |= TS_COMPAT; \
 } while (0)
 
index b2042380a5aab580011e54c436fc24ccc02e9366..0f885af2b62157f12b025f91f694eaa8592c3d49 100644 (file)
@@ -106,6 +106,10 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #ifdef CONFIG_HUGETLB_SUPER_PAGES
 static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
                                       struct page *page, int writable)
index 758b6038c2b78eabe41627ba5b018e289a0d677e..3cfa98bf9125215ae2fc7ea8fdc287fd84245285 100644 (file)
@@ -36,19 +36,14 @@ static void sim_notify_exec(const char *binary_name)
        } while (c);
 }
 
-static int notify_exec(void)
+static int notify_exec(struct mm_struct *mm)
 {
        int retval = 0;  /* failure */
-       struct vm_area_struct *vma = current->mm->mmap;
-       while (vma) {
-               if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
-                       break;
-               vma = vma->vm_next;
-       }
-       if (vma) {
+
+       if (mm->exe_file) {
                char *buf = (char *) __get_free_page(GFP_KERNEL);
                if (buf) {
-                       char *path = d_path(&vma->vm_file->f_path,
+                       char *path = d_path(&mm->exe_file->f_path,
                                            buf, PAGE_SIZE);
                        if (!IS_ERR(path)) {
                                sim_notify_exec(path);
@@ -106,16 +101,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
        unsigned long vdso_base;
        int retval = 0;
 
+       down_write(&mm->mmap_sem);
+
        /*
         * Notify the simulator that an exec just occurred.
         * If we can't find the filename of the mapping, just use
         * whatever was passed as the linux_binprm filename.
         */
-       if (!notify_exec())
+       if (!notify_exec(mm))
                sim_notify_exec(bprm->filename);
 
-       down_write(&mm->mmap_sem);
-
        /*
         * MAYWRITE to allow gdb to COW and set breakpoints
         */
index 84ce7abbf5afb80c5d82648244349bf7d746a9c0..fe811fa5f1b96f682633e2c54f6be6bbdd525095 100644 (file)
@@ -454,6 +454,7 @@ good_area:
                        tsk->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                         /*
                          * No need to up_read(&mm->mmap_sem) as we would
index 0353b98ae35a28208c695816bd9b7883ad6fb22e..0f00e9c82080f644030191cb31bc7ae3e0937f49 100644 (file)
@@ -89,6 +89,7 @@ good_area:
                                current->min_flt++;
                        if (fault & VM_FAULT_RETRY) {
                                flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                               flags |= FAULT_FLAG_TRIED;
 
                                goto retry;
                        }
index 5d53ffd5736ff47c82f1c60b31f35d4f7a930233..957f9686fcf5f701982996a87483c7060521cc2a 100644 (file)
@@ -6,6 +6,7 @@ config UNICORE32
        select HAVE_DMA_ATTRS
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_BZIP2
+       select GENERIC_ATOMIC64
        select HAVE_KERNEL_LZO
        select HAVE_KERNEL_LZMA
        select ARCH_HAVE_CUSTOM_GPIO_H
index b6f0458c3143187ced10fdde2eb4286548c869b8..b008586dad753a1b29010735b95ee71e1a704900 100644 (file)
@@ -380,7 +380,7 @@ int vectors_user_mapping(void)
        return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
                                       VM_READ | VM_EXEC |
                                       VM_MAYREAD | VM_MAYEXEC |
-                                      VM_RESERVED,
+                                      VM_DONTEXPAND | VM_DONTDUMP,
                                       NULL);
 }
 
index 0c399cd26db3f0f628415116cc88dbb3bdce1994..b98d9fe6c3cc78918bcaac7b704f6ff1575170a4 100644 (file)
@@ -85,6 +85,7 @@ config X86
        select IRQ_FORCED_THREADING
        select USE_GENERIC_SMP_HELPERS if SMP
        select HAVE_BPF_JIT if X86_64
+       select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select CLKEVT_I8253
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select GENERIC_IOMAP
index 58cb6d4085f70739b24ca800a4d917ed7d214300..2faac4342e4d7ee94f1f7d75dcc84d28b026a160 100644 (file)
@@ -240,30 +240,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
        return c;
 }
 
-
-/*
- * atomic_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-static inline int atomic_dec_if_positive(atomic_t *v)
-{
-       int c, old, dec;
-       c = atomic_read(v);
-       for (;;) {
-               dec = c - 1;
-               if (unlikely(dec < 0))
-                       break;
-               old = atomic_cmpxchg((v), c, dec);
-               if (likely(old == c))
-                       break;
-               c = old;
-       }
-       return dec;
-}
-
 /**
  * atomic_inc_short - increment of a short integer
  * @v: pointer to type int
index 439a9acc132d10f77b469fc24b250f3acc2f51d9..bdd35dbd0605b9c39d20a4111d20cc89f7053a7b 100644 (file)
@@ -90,4 +90,8 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
 #endif /* _ASM_X86_HUGETLB_H */
index b2297e58c6ed27c418fe9d5e9f00bf1e0923c833..c27c0c6fd1df8b2cd2249e7311db1d8f6790f42e 100644 (file)
@@ -691,6 +691,11 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
+#ifdef CONFIG_ACPI_NUMA
+       set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+       numa_clear_node(cpu);
+#endif
+
        per_cpu(x86_cpu_to_apicid, cpu) = -1;
        set_cpu_present(cpu, false);
        num_processors--;
index 7dde46d68a25a562b54fcdd300f232b051c10153..d57ca71e124508bcc933cb7a51783aa0190bada5 100644 (file)
@@ -1202,6 +1202,7 @@ good_area:
                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
                         * of starvation. */
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
                        goto retry;
                }
        }
index b91e48512425f6f210e9406cbfc666395bad6ad1..937bff5cdaa79a54f5a15174f1717259b79c5291 100644 (file)
@@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
-       struct prio_tree_iter iter;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;
@@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
                return (pte_t *)pmd_alloc(mm, pud, addr);
 
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;
 
index 2d125be1bae9f2c229b835b8ea1421e685c38879..21d02f0d7a2c4a563722e0ecc266d4fb433029b1 100644 (file)
@@ -97,8 +97,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 #endif
        per_cpu(x86_cpu_to_node_map, cpu) = node;
 
-       if (node != NUMA_NO_NODE)
-               set_cpu_numa_node(cpu, node);
+       set_cpu_numa_node(cpu, node);
 }
 
 void __cpuinit numa_clear_node(int cpu)
index 3d68ef6d2266cb66b3d07c578191b80c5348e0e2..0eb572eda4060338543bd0dfc97cf05bccaeb6cb 100644 (file)
@@ -664,20 +664,20 @@ static void free_pfn_range(u64 paddr, unsigned long size)
 }
 
 /*
- * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
  * copied through copy_page_range().
  *
  * If the vma has a linear pfn mapping for the entire range, we get the prot
  * from pte and reserve the entire vma range with single reserve_pfn_range call.
  */
-int track_pfn_vma_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *vma)
 {
        resource_size_t paddr;
        unsigned long prot;
        unsigned long vma_size = vma->vm_end - vma->vm_start;
        pgprot_t pgprot;
 
-       if (is_linear_pfn_mapping(vma)) {
+       if (vma->vm_flags & VM_PAT) {
                /*
                 * reserve the whole chunk covered by vma. We need the
                 * starting address and protection from pte.
@@ -694,31 +694,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 }
 
 /*
- * track_pfn_vma_new is called when a _new_ pfn mapping is being established
- * for physical range indicated by pfn and size.
- *
  * prot is passed in as a parameter for the new mapping. If the vma has a
  * linear pfn mapping for the entire range reserve the entire vma range with
  * single reserve_pfn_range call.
  */
-int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
-                       unsigned long pfn, unsigned long size)
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+                   unsigned long pfn, unsigned long addr, unsigned long size)
 {
+       resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
        unsigned long flags;
-       resource_size_t paddr;
-       unsigned long vma_size = vma->vm_end - vma->vm_start;
 
-       if (is_linear_pfn_mapping(vma)) {
-               /* reserve the whole chunk starting from vm_pgoff */
-               paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
-               return reserve_pfn_range(paddr, vma_size, prot, 0);
+       /* reserve the whole chunk starting from paddr */
+       if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+               int ret;
+
+               ret = reserve_pfn_range(paddr, size, prot, 0);
+               if (!ret)
+                       vma->vm_flags |= VM_PAT;
+               return ret;
        }
 
        if (!pat_enabled)
                return 0;
 
-       /* for vm_insert_pfn and friends, we set prot based on lookup */
-       flags = lookup_memtype(pfn << PAGE_SHIFT);
+       /*
+        * For anything smaller than the vma size we set prot based on the
+        * lookup.
+        */
+       flags = lookup_memtype(paddr);
+
+       /* Check memtype for the remaining pages */
+       while (size > PAGE_SIZE) {
+               size -= PAGE_SIZE;
+               paddr += PAGE_SIZE;
+               if (flags != lookup_memtype(paddr))
+                       return -EINVAL;
+       }
+
+       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+                        flags);
+
+       return 0;
+}
+
+int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+                    unsigned long pfn)
+{
+       unsigned long flags;
+
+       if (!pat_enabled)
+               return 0;
+
+       /* Set prot based on lookup */
+       flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
        *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
                         flags);
 
@@ -726,22 +754,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
 }
 
 /*
- * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack_pfn is called while unmapping a pfnmap for a region.
  * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case size can be zero).
+ * can be for the entire vma (in which case pfn, size are zero).
  */
-void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
-                       unsigned long size)
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+                unsigned long size)
 {
        resource_size_t paddr;
-       unsigned long vma_size = vma->vm_end - vma->vm_start;
+       unsigned long prot;
 
-       if (is_linear_pfn_mapping(vma)) {
-               /* free the whole chunk starting from vm_pgoff */
-               paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
-               free_pfn_range(paddr, vma_size);
+       if (!(vma->vm_flags & VM_PAT))
                return;
+
+       /* free the chunk starting from pfn or the whole chunk */
+       paddr = (resource_size_t)pfn << PAGE_SHIFT;
+       if (!paddr && !size) {
+               if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+                       WARN_ON_ONCE(1);
+                       return;
+               }
+
+               size = vma->vm_end - vma->vm_start;
        }
+       free_pfn_range(paddr, size);
+       vma->vm_flags &= ~VM_PAT;
 }
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
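
For partial remaps, track_pfn_remap() above walks the range and refuses it unless every page already carries the same memtype. A sketch of that consistency walk; lookup_type() stands in for lookup_memtype():

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long lookup_type(unsigned long paddr)
{
        /* Pretend everything below 64 KiB is one type, the rest another. */
        return paddr < 0x10000 ? 0 : 1;
}

static int check_range(unsigned long paddr, unsigned long size)
{
        unsigned long flags = lookup_type(paddr);

        while (size > PAGE_SIZE) {
                size -= PAGE_SIZE;
                paddr += PAGE_SIZE;
                if (lookup_type(paddr) != flags)
                        return -1;      /* -EINVAL in the kernel */
        }
        return 0;
}

int main(void)
{
        printf("%d %d\n", check_range(0x0, 0x8000),
               check_range(0xc000, 0x8000));    /* 0 -1 */
        return 0;
}
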
index 8acaddd0fb21173bd46f7aceddfab8d3d0051ba8..415f6c4ced36d889f56d2985a4ccad23b06c2cc3 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
 #include <linux/sched.h>
 #include <linux/gfp.h>
 
@@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node)
        return ret;
 }
 
-/* Update 'subtree_max_end' for a node, based on node and its children */
-static void memtype_rb_augment_cb(struct rb_node *node, void *__unused)
+static u64 compute_subtree_max_end(struct memtype *data)
 {
-       struct memtype *data;
-       u64 max_end, child_max_end;
-
-       if (!node)
-               return;
+       u64 max_end = data->end, child_max_end;
 
-       data = container_of(node, struct memtype, rb);
-       max_end = data->end;
-
-       child_max_end = get_subtree_max_end(node->rb_right);
+       child_max_end = get_subtree_max_end(data->rb.rb_right);
        if (child_max_end > max_end)
                max_end = child_max_end;
 
-       child_max_end = get_subtree_max_end(node->rb_left);
+       child_max_end = get_subtree_max_end(data->rb.rb_left);
        if (child_max_end > max_end)
                max_end = child_max_end;
 
-       data->subtree_max_end = max_end;
+       return max_end;
 }
 
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+                    u64, subtree_max_end, compute_subtree_max_end)
+
 /* Find the first (lowest start addr) overlapping range from rb tree */
 static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
                                u64 start, u64 end)
@@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
                struct memtype *data = container_of(*node, struct memtype, rb);
 
                parent = *node;
+               if (data->subtree_max_end < newdata->end)
+                       data->subtree_max_end = newdata->end;
                if (newdata->start <= data->start)
                        node = &((*node)->rb_left);
                else if (newdata->start > data->start)
                        node = &((*node)->rb_right);
        }
 
+       newdata->subtree_max_end = newdata->end;
        rb_link_node(&newdata->rb, parent, node);
-       rb_insert_color(&newdata->rb, root);
-       rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL);
+       rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
 }
 
 int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
@@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 
 struct memtype *rbt_memtype_erase(u64 start, u64 end)
 {
-       struct rb_node *deepest;
        struct memtype *data;
 
        data = memtype_rb_exact_match(&memtype_rbroot, start, end);
        if (!data)
                goto out;
 
-       deepest = rb_augment_erase_begin(&data->rb);
-       rb_erase(&data->rb, &memtype_rbroot);
-       rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL);
+       rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
 out:
        return data;
 }
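
RB_DECLARE_CALLBACKS() generates the propagate/copy/rotate callbacks from one rule, compute_subtree_max_end(): each node caches the maximum interval end in its subtree, so lookups can prune any subtree that ends before the query starts. A plain-binary-tree sketch of that augmentation rule and the pruning it buys (no rebalancing, illustrative names):

#include <stdio.h>
#include <stddef.h>

struct node {
        unsigned long start, end;       /* the interval */
        unsigned long subtree_max_end;  /* cached augmentation */
        struct node *left, *right;
};

static unsigned long max3(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a > b ? a : b;
        return m > c ? m : c;
}

static unsigned long compute_max_end(struct node *n)
{
        return max3(n->end,
                    n->left  ? n->left->subtree_max_end  : 0,
                    n->right ? n->right->subtree_max_end : 0);
}

/* Does any interval in this subtree overlap [start, end)? */
static int overlaps(struct node *n, unsigned long start, unsigned long end)
{
        if (!n || n->subtree_max_end <= start)
                return 0;       /* everything below ends too early: prune */
        if (n->start < end && start < n->end)
                return 1;
        return overlaps(n->left, start, end) || overlaps(n->right, start, end);
}

int main(void)
{
        struct node a = { 0, 4, 0, NULL, NULL };
        struct node c = { 10, 12, 0, NULL, NULL };
        struct node b = { 5, 8, 0, &a, &c };

        a.subtree_max_end = compute_max_end(&a);
        c.subtree_max_end = compute_max_end(&c);
        b.subtree_max_end = compute_max_end(&b);

        printf("%d %d\n", overlaps(&b, 6, 7), overlaps(&b, 20, 30)); /* 1 0 */
        return 0;
}
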
index 5917eb56b313b3442850d8d28eff32449a30f885..e6cb80f620afffc8882e6bcdaf627b1c32215d41 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <linux/moduleparam.h>
 #include <linux/module.h>
+#include <linux/platform_device.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
@@ -62,29 +63,75 @@ static void iris_power_off(void)
  * by reading its input port and seeing whether the read value is
  * meaningful.
  */
-static int iris_init(void)
+static int iris_probe(struct platform_device *pdev)
 {
-       unsigned char status;
-       if (force != 1) {
-               printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
-               return -ENODEV;
-       }
-       status = inb(IRIS_GIO_INPUT);
+       unsigned char status = inb(IRIS_GIO_INPUT);
        if (status == IRIS_GIO_NODEV) {
-               printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
+               printk(KERN_ERR "This machine does not seem to be an Iris. "
+                       "Power off handler not installed.\n");
                return -ENODEV;
        }
        old_pm_power_off = pm_power_off;
        pm_power_off = &iris_power_off;
        printk(KERN_INFO "Iris power_off handler installed.\n");
-
        return 0;
 }
 
-static void iris_exit(void)
+static int iris_remove(struct platform_device *pdev)
 {
        pm_power_off = old_pm_power_off;
        printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+       return 0;
+}
+
+static struct platform_driver iris_driver = {
+       .driver         = {
+               .name   = "iris",
+               .owner  = THIS_MODULE,
+       },
+       .probe          = iris_probe,
+       .remove         = iris_remove,
+};
+
+static struct resource iris_resources[] = {
+       {
+               .start  = IRIS_GIO_BASE,
+               .end    = IRIS_GIO_OUTPUT,
+               .flags  = IORESOURCE_IO,
+               .name   = "address"
+       }
+};
+
+static struct platform_device *iris_device;
+
+static int iris_init(void)
+{
+       int ret;
+       if (force != 1) {
+               printk(KERN_ERR "The force parameter has not been set to 1."
+                       " The Iris poweroff handler will not be installed.\n");
+               return -ENODEV;
+       }
+       ret = platform_driver_register(&iris_driver);
+       if (ret < 0) {
+               printk(KERN_ERR "Failed to register iris platform driver: %d\n",
+                       ret);
+               return ret;
+       }
+       iris_device = platform_device_register_simple("iris", (-1),
+                               iris_resources, ARRAY_SIZE(iris_resources));
+       if (IS_ERR(iris_device)) {
+               printk(KERN_ERR "Failed to register iris platform device\n");
+               platform_driver_unregister(&iris_driver);
+               return PTR_ERR(iris_device);
+       }
+       return 0;
+}
+
+static void iris_exit(void)
+{
+       platform_device_unregister(iris_device);
+       platform_driver_unregister(&iris_driver);
 }
 
 module_init(iris_init);
index 5a16824cc2b3ca88919e484d79473af3f2e951a4..fd28d86fe3d2c979c5c9ac0e5ab95bf3d977a224 100644 (file)
@@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 
        prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
 
-       BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
-                               (VM_PFNMAP | VM_RESERVED | VM_IO)));
+       BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
        rmd.mfn = mfn;
        rmd.prot = prot;
index 6e65eadaae14c60c1972b3fe96c5bbae0c2a38bb..5293312bc6a4c55d842519ef6c16cede7e5e197b 100644 (file)
@@ -189,7 +189,8 @@ typedef struct {
 #endif
 } elf_xtregs_t;
 
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX_32BIT)
+#define SET_PERSONALITY(ex) \
+       set_personality(PER_LINUX_32BIT | (current->personality & (~PER_MASK)))
 
 struct task_struct;
 
index 5a74c53bc69c132cfad6231d9294e49f5cedef93..2c2f710ed1dc905a8c9e0509d36cd76ed073e0ce 100644 (file)
@@ -126,6 +126,7 @@ good_area:
                        current->min_flt++;
                if (fault & VM_FAULT_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       flags |= FAULT_FLAG_TRIED;
 
                         /* No need to up_read(&mm->mmap_sem) as we would
                         * have already released it in __lock_page_or_retry
index 8f29a0b9032e37eb83d61cbd4f4c9585591ab2b6..9e02cd6760e590caee3964e767c3c48af1f393bf 100644 (file)
@@ -743,7 +743,6 @@ void __init printk_all_partitions(void)
                struct hd_struct *part;
                char name_buf[BDEVNAME_SIZE];
                char devt_buf[BDEVT_SIZE];
-               char uuid_buf[PARTITION_META_INFO_UUIDLTH * 2 + 5];
 
                /*
                 * Don't show empty devices or things that have been
@@ -762,16 +761,11 @@ void __init printk_all_partitions(void)
                while ((part = disk_part_iter_next(&piter))) {
                        bool is_part0 = part == &disk->part0;
 
-                       uuid_buf[0] = '\0';
-                       if (part->info)
-                               snprintf(uuid_buf, sizeof(uuid_buf), "%pU",
-                                        part->info->uuid);
-
                        printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
                               bdevt_str(part_devt(part), devt_buf),
                               (unsigned long long)part_nr_sects_read(part) >> 1
                               , disk_name(disk, part->partno, name_buf),
-                              uuid_buf);
+                              part->info ? part->info->uuid : "");
                        if (is_part0) {
                                if (disk->driverfs_dev != NULL &&
                                    disk->driverfs_dev->driver != NULL)
index 6296b403c67a3d5ca512b05e048ecf3525330153..b62fb88b87118956e81a30884c086b30956ac61d 100644 (file)
@@ -620,7 +620,6 @@ int efi_partition(struct parsed_partitions *state)
        gpt_entry *ptes = NULL;
        u32 i;
        unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
-       u8 unparsed_guid[37];
 
        if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
                kfree(gpt);
@@ -649,11 +648,7 @@ int efi_partition(struct parsed_partitions *state)
                        state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 
                info = &state->parts[i + 1].info;
-               /* Instead of doing a manual swap to big endian, reuse the
-                * common ASCII hex format as the interim.
-                */
-               efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
-               part_pack_uuid(unparsed_guid, info->uuid);
+               efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
 
                /* Naively convert UTF16-LE to 7 bits. */
                label_max = min(sizeof(info->volname) - 1,
index 5f79a6677c69706cee48844f2babd988fb5e26a8..8752a5d265653a5a743c333c9c99bb259aca554b 100644 (file)
@@ -94,6 +94,17 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
        return ret;
 }
 
+static void set_info(struct parsed_partitions *state, int slot,
+                    u32 disksig)
+{
+       struct partition_meta_info *info = &state->parts[slot].info;
+
+       snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
+                slot);
+       info->volname[0] = 0;
+       state->parts[slot].has_info = true;
+}
+
 /*
  * Create devices for each logical partition in an extended partition.
  * The logical partitions form a linked list, with each entry being
@@ -106,7 +117,8 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
  */
 
 static void parse_extended(struct parsed_partitions *state,
-                          sector_t first_sector, sector_t first_size)
+                          sector_t first_sector, sector_t first_size,
+                          u32 disksig)
 {
        struct partition *p;
        Sector sect;
@@ -166,6 +178,7 @@ static void parse_extended(struct parsed_partitions *state,
                        }
 
                        put_partition(state, state->next, next, size);
+                       set_info(state, state->next, disksig);
                        if (SYS_IND(p) == LINUX_RAID_PARTITION)
                                state->parts[state->next].flags = ADDPART_FLAG_RAID;
                        loopct = 0;
@@ -437,6 +450,7 @@ int msdos_partition(struct parsed_partitions *state)
        struct partition *p;
        struct fat_boot_sector *fb;
        int slot;
+       u32 disksig;
 
        data = read_part_sector(state, 0, &sect);
        if (!data)
@@ -491,6 +505,8 @@ int msdos_partition(struct parsed_partitions *state)
 #endif
        p = (struct partition *) (data + 0x1be);
 
+       disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
+
        /*
         * Look for partitions in two passes:
         * First find the primary and DOS-type extended partitions.
@@ -515,11 +531,12 @@ int msdos_partition(struct parsed_partitions *state)
                        put_partition(state, slot, start, n);
 
                        strlcat(state->pp_buf, " <", PAGE_SIZE);
-                       parse_extended(state, start, size);
+                       parse_extended(state, start, size, disksig);
                        strlcat(state->pp_buf, " >", PAGE_SIZE);
                        continue;
                }
                put_partition(state, slot, start, size);
+               set_info(state, slot, disksig);
                if (SYS_IND(p) == LINUX_RAID_PARTITION)
                        state->parts[slot].flags = ADDPART_FLAG_RAID;
                if (SYS_IND(p) == DM6_PARTITION)
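With this, MBR disks get PARTUUIDs of the form <disk signature>-<slot>: the 32-bit signature read from offset 0x1b8 of sector 0, followed by the slot number, as built by set_info() above. A standalone sketch (the memcpy assumes a little-endian host, which le32_to_cpup() abstracts away):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
            unsigned char sector0[512] = {0};
            uint32_t disksig;
            char uuid[16];

            /* pretend an installer stamped a disk signature */
            sector0[0x1b8] = 0x78; sector0[0x1b9] = 0x56;
            sector0[0x1ba] = 0x34; sector0[0x1bb] = 0x12;

            memcpy(&disksig, sector0 + 0x1b8, 4);
            snprintf(uuid, sizeof(uuid), "%08x-%02x", (unsigned)disksig, 1);
            printf("root=PARTUUID=%s\n", uuid);  /* 12345678-01 */
            return 0;
    }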
index 24c807f966365e78972cd824c292a9bb944d79e8..05be17ada0a21b88ec8c601c012695c9a5307467 100644
@@ -52,6 +52,9 @@ MODULE_LICENSE("GPL");
 #define MEMORY_POWER_ON_STATE  1
 #define MEMORY_POWER_OFF_STATE 2
 
+static bool auto_probe;
+module_param(auto_probe, bool, S_IRUGO | S_IWUSR);
+
 static int acpi_memory_device_add(struct acpi_device *device);
 static int acpi_memory_device_remove(struct acpi_device *device, int type);
 
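auto_probe defaults to off; given the S_IRUGO | S_IWUSR permissions above it should also be flippable at runtime through /sys/module/acpi_memhotplug/parameters/auto_probe, or set at boot with e.g. acpi_memhotplug.auto_probe=1 on the kernel command line, so that memory found already present when the driver registers is onlined automatically (see the _STA check added below).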
@@ -78,6 +81,7 @@ struct acpi_memory_info {
        unsigned short caching; /* memory cache attribute */
        unsigned short write_protect;   /* memory read/write attribute */
        unsigned int enabled:1;
+       unsigned int failed:1;
 };
 
 struct acpi_memory_device {
@@ -86,8 +90,6 @@ struct acpi_memory_device {
        struct list_head res_list;
 };
 
-static int acpi_hotmem_initialized;
-
 static acpi_status
 acpi_memory_get_resource(struct acpi_resource *resource, void *context)
 {
@@ -125,12 +127,20 @@ acpi_memory_get_resource(struct acpi_resource *resource, void *context)
        return AE_OK;
 }
 
+static void
+acpi_memory_free_device_resources(struct acpi_memory_device *mem_device)
+{
+       struct acpi_memory_info *info, *n;
+
+       list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+               kfree(info);
+       INIT_LIST_HEAD(&mem_device->res_list);
+}
+
 static int
 acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
 {
        acpi_status status;
-       struct acpi_memory_info *info, *n;
-
 
        if (!list_empty(&mem_device->res_list))
                return 0;
@@ -138,9 +148,7 @@ acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
        status = acpi_walk_resources(mem_device->device->handle, METHOD_NAME__CRS,
                                     acpi_memory_get_resource, mem_device);
        if (ACPI_FAILURE(status)) {
-               list_for_each_entry_safe(info, n, &mem_device->res_list, list)
-                       kfree(info);
-               INIT_LIST_HEAD(&mem_device->res_list);
+               acpi_memory_free_device_resources(mem_device);
                return -EINVAL;
        }
 
@@ -251,9 +259,23 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
                        node = memory_add_physaddr_to_nid(info->start_addr);
 
                result = add_memory(node, info->start_addr, info->length);
-               if (result)
+
+               /*
+                * If the memory block has been used by the kernel, add_memory()
+                * returns -EEXIST.  Any other error means that this memory
+                * block is not used by the kernel.
+                */
+               if (result && result != -EEXIST) {
+                       info->failed = 1;
                        continue;
-               info->enabled = 1;
+               }
+
+               if (!result)
+                       info->enabled = 1;
+               /*
+                * Bump num_enabled even if add_memory() returns -EEXIST, so the
+                * device is bound to this driver.
+                */
                num_enabled++;
        }
        if (!num_enabled) {
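To spell out the new enable-path outcomes: 0 means this driver added the block (enabled=1, counted); -EEXIST means the kernel already uses it (still counted, so the driver binds); anything else marks it failed and skips it. A small standalone rendering of that decision:

    #include <stdio.h>
    #include <errno.h>

    struct info { int enabled, failed; };

    /* returns 1 when the block should count toward num_enabled */
    static int classify(int result, struct info *info)
    {
            if (result && result != -EEXIST) {
                    info->failed = 1;   /* kernel does not use this block */
                    return 0;
            }
            if (!result)
                    info->enabled = 1;  /* freshly added by this driver */
            return 1;                   /* added or already in use: bind */
    }

    int main(void)
    {
            struct info a = {0, 0}, b = {0, 0}, c = {0, 0};
            int ca = classify(0, &a);
            int cb = classify(-EEXIST, &b);
            int cc = classify(-EINVAL, &c);

            printf("0       -> count=%d enabled=%d\n", ca, a.enabled);
            printf("-EEXIST -> count=%d enabled=%d\n", cb, b.enabled);
            printf("-EINVAL -> count=%d failed=%d\n",  cc, c.failed);
            return 0;
    }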
@@ -317,11 +339,21 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
         * Note: Assume that this function returns zero on success
         */
        list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
-               if (info->enabled) {
-                       result = remove_memory(info->start_addr, info->length);
-                       if (result)
-                               return result;
-               }
+               if (info->failed)
+                       /* The kernel does not use this memory block */
+                       continue;
+
+               if (!info->enabled)
+                       /*
+                        * The kernel uses this memory block, but it may not be
+                        * managed by us.
+                        */
+                       return -EBUSY;
+
+               result = remove_memory(info->start_addr, info->length);
+               if (result)
+                       return result;
+               list_del(&info->list);
                kfree(info);
        }
 
@@ -420,6 +452,15 @@ static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data)
        return;
 }
 
+static void acpi_memory_device_free(struct acpi_memory_device *mem_device)
+{
+       if (!mem_device)
+               return;
+
+       acpi_memory_free_device_resources(mem_device);
+       kfree(mem_device);
+}
+
 static int acpi_memory_device_add(struct acpi_device *device)
 {
        int result;
@@ -451,35 +492,24 @@ static int acpi_memory_device_add(struct acpi_device *device)
 
        printk(KERN_DEBUG "%s \n", acpi_device_name(device));
 
-       /*
-        * Early boot code has recognized memory area by EFI/E820.
-        * If DSDT shows these memory devices on boot, hotplug is not necessary
-        * for them. So, it just returns until completion of this driver's
-        * start up.
-        */
-       if (!acpi_hotmem_initialized)
-               return 0;
-
        if (!acpi_memory_check_device(mem_device)) {
                /* call add_memory func */
                result = acpi_memory_enable_device(mem_device);
-               if (result)
+               if (result) {
                        printk(KERN_ERR PREFIX
                                "Error in acpi_memory_enable_device\n");
+                       acpi_memory_device_free(mem_device);
+               }
        }
        return result;
 }
 
 static int acpi_memory_device_remove(struct acpi_device *device, int type)
 {
-       struct acpi_memory_device *mem_device = NULL;
-
-
        if (!device || !acpi_driver_data(device))
                return -EINVAL;
 
-       mem_device = acpi_driver_data(device);
-       kfree(mem_device);
+       acpi_memory_device_free(acpi_driver_data(device));
 
        return 0;
 }
@@ -516,12 +546,44 @@ acpi_memory_register_notify_handler(acpi_handle handle,
                                    u32 level, void *ctxt, void **retv)
 {
        acpi_status status;
-
+       struct acpi_memory_device *mem_device = NULL;
+       unsigned long long current_status;
 
        status = is_memory_device(handle);
        if (ACPI_FAILURE(status))
                return AE_OK;   /* continue */
 
+       if (auto_probe) {
+               /* Get device present/absent information from the _STA */
+               status = acpi_evaluate_integer(handle, "_STA", NULL,
+                                              &current_status);
+               if (ACPI_FAILURE(status))
+                       goto install;
+
+               /*
+                * Check for device status. Device should be
+                * present/enabled/functioning.
+                */
+               if (!(current_status &
+                     (ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED |
+                      ACPI_STA_DEVICE_FUNCTIONING)))
+                       goto install;
+
+               if (acpi_memory_get_device(handle, &mem_device))
+                       goto install;
+
+               /* We already bound this device while registering the driver */
+               if (mem_device->state == MEMORY_POWER_ON_STATE)
+                       goto install;
+
+               ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                                 "\nauto probe memory device\n"));
+
+               if (acpi_memory_enable_device(mem_device))
+                       pr_err(PREFIX "Cannot enable memory device\n");
+       }
+
+install:
        status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY,
                                             acpi_memory_device_notify, NULL);
        /* continue */
@@ -568,7 +630,6 @@ static int __init acpi_memory_device_init(void)
                return -ENODEV;
        }
 
-       acpi_hotmem_initialized = 1;
        return 0;
 }
 
index db195abad69889e4d499bde162ff5e818601f3b2..27d0a214f3bc505e6d5ff75024cd6ce6fc459dc4 100644
@@ -1,5 +1,5 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
-#define VERSION "47"
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
+#define VERSION "49"
 #define AOE_MAJOR 152
 #define DEVICE_NAME "aoe"
 
@@ -75,72 +75,68 @@ enum {
        DEVFL_UP = 1,   /* device is installed in system and ready for AoE->ATA commands */
        DEVFL_TKILL = (1<<1),   /* flag for timer to know when to kill self */
        DEVFL_EXT = (1<<2),     /* device accepts lba48 commands */
-       DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */
-       DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */
-       DEVFL_KICKME = (1<<5),  /* slow polling network card catch */
-       DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
-
-       BUFFL_FAIL = 1,
+       DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */
+       DEVFL_KICKME = (1<<4),  /* slow polling network card catch */
+       DEVFL_NEWSIZE = (1<<5), /* need to update dev size in block layer */
 };
 
 enum {
        DEFAULTBCNT = 2 * 512,  /* 2 sectors */
        NPERSHELF = 16,         /* number of slots per shelf address */
-       FREETAG = -1,
        MIN_BUFS = 16,
        NTARGETS = 8,
        NAOEIFS = 8,
-       NSKBPOOLMAX = 128,
+       NSKBPOOLMAX = 256,
+       NFACTIVE = 61,
 
        TIMERTICK = HZ / 10,
        MINTIMER = HZ >> 2,
        MAXTIMER = HZ << 1,
-       HELPWAIT = 20,
 };
 
 struct buf {
-       struct list_head bufs;
-       ulong stime;    /* for disk stats */
-       ulong flags;
        ulong nframesout;
        ulong resid;
        ulong bv_resid;
-       ulong bv_off;
        sector_t sector;
        struct bio *bio;
        struct bio_vec *bv;
+       struct request *rq;
 };
 
 struct frame {
-       int tag;
+       struct list_head head;
+       u32 tag;
        ulong waited;
+       struct aoetgt *t;               /* parent target I belong to */
+       sector_t lba;
+       struct sk_buff *skb;            /* command skb freed on module exit */
+       struct sk_buff *r_skb;          /* response skb for async processing */
        struct buf *buf;
-       char *bufaddr;
+       struct bio_vec *bv;
        ulong bcnt;
-       sector_t lba;
-       struct sk_buff *skb;
+       ulong bv_off;
 };
 
 struct aoeif {
        struct net_device *nd;
-       unsigned char lost;
-       unsigned char lostjumbo;
-       ushort maxbcnt;
+       ulong lost;
+       int bcnt;
 };
 
 struct aoetgt {
        unsigned char addr[6];
        ushort nframes;
-       struct frame *frames;
+       struct aoedev *d;                       /* parent device I belong to */
+       struct list_head ffree;                 /* list of free frames */
        struct aoeif ifs[NAOEIFS];
        struct aoeif *ifp;      /* current aoeif in use */
        ushort nout;
        ushort maxout;
-       u16 lasttag;            /* last tag sent */
-       u16 useme;
+       ulong falloc;
        ulong lastwadj;         /* last window adjustment */
+       int minbcnt;
        int wpkts, rpkts;
-       int dataref;
 };
 
 struct aoedev {
@@ -153,6 +149,9 @@ struct aoedev {
        u16 rttavg;             /* round trip average of requests/responses */
        u16 mintimer;
        u16 fw_ver;             /* version of blade's firmware */
+       u16 lasttag;            /* last tag sent */
+       u16 useme;
+       ulong ref;
        struct work_struct work;/* disk create work struct */
        struct gendisk *gd;
        struct request_queue *blkq;
@@ -160,16 +159,31 @@ struct aoedev {
        sector_t ssize;
        struct timer_list timer;
        spinlock_t lock;
-       struct sk_buff_head sendq;
        struct sk_buff_head skbpool;
        mempool_t *bufpool;     /* for deadlock-free Buf allocation */
-       struct list_head bufq;  /* queue of bios to work on */
-       struct buf *inprocess;  /* the one we're currently working on */
+       struct {                /* pointers to work in progress */
+               struct buf *buf;
+               struct bio *nxbio;
+               struct request *rq;
+       } ip;
+       ulong maxbcnt;
+       struct list_head factive[NFACTIVE];     /* hash of active frames */
        struct aoetgt *targets[NTARGETS];
        struct aoetgt **tgt;    /* target in use when working */
-       struct aoetgt **htgt;   /* target needing rexmit assistance */
+       struct aoetgt *htgt;    /* target needing rexmit assistance */
+       ulong ntargets;
+       ulong kicked;
 };
 
+/* kthread tracking */
+struct ktstate {
+       struct completion rendez;
+       struct task_struct *task;
+       wait_queue_head_t *waitq;
+       int (*fn) (void);
+       char *name;
+       spinlock_t *lock;
+};
 
 int aoeblk_init(void);
 void aoeblk_exit(void);
@@ -182,11 +196,18 @@ void aoechr_error(char *);
 
 void aoecmd_work(struct aoedev *d);
 void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
-void aoecmd_ata_rsp(struct sk_buff *);
+struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
 void aoecmd_cfg_rsp(struct sk_buff *);
 void aoecmd_sleepwork(struct work_struct *);
 void aoecmd_cleanslate(struct aoedev *);
+void aoecmd_exit(void);
+int aoecmd_init(void);
 struct sk_buff *aoecmd_ata_id(struct aoedev *);
+void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_end_request(struct aoedev *, struct request *, int);
+int aoe_ktstart(struct ktstate *k);
+void aoe_ktstop(struct ktstate *k);
 
 int aoedev_init(void);
 void aoedev_exit(void);
@@ -194,6 +215,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min);
 struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
 void aoedev_downdev(struct aoedev *d);
 int aoedev_flush(const char __user *str, size_t size);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
 
 int aoenet_init(void);
 void aoenet_exit(void);
index 321de7b6c44228e5b7c5cfc0a410be19a54759fe..83160ab0d273a02c64d651c9d3f57b12b026b293 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoeblk.c
  * block device routines
@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
 }
 
 static void
-aoeblk_make_request(struct request_queue *q, struct bio *bio)
+aoeblk_request(struct request_queue *q)
 {
-       struct sk_buff_head queue;
        struct aoedev *d;
-       struct buf *buf;
-       ulong flags;
-
-       blk_queue_bounce(q, &bio);
-
-       if (bio == NULL) {
-               printk(KERN_ERR "aoe: bio is NULL\n");
-               BUG();
-               return;
-       }
-       d = bio->bi_bdev->bd_disk->private_data;
-       if (d == NULL) {
-               printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
-               BUG();
-               bio_endio(bio, -ENXIO);
-               return;
-       } else if (bio->bi_io_vec == NULL) {
-               printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
-               BUG();
-               bio_endio(bio, -ENXIO);
-               return;
-       }
-       buf = mempool_alloc(d->bufpool, GFP_NOIO);
-       if (buf == NULL) {
-               printk(KERN_INFO "aoe: buf allocation failure\n");
-               bio_endio(bio, -ENOMEM);
-               return;
-       }
-       memset(buf, 0, sizeof(*buf));
-       INIT_LIST_HEAD(&buf->bufs);
-       buf->stime = jiffies;
-       buf->bio = bio;
-       buf->resid = bio->bi_size;
-       buf->sector = bio->bi_sector;
-       buf->bv = &bio->bi_io_vec[bio->bi_idx];
-       buf->bv_resid = buf->bv->bv_len;
-       WARN_ON(buf->bv_resid == 0);
-       buf->bv_off = buf->bv->bv_offset;
-
-       spin_lock_irqsave(&d->lock, flags);
+       struct request *rq;
 
+       d = q->queuedata;
        if ((d->flags & DEVFL_UP) == 0) {
                pr_info_ratelimited("aoe: device %ld.%d is not up\n",
                        d->aoemajor, d->aoeminor);
-               spin_unlock_irqrestore(&d->lock, flags);
-               mempool_free(buf, d->bufpool);
-               bio_endio(bio, -ENXIO);
+               while ((rq = blk_peek_request(q))) {
+                       blk_start_request(rq);
+                       aoe_end_request(d, rq, 1);
+               }
                return;
        }
-
-       list_add_tail(&buf->bufs, &d->bufq);
-
        aoecmd_work(d);
-       __skb_queue_head_init(&queue);
-       skb_queue_splice_init(&d->sendq, &queue);
-
-       spin_unlock_irqrestore(&d->lock, flags);
-       aoenet_xmit(&queue);
 }
 
 static int
@@ -254,31 +208,46 @@ aoeblk_gdalloc(void *vp)
 {
        struct aoedev *d = vp;
        struct gendisk *gd;
+       mempool_t *mp;
+       struct request_queue *q;
+       enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
        ulong flags;
 
        gd = alloc_disk(AOE_PARTITIONS);
        if (gd == NULL) {
-               printk(KERN_ERR
-                       "aoe: cannot allocate disk structure for %ld.%d\n",
+               pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
                        d->aoemajor, d->aoeminor);
                goto err;
        }
 
-       d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
-       if (d->bufpool == NULL) {
+       mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+               buf_pool_cache);
+       if (mp == NULL) {
                printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
                        d->aoemajor, d->aoeminor);
                goto err_disk;
        }
+       q = blk_init_queue(aoeblk_request, &d->lock);
+       if (q == NULL) {
+               pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+                       d->aoemajor, d->aoeminor);
+               mempool_destroy(mp);
+               goto err_disk;
+       }
 
        d->blkq = blk_alloc_queue(GFP_KERNEL);
        if (!d->blkq)
                goto err_mempool;
-       blk_queue_make_request(d->blkq, aoeblk_make_request);
        d->blkq->backing_dev_info.name = "aoe";
        if (bdi_init(&d->blkq->backing_dev_info))
                goto err_blkq;
        spin_lock_irqsave(&d->lock, flags);
+       blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
+       q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+       d->bufpool = mp;
+       d->blkq = gd->queue = q;
+       q->queuedata = d;
+       d->gd = gd;
        gd->major = AOE_MAJOR;
        gd->first_minor = d->sysminor * AOE_PARTITIONS;
        gd->fops = &aoe_bdops;
@@ -287,8 +256,6 @@ aoeblk_gdalloc(void *vp)
        snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
                d->aoemajor, d->aoeminor);
 
-       gd->queue = d->blkq;
-       d->gd = gd;
        d->flags &= ~DEVFL_GDALLOC;
        d->flags |= DEVFL_UP;
 
index e86d2062a1641f36ff2bc0200d286f433e56fb27..deb30c183fbaaa24f1d2e29b031373808a8c6419 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoechr.c
  * AoE character device driver
@@ -86,10 +86,9 @@ revalidate(const char __user *str, size_t size)
        if (copy_from_user(buf, str, size))
                return -EFAULT;
 
-       /* should be e%d.%d format */
        n = sscanf(buf, "e%d.%d", &major, &minor);
        if (n != 2) {
-               printk(KERN_ERR "aoe: invalid device specification\n");
+               pr_err("aoe: invalid device specification %s\n", buf);
                return -EINVAL;
        }
        d = aoedev_by_aoeaddr(major, minor);
@@ -97,23 +96,24 @@ revalidate(const char __user *str, size_t size)
                return -EINVAL;
        spin_lock_irqsave(&d->lock, flags);
        aoecmd_cleanslate(d);
+       aoecmd_cfg(major, minor);
 loop:
        skb = aoecmd_ata_id(d);
        spin_unlock_irqrestore(&d->lock, flags);
        /* try again if we are able to sleep a bit,
         * otherwise give up this revalidation
         */
-       if (!skb && !msleep_interruptible(200)) {
+       if (!skb && !msleep_interruptible(250)) {
                spin_lock_irqsave(&d->lock, flags);
                goto loop;
        }
+       aoedev_put(d);
        if (skb) {
                struct sk_buff_head queue;
                __skb_queue_head_init(&queue);
                __skb_queue_tail(&queue, skb);
                aoenet_xmit(&queue);
        }
-       aoecmd_cfg(major, minor);
        return 0;
 }
 
@@ -174,6 +174,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp
                break;
        case MINOR_FLUSH:
                ret = aoedev_flush(buf, cnt);
+               break;
        }
        if (ret == 0)
                ret = cnt;
index de0435e63b02cbd349c5dcc282682359f6f85934..c0adbbd38bad13ecf6b6d14cf5cfd307b7284f61 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoecmd.c
  * Filesystem request handling methods
 #include <linux/netdevice.h>
 #include <linux/genhd.h>
 #include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <net/net_namespace.h>
 #include <asm/unaligned.h>
+#include <linux/uio.h>
 #include "aoe.h"
 
+#define MAXIOC (8192)  /* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+
+static struct buf *nextbuf(struct aoedev *);
+
 static int aoe_deadsecs = 60 * 3;
 module_param(aoe_deadsecs, int, 0644);
 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
@@ -25,6 +34,15 @@ module_param(aoe_maxout, int, 0644);
 MODULE_PARM_DESC(aoe_maxout,
        "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
 
+static wait_queue_head_t ktiowq;
+static struct ktstate kts;
+
+/* io completion queue */
+static struct {
+       struct list_head head;
+       spinlock_t lock;
+} iocq;
+
 static struct sk_buff *
 new_skb(ulong len)
 {
@@ -40,15 +58,21 @@ new_skb(ulong len)
 }
 
 static struct frame *
-getframe(struct aoetgt *t, int tag)
+getframe(struct aoedev *d, u32 tag)
 {
-       struct frame *f, *e;
+       struct frame *f;
+       struct list_head *head, *pos, *nx;
+       u32 n;
 
-       f = t->frames;
-       e = f + t->nframes;
-       for (; f<e; f++)
-               if (f->tag == tag)
+       n = tag % NFACTIVE;
+       head = &d->factive[n];
+       list_for_each_safe(pos, nx, head) {
+               f = list_entry(pos, struct frame, head);
+               if (f->tag == tag) {
+                       list_del(pos);
                        return f;
+               }
+       }
        return NULL;
 }
 
@@ -58,18 +82,18 @@ getframe(struct aoetgt *t, int tag)
  * This driver reserves tag -1 to mean "unused frame."
  */
 static int
-newtag(struct aoetgt *t)
+newtag(struct aoedev *d)
 {
        register ulong n;
 
        n = jiffies & 0xffff;
-       return n |= (++t->lasttag & 0x7fff) << 16;
+       return n |= (++d->lasttag & 0x7fff) << 16;
 }
 
-static int
+static u32
 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
 {
-       u32 host_tag = newtag(t);
+       u32 host_tag = newtag(d);
 
        memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
        memcpy(h->dst, t->addr, sizeof h->dst);
@@ -94,16 +118,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
        ah->lba5 = lba >>= 8;
 }
 
-static void
+static struct aoeif *
 ifrotate(struct aoetgt *t)
 {
-       t->ifp++;
-       if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
-               t->ifp = t->ifs;
-       if (t->ifp->nd == NULL) {
-               printk(KERN_INFO "aoe: no interface to rotate to\n");
-               BUG();
-       }
+       struct aoeif *ifp;
+
+       ifp = t->ifp;
+       ifp++;
+       if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+               ifp = t->ifs;
+       if (ifp->nd == NULL)
+               return NULL;
+       return t->ifp = ifp;
 }
 
 static void
@@ -128,78 +154,128 @@ skb_pool_get(struct aoedev *d)
        return NULL;
 }
 
-/* freeframe is where we do our load balancing so it's a little hairy. */
+void
+aoe_freetframe(struct frame *f)
+{
+       struct aoetgt *t;
+
+       t = f->t;
+       f->buf = NULL;
+       f->bv = NULL;
+       f->r_skb = NULL;
+       list_add(&f->head, &t->ffree);
+}
+
 static struct frame *
-freeframe(struct aoedev *d)
+newtframe(struct aoedev *d, struct aoetgt *t)
 {
-       struct frame *f, *e, *rf;
-       struct aoetgt **t;
+       struct frame *f;
        struct sk_buff *skb;
+       struct list_head *pos;
+
+       if (list_empty(&t->ffree)) {
+               if (t->falloc >= NSKBPOOLMAX*2)
+                       return NULL;
+               f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+               if (f == NULL)
+                       return NULL;
+               t->falloc++;
+               f->t = t;
+       } else {
+               pos = t->ffree.next;
+               list_del(pos);
+               f = list_entry(pos, struct frame, head);
+       }
+
+       skb = f->skb;
+       if (skb == NULL) {
+               f->skb = skb = new_skb(ETH_ZLEN);
+               if (!skb) {
+bail:                  aoe_freetframe(f);
+                       return NULL;
+               }
+       }
+
+       if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+               skb = skb_pool_get(d);
+               if (skb == NULL)
+                       goto bail;
+               skb_pool_put(d, f->skb);
+               f->skb = skb;
+       }
+
+       skb->truesize -= skb->data_len;
+       skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+       skb_trim(skb, 0);
+       return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+       struct frame *f;
+       struct aoetgt *t, **tt;
+       int totout = 0;
 
        if (d->targets[0] == NULL) {    /* shouldn't happen, but I'm paranoid */
                printk(KERN_ERR "aoe: NULL TARGETS!\n");
                return NULL;
        }
-       t = d->tgt;
-       t++;
-       if (t >= &d->targets[NTARGETS] || !*t)
-               t = d->targets;
+       tt = d->tgt;    /* last used target */
        for (;;) {
-               if ((*t)->nout < (*t)->maxout
+               tt++;
+               if (tt >= &d->targets[NTARGETS] || !*tt)
+                       tt = d->targets;
+               t = *tt;
+               totout += t->nout;
+               if (t->nout < t->maxout
                && t != d->htgt
-               && (*t)->ifp->nd) {
-                       rf = NULL;
-                       f = (*t)->frames;
-                       e = f + (*t)->nframes;
-                       for (; f < e; f++) {
-                               if (f->tag != FREETAG)
-                                       continue;
-                               skb = f->skb;
-                               if (!skb
-                               && !(f->skb = skb = new_skb(ETH_ZLEN)))
-                                       continue;
-                               if (atomic_read(&skb_shinfo(skb)->dataref)
-                                       != 1) {
-                                       if (!rf)
-                                               rf = f;
-                                       continue;
-                               }
-gotone:                                skb_shinfo(skb)->nr_frags = skb->data_len = 0;
-                               skb_trim(skb, 0);
-                               d->tgt = t;
-                               ifrotate(*t);
+               && t->ifp->nd) {
+                       f = newtframe(d, t);
+                       if (f) {
+                               ifrotate(t);
+                               d->tgt = tt;
                                return f;
                        }
-                       /* Work can be done, but the network layer is
-                          holding our precious packets.  Try to grab
-                          one from the pool. */
-                       f = rf;
-                       if (f == NULL) {        /* more paranoia */
-                               printk(KERN_ERR
-                                       "aoe: freeframe: %s.\n",
-                                       "unexpected null rf");
-                               d->flags |= DEVFL_KICKME;
-                               return NULL;
-                       }
-                       skb = skb_pool_get(d);
-                       if (skb) {
-                               skb_pool_put(d, f->skb);
-                               f->skb = skb;
-                               goto gotone;
-                       }
-                       (*t)->dataref++;
-                       if ((*t)->nout == 0)
-                               d->flags |= DEVFL_KICKME;
                }
-               if (t == d->tgt)        /* we've looped and found nada */
+               if (tt == d->tgt)       /* we've looped and found nada */
                        break;
-               t++;
-               if (t >= &d->targets[NTARGETS] || !*t)
-                       t = d->targets;
+       }
+       if (totout == 0) {
+               d->kicked++;
+               d->flags |= DEVFL_KICKME;
        }
        return NULL;
 }
 
+static void
+skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
+{
+       int frag = 0;
+       ulong fcnt;
+loop:
+       fcnt = bv->bv_len - (off - bv->bv_offset);
+       if (fcnt > cnt)
+               fcnt = cnt;
+       skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
+       cnt -= fcnt;
+       if (cnt <= 0)
+               return;
+       bv++;
+       off = bv->bv_offset;
+       goto loop;
+}
+
+static void
+fhash(struct frame *f)
+{
+       struct aoedev *d = f->t->d;
+       u32 n;
+
+       n = f->tag % NFACTIVE;
+       list_add_tail(&f->head, &d->factive[n]);
+}
+
 static int
 aoecmd_ata_rw(struct aoedev *d)
 {
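The fhash()/getframe() pair above replaces the old per-target frame-array scan with a per-device hash of active frames: the tag picks one of NFACTIVE (61, a prime) buckets and response matching walks only that bucket's list. The placement is a plain modulo; trivial sketch:

    #include <stdio.h>

    #define NFACTIVE 61  /* bucket count from aoe.h */

    int main(void)
    {
            unsigned tags[] = { 0x00011144u, 0x7fff1145u, 0x00011181u };
            unsigned i;

            for (i = 0; i < sizeof tags / sizeof *tags; i++)
                    printf("tag %#010x -> bucket %u\n",
                           tags[i], tags[i] % NFACTIVE);
            return 0;
    }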
@@ -207,26 +283,47 @@ aoecmd_ata_rw(struct aoedev *d)
        struct aoe_hdr *h;
        struct aoe_atahdr *ah;
        struct buf *buf;
-       struct bio_vec *bv;
        struct aoetgt *t;
        struct sk_buff *skb;
-       ulong bcnt;
+       struct sk_buff_head queue;
+       ulong bcnt, fbcnt;
        char writebit, extbit;
 
        writebit = 0x10;
        extbit = 0x4;
 
-       f = freeframe(d);
+       buf = nextbuf(d);
+       if (buf == NULL)
+               return 0;
+       f = newframe(d);
        if (f == NULL)
                return 0;
        t = *d->tgt;
-       buf = d->inprocess;
-       bv = buf->bv;
-       bcnt = t->ifp->maxbcnt;
+       bcnt = d->maxbcnt;
        if (bcnt == 0)
                bcnt = DEFAULTBCNT;
-       if (bcnt > buf->bv_resid)
-               bcnt = buf->bv_resid;
+       if (bcnt > buf->resid)
+               bcnt = buf->resid;
+       fbcnt = bcnt;
+       f->bv = buf->bv;
+       f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
+       do {
+               if (fbcnt < buf->bv_resid) {
+                       buf->bv_resid -= fbcnt;
+                       buf->resid -= fbcnt;
+                       break;
+               }
+               fbcnt -= buf->bv_resid;
+               buf->resid -= buf->bv_resid;
+               if (buf->resid == 0) {
+                       d->ip.buf = NULL;
+                       break;
+               }
+               buf->bv++;
+               buf->bv_resid = buf->bv->bv_len;
+               WARN_ON(buf->bv_resid == 0);
+       } while (fbcnt);
+
        /* initialize the headers & frame */
        skb = f->skb;
        h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -234,10 +331,10 @@ aoecmd_ata_rw(struct aoedev *d)
        skb_put(skb, sizeof *h + sizeof *ah);
        memset(h, 0, skb->len);
        f->tag = aoehdr_atainit(d, t, h);
+       fhash(f);
        t->nout++;
        f->waited = 0;
        f->buf = buf;
-       f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
        f->bcnt = bcnt;
        f->lba = buf->sector;
 
@@ -252,10 +349,11 @@ aoecmd_ata_rw(struct aoedev *d)
                ah->lba3 |= 0xe0;       /* LBA bit + obsolete 0xa0 */
        }
        if (bio_data_dir(buf->bio) == WRITE) {
-               skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+               skb_fillup(skb, f->bv, f->bv_off, bcnt);
                ah->aflags |= AOEAFL_WRITE;
                skb->len += bcnt;
                skb->data_len = bcnt;
+               skb->truesize += bcnt;
                t->wpkts++;
        } else {
                t->rpkts++;
@@ -266,23 +364,15 @@ aoecmd_ata_rw(struct aoedev *d)
 
        /* mark all tracking fields and load out */
        buf->nframesout += 1;
-       buf->bv_off += bcnt;
-       buf->bv_resid -= bcnt;
-       buf->resid -= bcnt;
        buf->sector += bcnt >> 9;
-       if (buf->resid == 0) {
-               d->inprocess = NULL;
-       } else if (buf->bv_resid == 0) {
-               buf->bv = ++bv;
-               buf->bv_resid = bv->bv_len;
-               WARN_ON(buf->bv_resid == 0);
-               buf->bv_off = bv->bv_offset;
-       }
 
        skb->dev = t->ifp->nd;
        skb = skb_clone(skb, GFP_ATOMIC);
-       if (skb)
-               __skb_queue_tail(&d->sendq, skb);
+       if (skb) {
+               __skb_queue_head_init(&queue);
+               __skb_queue_tail(&queue, skb);
+               aoenet_xmit(&queue);
+       }
        return 1;
 }
 
@@ -329,17 +419,25 @@ cont:
 }
 
 static void
-resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
+resend(struct aoedev *d, struct frame *f)
 {
        struct sk_buff *skb;
+       struct sk_buff_head queue;
        struct aoe_hdr *h;
        struct aoe_atahdr *ah;
+       struct aoetgt *t;
        char buf[128];
        u32 n;
 
-       ifrotate(t);
-       n = newtag(t);
+       t = f->t;
+       n = newtag(d);
        skb = f->skb;
+       if (ifrotate(t) == NULL) {
+               /* probably can't happen, but set it up to fail anyway */
+               pr_info("aoe: resend: no interfaces to rotate to.\n");
+               ktcomplete(f, NULL);
+               return;
+       }
        h = (struct aoe_hdr *) skb_mac_header(skb);
        ah = (struct aoe_atahdr *) (h+1);
 
@@ -350,39 +448,22 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
        aoechr_error(buf);
 
        f->tag = n;
+       fhash(f);
        h->tag = cpu_to_be32(n);
        memcpy(h->dst, t->addr, sizeof h->dst);
        memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
 
-       switch (ah->cmdstat) {
-       default:
-               break;
-       case ATA_CMD_PIO_READ:
-       case ATA_CMD_PIO_READ_EXT:
-       case ATA_CMD_PIO_WRITE:
-       case ATA_CMD_PIO_WRITE_EXT:
-               put_lba(ah, f->lba);
-
-               n = f->bcnt;
-               if (n > DEFAULTBCNT)
-                       n = DEFAULTBCNT;
-               ah->scnt = n >> 9;
-               if (ah->aflags & AOEAFL_WRITE) {
-                       skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
-                               offset_in_page(f->bufaddr), n);
-                       skb->len = sizeof *h + sizeof *ah + n;
-                       skb->data_len = n;
-               }
-       }
        skb->dev = t->ifp->nd;
        skb = skb_clone(skb, GFP_ATOMIC);
        if (skb == NULL)
                return;
-       __skb_queue_tail(&d->sendq, skb);
+       __skb_queue_head_init(&queue);
+       __skb_queue_tail(&queue, skb);
+       aoenet_xmit(&queue);
 }
 
 static int
-tsince(int tag)
+tsince(u32 tag)
 {
        int n;
 
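A note on the tag arithmetic: newtag() stamps the low 16 bits with (jiffies & 0xffff) and the upper bits with the per-device counter, so tsince() can recover a frame's age in ticks with a 16-bit wraparound subtraction. A standalone sketch of the scheme with a fake jiffies counter (illustrative reconstruction, not the driver code):

    #include <stdio.h>

    static unsigned long jiffies;  /* stand-in for the kernel tick counter */
    static unsigned lasttag;

    static unsigned newtag(void)
    {
            unsigned long n = jiffies & 0xffff;
            return n | (++lasttag & 0x7fff) << 16;
    }

    static int tsince(unsigned tag)
    {
            int n = (jiffies & 0xffff) - (tag & 0xffff);
            if (n < 0)
                    n += 1 << 16;  /* timestamp wrapped inside the window */
            return n;
    }

    int main(void)
    {
            unsigned tag;

            jiffies = 70000;
            tag = newtag();
            jiffies += 25;
            printf("age=%d ticks\n", tsince(tag));  /* age=25 */
            return 0;
    }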
@@ -406,58 +487,65 @@ getif(struct aoetgt *t, struct net_device *nd)
        return NULL;
 }
 
-static struct aoeif *
-addif(struct aoetgt *t, struct net_device *nd)
-{
-       struct aoeif *p;
-
-       p = getif(t, NULL);
-       if (!p)
-               return NULL;
-       p->nd = nd;
-       p->maxbcnt = DEFAULTBCNT;
-       p->lost = 0;
-       p->lostjumbo = 0;
-       return p;
-}
-
 static void
 ejectif(struct aoetgt *t, struct aoeif *ifp)
 {
        struct aoeif *e;
+       struct net_device *nd;
        ulong n;
 
+       nd = ifp->nd;
        e = t->ifs + NAOEIFS - 1;
        n = (e - ifp) * sizeof *ifp;
        memmove(ifp, ifp+1, n);
        e->nd = NULL;
+       dev_put(nd);
 }
 
 static int
 sthtith(struct aoedev *d)
 {
-       struct frame *f, *e, *nf;
+       struct frame *f, *nf;
+       struct list_head *nx, *pos, *head;
        struct sk_buff *skb;
-       struct aoetgt *ht = *d->htgt;
-
-       f = ht->frames;
-       e = f + ht->nframes;
-       for (; f < e; f++) {
-               if (f->tag == FREETAG)
-                       continue;
-               nf = freeframe(d);
-               if (!nf)
-                       return 0;
-               skb = nf->skb;
-               *nf = *f;
-               f->skb = skb;
-               f->tag = FREETAG;
-               nf->waited = 0;
-               ht->nout--;
-               (*d->tgt)->nout++;
-               resend(d, *d->tgt, nf);
+       struct aoetgt *ht = d->htgt;
+       int i;
+
+       for (i = 0; i < NFACTIVE; i++) {
+               head = &d->factive[i];
+               list_for_each_safe(pos, nx, head) {
+                       f = list_entry(pos, struct frame, head);
+                       if (f->t != ht)
+                               continue;
+
+                       nf = newframe(d);
+                       if (!nf)
+                               return 0;
+
+                       /* remove frame from active list */
+                       list_del(pos);
+
+                       /* reassign all pertinent bits to new outbound frame */
+                       skb = nf->skb;
+                       nf->skb = f->skb;
+                       nf->buf = f->buf;
+                       nf->bcnt = f->bcnt;
+                       nf->lba = f->lba;
+                       nf->bv = f->bv;
+                       nf->bv_off = f->bv_off;
+                       nf->waited = 0;
+                       f->skb = skb;
+                       aoe_freetframe(f);
+                       ht->nout--;
+                       nf->t->nout++;
+                       resend(d, nf);
+               }
        }
-       /* he's clean, he's useless.  take away his interfaces */
+       /* We've cleaned up the outstanding frames, so take away his
+        * interfaces so he won't be used.  We should remove him from
+        * the target array here, but cleaning up a target is
+        * involved.  PUNT!
+        */
        memset(ht->ifs, 0, sizeof ht->ifs);
        d->htgt = NULL;
        return 1;
@@ -476,13 +564,15 @@ ata_scnt(unsigned char *packet) {
 static void
 rexmit_timer(ulong vp)
 {
-       struct sk_buff_head queue;
        struct aoedev *d;
        struct aoetgt *t, **tt, **te;
        struct aoeif *ifp;
-       struct frame *f, *e;
+       struct frame *f;
+       struct list_head *head, *pos, *nx;
+       LIST_HEAD(flist);
        register long timeout;
        ulong flags, n;
+       int i;
 
        d = (struct aoedev *) vp;
 
@@ -496,58 +586,22 @@ rexmit_timer(ulong vp)
                spin_unlock_irqrestore(&d->lock, flags);
                return;
        }
-       tt = d->targets;
-       te = tt + NTARGETS;
-       for (; tt < te && *tt; tt++) {
-               t = *tt;
-               f = t->frames;
-               e = f + t->nframes;
-               for (; f < e; f++) {
-                       if (f->tag == FREETAG
-                       || tsince(f->tag) < timeout)
-                               continue;
-                       n = f->waited += timeout;
-                       n /= HZ;
-                       if (n > aoe_deadsecs) {
-                               /* waited too long.  device failure. */
-                               aoedev_downdev(d);
-                               break;
-                       }
-
-                       if (n > HELPWAIT /* see if another target can help */
-                       && (tt != d->targets || d->targets[1]))
-                               d->htgt = tt;
-
-                       if (t->nout == t->maxout) {
-                               if (t->maxout > 1)
-                                       t->maxout--;
-                               t->lastwadj = jiffies;
-                       }
-
-                       ifp = getif(t, f->skb->dev);
-                       if (ifp && ++ifp->lost > (t->nframes << 1)
-                       && (ifp != t->ifs || t->ifs[1].nd)) {
-                               ejectif(t, ifp);
-                               ifp = NULL;
-                       }
 
-                       if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
-                       && ifp && ++ifp->lostjumbo > (t->nframes << 1)
-                       && ifp->maxbcnt != DEFAULTBCNT) {
-                               printk(KERN_INFO
-                                       "aoe: e%ld.%d: "
-                                       "too many lost jumbo on "
-                                       "%s:%pm - "
-                                       "falling back to %d frames.\n",
-                                       d->aoemajor, d->aoeminor,
-                                       ifp->nd->name, t->addr,
-                                       DEFAULTBCNT);
-                               ifp->maxbcnt = 0;
-                       }
-                       resend(d, t, f);
+       /* collect all frames to rexmit into flist */
+       for (i = 0; i < NFACTIVE; i++) {
+               head = &d->factive[i];
+               list_for_each_safe(pos, nx, head) {
+                       f = list_entry(pos, struct frame, head);
+                       if (tsince(f->tag) < timeout)
+                               break;  /* end of expired frames */
+                       /* move to flist for later processing */
+                       list_move_tail(pos, &flist);
                }
-
-               /* window check */
+       }
+       /* window check */
+       tt = d->targets;
+       te = tt + d->ntargets;
+       for (; tt < te && (t = *tt); tt++) {
                if (t->nout == t->maxout
                && t->maxout < t->nframes
                && (jiffies - t->lastwadj)/HZ > 10) {
@@ -556,45 +610,173 @@ rexmit_timer(ulong vp)
                }
        }
 
-       if (!skb_queue_empty(&d->sendq)) {
+       if (!list_empty(&flist)) {      /* retransmissions necessary */
                n = d->rttavg <<= 1;
                if (n > MAXTIMER)
                        d->rttavg = MAXTIMER;
        }
 
-       if (d->flags & DEVFL_KICKME || d->htgt) {
-               d->flags &= ~DEVFL_KICKME;
-               aoecmd_work(d);
+       /* process expired frames */
+       while (!list_empty(&flist)) {
+               pos = flist.next;
+               f = list_entry(pos, struct frame, head);
+               n = f->waited += timeout;
+               n /= HZ;
+               if (n > aoe_deadsecs) {
+                       /* Waited too long.  Device failure.
+                        * Hang all frames on first hash bucket for downdev
+                        * to clean up.
+                        */
+                       list_splice(&flist, &d->factive[0]);
+                       aoedev_downdev(d);
+                       break;
+               }
+               list_del(pos);
+
+               t = f->t;
+               if (n > aoe_deadsecs/2)
+                       d->htgt = t; /* see if another target can help */
+
+               if (t->nout == t->maxout) {
+                       if (t->maxout > 1)
+                               t->maxout--;
+                       t->lastwadj = jiffies;
+               }
+
+               ifp = getif(t, f->skb->dev);
+               if (ifp && ++ifp->lost > (t->nframes << 1)
+               && (ifp != t->ifs || t->ifs[1].nd)) {
+                       ejectif(t, ifp);
+                       ifp = NULL;
+               }
+               resend(d, f);
        }
 
-       __skb_queue_head_init(&queue);
-       skb_queue_splice_init(&d->sendq, &queue);
+       if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
+               d->flags &= ~DEVFL_KICKME;
+               d->blkq->request_fn(d->blkq);
+       }
 
        d->timer.expires = jiffies + TIMERTICK;
        add_timer(&d->timer);
 
        spin_unlock_irqrestore(&d->lock, flags);
+}
 
-       aoenet_xmit(&queue);
+static unsigned long
+rqbiocnt(struct request *r)
+{
+       struct bio *bio;
+       unsigned long n = 0;
+
+       __rq_for_each_bio(bio, r)
+               n++;
+       return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios.  Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition.  So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+       struct bio_vec *bv;
+       struct page *page;
+       int i;
+
+       bio_for_each_segment(bv, bio, i) {
+               page = bv->bv_page;
+               /* Non-zero page count for non-head members of
+                * compound pages is no longer allowed by the kernel,
+                * but this has never been seen here.
+                */
+               if (unlikely(PageCompound(page)))
+                       if (compound_trans_head(page) != page) {
+                               pr_crit("page tail used for block I/O\n");
+                               BUG();
+                       }
+               atomic_inc(&page->_count);
+       }
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+       struct bio_vec *bv;
+       int i;
+
+       bio_for_each_segment(bv, bio, i)
+               atomic_dec(&bv->bv_page->_count);
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+       struct bio_vec *bv;
+
+       memset(buf, 0, sizeof(*buf));
+       buf->rq = rq;
+       buf->bio = bio;
+       buf->resid = bio->bi_size;
+       buf->sector = bio->bi_sector;
+       bio_pageinc(bio);
+       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+       buf->bv_resid = bv->bv_len;
+       WARN_ON(buf->bv_resid == 0);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+       struct request *rq;
+       struct request_queue *q;
+       struct buf *buf;
+       struct bio *bio;
+
+       q = d->blkq;
+       if (q == NULL)
+               return NULL;    /* initializing */
+       if (d->ip.buf)
+               return d->ip.buf;
+       rq = d->ip.rq;
+       if (rq == NULL) {
+               rq = blk_peek_request(q);
+               if (rq == NULL)
+                       return NULL;
+               blk_start_request(rq);
+               d->ip.rq = rq;
+               d->ip.nxbio = rq->bio;
+               rq->special = (void *) rqbiocnt(rq);
+       }
+       buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+       if (buf == NULL) {
+               pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+               return NULL;
+       }
+       bio = d->ip.nxbio;
+       bufinit(buf, rq, bio);
+       bio = bio->bi_next;
+       d->ip.nxbio = bio;
+       if (bio == NULL)
+               d->ip.rq = NULL;
+       return d->ip.buf = buf;
 }
 
 /* enters with d->lock held */
 void
 aoecmd_work(struct aoedev *d)
 {
-       struct buf *buf;
-loop:
        if (d->htgt && !sthtith(d))
                return;
-       if (d->inprocess == NULL) {
-               if (list_empty(&d->bufq))
-                       return;
-               buf = container_of(d->bufq.next, struct buf, bufs);
-               list_del(d->bufq.next);
-               d->inprocess = buf;
-       }
-       if (aoecmd_ata_rw(d))
-               goto loop;
+       while (aoecmd_ata_rw(d))
+               ;
 }
 
 /* this function performs work that has been deferred until sleeping is OK
@@ -603,28 +785,25 @@ void
 aoecmd_sleepwork(struct work_struct *work)
 {
        struct aoedev *d = container_of(work, struct aoedev, work);
+       struct block_device *bd;
+       u64 ssize;
 
        if (d->flags & DEVFL_GDALLOC)
                aoeblk_gdalloc(d);
 
        if (d->flags & DEVFL_NEWSIZE) {
-               struct block_device *bd;
-               unsigned long flags;
-               u64 ssize;
-
                ssize = get_capacity(d->gd);
                bd = bdget_disk(d->gd, 0);
-
                if (bd) {
                        mutex_lock(&bd->bd_inode->i_mutex);
                        i_size_write(bd->bd_inode, (loff_t)ssize<<9);
                        mutex_unlock(&bd->bd_inode->i_mutex);
                        bdput(bd);
                }
-               spin_lock_irqsave(&d->lock, flags);
+               spin_lock_irq(&d->lock);
                d->flags |= DEVFL_UP;
                d->flags &= ~DEVFL_NEWSIZE;
-               spin_unlock_irqrestore(&d->lock, flags);
+               spin_unlock_irq(&d->lock);
        }
 }
 
@@ -717,163 +896,299 @@ gettgt(struct aoedev *d, char *addr)
        return NULL;
 }
 
-static inline void
-diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
+static void
+bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
+{
+       ulong fcnt;
+       char *p;
+       int soff = 0;
+loop:
+       fcnt = bv->bv_len - (off - bv->bv_offset);
+       if (fcnt > cnt)
+               fcnt = cnt;
+       p = page_address(bv->bv_page) + off;
+       skb_copy_bits(skb, soff, p, fcnt);
+       soff += fcnt;
+       cnt -= fcnt;
+       if (cnt <= 0)
+               return;
+       bv++;
+       off = bv->bv_offset;
+       goto loop;
+}
+
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+       struct bio *bio;
+       int bok;
+       struct request_queue *q;
+
+       q = d->blkq;
+       if (rq == d->ip.rq)
+               d->ip.rq = NULL;
+       do {
+               bio = rq->bio;
+               bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+       } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
+
+       /* cf. http://lkml.org/lkml/2006/10/31/28 */
+       if (!fastfail)
+               q->request_fn(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+       struct request *rq;
+       unsigned long n;
+
+       if (buf == d->ip.buf)
+               d->ip.buf = NULL;
+       rq = buf->rq;
+       bio_pagedec(buf->bio);
+       mempool_free(buf, d->bufpool);
+       n = (unsigned long) rq->special;
+       rq->special = (void *) --n;
+       if (n == 0)
+               aoe_end_request(d, rq, 0);
+}
+
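aoe_end_buf() pairs with rqbiocnt() further up: rq->special holds a countdown of the request's bios, and only the final completion ends the request. The shape of that countdown, sketched with plain stand-in types (hypothetical names, not the block-layer API):

    #include <stdio.h>

    struct req { unsigned long special; int done; };  /* stand-in for struct request */

    static void end_request(struct req *rq) { rq->done = 1; }

    static void end_buf(struct req *rq)
    {
            if (--rq->special == 0)  /* last outstanding bio for this request */
                    end_request(rq);
    }

    int main(void)
    {
            struct req rq = { 3, 0 };  /* rqbiocnt() counted three bios */
            int i;

            for (i = 0; i < 3; i++)
                    end_buf(&rq);
            printf("done=%d\n", rq.done);
            return 0;
    }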
+static void
+ktiocomplete(struct frame *f)
 {
-       unsigned long n_sect = bio->bi_size >> 9;
-       const int rw = bio_data_dir(bio);
-       struct hd_struct *part;
-       int cpu;
+       struct aoe_hdr *hin, *hout;
+       struct aoe_atahdr *ahin, *ahout;
+       struct buf *buf;
+       struct sk_buff *skb;
+       struct aoetgt *t;
+       struct aoeif *ifp;
+       struct aoedev *d;
+       long n;
+
+       if (f == NULL)
+               return;
+
+       t = f->t;
+       d = t->d;
+
+       hout = (struct aoe_hdr *) skb_mac_header(f->skb);
+       ahout = (struct aoe_atahdr *) (hout+1);
+       buf = f->buf;
+       skb = f->r_skb;
+       if (skb == NULL)
+               goto noskb;     /* just fail the buf. */
+
+       hin = (struct aoe_hdr *) skb->data;
+       skb_pull(skb, sizeof(*hin));
+       ahin = (struct aoe_atahdr *) skb->data;
+       skb_pull(skb, sizeof(*ahin));
+       if (ahin->cmdstat & 0xa9) {     /* these bits cleared on success */
+               pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+                       ahout->cmdstat, ahin->cmdstat,
+                       d->aoemajor, d->aoeminor);
+noskb: if (buf)
+                       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+               goto badrsp;
+       }
+
+       n = ahout->scnt << 9;
+       switch (ahout->cmdstat) {
+       case ATA_CMD_PIO_READ:
+       case ATA_CMD_PIO_READ_EXT:
+               if (skb->len < n) {
+                       pr_err("aoe: runt data size in read.  skb->len=%d need=%ld\n",
+                               skb->len, n);
+                       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+                       break;
+               }
+               bvcpy(f->bv, f->bv_off, skb, n);
+       case ATA_CMD_PIO_WRITE:
+       case ATA_CMD_PIO_WRITE_EXT:
+               spin_lock_irq(&d->lock);
+               ifp = getif(t, skb->dev);
+               if (ifp)
+                       ifp->lost = 0;
+               if (d->htgt == t) /* I'll help myself, thank you. */
+                       d->htgt = NULL;
+               spin_unlock_irq(&d->lock);
+               break;
+       case ATA_CMD_ID_ATA:
+               if (skb->len < 512) {
+                       pr_info("aoe: runt data size in ataid.  skb->len=%d\n",
+                               skb->len);
+                       break;
+               }
+               if (skb_linearize(skb))
+                       break;
+               spin_lock_irq(&d->lock);
+               ataid_complete(d, t, skb->data);
+               spin_unlock_irq(&d->lock);
+               break;
+       default:
+               pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+                       ahout->cmdstat,
+                       be16_to_cpu(get_unaligned(&hin->major)),
+                       hin->minor);
+       }
+badrsp:
+       spin_lock_irq(&d->lock);
+
+       aoe_freetframe(f);
+
+       if (buf && --buf->nframesout == 0 && buf->resid == 0)
+               aoe_end_buf(d, buf);
+
+       aoecmd_work(d);
+
+       spin_unlock_irq(&d->lock);
+       aoedev_put(d);
+       dev_kfree_skb(skb);
+}
 
-       cpu = part_stat_lock();
-       part = disk_map_sector_rcu(disk, sector);
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ */
+static int
+ktio(void)
+{
+       struct frame *f;
+       struct list_head *pos;
+       int i;
 
-       part_stat_inc(cpu, part, ios[rw]);
-       part_stat_add(cpu, part, ticks[rw], duration);
-       part_stat_add(cpu, part, sectors[rw], n_sect);
-       part_stat_add(cpu, part, io_ticks, duration);
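+       /* handle at most MAXIOC frames per pass; kthread() calls back for the rest */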
+       for (i = 0; ; ++i) {
+               if (i == MAXIOC)
+                       return 1;
+               if (list_empty(&iocq.head))
+                       return 0;
+               pos = iocq.head.next;
+               list_del(pos);
+               spin_unlock_irq(&iocq.lock);
+               f = list_entry(pos, struct frame, head);
+               ktiocomplete(f);
+               spin_lock_irq(&iocq.lock);
+       }
+}
 
-       part_stat_unlock();
+static int
+kthread(void *vp)
+{
+       struct ktstate *k;
+       DECLARE_WAITQUEUE(wait, current);
+       int more;
+
+       k = vp;
+       current->flags |= PF_NOFREEZE;
+       set_user_nice(current, -10);
+       complete(&k->rendez);   /* tell spawner we're running */
+       do {
+               spin_lock_irq(k->lock);
+               more = k->fn();
+               if (!more) {
+                       add_wait_queue(k->waitq, &wait);
+                       __set_current_state(TASK_INTERRUPTIBLE);
+               }
+               spin_unlock_irq(k->lock);
+               if (!more) {
+                       schedule();
+                       remove_wait_queue(k->waitq, &wait);
+               } else
+                       cond_resched();
+       } while (!kthread_should_stop());
+       complete(&k->rendez);   /* tell spawner we're stopping */
+       return 0;
 }
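
The loop above is the stock lost-wakeup-safe sleep: the worker registers on the wait queue and marks itself TASK_INTERRUPTIBLE while still holding k->lock, so a producer's wake_up() issued between the unlock and schedule() still leaves the task runnable. A minimal sketch of the same idiom, with hypothetical work_pending()/do_work() helpers standing in for the driver's k->fn():

	static DEFINE_SPINLOCK(lock);
	static DECLARE_WAIT_QUEUE_HEAD(waitq);

	static int worker(void *vp)
	{
		DECLARE_WAITQUEUE(wait, current);

		while (!kthread_should_stop()) {
			spin_lock_irq(&lock);
			if (!work_pending()) {
				add_wait_queue(&waitq, &wait);
				__set_current_state(TASK_INTERRUPTIBLE);
				spin_unlock_irq(&lock);
				schedule();	/* a racing wake_up() is not lost */
				remove_wait_queue(&waitq, &wait);
				continue;
			}
			spin_unlock_irq(&lock);
			do_work();	/* off the lock, like ktio() above */
		}
		return 0;
	}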
 
 void
+aoe_ktstop(struct ktstate *k)
+{
+       kthread_stop(k->task);
+       wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)
+{
+       struct task_struct *task;
+
+       init_completion(&k->rendez);
+       task = kthread_run(kthread, k, k->name);
+       if (task == NULL || IS_ERR(task))
+               return -ENOMEM;
+       k->task = task;
+       wait_for_completion(&k->rendez); /* allow kthread to start */
+       init_completion(&k->rendez);    /* reused to wait on exit later */
+       return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+       ulong flags;
+
+       f->r_skb = skb;
+       spin_lock_irqsave(&iocq.lock, flags);
+       list_add_tail(&f->head, &iocq.head);
+       spin_unlock_irqrestore(&iocq.lock, flags);
+       wake_up(&ktiowq);
+}
+
+struct sk_buff *
 aoecmd_ata_rsp(struct sk_buff *skb)
 {
-       struct sk_buff_head queue;
        struct aoedev *d;
-       struct aoe_hdr *hin, *hout;
-       struct aoe_atahdr *ahin, *ahout;
+       struct aoe_hdr *h;
        struct frame *f;
-       struct buf *buf;
        struct aoetgt *t;
-       struct aoeif *ifp;
-       register long n;
+       u32 n;
        ulong flags;
        char ebuf[128];
        u16 aoemajor;
 
-       hin = (struct aoe_hdr *) skb_mac_header(skb);
-       aoemajor = get_unaligned_be16(&hin->major);
-       d = aoedev_by_aoeaddr(aoemajor, hin->minor);
+       h = (struct aoe_hdr *) skb->data;
+       aoemajor = be16_to_cpu(get_unaligned(&h->major));
+       d = aoedev_by_aoeaddr(aoemajor, h->minor);
        if (d == NULL) {
                snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
                        "for unknown device %d.%d\n",
-                        aoemajor, hin->minor);
+                       aoemajor, h->minor);
                aoechr_error(ebuf);
-               return;
+               return skb;
        }
 
        spin_lock_irqsave(&d->lock, flags);
 
-       n = get_unaligned_be32(&hin->tag);
-       t = gettgt(d, hin->src);
-       if (t == NULL) {
-               printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
-                       d->aoemajor, d->aoeminor, hin->src);
-               spin_unlock_irqrestore(&d->lock, flags);
-               return;
-       }
-       f = getframe(t, n);
+       n = be32_to_cpu(get_unaligned(&h->tag));
+       f = getframe(d, n);
        if (f == NULL) {
                calc_rttavg(d, -tsince(n));
                spin_unlock_irqrestore(&d->lock, flags);
+               aoedev_put(d);
                snprintf(ebuf, sizeof ebuf,
                        "%15s e%d.%d    tag=%08x@%08lx\n",
                        "unexpected rsp",
-                       get_unaligned_be16(&hin->major),
-                       hin->minor,
-                       get_unaligned_be32(&hin->tag),
+                       get_unaligned_be16(&h->major),
+                       h->minor,
+                       get_unaligned_be32(&h->tag),
                        jiffies);
                aoechr_error(ebuf);
-               return;
+               return skb;
        }
-
+       t = f->t;
        calc_rttavg(d, tsince(f->tag));
-
-       ahin = (struct aoe_atahdr *) (hin+1);
-       hout = (struct aoe_hdr *) skb_mac_header(f->skb);
-       ahout = (struct aoe_atahdr *) (hout+1);
-       buf = f->buf;
-
-       if (ahin->cmdstat & 0xa9) {     /* these bits cleared on success */
-               printk(KERN_ERR
-                       "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
-                       ahout->cmdstat, ahin->cmdstat,
-                       d->aoemajor, d->aoeminor);
-               if (buf)
-                       buf->flags |= BUFFL_FAIL;
-       } else {
-               if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
-                       d->htgt = NULL;
-               n = ahout->scnt << 9;
-               switch (ahout->cmdstat) {
-               case ATA_CMD_PIO_READ:
-               case ATA_CMD_PIO_READ_EXT:
-                       if (skb->len - sizeof *hin - sizeof *ahin < n) {
-                               printk(KERN_ERR
-                                       "aoe: %s.  skb->len=%d need=%ld\n",
-                                       "runt data size in read", skb->len, n);
-                               /* fail frame f?  just returning will rexmit. */
-                               spin_unlock_irqrestore(&d->lock, flags);
-                               return;
-                       }
-                       memcpy(f->bufaddr, ahin+1, n);
-               case ATA_CMD_PIO_WRITE:
-               case ATA_CMD_PIO_WRITE_EXT:
-                       ifp = getif(t, skb->dev);
-                       if (ifp) {
-                               ifp->lost = 0;
-                               if (n > DEFAULTBCNT)
-                                       ifp->lostjumbo = 0;
-                       }
-                       if (f->bcnt -= n) {
-                               f->lba += n >> 9;
-                               f->bufaddr += n;
-                               resend(d, t, f);
-                               goto xmit;
-                       }
-                       break;
-               case ATA_CMD_ID_ATA:
-                       if (skb->len - sizeof *hin - sizeof *ahin < 512) {
-                               printk(KERN_INFO
-                                       "aoe: runt data size in ataid.  skb->len=%d\n",
-                                       skb->len);
-                               spin_unlock_irqrestore(&d->lock, flags);
-                               return;
-                       }
-                       ataid_complete(d, t, (char *) (ahin+1));
-                       break;
-               default:
-                       printk(KERN_INFO
-                               "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
-                               ahout->cmdstat,
-                               get_unaligned_be16(&hin->major),
-                               hin->minor);
-               }
-       }
-
-       if (buf && --buf->nframesout == 0 && buf->resid == 0) {
-               diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
-               if (buf->flags & BUFFL_FAIL)
-                       bio_endio(buf->bio, -EIO);
-               else {
-                       bio_flush_dcache_pages(buf->bio);
-                       bio_endio(buf->bio, 0);
-               }
-               mempool_free(buf, d->bufpool);
-       }
-
-       f->buf = NULL;
-       f->tag = FREETAG;
        t->nout--;
-
        aoecmd_work(d);
-xmit:
-       __skb_queue_head_init(&queue);
-       skb_queue_splice_init(&d->sendq, &queue);
 
        spin_unlock_irqrestore(&d->lock, flags);
-       aoenet_xmit(&queue);
+
+       ktcomplete(f, skb);
+
+       /*
+        * Note here that we do not perform an aoedev_put, as we are
+        * leaving this reference for the ktio to release.
+        */
+       return NULL;
 }
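
aoecmd_ata_rsp() now transfers skb ownership explicitly: a non-NULL return hands the response back to the caller to free, while NULL means the frame has been queued for the ktio thread, which frees the skb and drops the device reference noted in the comment above. The caller side, wired up in the aoenet.c hunk further down, reduces to:

	skb = aoecmd_ata_rsp(skb);
	if (!skb)
		return 0;		/* handed off to the ktio thread */
	dev_kfree_skb(skb);		/* response was unclaimed */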
 
 void
@@ -895,7 +1210,7 @@ aoecmd_ata_id(struct aoedev *d)
        struct sk_buff *skb;
        struct aoetgt *t;
 
-       f = freeframe(d);
+       f = newframe(d);
        if (f == NULL)
                return NULL;
 
@@ -908,6 +1223,7 @@ aoecmd_ata_id(struct aoedev *d)
        skb_put(skb, sizeof *h + sizeof *ah);
        memset(h, 0, skb->len);
        f->tag = aoehdr_atainit(d, t, h);
+       fhash(f);
        t->nout++;
        f->waited = 0;
 
@@ -928,7 +1244,6 @@ static struct aoetgt *
 addtgt(struct aoedev *d, char *addr, ulong nframes)
 {
        struct aoetgt *t, **tt, **te;
-       struct frame *f, *e;
 
        tt = d->targets;
        te = tt + NTARGETS;
@@ -940,26 +1255,73 @@ addtgt(struct aoedev *d, char *addr, ulong nframes)
                        "aoe: device addtgt failure; too many targets\n");
                return NULL;
        }
-       t = kcalloc(1, sizeof *t, GFP_ATOMIC);
-       f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
-       if (!t || !f) {
-               kfree(f);
-               kfree(t);
+       t = kzalloc(sizeof(*t), GFP_ATOMIC);
+       if (!t) {
                printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
                return NULL;
        }
 
+       d->ntargets++;
        t->nframes = nframes;
-       t->frames = f;
-       e = f + nframes;
-       for (; f < e; f++)
-               f->tag = FREETAG;
+       t->d = d;
        memcpy(t->addr, addr, sizeof t->addr);
        t->ifp = t->ifs;
        t->maxout = t->nframes;
+       INIT_LIST_HEAD(&t->ffree);
        return *tt = t;
 }
 
+static void
+setdbcnt(struct aoedev *d)
+{
+       struct aoetgt **t, **e;
+       int bcnt = 0;
+
+       t = d->targets;
+       e = t + NTARGETS;
+       for (; t < e && *t; t++)
+               if (bcnt == 0 || bcnt > (*t)->minbcnt)
+                       bcnt = (*t)->minbcnt;
+       if (bcnt != d->maxbcnt) {
+               d->maxbcnt = bcnt;
+               pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+                       d->aoemajor, d->aoeminor, bcnt);
+       }
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+       struct aoedev *d;
+       struct aoeif *p, *e;
+       int minbcnt;
+
+       d = t->d;
+       minbcnt = bcnt;
+       p = t->ifs;
+       e = p + NAOEIFS;
+       for (; p < e; p++) {
+               if (p->nd == NULL)
+                       break;          /* end of the valid interfaces */
+               if (p->nd == nd) {
+                       p->bcnt = bcnt; /* we're updating */
+                       nd = NULL;
+               } else if (minbcnt > p->bcnt)
+                       minbcnt = p->bcnt; /* find the min interface */
+       }
+       if (nd) {
+               if (p == e) {
+                       pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+                       return;
+               }
+               dev_hold(nd);
+               p->nd = nd;
+               p->bcnt = bcnt;
+       }
+       t->minbcnt = minbcnt;
+       setdbcnt(d);
+}
+
 void
 aoecmd_cfg_rsp(struct sk_buff *skb)
 {
@@ -967,11 +1329,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
        struct aoe_hdr *h;
        struct aoe_cfghdr *ch;
        struct aoetgt *t;
-       struct aoeif *ifp;
        ulong flags, sysminor, aoemajor;
        struct sk_buff *sl;
+       struct sk_buff_head queue;
        u16 n;
 
+       sl = NULL;
        h = (struct aoe_hdr *) skb_mac_header(skb);
        ch = (struct aoe_cfghdr *) (h+1);
 
@@ -985,6 +1348,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
                        "Check shelf dip switches.\n");
                return;
        }
+       if (h->minor >= NPERSHELF) {
+               pr_err("aoe: e%ld.%d %s, %d\n",
+                       aoemajor, h->minor,
+                       "slot number larger than the maximum",
+                       NPERSHELF-1);
+               return;
+       }
 
        sysminor = SYSMINOR(aoemajor, h->minor);
        if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
@@ -1008,52 +1378,26 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
        t = gettgt(d, h->src);
        if (!t) {
                t = addtgt(d, h->src, n);
-               if (!t) {
-                       spin_unlock_irqrestore(&d->lock, flags);
-                       return;
-               }
-       }
-       ifp = getif(t, skb->dev);
-       if (!ifp) {
-               ifp = addif(t, skb->dev);
-               if (!ifp) {
-                       printk(KERN_INFO
-                               "aoe: device addif failure; "
-                               "too many interfaces?\n");
-                       spin_unlock_irqrestore(&d->lock, flags);
-                       return;
-               }
-       }
-       if (ifp->maxbcnt) {
-               n = ifp->nd->mtu;
-               n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
-               n /= 512;
-               if (n > ch->scnt)
-                       n = ch->scnt;
-               n = n ? n * 512 : DEFAULTBCNT;
-               if (n != ifp->maxbcnt) {
-                       printk(KERN_INFO
-                               "aoe: e%ld.%d: setting %d%s%s:%pm\n",
-                               d->aoemajor, d->aoeminor, n,
-                               " byte data frames on ", ifp->nd->name,
-                               t->addr);
-                       ifp->maxbcnt = n;
-               }
+               if (!t)
+                       goto bail;
        }
+       n = skb->dev->mtu;
+       n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+       n /= 512;
+       if (n > ch->scnt)
+               n = ch->scnt;
+       n = n ? n * 512 : DEFAULTBCNT;
+       setifbcnt(t, skb->dev, n);
 
        /* don't change users' perspective */
-       if (d->nopen) {
-               spin_unlock_irqrestore(&d->lock, flags);
-               return;
+       if (d->nopen == 0) {
+               d->fw_ver = be16_to_cpu(ch->fwver);
+               sl = aoecmd_ata_id(d);
        }
-       d->fw_ver = be16_to_cpu(ch->fwver);
-
-       sl = aoecmd_ata_id(d);
-
+bail:
        spin_unlock_irqrestore(&d->lock, flags);
-
+       aoedev_put(d);
        if (sl) {
-               struct sk_buff_head queue;
                __skb_queue_head_init(&queue);
                __skb_queue_tail(&queue, sl);
                aoenet_xmit(&queue);
@@ -1064,20 +1408,74 @@ void
 aoecmd_cleanslate(struct aoedev *d)
 {
        struct aoetgt **t, **te;
-       struct aoeif *p, *e;
 
        d->mintimer = MINTIMER;
+       d->maxbcnt = 0;
 
        t = d->targets;
        te = t + NTARGETS;
-       for (; t < te && *t; t++) {
+       for (; t < te && *t; t++)
                (*t)->maxout = (*t)->nframes;
-               p = (*t)->ifs;
-               e = p + NAOEIFS;
-               for (; p < e; p++) {
-                       p->lostjumbo = 0;
-                       p->lost = 0;
-                       p->maxbcnt = DEFAULTBCNT;
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+       if (buf == NULL)
+               return;
+       buf->resid = 0;
+       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+       if (buf->nframesout == 0)
+               aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+       struct frame *f;
+       struct aoedev *d;
+       LIST_HEAD(flist);
+       struct list_head *pos;
+       struct sk_buff *skb;
+       ulong flags;
+
+       spin_lock_irqsave(&iocq.lock, flags);
+       list_splice_init(&iocq.head, &flist);
+       spin_unlock_irqrestore(&iocq.lock, flags);
+       while (!list_empty(&flist)) {
+               pos = flist.next;
+               list_del(pos);
+               f = list_entry(pos, struct frame, head);
+               d = f->t->d;
+               skb = f->r_skb;
+               spin_lock_irqsave(&d->lock, flags);
+               if (f->buf) {
+                       f->buf->nframesout--;
+                       aoe_failbuf(d, f->buf);
                }
+               aoe_freetframe(f);
+               spin_unlock_irqrestore(&d->lock, flags);
+               dev_kfree_skb(skb);
+               aoedev_put(d);
        }
 }
+
+int __init
+aoecmd_init(void)
+{
+       INIT_LIST_HEAD(&iocq.head);
+       spin_lock_init(&iocq.lock);
+       init_waitqueue_head(&ktiowq);
+       kts.name = "aoe_ktio";
+       kts.fn = ktio;
+       kts.waitq = &ktiowq;
+       kts.lock = &iocq.lock;
+       return aoe_ktstart(&kts);
+}
+
+void
+aoecmd_exit(void)
+{
+       aoe_ktstop(&kts);
+       aoe_flush_iocq();
+}
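
aoecmd_init() above doubles as the template for the small ktstate worker API this patch introduces: fill in name, fn, waitq, and lock, call aoe_ktstart(), and fn() then runs with the lock held, returning nonzero iff work remains. A hedged sketch of the producer/consumer pairing (myq, mylock, mywq, and handle_one() are hypothetical stand-ins for iocq.head, iocq.lock, ktiowq, and ktiocomplete()):

	/* consumer: runs as k->fn() with mylock held */
	static int
	myfn(void)
	{
		struct frame *f;

		while (!list_empty(&myq)) {
			f = list_first_entry(&myq, struct frame, head);
			list_del(&f->head);
			spin_unlock_irq(&mylock);	/* work without the lock */
			handle_one(f);
			spin_lock_irq(&mylock);
		}
		return 0;			/* queue drained */
	}

	/* producer: queue the work, then kick the kthread */
	ulong flags;

	spin_lock_irqsave(&mylock, flags);
	list_add_tail(&f->head, &myq);
	spin_unlock_irqrestore(&mylock, flags);
	wake_up(&mywq);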
index 6b5110a474582fb296e7abbb0076c99a466d7fc8..ccaecff4c69b1514dce4405d9e293217f23e8c7c 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoedev.c
  * AoE device utility functions; maintains device list.
@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
 static struct aoedev *devlist;
 static DEFINE_SPINLOCK(devlist_lock);
 
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr or
+ * aoedev_by_sysminor_m automatically get a reference count and must
+ * be responsible for performing an aoedev_put.  With the addition of
+ * async kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq.  When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
 struct aoedev *
 aoedev_by_aoeaddr(int maj, int min)
 {
@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
        spin_lock_irqsave(&devlist_lock, flags);
 
        for (d=devlist; d; d=d->next)
-               if (d->aoemajor == maj && d->aoeminor == min)
+               if (d->aoemajor == maj && d->aoeminor == min) {
+                       d->ref++;
                        break;
+               }
 
        spin_unlock_irqrestore(&devlist_lock, flags);
        return d;
 }
 
+void
+aoedev_put(struct aoedev *d)
+{
+       ulong flags;
+
+       spin_lock_irqsave(&devlist_lock, flags);
+       d->ref--;
+       spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
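
A sketch of the reference discipline the comment above spells out; every successful lookup must be balanced by exactly one aoedev_put():

	d = aoedev_by_aoeaddr(maj, min);	/* takes d->ref under devlist_lock */
	if (d == NULL)
		return;
	/* ... use d, taking d->lock as needed ... */
	aoedev_put(d);			/* drops d->ref; a flush may now reclaim d */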
 static void
 dummy_timer(ulong vp)
 {
@@ -47,54 +70,74 @@ dummy_timer(ulong vp)
        add_timer(&d->timer);
 }
 
+static void
+aoe_failip(struct aoedev *d)
+{
+       struct request *rq;
+       struct bio *bio;
+       unsigned long n;
+
+       aoe_failbuf(d, d->ip.buf);
+
+       rq = d->ip.rq;
+       if (rq == NULL)
+               return;
+       while ((bio = d->ip.nxbio)) {
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+               d->ip.nxbio = bio->bi_next;
+               n = (unsigned long) rq->special;
+               rq->special = (void *) --n;
+       }
+       if ((unsigned long) rq->special == 0)
+               aoe_end_request(d, rq, 0);
+}
+
 void
 aoedev_downdev(struct aoedev *d)
 {
-       struct aoetgt **t, **te;
-       struct frame *f, *e;
-       struct buf *buf;
-       struct bio *bio;
+       struct aoetgt *t, **tt, **te;
+       struct frame *f;
+       struct list_head *head, *pos, *nx;
+       struct request *rq;
+       int i;
 
-       t = d->targets;
-       te = t + NTARGETS;
-       for (; t < te && *t; t++) {
-               f = (*t)->frames;
-               e = f + (*t)->nframes;
-               for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) {
-                       if (f->tag == FREETAG || f->buf == NULL)
-                               continue;
-                       buf = f->buf;
-                       bio = buf->bio;
-                       if (--buf->nframesout == 0
-                       && buf != d->inprocess) {
-                               mempool_free(buf, d->bufpool);
-                               bio_endio(bio, -EIO);
+       d->flags &= ~DEVFL_UP;
+
+       /* clean out active buffers */
+       for (i = 0; i < NFACTIVE; i++) {
+               head = &d->factive[i];
+               list_for_each_safe(pos, nx, head) {
+                       f = list_entry(pos, struct frame, head);
+                       list_del(pos);
+                       if (f->buf) {
+                               f->buf->nframesout--;
+                               aoe_failbuf(d, f->buf);
                        }
+                       aoe_freetframe(f);
                }
-               (*t)->maxout = (*t)->nframes;
-               (*t)->nout = 0;
        }
-       buf = d->inprocess;
-       if (buf) {
-               bio = buf->bio;
-               mempool_free(buf, d->bufpool);
-               bio_endio(bio, -EIO);
+       /* reset window dressings */
+       tt = d->targets;
+       te = tt + NTARGETS;
+       for (; tt < te && (t = *tt); tt++) {
+               t->maxout = t->nframes;
+               t->nout = 0;
        }
-       d->inprocess = NULL;
+
+       /* clean out the in-process request (if any) */
+       aoe_failip(d);
        d->htgt = NULL;
 
-       while (!list_empty(&d->bufq)) {
-               buf = container_of(d->bufq.next, struct buf, bufs);
-               list_del(d->bufq.next);
-               bio = buf->bio;
-               mempool_free(buf, d->bufpool);
-               bio_endio(bio, -EIO);
+       /* fast fail all pending I/O */
+       if (d->blkq) {
+               while ((rq = blk_peek_request(d->blkq))) {
+                       blk_start_request(rq);
+                       aoe_end_request(d, rq, 1);
+               }
        }
 
        if (d->gd)
                set_capacity(d->gd, 0);
-
-       d->flags &= ~DEVFL_UP;
 }
 
 static void
@@ -107,6 +150,7 @@ aoedev_freedev(struct aoedev *d)
                aoedisk_rm_sysfs(d);
                del_gendisk(d->gd);
                put_disk(d->gd);
+               blk_cleanup_queue(d->blkq);
        }
        t = d->targets;
        e = t + NTARGETS;
@@ -115,7 +159,6 @@ aoedev_freedev(struct aoedev *d)
        if (d->bufpool)
                mempool_destroy(d->bufpool);
        skbpoolfree(d);
-       blk_cleanup_queue(d->blkq);
        kfree(d);
 }
 
@@ -142,7 +185,8 @@ aoedev_flush(const char __user *str, size_t cnt)
                spin_lock(&d->lock);
                if ((!all && (d->flags & DEVFL_UP))
                || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
-               || d->nopen) {
+               || d->nopen
+               || d->ref) {
                        spin_unlock(&d->lock);
                        dd = &d->next;
                        continue;
@@ -163,12 +207,15 @@ aoedev_flush(const char __user *str, size_t cnt)
        return 0;
 }
 
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
 static void
 skbfree(struct sk_buff *skb)
 {
-       enum { Sms = 100, Tms = 3*1000};
+       enum { Sms = 250, Tms = 30 * 1000};
        int i = Tms / Sms;
 
        if (skb == NULL)
@@ -182,6 +229,7 @@ skbfree(struct sk_buff *skb)
                        "cannot free skb -- memory leaked.");
                return;
        }
+       skb->truesize -= skb->data_len;
        skb_shinfo(skb)->nr_frags = skb->data_len = 0;
        skb_trim(skb, 0);
        dev_kfree_skb(skb);
@@ -203,13 +251,16 @@ struct aoedev *
 aoedev_by_sysminor_m(ulong sysminor)
 {
        struct aoedev *d;
+       int i;
        ulong flags;
 
        spin_lock_irqsave(&devlist_lock, flags);
 
        for (d=devlist; d; d=d->next)
-               if (d->sysminor == sysminor)
+               if (d->sysminor == sysminor) {
+                       d->ref++;
                        break;
+               }
        if (d)
                goto out;
        d = kcalloc(1, sizeof *d, GFP_ATOMIC);
@@ -217,7 +268,6 @@ aoedev_by_sysminor_m(ulong sysminor)
                goto out;
        INIT_WORK(&d->work, aoecmd_sleepwork);
        spin_lock_init(&d->lock);
-       skb_queue_head_init(&d->sendq);
        skb_queue_head_init(&d->skbpool);
        init_timer(&d->timer);
        d->timer.data = (ulong) d;
@@ -226,7 +276,9 @@ aoedev_by_sysminor_m(ulong sysminor)
        add_timer(&d->timer);
        d->bufpool = NULL;      /* defer to aoeblk_gdalloc */
        d->tgt = d->targets;
-       INIT_LIST_HEAD(&d->bufq);
+       d->ref = 1;
+       for (i = 0; i < NFACTIVE; i++)
+               INIT_LIST_HEAD(&d->factive[i]);
        d->sysminor = sysminor;
        d->aoemajor = AOEMAJOR(sysminor);
        d->aoeminor = AOEMINOR(sysminor);
@@ -241,13 +293,23 @@ aoedev_by_sysminor_m(ulong sysminor)
 static void
 freetgt(struct aoedev *d, struct aoetgt *t)
 {
-       struct frame *f, *e;
+       struct frame *f;
+       struct list_head *pos, *nx, *head;
+       struct aoeif *ifp;
 
-       f = t->frames;
-       e = f + t->nframes;
-       for (; f < e; f++)
+       for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
+               if (!ifp->nd)
+                       break;
+               dev_put(ifp->nd);
+       }
+
+       head = &t->ffree;
+       list_for_each_safe(pos, nx, head) {
+               list_del(pos);
+               f = list_entry(pos, struct frame, head);
                skbfree(f->skb);
-       kfree(t->frames);
+               kfree(f);
+       }
        kfree(t);
 }
 
@@ -257,6 +319,7 @@ aoedev_exit(void)
        struct aoedev *d;
        ulong flags;
 
+       aoe_flush_iocq();
        while ((d = devlist)) {
                devlist = d->next;
 
index 7f83ad90e76fd9f4e971ec4d00826eea2e1cffe7..04793c2c701b7b0bf707a576a5a9e1a01bccc376 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoemain.c
  * Module initialization routines, discover timer
@@ -61,6 +61,7 @@ aoe_exit(void)
 
        aoenet_exit();
        unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
+       aoecmd_exit();
        aoechr_exit();
        aoedev_exit();
        aoeblk_exit();          /* free cache after de-allocating bufs */
@@ -83,17 +84,20 @@ aoe_init(void)
        ret = aoenet_init();
        if (ret)
                goto net_fail;
+       ret = aoecmd_init();
+       if (ret)
+               goto cmd_fail;
        ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
        if (ret < 0) {
                printk(KERN_ERR "aoe: can't register major\n");
                goto blkreg_fail;
        }
-
        printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
        discover_timer(TINIT);
        return 0;
-
  blkreg_fail:
+       aoecmd_exit();
+ cmd_fail:
        aoenet_exit();
  net_fail:
        aoeblk_exit();
index 4d3bc0d49df59394ea550a74c05c4ad0c84c436b..162c6471275c6792454871732e1d4d21cefa04ac 100644
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc.  See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
 /*
  * aoenet.c
  * Ethernet portion of AoE driver
@@ -33,6 +33,9 @@ static char aoe_iflist[IFLISTSZ];
 module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
 MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\"");
 
+static wait_queue_head_t txwq;
+static struct ktstate kts;
+
 #ifndef MODULE
 static int __init aoe_iflist_setup(char *str)
 {
@@ -44,6 +47,23 @@ static int __init aoe_iflist_setup(char *str)
 __setup("aoe_iflist=", aoe_iflist_setup);
 #endif
 
+static spinlock_t txlock;
+static struct sk_buff_head skbtxq;
+
+/* enters with txlock held */
+static int
+tx(void)
+{
+       struct sk_buff *skb;
+
+       while ((skb = skb_dequeue(&skbtxq))) {
+               spin_unlock_irq(&txlock);
+               dev_queue_xmit(skb);
+               spin_lock_irq(&txlock);
+       }
+       return 0;
+}
+
 int
 is_aoe_netif(struct net_device *ifp)
 {
@@ -88,10 +108,14 @@ void
 aoenet_xmit(struct sk_buff_head *queue)
 {
        struct sk_buff *skb, *tmp;
+       ulong flags;
 
        skb_queue_walk_safe(queue, skb, tmp) {
                __skb_unlink(skb, queue);
-               dev_queue_xmit(skb);
+               spin_lock_irqsave(&txlock, flags);
+               skb_queue_tail(&skbtxq, skb);
+               spin_unlock_irqrestore(&txlock, flags);
+               wake_up(&txwq);
        }
 }
 
@@ -102,7 +126,9 @@ static int
 aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
 {
        struct aoe_hdr *h;
+       struct aoe_atahdr *ah;
        u32 n;
+       int sn;
 
        if (dev_net(ifp) != &init_net)
                goto exit;
@@ -110,13 +136,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                return 0;
-       if (skb_linearize(skb))
-               goto exit;
        if (!is_aoe_netif(ifp))
                goto exit;
        skb_push(skb, ETH_HLEN);        /* (1) */
-
-       h = (struct aoe_hdr *) skb_mac_header(skb);
+       sn = sizeof(*h) + sizeof(*ah);
+       if (skb->len >= sn) {
+               sn -= skb_headlen(skb);
+               if (sn > 0 && !__pskb_pull_tail(skb, sn))
+                       goto exit;
+       }
+       h = (struct aoe_hdr *) skb->data;
        n = get_unaligned_be32(&h->tag);
        if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
                goto exit;
@@ -137,7 +166,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
 
        switch (h->cmd) {
        case AOECMD_ATA:
-               aoecmd_ata_rsp(skb);
+               /* ata_rsp may keep skb for later processing or give it back */
+               skb = aoecmd_ata_rsp(skb);
                break;
        case AOECMD_CFG:
                aoecmd_cfg_rsp(skb);
@@ -145,8 +175,12 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
        default:
                if (h->cmd >= AOECMD_VEND_MIN)
                        break;  /* don't complain about vendor commands */
-               printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd);
+               pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
+               break;
        }
+
+       if (!skb)
+               return 0;
 exit:
        dev_kfree_skb(skb);
        return 0;
@@ -160,6 +194,15 @@ static struct packet_type aoe_pt __read_mostly = {
 int __init
 aoenet_init(void)
 {
+       skb_queue_head_init(&skbtxq);
+       init_waitqueue_head(&txwq);
+       spin_lock_init(&txlock);
+       kts.lock = &txlock;
+       kts.fn = tx;
+       kts.waitq = &txwq;
+       kts.name = "aoe_tx";
+       if (aoe_ktstart(&kts))
+               return -EAGAIN;
        dev_add_pack(&aoe_pt);
        return 0;
 }
@@ -167,6 +210,8 @@ aoenet_init(void)
 void
 aoenet_exit(void)
 {
+       aoe_ktstop(&kts);
+       skb_queue_purge(&skbtxq);
        dev_remove_pack(&aoe_pt);
 }
 
index ca83f96756ad86b2a339971050b7378f9a9752d9..6526157edafc151e2761adf740a6404a6c046f26 100644
@@ -41,8 +41,9 @@
 #include <linux/spinlock.h>
 #include <linux/compat.h>
 #include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/io.h>
 #include <asm/uaccess.h>
-#include <asm/io.h>
 
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
@@ -978,8 +979,7 @@ static CommandList_struct *cmd_alloc(ctlr_info_t *h)
                i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
                if (i == h->nr_cmds)
                        return NULL;
-       } while (test_and_set_bit(i & (BITS_PER_LONG - 1),
-                 h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
+       } while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
        c = h->cmd_pool + i;
        memset(c, 0, sizeof(CommandList_struct));
        cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct);
@@ -1046,8 +1046,7 @@ static void cmd_free(ctlr_info_t *h, CommandList_struct *c)
        int i;
 
        i = c - h->cmd_pool;
-       clear_bit(i & (BITS_PER_LONG - 1),
-                 h->cmd_pool_bits + (i / BITS_PER_LONG));
+       clear_bit(i, h->cmd_pool_bits);
        h->nr_frees++;
 }
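
Both cciss hunks above converge on the stock bitmap helpers: BITS_TO_LONGS() sizes the array, and test_and_set_bit()/clear_bit() take a plain bit index, so the open-coded BITS_PER_LONG arithmetic disappears. The allocator idiom, sketched for a hypothetical nr-slot pool:

	unsigned long *bits;
	int i;

	bits = kmalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long), GFP_KERNEL);
	bitmap_zero(bits, nr);
	do {
		i = find_first_zero_bit(bits, nr);
		if (i == nr)
			return NULL;		/* pool exhausted */
	} while (test_and_set_bit(i, bits));	/* lost a race: retry */
	/* slot i is now ours; release it later with clear_bit(i, bits) */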
 
@@ -4268,10 +4267,7 @@ static void __devinit cciss_find_board_params(ctlr_info_t *h)
 
 static inline bool CISS_signature_present(ctlr_info_t *h)
 {
-       if ((readb(&h->cfgtable->Signature[0]) != 'C') ||
-           (readb(&h->cfgtable->Signature[1]) != 'I') ||
-           (readb(&h->cfgtable->Signature[2]) != 'S') ||
-           (readb(&h->cfgtable->Signature[3]) != 'S')) {
+       if (!check_signature(h->cfgtable->Signature, "CISS", 4)) {
                dev_warn(&h->pdev->dev, "not a valid CISS config table\n");
                return false;
        }
@@ -4812,8 +4808,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
 
 static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h)
 {
-       h->cmd_pool_bits = kmalloc(
-               DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) *
+       h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) *
                sizeof(unsigned long), GFP_KERNEL);
        h->cmd_pool = pci_alloc_consistent(h->pdev,
                h->nr_cmds * sizeof(CommandList_struct),
@@ -5068,9 +5063,7 @@ reinit_after_soft_reset:
        pci_set_drvdata(pdev, h);
        /* command and error info recs zeroed out before
           they are used */
-       memset(h->cmd_pool_bits, 0,
-              DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG)
-                       * sizeof(unsigned long));
+       bitmap_zero(h->cmd_pool_bits, h->nr_cmds);
 
        h->num_luns = 0;
        h->highest_lun = -1;
index 0c03411c59eb8cba0f744e72aed93342dc9cace8..043ddcca4abf936d576a03c1a9e7241e5d59d2eb 100644
@@ -78,6 +78,8 @@ static const char *ioctl_cmd_to_ascii(int cmd)
        case NBD_SET_SOCK: return "set-sock";
        case NBD_SET_BLKSIZE: return "set-blksize";
        case NBD_SET_SIZE: return "set-size";
+       case NBD_SET_TIMEOUT: return "set-timeout";
+       case NBD_SET_FLAGS: return "set-flags";
        case NBD_DO_IT: return "do-it";
        case NBD_CLEAR_SOCK: return "clear-sock";
        case NBD_CLEAR_QUE: return "clear-que";
@@ -96,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd)
        case  NBD_CMD_READ: return "read";
        case NBD_CMD_WRITE: return "write";
        case  NBD_CMD_DISC: return "disconnect";
+       case  NBD_CMD_TRIM: return "trim/discard";
        }
        return "invalid";
 }
@@ -467,8 +470,12 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 
        nbd_cmd(req) = NBD_CMD_READ;
        if (rq_data_dir(req) == WRITE) {
-               nbd_cmd(req) = NBD_CMD_WRITE;
-               if (nbd->flags & NBD_READ_ONLY) {
+               if ((req->cmd_flags & REQ_DISCARD)) {
+                       WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
+                       nbd_cmd(req) = NBD_CMD_TRIM;
+               } else
+                       nbd_cmd(req) = NBD_CMD_WRITE;
+               if (nbd->flags & NBD_FLAG_READ_ONLY) {
                        dev_err(disk_to_dev(nbd->disk),
                                "Write on read-only\n");
                        goto error_out;
@@ -651,6 +658,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                nbd->xmit_timeout = arg * HZ;
                return 0;
 
+       case NBD_SET_FLAGS:
+               nbd->flags = arg;
+               return 0;
+
        case NBD_SET_SIZE_BLOCKS:
                nbd->bytesize = ((u64) arg) * nbd->blksize;
                bdev->bd_inode->i_size = nbd->bytesize;
@@ -670,6 +681,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 
                mutex_unlock(&nbd->tx_lock);
 
+               if (nbd->flags & NBD_FLAG_SEND_TRIM)
+                       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                               nbd->disk->queue);
+
                thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
                if (IS_ERR(thread)) {
                        mutex_lock(&nbd->tx_lock);
@@ -687,6 +702,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                nbd->file = NULL;
                nbd_clear_que(nbd);
                dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
+               queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
                if (file)
                        fput(file);
                nbd->bytesize = 0;
@@ -805,6 +821,9 @@ static int __init nbd_init(void)
                 * Tell the block layer that we are not a rotational device
                 */
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
+               disk->queue->limits.discard_granularity = 512;
+               disk->queue->limits.max_discard_sectors = UINT_MAX;
+               disk->queue->limits.discard_zeroes_data = 0;
        }
 
        if (register_blkdev(NBD_MAJOR, "nbd")) {
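
With NBD_SET_FLAGS handled, a client can opt in to TRIM before starting the transfer loop. A hypothetical userspace sketch (error handling elided; the macro names come from <linux/nbd.h>, and connect_to_server() is an assumed helper returning a connected socket fd):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/nbd.h>

	int main(void)
	{
		int sock = connect_to_server();		/* hypothetical */
		int nbd = open("/dev/nbd0", O_RDWR);

		ioctl(nbd, NBD_SET_SOCK, sock);
		ioctl(nbd, NBD_SET_FLAGS, NBD_FLAG_SEND_TRIM);	/* before NBD_DO_IT */
		ioctl(nbd, NBD_DO_IT);	/* blocks, serving requests until disconnect */
		return 0;
	}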
index 47ff7e470d87dc15cd0f8ba558b00c1e45e5ecbc..161b1094eb47394baab373824f41f5964da1047f 100644
@@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma)
 
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
-       /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
+       /* Remap-pfn-range will mark the range VM_IO */
        if (remap_pfn_range(vma,
                            vma->vm_start,
                            __pa(soft->gscr_addr) >> PAGE_SHIFT,
index e5eedfa24c91010bd8358db7b7076e4f0af6ad1b..0537903c985b653bfd78d3005133267f43bdf538 100644
@@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
 
        vma->vm_ops = &mmap_mem_ops;
 
-       /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */
+       /* Remap-pfn-range will mark the range VM_IO */
        if (remap_pfn_range(vma,
                            vma->vm_start,
                            vma->vm_pgoff,
index 845f97fd18326fbe2301a515da53483d966907a8..e1f60f968fddc8e55b932f4d2ac185835871dfb3 100644
@@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
        atomic_set(&vdata->refcnt, 1);
        vma->vm_private_data = vdata;
 
-       vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND);
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED)
                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        vma->vm_ops = &mspec_vm_ops;
index 3491654cdf7b84fdcd2253f71542b3a53dcefdb9..a815d44c70a41b802c771dea337bf2e5348592a8 100644
@@ -582,7 +582,7 @@ void dmaengine_get(void)
                                list_del_rcu(&device->global_node);
                                break;
                        } else if (err)
-                               pr_err("%s: failed to get %s: (%d)\n",
+                               pr_debug("%s: failed to get %s: (%d)\n",
                                       __func__, dma_chan_name(chan), err);
                }
        }
index b298158cb9224dd24eb65c5c00d2733b5d6cfc98..fd3ae6290d71ecfd927cc4447a942c099bd223ea 100644
@@ -16,6 +16,7 @@
  */
 static char dmi_empty_string[] = "        ";
 
+static u16 __initdata dmi_ver;
 /*
  * Catch too early calls to dmi_check_system():
  */
@@ -118,12 +119,12 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *,
        return 0;
 }
 
-static int __init dmi_checksum(const u8 *buf)
+static int __init dmi_checksum(const u8 *buf, u8 len)
 {
        u8 sum = 0;
        int a;
 
-       for (a = 0; a < 15; a++)
+       for (a = 0; a < len; a++)
                sum += buf[a];
 
        return sum == 0;
@@ -161,8 +162,10 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde
                return;
 
        for (i = 0; i < 16 && (is_ff || is_00); i++) {
-               if(d[i] != 0x00) is_ff = 0;
-               if(d[i] != 0xFF) is_00 = 0;
+               if (d[i] != 0x00)
+                       is_00 = 0;
+               if (d[i] != 0xFF)
+                       is_ff = 0;
        }
 
        if (is_ff || is_00)
@@ -172,7 +175,15 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde
        if (!s)
                return;
 
-       sprintf(s, "%pUB", d);
+       /*
+        * As of version 2.6 of the SMBIOS specification, the first 3 fields of
+        * the UUID are supposed to be little-endian encoded.  The specification
+        * says that this is the de facto standard.
+        */
+       if (dmi_ver >= 0x0206)
+               sprintf(s, "%pUL", d);
+       else
+               sprintf(s, "%pUB", d);
 
         dmi_ident[slot] = s;
 }
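
For concreteness (an illustration, not from the patch): given stored UUID bytes 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff, the two specifiers render

	%pUB -> 00112233-4455-6677-8899-aabbccddeeff	(bytes in stored order)
	%pUL -> 33221100-5544-7766-8899-aabbccddeeff	(first three fields byte-swapped)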
@@ -404,29 +415,57 @@ static int __init dmi_present(const char __iomem *p)
        u8 buf[15];
 
        memcpy_fromio(buf, p, 15);
-       if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) {
+       if (dmi_checksum(buf, 15)) {
                dmi_num = (buf[13] << 8) | buf[12];
                dmi_len = (buf[7] << 8) | buf[6];
                dmi_base = (buf[11] << 24) | (buf[10] << 16) |
                        (buf[9] << 8) | buf[8];
 
-               /*
-                * DMI version 0.0 means that the real version is taken from
-                * the SMBIOS version, which we don't know at this point.
-                */
-               if (buf[14] != 0)
-                       printk(KERN_INFO "DMI %d.%d present.\n",
-                              buf[14] >> 4, buf[14] & 0xF);
-               else
-                       printk(KERN_INFO "DMI present.\n");
                if (dmi_walk_early(dmi_decode) == 0) {
+                       if (dmi_ver)
+                               pr_info("SMBIOS %d.%d present.\n",
+                                      dmi_ver >> 8, dmi_ver & 0xFF);
+                       else {
+                               dmi_ver = (buf[14] & 0xF0) << 4 |
+                                          (buf[14] & 0x0F);
+                               pr_info("Legacy DMI %d.%d present.\n",
+                                      dmi_ver >> 8, dmi_ver & 0xFF);
+                       }
                        dmi_dump_ids();
                        return 0;
                }
        }
+       dmi_ver = 0;
        return 1;
 }
 
+static int __init smbios_present(const char __iomem *p)
+{
+       u8 buf[32];
+       int offset = 0;
+
+       memcpy_fromio(buf, p, 32);
+       if ((buf[5] < 32) && dmi_checksum(buf, buf[5])) {
+               dmi_ver = (buf[6] << 8) + buf[7];
+
+               /* Some BIOSes report a weird SMBIOS version; fix that up */
+               switch (dmi_ver) {
+               case 0x021F:
+               case 0x0221:
+                       pr_debug("SMBIOS version fixup(2.%d->2.%d)\n",
+                              dmi_ver & 0xFF, 3);
+                       dmi_ver = 0x0203;
+                       break;
+               case 0x0233:
+                       pr_debug("SMBIOS version fixup(2.%d->2.%d)\n", 51, 6);
+                       dmi_ver = 0x0206;
+                       break;
+               }
+               offset = 16;
+       }
+       return dmi_present(buf + offset);
+}
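
smbios_present() is keyed to the 32-byte SMBIOS 2.x entry point; the offsets it reads are, per the specification (a sketch of a well-formed table):

	offset  0: "_SM_" anchor string (matched by the caller below)
	offset  4: checksum byte (the first buf[5] bytes sum to zero)
	offset  5: entry point length, required above to be < 32
	offset  6: SMBIOS major version \  packed into dmi_ver
	offset  7: SMBIOS minor version /  as (major << 8) | minor
	offset 16: legacy "_DMI_" anchor of the 15-byte intermediate
	           table, which is why dmi_present() is handed buf + 16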
+
 void __init dmi_scan_machine(void)
 {
        char __iomem *p, *q;
@@ -444,7 +483,7 @@ void __init dmi_scan_machine(void)
                if (p == NULL)
                        goto error;
 
-               rc = dmi_present(p + 0x10); /* offset of _DMI_ string */
+               rc = smbios_present(p);
                dmi_iounmap(p, 32);
                if (!rc) {
                        dmi_available = 1;
@@ -462,7 +501,12 @@ void __init dmi_scan_machine(void)
                        goto error;
 
                for (q = p; q < p + 0x10000; q += 16) {
-                       rc = dmi_present(q);
+                       if (memcmp(q, "_SM_", 4) == 0 && q - p <= 0xFFE0)
+                               rc = smbios_present(q);
+                       else if (memcmp(q, "_DMI_", 5) == 0)
+                               rc = dmi_present(q);
+                       else
+                               continue;
                        if (!rc) {
                                dmi_available = 1;
                                dmi_iounmap(p, 0x10000);
index fbe0842038b59f182f657cf5d55a07c667c1acbd..611c99d49421c9948f61ae6956f40c7cbce4087b 100644
@@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
                goto out_unlock;
        }
 
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = obj->dev->driver->gem_vm_ops;
        vma->vm_private_data = map->handle;
        vma->vm_page_prot =  pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
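
This hunk and the many like it below are one mechanical, tree-wide conversion: VM_RESERVED is being retired, so drivers now request exactly the behaviors they relied on it for. The substitution rule, sketched:

	/* before: implied don't-expand / don't-dump (historically don't-swap) */
	vma->vm_flags |= VM_RESERVED;
	/* after: state the intended behaviors explicitly */
	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;

	/* remap_pfn_range() and io_remap_pfn_range() set VM_IO | VM_DONTEXPAND |
	 * VM_DONTDUMP themselves, so several callers below simply drop the line. */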
index 6fed215023132981c1af48c8fa8521ce59a210d0..19c512d708605504e9bdaaeac4b5c97cd42582ff 100644
@@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma)
 
        vma->vm_ops = &drm_vm_dma_ops;
 
-       vma->vm_flags |= VM_RESERVED;   /* Don't swap */
-       vma->vm_flags |= VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
        drm_vm_open_locked(dev, vma);
        return 0;
@@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma)
        case _DRM_SHM:
                vma->vm_ops = &drm_vm_shm_ops;
                vma->vm_private_data = (void *)map;
-               /* Don't let this area swap.  Change when
-                  DRM_KERNEL advisory is supported. */
-               vma->vm_flags |= VM_RESERVED;
                break;
        case _DRM_SCATTER_GATHER:
                vma->vm_ops = &drm_vm_sg_ops;
                vma->vm_private_data = (void *)map;
-               vma->vm_flags |= VM_RESERVED;
                vma->vm_page_prot = drm_dma_prot(map->type, vma);
                break;
        default:
                return -EINVAL; /* This should never happen. */
        }
-       vma->vm_flags |= VM_RESERVED;   /* Don't swap */
-       vma->vm_flags |= VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
        drm_vm_open_locked(dev, vma);
        return 0;
index a38051c95ec4384176ddc5f8bd8937e4270c3fb6..9787b2de4be34fbffed761702e65672a17aa90e7 100644
@@ -501,7 +501,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp,
 
        DRM_DEBUG_KMS("%s\n", __FILE__);
 
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
        update_vm_cache_attr(exynos_gem_obj, vma);
 
index 884ba73ac6ce0f42dbafd82799110cd2fdfcd15f..afded54dbb10c2b8c3b1e727625a5f637002330e 100644
@@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
         */
        vma->vm_ops = &psbfb_vm_ops;
        vma->vm_private_data = (void *)psbfb;
-       vma->vm_flags |= VM_RESERVED | VM_IO |
-                                       VM_MIXEDMAP | VM_DONTEXPAND;
+       vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 }
 
index e2d30b8c6e1f7689a30de8854924947797ab844f..85f0a93225935a2ca59cf7237518eb69be9c53a7 100644
@@ -2545,7 +2545,8 @@ static void gen6_update_ring_freq(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        int min_freq = 15;
-       int gpu_freq, ia_freq, max_ia_freq;
+       int gpu_freq;
+       unsigned int ia_freq, max_ia_freq;
        int scaling_factor = 180;
 
        WARN_ON(!mutex_is_locked(&dev->struct_mutex));
index a877813571a45f01b2383bc403163ad6e6bfabed..3ba72dbdc4bd68495030c047b4b1ae87c7b9e13a 100644
@@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
         */
 
        vma->vm_private_data = bo;
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
+       vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 out_unref:
        ttm_bo_unref(&bo);
@@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
 
        vma->vm_ops = &ttm_bo_vm_ops;
        vma->vm_private_data = ttm_bo_reference(bo);
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
+       vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
        return 0;
 }
 EXPORT_SYMBOL(ttm_fbdev_mmap);
index b8c00ed3305159053da066dc95a1831cc6427ba9..8f9d0bd0c0d823ff376493d424b49bba65678a2c 100644
@@ -244,7 +244,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
                        size = 0;
        }
 
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
        return 0;
 }
 
index bfa92a3cc2586aa8d4d144fe10cce3581d4dad31..a7edf987a339f7e1708faa290d4a030c5d7de939 100644
@@ -982,7 +982,7 @@ int i2c_add_numbered_adapter(struct i2c_adapter *adap)
 
        if (adap->nr == -1) /* -1 means dynamically assign bus id */
                return i2c_add_adapter(adap);
-       if (adap->nr & ~MAX_ID_MASK)
+       if (adap->nr & ~MAX_IDR_MASK)
                return -EINVAL;
 
 retry:
index d67999f6e34a147a6f99cdcd162fba4ebfdbb82d..394fea2ba1bc2018cac2f047ee7f0802823b0222 100644
@@ -390,7 +390,7 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv)
                ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
                                        next_id, &id);
                if (!ret)
-                       next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+                       next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
                spin_unlock_irqrestore(&cm.lock, flags);
        } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
 
index 45ee89b65c23fbf57350e8a5cb04840180bd2035..1a1d5d99fcf9175bedddb9af689ff245e230131b 100644
@@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
        physical = galpas->user.fw_handle;
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
-       /* VM_IO | VM_RESERVED are set by remap_pfn_range() */
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
        ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
                           vma->vm_page_prot);
        if (unlikely(ret)) {
@@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
        u64 start, ofs;
        struct page *page;
 
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        start = vma->vm_start;
        for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
                u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
index 736d9edbdbe720bb25a2e1ace073984e43373faa..3eb7e454849b4e1e7064dcb5cbdd84a03c18f307 100644
@@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
 
        vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
        vma->vm_ops = &ipath_file_vm_ops;
-       vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        ret = 1;
 
 bail:
index faa44cb08071c7cf3f5413448f09aefb8de1b1c1..959a5c4ff812de7034d27d6a0935c567ade3ebf1 100644
@@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
 
        vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
        vma->vm_ops = &qib_file_vm_ops;
-       vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        ret = 1;
 
 bail:
index 7bc775219f9722fb6e6dd2e27905cf6113cf7dfd..e5a76da860816897ff7740b905f16015d71aadc0 100644
@@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
 
        vma->vm_ops = &meye_vm_ops;
        vma->vm_flags &= ~VM_IO;        /* not I/O memory */
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data = (void *) (offset / gbufsize);
        meye_vm_open(vma);
 
index 409da0f8e5cfdb92fa0347bd3b4a6f4cee56cfa3..636c2b2e3173a0d008df7e7a79cadb4ed3369191 100644
@@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma)
 
        q->bufs[i]->baddr = vma->vm_start;
 
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
        vma->vm_ops = &omap_vout_vm_ops;
        vma->vm_private_data = (void *) vout;
index aae1720b2f2d14a0ceb1ca1080fafdb125d28fc4..cc9110ce636f435c01d7e89f3c731eb91e08647b 100644
@@ -3950,7 +3950,7 @@ found:
 
        fb->map_count = 1;
 
-       vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_flags &= ~VM_IO;
        vma->vm_private_data = fb;
        vma->vm_file = file;
index 19ea780b16ffbc9b9539eb0c0891cb2d6af27ce0..5bfc8e2f018f20fb07389b7fadf8902c7073714e 100644
@@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma)
                return -EINVAL;
        }
 
-       vma->vm_flags |= VM_IO;
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
        pos = cam->frame[i].bufmem;
        while (size > 0) { /* size is page-aligned */
index 8a4317979a43d784972283d4d61f978ff51236c2..9049fa290baf984a10fc639cadb1a6bd68c77569 100644
@@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma)
        }
 
        /* VM_IO is eventually going to replace PageReserved altogether */
-       vma->vm_flags |= VM_IO;
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
+       vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
        pos = usbvision->frame[i].data;
        while (size > 0) {
index f300deafd268e555e54f1d9cdefbc22c97cd75cc..828e7c10bd701cc9b598f26721a413de7d378cde 100644
@@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q,
        map->count    = 1;
        map->q        = q;
        vma->vm_ops   = &videobuf_vm_ops;
-       vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */
        vma->vm_private_data = map;
        dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n",
index df142580e44cfac0b9451411a43bcd461f572de0..2ff7fcc77b1104fe7d1ca1a2a9d5738ede27acb7 100644
@@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q,
        }
 
        vma->vm_ops          = &videobuf_vm_ops;
-       vma->vm_flags       |= VM_DONTEXPAND | VM_RESERVED;
+       vma->vm_flags       |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data = map;
 
        dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n",
index 504cd4cbe29e44f40f822240eb21b03364136f65..051ea3571b208968d6b55ed3f8568038898f1b84 100644
@@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr,
                return ret;
        }
 
-       vma->vm_flags           |= VM_DONTEXPAND | VM_RESERVED;
+       vma->vm_flags           |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data    = priv;
        vma->vm_ops             = vm_ops;
 
index 0c43297ed9ac6262ca2e901703f3e3bb3c10a2cb..8835eabb3b8730d45d4deffb533bb6bf5ff332c4 100644
@@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma)
                return -EINVAL;
        }
 
-       /* IO memory (stop cacheing) */
-       vma->vm_flags |= VM_IO | VM_RESERVED;
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        return io_remap_pfn_range(vma, vma->vm_start, addr, vsize,
index 9d37c576d526674dc12dc5dc478ffc45374c2c36..79349ecf080fb3ca517edd85a90cce4e857a834f 100644
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/pm_runtime.h>
 #include <linux/atomic.h>
+#include <linux/of_device.h>
 #include "lis3lv02d.h"
 
 #define DRIVER_NAME     "lis3lv02d"
@@ -942,6 +943,153 @@ static void lis3lv02d_8b_configure(struct lis3lv02d *lis3,
        }
 }
 
+#ifdef CONFIG_OF
+static int lis3lv02d_init_dt(struct lis3lv02d *lis3)
+{
+       struct lis3lv02d_platform_data *pdata;
+       struct device_node *np = lis3->of_node;
+       u32 val;
+
+       if (!lis3->of_node)
+               return 0;
+
+       pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+       if (!pdata)
+               return -ENOMEM;
+
+       if (of_get_property(np, "st,click-single-x", NULL))
+               pdata->click_flags |= LIS3_CLICK_SINGLE_X;
+       if (of_get_property(np, "st,click-double-x", NULL))
+               pdata->click_flags |= LIS3_CLICK_DOUBLE_X;
+
+       if (of_get_property(np, "st,click-single-y", NULL))
+               pdata->click_flags |= LIS3_CLICK_SINGLE_Y;
+       if (of_get_property(np, "st,click-double-y", NULL))
+               pdata->click_flags |= LIS3_CLICK_DOUBLE_Y;
+
+       if (of_get_property(np, "st,click-single-z", NULL))
+               pdata->click_flags |= LIS3_CLICK_SINGLE_Z;
+       if (of_get_property(np, "st,click-double-z", NULL))
+               pdata->click_flags |= LIS3_CLICK_DOUBLE_Z;
+
+       if (!of_property_read_u32(np, "st,click-threshold-x", &val))
+               pdata->click_thresh_x = val;
+       if (!of_property_read_u32(np, "st,click-threshold-y", &val))
+               pdata->click_thresh_y = val;
+       if (!of_property_read_u32(np, "st,click-threshold-z", &val))
+               pdata->click_thresh_z = val;
+
+       if (!of_property_read_u32(np, "st,click-time-limit", &val))
+               pdata->click_time_limit = val;
+       if (!of_property_read_u32(np, "st,click-latency", &val))
+               pdata->click_latency = val;
+       if (!of_property_read_u32(np, "st,click-window", &val))
+               pdata->click_window = val;
+
+       if (of_get_property(np, "st,irq1-disable", NULL))
+               pdata->irq_cfg |= LIS3_IRQ1_DISABLE;
+       if (of_get_property(np, "st,irq1-ff-wu-1", NULL))
+               pdata->irq_cfg |= LIS3_IRQ1_FF_WU_1;
+       if (of_get_property(np, "st,irq1-ff-wu-2", NULL))
+               pdata->irq_cfg |= LIS3_IRQ1_FF_WU_2;
+       if (of_get_property(np, "st,irq1-data-ready", NULL))
+               pdata->irq_cfg |= LIS3_IRQ1_DATA_READY;
+       if (of_get_property(np, "st,irq1-click", NULL))
+               pdata->irq_cfg |= LIS3_IRQ1_CLICK;
+
+       if (of_get_property(np, "st,irq2-disable", NULL))
+               pdata->irq_cfg |= LIS3_IRQ2_DISABLE;
+       if (of_get_property(np, "st,irq2-ff-wu-1", NULL))
+               pdata->irq_cfg |= LIS3_IRQ2_FF_WU_1;
+       if (of_get_property(np, "st,irq2-ff-wu-2", NULL))
+               pdata->irq_cfg |= LIS3_IRQ2_FF_WU_2;
+       if (of_get_property(np, "st,irq2-data-ready", NULL))
+               pdata->irq_cfg |= LIS3_IRQ2_DATA_READY;
+       if (of_get_property(np, "st,irq2-click", NULL))
+               pdata->irq_cfg |= LIS3_IRQ2_CLICK;
+
+       if (of_get_property(np, "st,irq-open-drain", NULL))
+               pdata->irq_cfg |= LIS3_IRQ_OPEN_DRAIN;
+       if (of_get_property(np, "st,irq-active-low", NULL))
+               pdata->irq_cfg |= LIS3_IRQ_ACTIVE_LOW;
+
+       if (!of_property_read_u32(np, "st,wu-duration-1", &val))
+               pdata->duration1 = val;
+       if (!of_property_read_u32(np, "st,wu-duration-2", &val))
+               pdata->duration2 = val;
+
+       if (of_get_property(np, "st,wakeup-x-lo", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_X_LO;
+       if (of_get_property(np, "st,wakeup-x-hi", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_X_HI;
+       if (of_get_property(np, "st,wakeup-y-lo", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_Y_LO;
+       if (of_get_property(np, "st,wakeup-y-hi", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_Y_HI;
+       if (of_get_property(np, "st,wakeup-z-lo", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_Z_LO;
+       if (of_get_property(np, "st,wakeup-z-hi", NULL))
+               pdata->wakeup_flags |= LIS3_WAKEUP_Z_HI;
+
+       if (!of_property_read_u32(np, "st,highpass-cutoff-hz", &val)) {
+               switch (val) {
+               case 1:
+                       pdata->hipass_ctrl = LIS3_HIPASS_CUTFF_1HZ;
+                       break;
+               case 2:
+                       pdata->hipass_ctrl = LIS3_HIPASS_CUTFF_2HZ;
+                       break;
+               case 4:
+                       pdata->hipass_ctrl = LIS3_HIPASS_CUTFF_4HZ;
+                       break;
+               case 8:
+                       pdata->hipass_ctrl = LIS3_HIPASS_CUTFF_8HZ;
+                       break;
+               }
+       }
+
+       if (of_get_property(np, "st,hipass1-disable", NULL))
+               pdata->hipass_ctrl |= LIS3_HIPASS1_DISABLE;
+       if (of_get_property(np, "st,hipass2-disable", NULL))
+               pdata->hipass_ctrl |= LIS3_HIPASS2_DISABLE;
+
+       if (!of_property_read_u32(np, "st,axis-x", &val))
+               pdata->axis_x = val;
+       if (!of_property_read_u32(np, "st,axis-y", &val))
+               pdata->axis_y = val;
+       if (!of_property_read_u32(np, "st,axis-z", &val))
+               pdata->axis_z = val;
+
+       if (!of_property_read_u32(np, "st,default-rate", &val))
+               pdata->default_rate = val;
+
+       if (!of_property_read_u32(np, "st,min-limit-x", &val))
+               pdata->st_min_limits[0] = val;
+       if (!of_property_read_u32(np, "st,min-limit-y", &val))
+               pdata->st_min_limits[1] = val;
+       if (!of_property_read_u32(np, "st,min-limit-z", &val))
+               pdata->st_min_limits[2] = val;
+
+       if (!of_property_read_u32(np, "st,max-limit-x", &val))
+               pdata->st_max_limits[0] = val;
+       if (!of_property_read_u32(np, "st,max-limit-y", &val))
+               pdata->st_max_limits[1] = val;
+       if (!of_property_read_u32(np, "st,max-limit-z", &val))
+               pdata->st_max_limits[2] = val;
+
+       lis3->pdata = pdata;
+
+       return 0;
+}
+
+#else
+static int lis3lv02d_init_dt(struct lis3lv02d *lis3)
+{
+       return 0;
+}
+#endif
+
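Note the two OF accessors used in the parser above: of_get_property() returns a pointer to the raw property (its third argument, when non-NULL, receives the property length in bytes), so it is only suited to presence tests of boolean properties, while integer values go through of_property_read_u32(), which returns 0 on success. A minimal sketch of both idioms, using properties from this binding:

        u32 val;

        /* boolean property: mere presence is the value */
        if (of_get_property(np, "st,irq-open-drain", NULL))
                pdata->irq_cfg |= LIS3_IRQ_OPEN_DRAIN;

        /* u32 property: read the value out */
        if (!of_property_read_u32(np, "st,wu-duration-1", &val))
                pdata->duration1 = val;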
 /*
  * Initialise the accelerometer and the various subsystems.
  * Should be rather independent of the bus system.
@@ -952,6 +1100,10 @@ int lis3lv02d_init_device(struct lis3lv02d *lis3)
        irq_handler_t thread_fn;
        int irq_flags = 0;
 
+       err = lis3lv02d_init_dt(lis3);
+       if (err < 0)
+               return err;
+
        lis3->whoami = lis3lv02d_read_8(lis3, WHO_AM_I);
 
        switch (lis3->whoami) {
index c1a545e136a05bada02a9df4b79276ed78527c8c..4cf0779d2a0fe6de0a8cbec7bde471f61cc233ec 100644 (file)
@@ -314,6 +314,10 @@ struct lis3lv02d {
 
        struct lis3lv02d_platform_data *pdata;  /* for passing board config */
        struct mutex            mutex;     /* Serialize poll and selftest */
+
+#ifdef CONFIG_OF
+       struct device_node      *of_node;
+#endif
 };
 
 int lis3lv02d_init_device(struct lis3lv02d *lis3);
index 80880e984b4fea1c6b8ed23ef69fb29aeb3200cc..23f398610a3990e7d69e10291244a012de8574e6 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/workqueue.h>
 #include <linux/spi/spi.h>
 #include <linux/pm.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
 
 #include "lis3lv02d.h"
 
@@ -58,6 +60,12 @@ static int lis3_spi_init(struct lis3lv02d *lis3)
 static union axis_conversion lis3lv02d_axis_normal =
        { .as_array = { 1, 2, 3 } };
 
+static const struct of_device_id lis302dl_spi_dt_ids[] = {
+       { .compatible = "st,lis302dl-spi" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, lis302dl_spi_dt_ids);
+
 static int __devinit lis302dl_spi_probe(struct spi_device *spi)
 {
        int ret;
@@ -75,6 +83,12 @@ static int __devinit lis302dl_spi_probe(struct spi_device *spi)
        lis3_dev.irq            = spi->irq;
        lis3_dev.ac             = lis3lv02d_axis_normal;
        lis3_dev.pdata          = spi->dev.platform_data;
+
+#ifdef CONFIG_OF
+       if (of_match_device(lis302dl_spi_dt_ids, &spi->dev))
+               lis3_dev.of_node = spi->dev.of_node;
+#endif
+
        spi_set_drvdata(spi, &lis3_dev);
 
        return lis3lv02d_init_device(&lis3_dev);
@@ -121,6 +135,7 @@ static struct spi_driver lis302dl_spi_driver = {
                .name   = DRV_NAME,
                .owner  = THIS_MODULE,
                .pm     = &lis3lv02d_spi_pm,
+               .of_match_table = of_match_ptr(lis302dl_spi_dt_ids),
        },
        .probe  = lis302dl_spi_probe,
        .remove = __devexit_p(lis302dl_spi_remove),
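of_match_ptr() lets the driver reference the match table without an #ifdef around the .of_match_table assignment: it evaluates to its argument when CONFIG_OF is set and to NULL otherwise. A simplified sketch of the macro (paraphrased from <linux/of.h>):

        #ifdef CONFIG_OF
        #define of_match_ptr(ptr)       (ptr)
        #else
        #define of_match_ptr(ptr)       NULL
        #endif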
index ecafa4ba238b3ca4991a20dc5e6bb88458aee8de..492c8cac69acfbd78d6ee33170b58617d8891bc3 100644 (file)
@@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma)
                                vma->vm_end & (GRU_GSEG_PAGESIZE - 1))
                return -EINVAL;
 
-       vma->vm_flags |=
-           (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP |
-                       VM_RESERVED);
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED |
+                        VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_page_prot = PAGE_SHARED;
        vma->vm_ops = &gru_vm_ops;
 
index f2f482bec5736b21a562da5e4fda11375e8cf457..c4e01c5480a5f15b6be84533e638d28abd224956 100644 (file)
@@ -1146,7 +1146,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma)
 
                off += start;
                vma->vm_pgoff = off >> PAGE_SHIFT;
-               vma->vm_flags |= VM_IO | VM_RESERVED;
+               vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 
 #ifdef pgprot_noncached
                if (file->f_flags & O_DSYNC || off >= __pa(high_memory))
index ec794a72975dd886205f2460e2eaba08089a8daf..374c46dff7dd65d3aea7eef69b22a332d727ad0f 100644 (file)
@@ -1077,8 +1077,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying, swapping, writing back
- * or performing I/O.
+ * ask the memory allocator to avoid re-trying.
  *
  * Note, this function also makes sure that the allocated buffer is aligned to
  * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1092,8 +1091,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-       gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-                      __GFP_NORETRY | __GFP_NO_KSWAPD;
+       gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
        size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
        void *kbuf;
 
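The doc comment above describes a back-off allocation; a simplified sketch of the loop it implies (based on the comment, since the full function body is not shown in this hunk):

        while (*size > min_alloc) {
                kbuf = kmalloc(*size, flags);   /* fails fast: no retry */
                if (kbuf)
                        return kbuf;
                *size >>= 1;                    /* halve the request */
                *size = ALIGN(*size, mtd->writesize);
        }
        return kmalloc(*size, GFP_KERNEL);      /* final attempt, may block */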
index f34b5b29fb955cb3129dd38f15296583ce64ff5e..d93b2b6b1f7a2b3598bb8fcc3bb9ba4d2231e626 100644 (file)
@@ -216,7 +216,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 }
 
 
-/* Look up the dcookie for the task's first VM_EXECUTABLE mapping,
+/* Look up the dcookie for the task's mm->exe_file,
  * which corresponds loosely to "application name". This is
  * not strictly necessary but allows oprofile to associate
  * shared-library samples with particular applications
@@ -224,21 +224,10 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 static unsigned long get_exec_dcookie(struct mm_struct *mm)
 {
        unsigned long cookie = NO_COOKIE;
-       struct vm_area_struct *vma;
-
-       if (!mm)
-               goto out;
 
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (!vma->vm_file)
-                       continue;
-               if (!(vma->vm_flags & VM_EXECUTABLE))
-                       continue;
-               cookie = fast_get_dcookie(&vma->vm_file->f_path);
-               break;
-       }
+       if (mm && mm->exe_file)
+               cookie = fast_get_dcookie(&mm->exe_file->f_path);
 
-out:
        return cookie;
 }
 
index 673c14ea11e3570f8bcf7ed872d46412b496d9ac..5292db69c426cb31df6e19004edaaf282ee0bc49 100644 (file)
@@ -484,7 +484,7 @@ static int socket_early_resume(struct pcmcia_socket *skt)
 
 static int socket_late_resume(struct pcmcia_socket *skt)
 {
-       int ret;
+       int ret = 0;
 
        mutex_lock(&skt->ops_mutex);
        skt->state &= ~SOCKET_SUSPEND;
@@ -511,19 +511,31 @@ static int socket_late_resume(struct pcmcia_socket *skt)
                return socket_insert(skt);
        }
 
+       if (!(skt->state & SOCKET_CARDBUS) && (skt->callback))
+               ret = skt->callback->early_resume(skt);
+       return ret;
+}
+
+/*
+ * Finalize the resume. For a CardBus socket we have to rebind
+ * the devices, as we cannot be certain that the card now present
+ * is the same one that was there before the suspend.
+ */
+static int socket_complete_resume(struct pcmcia_socket *skt)
+{
+       int ret = 0;
 #ifdef CONFIG_CARDBUS
        if (skt->state & SOCKET_CARDBUS) {
                /* We can't be sure the CardBus card is the same
                 * as the one previously inserted. Therefore, remove
                 * and re-add... */
                cb_free(skt);
-               cb_alloc(skt);
-               return 0;
+               ret = cb_alloc(skt);
+               if (ret)
+                       cb_free(skt);
        }
 #endif
-       if (!(skt->state & SOCKET_CARDBUS) && (skt->callback))
-               skt->callback->early_resume(skt);
-       return 0;
+       return ret;
 }
 
 /*
@@ -533,11 +545,15 @@ static int socket_late_resume(struct pcmcia_socket *skt)
  */
 static int socket_resume(struct pcmcia_socket *skt)
 {
+       int err;
        if (!(skt->state & SOCKET_SUSPEND))
                return -EBUSY;
 
        socket_early_resume(skt);
-       return socket_late_resume(skt);
+       err = socket_late_resume(skt);
+       if (!err)
+               err = socket_complete_resume(skt);
+       return err;
 }
 
 static void socket_remove(struct pcmcia_socket *skt)
@@ -848,6 +864,12 @@ static int __used pcmcia_socket_dev_resume(struct device *dev)
        return __pcmcia_pm_op(dev, socket_late_resume);
 }
 
+static void __used pcmcia_socket_dev_complete(struct device *dev)
+{
+       WARN(__pcmcia_pm_op(dev, socket_complete_resume),
+               "failed to complete resume");
+}
+
 static const struct dev_pm_ops pcmcia_socket_pm_ops = {
        /* dev_resume may be called with IRQs enabled */
        SET_SYSTEM_SLEEP_PM_OPS(NULL,
@@ -862,6 +884,7 @@ static const struct dev_pm_ops pcmcia_socket_pm_ops = {
        .resume_noirq = pcmcia_socket_dev_resume_noirq,
        .thaw_noirq = pcmcia_socket_dev_resume_noirq,
        .restore_noirq = pcmcia_socket_dev_resume_noirq,
+       .complete = pcmcia_socket_dev_complete,
 };
 
 #define PCMCIA_SOCKET_CLASS_PM_OPS (&pcmcia_socket_pm_ops)
index e771487132f7542123a0e8a669cd8582481ab051..2420d5af05839a8403bf6b0bd00fe07ad9c5ae42 100644 (file)
@@ -306,7 +306,7 @@ int pps_register_cdev(struct pps_device *pps)
        if (err < 0)
                return err;
 
-       pps->id &= MAX_ID_MASK;
+       pps->id &= MAX_IDR_MASK;
        if (pps->id >= PPS_MAX_SOURCES) {
                pr_err("%s: too many PPS sources in the system\n",
                                        pps->info.name);
index d5e1625bbac2432ff7965a2b70ab438b98aa4959..19743597cd9540fe03f03fe97a0ddc70adf1c8be 100644 (file)
@@ -2165,7 +2165,8 @@ static int __devinit tsi721_setup_mport(struct tsi721_device *priv)
        rio_init_dbell_res(&mport->riores[RIO_DOORBELL_RESOURCE], 0, 0xffff);
        rio_init_mbox_res(&mport->riores[RIO_INB_MBOX_RESOURCE], 0, 3);
        rio_init_mbox_res(&mport->riores[RIO_OUTB_MBOX_RESOURCE], 0, 3);
-       strcpy(mport->name, "Tsi721 mport");
+       snprintf(mport->name, RIO_MAX_MPORT_NAME, "%s(%s)",
+                dev_driver_string(&pdev->dev), dev_name(&pdev->dev));
 
        /* Hook up interrupt handler */
 
index fabc99a75c6596d2b797e55a59fa7ecd2db49fc8..94764560d1102fc9aa394b6191ad86c24cbd6e1a 100644 (file)
@@ -19,7 +19,6 @@ if RTC_CLASS
 
 config RTC_HCTOSYS
        bool "Set system time from RTC on startup and resume"
-       depends on RTC_CLASS = y
        default y
        help
          If you say yes here, the system time (wall clock) will be set using
@@ -51,7 +50,6 @@ config RTC_HCTOSYS_DEVICE
 
 config RTC_DEBUG
        bool "RTC debug support"
-       depends on RTC_CLASS = y
        help
          Say yes here to enable debugging support in the RTC framework
          and individual RTC drivers.
@@ -61,7 +59,6 @@ comment "RTC interfaces"
 config RTC_INTF_SYSFS
        boolean "/sys/class/rtc/rtcN (sysfs)"
        depends on SYSFS
-       default RTC_CLASS
        help
          Say yes here if you want to use your RTCs using sysfs interfaces,
          /sys/class/rtc/rtc0 through /sys/.../rtcN.
@@ -69,19 +66,19 @@ config RTC_INTF_SYSFS
          If unsure, say Y.
 
 config RTC_INTF_PROC
-       boolean "/proc/driver/rtc (procfs for rtc0)"
+       boolean "/proc/driver/rtc (procfs for rtcN)"
        depends on PROC_FS
-       default RTC_CLASS
        help
-         Say yes here if you want to use your first RTC through the proc
-         interface, /proc/driver/rtc. Other RTCs will not be available
-         through that API.
+         Say yes here if you want to use your system clock RTC through
+         the proc interface, /proc/driver/rtc.
+         Other RTCs will not be available through that API.
+         If there is no RTC for the system clock, the first RTC (rtc0)
+         is used by default.
 
          If unsure, say Y.
 
 config RTC_INTF_DEV
        boolean "/dev/rtcN (character devices)"
-       default RTC_CLASS
        help
          Say yes here if you want to use your RTCs using the /dev
          interfaces, which "udev" sets up as /dev/rtc0 through
@@ -127,7 +124,7 @@ if I2C
 
 config RTC_DRV_88PM860X
        tristate "Marvell 88PM860x"
-       depends on RTC_CLASS && I2C && MFD_88PM860X
+       depends on I2C && MFD_88PM860X
        help
          If you say yes here you get support for RTC function in Marvell
          88PM860x chips.
@@ -137,7 +134,7 @@ config RTC_DRV_88PM860X
 
 config RTC_DRV_88PM80X
        tristate "Marvell 88PM80x"
-       depends on RTC_CLASS && I2C && MFD_88PM800
+       depends on I2C && MFD_88PM800
        help
          If you say yes here you get support for RTC function in Marvell
          88PM80x chips.
@@ -165,7 +162,7 @@ config RTC_DRV_DS1307
 
 config RTC_DRV_DS1374
        tristate "Dallas/Maxim DS1374"
-       depends on RTC_CLASS && I2C
+       depends on I2C
        help
          If you say yes here you get support for Dallas Semiconductor
          DS1374 real-time clock chips. If an interrupt is associated
@@ -185,7 +182,7 @@ config RTC_DRV_DS1672
 
 config RTC_DRV_DS3232
        tristate "Dallas/Maxim DS3232"
-       depends on RTC_CLASS && I2C
+       depends on I2C
        help
          If you say yes here you get support for Dallas Semiconductor
          DS3232 real-time clock chips. If an interrupt is associated
@@ -203,6 +200,16 @@ config RTC_DRV_MAX6900
          This driver can also be built as a module. If so, the module
          will be called rtc-max6900.
 
+config RTC_DRV_MAX8907
+       tristate "Maxim MAX8907"
+       depends on MFD_MAX8907
+       help
+         If you say yes here you will get support for the
+         RTC of Maxim MAX8907 PMIC.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-max8907.
+
 config RTC_DRV_MAX8925
        tristate "Maxim MAX8925"
        depends on MFD_MAX8925
@@ -325,7 +332,7 @@ config RTC_DRV_TWL92330
 
 config RTC_DRV_TWL4030
        tristate "TI TWL4030/TWL5030/TWL6030/TPS659x0"
-       depends on RTC_CLASS && TWL4030_CORE
+       depends on TWL4030_CORE
        help
          If you say yes here you get support for the RTC on the
          TWL4030/TWL5030/TWL6030 family chips, used mostly with OMAP3 platforms.
@@ -333,6 +340,26 @@ config RTC_DRV_TWL4030
          This driver can also be built as a module. If so, the module
          will be called rtc-twl.
 
+config RTC_DRV_TPS65910
+       tristate "TI TPS65910 RTC driver"
+       depends on RTC_CLASS && MFD_TPS65910
+       help
+         If you say yes here you get support for the RTC on the
+         TPS65910 chips.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-tps65910.
+
+config RTC_DRV_RC5T583
+       tristate "RICOH 5T583 RTC driver"
+       depends on MFD_RC5T583
+       help
+         If you say yes here you get support for the RTC on the
+         RICOH 5T583 chips.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-rc5t583.
+
 config RTC_DRV_S35390A
        tristate "Seiko Instruments S-35390A"
        select BITREVERSE
@@ -538,7 +565,6 @@ config RTC_DRV_DS1302
 
 config RTC_DRV_DS1511
        tristate "Dallas DS1511"
-       depends on RTC_CLASS
        help
          If you say yes here you get support for the
          Dallas DS1511 timekeeping/watchdog chip.
@@ -583,7 +609,6 @@ config RTC_DRV_EFI
 
 config RTC_DRV_STK17TA8
        tristate "Simtek STK17TA8"
-       depends on RTC_CLASS
        help
          If you say yes here you get support for the
          Simtek STK17TA8 timekeeping chip.
@@ -658,6 +683,15 @@ config RTC_DRV_V3020
          This driver can also be built as a module. If so, the module
          will be called rtc-v3020.
 
+config RTC_DRV_DS2404
+       tristate "Dallas DS2404"
+       help
+         If you say yes here you get support for the
+         Dallas DS2404 RTC chip.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-ds2404.
+
 config RTC_DRV_WM831X
        tristate "Wolfson Microelectronics WM831x RTC"
        depends on MFD_WM831X
@@ -711,7 +745,7 @@ config RTC_DRV_AB8500
 
 config RTC_DRV_NUC900
        tristate "NUC910/NUC920 RTC driver"
-       depends on RTC_CLASS && ARCH_W90X900
+       depends on ARCH_W90X900
        help
          If you say yes here you get support for the RTC subsystem of the
          NUC910/NUC920 used in embedded systems.
@@ -731,7 +765,6 @@ config RTC_DRV_DAVINCI
 config RTC_DRV_IMXDI
        tristate "Freescale IMX DryIce Real Time Clock"
        depends on SOC_IMX25
-       depends on RTC_CLASS
        help
           Support for Freescale IMX DryIce RTC
 
@@ -791,7 +824,7 @@ config RTC_DRV_SA1100
 
 config RTC_DRV_SH
        tristate "SuperH On-Chip RTC"
-       depends on RTC_CLASS && SUPERH && HAVE_CLK
+       depends on SUPERH && HAVE_CLK
        help
          Say Y here to enable support for the on-chip RTC found in
          most SuperH processors.
@@ -1023,7 +1056,6 @@ config RTC_DRV_MPC5121
 
 config RTC_DRV_JZ4740
        tristate "Ingenic JZ4740 SoC"
-       depends on RTC_CLASS
        depends on MACH_JZ4740
        help
          If you say yes here you get support for the Ingenic JZ4740 SoC RTC
@@ -1053,7 +1085,7 @@ config RTC_DRV_PM8XXX
 
 config RTC_DRV_TEGRA
        tristate "NVIDIA Tegra Internal RTC driver"
-       depends on RTC_CLASS && ARCH_TEGRA
+       depends on ARCH_TEGRA
        help
          If you say yes here you get support for the
          Tegra 200 series internal RTC module.
@@ -1090,7 +1122,6 @@ config RTC_DRV_LOONGSON1
 config RTC_DRV_MXC
        tristate "Freescale MXC Real Time Clock"
        depends on ARCH_MXC
-       depends on RTC_CLASS
        help
           If you say yes here you get support for the Freescale MXC
           RTC module.
@@ -1098,4 +1129,15 @@ config RTC_DRV_MXC
           This driver can also be built as a module, if so, the module
           will be called "rtc-mxc".
 
+config RTC_DRV_SNVS
+       tristate "Freescale SNVS RTC support"
+       depends on HAS_IOMEM
+       depends on OF
+       help
+          If you say yes here you get support for the Freescale SNVS
+          Low Power (LP) RTC module.
+
+          This driver can also be built as a module, if so, the module
+          will be called "rtc-snvs".
+
 endif # RTC_CLASS
index 0d5b2b66f90d4f47f97b3cf88fec79ebb2ff4028..56297f0fd3884fde9d85cd305cd4b16b439053d2 100644 (file)
@@ -43,6 +43,7 @@ obj-$(CONFIG_RTC_DRV_DS1511)  += rtc-ds1511.o
 obj-$(CONFIG_RTC_DRV_DS1553)   += rtc-ds1553.o
 obj-$(CONFIG_RTC_DRV_DS1672)   += rtc-ds1672.o
 obj-$(CONFIG_RTC_DRV_DS1742)   += rtc-ds1742.o
+obj-$(CONFIG_RTC_DRV_DS2404)   += rtc-ds2404.o
 obj-$(CONFIG_RTC_DRV_DS3232)   += rtc-ds3232.o
 obj-$(CONFIG_RTC_DRV_DS3234)   += rtc-ds3234.o
 obj-$(CONFIG_RTC_DRV_EFI)      += rtc-efi.o
@@ -64,6 +65,7 @@ obj-$(CONFIG_RTC_DRV_M48T59)  += rtc-m48t59.o
 obj-$(CONFIG_RTC_DRV_M48T86)   += rtc-m48t86.o
 obj-$(CONFIG_RTC_DRV_MXC)      += rtc-mxc.o
 obj-$(CONFIG_RTC_DRV_MAX6900)  += rtc-max6900.o
+obj-$(CONFIG_RTC_DRV_MAX8907)  += rtc-max8907.o
 obj-$(CONFIG_RTC_DRV_MAX8925)  += rtc-max8925.o
 obj-$(CONFIG_RTC_DRV_MAX8998)  += rtc-max8998.o
 obj-$(CONFIG_RTC_DRV_MAX6902)  += rtc-max6902.o
@@ -85,6 +87,7 @@ obj-$(CONFIG_RTC_DRV_PS3)     += rtc-ps3.o
 obj-$(CONFIG_RTC_DRV_PUV3)     += rtc-puv3.o
 obj-$(CONFIG_RTC_DRV_PXA)      += rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R9701)    += rtc-r9701.o
+obj-$(CONFIG_RTC_DRV_RC5T583)  += rtc-rc5t583.o
 obj-$(CONFIG_RTC_DRV_RP5C01)   += rtc-rp5c01.o
 obj-$(CONFIG_RTC_DRV_RS5C313)  += rtc-rs5c313.o
 obj-$(CONFIG_RTC_DRV_RS5C348)  += rtc-rs5c348.o
@@ -96,6 +99,7 @@ obj-$(CONFIG_RTC_DRV_S35390A) += rtc-s35390a.o
 obj-$(CONFIG_RTC_DRV_S3C)      += rtc-s3c.o
 obj-$(CONFIG_RTC_DRV_SA1100)   += rtc-sa1100.o
 obj-$(CONFIG_RTC_DRV_SH)       += rtc-sh.o
+obj-$(CONFIG_RTC_DRV_SNVS)     += rtc-snvs.o
 obj-$(CONFIG_RTC_DRV_SPEAR)    += rtc-spear.o
 obj-$(CONFIG_RTC_DRV_STARFIRE) += rtc-starfire.o
 obj-$(CONFIG_RTC_DRV_STK17TA8) += rtc-stk17ta8.o
@@ -105,6 +109,7 @@ obj-$(CONFIG_RTC_DRV_TEGRA) += rtc-tegra.o
 obj-$(CONFIG_RTC_DRV_TEST)     += rtc-test.o
 obj-$(CONFIG_RTC_DRV_TILE)     += rtc-tile.o
 obj-$(CONFIG_RTC_DRV_TWL4030)  += rtc-twl.o
+obj-$(CONFIG_RTC_DRV_TPS65910) += rtc-tps65910.o
 obj-$(CONFIG_RTC_DRV_TX4939)   += rtc-tx4939.o
 obj-$(CONFIG_RTC_DRV_V3020)    += rtc-v3020.o
 obj-$(CONFIG_RTC_DRV_VR41XX)   += rtc-vr41xx.o
index dc4c2748bbc38bfac593cc47a2ff7bac6a34c8fe..f8a0aab218cbcd2777ea93b9e74aa96494e729fe 100644 (file)
@@ -31,8 +31,12 @@ static void rtc_device_release(struct device *dev)
        kfree(rtc);
 }
 
-#if defined(CONFIG_PM) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+#ifdef CONFIG_RTC_HCTOSYS_DEVICE
+/* Result of the last attempt to set the system clock from the RTC. */
+int rtc_hctosys_ret = -ENODEV;
+#endif
 
+#if defined(CONFIG_PM) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
 /*
  * On suspend(), measure the delta between one RTC and the
  * system's wall clock; restore it on resume().
@@ -84,6 +88,7 @@ static int rtc_resume(struct device *dev)
        struct timespec         new_system, new_rtc;
        struct timespec         sleep_time;
 
+       rtc_hctosys_ret = -ENODEV;
        if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
                return 0;
 
@@ -117,6 +122,7 @@ static int rtc_resume(struct device *dev)
 
        if (sleep_time.tv_sec >= 0)
                timekeeping_inject_sleeptime(&sleep_time);
+       rtc_hctosys_ret = 0;
        return 0;
 }
 
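For context, rtc_resume() computes the time spent asleep by differencing two clocks and hands it to the timekeeping core; a simplified sketch using the variable names visible in this hunk:

        /* time asleep = RTC delta minus system-clock delta */
        sleep_time = timespec_sub(timespec_sub(new_rtc, old_rtc),
                                  timespec_sub(new_system, old_system));
        if (sleep_time.tv_sec >= 0)
                timekeeping_inject_sleeptime(&sleep_time);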
@@ -238,6 +244,7 @@ void rtc_device_unregister(struct rtc_device *rtc)
                rtc_proc_del_device(rtc);
                device_unregister(&rtc->dev);
                rtc->ops = NULL;
+               ida_simple_remove(&rtc_ida, rtc->id);
                mutex_unlock(&rtc->ops_lock);
                put_device(&rtc->dev);
        }
index bc90b091f1954faecf087f2094c6094e31606da2..4aa60d74004e41ffdd7be050728f9cf63a7fa48c 100644 (file)
@@ -22,8 +22,6 @@
  * the best guess is to add 0.5s.
  */
 
-int rtc_hctosys_ret = -ENODEV;
-
 static int __init rtc_hctosys(void)
 {
        int err = -ENODEV;
@@ -56,7 +54,7 @@ static int __init rtc_hctosys(void)
 
        rtc_tm_to_time(&tm, &tv.tv_sec);
 
-       do_settimeofday(&tv);
+       err = do_settimeofday(&tv);
 
        dev_info(rtc->dev.parent,
                "setting system clock to "
index 1dd61f402b040441c99078214017f802050c8fa5..2dfe7a2fb99800d063c8010c57ce3ecaf464b800 100644 (file)
@@ -473,18 +473,7 @@ static struct platform_driver at91_rtc_driver = {
        },
 };
 
-static int __init at91_rtc_init(void)
-{
-       return platform_driver_register(&at91_rtc_driver);
-}
-module_init(at91_rtc_init);
-
-static void __exit at91_rtc_exit(void)
-{
-       platform_driver_unregister(&at91_rtc_driver);
-}
-module_exit(at91_rtc_exit);
-
+module_platform_driver(at91_rtc_driver);
 
 MODULE_AUTHOR("Michel Benoit");
 MODULE_DESCRIPTION("RTC driver for Atmel AT91SAM9x");
index 76b2156d3c62252c80aec1eacce8fe0684950d75..c8115b83e5ab513d2360adbdb4911352c3cd3187 100644 (file)
@@ -276,8 +276,7 @@ static void coh901331_shutdown(struct platform_device *pdev)
 
        clk_enable(rtap->clk);
        writel(0, rtap->virtbase + COH901331_IRQ_MASK);
-       clk_disable(rtap->clk);
-       clk_unprepare(rtap->clk);
+       clk_disable_unprepare(rtap->clk);
 }
 
 static struct platform_driver coh901331_driver = {
diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c
new file mode 100644 (file)
index 0000000..5ea9df7
--- /dev/null
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rtc.h>
+#include <linux/types.h>
+#include <linux/bcd.h>
+#include <linux/rtc-ds2404.h>
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/slab.h>
+
+#include <linux/io.h>
+
+#define DS2404_STATUS_REG 0x200
+#define DS2404_CONTROL_REG 0x201
+#define DS2404_RTC_REG 0x202
+
+#define DS2404_WRITE_SCRATCHPAD_CMD 0x0f
+#define DS2404_READ_SCRATCHPAD_CMD 0xaa
+#define DS2404_COPY_SCRATCHPAD_CMD 0x55
+#define DS2404_READ_MEMORY_CMD 0xf0
+
+struct ds2404;
+
+struct ds2404_chip_ops {
+       int (*map_io)(struct ds2404 *chip, struct platform_device *pdev,
+                     struct ds2404_platform_data *pdata);
+       void (*unmap_io)(struct ds2404 *chip);
+};
+
+#define DS2404_RST     0
+#define DS2404_CLK     1
+#define DS2404_DQ      2
+
+struct ds2404_gpio {
+       const char *name;
+       unsigned int gpio;
+};
+
+struct ds2404 {
+       struct ds2404_gpio *gpio;
+       struct ds2404_chip_ops *ops;
+       struct rtc_device *rtc;
+};
+
+static struct ds2404_gpio ds2404_gpio[] = {
+       { "RTC RST", 0 },
+       { "RTC CLK", 0 },
+       { "RTC DQ", 0 },
+};
+
+static int ds2404_gpio_map(struct ds2404 *chip, struct platform_device *pdev,
+                         struct ds2404_platform_data *pdata)
+{
+       int i, err;
+
+       ds2404_gpio[DS2404_RST].gpio = pdata->gpio_rst;
+       ds2404_gpio[DS2404_CLK].gpio = pdata->gpio_clk;
+       ds2404_gpio[DS2404_DQ].gpio = pdata->gpio_dq;
+
+       for (i = 0; i < ARRAY_SIZE(ds2404_gpio); i++) {
+               err = gpio_request(ds2404_gpio[i].gpio, ds2404_gpio[i].name);
+               if (err) {
+                       dev_err(&pdev->dev, "error mapping gpio %s: %d\n",
+                               ds2404_gpio[i].name, err);
+                       goto err_request;
+               }
+               if (i != DS2404_DQ)
+                       gpio_direction_output(ds2404_gpio[i].gpio, 1);
+       }
+
+       chip->gpio = ds2404_gpio;
+       return 0;
+
+err_request:
+       while (--i >= 0)
+               gpio_free(ds2404_gpio[i].gpio);
+       return err;
+}
+
+static void ds2404_gpio_unmap(struct ds2404 *chip)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(ds2404_gpio); i++)
+               gpio_free(ds2404_gpio[i].gpio);
+}
+
+static struct ds2404_chip_ops ds2404_gpio_ops = {
+       .map_io         = ds2404_gpio_map,
+       .unmap_io       = ds2404_gpio_unmap,
+};
+
+static void ds2404_reset(struct device *dev)
+{
+       gpio_set_value(ds2404_gpio[DS2404_RST].gpio, 0);
+       udelay(1000);
+       gpio_set_value(ds2404_gpio[DS2404_RST].gpio, 1);
+       gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
+       gpio_direction_output(ds2404_gpio[DS2404_DQ].gpio, 0);
+       udelay(10);
+}
+
+static void ds2404_write_byte(struct device *dev, u8 byte)
+{
+       int i;
+
+       gpio_direction_output(ds2404_gpio[DS2404_DQ].gpio, 1);
+       for (i = 0; i < 8; i++) {
+               gpio_set_value(ds2404_gpio[DS2404_DQ].gpio, byte & (1 << i));
+               udelay(10);
+               gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 1);
+               udelay(10);
+               gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
+               udelay(10);
+       }
+}
+
+static u8 ds2404_read_byte(struct device *dev)
+{
+       int i;
+       u8 ret = 0;
+
+       gpio_direction_input(ds2404_gpio[DS2404_DQ].gpio);
+
+       for (i = 0; i < 8; i++) {
+               gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 0);
+               udelay(10);
+               if (gpio_get_value(ds2404_gpio[DS2404_DQ].gpio))
+                       ret |= 1 << i;
+               gpio_set_value(ds2404_gpio[DS2404_CLK].gpio, 1);
+               udelay(10);
+       }
+       return ret;
+}
+
+static void ds2404_read_memory(struct device *dev, u16 offset,
+                              int length, u8 *out)
+{
+       ds2404_reset(dev);
+       ds2404_write_byte(dev, DS2404_READ_MEMORY_CMD);
+       ds2404_write_byte(dev, offset & 0xff);
+       ds2404_write_byte(dev, (offset >> 8) & 0xff);
+       while (length--)
+               *out++ = ds2404_read_byte(dev);
+}
+
+static void ds2404_write_memory(struct device *dev, u16 offset,
+                               int length, u8 *out)
+{
+       int i;
+       u8 ta01, ta02, es;
+
+       ds2404_reset(dev);
+       ds2404_write_byte(dev, DS2404_WRITE_SCRATCHPAD_CMD);
+       ds2404_write_byte(dev, offset & 0xff);
+       ds2404_write_byte(dev, (offset >> 8) & 0xff);
+
+       for (i = 0; i < length; i++)
+               ds2404_write_byte(dev, out[i]);
+
+       ds2404_reset(dev);
+       ds2404_write_byte(dev, DS2404_READ_SCRATCHPAD_CMD);
+
+       ta01 = ds2404_read_byte(dev);
+       ta02 = ds2404_read_byte(dev);
+       es = ds2404_read_byte(dev);
+
+       for (i = 0; i < length; i++) {
+               if (out[i] != ds2404_read_byte(dev)) {
+                       dev_err(dev, "read invalid data\n");
+                       return;
+               }
+       }
+
+       ds2404_reset(dev);
+       ds2404_write_byte(dev, DS2404_COPY_SCRATCHPAD_CMD);
+       ds2404_write_byte(dev, ta01);
+       ds2404_write_byte(dev, ta02);
+       ds2404_write_byte(dev, es);
+
+       gpio_direction_input(ds2404_gpio[DS2404_DQ].gpio);
+       while (gpio_get_value(ds2404_gpio[DS2404_DQ].gpio))
+               ;
+               cpu_relax();
+
+static void ds2404_enable_osc(struct device *dev)
+{
+       u8 in[1] = { 0x10 }; /* enable oscillator */
+       ds2404_write_memory(dev, 0x201, 1, in);
+}
+
+static int ds2404_read_time(struct device *dev, struct rtc_time *dt)
+{
+       __le32 time = 0;
+
+       ds2404_read_memory(dev, 0x203, 4, (u8 *)&time);
+
+       rtc_time_to_tm(le32_to_cpu(time), dt);
+       return rtc_valid_tm(dt);
+}
+
+static int ds2404_set_mmss(struct device *dev, unsigned long secs)
+{
+       __le32 time = cpu_to_le32(secs);
+       ds2404_write_memory(dev, 0x203, 4, (u8 *)&time);
+       return 0;
+}
+
+static const struct rtc_class_ops ds2404_rtc_ops = {
+       .read_time      = ds2404_read_time,
+       .set_mmss       = ds2404_set_mmss,
+};
+
+static int rtc_probe(struct platform_device *pdev)
+{
+       struct ds2404_platform_data *pdata = pdev->dev.platform_data;
+       struct ds2404 *chip;
+       int retval = -EBUSY;
+
+       chip = kzalloc(sizeof(struct ds2404), GFP_KERNEL);
+       if (!chip)
+               return -ENOMEM;
+
+       chip->ops = &ds2404_gpio_ops;
+
+       retval = chip->ops->map_io(chip, pdev, pdata);
+       if (retval)
+               goto err_chip;
+
+       dev_info(&pdev->dev, "using GPIOs RST:%d, CLK:%d, DQ:%d\n",
+                chip->gpio[DS2404_RST].gpio, chip->gpio[DS2404_CLK].gpio,
+                chip->gpio[DS2404_DQ].gpio);
+
+       platform_set_drvdata(pdev, chip);
+
+       chip->rtc = rtc_device_register("ds2404",
+                               &pdev->dev, &ds2404_rtc_ops, THIS_MODULE);
+       if (IS_ERR(chip->rtc)) {
+               retval = PTR_ERR(chip->rtc);
+               goto err_io;
+       }
+
+       ds2404_enable_osc(&pdev->dev);
+       return 0;
+
+err_io:
+       chip->ops->unmap_io(chip);
+err_chip:
+       kfree(chip);
+       return retval;
+}
+
+static int rtc_remove(struct platform_device *dev)
+{
+       struct ds2404 *chip = platform_get_drvdata(dev);
+       struct rtc_device *rtc = chip->rtc;
+
+       if (rtc)
+               rtc_device_unregister(rtc);
+
+       chip->ops->unmap_io(chip);
+       kfree(chip);
+
+       return 0;
+}
+
+static struct platform_driver rtc_device_driver = {
+       .probe  = rtc_probe,
+       .remove = rtc_remove,
+       .driver = {
+               .name   = "ds2404",
+               .owner  = THIS_MODULE,
+       },
+};
+
+module_platform_driver(rtc_device_driver);
+
+MODULE_DESCRIPTION("DS2404 RTC");
+MODULE_AUTHOR("Sven Schnelle");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:ds2404");
index dd2aeee6c66a05a7df3644f612d9033e9add496b..0ea86acdffbca212c616837a698ec3ec4090adad 100644 (file)
@@ -697,6 +697,7 @@ isl1208_remove(struct i2c_client *client)
 
 static const struct i2c_device_id isl1208_id[] = {
        { "isl1208", 0 },
+       { "isl1218", 0 },
        { }
 };
 MODULE_DEVICE_TABLE(i2c, isl1208_id);
index 05ab227eeff725aaa11761990fc520de9b71ce07..1224182d3eabb165892aeb3350c1914bf590c9a1 100644 (file)
@@ -42,7 +42,7 @@ struct jz4740_rtc {
 
        struct rtc_device *rtc;
 
-       unsigned int irq;
+       int irq;
 
        spinlock_t lock;
 };
diff --git a/drivers/rtc/rtc-max8907.c b/drivers/rtc/rtc-max8907.c
new file mode 100644 (file)
index 0000000..4880374
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ * RTC driver for Maxim MAX8907
+ *
+ * Copyright (c) 2011-2012, NVIDIA Corporation.
+ *
+ * Based on drivers/rtc/rtc-max8925.c,
+ * Copyright (C) 2009-2010 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/bcd.h>
+#include <linux/i2c.h>
+#include <linux/mfd/max8907.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
+
+enum {
+       RTC_SEC = 0,
+       RTC_MIN,
+       RTC_HOUR,
+       RTC_WEEKDAY,
+       RTC_DATE,
+       RTC_MONTH,
+       RTC_YEAR1,
+       RTC_YEAR2,
+};
+
+#define TIME_NUM                       8
+#define ALARM_1SEC                     (1 << 7)
+#define HOUR_12                                (1 << 7)
+#define HOUR_AM_PM                     (1 << 5)
+#define ALARM0_IRQ                     (1 << 3)
+#define ALARM1_IRQ                     (1 << 2)
+#define ALARM0_STATUS                  (1 << 2)
+#define ALARM1_STATUS                  (1 << 1)
+
+struct max8907_rtc {
+       struct max8907          *max8907;
+       struct regmap           *regmap;
+       struct rtc_device       *rtc_dev;
+       int                     irq;
+};
+
+static irqreturn_t max8907_irq_handler(int irq, void *data)
+{
+       struct max8907_rtc *rtc = data;
+
+       regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0x7f, 0);
+
+       rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
+
+       return IRQ_HANDLED;
+}
+
+static void regs_to_tm(u8 *regs, struct rtc_time *tm)
+{
+       tm->tm_year = bcd2bin(regs[RTC_YEAR2]) * 100 +
+               bcd2bin(regs[RTC_YEAR1]) - 1900;
+       tm->tm_mon = bcd2bin(regs[RTC_MONTH] & 0x1f) - 1;
+       tm->tm_mday = bcd2bin(regs[RTC_DATE] & 0x3f);
+       tm->tm_wday = (regs[RTC_WEEKDAY] & 0x07) - 1;
+       if (regs[RTC_HOUR] & HOUR_12) {
+               tm->tm_hour = bcd2bin(regs[RTC_HOUR] & 0x01f);
+               if (tm->tm_hour == 12)
+                       tm->tm_hour = 0;
+               if (regs[RTC_HOUR] & HOUR_AM_PM)
+                       tm->tm_hour += 12;
+       } else {
+               tm->tm_hour = bcd2bin(regs[RTC_HOUR] & 0x03f);
+       }
+       tm->tm_min = bcd2bin(regs[RTC_MIN] & 0x7f);
+       tm->tm_sec = bcd2bin(regs[RTC_SEC] & 0x7f);
+}
+
+static void tm_to_regs(struct rtc_time *tm, u8 *regs)
+{
+       u8 high, low;
+
+       high = (tm->tm_year + 1900) / 100;
+       low = tm->tm_year % 100;
+       regs[RTC_YEAR2] = bin2bcd(high);
+       regs[RTC_YEAR1] = bin2bcd(low);
+       regs[RTC_MONTH] = bin2bcd(tm->tm_mon + 1);
+       regs[RTC_DATE] = bin2bcd(tm->tm_mday);
+       regs[RTC_WEEKDAY] = tm->tm_wday + 1;
+       regs[RTC_HOUR] = bin2bcd(tm->tm_hour);
+       regs[RTC_MIN] = bin2bcd(tm->tm_min);
+       regs[RTC_SEC] = bin2bcd(tm->tm_sec);
+}
+
+static int max8907_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct max8907_rtc *rtc = dev_get_drvdata(dev);
+       u8 regs[TIME_NUM];
+       int ret;
+
+       ret = regmap_bulk_read(rtc->regmap, MAX8907_REG_RTC_SEC, regs,
+                              TIME_NUM);
+       if (ret < 0)
+               return ret;
+
+       regs_to_tm(regs, tm);
+
+       return 0;
+}
+
+static int max8907_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct max8907_rtc *rtc = dev_get_drvdata(dev);
+       u8 regs[TIME_NUM];
+
+       tm_to_regs(tm, regs);
+
+       return regmap_bulk_write(rtc->regmap, MAX8907_REG_RTC_SEC, regs,
+                                TIME_NUM);
+}
+
+static int max8907_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct max8907_rtc *rtc = dev_get_drvdata(dev);
+       u8 regs[TIME_NUM];
+       unsigned int val;
+       int ret;
+
+       ret = regmap_bulk_read(rtc->regmap, MAX8907_REG_ALARM0_SEC, regs,
+                              TIME_NUM);
+       if (ret < 0)
+               return ret;
+
+       regs_to_tm(regs, &alrm->time);
+
+       ret = regmap_read(rtc->regmap, MAX8907_REG_ALARM0_CNTL, &val);
+       if (ret < 0)
+               return ret;
+
+       alrm->enabled = !!(val & 0x7f);
+
+       return 0;
+}
+
+static int max8907_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct max8907_rtc *rtc = dev_get_drvdata(dev);
+       u8 regs[TIME_NUM];
+       int ret;
+
+       tm_to_regs(&alrm->time, regs);
+
+       /* Disable alarm while we update the target time */
+       ret = regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL, 0x7f, 0);
+       if (ret < 0)
+               return ret;
+
+       ret = regmap_bulk_write(rtc->regmap, MAX8907_REG_ALARM0_SEC, regs,
+                               TIME_NUM);
+       if (ret < 0)
+               return ret;
+
+       if (alrm->enabled)
+               ret = regmap_update_bits(rtc->regmap, MAX8907_REG_ALARM0_CNTL,
+                                        0x7f, 0x7f);
+
+       return ret;
+}
+
+static const struct rtc_class_ops max8907_rtc_ops = {
+       .read_time      = max8907_rtc_read_time,
+       .set_time       = max8907_rtc_set_time,
+       .read_alarm     = max8907_rtc_read_alarm,
+       .set_alarm      = max8907_rtc_set_alarm,
+};
+
+static int __devinit max8907_rtc_probe(struct platform_device *pdev)
+{
+       struct max8907 *max8907 = dev_get_drvdata(pdev->dev.parent);
+       struct max8907_rtc *rtc;
+       int ret;
+
+       rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return -ENOMEM;
+       platform_set_drvdata(pdev, rtc);
+
+       rtc->max8907 = max8907;
+       rtc->regmap = max8907->regmap_rtc;
+
+       rtc->rtc_dev = rtc_device_register("max8907-rtc", &pdev->dev,
+                                       &max8907_rtc_ops, THIS_MODULE);
+       if (IS_ERR(rtc->rtc_dev)) {
+               ret = PTR_ERR(rtc->rtc_dev);
+               dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
+               return ret;
+       }
+
+       rtc->irq = regmap_irq_get_virq(max8907->irqc_rtc,
+                                      MAX8907_IRQ_RTC_ALARM0);
+       if (rtc->irq < 0) {
+               ret = rtc->irq;
+               goto err_unregister;
+       }
+
+       ret = request_threaded_irq(rtc->irq, NULL, max8907_irq_handler,
+                                  IRQF_ONESHOT, "max8907-alarm0", rtc);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "Failed to request IRQ%d: %d\n",
+                       rtc->irq, ret);
+               goto err_unregister;
+       }
+
+       return 0;
+
+err_unregister:
+       rtc_device_unregister(rtc->rtc_dev);
+       return ret;
+}
+
+static int __devexit max8907_rtc_remove(struct platform_device *pdev)
+{
+       struct max8907_rtc *rtc = platform_get_drvdata(pdev);
+
+       free_irq(rtc->irq, rtc);
+       rtc_device_unregister(rtc->rtc_dev);
+
+       return 0;
+}
+
+static struct platform_driver max8907_rtc_driver = {
+       .driver = {
+               .name = "max8907-rtc",
+               .owner = THIS_MODULE,
+       },
+       .probe = max8907_rtc_probe,
+       .remove = __devexit_p(max8907_rtc_remove),
+};
+module_platform_driver(max8907_rtc_driver);
+
+MODULE_DESCRIPTION("Maxim MAX8907 RTC driver");
+MODULE_LICENSE("GPL v2");
+
index e3e50d69baf85e75966bdea6d7f7b95ae6d65af0..cd0106293a4903672f2de56c008ecc7f0534c2e7 100644 (file)
@@ -343,7 +343,7 @@ static struct rtc_class_ops mxc_rtc_ops = {
        .alarm_irq_enable       = mxc_rtc_alarm_irq_enable,
 };
 
-static int __init mxc_rtc_probe(struct platform_device *pdev)
+static int __devinit mxc_rtc_probe(struct platform_device *pdev)
 {
        struct resource *res;
        struct rtc_device *rtc;
@@ -367,14 +367,14 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
        pdata->ioaddr = devm_ioremap(&pdev->dev, res->start,
                                     resource_size(res));
 
-       pdata->clk = clk_get(&pdev->dev, "rtc");
+       pdata->clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(pdata->clk)) {
                dev_err(&pdev->dev, "unable to get clock!\n");
                ret = PTR_ERR(pdata->clk);
                goto exit_free_pdata;
        }
 
-       clk_enable(pdata->clk);
+       clk_prepare_enable(pdata->clk);
        rate = clk_get_rate(pdata->clk);
 
        if (rate == 32768)
@@ -426,22 +426,20 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
 exit_clr_drvdata:
        platform_set_drvdata(pdev, NULL);
 exit_put_clk:
-       clk_disable(pdata->clk);
-       clk_put(pdata->clk);
+       clk_disable_unprepare(pdata->clk);
 
 exit_free_pdata:
 
        return ret;
 }
 
-static int __exit mxc_rtc_remove(struct platform_device *pdev)
+static int __devexit mxc_rtc_remove(struct platform_device *pdev)
 {
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
 
        rtc_device_unregister(pdata->rtc);
 
-       clk_disable(pdata->clk);
-       clk_put(pdata->clk);
+       clk_disable_unprepare(pdata->clk);
        platform_set_drvdata(pdev, NULL);
 
        return 0;
@@ -482,21 +480,11 @@ static struct platform_driver mxc_rtc_driver = {
 #endif
                   .owner       = THIS_MODULE,
        },
-       .remove         = __exit_p(mxc_rtc_remove),
+       .probe = mxc_rtc_probe,
+       .remove = __devexit_p(mxc_rtc_remove),
 };
 
-static int __init mxc_rtc_init(void)
-{
-       return platform_driver_probe(&mxc_rtc_driver, mxc_rtc_probe);
-}
-
-static void __exit mxc_rtc_exit(void)
-{
-       platform_driver_unregister(&mxc_rtc_driver);
-}
-
-module_init(mxc_rtc_init);
-module_exit(mxc_rtc_exit);
+module_platform_driver(mxc_rtc_driver);
 
 MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
 MODULE_DESCRIPTION("RTC driver for Freescale MXC");
index 0a59fda5c09d176c5fa0af1f90625c04a29232c4..e96236ac2e78a74cc9673f71ef8c07233343277f 100644 (file)
@@ -18,6 +18,26 @@
 
 #include "rtc-core.h"
 
+#define NAME_SIZE      10
+
+#if defined(CONFIG_RTC_HCTOSYS_DEVICE)
+static bool is_rtc_hctosys(struct rtc_device *rtc)
+{
+       int size;
+       char name[NAME_SIZE];
+
+       size = snprintf(name, NAME_SIZE, "rtc%d", rtc->id);
+       if (size >= NAME_SIZE)
+               return false;
+
+       return !strncmp(name, CONFIG_RTC_HCTOSYS_DEVICE, NAME_SIZE);
+}
+#else
+static bool is_rtc_hctosys(struct rtc_device *rtc)
+{
+       return (rtc->id == 0);
+}
+#endif
 
 static int rtc_proc_show(struct seq_file *seq, void *offset)
 {
@@ -117,12 +137,12 @@ static const struct file_operations rtc_proc_fops = {
 
 void rtc_proc_add_device(struct rtc_device *rtc)
 {
-       if (rtc->id == 0)
+       if (is_rtc_hctosys(rtc))
                proc_create_data("driver/rtc", 0, NULL, &rtc_proc_fops, rtc);
 }
 
 void rtc_proc_del_device(struct rtc_device *rtc)
 {
-       if (rtc->id == 0)
+       if (is_rtc_hctosys(rtc))
                remove_proc_entry("driver/rtc", NULL);
 }
diff --git a/drivers/rtc/rtc-rc5t583.c b/drivers/rtc/rtc-rc5t583.c
new file mode 100644 (file)
index 0000000..cdb140c
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * rtc-rc5t583.c -- RICOH RC5T583 Real Time Clock
+ *
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ * Author: Venu Byravarasu <vbyravarasu@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/rc5t583.h>
+
+struct rc5t583_rtc {
+       struct rtc_device       *rtc;
+       /* To store the list of enabled interrupts during system suspend */
+       u32 irqen;
+};
+
+/* Total number of RTC registers needed to set time */
+#define NUM_TIME_REGS  (RC5T583_RTC_YEAR - RC5T583_RTC_SEC + 1)
+
+/* Total number of RTC registers needed to set Y-Alarm */
+#define NUM_YAL_REGS   (RC5T583_RTC_AY_YEAR - RC5T583_RTC_AY_MIN + 1)
+
+/* Set Y-Alarm interrupt */
+#define SET_YAL BIT(5)
+
+/* Get Y-Alarm interrupt status */
+#define GET_YAL_STATUS BIT(3)
+
+static int rc5t583_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       u8 val;
+
+       /* Set Y-Alarm, based on 'enabled' */
+       val = enabled ? SET_YAL : 0;
+
+       return regmap_update_bits(rc5t583->regmap, RC5T583_RTC_CTL1, SET_YAL,
+               val);
+}
+
+/*
+ * Gets current rc5t583 RTC time and date parameters.
+ *
+ * The RTC's time/alarm representation is not what gmtime(3) requires
+ * Linux to use:
+ *
+ *  - Months are 1..12 vs Linux 0-11
+ *  - Years are 0..99 vs Linux 1900..N (we assume 21st century)
+ */
+static int rc5t583_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       u8 rtc_data[NUM_TIME_REGS];
+       int ret;
+
+       ret = regmap_bulk_read(rc5t583->regmap, RC5T583_RTC_SEC, rtc_data,
+               NUM_TIME_REGS);
+       if (ret < 0) {
+               dev_err(dev, "RTC read time failed with err:%d\n", ret);
+               return ret;
+       }
+
+       tm->tm_sec = bcd2bin(rtc_data[0]);
+       tm->tm_min = bcd2bin(rtc_data[1]);
+       tm->tm_hour = bcd2bin(rtc_data[2]);
+       tm->tm_wday = bcd2bin(rtc_data[3]);
+       tm->tm_mday = bcd2bin(rtc_data[4]);
+       tm->tm_mon = bcd2bin(rtc_data[5]) - 1;
+       tm->tm_year = bcd2bin(rtc_data[6]) + 100;
+
+       return ret;
+}
+
+static int rc5t583_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       unsigned char rtc_data[NUM_TIME_REGS];
+       int ret;
+
+       rtc_data[0] = bin2bcd(tm->tm_sec);
+       rtc_data[1] = bin2bcd(tm->tm_min);
+       rtc_data[2] = bin2bcd(tm->tm_hour);
+       rtc_data[3] = bin2bcd(tm->tm_wday);
+       rtc_data[4] = bin2bcd(tm->tm_mday);
+       rtc_data[5] = bin2bcd(tm->tm_mon + 1);
+       rtc_data[6] = bin2bcd(tm->tm_year - 100);
+
+       ret = regmap_bulk_write(rc5t583->regmap, RC5T583_RTC_SEC, rtc_data,
+               NUM_TIME_REGS);
+       if (ret < 0) {
+               dev_err(dev, "RTC set time failed with error %d\n", ret);
+               return ret;
+       }
+
+       return ret;
+}
+
+static int rc5t583_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       unsigned char alarm_data[NUM_YAL_REGS];
+       u32 interrupt_enable;
+       int ret;
+
+       ret = regmap_bulk_read(rc5t583->regmap, RC5T583_RTC_AY_MIN, alarm_data,
+               NUM_YAL_REGS);
+       if (ret < 0) {
+               dev_err(dev, "rtc_read_alarm error %d\n", ret);
+               return ret;
+       }
+
+       alm->time.tm_min = bcd2bin(alarm_data[0]);
+       alm->time.tm_hour = bcd2bin(alarm_data[1]);
+       alm->time.tm_mday = bcd2bin(alarm_data[2]);
+       alm->time.tm_mon = bcd2bin(alarm_data[3]) - 1;
+       alm->time.tm_year = bcd2bin(alarm_data[4]) + 100;
+
+       ret = regmap_read(rc5t583->regmap, RC5T583_RTC_CTL1, &interrupt_enable);
+       if (ret < 0)
+               return ret;
+
+       /* check if YALE is set */
+       if (interrupt_enable & SET_YAL)
+               alm->enabled = 1;
+
+       return ret;
+}
+
+static int rc5t583_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       unsigned char alarm_data[NUM_YAL_REGS];
+       int ret;
+
+       ret = rc5t583_rtc_alarm_irq_enable(dev, 0);
+       if (ret)
+               return ret;
+
+       alarm_data[0] = bin2bcd(alm->time.tm_min);
+       alarm_data[1] = bin2bcd(alm->time.tm_hour);
+       alarm_data[2] = bin2bcd(alm->time.tm_mday);
+       alarm_data[3] = bin2bcd(alm->time.tm_mon + 1);
+       alarm_data[4] = bin2bcd(alm->time.tm_year - 100);
+
+       ret = regmap_bulk_write(rc5t583->regmap, RC5T583_RTC_AY_MIN, alarm_data,
+               NUM_YAL_REGS);
+       if (ret) {
+               dev_err(dev, "rtc_set_alarm error %d\n", ret);
+               return ret;
+       }
+
+       if (alm->enabled)
+               ret = rc5t583_rtc_alarm_irq_enable(dev, 1);
+
+       return ret;
+}
+
+static irqreturn_t rc5t583_rtc_interrupt(int irq, void *rtc)
+{
+       struct device *dev = rtc;
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       struct rc5t583_rtc *rc5t583_rtc = dev_get_drvdata(dev);
+       unsigned long events = 0;
+       int ret;
+       u32 rtc_reg;
+
+       ret = regmap_read(rc5t583->regmap, RC5T583_RTC_CTL2, &rtc_reg);
+       if (ret < 0)
+               return IRQ_NONE;
+
+       if (rtc_reg & GET_YAL_STATUS) {
+               events = RTC_IRQF | RTC_AF;
+               /* clear pending Y-alarm interrupt bit */
+               rtc_reg &= ~GET_YAL_STATUS;
+       }
+
+       ret = regmap_write(rc5t583->regmap, RC5T583_RTC_CTL2, rtc_reg);
+       if (ret)
+               return IRQ_NONE;
+
+       /* Notify RTC core on event */
+       rtc_update_irq(rc5t583_rtc->rtc, 1, events);
+
+       return IRQ_HANDLED;
+}
+
+static const struct rtc_class_ops rc5t583_rtc_ops = {
+       .read_time      = rc5t583_rtc_read_time,
+       .set_time       = rc5t583_rtc_set_time,
+       .read_alarm     = rc5t583_rtc_read_alarm,
+       .set_alarm      = rc5t583_rtc_set_alarm,
+       .alarm_irq_enable = rc5t583_rtc_alarm_irq_enable,
+};
+
+static int __devinit rc5t583_rtc_probe(struct platform_device *pdev)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(pdev->dev.parent);
+       struct rc5t583_rtc *ricoh_rtc;
+       struct rc5t583_platform_data *pmic_plat_data;
+       int ret;
+       int irq;
+
+       ricoh_rtc = devm_kzalloc(&pdev->dev, sizeof(struct rc5t583_rtc),
+                       GFP_KERNEL);
+       if (!ricoh_rtc)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, ricoh_rtc);
+
+       /* Clear pending interrupts */
+       ret = regmap_write(rc5t583->regmap, RC5T583_RTC_CTL2, 0);
+       if (ret < 0)
+               return ret;
+
+       /* clear RTC Adjust register */
+       ret = regmap_write(rc5t583->regmap, RC5T583_RTC_ADJ, 0);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "unable to program rtc_adjust reg\n");
+               return ret;
+       }
+
+       pmic_plat_data = dev_get_platdata(rc5t583->dev);
+       irq = pmic_plat_data->irq_base;
+       if (irq <= 0) {
+               dev_warn(&pdev->dev, "Wake up is not possible as irq = %d\n",
+                       irq);
+               return -ENXIO;
+       }
+
+       irq += RC5T583_IRQ_YALE;
+       ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+               rc5t583_rtc_interrupt, IRQF_TRIGGER_LOW,
+               "rtc-rc5t583", &pdev->dev);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to request irq %d: %d\n",
+                       irq, ret);
+               return ret;
+       }
+       device_init_wakeup(&pdev->dev, 1);
+
+       ricoh_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev,
+               &rc5t583_rtc_ops, THIS_MODULE);
+       if (IS_ERR(ricoh_rtc->rtc)) {
+               ret = PTR_ERR(ricoh_rtc->rtc);
+               dev_err(&pdev->dev, "RTC device register: err %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Disable the rc5t583 RTC alarm interrupt and unregister the RTC
+ * device on driver removal.
+ */
+static int __devexit rc5t583_rtc_remove(struct platform_device *pdev)
+{
+       struct rc5t583_rtc *rc5t583_rtc = dev_get_drvdata(&pdev->dev);
+
+       rc5t583_rtc_alarm_irq_enable(&pdev->dev, 0);
+
+       rtc_device_unregister(rc5t583_rtc->rtc);
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+
+static int rc5t583_rtc_suspend(struct device *dev)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       struct rc5t583_rtc *rc5t583_rtc = dev_get_drvdata(dev);
+       int ret;
+
+       /* Store current list of enabled interrupts */
+       ret = regmap_read(rc5t583->regmap, RC5T583_RTC_CTL1,
+               &rc5t583_rtc->irqen);
+       return ret;
+}
+
+static int rc5t583_rtc_resume(struct device *dev)
+{
+       struct rc5t583 *rc5t583 = dev_get_drvdata(dev->parent);
+       struct rc5t583_rtc *rc5t583_rtc = dev_get_drvdata(dev);
+
+       /* Restore list of enabled interrupts before suspend */
+       return regmap_write(rc5t583->regmap, RC5T583_RTC_CTL1,
+               rc5t583_rtc->irqen);
+}
+
+static const struct dev_pm_ops rc5t583_rtc_pm_ops = {
+       .suspend        = rc5t583_rtc_suspend,
+       .resume         = rc5t583_rtc_resume,
+};
+
+#define DEV_PM_OPS     (&rc5t583_rtc_pm_ops)
+#else
+#define DEV_PM_OPS     NULL
+#endif
+
+static struct platform_driver rc5t583_rtc_driver = {
+       .probe          = rc5t583_rtc_probe,
+       .remove         = __devexit_p(rc5t583_rtc_remove),
+       .driver         = {
+               .owner  = THIS_MODULE,
+               .name   = "rtc-rc5t583",
+               .pm     = DEV_PM_OPS,
+       },
+};
+
+module_platform_driver(rc5t583_rtc_driver);
+MODULE_ALIAS("platform:rtc-rc5t583");
+MODULE_AUTHOR("Venu Byravarasu <vbyravarasu@nvidia.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c
new file mode 100644 (file)
index 0000000..3c0da33
--- /dev/null
@@ -0,0 +1,350 @@
+/*
+ * Copyright (C) 2011-2012 Freescale Semiconductor, Inc.
+ *
+ * The code contained herein is licensed under the GNU General Public
+ * License. You may obtain a copy of the GNU General Public License
+ * Version 2 or later at the following locations:
+ *
+ * http://www.opensource.org/licenses/gpl-license.html
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+/* These register offsets are relative to LP (Low Power) range */
+#define SNVS_LPCR              0x04
+#define SNVS_LPSR              0x18
+#define SNVS_LPSRTCMR          0x1c
+#define SNVS_LPSRTCLR          0x20
+#define SNVS_LPTAR             0x24
+#define SNVS_LPPGDR            0x30
+
+#define SNVS_LPCR_SRTC_ENV     (1 << 0)
+#define SNVS_LPCR_LPTA_EN      (1 << 1)
+#define SNVS_LPCR_LPWUI_EN     (1 << 3)
+#define SNVS_LPSR_LPTA         (1 << 0)
+
+#define SNVS_LPPGDR_INIT       0x41736166
+#define CNTR_TO_SECS_SH                15
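+/* The counter ticks at 32768 Hz, so bits 46:15 hold whole seconds */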
+
+struct snvs_rtc_data {
+       struct rtc_device *rtc;
+       void __iomem *ioaddr;
+       int irq;
+       spinlock_t lock;
+};
+
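+/*
+ * The 47-bit counter is split across the MR (high) and LR (low)
+ * registers and cannot be read atomically, so sample it twice and
+ * retry until both samples match; this guarantees the low word did
+ * not roll over between the two register reads.
+ */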
+static u32 rtc_read_lp_counter(void __iomem *ioaddr)
+{
+       u64 read1, read2;
+
+       do {
+               read1 = readl(ioaddr + SNVS_LPSRTCMR);
+               read1 <<= 32;
+               read1 |= readl(ioaddr + SNVS_LPSRTCLR);
+
+               read2 = readl(ioaddr + SNVS_LPSRTCMR);
+               read2 <<= 32;
+               read2 |= readl(ioaddr + SNVS_LPSRTCLR);
+       } while (read1 != read2);
+
+       /* Convert 47-bit counter to 32-bit raw second count */
+       return (u32) (read1 >> CNTR_TO_SECS_SH);
+}
+
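+/*
+ * Writes to the LP domain are synchronized on the slow RTC clock.
+ * Waiting until the LR counter has advanced three times ensures that
+ * a previously written value has actually taken effect.
+ */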
+static void rtc_write_sync_lp(void __iomem *ioaddr)
+{
+       u32 count1, count2, count3;
+       int i;
+
+       /* Wait for 3 CKIL cycles */
+       for (i = 0; i < 3; i++) {
+               do {
+                       count1 = readl(ioaddr + SNVS_LPSRTCLR);
+                       count2 = readl(ioaddr + SNVS_LPSRTCLR);
+               } while (count1 != count2);
+
+               /* Now wait until counter value changes */
+               do {
+                       do {
+                               count2 = readl(ioaddr + SNVS_LPSRTCLR);
+                               count3 = readl(ioaddr + SNVS_LPSRTCLR);
+                       } while (count2 != count3);
+               } while (count3 == count1);
+       }
+}
+
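+/*
+ * Enabling or disabling the SRTC does not take effect immediately, so
+ * poll SNVS_LPCR until the SRTC_ENV bit reflects the requested state,
+ * giving up after 1000 reads.
+ */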
+static int snvs_rtc_enable(struct snvs_rtc_data *data, bool enable)
+{
+       unsigned long flags;
+       int timeout = 1000;
+       u32 lpcr;
+
+       spin_lock_irqsave(&data->lock, flags);
+
+       lpcr = readl(data->ioaddr + SNVS_LPCR);
+       if (enable)
+               lpcr |= SNVS_LPCR_SRTC_ENV;
+       else
+               lpcr &= ~SNVS_LPCR_SRTC_ENV;
+       writel(lpcr, data->ioaddr + SNVS_LPCR);
+
+       spin_unlock_irqrestore(&data->lock, flags);
+
+       while (--timeout) {
+               lpcr = readl(data->ioaddr + SNVS_LPCR);
+
+               if (enable) {
+                       if (lpcr & SNVS_LPCR_SRTC_ENV)
+                               break;
+               } else {
+                       if (!(lpcr & SNVS_LPCR_SRTC_ENV))
+                               break;
+               }
+       }
+
+       if (!timeout)
+               return -ETIMEDOUT;
+
+       return 0;
+}
+
+static int snvs_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       unsigned long time = rtc_read_lp_counter(data->ioaddr);
+
+       rtc_time_to_tm(time, tm);
+
+       return 0;
+}
+
+static int snvs_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       unsigned long time;
+
+       rtc_tm_to_time(tm, &time);
+
+       /* Disable RTC first */
+       snvs_rtc_enable(data, false);
+
+       /* Write 32-bit time to 47-bit timer, leaving 15 LSBs blank */
+       writel(time << CNTR_TO_SECS_SH, data->ioaddr + SNVS_LPSRTCLR);
+       writel(time >> (32 - CNTR_TO_SECS_SH), data->ioaddr + SNVS_LPSRTCMR);
+
+       /* Enable RTC again */
+       snvs_rtc_enable(data, true);
+
+       return 0;
+}
+
+static int snvs_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       u32 lptar, lpsr;
+
+       lptar = readl(data->ioaddr + SNVS_LPTAR);
+       rtc_time_to_tm(lptar, &alrm->time);
+
+       lpsr = readl(data->ioaddr + SNVS_LPSR);
+       alrm->pending = (lpsr & SNVS_LPSR_LPTA) ? 1 : 0;
+
+       return 0;
+}
+
+static int snvs_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       u32 lpcr;
+       unsigned long flags;
+
+       spin_lock_irqsave(&data->lock, flags);
+
+       lpcr = readl(data->ioaddr + SNVS_LPCR);
+       if (enable)
+               lpcr |= (SNVS_LPCR_LPTA_EN | SNVS_LPCR_LPWUI_EN);
+       else
+               lpcr &= ~(SNVS_LPCR_LPTA_EN | SNVS_LPCR_LPWUI_EN);
+       writel(lpcr, data->ioaddr + SNVS_LPCR);
+
+       spin_unlock_irqrestore(&data->lock, flags);
+
+       rtc_write_sync_lp(data->ioaddr);
+
+       return 0;
+}
+
+static int snvs_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       struct rtc_time *alrm_tm = &alrm->time;
+       unsigned long time;
+       unsigned long flags;
+       u32 lpcr;
+
+       rtc_tm_to_time(alrm_tm, &time);
+
+       spin_lock_irqsave(&data->lock, flags);
+
+       /* Have to clear LPTA_EN before programming new alarm time in LPTAR */
+       lpcr = readl(data->ioaddr + SNVS_LPCR);
+       lpcr &= ~SNVS_LPCR_LPTA_EN;
+       writel(lpcr, data->ioaddr + SNVS_LPCR);
+
+       spin_unlock_irqrestore(&data->lock, flags);
+
+       writel(time, data->ioaddr + SNVS_LPTAR);
+
+       /* Clear alarm interrupt status bit */
+       writel(SNVS_LPSR_LPTA, data->ioaddr + SNVS_LPSR);
+
+       return snvs_rtc_alarm_irq_enable(dev, alrm->enabled);
+}
+
+static const struct rtc_class_ops snvs_rtc_ops = {
+       .read_time = snvs_rtc_read_time,
+       .set_time = snvs_rtc_set_time,
+       .read_alarm = snvs_rtc_read_alarm,
+       .set_alarm = snvs_rtc_set_alarm,
+       .alarm_irq_enable = snvs_rtc_alarm_irq_enable,
+};
+
+static irqreturn_t snvs_rtc_irq_handler(int irq, void *dev_id)
+{
+       struct device *dev = dev_id;
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+       u32 lpsr;
+       u32 events = 0;
+
+       lpsr = readl(data->ioaddr + SNVS_LPSR);
+
+       if (lpsr & SNVS_LPSR_LPTA) {
+               events |= (RTC_AF | RTC_IRQF);
+
+               /* RTC alarm should be one-shot */
+               snvs_rtc_alarm_irq_enable(dev, 0);
+
+               rtc_update_irq(data->rtc, 1, events);
+       }
+
+       /* clear interrupt status */
+       writel(lpsr, data->ioaddr + SNVS_LPSR);
+
+       return events ? IRQ_HANDLED : IRQ_NONE;
+}
+
+static int __devinit snvs_rtc_probe(struct platform_device *pdev)
+{
+       struct snvs_rtc_data *data;
+       struct resource *res;
+       int ret;
+
+       data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       data->ioaddr = devm_request_and_ioremap(&pdev->dev, res);
+       if (!data->ioaddr)
+               return -EADDRNOTAVAIL;
+
+       data->irq = platform_get_irq(pdev, 0);
+       if (data->irq < 0)
+               return data->irq;
+
+       platform_set_drvdata(pdev, data);
+
+       spin_lock_init(&data->lock);
+
+       /* Initialize glitch detect */
+       writel(SNVS_LPPGDR_INIT, data->ioaddr + SNVS_LPPGDR);
+
+       /* Clear interrupt status */
+       writel(0xffffffff, data->ioaddr + SNVS_LPSR);
+
+       /* Enable RTC */
+       snvs_rtc_enable(data, true);
+
+       device_init_wakeup(&pdev->dev, true);
+
+       ret = devm_request_irq(&pdev->dev, data->irq, snvs_rtc_irq_handler,
+                              IRQF_SHARED, "rtc alarm", &pdev->dev);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to request irq %d: %d\n",
+                       data->irq, ret);
+               return ret;
+       }
+
+       data->rtc = rtc_device_register(pdev->name, &pdev->dev,
+                                       &snvs_rtc_ops, THIS_MODULE);
+       if (IS_ERR(data->rtc)) {
+               ret = PTR_ERR(data->rtc);
+               dev_err(&pdev->dev, "failed to register rtc: %d\n", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int __devexit snvs_rtc_remove(struct platform_device *pdev)
+{
+       struct snvs_rtc_data *data = platform_get_drvdata(pdev);
+
+       rtc_device_unregister(data->rtc);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int snvs_rtc_suspend(struct device *dev)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               enable_irq_wake(data->irq);
+
+       return 0;
+}
+
+static int snvs_rtc_resume(struct device *dev)
+{
+       struct snvs_rtc_data *data = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               disable_irq_wake(data->irq);
+
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(snvs_rtc_pm_ops, snvs_rtc_suspend, snvs_rtc_resume);
+
+static const struct of_device_id __devinitconst snvs_dt_ids[] = {
+       { .compatible = "fsl,sec-v4.0-mon-rtc-lp", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, snvs_dt_ids);
+
+static struct platform_driver snvs_rtc_driver = {
+       .driver = {
+               .name   = "snvs_rtc",
+               .owner  = THIS_MODULE,
+               .pm     = &snvs_rtc_pm_ops,
+               .of_match_table = snvs_dt_ids,
+       },
+       .probe          = snvs_rtc_probe,
+       .remove         = __devexit_p(snvs_rtc_remove),
+};
+module_platform_driver(snvs_rtc_driver);
+
+MODULE_AUTHOR("Freescale Semiconductor, Inc.");
+MODULE_DESCRIPTION("Freescale SNVS RTC Driver");
+MODULE_LICENSE("GPL");
index e2785479113ca06648949d6c6a78adee92c78367..bb507d23f6cea09e3c2ab80fec144e1acbb1c4a9 100644 (file)
@@ -235,7 +235,7 @@ static int spear_rtc_read_time(struct device *dev, struct rtc_time *tm)
 static int spear_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
        struct spear_rtc_config *config = dev_get_drvdata(dev);
-       unsigned int time, date, err = 0;
+       unsigned int time, date;
 
        if (tm2bcd(tm) < 0)
                return -EINVAL;
@@ -247,11 +247,8 @@ static int spear_rtc_set_time(struct device *dev, struct rtc_time *tm)
                (tm->tm_year << YEAR_SHIFT);
        writel(time, config->ioaddr + TIME_REG);
        writel(date, config->ioaddr + DATE_REG);
-       err = is_write_complete(config);
-       if (err < 0)
-               return err;
 
-       return 0;
+       return is_write_complete(config);
 }
 
 /*
@@ -295,7 +292,8 @@ static int spear_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
 static int spear_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 {
        struct spear_rtc_config *config = dev_get_drvdata(dev);
-       unsigned int time, date, err = 0;
+       unsigned int time, date;
+       int err;
 
        if (tm2bcd(&alm->time) < 0)
                return -EINVAL;
@@ -357,7 +355,7 @@ static int __devinit spear_rtc_probe(struct platform_device *pdev)
 {
        struct resource *res;
        struct spear_rtc_config *config;
-       unsigned int status = 0;
+       int status = 0;
        int irq;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
index 380083ca572fd480ba0f060c2f9ead3925d855a4..b70e2bb6364500fb7d02c2dffb6c01575e1771e4 100644 (file)
@@ -102,6 +102,12 @@ rtc_sysfs_set_max_user_freq(struct device *dev, struct device_attribute *attr,
        return n;
 }
 
+/**
+ * rtc_sysfs_show_hctosys - indicate if the given RTC set the system time
+ *
+ * Returns 1 if the system clock was set by this RTC at the last
+ * boot or resume event.
+ */
 static ssize_t
 rtc_sysfs_show_hctosys(struct device *dev, struct device_attribute *attr,
                char *buf)
diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c
new file mode 100644 (file)
index 0000000..691ab96
--- /dev/null
@@ -0,0 +1,352 @@
+/*
+ * rtc-tps65910.c -- TPS65910 Real Time Clock interface
+ *
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ * Author: Venu Byravarasu <vbyravarasu@nvidia.com>
+ *
+ * Based on original TI driver rtc-twl.c
+ *   Copyright (C) 2007 MontaVista Software, Inc
+ *   Author: Alexandre Rusev <source@mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/tps65910.h>
+
+struct tps65910_rtc {
+       struct rtc_device       *rtc;
+       /* To store the list of enabled interrupts */
+       u32 irqstat;
+};
+
+/* Total number of RTC registers needed to set time */
+#define NUM_TIME_REGS  (TPS65910_YEARS - TPS65910_SECONDS + 1)
+
+static int tps65910_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
+{
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       u8 val = 0;
+
+       if (enabled)
+               val = TPS65910_RTC_INTERRUPTS_IT_ALARM;
+
+       return regmap_write(tps->regmap, TPS65910_RTC_INTERRUPTS, val);
+}
+
+/*
+ * Gets current tps65910 RTC time and date parameters.
+ *
+ * The RTC's time/alarm representation is not what gmtime(3) requires
+ * Linux to use:
+ *
+ *  - Months are 1..12 vs Linux 0..11
+ *  - Years are 0..99 vs Linux 1900..N (we assume 21st century)
+ */
+static int tps65910_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       unsigned char rtc_data[NUM_TIME_REGS];
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       int ret;
+
+       /* Copy RTC counting registers to static registers or latches */
+       ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+               TPS65910_RTC_CTRL_GET_TIME, TPS65910_RTC_CTRL_GET_TIME);
+       if (ret < 0) {
+               dev_err(dev, "RTC CTRL reg update failed with err:%d\n", ret);
+               return ret;
+       }
+
+       ret = regmap_bulk_read(tps->regmap, TPS65910_SECONDS, rtc_data,
+               NUM_TIME_REGS);
+       if (ret < 0) {
+               dev_err(dev, "reading from RTC failed with err:%d\n", ret);
+               return ret;
+       }
+
+       tm->tm_sec = bcd2bin(rtc_data[0]);
+       tm->tm_min = bcd2bin(rtc_data[1]);
+       tm->tm_hour = bcd2bin(rtc_data[2]);
+       tm->tm_mday = bcd2bin(rtc_data[3]);
+       tm->tm_mon = bcd2bin(rtc_data[4]) - 1;
+       tm->tm_year = bcd2bin(rtc_data[5]) + 100;
+
+       return ret;
+}
+
+static int tps65910_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       unsigned char rtc_data[NUM_TIME_REGS];
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       int ret;
+
+       rtc_data[0] = bin2bcd(tm->tm_sec);
+       rtc_data[1] = bin2bcd(tm->tm_min);
+       rtc_data[2] = bin2bcd(tm->tm_hour);
+       rtc_data[3] = bin2bcd(tm->tm_mday);
+       rtc_data[4] = bin2bcd(tm->tm_mon + 1);
+       rtc_data[5] = bin2bcd(tm->tm_year - 100);
+
+       /* Stop RTC while updating the RTC time registers */
+       ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+               TPS65910_RTC_CTRL_STOP_RTC, 0);
+       if (ret < 0) {
+               dev_err(dev, "RTC stop failed with err:%d\n", ret);
+               return ret;
+       }
+
+       /* update all the time registers in one shot */
+       ret = regmap_bulk_write(tps->regmap, TPS65910_SECONDS, rtc_data,
+               NUM_TIME_REGS);
+       if (ret < 0) {
+               dev_err(dev, "rtc_set_time error %d\n", ret);
+               return ret;
+       }
+
+       /* Start the RTC again */
+       ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+               TPS65910_RTC_CTRL_STOP_RTC, 1);
+       if (ret < 0)
+               dev_err(dev, "RTC start failed with err:%d\n", ret);
+
+       return ret;
+}
+
+/*
+ * Gets current tps65910 RTC alarm time.
+ */
+static int tps65910_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       unsigned char alarm_data[NUM_TIME_REGS];
+       u32 int_val;
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       int ret;
+
+       ret = regmap_bulk_read(tps->regmap, TPS65910_SECONDS, alarm_data,
+               NUM_TIME_REGS);
+       if (ret < 0) {
+               dev_err(dev, "rtc_read_alarm error %d\n", ret);
+               return ret;
+       }
+
+       alm->time.tm_sec = bcd2bin(alarm_data[0]);
+       alm->time.tm_min = bcd2bin(alarm_data[1]);
+       alm->time.tm_hour = bcd2bin(alarm_data[2]);
+       alm->time.tm_mday = bcd2bin(alarm_data[3]);
+       alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1;
+       alm->time.tm_year = bcd2bin(alarm_data[5]) + 100;
+
+       ret = regmap_read(tps->regmap, TPS65910_RTC_INTERRUPTS, &int_val);
+       if (ret < 0)
+               return ret;
+
+       if (int_val & TPS65910_RTC_INTERRUPTS_IT_ALARM)
+               alm->enabled = 1;
+
+       return ret;
+}
+
+static int tps65910_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+       unsigned char alarm_data[NUM_TIME_REGS];
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       int ret;
+
+       ret = tps65910_rtc_alarm_irq_enable(dev, 0);
+       if (ret)
+               return ret;
+
+       alarm_data[0] = bin2bcd(alm->time.tm_sec);
+       alarm_data[1] = bin2bcd(alm->time.tm_min);
+       alarm_data[2] = bin2bcd(alm->time.tm_hour);
+       alarm_data[3] = bin2bcd(alm->time.tm_mday);
+       alarm_data[4] = bin2bcd(alm->time.tm_mon + 1);
+       alarm_data[5] = bin2bcd(alm->time.tm_year - 100);
+
+       /* update all the alarm registers in one shot */
+       ret = regmap_bulk_write(tps->regmap, TPS65910_ALARM_SECONDS,
+               alarm_data, NUM_TIME_REGS);
+       if (ret) {
+               dev_err(dev, "rtc_set_alarm error %d\n", ret);
+               return ret;
+       }
+
+       if (alm->enabled)
+               ret = tps65910_rtc_alarm_irq_enable(dev, 1);
+
+       return ret;
+}
+
+static irqreturn_t tps65910_rtc_interrupt(int irq, void *rtc)
+{
+       struct device *dev = rtc;
+       unsigned long events = 0;
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       struct tps65910_rtc *tps_rtc = dev_get_drvdata(dev);
+       int ret;
+       u32 rtc_reg;
+
+       ret = regmap_read(tps->regmap, TPS65910_RTC_STATUS, &rtc_reg);
+       if (ret)
+               return IRQ_NONE;
+
+       if (rtc_reg & TPS65910_RTC_STATUS_ALARM)
+               events = RTC_IRQF | RTC_AF;
+
+       ret = regmap_write(tps->regmap, TPS65910_RTC_STATUS, rtc_reg);
+       if (ret)
+               return IRQ_NONE;
+
+       /* Notify RTC core on event */
+       rtc_update_irq(tps_rtc->rtc, 1, events);
+
+       return IRQ_HANDLED;
+}
+
+static const struct rtc_class_ops tps65910_rtc_ops = {
+       .read_time      = tps65910_rtc_read_time,
+       .set_time       = tps65910_rtc_set_time,
+       .read_alarm     = tps65910_rtc_read_alarm,
+       .set_alarm      = tps65910_rtc_set_alarm,
+       .alarm_irq_enable = tps65910_rtc_alarm_irq_enable,
+};
+
+static int __devinit tps65910_rtc_probe(struct platform_device *pdev)
+{
+       struct tps65910 *tps65910 = NULL;
+       struct tps65910_rtc *tps_rtc = NULL;
+       struct tps65910_board *pmic_plat_data;
+       int ret;
+       int irq;
+       u32 rtc_reg;
+
+       tps65910 = dev_get_drvdata(pdev->dev.parent);
+
+       tps_rtc = devm_kzalloc(&pdev->dev, sizeof(struct tps65910_rtc),
+                       GFP_KERNEL);
+       if (!tps_rtc)
+               return -ENOMEM;
+
+       /* Clear pending interrupts */
+       ret = regmap_read(tps65910->regmap, TPS65910_RTC_STATUS, &rtc_reg);
+       if (ret < 0)
+               return ret;
+
+       ret = regmap_write(tps65910->regmap, TPS65910_RTC_STATUS, rtc_reg);
+       if (ret < 0)
+               return ret;
+
+       dev_dbg(&pdev->dev, "Enabling rtc-tps65910.\n");
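+       /* STOP_RTC is 1 = RTC running, 0 = RTC frozen, so this starts it */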
+       rtc_reg = TPS65910_RTC_CTRL_STOP_RTC;
+       ret = regmap_write(tps65910->regmap, TPS65910_RTC_CTRL, rtc_reg);
+       if (ret < 0)
+               return ret;
+
+       pmic_plat_data = dev_get_platdata(tps65910->dev);
+       irq = pmic_plat_data->irq_base;
+       if (irq <= 0) {
+               dev_warn(&pdev->dev, "Wake up is not possible as irq = %d\n",
+                       irq);
+               return -ENXIO;
+       }
+
+       irq += TPS65910_IRQ_RTC_ALARM;
+       ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+               tps65910_rtc_interrupt, IRQF_TRIGGER_LOW,
+               "rtc-tps65910", &pdev->dev);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to request irq %d: %d\n",
+                       irq, ret);
+               return ret;
+       }
+       device_init_wakeup(&pdev->dev, 1);
+
+       tps_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev,
+               &tps65910_rtc_ops, THIS_MODULE);
+       if (IS_ERR(tps_rtc->rtc)) {
+               ret = PTR_ERR(tps_rtc->rtc);
+               dev_err(&pdev->dev, "RTC device register: err %d\n", ret);
+               return ret;
+       }
+
+       platform_set_drvdata(pdev, tps_rtc);
+
+       return 0;
+}
+
+/*
+ * Disable the tps65910 RTC alarm interrupt on driver removal; the RTC
+ * itself is left running.
+ */
+static int __devexit tps65910_rtc_remove(struct platform_device *pdev)
+{
+       /* leave rtc running, but disable irqs */
+       struct tps65910_rtc *tps_rtc = platform_get_drvdata(pdev);
+
+       tps65910_rtc_alarm_irq_enable(&pdev->dev, 0);
+
+       rtc_device_unregister(tps_rtc->rtc);
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+
+static int tps65910_rtc_suspend(struct device *dev)
+{
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+       u8 alarm = TPS65910_RTC_INTERRUPTS_IT_ALARM;
+       int ret;
+
+       /* Store current list of enabled interrupts */
+       ret = regmap_read(tps->regmap, TPS65910_RTC_INTERRUPTS,
+               &tps->rtc->irqstat);
+       if (ret < 0)
+               return ret;
+
+       /* Enable RTC ALARM interrupt only */
+       return regmap_write(tps->regmap, TPS65910_RTC_INTERRUPTS, alarm);
+}
+
+static int tps65910_rtc_resume(struct device *dev)
+{
+       struct tps65910 *tps = dev_get_drvdata(dev->parent);
+
+       /* Restore list of enabled interrupts before suspend */
+       return regmap_write(tps->regmap, TPS65910_RTC_INTERRUPTS,
+               tps->rtc->irqstat);
+}
+
+static const struct dev_pm_ops tps65910_rtc_pm_ops = {
+       .suspend        = tps65910_rtc_suspend,
+       .resume         = tps65910_rtc_resume,
+};
+
+#define DEV_PM_OPS     (&tps65910_rtc_pm_ops)
+#else
+#define DEV_PM_OPS     NULL
+#endif
+
+static struct platform_driver tps65910_rtc_driver = {
+       .probe          = tps65910_rtc_probe,
+       .remove         = __devexit_p(tps65910_rtc_remove),
+       .driver         = {
+               .owner  = THIS_MODULE,
+               .name   = "rtc-tps65910",
+               .pm     = DEV_PM_OPS,
+       },
+};
+
+module_platform_driver(tps65910_rtc_driver);
+MODULE_ALIAS("platform:rtc-tps65910");
+MODULE_AUTHOR("Venu Byravarasu <vbyravarasu@nvidia.com>");
+MODULE_LICENSE("GPL");
index 68ce08552f699b6752cecfaa2c8a9174e8c7c634..a540162ac59c1483bfd71c43a2384df4a1525529 100644 (file)
@@ -1173,7 +1173,16 @@ wait_io1:
        outw(val, tmport);
        outb(2, 0x80);
 TCM_SYNC:
-       udelay(0x800);
+       /*
+        * The funny division into multiple delays is to accommodate
+        * arches like ARM where udelay() multiplies its argument by
+        * a large number to initialize a loop counter.  To avoid
+        * overflow, the maximum supported udelay is 2000 microseconds.
+        *
+        * XXX it would be more polite to find a way to use msleep()
+        */
+       mdelay(2);
+       udelay(48);
        if ((inb(tmport) & 0x80) == 0x00) {     /* bsy ? */
                outw(0, tmport--);
                outb(0, tmport);
index 9c5c5f2b3962626c39d645bfaa8da63c16808df4..be2c9a6561ffa4a95f0f1bb2b8718f74a6923ea2 100644 (file)
@@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
        }
 
        sfp->mmap_called = 1;
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data = sfp;
        vma->vm_ops = &sg_mmap_vm_ops;
        return 0;
index 94a740d2883dde4cce85a2e4859a0e94a949d436..634b9ae713e090d087b76c58bfe595637766d9f6 100644 (file)
@@ -332,7 +332,6 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma->vm_file)
                fput(vma->vm_file);
        vma->vm_file = asma->file;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
 
 out:
        mutex_unlock(&ashmem_mutex);
index 42728e0cc1945816574dd5879b8bcdc907c2bb92..c6f3ef6f57b9671e1dd18cc158faa831442895b4 100644 (file)
@@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer,
                goto out_unlock;
        }
 
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = obj->dev->driver->gem_vm_ops;
        vma->vm_private_data = obj;
        vma->vm_page_prot =  pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
index 6acea2b56aa47ac6e70d2aed1000c7faaf40cca9..cab9a437d00b7aa3ff2ad4d83c10bab0b699ef27 100644 (file)
@@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma)
 {
        u32 status;
 
-       vma->vm_flags |= VM_RESERVED | VM_IO;
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx "
index 99a5d7551b3395c69b2cf962b6da0627ae6c6cee..80126be5ee937c2e47883a344325a9acb9042a90 100644 (file)
@@ -86,7 +86,7 @@ again:
        else if (unlikely(err))
                return err;
 
-       *id = *id & MAX_ID_MASK;
+       *id = *id & MAX_IDR_MASK;
        return 0;
 }
 
index d552a96f08cd1bf1abd12f2977a5ffd35eef6da5..9ee42ca4d289754cb99b374a32eb8bdc59371e46 100644 (file)
@@ -88,7 +88,7 @@ again:
        else if (unlikely(err))
                return err;
 
-       *id = *id & MAX_ID_MASK;
+       *id = *id & MAX_IDR_MASK;
        return 0;
 }
 
index a783d533a1a6fe662e99cd26ebec87c6569bd9bf..5110f367f1f15884bb313468ff5f06453aafd7b4 100644 (file)
@@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma)
        if (mi < 0)
                return -EINVAL;
 
-       vma->vm_flags |= VM_IO | VM_RESERVED;
-
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        return remap_pfn_range(vma,
@@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma)
 
 static int uio_mmap_logical(struct vm_area_struct *vma)
 {
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &uio_vm_ops;
        uio_vma_open(vma);
        return 0;
index 91cd85076a44b1b22a3cd93f4fcb96d743228bc5..9a62e89d6dc0b720cf7cdd59b1530be024187c28 100644 (file)
@@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma)
 {
        /* don't do anything here: "fault" will set up page table entries */
        vma->vm_ops = &mon_bin_vm_ops;
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_private_data = filp->private_data;
        mon_bin_vma_open(vma);
        return 0;
index 6968b7232232a07c8f6cdd96e6634cad2435511b..384091a0bec0f8d7b37220a8af33958b519cb034 100644 (file)
@@ -461,7 +461,7 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
        }
 
        vma->vm_private_data = vdev;
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
+       vma->vm_flags |= VM_IO;
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
index a425d65d5ba2d5d2e66f513f5dfcfa29b62de186..fa44fbed397d0ee5eac4694efba7432fefb025e7 100644 (file)
@@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 #ifndef MMU
        /* this is uClinux (no MMU) specific code */
 
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_start = videomemory;
 
        return 0;
index 3f2e8c13f1ca745bd0d0fbe9ce51784823c276e4..868932f904ef21b85e12b4de1b260bed1c662065 100644 (file)
@@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        off = vma->vm_pgoff << PAGE_SHIFT;
        size = vma->vm_end - vma->vm_start;
 
-       /* To stop the swapper from even considering these pages. */
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
 
        if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) ||
            ((off == info->fix.smem_len) && (size == PAGE_SIZE)))
index cf282763a8dc941ab96554ed1cae855d959199de..03e5719c914a25e1bc8e5f34e71e666542b5e275 100644 (file)
@@ -229,13 +229,6 @@ config BACKLIGHT_HP700
          If you have an HP Jornada 700 series,
          say Y to include backlight control driver.
 
-config BACKLIGHT_PROGEAR
-       tristate "Frontpath ProGear Backlight Driver"
-       depends on PCI && X86
-       help
-         If you have a Frontpath ProGear say Y to enable the
-         backlight driver.
-
 config BACKLIGHT_CARILLO_RANCH
        tristate "Intel Carillo Ranch Backlight Driver"
        depends on LCD_CLASS_DEVICE && PCI && X86 && FB_LE80578
@@ -352,6 +345,22 @@ config BACKLIGHT_AAT2870
          If you have a AnalogicTech AAT2870 say Y to enable the
          backlight driver.
 
+config BACKLIGHT_LM3630
+       tristate "Backlight Driver for LM3630"
+       depends on BACKLIGHT_CLASS_DEVICE && I2C
+       select REGMAP_I2C
+       help
+         This supports the TI LM3630 backlight driver.
+
+config BACKLIGHT_LM3639
+       tristate "Backlight Driver for LM3639"
+       depends on BACKLIGHT_CLASS_DEVICE && I2C
+       select REGMAP_I2C
+       select NEW_LEDS
+       select LEDS_CLASS
+       help
+         This supports the TI LM3639 backlight + 1.5A flash LED driver.
+
 config BACKLIGHT_LP855X
        tristate "Backlight driver for TI LP855X"
        depends on BACKLIGHT_CLASS_DEVICE && I2C
index a2ac9cfbaf6bf1869e1c2b020b597952d4a7633a..7817e07f05f3d5beeddb3961b20290e9d44177e5 100644 (file)
@@ -23,10 +23,11 @@ obj-$(CONFIG_BACKLIGHT_HP700)       += jornada720_bl.o
 obj-$(CONFIG_BACKLIGHT_HP680)  += hp680_bl.o
 obj-$(CONFIG_BACKLIGHT_LM3533) += lm3533_bl.o
 obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o
+obj-$(CONFIG_BACKLIGHT_LM3630) += lm3630_bl.o
+obj-$(CONFIG_BACKLIGHT_LM3639) += lm3639_bl.o
 obj-$(CONFIG_BACKLIGHT_LP855X) += lp855x_bl.o
 obj-$(CONFIG_BACKLIGHT_OMAP1)  += omap1_bl.o
 obj-$(CONFIG_BACKLIGHT_PANDORA)        += pandora_bl.o
-obj-$(CONFIG_BACKLIGHT_PROGEAR) += progear_bl.o
 obj-$(CONFIG_BACKLIGHT_CARILLO_RANCH) += cr_bllcd.o
 obj-$(CONFIG_BACKLIGHT_PWM)    += pwm_bl.o
 obj-$(CONFIG_BACKLIGHT_DA903X) += da903x_bl.o
index b628d68f516284ea69f9ab204532d183d5ce0dc8..10485c927ac65a993c58068737d029e03335a5e9 100644 (file)
@@ -72,7 +72,7 @@ static int da9052_adjust_wled_brightness(struct da9052_bl *wleds)
        if (ret < 0)
                return ret;
 
-       msleep(10);
+       usleep_range(10000, 11000);
 
        if (wleds->brightness) {
                ret = da9052_reg_write(wleds->da9052, wled_bank[wleds->led_reg],
index 72dd5556a35bdbe825f4fb86033f82828f206bc3..6c5ed6b242cc06f87706fe7dd85a4700a743ca82 100644 (file)
@@ -34,9 +34,9 @@ static void kb3886_bl_set_intensity(int intensity)
        mutex_lock(&bl_mutex);
        intensity = intensity&0xff;
        outb(KB3886_ADC_DAC_PWM, KB3886_PARENT);
-       msleep(10);
+       usleep_range(10000, 11000);
        outb(KB3886_PWM0_WRITE, KB3886_IO);
-       msleep(10);
+       usleep_range(10000, 11000);
        outb(intensity, KB3886_IO);
        mutex_unlock(&bl_mutex);
 }
diff --git a/drivers/video/backlight/lm3630_bl.c b/drivers/video/backlight/lm3630_bl.c
new file mode 100644 (file)
index 0000000..dc19144
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ * Simple driver for Texas Instruments LM3630 Backlight driver chip
+ * Copyright (C) 2012 Texas Instruments
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/backlight.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/regmap.h>
+#include <linux/platform_data/lm3630_bl.h>
+
+#define REG_CTRL       0x00
+#define REG_CONFIG     0x01
+#define REG_BRT_A      0x03
+#define REG_BRT_B      0x04
+#define REG_INT_STATUS 0x09
+#define REG_INT_EN     0x0A
+#define REG_FAULT      0x0B
+#define REG_PWM_OUTLOW 0x12
+#define REG_PWM_OUTHIGH        0x13
+#define REG_MAX                0x1F
+
+#define INT_DEBOUNCE_MSEC      10
+
+enum lm3630_leds {
+       BLED_ALL = 0,
+       BLED_1,
+       BLED_2
+};
+
+static const char *bled_name[] = {
+       [BLED_ALL] = "lm3630_bled",     /* bank 1 controls all strings */
+       [BLED_1] = "lm3630_bled1",      /* bank 1 controls bled1 */
+       [BLED_2] = "lm3630_bled2",      /* bank 1 or 2 controls bled2 */
+};
+
+struct lm3630_chip_data {
+       struct device *dev;
+       struct delayed_work work;
+       int irq;
+       struct workqueue_struct *irqthread;
+       struct lm3630_platform_data *pdata;
+       struct backlight_device *bled1;
+       struct backlight_device *bled2;
+       struct regmap *regmap;
+};
+
+/* initialize chip */
+static int __devinit lm3630_chip_init(struct lm3630_chip_data *pchip)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3630_platform_data *pdata = pchip->pdata;
+
+       /*pwm control */
+       reg_val = ((pdata->pwm_active & 0x01) << 2) | (pdata->pwm_ctrl & 0x03);
+       ret = regmap_update_bits(pchip->regmap, REG_CONFIG, 0x07, reg_val);
+       if (ret < 0)
+               goto out;
+
+       /* bank control */
+       reg_val = ((pdata->bank_b_ctrl & 0x01) << 1) |
+                       (pdata->bank_a_ctrl & 0x07);
+       ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x07, reg_val);
+       if (ret < 0)
+               goto out;
+
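+       /* clear REG_CTRL bit 7 (assumed to be the chip's sleep bit) */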
+       ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+       if (ret < 0)
+               goto out;
+
+       /* set initial brightness */
+       if (pdata->bank_a_ctrl != BANK_A_CTRL_DISABLE) {
+               ret = regmap_write(pchip->regmap,
+                                  REG_BRT_A, pdata->init_brt_led1);
+               if (ret < 0)
+                       goto out;
+       }
+
+       if (pdata->bank_b_ctrl != BANK_B_CTRL_DISABLE) {
+               ret = regmap_write(pchip->regmap,
+                                  REG_BRT_B, pdata->init_brt_led2);
+               if (ret < 0)
+                       goto out;
+       }
+       return ret;
+
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return ret;
+}
+
+/* interrupt handling */
+static void lm3630_delayed_func(struct work_struct *work)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3630_chip_data *pchip;
+
+       pchip = container_of(work, struct lm3630_chip_data, work.work);
+
+       ret = regmap_read(pchip->regmap, REG_INT_STATUS, &reg_val);
+       if (ret < 0) {
+               dev_err(pchip->dev,
+                       "i2c failed to access REG_INT_STATUS Register\n");
+               return;
+       }
+
+       dev_info(pchip->dev, "REG_INT_STATUS Register is 0x%x\n", reg_val);
+}
+
+static irqreturn_t lm3630_isr_func(int irq, void *chip)
+{
+       int ret;
+       struct lm3630_chip_data *pchip = chip;
+       unsigned long delay = msecs_to_jiffies(INT_DEBOUNCE_MSEC);
+
+       queue_delayed_work(pchip->irqthread, &pchip->work, delay);
+
+       ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+       if (ret < 0)
+               goto out;
+
+       return IRQ_HANDLED;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return IRQ_HANDLED;
+}
+
+static int lm3630_intr_config(struct lm3630_chip_data *pchip)
+{
+       INIT_DELAYED_WORK(&pchip->work, lm3630_delayed_func);
+       int ret;
+
+       INIT_DELAYED_WORK(&pchip->work, lm3630_delayed_func);
+       pchip->irqthread = create_singlethread_workqueue("lm3630-irqthd");
+       if (!pchip->irqthread) {
+               dev_err(pchip->dev, "failed to create irq thread\n");
+               return -ENOMEM;
+       }
+       ret = request_threaded_irq(pchip->irq, NULL, lm3630_isr_func,
+                                  IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                                  "lm3630_irq", pchip);
+       if (ret) {
+               dev_err(pchip->dev, "failed to request threaded irq\n");
+               return ret;
+       }
+}
+
+static bool
+set_intensity(struct backlight_device *bl, struct lm3630_chip_data *pchip)
+{
+       if (!pchip->pdata->pwm_set_intensity)
+               return false;
+       pchip->pdata->pwm_set_intensity(bl->props.brightness - 1,
+                                       pchip->pdata->pwm_period);
+       return true;
+}
+
+/* update and get brightness */
+static int lm3630_bank_a_update_status(struct backlight_device *bl)
+{
+       int ret;
+       struct lm3630_chip_data *pchip = bl_get_data(bl);
+       enum lm3630_pwm_ctrl pwm_ctrl = pchip->pdata->pwm_ctrl;
+
+       /* brightness 0 means disable */
+       if (!bl->props.brightness) {
+               ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x04, 0x00);
+               if (ret < 0)
+                       goto out;
+               return bl->props.brightness;
+       }
+
+       /* pwm control */
+       if (pwm_ctrl == PWM_CTRL_BANK_A || pwm_ctrl == PWM_CTRL_BANK_ALL) {
+               if (!set_intensity(bl, pchip))
+                       dev_err(pchip->dev, "No pwm control func. in plat-data\n");
+       } else {
+
+               /* i2c control */
+               ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+               if (ret < 0)
+                       goto out;
+               mdelay(1);
+               ret = regmap_write(pchip->regmap,
+                                  REG_BRT_A, bl->props.brightness - 1);
+               if (ret < 0)
+                       goto out;
+       }
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access REG_CTRL\n");
+       return bl->props.brightness;
+}
+
+static int lm3630_bank_a_get_brightness(struct backlight_device *bl)
+{
+       unsigned int reg_val;
+       int brightness, ret;
+       struct lm3630_chip_data *pchip = bl_get_data(bl);
+       enum lm3630_pwm_ctrl pwm_ctrl = pchip->pdata->pwm_ctrl;
+
+       if (pwm_ctrl == PWM_CTRL_BANK_A || pwm_ctrl == PWM_CTRL_BANK_ALL) {
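+               /* 9-bit PWM level: bit 8 in OUTHIGH, bits 7:0 in OUTLOW */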
+               ret = regmap_read(pchip->regmap, REG_PWM_OUTHIGH, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = reg_val & 0x01;
+               ret = regmap_read(pchip->regmap, REG_PWM_OUTLOW, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = ((brightness << 8) | reg_val) + 1;
+       } else {
+               ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+               if (ret < 0)
+                       goto out;
+               mdelay(1);
+               ret = regmap_read(pchip->regmap, REG_BRT_A, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = reg_val + 1;
+       }
+       bl->props.brightness = brightness;
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return 0;
+}
+
+static const struct backlight_ops lm3630_bank_a_ops = {
+       .options = BL_CORE_SUSPENDRESUME,
+       .update_status = lm3630_bank_a_update_status,
+       .get_brightness = lm3630_bank_a_get_brightness,
+};
+
+static int lm3630_bank_b_update_status(struct backlight_device *bl)
+{
+       int ret;
+       struct lm3630_chip_data *pchip = bl_get_data(bl);
+       enum lm3630_pwm_ctrl pwm_ctrl = pchip->pdata->pwm_ctrl;
+
+       if (pwm_ctrl == PWM_CTRL_BANK_B || pwm_ctrl == PWM_CTRL_BANK_ALL) {
+               if (!set_intensity(bl, pchip))
+                       dev_err(pchip->dev,
+                               "no pwm control func. in plat-data\n");
+       } else {
+               ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+               if (ret < 0)
+                       goto out;
+               mdelay(1);
+               ret = regmap_write(pchip->regmap,
+                                  REG_BRT_B, bl->props.brightness - 1);
+       }
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return bl->props.brightness;
+}
+
+static int lm3630_bank_b_get_brightness(struct backlight_device *bl)
+{
+       unsigned int reg_val;
+       int brightness, ret;
+       struct lm3630_chip_data *pchip = bl_get_data(bl);
+       enum lm3630_pwm_ctrl pwm_ctrl = pchip->pdata->pwm_ctrl;
+
+       if (pwm_ctrl == PWM_CTRL_BANK_B || pwm_ctrl == PWM_CTRL_BANK_ALL) {
+               ret = regmap_read(pchip->regmap, REG_PWM_OUTHIGH, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = reg_val & 0x01;
+               ret = regmap_read(pchip->regmap, REG_PWM_OUTLOW, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = ((brightness << 8) | reg_val) + 1;
+       } else {
+               ret = regmap_update_bits(pchip->regmap, REG_CTRL, 0x80, 0x00);
+               if (ret < 0)
+                       goto out;
+               mdelay(1);
+               ret = regmap_read(pchip->regmap, REG_BRT_B, &reg_val);
+               if (ret < 0)
+                       goto out;
+               brightness = reg_val + 1;
+       }
+       bl->props.brightness = brightness;
+
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return bl->props.brightness;
+}
+
+static const struct backlight_ops lm3630_bank_b_ops = {
+       .options = BL_CORE_SUSPENDRESUME,
+       .update_status = lm3630_bank_b_update_status,
+       .get_brightness = lm3630_bank_b_get_brightness,
+};
+
+static int lm3630_backlight_register(struct lm3630_chip_data *pchip,
+                                    enum lm3630_leds ledno)
+{
+       const char *name = bled_name[ledno];
+       struct backlight_properties props;
+       struct lm3630_platform_data *pdata = pchip->pdata;
+
+       props.type = BACKLIGHT_RAW;
+       switch (ledno) {
+       case BLED_1:
+       case BLED_ALL:
+               props.brightness = pdata->init_brt_led1;
+               props.max_brightness = pdata->max_brt_led1;
+               pchip->bled1 =
+                   backlight_device_register(name, pchip->dev, pchip,
+                                             &lm3630_bank_a_ops, &props);
+               if (IS_ERR(pchip->bled1))
+                       return -EIO;
+               break;
+       case BLED_2:
+               props.brightness = pdata->init_brt_led2;
+               props.max_brightness = pdata->max_brt_led2;
+               pchip->bled2 =
+                   backlight_device_register(name, pchip->dev, pchip,
+                                             &lm3630_bank_b_ops, &props);
+               if (IS_ERR(pchip->bled2))
+                       return -EIO;
+               break;
+       }
+       return 0;
+}
+
+static void lm3630_backlight_unregister(struct lm3630_chip_data *pchip)
+{
+       if (pchip->bled1)
+               backlight_device_unregister(pchip->bled1);
+       if (pchip->bled2)
+               backlight_device_unregister(pchip->bled2);
+}
+
+static const struct regmap_config lm3630_regmap = {
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = REG_MAX,
+};
+
+static int __devinit lm3630_probe(struct i2c_client *client,
+                                 const struct i2c_device_id *id)
+{
+       struct lm3630_platform_data *pdata = client->dev.platform_data;
+       struct lm3630_chip_data *pchip;
+       int ret;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(&client->dev, "i2c functionality check failed\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (pdata == NULL) {
+               dev_err(&client->dev, "no platform data\n");
+               return -ENODATA;
+       }
+
+       pchip = devm_kzalloc(&client->dev, sizeof(struct lm3630_chip_data),
+                            GFP_KERNEL);
+       if (!pchip)
+               return -ENOMEM;
+       pchip->pdata = pdata;
+       pchip->dev = &client->dev;
+
+       pchip->regmap = devm_regmap_init_i2c(client, &lm3630_regmap);
+       if (IS_ERR(pchip->regmap)) {
+               ret = PTR_ERR(pchip->regmap);
+               dev_err(&client->dev, "failed to allocate register map: %d\n",
+                       ret);
+               return ret;
+       }
+       i2c_set_clientdata(client, pchip);
+
+       /* chip initialize */
+       ret = lm3630_chip_init(pchip);
+       if (ret < 0) {
+               dev_err(&client->dev, "failed to initialize chip\n");
+               goto err_chip_init;
+       }
+
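+       /*
+        * If bank A already drives both strings (ALL) or drives LED2,
+        * force bank B off so the same string is never driven twice.
+        */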
+       switch (pdata->bank_a_ctrl) {
+       case BANK_A_CTRL_ALL:
+               ret = lm3630_backlight_register(pchip, BLED_ALL);
+               pdata->bank_b_ctrl = BANK_B_CTRL_DISABLE;
+               break;
+       case BANK_A_CTRL_LED1:
+               ret = lm3630_backlight_register(pchip, BLED_1);
+               break;
+       case BANK_A_CTRL_LED2:
+               ret = lm3630_backlight_register(pchip, BLED_2);
+               pdata->bank_b_ctrl = BANK_B_CTRL_DISABLE;
+               break;
+       default:
+               break;
+       }
+
+       if (ret < 0)
+               goto err_bl_reg;
+
+       if (pdata->bank_b_ctrl && pchip->bled2 == NULL) {
+               ret = lm3630_backlight_register(pchip, BLED_2);
+               if (ret < 0)
+                       goto err_bl_reg;
+       }
+
+       /* interrupt enable: irq 0 is not allowed for lm3630 */
+       pchip->irq = client->irq;
+       if (pchip->irq)
+               lm3630_intr_config(pchip);
+
+       dev_info(&client->dev, "LM3630 backlight registered\n");
+       return 0;
+
+err_bl_reg:
+       dev_err(&client->dev, "failed to register backlight\n");
+       lm3630_backlight_unregister(pchip);
+err_chip_init:
+       return ret;
+}
+
+static int __devexit lm3630_remove(struct i2c_client *client)
+{
+       int ret;
+       struct lm3630_chip_data *pchip = i2c_get_clientdata(client);
+
+       ret = regmap_write(pchip->regmap, REG_BRT_A, 0);
+       if (ret < 0)
+               dev_err(pchip->dev, "i2c failed to access register\n");
+
+       ret = regmap_write(pchip->regmap, REG_BRT_B, 0);
+       if (ret < 0)
+               dev_err(pchip->dev, "i2c failed to access register\n");
+
+       lm3630_backlight_unregister(pchip);
+       if (pchip->irq) {
+               free_irq(pchip->irq, pchip);
+               flush_workqueue(pchip->irqthread);
+               destroy_workqueue(pchip->irqthread);
+       }
+       return 0;
+}
+
+static const struct i2c_device_id lm3630_id[] = {
+       {LM3630_NAME, 0},
+       {}
+};
+
+MODULE_DEVICE_TABLE(i2c, lm3630_id);
+
+static struct i2c_driver lm3630_i2c_driver = {
+       .driver = {
+                  .name = LM3630_NAME,
+                  },
+       .probe = lm3630_probe,
+       .remove = __devexit_p(lm3630_remove),
+       .id_table = lm3630_id,
+};
+
+module_i2c_driver(lm3630_i2c_driver);
+
+MODULE_DESCRIPTION("Texas Instruments Backlight driver for LM3630");
+MODULE_AUTHOR("G.Shark Jeong <gshark.jeong@gmail.com>");
+MODULE_AUTHOR("Daniel Jeong <daniel.jeong@ti.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/video/backlight/lm3639_bl.c b/drivers/video/backlight/lm3639_bl.c
new file mode 100644 (file)
index 0000000..c6915c6
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * Simple driver for Texas Instruments LM3639 Backlight + Flash LED driver chip
+ * Copyright (C) 2012 Texas Instruments
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/leds.h>
+#include <linux/backlight.h>
+#include <linux/err.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/regmap.h>
+#include <linux/platform_data/lm3639_bl.h>
+
+#define REG_DEV_ID     0x00
+#define REG_CHECKSUM   0x01
+#define REG_BL_CONF_1  0x02
+#define REG_BL_CONF_2  0x03
+#define REG_BL_CONF_3  0x04
+#define REG_BL_CONF_4  0x05
+#define REG_FL_CONF_1  0x06
+#define REG_FL_CONF_2  0x07
+#define REG_FL_CONF_3  0x08
+#define REG_IO_CTRL    0x09
+#define REG_ENABLE     0x0A
+#define REG_FLAG       0x0B
+#define REG_MAX                REG_FLAG
+
+struct lm3639_chip_data {
+       struct device *dev;
+       struct lm3639_platform_data *pdata;
+
+       struct backlight_device *bled;
+       struct led_classdev cdev_flash;
+       struct led_classdev cdev_torch;
+       struct regmap *regmap;
+
+       unsigned int bled_mode;
+       unsigned int bled_map;
+       unsigned int last_flag;
+};
+
+/* initialize chip */
+static int __devinit lm3639_chip_init(struct lm3639_chip_data *pchip)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3639_platform_data *pdata = pchip->pdata;
+
+       /* input pins config. */
+       ret =
+           regmap_update_bits(pchip->regmap, REG_BL_CONF_1, 0x08,
+                              pdata->pin_pwm);
+       if (ret < 0)
+               goto out;
+
+       reg_val = (pdata->pin_pwm & 0x40) | pdata->pin_strobe | pdata->pin_tx;
+       ret = regmap_update_bits(pchip->regmap, REG_IO_CTRL, 0x7C, reg_val);
+       if (ret < 0)
+               goto out;
+
+       /* init brightness */
+       ret = regmap_write(pchip->regmap, REG_BL_CONF_4, pdata->init_brt_led);
+       if (ret < 0)
+               goto out;
+
+       ret = regmap_write(pchip->regmap, REG_BL_CONF_3, pdata->init_brt_led);
+       if (ret < 0)
+               goto out;
+
+       /* output pins config. */
+       if (!pdata->init_brt_led)
+               reg_val = pdata->fled_pins | pdata->bled_pins;
+       else
+               reg_val = pdata->fled_pins | pdata->bled_pins | 0x01;
+
+       ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x79, reg_val);
+       if (ret < 0)
+               goto out;
+
+       return ret;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return ret;
+}
+
+/* update and get brightness */
+static int lm3639_bled_update_status(struct backlight_device *bl)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3639_chip_data *pchip = bl_get_data(bl);
+       struct lm3639_platform_data *pdata = pchip->pdata;
+
+       ret = regmap_read(pchip->regmap, REG_FLAG, &reg_val);
+       if (ret < 0)
+               goto out;
+
+       if (reg_val != 0)
+               dev_info(pchip->dev, "last flag is 0x%x\n", reg_val);
+
+       /* pwm control */
+       if (pdata->pin_pwm) {
+               if (pdata->pwm_set_intensity)
+                       pdata->pwm_set_intensity(bl->props.brightness,
+                                                pdata->max_brt_led);
+               else
+                       dev_err(pchip->dev,
+                               "No pwm control func. in plat-data\n");
+               return bl->props.brightness;
+       }
+
+       /* i2c control and set brightness */
+       ret = regmap_write(pchip->regmap, REG_BL_CONF_4, bl->props.brightness);
+       if (ret < 0)
+               goto out;
+       ret = regmap_write(pchip->regmap, REG_BL_CONF_3, bl->props.brightness);
+       if (ret < 0)
+               goto out;
+
+       if (!bl->props.brightness)
+               ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x01, 0x00);
+       else
+               ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x01, 0x01);
+       if (ret < 0)
+               goto out;
+
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access registers\n");
+       return bl->props.brightness;
+}
+
+static int lm3639_bled_get_brightness(struct backlight_device *bl)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3639_chip_data *pchip = bl_get_data(bl);
+       struct lm3639_platform_data *pdata = pchip->pdata;
+
+       if (pdata->pin_pwm) {
+               if (pdata->pwm_get_intensity)
+                       bl->props.brightness = pdata->pwm_get_intensity();
+               else
+                       dev_err(pchip->dev,
+                               "No pwm control func. in plat-data\n");
+               return bl->props.brightness;
+       }
+
+       ret = regmap_read(pchip->regmap, REG_BL_CONF_1, &reg_val);
+       if (ret < 0)
+               goto out;
+       if (reg_val & 0x10)
+               ret = regmap_read(pchip->regmap, REG_BL_CONF_4, &reg_val);
+       else
+               ret = regmap_read(pchip->regmap, REG_BL_CONF_3, &reg_val);
+       if (ret < 0)
+               goto out;
+       bl->props.brightness = reg_val;
+
+       return bl->props.brightness;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return bl->props.brightness;
+}
+
+static const struct backlight_ops lm3639_bled_ops = {
+       .options = BL_CORE_SUSPENDRESUME,
+       .update_status = lm3639_bled_update_status,
+       .get_brightness = lm3639_bled_get_brightness,
+};
+
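Once the device is registered further down as "lm3639_bled", the backlight core exposes these ops through sysfs, so a userspace write to the class brightness file lands in lm3639_bled_update_status(). A minimal userspace sketch, assuming the standard backlight class path derived from that name (the brightness value and error handling are illustrative only):

/* Userspace sketch: drive the backlight through its sysfs class file. */
#include <stdio.h>

int main(void)
{
	/* path assumed from the "lm3639_bled" name used at registration */
	FILE *f = fopen("/sys/class/backlight/lm3639_bled/brightness", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 32);	/* example value; core caps it at max_brightness */
	return fclose(f) ? 1 : 0;
}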
+/* backlight mapping mode */
+static ssize_t lm3639_bled_mode_store(struct device *dev,
+                                     struct device_attribute *devAttr,
+                                     const char *buf, size_t size)
+{
+       ssize_t ret;
+       struct lm3639_chip_data *pchip = dev_get_drvdata(dev);
+       unsigned int state;
+
+       ret = kstrtouint(buf, 10, &state);
+       if (ret)
+               goto out_input;
+
+       if (!state)
+               ret = regmap_update_bits(pchip->regmap, REG_BL_CONF_1,
+                                        0x10, 0x00);
+       else
+               ret = regmap_update_bits(pchip->regmap, REG_BL_CONF_1,
+                                        0x10, 0x10);
+
+       if (ret < 0)
+               goto out;
+
+       return size;
+
+out:
+       dev_err(pchip->dev, "%s:i2c access fail to register\n", __func__);
+       return size;
+
+out_input:
+       dev_err(pchip->dev, "%s:input conversion fail\n", __func__);
+       return size;
+
+}
+
+static DEVICE_ATTR(bled_mode, 0200, NULL, lm3639_bled_mode_store);
+
+/* torch */
+static void lm3639_torch_brightness_set(struct led_classdev *cdev,
+                                       enum led_brightness brightness)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3639_chip_data *pchip;
+
+       pchip = container_of(cdev, struct lm3639_chip_data, cdev_torch);
+
+       ret = regmap_read(pchip->regmap, REG_FLAG, &reg_val);
+       if (ret < 0)
+               goto out;
+       if (reg_val != 0)
+               dev_info(pchip->dev, "last flag is 0x%x\n", reg_val);
+
+       /* brightness 0 means off state */
+       if (!brightness) {
+               ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x06, 0x00);
+               if (ret < 0)
+                       goto out;
+               return;
+       }
+
+       ret = regmap_update_bits(pchip->regmap,
+                                REG_FL_CONF_1, 0x70, (brightness - 1) << 4);
+       if (ret < 0)
+               goto out;
+       ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x06, 0x02);
+       if (ret < 0)
+               goto out;
+
+       return;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return;
+}
+
+/* flash */
+static void lm3639_flash_brightness_set(struct led_classdev *cdev,
+                                       enum led_brightness brightness)
+{
+       int ret;
+       unsigned int reg_val;
+       struct lm3639_chip_data *pchip;
+
+       pchip = container_of(cdev, struct lm3639_chip_data, cdev_flash);
+
+       ret = regmap_read(pchip->regmap, REG_FLAG, &reg_val);
+       if (ret < 0)
+               goto out;
+       if (reg_val != 0)
+               dev_info(pchip->dev, "last flag is 0x%x\n", reg_val);
+
+       /* torch off before flash control */
+       ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x06, 0x00);
+       if (ret < 0)
+               goto out;
+
+       /* brightness 0 means off state */
+       if (!brightness)
+               return;
+
+       ret = regmap_update_bits(pchip->regmap,
+                                REG_FL_CONF_1, 0x0F, brightness - 1);
+       if (ret < 0)
+               goto out;
+       ret = regmap_update_bits(pchip->regmap, REG_ENABLE, 0x06, 0x06);
+       if (ret < 0)
+               goto out;
+
+       return;
+out:
+       dev_err(pchip->dev, "i2c failed to access register\n");
+       return;
+}
+
+static const struct regmap_config lm3639_regmap = {
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = REG_MAX,
+};
+
+static int __devinit lm3639_probe(struct i2c_client *client,
+                                 const struct i2c_device_id *id)
+{
+       int ret;
+       struct lm3639_chip_data *pchip;
+       struct lm3639_platform_data *pdata = client->dev.platform_data;
+       struct backlight_properties props;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(&client->dev, "i2c functionality check fail.\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (pdata == NULL) {
+               dev_err(&client->dev, "Needs Platform Data.\n");
+               return -ENODATA;
+       }
+
+       pchip = devm_kzalloc(&client->dev,
+                            sizeof(struct lm3639_chip_data), GFP_KERNEL);
+       if (!pchip)
+               return -ENOMEM;
+
+       pchip->pdata = pdata;
+       pchip->dev = &client->dev;
+
+       pchip->regmap = devm_regmap_init_i2c(client, &lm3639_regmap);
+       if (IS_ERR(pchip->regmap)) {
+               ret = PTR_ERR(pchip->regmap);
+               dev_err(&client->dev, "fail : allocate register map: %d\n",
+                       ret);
+               return ret;
+       }
+       i2c_set_clientdata(client, pchip);
+
+       /* chip initialize */
+       ret = lm3639_chip_init(pchip);
+       if (ret < 0) {
+               dev_err(&client->dev, "fail : chip init\n");
+               goto err_out;
+       }
+
+       /* backlight */
+       memset(&props, 0, sizeof(props));
+       props.type = BACKLIGHT_RAW;
+       props.brightness = pdata->init_brt_led;
+       props.max_brightness = pdata->max_brt_led;
+       pchip->bled = backlight_device_register("lm3639_bled", pchip->dev,
+                                               pchip, &lm3639_bled_ops,
+                                               &props);
+       if (IS_ERR(pchip->bled)) {
+               dev_err(&client->dev, "failed to register backlight\n");
+               ret = PTR_ERR(pchip->bled);
+               goto err_out;
+       }
+
+       ret = device_create_file(&(pchip->bled->dev), &dev_attr_bled_mode);
+       if (ret < 0) {
+               dev_err(&client->dev, "failed : add sysfs entries\n");
+               ret = -EIO;
+               goto err_bled_mode;
+       }
+
+       /* flash */
+       pchip->cdev_flash.name = "lm3639_flash";
+       pchip->cdev_flash.max_brightness = 16;
+       pchip->cdev_flash.brightness_set = lm3639_flash_brightness_set;
+       ret = led_classdev_register(&client->dev, &pchip->cdev_flash);
+       if (ret < 0) {
+               dev_err(&client->dev, "failed to register flash\n");
+               goto err_flash;
+       }
+
+       /* torch */
+       pchip->cdev_torch.name = "lm3639_torch";
+       pchip->cdev_torch.max_brightness = 8;
+       pchip->cdev_torch.brightness_set = lm3639_torch_brightness_set;
+       ret = led_classdev_register(&client->dev, &pchip->cdev_torch);
+       if (ret < 0) {
+               dev_err(&client->dev, "failed to register torch\n");
+               goto err_torch;
+       }
+
+       return 0;
+
+err_torch:
+       led_classdev_unregister(&pchip->cdev_flash);
+err_flash:
+       device_remove_file(&(pchip->bled->dev), &dev_attr_bled_mode);
+err_bled_mode:
+       backlight_device_unregister(pchip->bled);
+err_out:
+       return ret;
+}
+
+static int __devexit lm3639_remove(struct i2c_client *client)
+{
+       struct lm3639_chip_data *pchip = i2c_get_clientdata(client);
+
+       regmap_write(pchip->regmap, REG_ENABLE, 0x00);
+
+       if (&pchip->cdev_torch)
+               led_classdev_unregister(&pchip->cdev_torch);
+       if (&pchip->cdev_flash)
+               led_classdev_unregister(&pchip->cdev_flash);
+       if (pchip->bled) {
+               device_remove_file(&(pchip->bled->dev), &dev_attr_bled_mode);
+               backlight_device_unregister(pchip->bled);
+       }
+       return 0;
+}
+
+static const struct i2c_device_id lm3639_id[] = {
+       {LM3639_NAME, 0},
+       {}
+};
+
+MODULE_DEVICE_TABLE(i2c, lm3639_id);
+static struct i2c_driver lm3639_i2c_driver = {
+       .driver = {
+                  .name = LM3639_NAME,
+                  },
+       .probe = lm3639_probe,
+       .remove = __devexit_p(lm3639_remove),
+       .id_table = lm3639_id,
+};
+
+module_i2c_driver(lm3639_i2c_driver);
+
+MODULE_DESCRIPTION("Texas Instruments Backlight+Flash LED driver for LM3639");
+MODULE_AUTHOR("Daniel Jeong <daniel.jeong@ti.com>");
+MODULE_AUTHOR("G.Shark Jeong <gshark.jeong@gmail.com>");
+MODULE_LICENSE("GPL v2");
index 6c0f1ac0d32a93d9f278ad21155d2ca9d81b22d9..4066a5bbd826297928b2c01bba5e4b6587614968 100644 (file)
@@ -75,7 +75,7 @@ static int ltv350qv_power_on(struct ltv350qv *lcd)
        /* Power On Reset Display off State */
        if (ltv350qv_write_reg(lcd, LTV_PWRCTL1, 0x0000))
                goto err;
-       msleep(15);
+       usleep_range(15000, 16000);
 
        /* Power Setting Function 1 */
        if (ltv350qv_write_reg(lcd, LTV_PWRCTL1, LTV_VCOM_DISABLE))
@@ -153,7 +153,7 @@ err_settings:
 err_power2:
 err_power1:
        ltv350qv_write_reg(lcd, LTV_PWRCTL2, 0x0000);
-       msleep(1);
+       usleep_range(1000, 1100);
 err:
        ltv350qv_write_reg(lcd, LTV_PWRCTL1, LTV_VCOM_DISABLE);
        return -EIO;
@@ -175,7 +175,7 @@ static int ltv350qv_power_off(struct ltv350qv *lcd)
        ret |= ltv350qv_write_reg(lcd, LTV_PWRCTL2, 0x0000);
 
        /* Wait at least 1 ms */
-       msleep(1);
+       usleep_range(1000, 1100);
 
        /* Power down setting 2 */
        ret |= ltv350qv_write_reg(lcd, LTV_PWRCTL1, LTV_VCOM_DISABLE);
diff --git a/drivers/video/backlight/progear_bl.c b/drivers/video/backlight/progear_bl.c
deleted file mode 100644 (file)
index 69b35f0..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- *  Backlight Driver for Frontpath ProGear HX1050+
- *
- *  Copyright (c) 2006 Marcin Juszkiewicz
- *
- *  Based on Progear LCD driver by M Schacht
- *  <mschacht at alumni dot washington dot edu>
- *
- *  Based on Sharp's Corgi Backlight Driver
- *  Based on Backlight Driver for HP Jornada 680
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mutex.h>
-#include <linux/fb.h>
-#include <linux/backlight.h>
-#include <linux/pci.h>
-
-#define PMU_LPCR               0xB0
-#define SB_MPS1                0x61
-#define HW_LEVEL_MAX           0x77
-#define HW_LEVEL_MIN           0x4f
-
-static struct pci_dev *pmu_dev = NULL;
-static struct pci_dev *sb_dev = NULL;
-
-static int progearbl_set_intensity(struct backlight_device *bd)
-{
-       int intensity = bd->props.brightness;
-
-       if (bd->props.power != FB_BLANK_UNBLANK)
-               intensity = 0;
-       if (bd->props.fb_blank != FB_BLANK_UNBLANK)
-               intensity = 0;
-
-       pci_write_config_byte(pmu_dev, PMU_LPCR, intensity + HW_LEVEL_MIN);
-
-       return 0;
-}
-
-static int progearbl_get_intensity(struct backlight_device *bd)
-{
-       u8 intensity;
-       pci_read_config_byte(pmu_dev, PMU_LPCR, &intensity);
-
-       return intensity - HW_LEVEL_MIN;
-}
-
-static const struct backlight_ops progearbl_ops = {
-       .get_brightness = progearbl_get_intensity,
-       .update_status = progearbl_set_intensity,
-};
-
-static int progearbl_probe(struct platform_device *pdev)
-{
-       struct backlight_properties props;
-       u8 temp;
-       struct backlight_device *progear_backlight_device;
-       int ret;
-
-       pmu_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M7101, NULL);
-       if (!pmu_dev) {
-               pr_err("ALI M7101 PMU not found.\n");
-               return -ENODEV;
-       }
-
-       sb_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
-       if (!sb_dev) {
-               pr_err("ALI 1533 SB not found.\n");
-               ret = -ENODEV;
-               goto put_pmu;
-       }
-
-       /*     Set SB_MPS1 to enable brightness control. */
-       pci_read_config_byte(sb_dev, SB_MPS1, &temp);
-       pci_write_config_byte(sb_dev, SB_MPS1, temp | 0x20);
-
-       memset(&props, 0, sizeof(struct backlight_properties));
-       props.type = BACKLIGHT_RAW;
-       props.max_brightness = HW_LEVEL_MAX - HW_LEVEL_MIN;
-       progear_backlight_device = backlight_device_register("progear-bl",
-                                                            &pdev->dev, NULL,
-                                                            &progearbl_ops,
-                                                            &props);
-       if (IS_ERR(progear_backlight_device)) {
-               ret = PTR_ERR(progear_backlight_device);
-               goto put_sb;
-       }
-
-       platform_set_drvdata(pdev, progear_backlight_device);
-
-       progear_backlight_device->props.power = FB_BLANK_UNBLANK;
-       progear_backlight_device->props.brightness = HW_LEVEL_MAX - HW_LEVEL_MIN;
-       progearbl_set_intensity(progear_backlight_device);
-
-       return 0;
-put_sb:
-       pci_dev_put(sb_dev);
-put_pmu:
-       pci_dev_put(pmu_dev);
-       return ret;
-}
-
-static int progearbl_remove(struct platform_device *pdev)
-{
-       struct backlight_device *bd = platform_get_drvdata(pdev);
-       backlight_device_unregister(bd);
-
-       return 0;
-}
-
-static struct platform_driver progearbl_driver = {
-       .probe = progearbl_probe,
-       .remove = progearbl_remove,
-       .driver = {
-                  .name = "progear-bl",
-                  },
-};
-
-static struct platform_device *progearbl_device;
-
-static int __init progearbl_init(void)
-{
-       int ret = platform_driver_register(&progearbl_driver);
-
-       if (ret)
-               return ret;
-       progearbl_device = platform_device_register_simple("progear-bl", -1,
-                                                               NULL, 0);
-       if (IS_ERR(progearbl_device)) {
-               platform_driver_unregister(&progearbl_driver);
-               return PTR_ERR(progearbl_device);
-       }
-
-       return 0;
-}
-
-static void __exit progearbl_exit(void)
-{
-       pci_dev_put(pmu_dev);
-       pci_dev_put(sb_dev);
-
-       platform_device_unregister(progearbl_device);
-       platform_driver_unregister(&progearbl_driver);
-}
-
-module_init(progearbl_init);
-module_exit(progearbl_exit);
-
-MODULE_AUTHOR("Marcin Juszkiewicz <linux@hrw.one.pl>");
-MODULE_DESCRIPTION("ProGear Backlight Driver");
-MODULE_LICENSE("GPL");
index 995f0164c9b082c7da2836123adfcb2b7f10a6c7..0c910238eaf3a19e9c68235626aa636cb63c0b6c 100644 (file)
@@ -143,6 +143,11 @@ static int pwm_backlight_parse_dt(struct device *dev,
 
                data->dft_brightness = value;
                data->max_brightness--;
+
+               ret = of_property_read_u32(node, "low_threshold_brightness",
+                                          &value);
+               if (!ret)
+                       data->lth_brightness = value;
        }
 
        /*
@@ -213,7 +218,7 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        pb->exit = data->exit;
        pb->dev = &pdev->dev;
 
-       pb->pwm = pwm_get(&pdev->dev, NULL);
+       pb->pwm = devm_pwm_get(&pdev->dev, NULL);
        if (IS_ERR(pb->pwm)) {
                dev_err(&pdev->dev, "unable to request PWM, trying legacy API\n");
 
@@ -246,7 +251,7 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        if (IS_ERR(bl)) {
                dev_err(&pdev->dev, "failed to register backlight\n");
                ret = PTR_ERR(bl);
-               goto err_bl;
+               goto err_alloc;
        }
 
        bl->props.brightness = data->dft_brightness;
@@ -255,8 +260,6 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        platform_set_drvdata(pdev, bl);
        return 0;
 
-err_bl:
-       pwm_put(pb->pwm);
 err_alloc:
        if (data->exit)
                data->exit(&pdev->dev);
@@ -271,7 +274,6 @@ static int pwm_backlight_remove(struct platform_device *pdev)
        backlight_device_unregister(bl);
        pwm_config(pb->pwm, 0, pb->period);
        pwm_disable(pb->pwm);
-       pwm_put(pb->pwm);
        if (pb->exit)
                pb->exit(&pdev->dev);
        return 0;
index 60a787fa32cfe97ddc8cdbb2999492e4da99144d..7d106f1f49069f78a43feb0dd2bdaee3befa43ca 100644 (file)
@@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info,
                                vma->vm_page_prot))
                return -EAGAIN;
 
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
        return 0;
-
 }
 
 static struct fb_ops unifb_ops = {
index 64cda560c488358c9205d66e064657bd6ba7203e..88cad6b8b479d44d4b402044a3d23460f64c8c2c 100644 (file)
@@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = {
 static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
        vma->vm_ops = &fb_deferred_io_vm_ops;
-       vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND );
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        if (!(info->flags & FBINFO_VIRTFB))
                vma->vm_flags |= VM_IO;
        vma->vm_private_data = info;
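The conversion pattern repeated in the hunks around here is mechanical: VM_RESERVED is dropped, and a driver either relies on remap_pfn_range()/io_remap_pfn_range() to set VM_IO | VM_DONTEXPAND | VM_DONTDUMP for it, or, as in fb_deferred_io above, sets the flags itself. A minimal sketch of a converted mmap handler, with invented foo_* names standing in for a real driver:

/* Illustrative sketch only; the foo_* names are not from this patch. */
#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct foo_vm_ops;

static int foo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* no remap_pfn_range() here, so set the flags explicitly:
	 * the VMA must not grow and should stay out of core dumps */
	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &foo_vm_ops;
	vma->vm_private_data = file->private_data;
	return 0;
}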
index 0dff12a1daef26af52949a6ff52f3b5c9de61a61..3ff0105a496a592cc9c1136c585efc463aa9de49 100644 (file)
@@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
                return -EINVAL;
        off += start;
        vma->vm_pgoff = off >> PAGE_SHIFT;
-       /* This is an IO map - tell maydump to skip this VMA */
-       vma->vm_flags |= VM_IO | VM_RESERVED;
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        fb_pgprotect(file, vma, off);
        if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
index 9b79d3835f04732c7b4e44dc5d0d5116031ac4aa..7a65dd633e08d0ce0497b2b91a13e7a6b6ecb66c 100644 (file)
@@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info,
        pgprot_val(vma->vm_page_prot) =
                pgprot_fb(pgprot_val(vma->vm_page_prot));
 
-       vma->vm_flags |= VM_IO | VM_RESERVED;
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
 
        /* look for the starting tile */
        tile = &gbe_tiles.cpu[offset >> TILE_SHIFT];
index f4477272d3c906b0d414a9e75f1c735414031218..9488d0215c58c568a5c5af790c3be5367bac7fcc 100644 (file)
@@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma)
        DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off);
 
        vma->vm_pgoff = off >> PAGE_SHIFT;
-       vma->vm_flags |= VM_IO | VM_RESERVED;
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
        vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
        vma->vm_ops = &mmap_user_ops;
        vma->vm_private_data = rg;
index 3c1de981a18cf77a49fbe2b9239112e29ecfe547..296afae442f4853cf1a0ddb81d4bd2df15eaaa2a 100644 (file)
@@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map,
 
        off = vma->vm_pgoff << PAGE_SHIFT;
 
-       /* To stop the swapper from even considering these pages */
-       vma->vm_flags |= (VM_IO | VM_RESERVED);
-       
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
        /* Each page, see which map applies */
index 5533a32c6ca132e8c981b7306c88d354fea7a23c..97bd6620c36494d993b0d77877a2531be80250c3 100644 (file)
@@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma)
                        size = 0;
        }
 
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
        return 0;
 }
 
index 8af64148294b88074a269f54691d05a9cb00a547..f45eba3d6150f73b6e12a3e78bba58dd389a3a90 100644 (file)
@@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma)
                        size = 0;
        }
 
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
        return 0;
 }
 
index 970e43d13f52bdce1a4a5cfc4e782745ddf53657..89aef343e2951b71f3a34d4377aa7aaa27b63fcb 100644 (file)
@@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma)
        offset += vinfo->vram_start;
        pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
        pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT;
-       vma->vm_flags |= VM_RESERVED | VM_IO;
        if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT,
                                                size, vma->vm_page_prot))
                return -EAGAIN;
index 501a922aa9dc425c0d45258c8fb19ac897d51da9..c7f692525b8849a64a492577bde6861c55bae71d 100644 (file)
@@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info,
                        size = 0;
        }
 
-       vma->vm_flags |= VM_RESERVED;   /* avoid to swap out this VMA */
        return 0;
 
 }
index 934985d14c2459f6893a8b1c4712b98207830d8a..4097987b330e6235d3ddf5df79119f15c7bf9b10 100644 (file)
@@ -535,7 +535,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma)
 
        vma->vm_private_data = vm_priv;
 
-       vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
        vma->vm_ops = &gntalloc_vmops;
 
index 5df9fd847b2eebbc87fe668e136d1c0684a7905c..610bfc6be17708594783d231d608c14651625376 100644 (file)
@@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
 
        vma->vm_ops = &gntdev_vmops;
 
-       vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
        if (use_ptemod)
                vma->vm_flags |= VM_DONTCOPY;
index ef6389580b8c78dd30a8c7738098c51b6ef3133a..8adb9cc267f96e201ade041441ef8a317ce336d8 100644 (file)
@@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
 {
        /* DONTCOPY is essential for Xen because copy_page_range doesn't know
         * how to recreate these mappings */
-       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
+                        VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &privcmd_vm_ops;
        vma->vm_private_data = NULL;
 
index b20af74657860bd19018809bdb8e1721a5820a4d..d934f04e77368e00dc76d4a54720cff0758d3a58 100644 (file)
@@ -568,6 +568,11 @@ static int v9fs_init_inode_cache(void)
  */
 static void v9fs_destroy_inode_cache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(v9fs_inode_cache);
 }
 
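The rcu_barrier() hunks that recur through the filesystem changes below all close the same race: inodes are freed through call_rcu(), so kmem_cache_destroy() could otherwise run while a free callback is still queued against the cache. A condensed sketch of the pattern, with generic foo_* names standing in for any one filesystem:

/* Condensed sketch of the race being closed; foo_* names are generic. */
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

static struct kmem_cache *foo_inode_cachep;

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(foo_inode_cachep, inode);	/* deferred free */
}

static void foo_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, foo_i_callback);
}

static void foo_destroy_inodecache(void)
{
	/* without this, a foo_i_callback() still in flight can touch
	 * the cache after kmem_cache_destroy() has torn it down */
	rcu_barrier();
	kmem_cache_destroy(foo_inode_cachep);
}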
index dd6f7ee1e31258d94dfa1e038d2a272f17ecb541..c2483e97beee9f3e7f00cdf7b0a8f4ad1c86f27f 100644 (file)
@@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
 static const struct vm_operations_struct v9fs_file_vm_ops = {
        .fault = filemap_fault,
        .page_mkwrite = v9fs_vm_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 
index 02257420274966b6f0085f20b148e1fe582bc865..0efd1524b9770844cf74899f067686073a1c730d 100644 (file)
@@ -164,3 +164,11 @@ config BINFMT_MISC
          You may say M here for module support and later load the module when
          you have use for it; the module is called binfmt_misc. If you
          don't know what to answer at this point, say Y.
+
+config COREDUMP
+       bool "Enable core dump support" if EXPERT
+       default y
+       help
+         This option enables support for performing core dumps. You almost
+         certainly want to say Y here. Not necessary on systems that never
+         need debugging or only ever run flawless code.
index 2fb977934673812c52e2aa9e7ed0a392e5181a34..1d7af79288a04915ca0d1788bc84f32bd03793a0 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_FS_MBCACHE)      += mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)     += posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)       += nfs_common/
 obj-$(CONFIG_GENERIC_ACL)      += generic_acl.o
+obj-$(CONFIG_COREDUMP)         += coredump.o
 
 obj-$(CONFIG_FHANDLE)          += fhandle.o
 
index bdaec92353c2cc0e5e876115ee7a474e45c0ba4b..c830c857c663a4958379937f623097973e4b36de 100644 (file)
@@ -275,6 +275,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(adfs_inode_cachep);
 }
 
index 022cecb0757dd0986b0f86f634847bd98fa89a97..d8d70f920389b47e16bef281606815205584927c 100644 (file)
@@ -147,6 +147,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(affs_inode_cachep);
 }
 
index df8c6047c2a12c41e5cd2935066e9b0397226d9a..43165009428da56c51b47ae9d8bb4b97310c485f 100644 (file)
@@ -123,6 +123,11 @@ void __exit afs_fs_exit(void)
                BUG();
        }
 
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(afs_inode_cachep);
        _leave("");
 }
index cce7df53b694373b6288795f14b07eb91c0894fc..9689817dd26a490b5b9424c2058bb2f879cc6a13 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -184,6 +184,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
                        inode_inc_iversion(inode);
        }
 
+       if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
+               if (attr->ia_size != inode->i_size)
+                       inode_inc_iversion(inode);
+       }
+
        if ((ia_valid & ATTR_MODE)) {
                umode_t amode = attr->ia_mode;
                /* Flag setting protected by i_mutex */
index cf7f3c67c8b7848e6e8e55b83c3c76bde007f60a..962b4f8f7994005c85e16869887cf9f9cb505724 100644 (file)
@@ -454,6 +454,11 @@ befs_init_inodecache(void)
 static void
 befs_destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(befs_inode_cachep);
 }
 
index 9870417c26e7c43852f98b3d641445b5a94c40b1..d5fc598d6e4ac554f527bebc036d9ad7ba798c81 100644 (file)
@@ -280,6 +280,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(bfs_inode_cachep);
 }
 
index d146e181d10df8611050c16745195b9efca62c93..4b5b5117f00aff7a7ad707bc3c50ecce4e9c92ca 100644 (file)
 
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
-static int aout_core_dump(struct coredump_params *cprm);
-
-static struct linux_binfmt aout_format = {
-       .module         = THIS_MODULE,
-       .load_binary    = load_aout_binary,
-       .load_shlib     = load_aout_library,
-       .core_dump      = aout_core_dump,
-       .min_coredump   = PAGE_SIZE
-};
-
-#define BAD_ADDR(x)    ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
-       start = PAGE_ALIGN(start);
-       end = PAGE_ALIGN(end);
-       if (end > start) {
-               unsigned long addr;
-               addr = vm_brk(start, end - start);
-               if (BAD_ADDR(addr))
-                       return addr;
-       }
-       return 0;
-}
 
+#ifdef CONFIG_COREDUMP
 /*
  * Routine writes a core dump image in the current directory.
  * Currently only a stub-function.
@@ -66,7 +43,6 @@ static int set_brk(unsigned long start, unsigned long end)
  * field, which also makes sure the core-dumps won't be recursive if the
  * dumping of the process results in another error..
  */
-
 static int aout_core_dump(struct coredump_params *cprm)
 {
        struct file *file = cprm->file;
@@ -135,6 +111,32 @@ end_coredump:
        set_fs(fs);
        return has_dumped;
 }
+#else
+#define aout_core_dump NULL
+#endif
+
+static struct linux_binfmt aout_format = {
+       .module         = THIS_MODULE,
+       .load_binary    = load_aout_binary,
+       .load_shlib     = load_aout_library,
+       .core_dump      = aout_core_dump,
+       .min_coredump   = PAGE_SIZE
+};
+
+#define BAD_ADDR(x)    ((unsigned long)(x) >= TASK_SIZE)
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+       start = PAGE_ALIGN(start);
+       end = PAGE_ALIGN(end);
+       if (end > start) {
+               unsigned long addr;
+               addr = vm_brk(start, end - start);
+               if (BAD_ADDR(addr))
+                       return addr;
+       }
+       return 0;
+}
 
 /*
  * create_aout_tables() parses the env- and arg-strings in new user
index 2ab91905b2e2ca848c1ebc02caa313b5602065cf..1b4efbc461d23a16d15ea26052925bdfb0353d9e 100644 (file)
@@ -1114,7 +1114,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
        if (always_dump_vma(vma))
                goto whole;
 
-       if (vma->vm_flags & VM_NODUMP)
+       if (vma->vm_flags & VM_DONTDUMP)
                return 0;
 
        /* Hugetlb memory check */
@@ -1126,7 +1126,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
        }
 
        /* Do not dump I/O mapped devices or special mappings */
-       if (vma->vm_flags & (VM_IO | VM_RESERVED))
+       if (vma->vm_flags & VM_IO)
                return 0;
 
        /* By default, dump shared memory if mapped from an anonymous file. */
@@ -1695,30 +1695,19 @@ static int elf_note_info_init(struct elf_note_info *info)
                return 0;
        info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
        if (!info->psinfo)
-               goto notes_free;
+               return 0;
        info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
        if (!info->prstatus)
-               goto psinfo_free;
+               return 0;
        info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
        if (!info->fpu)
-               goto prstatus_free;
+               return 0;
 #ifdef ELF_CORE_COPY_XFPREGS
        info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
        if (!info->xfpu)
-               goto fpu_free;
+               return 0;
 #endif
        return 1;
-#ifdef ELF_CORE_COPY_XFPREGS
- fpu_free:
-       kfree(info->fpu);
-#endif
- prstatus_free:
-       kfree(info->prstatus);
- psinfo_free:
-       kfree(info->psinfo);
- notes_free:
-       kfree(info->notes);
-       return 0;
 }
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
index c298f2efc1bfcad428fbc0cd39b55831215276c6..3d8fae0096476bfa39ef2da854dd32e074c1f450 100644 (file)
@@ -1204,7 +1204,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
        int dump_ok;
 
        /* Do not dump I/O mapped devices or special mappings */
-       if (vma->vm_flags & (VM_IO | VM_RESERVED)) {
+       if (vma->vm_flags & VM_IO) {
                kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
                return 0;
        }
index 4c878476bb91ce0985dabc25464622442aaca54a..b08ea4717e9d70ef7967fe33c1658b669eee16ac 100644 (file)
@@ -107,6 +107,12 @@ void extent_io_exit(void)
                list_del(&eb->leak_list);
                kmem_cache_free(extent_buffer_cache, eb);
        }
+
+       /*
+        * Make sure all delayed rcu free are flushed before we
+        * destroy caches.
+        */
+       rcu_barrier();
        if (extent_state_cache)
                kmem_cache_destroy(extent_state_cache);
        if (extent_buffer_cache)
index 5caf285c6e4d0f1cb7adf82d8af911ab614a807a..f6b40e86121b007bf06954268254bcaae0fe7bf5 100644 (file)
@@ -1599,6 +1599,7 @@ out:
 static const struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = btrfs_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file    *filp, struct vm_area_struct *vma)
 
        file_accessed(filp);
        vma->vm_ops = &btrfs_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
 
        return 0;
 }
index 316b07a866d246fbd9ae1880112f3387d66bdef0..847e7ec98b9ae8a370293d72e86915a489e704ca 100644 (file)
@@ -7076,6 +7076,11 @@ static void init_once(void *foo)
 
 void btrfs_destroy_cachep(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        if (btrfs_inode_cachep)
                kmem_cache_destroy(btrfs_inode_cachep);
        if (btrfs_trans_handle_cachep)
index 452e71a1b75388b2dc06d3ad1665f36a1ed1176f..36172f9723ffd158359959f72cf8c247acf8a3c4 100644 (file)
@@ -1225,6 +1225,7 @@ out:
 static struct vm_operations_struct ceph_vmops = {
        .fault          = filemap_fault,
        .page_mkwrite   = ceph_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1235,6 +1236,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &ceph_vmops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
index b982239f38f91dfab38fcc093e600d5b11e5c632..3a42d9326378d5aa3b2a18b9358a142bdcac51db 100644 (file)
@@ -603,6 +603,11 @@ bad_cap:
 
 static void destroy_caches(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ceph_inode_cachep);
        kmem_cache_destroy(ceph_cap_cachep);
        kmem_cache_destroy(ceph_dentry_cachep);
index a41044a310836cf28c65b9b8db7ab65659a54971..e7931cc55d0c96f7a4edde39d5fbd78fa24f82af 100644 (file)
@@ -968,6 +968,11 @@ cifs_init_inodecache(void)
 static void
 cifs_destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(cifs_inode_cachep);
 }
 
index 1d43303496962ecd57003ca6683273d95875a1b8..963fea95c72ea78572a067e976393200767ce248 100644 (file)
@@ -3004,6 +3004,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static struct vm_operations_struct cifs_file_vm_ops = {
        .fault = filemap_fault,
        .page_mkwrite = cifs_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
index f1813120d753e23ce153e700dd62d4d0d871d779..0c704608b5f2680543b5f6c41ccb95f57101d7db 100644 (file)
@@ -85,6 +85,11 @@ int coda_init_inodecache(void)
 
 void coda_destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(coda_inode_cachep);
 }
 
index edd4ab67cd1b722e5f0dd7439d3a396bee9b91e9..3f0674534a982c40b1a27ee1584c8d3bc292375c 100644 (file)
@@ -606,7 +606,6 @@ struct serial_struct32 {
 static int serial_struct_ioctl(unsigned fd, unsigned cmd,
                        struct serial_struct32 __user *ss32)
 {
-        typedef struct serial_struct SS;
         typedef struct serial_struct32 SS32;
         int err;
         struct serial_struct ss;
diff --git a/fs/coredump.c b/fs/coredump.c
new file mode 100644 (file)
index 0000000..1935b4d
--- /dev/null
@@ -0,0 +1,691 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/coredump.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+#include "coredump.h"
+
+#include <trace/events/sched.h>
+
+int core_uses_pid;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
+
+struct core_name {
+       char *corename;
+       int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
+/* The maximal length of core_pattern is also specified in sysctl.c */
+
+static int expand_corename(struct core_name *cn)
+{
+       char *old_corename = cn->corename;
+
+       cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+       cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+       if (!cn->corename) {
+               kfree(old_corename);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+       char *cur;
+       int need;
+       int ret;
+       va_list arg;
+
+       va_start(arg, fmt);
+       need = vsnprintf(NULL, 0, fmt, arg);
+       va_end(arg);
+
+       if (likely(need < cn->size - cn->used - 1))
+               goto out_printf;
+
+       ret = expand_corename(cn);
+       if (ret)
+               goto expand_fail;
+
+out_printf:
+       cur = cn->corename + cn->used;
+       va_start(arg, fmt);
+       vsnprintf(cur, need + 1, fmt, arg);
+       va_end(arg);
+       cn->used += need;
+       return 0;
+
+expand_fail:
+       return ret;
+}
+
+static void cn_escape(char *str)
+{
+       for (; *str; str++)
+               if (*str == '/')
+                       *str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+       struct file *exe_file;
+       char *pathbuf, *path;
+       int ret;
+
+       exe_file = get_mm_exe_file(current->mm);
+       if (!exe_file) {
+               char *commstart = cn->corename + cn->used;
+               ret = cn_printf(cn, "%s (path unknown)", current->comm);
+               cn_escape(commstart);
+               return ret;
+       }
+
+       pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+       if (!pathbuf) {
+               ret = -ENOMEM;
+               goto put_exe_file;
+       }
+
+       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+       if (IS_ERR(path)) {
+               ret = PTR_ERR(path);
+               goto free_buf;
+       }
+
+       cn_escape(path);
+
+       ret = cn_printf(cn, "%s", path);
+
+free_buf:
+       kfree(pathbuf);
+put_exe_file:
+       fput(exe_file);
+       return ret;
+}
+
+/* format_corename will inspect the pattern parameter, and output a
+ * name into corename, which must have space for at least
+ * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+ */
+static int format_corename(struct core_name *cn, long signr)
+{
+       const struct cred *cred = current_cred();
+       const char *pat_ptr = core_pattern;
+       int ispipe = (*pat_ptr == '|');
+       int pid_in_pattern = 0;
+       int err = 0;
+
+       cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+       cn->corename = kmalloc(cn->size, GFP_KERNEL);
+       cn->used = 0;
+
+       if (!cn->corename)
+               return -ENOMEM;
+
+       /* Repeat as long as we have more pattern to process and more output
+          space */
+       while (*pat_ptr) {
+               if (*pat_ptr != '%') {
+                       if (*pat_ptr == 0)
+                               goto out;
+                       err = cn_printf(cn, "%c", *pat_ptr++);
+               } else {
+                       switch (*++pat_ptr) {
+                       /* single % at the end, drop that */
+                       case 0:
+                               goto out;
+                       /* Double percent, output one percent */
+                       case '%':
+                               err = cn_printf(cn, "%c", '%');
+                               break;
+                       /* pid */
+                       case 'p':
+                               pid_in_pattern = 1;
+                               err = cn_printf(cn, "%d",
+                                             task_tgid_vnr(current));
+                               break;
+                       /* uid */
+                       case 'u':
+                               err = cn_printf(cn, "%d", cred->uid);
+                               break;
+                       /* gid */
+                       case 'g':
+                               err = cn_printf(cn, "%d", cred->gid);
+                               break;
+                       /* signal that caused the coredump */
+                       case 's':
+                               err = cn_printf(cn, "%ld", signr);
+                               break;
+                       /* UNIX time of coredump */
+                       case 't': {
+                               struct timeval tv;
+                               do_gettimeofday(&tv);
+                               err = cn_printf(cn, "%lu", tv.tv_sec);
+                               break;
+                       }
+                       /* hostname */
+                       case 'h': {
+                               char *namestart = cn->corename + cn->used;
+                               down_read(&uts_sem);
+                               err = cn_printf(cn, "%s",
+                                             utsname()->nodename);
+                               up_read(&uts_sem);
+                               cn_escape(namestart);
+                               break;
+                       }
+                       /* executable */
+                       case 'e': {
+                               char *commstart = cn->corename + cn->used;
+                               err = cn_printf(cn, "%s", current->comm);
+                               cn_escape(commstart);
+                               break;
+                       }
+                       case 'E':
+                               err = cn_print_exe_file(cn);
+                               break;
+                       /* core limit size */
+                       case 'c':
+                               err = cn_printf(cn, "%lu",
+                                             rlimit(RLIMIT_CORE));
+                               break;
+                       default:
+                               break;
+                       }
+                       ++pat_ptr;
+               }
+
+               if (err)
+                       return err;
+       }
+
+       /* Backward compatibility with core_uses_pid:
+        *
+        * If core_pattern does not include a %p (as is the default)
+        * and core_uses_pid is set, then .%pid will be appended to
+        * the filename. Do not do this for piped commands. */
+       if (!ispipe && !pid_in_pattern && core_uses_pid) {
+               err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+               if (err)
+                       return err;
+       }
+out:
+       return ispipe;
+}
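To make the expansion concrete: with core_pattern set to "/tmp/core.%e.%p", a crash of PID 1234 in a binary named myapp yields /tmp/core.myapp.1234, while a leading '|' takes the pipe path handled further down. A small userspace sketch for setting such a pattern (the path is only an example, and writing the sysctl needs root):

/* Userspace example; the pattern shown is illustrative, not a default. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/core_pattern", "w");

	if (!f)
		return 1;
	/* %e -> comm, %p -> tgid, as parsed by format_corename() above */
	fputs("/tmp/core.%e.%p\n", f);
	return fclose(f) ? 1 : 0;
}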
+
+static int zap_process(struct task_struct *start, int exit_code)
+{
+       struct task_struct *t;
+       int nr = 0;
+
+       start->signal->flags = SIGNAL_GROUP_EXIT;
+       start->signal->group_exit_code = exit_code;
+       start->signal->group_stop_count = 0;
+
+       t = start;
+       do {
+               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+               if (t != current && t->mm) {
+                       sigaddset(&t->pending.signal, SIGKILL);
+                       signal_wake_up(t, 1);
+                       nr++;
+               }
+       } while_each_thread(start, t);
+
+       return nr;
+}
+
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+                               struct core_state *core_state, int exit_code)
+{
+       struct task_struct *g, *p;
+       unsigned long flags;
+       int nr = -EAGAIN;
+
+       spin_lock_irq(&tsk->sighand->siglock);
+       if (!signal_group_exit(tsk->signal)) {
+               mm->core_state = core_state;
+               nr = zap_process(tsk, exit_code);
+       }
+       spin_unlock_irq(&tsk->sighand->siglock);
+       if (unlikely(nr < 0))
+               return nr;
+
+       if (atomic_read(&mm->mm_users) == nr + 1)
+               goto done;
+       /*
+        * We should find and kill all tasks which use this mm, and we should
+        * count them correctly into ->nr_threads. We don't take tasklist
+        * lock, but this is safe wrt:
+        *
+        * fork:
+        *      None of sub-threads can fork after zap_process(leader). All
+        *      processes which were created before this point should be
+        *      visible to zap_threads() because copy_process() adds the new
+        *      process to the tail of init_task.tasks list, and lock/unlock
+        *      of ->siglock provides a memory barrier.
+        *
+        * do_exit:
+        *      The caller holds mm->mmap_sem. This means that the task which
+        *      uses this mm can't pass exit_mm(), so it can't exit or clear
+        *      its ->mm.
+        *
+        * de_thread:
+        *      It does list_replace_rcu(&leader->tasks, &current->tasks),
+        *      we must see either old or new leader, this does not matter.
+        *      However, it can change p->sighand, so lock_task_sighand(p)
+        *      must be used. Since p->mm != NULL and we hold ->mmap_sem
+        *      it can't fail.
+        *
+        *      Note also that "g" can be the old leader with ->mm == NULL
+        *      and already unhashed and thus removed from ->thread_group.
+        *      This is OK, __unhash_process()->list_del_rcu() does not
+        *      clear the ->next pointer, we will find the new leader via
+        *      next_thread().
+        */
+       rcu_read_lock();
+       for_each_process(g) {
+               if (g == tsk->group_leader)
+                       continue;
+               if (g->flags & PF_KTHREAD)
+                       continue;
+               p = g;
+               do {
+                       if (p->mm) {
+                               if (unlikely(p->mm == mm)) {
+                                       lock_task_sighand(p, &flags);
+                                       nr += zap_process(p, exit_code);
+                                       unlock_task_sighand(p, &flags);
+                               }
+                               break;
+                       }
+               } while_each_thread(g, p);
+       }
+       rcu_read_unlock();
+done:
+       atomic_set(&core_state->nr_threads, nr);
+       return nr;
+}
+
+static int coredump_wait(int exit_code, struct core_state *core_state)
+{
+       struct task_struct *tsk = current;
+       struct mm_struct *mm = tsk->mm;
+       int core_waiters = -EBUSY;
+
+       init_completion(&core_state->startup);
+       core_state->dumper.task = tsk;
+       core_state->dumper.next = NULL;
+
+       down_write(&mm->mmap_sem);
+       if (!mm->core_state)
+               core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+       up_write(&mm->mmap_sem);
+
+       if (core_waiters > 0) {
+               struct core_thread *ptr;
+
+               wait_for_completion(&core_state->startup);
+               /*
+                * Wait for all the threads to become inactive, so that
+                * all the thread context (extended register state, like
+                * fpu etc) gets copied to the memory.
+                */
+               ptr = core_state->dumper.next;
+               while (ptr != NULL) {
+                       wait_task_inactive(ptr->task, 0);
+                       ptr = ptr->next;
+               }
+       }
+
+       return core_waiters;
+}
+
+static void coredump_finish(struct mm_struct *mm)
+{
+       struct core_thread *curr, *next;
+       struct task_struct *task;
+
+       next = mm->core_state->dumper.next;
+       while ((curr = next) != NULL) {
+               next = curr->next;
+               task = curr->task;
+               /*
+                * see exit_mm(), curr->task must not see
+                * ->task == NULL before we read ->next.
+                */
+               smp_mb();
+               curr->task = NULL;
+               wake_up_process(task);
+       }
+
+       mm->core_state = NULL;
+}
+
+static void wait_for_dump_helpers(struct file *file)
+{
+       struct pipe_inode_info *pipe;
+
+       pipe = file->f_path.dentry->d_inode->i_pipe;
+
+       pipe_lock(pipe);
+       pipe->readers++;
+       pipe->writers--;
+
+       while ((pipe->readers > 1) && (!signal_pending(current))) {
+               wake_up_interruptible_sync(&pipe->wait);
+               kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+               pipe_wait(pipe);
+       }
+
+       pipe->readers--;
+       pipe->writers++;
+       pipe_unlock(pipe);
+}
+
+
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace.  Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.  Returns 0 on success, or a
+ * negative errno on failure.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+       struct file *files[2];
+       struct fdtable *fdt;
+       struct coredump_params *cp = (struct coredump_params *)info->data;
+       struct files_struct *cf = current->files;
+       int err = create_pipe_files(files, 0);
+       if (err)
+               return err;
+
+       cp->file = files[1];
+
+       sys_close(0);
+       fd_install(0, files[0]);
+       spin_lock(&cf->file_lock);
+       fdt = files_fdtable(cf);
+       __set_open_fd(0, fdt);
+       __clear_close_on_exec(0, fdt);
+       spin_unlock(&cf->file_lock);
+
+       /* and disallow core files too */
+       current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+       return 0;
+}
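Because umh_pipe_setup() installs the read end of the pipe as fd 0 of the helper, a collector for a "|/path/to/helper %p" core_pattern only has to read its stdin. A minimal userspace sketch of such a helper (the output path and argument handling are illustrative assumptions):

/* Minimal core-collector sketch: reads the dump from stdin (fd 0),
 * which umh_pipe_setup() wired to the kernel's pipe. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	/* argv[1] would carry %p if core_pattern passes it; example only */
	int out = open("/tmp/core.from-pipe",
		       O_WRONLY | O_CREAT | O_TRUNC, 0600);
	char buf[4096];
	ssize_t n;

	if (out < 0)
		return 1;
	while ((n = read(0, buf, sizeof(buf))) > 0)
		if (write(out, buf, n) != n)
			return 1;
	close(out);
	return n < 0;
}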
+
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
+{
+       struct core_state core_state;
+       struct core_name cn;
+       struct mm_struct *mm = current->mm;
+       struct linux_binfmt *binfmt;
+       const struct cred *old_cred;
+       struct cred *cred;
+       int retval = 0;
+       int flag = 0;
+       int ispipe;
+       bool need_nonrelative = false;
+       static atomic_t core_dump_count = ATOMIC_INIT(0);
+       struct coredump_params cprm = {
+               .signr = signr,
+               .regs = regs,
+               .limit = rlimit(RLIMIT_CORE),
+               /*
+                * We must use the same mm->flags while dumping core to avoid
+                * inconsistency of bit flags, since this flag is not protected
+                * by any locks.
+                */
+               .mm_flags = mm->flags,
+       };
+
+       audit_core_dumps(signr);
+
+       binfmt = mm->binfmt;
+       if (!binfmt || !binfmt->core_dump)
+               goto fail;
+       if (!__get_dumpable(cprm.mm_flags))
+               goto fail;
+
+       cred = prepare_creds();
+       if (!cred)
+               goto fail;
+       /*
+        * We cannot trust fsuid as being the "true" uid of the process
+        * nor do we know its entire history. We only know it was tainted
+        * so we dump it as root in mode 2, and only into a controlled
+        * environment (pipe handler or fully qualified path).
+        */
+       if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
+               /* Setuid core dump mode */
+               flag = O_EXCL;          /* Stop rewrite attacks */
+               cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
+               need_nonrelative = true;
+       }
+
+       retval = coredump_wait(exit_code, &core_state);
+       if (retval < 0)
+               goto fail_creds;
+
+       old_cred = override_creds(cred);
+
+       /*
+        * Clear any false indication of pending signals that might
+        * be seen by the filesystem code called to write the core file.
+        */
+       clear_thread_flag(TIF_SIGPENDING);
+
+       ispipe = format_corename(&cn, signr);
+
+       if (ispipe) {
+               int dump_count;
+               char **helper_argv;
+
+               if (ispipe < 0) {
+                       printk(KERN_WARNING "format_corename failed\n");
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_corename;
+               }
+
+               if (cprm.limit == 1) {
+                       /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+                        *
+                        * Normally core limits are irrelevant to pipes, since
+                        * we're not writing to the file system, but we use
+                        * cprm.limit of 1 here as a special value, this is a
+                        * consistent way to catch recursive crashes.
+                        * We can still crash if the core_pattern binary sets
+                        * RLIMIT_CORE != 1, but it runs as root, and can do
+                        * lots of stupid things.
+                        *
+                        * Note that we use task_tgid_vnr here to grab the pid
+                        * of the process group leader.  That way we get the
+                        * right pid if a thread in a multi-threaded
+                        * core_pattern process dies.
+                        */
+                       printk(KERN_WARNING
+                               "Process %d(%s) has RLIMIT_CORE set to 1\n",
+                               task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_unlock;
+               }
+               cprm.limit = RLIM_INFINITY;
+
+               dump_count = atomic_inc_return(&core_dump_count);
+               if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+                       printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+                              task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Skipping core dump\n");
+                       goto fail_dropcount;
+               }
+
+               helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+               if (!helper_argv) {
+                       printk(KERN_WARNING "%s failed to allocate memory\n",
+                              __func__);
+                       goto fail_dropcount;
+               }
+
+               retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+                                       NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+                                       NULL, &cprm);
+               argv_free(helper_argv);
+               if (retval) {
+                       printk(KERN_INFO "Core dump to %s pipe failed\n",
+                              cn.corename);
+                       goto close_fail;
+               }
+       } else {
+               struct inode *inode;
+
+               if (cprm.limit < binfmt->min_coredump)
+                       goto fail_unlock;
+
+               if (need_nonrelative && cn.corename[0] != '/') {
+                       printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+                               "to a fully qualified path!\n",
+                               task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Skipping core dump\n");
+                       goto fail_unlock;
+               }
+
+               cprm.file = filp_open(cn.corename,
+                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+                                0600);
+               if (IS_ERR(cprm.file))
+                       goto fail_unlock;
+
+               inode = cprm.file->f_path.dentry->d_inode;
+               if (inode->i_nlink > 1)
+                       goto close_fail;
+               if (d_unhashed(cprm.file->f_path.dentry))
+                       goto close_fail;
+               /*
+                * AK: actually I see no reason not to allow this for named
+                * pipes etc., but keep the previous behaviour for now.
+                */
+               if (!S_ISREG(inode->i_mode))
+                       goto close_fail;
+               /*
+                * Don't allow local users to get cute and trick others into
+                * dumping core into their pre-created files.
+                */
+               if (!uid_eq(inode->i_uid, current_fsuid()))
+                       goto close_fail;
+               if (!cprm.file->f_op || !cprm.file->f_op->write)
+                       goto close_fail;
+               if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+                       goto close_fail;
+       }
+
+       retval = binfmt->core_dump(&cprm);
+       if (retval)
+               current->signal->group_exit_code |= 0x80;
+
+       if (ispipe && core_pipe_limit)
+               wait_for_dump_helpers(cprm.file);
+close_fail:
+       if (cprm.file)
+               filp_close(cprm.file, NULL);
+fail_dropcount:
+       if (ispipe)
+               atomic_dec(&core_dump_count);
+fail_unlock:
+       kfree(cn.corename);
+fail_corename:
+       coredump_finish(mm);
+       revert_creds(old_cred);
+fail_creds:
+       put_cred(cred);
+fail:
+       return;
+}
+
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+       return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+       int ret = 1;
+
+       if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+               if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+                       return 0;
+       } else {
+               char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+               if (!buf)
+                       return 0;
+               while (off > 0) {
+                       unsigned long n = off;
+
+                       if (n > PAGE_SIZE)
+                               n = PAGE_SIZE;
+                       if (!dump_write(file, buf, n)) {
+                               ret = 0;
+                               break;
+                       }
+                       off -= n;
+               }
+               free_page((unsigned long)buf);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(dump_seek);
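For context, a binfmt's ->core_dump() hook is expected to funnel every byte of the core file through the two helpers above; that is what keeps the pipe and regular-file cases interchangeable. A minimal sketch of a hypothetical dumper follows; the struct, magic value, and function name are illustrative only, not part of this patch:

struct toy_header { unsigned int magic; };	/* hypothetical format */

static int toy_core_dump(struct coredump_params *cprm)
{
	struct toy_header hdr = { .magic = 0x544f59 };

	/* Emit the fixed header through dump_write()... */
	if (!dump_write(cprm->file, &hdr, sizeof(hdr)))
		return 0;		/* zero means the dump failed */

	/* ...then skip a hole.  dump_seek() uses ->llseek() when it can
	 * and falls back to writing zeroed pages when the target (e.g.
	 * a pipe) cannot seek. */
	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(hdr)))
		return 0;

	return 1;	/* non-zero: a core was produced, so do_coredump()
			 * sets 0x80 in the group exit code */
}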
diff --git a/fs/coredump.h b/fs/coredump.h
new file mode 100644 (file)
index 0000000..e39ff07
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _FS_COREDUMP_H
+#define _FS_COREDUMP_H
+
+extern int __get_dumpable(unsigned long mm_flags);
+
+#endif
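For context on the pipe branch in do_coredump() above: when core_pattern begins with '|', the kernel execs the rest of the pattern as a user-space helper and, via umh_pipe_setup() (its old fs/exec.c incarnation is quoted further down), hands that helper the core image on stdin while pinning RLIMIT_CORE to 1 as the recursion sentinel. A minimal helper might look like this; the install path and output directory are made up for illustration:

/*
 * Toy core_pattern pipe helper; registered e.g. with:
 *   echo '|/usr/local/bin/toy-core-catcher %p' > /proc/sys/kernel/core_pattern
 * It copies the core image from stdin into a file named after the pid.
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	char path[64], buf[4096];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/var/crash/core.%s",
		 argc > 1 ? argv[1] : "unknown");	/* argv[1] is %p */
	fd = open(path, O_CREAT | O_WRONLY | O_EXCL, 0600);
	if (fd < 0)
		return 1;
	while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0)
		if (write(fd, buf, n) != n)
			return 1;
	close(fd);
	return 0;
}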
index 16521a9f203859a28a8804d3fb2f5f4cb894ed7f..9b467d9b003973a4fe3a4cb657b60821b606b36d 100644 (file)
@@ -1579,7 +1579,7 @@ EXPORT_SYMBOL(d_find_any_alias);
  */
 struct dentry *d_obtain_alias(struct inode *inode)
 {
-       static const struct qstr anonstring = { .name = "" };
+       static const struct qstr anonstring = QSTR_INIT("/", 1);
        struct dentry *tmp;
        struct dentry *res;
 
index 9b627c15010a3af35e1f2ec85ccafc2b18d97d44..34fcde765d242e3e04421d49f17f81a8a496eb58 100644 (file)
@@ -710,6 +710,12 @@ static void ecryptfs_free_kmem_caches(void)
 {
        int i;
 
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
+
        for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
                struct ecryptfs_cache_info *info;
 
index e755ec746c6967ed12e9ea036ddcd83214770d48..2002431ef9a0ff238d838b9d4c2f8bb9760007fa 100644 (file)
@@ -96,6 +96,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(efs_inode_cachep);
 }
 
index eedec84c1809173eb4c627815ad70122de66eeb3..2fbef110efa2b01bce6e7ef5fe2288384fa8351f 100644 (file)
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-       return op != EPOLL_CTL_DEL;
+       return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        return 0;
 }
 
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+       int result = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->lock, flags);
+       if (epi->event.events & ~EP_PRIVATE_BITS) {
+               if (ep_is_linked(&epi->rdllink))
+                       list_del_init(&epi->rdllink);
+               /*
+                * Ensure ep_poll_callback will not add epi back onto the
+                * ready list:
+                */
+               epi->event.events &= EP_PRIVATE_BITS;
+       } else
+               result = -EBUSY;
+       spin_unlock_irqrestore(&ep->lock, flags);
+
+       return result;
+}
+
 static void ep_free(struct eventpoll *ep)
 {
        struct rb_node *rbp;
@@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
        rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
-
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                } else
                        error = -ENOENT;
                break;
+       case EPOLL_CTL_DISABLE:
+               if (epi)
+                       error = ep_disable(ep, epi);
+               else
+                       error = -ENOENT;
+               break;
        }
        mutex_unlock(&ep->mtx);
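The user-space pattern the ep_disable() comment above implies: a thread tearing down an EPOLLONESHOT-managed file descriptor first issues EPOLL_CTL_DISABLE; 0 means the item is now quiescent and may be freed, while -EBUSY means an event-handling thread still owns it. A hedged sketch; EPOLL_CTL_DISABLE was only proposed in this series, so the constant's availability in <sys/epoll.h> is an assumption here:

#include <sys/epoll.h>
#include <errno.h>

/* Returns 1 if we disabled and deleted the item, 0 if a handler still
 * owns it (retry once the handler finishes or re-arms the fd via
 * EPOLL_CTL_MOD), -1 on any other error. */
static int try_teardown(int epfd, int fd)
{
	if (epoll_ctl(epfd, EPOLL_CTL_DISABLE, fd, NULL) == 0) {
		epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
		return 1;
	}
	return errno == EBUSY ? 0 : -1;
}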
 
index 3947aef2283166fc7ba8f70332a61a522d69dc01..905e94baca1c0ca9a8d29a7198bccc47fb5d5615 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
 
 #include <trace/events/task.h>
 #include "internal.h"
+#include "coredump.h"
 
 #include <trace/events/sched.h>
 
-int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
-unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
-struct core_name {
-       char *corename;
-       int used, size;
-};
-static atomic_t call_count = ATOMIC_INIT(1);
-
-/* The maximal length of core_pattern is also specified in sysctl.c */
-
 static LIST_HEAD(formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
@@ -612,7 +602,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
-                                      vma, new_start, length))
+                                      vma, new_start, length, false))
                return -ENOMEM;
 
        lru_add_drain();
@@ -1631,353 +1621,6 @@ void set_binfmt(struct linux_binfmt *new)
 
 EXPORT_SYMBOL(set_binfmt);
 
-static int expand_corename(struct core_name *cn)
-{
-       char *old_corename = cn->corename;
-
-       cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
-       cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
-       if (!cn->corename) {
-               kfree(old_corename);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
-{
-       char *cur;
-       int need;
-       int ret;
-       va_list arg;
-
-       va_start(arg, fmt);
-       need = vsnprintf(NULL, 0, fmt, arg);
-       va_end(arg);
-
-       if (likely(need < cn->size - cn->used - 1))
-               goto out_printf;
-
-       ret = expand_corename(cn);
-       if (ret)
-               goto expand_fail;
-
-out_printf:
-       cur = cn->corename + cn->used;
-       va_start(arg, fmt);
-       vsnprintf(cur, need + 1, fmt, arg);
-       va_end(arg);
-       cn->used += need;
-       return 0;
-
-expand_fail:
-       return ret;
-}
-
-static void cn_escape(char *str)
-{
-       for (; *str; str++)
-               if (*str == '/')
-                       *str = '!';
-}
-
-static int cn_print_exe_file(struct core_name *cn)
-{
-       struct file *exe_file;
-       char *pathbuf, *path;
-       int ret;
-
-       exe_file = get_mm_exe_file(current->mm);
-       if (!exe_file) {
-               char *commstart = cn->corename + cn->used;
-               ret = cn_printf(cn, "%s (path unknown)", current->comm);
-               cn_escape(commstart);
-               return ret;
-       }
-
-       pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
-       if (!pathbuf) {
-               ret = -ENOMEM;
-               goto put_exe_file;
-       }
-
-       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
-       if (IS_ERR(path)) {
-               ret = PTR_ERR(path);
-               goto free_buf;
-       }
-
-       cn_escape(path);
-
-       ret = cn_printf(cn, "%s", path);
-
-free_buf:
-       kfree(pathbuf);
-put_exe_file:
-       fput(exe_file);
-       return ret;
-}
-
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
- */
-static int format_corename(struct core_name *cn, long signr)
-{
-       const struct cred *cred = current_cred();
-       const char *pat_ptr = core_pattern;
-       int ispipe = (*pat_ptr == '|');
-       int pid_in_pattern = 0;
-       int err = 0;
-
-       cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
-       cn->corename = kmalloc(cn->size, GFP_KERNEL);
-       cn->used = 0;
-
-       if (!cn->corename)
-               return -ENOMEM;
-
-       /* Repeat as long as we have more pattern to process and more output
-          space */
-       while (*pat_ptr) {
-               if (*pat_ptr != '%') {
-                       if (*pat_ptr == 0)
-                               goto out;
-                       err = cn_printf(cn, "%c", *pat_ptr++);
-               } else {
-                       switch (*++pat_ptr) {
-                       /* single % at the end, drop that */
-                       case 0:
-                               goto out;
-                       /* Double percent, output one percent */
-                       case '%':
-                               err = cn_printf(cn, "%c", '%');
-                               break;
-                       /* pid */
-                       case 'p':
-                               pid_in_pattern = 1;
-                               err = cn_printf(cn, "%d",
-                                             task_tgid_vnr(current));
-                               break;
-                       /* uid */
-                       case 'u':
-                               err = cn_printf(cn, "%d", cred->uid);
-                               break;
-                       /* gid */
-                       case 'g':
-                               err = cn_printf(cn, "%d", cred->gid);
-                               break;
-                       /* signal that caused the coredump */
-                       case 's':
-                               err = cn_printf(cn, "%ld", signr);
-                               break;
-                       /* UNIX time of coredump */
-                       case 't': {
-                               struct timeval tv;
-                               do_gettimeofday(&tv);
-                               err = cn_printf(cn, "%lu", tv.tv_sec);
-                               break;
-                       }
-                       /* hostname */
-                       case 'h': {
-                               char *namestart = cn->corename + cn->used;
-                               down_read(&uts_sem);
-                               err = cn_printf(cn, "%s",
-                                             utsname()->nodename);
-                               up_read(&uts_sem);
-                               cn_escape(namestart);
-                               break;
-                       }
-                       /* executable */
-                       case 'e': {
-                               char *commstart = cn->corename + cn->used;
-                               err = cn_printf(cn, "%s", current->comm);
-                               cn_escape(commstart);
-                               break;
-                       }
-                       case 'E':
-                               err = cn_print_exe_file(cn);
-                               break;
-                       /* core limit size */
-                       case 'c':
-                               err = cn_printf(cn, "%lu",
-                                             rlimit(RLIMIT_CORE));
-                               break;
-                       default:
-                               break;
-                       }
-                       ++pat_ptr;
-               }
-
-               if (err)
-                       return err;
-       }
-
-       /* Backward compatibility with core_uses_pid:
-        *
-        * If core_pattern does not include a %p (as is the default)
-        * and core_uses_pid is set, then .%pid will be appended to
-        * the filename. Do not do this for piped commands. */
-       if (!ispipe && !pid_in_pattern && core_uses_pid) {
-               err = cn_printf(cn, ".%d", task_tgid_vnr(current));
-               if (err)
-                       return err;
-       }
-out:
-       return ispipe;
-}
-
-static int zap_process(struct task_struct *start, int exit_code)
-{
-       struct task_struct *t;
-       int nr = 0;
-
-       start->signal->flags = SIGNAL_GROUP_EXIT;
-       start->signal->group_exit_code = exit_code;
-       start->signal->group_stop_count = 0;
-
-       t = start;
-       do {
-               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-               if (t != current && t->mm) {
-                       sigaddset(&t->pending.signal, SIGKILL);
-                       signal_wake_up(t, 1);
-                       nr++;
-               }
-       } while_each_thread(start, t);
-
-       return nr;
-}
-
-static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-                               struct core_state *core_state, int exit_code)
-{
-       struct task_struct *g, *p;
-       unsigned long flags;
-       int nr = -EAGAIN;
-
-       spin_lock_irq(&tsk->sighand->siglock);
-       if (!signal_group_exit(tsk->signal)) {
-               mm->core_state = core_state;
-               nr = zap_process(tsk, exit_code);
-       }
-       spin_unlock_irq(&tsk->sighand->siglock);
-       if (unlikely(nr < 0))
-               return nr;
-
-       if (atomic_read(&mm->mm_users) == nr + 1)
-               goto done;
-       /*
-        * We should find and kill all tasks which use this mm, and we should
-        * count them correctly into ->nr_threads. We don't take tasklist
-        * lock, but this is safe wrt:
-        *
-        * fork:
-        *      None of sub-threads can fork after zap_process(leader). All
-        *      processes which were created before this point should be
-        *      visible to zap_threads() because copy_process() adds the new
-        *      process to the tail of init_task.tasks list, and lock/unlock
-        *      of ->siglock provides a memory barrier.
-        *
-        * do_exit:
-        *      The caller holds mm->mmap_sem. This means that the task which
-        *      uses this mm can't pass exit_mm(), so it can't exit or clear
-        *      its ->mm.
-        *
-        * de_thread:
-        *      It does list_replace_rcu(&leader->tasks, &current->tasks),
-        *      we must see either old or new leader, this does not matter.
-        *      However, it can change p->sighand, so lock_task_sighand(p)
-        *      must be used. Since p->mm != NULL and we hold ->mmap_sem
-        *      it can't fail.
-        *
-        *      Note also that "g" can be the old leader with ->mm == NULL
-        *      and already unhashed and thus removed from ->thread_group.
-        *      This is OK, __unhash_process()->list_del_rcu() does not
-        *      clear the ->next pointer, we will find the new leader via
-        *      next_thread().
-        */
-       rcu_read_lock();
-       for_each_process(g) {
-               if (g == tsk->group_leader)
-                       continue;
-               if (g->flags & PF_KTHREAD)
-                       continue;
-               p = g;
-               do {
-                       if (p->mm) {
-                               if (unlikely(p->mm == mm)) {
-                                       lock_task_sighand(p, &flags);
-                                       nr += zap_process(p, exit_code);
-                                       unlock_task_sighand(p, &flags);
-                               }
-                               break;
-                       }
-               } while_each_thread(g, p);
-       }
-       rcu_read_unlock();
-done:
-       atomic_set(&core_state->nr_threads, nr);
-       return nr;
-}
-
-static int coredump_wait(int exit_code, struct core_state *core_state)
-{
-       struct task_struct *tsk = current;
-       struct mm_struct *mm = tsk->mm;
-       int core_waiters = -EBUSY;
-
-       init_completion(&core_state->startup);
-       core_state->dumper.task = tsk;
-       core_state->dumper.next = NULL;
-
-       down_write(&mm->mmap_sem);
-       if (!mm->core_state)
-               core_waiters = zap_threads(tsk, mm, core_state, exit_code);
-       up_write(&mm->mmap_sem);
-
-       if (core_waiters > 0) {
-               struct core_thread *ptr;
-
-               wait_for_completion(&core_state->startup);
-               /*
-                * Wait for all the threads to become inactive, so that
-                * all the thread context (extended register state, like
-                * fpu etc) gets copied to the memory.
-                */
-               ptr = core_state->dumper.next;
-               while (ptr != NULL) {
-                       wait_task_inactive(ptr->task, 0);
-                       ptr = ptr->next;
-               }
-       }
-
-       return core_waiters;
-}
-
-static void coredump_finish(struct mm_struct *mm)
-{
-       struct core_thread *curr, *next;
-       struct task_struct *task;
-
-       next = mm->core_state->dumper.next;
-       while ((curr = next) != NULL) {
-               next = curr->next;
-               task = curr->task;
-               /*
-                * see exit_mm(), curr->task must not see
-                * ->task == NULL before we read ->next.
-                */
-               smp_mb();
-               curr->task = NULL;
-               wake_up_process(task);
-       }
-
-       mm->core_state = NULL;
-}
-
 /*
  * set_dumpable converts traditional three-value dumpable to two flags and
  * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
@@ -2019,7 +1662,7 @@ void set_dumpable(struct mm_struct *mm, int value)
        }
 }
 
-static int __get_dumpable(unsigned long mm_flags)
+int __get_dumpable(unsigned long mm_flags)
 {
        int ret;
 
@@ -2032,293 +1675,6 @@ int get_dumpable(struct mm_struct *mm)
        return __get_dumpable(mm->flags);
 }
 
-static void wait_for_dump_helpers(struct file *file)
-{
-       struct pipe_inode_info *pipe;
-
-       pipe = file->f_path.dentry->d_inode->i_pipe;
-
-       pipe_lock(pipe);
-       pipe->readers++;
-       pipe->writers--;
-
-       while ((pipe->readers > 1) && (!signal_pending(current))) {
-               wake_up_interruptible_sync(&pipe->wait);
-               kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-               pipe_wait(pipe);
-       }
-
-       pipe->readers--;
-       pipe->writers++;
-       pipe_unlock(pipe);
-
-}
-
-
-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace.  Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process.  Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1.  This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
-       struct file *files[2];
-       struct fdtable *fdt;
-       struct coredump_params *cp = (struct coredump_params *)info->data;
-       struct files_struct *cf = current->files;
-       int err = create_pipe_files(files, 0);
-       if (err)
-               return err;
-
-       cp->file = files[1];
-
-       sys_close(0);
-       fd_install(0, files[0]);
-       spin_lock(&cf->file_lock);
-       fdt = files_fdtable(cf);
-       __set_open_fd(0, fdt);
-       __clear_close_on_exec(0, fdt);
-       spin_unlock(&cf->file_lock);
-
-       /* and disallow core files too */
-       current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
-
-       return 0;
-}
-
-void do_coredump(long signr, int exit_code, struct pt_regs *regs)
-{
-       struct core_state core_state;
-       struct core_name cn;
-       struct mm_struct *mm = current->mm;
-       struct linux_binfmt * binfmt;
-       const struct cred *old_cred;
-       struct cred *cred;
-       int retval = 0;
-       int flag = 0;
-       int ispipe;
-       bool need_nonrelative = false;
-       static atomic_t core_dump_count = ATOMIC_INIT(0);
-       struct coredump_params cprm = {
-               .signr = signr,
-               .regs = regs,
-               .limit = rlimit(RLIMIT_CORE),
-               /*
-                * We must use the same mm->flags while dumping core to avoid
-                * inconsistency of bit flags, since this flag is not protected
-                * by any locks.
-                */
-               .mm_flags = mm->flags,
-       };
-
-       audit_core_dumps(signr);
-
-       binfmt = mm->binfmt;
-       if (!binfmt || !binfmt->core_dump)
-               goto fail;
-       if (!__get_dumpable(cprm.mm_flags))
-               goto fail;
-
-       cred = prepare_creds();
-       if (!cred)
-               goto fail;
-       /*
-        * We cannot trust fsuid as being the "true" uid of the process
-        * nor do we know its entire history. We only know it was tainted
-        * so we dump it as root in mode 2, and only into a controlled
-        * environment (pipe handler or fully qualified path).
-        */
-       if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
-               /* Setuid core dump mode */
-               flag = O_EXCL;          /* Stop rewrite attacks */
-               cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
-               need_nonrelative = true;
-       }
-
-       retval = coredump_wait(exit_code, &core_state);
-       if (retval < 0)
-               goto fail_creds;
-
-       old_cred = override_creds(cred);
-
-       /*
-        * Clear any false indication of pending signals that might
-        * be seen by the filesystem code called to write the core file.
-        */
-       clear_thread_flag(TIF_SIGPENDING);
-
-       ispipe = format_corename(&cn, signr);
-
-       if (ispipe) {
-               int dump_count;
-               char **helper_argv;
-
-               if (ispipe < 0) {
-                       printk(KERN_WARNING "format_corename failed\n");
-                       printk(KERN_WARNING "Aborting core\n");
-                       goto fail_corename;
-               }
-
-               if (cprm.limit == 1) {
-                       /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
-                        *
-                        * Normally core limits are irrelevant to pipes, since
-                        * we're not writing to the file system, but we use
-                        * cprm.limit of 1 here as a speacial value, this is a
-                        * consistent way to catch recursive crashes.
-                        * We can still crash if the core_pattern binary sets
-                        * RLIM_CORE = !1, but it runs as root, and can do
-                        * lots of stupid things.
-                        *
-                        * Note that we use task_tgid_vnr here to grab the pid
-                        * of the process group leader.  That way we get the
-                        * right pid if a thread in a multi-threaded
-                        * core_pattern process dies.
-                        */
-                       printk(KERN_WARNING
-                               "Process %d(%s) has RLIMIT_CORE set to 1\n",
-                               task_tgid_vnr(current), current->comm);
-                       printk(KERN_WARNING "Aborting core\n");
-                       goto fail_unlock;
-               }
-               cprm.limit = RLIM_INFINITY;
-
-               dump_count = atomic_inc_return(&core_dump_count);
-               if (core_pipe_limit && (core_pipe_limit < dump_count)) {
-                       printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
-                              task_tgid_vnr(current), current->comm);
-                       printk(KERN_WARNING "Skipping core dump\n");
-                       goto fail_dropcount;
-               }
-
-               helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
-               if (!helper_argv) {
-                       printk(KERN_WARNING "%s failed to allocate memory\n",
-                              __func__);
-                       goto fail_dropcount;
-               }
-
-               retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
-                                       NULL, UMH_WAIT_EXEC, umh_pipe_setup,
-                                       NULL, &cprm);
-               argv_free(helper_argv);
-               if (retval) {
-                       printk(KERN_INFO "Core dump to %s pipe failed\n",
-                              cn.corename);
-                       goto close_fail;
-               }
-       } else {
-               struct inode *inode;
-
-               if (cprm.limit < binfmt->min_coredump)
-                       goto fail_unlock;
-
-               if (need_nonrelative && cn.corename[0] != '/') {
-                       printk(KERN_WARNING "Pid %d(%s) can only dump core "\
-                               "to fully qualified path!\n",
-                               task_tgid_vnr(current), current->comm);
-                       printk(KERN_WARNING "Skipping core dump\n");
-                       goto fail_unlock;
-               }
-
-               cprm.file = filp_open(cn.corename,
-                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
-                                0600);
-               if (IS_ERR(cprm.file))
-                       goto fail_unlock;
-
-               inode = cprm.file->f_path.dentry->d_inode;
-               if (inode->i_nlink > 1)
-                       goto close_fail;
-               if (d_unhashed(cprm.file->f_path.dentry))
-                       goto close_fail;
-               /*
-                * AK: actually i see no reason to not allow this for named
-                * pipes etc, but keep the previous behaviour for now.
-                */
-               if (!S_ISREG(inode->i_mode))
-                       goto close_fail;
-               /*
-                * Dont allow local users get cute and trick others to coredump
-                * into their pre-created files.
-                */
-               if (!uid_eq(inode->i_uid, current_fsuid()))
-                       goto close_fail;
-               if (!cprm.file->f_op || !cprm.file->f_op->write)
-                       goto close_fail;
-               if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
-                       goto close_fail;
-       }
-
-       retval = binfmt->core_dump(&cprm);
-       if (retval)
-               current->signal->group_exit_code |= 0x80;
-
-       if (ispipe && core_pipe_limit)
-               wait_for_dump_helpers(cprm.file);
-close_fail:
-       if (cprm.file)
-               filp_close(cprm.file, NULL);
-fail_dropcount:
-       if (ispipe)
-               atomic_dec(&core_dump_count);
-fail_unlock:
-       kfree(cn.corename);
-fail_corename:
-       coredump_finish(mm);
-       revert_creds(old_cred);
-fail_creds:
-       put_cred(cred);
-fail:
-       return;
-}
-
-/*
- * Core dumping helper functions.  These are the only things you should
- * do on a core-file: use only these functions to write out all the
- * necessary info.
- */
-int dump_write(struct file *file, const void *addr, int nr)
-{
-       return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-EXPORT_SYMBOL(dump_write);
-
-int dump_seek(struct file *file, loff_t off)
-{
-       int ret = 1;
-
-       if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-               if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
-                       return 0;
-       } else {
-               char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
-               if (!buf)
-                       return 0;
-               while (off > 0) {
-                       unsigned long n = off;
-
-                       if (n > PAGE_SIZE)
-                               n = PAGE_SIZE;
-                       if (!dump_write(file, buf, n)) {
-                               ret = 0;
-                               break;
-                       }
-                       off -= n;
-               }
-               free_page((unsigned long)buf);
-       }
-       return ret;
-}
-EXPORT_SYMBOL(dump_seek);
-
 #ifdef __ARCH_WANT_SYS_EXECVE
 SYSCALL_DEFINE3(execve,
                const char __user *, filename,
index dde41a75c7c8dbd36597272a13f5c6f3487507d3..59e3bbfac0b17af51d9101b0d35dc6414ad53187 100644 (file)
@@ -206,6 +206,11 @@ static int init_inodecache(void)
  */
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(exofs_inode_cachep);
 }
 
index af74d9e27b71b0cf569cac645e362c174c44389e..6c205d0c565b2595fe837eab5cf789d995df2d18 100644 (file)
@@ -206,6 +206,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ext2_inode_cachep);
 }
 
index 504fb3f514200864d63b3ea799cb0aabf9b93b02..0e17142854042ef1337a5a7eebde9bbce55c9ccd 100644 (file)
@@ -532,6 +532,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ext3_inode_cachep);
 }
 
index 3b0e3bdaabfc08ab566f40fe1d91bad4b9edc469..04e89676394b9d79b5e2f6ad6f9bd12d4edb9bc4 100644 (file)
@@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = ext4_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &ext4_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
index cb918d26e3cc9092c9f301db07766d2fd9102f67..ce8c050fa14f6fac5a19230c4687fdf92fdfe78b 100644 (file)
@@ -1017,6 +1017,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ext4_inode_cachep);
 }
 
index e06190322c1c8af1544034c1476f3c779b165318..964b634f6667ca89a3427ee453a016285aec8cf8 100644 (file)
@@ -6,6 +6,6 @@ obj-$(CONFIG_FAT_FS) += fat.o
 obj-$(CONFIG_VFAT_FS) += vfat.o
 obj-$(CONFIG_MSDOS_FS) += msdos.o
 
-fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o
 vfat-y := namei_vfat.o
 msdos-y := namei_msdos.o
index 1cc7038e273de02bb404c91623bec8f65cd85288..91ad9e1c94417a813c1661af830bc2ecbc962033 100644 (file)
@@ -190,7 +190,8 @@ static void __fat_cache_inval_inode(struct inode *inode)
        struct fat_cache *cache;
 
        while (!list_empty(&i->cache_lru)) {
-               cache = list_entry(i->cache_lru.next, struct fat_cache, cache_list);
+               cache = list_entry(i->cache_lru.next,
+                                  struct fat_cache, cache_list);
                list_del_init(&cache->cache_list);
                i->nr_caches--;
                fat_cache_free(cache);
@@ -261,9 +262,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
                if (nr < 0)
                        goto out;
                else if (nr == FAT_ENT_FREE) {
-                       fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
-                                              " (i_pos %lld)", __func__,
-                                              MSDOS_I(inode)->i_pos);
+                       fat_fs_error_ratelimit(sb,
+                                      "%s: invalid cluster chain (i_pos %lld)",
+                                      __func__,
+                                      MSDOS_I(inode)->i_pos);
                        nr = -EIO;
                        goto out;
                } else if (nr == FAT_ENT_EOF) {
index dc49ed2cbffa66af9407f89886d8d84cf2dc9d0f..bca6d0a1255ecab0a1a8ec2b86032296a7a49e13 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/time.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include "fat.h"
 
@@ -123,7 +123,8 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
 {
        /* Fast stuff first */
        if (*bh && *de &&
-           (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
+          (*de - (struct msdos_dir_entry *)(*bh)->b_data) <
+                               MSDOS_SB(dir->i_sb)->dir_per_block - 1) {
                *pos += sizeof(struct msdos_dir_entry);
                (*de)++;
                return 0;
@@ -155,7 +156,8 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
 
        while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
                ec = *ip++;
-               if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+               charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE);
+               if (charlen > 0) {
                        op += charlen;
                        len -= charlen;
                } else {
@@ -172,12 +174,12 @@ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
        }
 
        if (unlikely(*ip)) {
-               fat_msg(sb, KERN_WARNING, "filename was truncated while "
-                       "converting.");
+               fat_msg(sb, KERN_WARNING,
+                       "filename was truncated while converting.");
        }
 
        *op = 0;
-       return (op - ascii);
+       return op - ascii;
 }
 
 static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
@@ -205,7 +207,8 @@ fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
 }
 
 static inline int
-fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
+fat_short2lower_uni(struct nls_table *t, unsigned char *c,
+                   int clen, wchar_t *uni)
 {
        int charlen;
        wchar_t wc;
@@ -220,7 +223,8 @@ fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *un
                if (!nc)
                        nc = *c;
 
-               if ( (charlen = t->char2uni(&nc, 1, uni)) < 0) {
+               charlen = t->char2uni(&nc, 1, uni);
+               if (charlen < 0) {
                        *uni = 0x003f;  /* a question mark */
                        charlen = 1;
                }
@@ -537,7 +541,6 @@ end_of_dir:
 
        return err;
 }
-
 EXPORT_SYMBOL_GPL(fat_search_long);
 
 struct fat_ioctl_filldir_callback {
@@ -574,7 +577,8 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
        /* Fake . and .. for the root directory. */
        if (inode->i_ino == MSDOS_ROOT_INO) {
                while (cpos < 2) {
-                       if (filldir(dirent, "..", cpos+1, cpos, MSDOS_ROOT_INO, DT_DIR) < 0)
+                       if (filldir(dirent, "..", cpos+1, cpos,
+                                   MSDOS_ROOT_INO, DT_DIR) < 0)
                                goto out;
                        cpos++;
                        filp->f_pos++;
@@ -872,25 +876,26 @@ static int fat_get_short_entry(struct inode *dir, loff_t *pos,
 }
 
 /*
- * The ".." entry can not provide the "struct fat_slot_info" informations
- * for inode. So, this function provide the some informations only.
+ * The ".." entry can not provide the "struct fat_slot_info" information
+ * for inode, nor a usable i_pos. So, this function provides some information
+ * only.
+ *
+ * Since this function walks through the on-disk inodes within a directory,
+ * callers are responsible for taking any locks necessary to prevent the
+ * directory from changing.
  */
 int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-                        struct msdos_dir_entry **de, loff_t *i_pos)
+                        struct msdos_dir_entry **de)
 {
-       loff_t offset;
+       loff_t offset = 0;
 
-       offset = 0;
-       *bh = NULL;
+       *de = NULL;
        while (fat_get_short_entry(dir, &offset, bh, de) >= 0) {
-               if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) {
-                       *i_pos = fat_make_i_pos(dir->i_sb, *bh, *de);
+               if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME))
                        return 0;
-               }
        }
        return -ENOENT;
 }
-
 EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);
 
 /* See if directory is empty */
@@ -913,7 +918,6 @@ int fat_dir_empty(struct inode *dir)
        brelse(bh);
        return result;
 }
-
 EXPORT_SYMBOL_GPL(fat_dir_empty);
 
 /*
@@ -959,7 +963,6 @@ int fat_scan(struct inode *dir, const unsigned char *name,
        }
        return -ENOENT;
 }
-
 EXPORT_SYMBOL_GPL(fat_scan);
 
 static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
@@ -1047,7 +1050,6 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 
        return 0;
 }
-
 EXPORT_SYMBOL_GPL(fat_remove_entries);
 
 static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
@@ -1141,10 +1143,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
                de[0].ctime_cs = de[1].ctime_cs = 0;
                de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
        }
-       de[0].start = cpu_to_le16(cluster);
-       de[0].starthi = cpu_to_le16(cluster >> 16);
-       de[1].start = cpu_to_le16(MSDOS_I(dir)->i_logstart);
-       de[1].starthi = cpu_to_le16(MSDOS_I(dir)->i_logstart >> 16);
+       fat_set_start(&de[0], cluster);
+       fat_set_start(&de[1], MSDOS_I(dir)->i_logstart);
        de[0].size = de[1].size = 0;
        memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
        set_buffer_uptodate(bhs[0]);
@@ -1161,7 +1161,6 @@ error_free:
 error:
        return err;
 }
-
 EXPORT_SYMBOL_GPL(fat_alloc_new_dir);
 
 static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
@@ -1377,5 +1376,4 @@ error_remove:
                __fat_remove_entries(dir, pos, free_slots);
        return err;
 }
-
 EXPORT_SYMBOL_GPL(fat_add_entries);
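With i_pos gone from fat_get_dotdot_entry(), a caller that wants the parent directory must now resolve it by start cluster rather than by on-disk position. The sketch below shows one plausible shape for the new fat_get_parent() declared in fat.h; fat_dget() is assumed here to be the fs/fat/nfs.c helper that walks the new dir_hashtable by i_logstart, and it is not visible in this hunk:

static struct dentry *toy_fat_get_parent(struct dentry *child)
{
	struct super_block *sb = child->d_sb;
	struct buffer_head *bh = NULL;
	struct msdos_dir_entry *de;
	struct inode *parent = NULL;

	lock_super(sb);		/* keep the directory from changing */
	if (!fat_get_dotdot_entry(child->d_inode, &bh, &de)) {
		int logstart = le16_to_cpu(de->starthi) << 16 |
			       le16_to_cpu(de->start);

		parent = fat_dget(sb, logstart);	/* assumed helper */
	}
	brelse(bh);
	unlock_super(sb);
	return d_obtain_alias(parent);	/* handles parent == NULL */
}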
index 2deeeb86f331c8ad70d56300f1b5d4d5b155fe7b..76f036f9e1387c17e9b244de84d72ecb03033363 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/string.h>
 #include <linux/nls.h>
 #include <linux/fs.h>
+#include <linux/hash.h>
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
@@ -27,26 +28,27 @@ struct fat_mount_options {
        gid_t fs_gid;
        unsigned short fs_fmask;
        unsigned short fs_dmask;
-       unsigned short codepage;  /* Codepage for shortname conversions */
-       char *iocharset;          /* Charset used for filename input/display */
-       unsigned short shortname; /* flags for shortname display/create rule */
-       unsigned char name_check; /* r = relaxed, n = normal, s = strict */
-       unsigned char errors;     /* On error: continue, panic, remount-ro */
+       unsigned short codepage;   /* Codepage for shortname conversions */
+       char *iocharset;           /* Charset used for filename input/display */
+       unsigned short shortname;  /* flags for shortname display/create rule */
+       unsigned char name_check;  /* r = relaxed, n = normal, s = strict */
+       unsigned char errors;      /* On error: continue, panic, remount-ro */
        unsigned short allow_utime;/* permission for setting the [am]time */
-       unsigned quiet:1,         /* set = fake successful chmods and chowns */
-                showexec:1,      /* set = only set x bit for com/exe/bat */
-                sys_immutable:1, /* set = system files are immutable */
-                dotsOK:1,        /* set = hidden and system files are named '.filename' */
-                isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
-                utf8:1,          /* Use of UTF-8 character set (Default) */
-                unicode_xlate:1, /* create escape sequences for unhandled Unicode */
-                numtail:1,       /* Does first alias have a numeric '~1' type tail? */
-                flush:1,         /* write things quickly */
-                nocase:1,        /* Does this need case conversion? 0=need case conversion*/
-                usefree:1,       /* Use free_clusters for FAT32 */
-                tz_utc:1,        /* Filesystem timestamps are in UTC */
-                rodir:1,         /* allow ATTR_RO for directory */
-                discard:1;       /* Issue discard requests on deletions */
+       unsigned quiet:1,          /* set = fake successful chmods and chowns */
+                showexec:1,       /* set = only set x bit for com/exe/bat */
+                sys_immutable:1,  /* set = system files are immutable */
+                dotsOK:1,         /* set = hidden and system files are named '.filename' */
+                isvfat:1,         /* 0=no vfat long filename support, 1=vfat support */
+                utf8:1,           /* Use of UTF-8 character set (Default) */
+                unicode_xlate:1,  /* create escape sequences for unhandled Unicode */
+                numtail:1,        /* Does first alias have a numeric '~1' type tail? */
+                flush:1,          /* write things quickly */
+                nocase:1,         /* Does this need case conversion? 0=need case conversion*/
+                usefree:1,        /* Use free_clusters for FAT32 */
+                tz_utc:1,         /* Filesystem timestamps are in UTC */
+                rodir:1,          /* allow ATTR_RO for directory */
+                discard:1,        /* Issue discard requests on deletions */
+                nfs:1;            /* Do extra work needed for NFS export */
 };
 
 #define FAT_HASH_BITS  8
@@ -56,28 +58,28 @@ struct fat_mount_options {
  * MS-DOS file system in-core superblock data
  */
 struct msdos_sb_info {
-       unsigned short sec_per_clus; /* sectors/cluster */
-       unsigned short cluster_bits; /* log2(cluster_size) */
-       unsigned int cluster_size;   /* cluster size */
-       unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+       unsigned short sec_per_clus;  /* sectors/cluster */
+       unsigned short cluster_bits;  /* log2(cluster_size) */
+       unsigned int cluster_size;    /* cluster size */
+       unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */
        unsigned short fat_start;
-       unsigned long fat_length;    /* FAT start & length (sec.) */
+       unsigned long fat_length;     /* FAT start & length (sec.) */
        unsigned long dir_start;
-       unsigned short dir_entries;  /* root dir start & entries */
-       unsigned long data_start;    /* first data sector */
-       unsigned long max_cluster;   /* maximum cluster number */
-       unsigned long root_cluster;  /* first cluster of the root directory */
-       unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+       unsigned short dir_entries;   /* root dir start & entries */
+       unsigned long data_start;     /* first data sector */
+       unsigned long max_cluster;    /* maximum cluster number */
+       unsigned long root_cluster;   /* first cluster of the root directory */
+       unsigned long fsinfo_sector;  /* sector number of FAT32 fsinfo */
        struct mutex fat_lock;
-       unsigned int prev_free;      /* previously allocated cluster number */
-       unsigned int free_clusters;  /* -1 if undefined */
+       unsigned int prev_free;       /* previously allocated cluster number */
+       unsigned int free_clusters;   /* -1 if undefined */
        unsigned int free_clus_valid; /* is free_clusters valid? */
        struct fat_mount_options options;
-       struct nls_table *nls_disk;  /* Codepage used on disk */
-       struct nls_table *nls_io;    /* Charset used for input and display */
-       const void *dir_ops;                 /* Opaque; default directory operations */
-       int dir_per_block;           /* dir entries per block */
-       int dir_per_block_bits;      /* log2(dir_per_block) */
+       struct nls_table *nls_disk;   /* Codepage used on disk */
+       struct nls_table *nls_io;     /* Charset used for input and display */
+       const void *dir_ops;          /* Opaque; default directory operations */
+       int dir_per_block;            /* dir entries per block */
+       int dir_per_block_bits;       /* log2(dir_per_block) */
 
        int fatent_shift;
        struct fatent_operations *fatent_ops;
@@ -88,6 +90,9 @@ struct msdos_sb_info {
 
        spinlock_t inode_hash_lock;
        struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+
+       spinlock_t dir_hash_lock;
+       struct hlist_head dir_hashtable[FAT_HASH_SIZE];
 };
 
 #define FAT_CACHE_VALID        0       /* special case for valid cache */
@@ -110,6 +115,7 @@ struct msdos_inode_info {
        int i_attrs;            /* unused attribute bits */
        loff_t i_pos;           /* on-disk position of directory entry or 0 */
        struct hlist_node i_fat_hash;   /* hash by i_location */
+       struct hlist_node i_dir_hash;   /* hash by i_logstart */
        struct rw_semaphore truncate_lock; /* protect bmap against truncate */
        struct inode vfs_inode;
 };
@@ -262,7 +268,7 @@ extern int fat_subdirs(struct inode *dir);
 extern int fat_scan(struct inode *dir, const unsigned char *name,
                    struct fat_slot_info *sinfo);
 extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-                               struct msdos_dir_entry **de, loff_t *i_pos);
+                               struct msdos_dir_entry **de);
 extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
 extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
                           struct fat_slot_info *sinfo);
@@ -322,7 +328,7 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
                              unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern int fat_setattr(struct dentry *dentry, struct iattr *attr);
 extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
                       struct kstat *stat);
@@ -340,7 +346,12 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
                          int isvfat, void (*setup)(struct super_block *));
 
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
-                           struct inode *i2);
+                           struct inode *i2);
+static inline unsigned long fat_dir_hash(int logstart)
+{
+       return hash_32(logstart, FAT_HASH_BITS);
+}
+
 /* fat/misc.c */
 extern __printf(3, 4) __cold
 void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
@@ -366,6 +377,14 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 int fat_cache_init(void);
 void fat_cache_destroy(void);
 
+/* fat/nfs.c */
+struct fid;
+extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                                      int fh_len, int fh_type);
+extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+                                      int fh_len, int fh_type);
+extern struct dentry *fat_get_parent(struct dentry *child_dir);
+
 /* helper for printk */
 typedef unsigned long long     llu;
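fat_dir_hash() above keys the new per-superblock dir_hashtable, and fat_attach()/fat_detach() (see the fs/fat/inode.c hunks below) populate it through the i_dir_hash node. A lookup is then an ordinary hlist walk keyed by i_logstart; this function name is illustrative, not part of the patch:

static struct inode *toy_dir_lookup(struct super_block *sb, int logstart)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	struct hlist_head *head = sbi->dir_hashtable + fat_dir_hash(logstart);
	struct hlist_node *pos;
	struct inode *inode = NULL;

	spin_lock(&sbi->dir_hash_lock);
	hlist_for_each(pos, head) {
		struct msdos_inode_info *i =
			hlist_entry(pos, struct msdos_inode_info, i_dir_hash);

		if (i->i_logstart != logstart)
			continue;
		inode = igrab(&i->vfs_inode);	/* may race with eviction */
		if (inode)
			break;
	}
	spin_unlock(&sbi->dir_hash_lock);
	return inode;
}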
 
index 31f08ab62c562d1926a75183c802793642cd390c..260705c58062cc425b8c656434883de98e1b9e2a 100644 (file)
@@ -186,9 +186,6 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
 
 static void fat32_ent_put(struct fat_entry *fatent, int new)
 {
-       if (new == FAT_ENT_EOF)
-               new = EOF_FAT32;
-
        WARN_ON(new & 0xf0000000);
        new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
        *fatent->u.ent32_p = cpu_to_le32(new);
@@ -203,15 +200,18 @@ static int fat12_ent_next(struct fat_entry *fatent)
 
        fatent->entry++;
        if (fatent->nr_bhs == 1) {
-               WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2)));
-               WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1)));
+               WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data +
+                                                       (bhs[0]->b_size - 2)));
+               WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data +
+                                                       (bhs[0]->b_size - 1)));
                if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) {
                        ent12_p[0] = nextp - 1;
                        ent12_p[1] = nextp;
                        return 1;
                }
        } else {
-               WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1)));
+               WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data +
+                                                       (bhs[0]->b_size - 1)));
                WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data);
                ent12_p[0] = nextp - 1;
                ent12_p[1] = nextp;
@@ -631,7 +631,6 @@ error:
 
        return err;
 }
-
 EXPORT_SYMBOL_GPL(fat_free_clusters);
 
 /* 128kb is the whole sectors for FAT12 and FAT16 */
index 05e897fe9866c49fb96fa09459240bf7a550586c..056297d794c37ebbc63b2b32fffb48479a76eb5f 100644 (file)
@@ -281,15 +281,42 @@ static inline unsigned long fat_hash(loff_t i_pos)
        return hash_32(i_pos, FAT_HASH_BITS);
 }
 
+static void dir_hash_init(struct super_block *sb)
+{
+       struct msdos_sb_info *sbi = MSDOS_SB(sb);
+       int i;
+
+       spin_lock_init(&sbi->dir_hash_lock);
+       for (i = 0; i < FAT_HASH_SIZE; i++)
+               INIT_HLIST_HEAD(&sbi->dir_hashtable[i]);
+}
+
 void fat_attach(struct inode *inode, loff_t i_pos)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-       struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 
-       spin_lock(&sbi->inode_hash_lock);
-       MSDOS_I(inode)->i_pos = i_pos;
-       hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
-       spin_unlock(&sbi->inode_hash_lock);
+       if (inode->i_ino != MSDOS_ROOT_INO) {
+               struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
+
+               spin_lock(&sbi->inode_hash_lock);
+               MSDOS_I(inode)->i_pos = i_pos;
+               hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
+               spin_unlock(&sbi->inode_hash_lock);
+       }
+
+       /* If NFS support is enabled, cache the mapping of start cluster
+        * to directory inode. This is used during reconnection of
+        * dentries to the filesystem root.
+        */
+       if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+               struct hlist_head *d_head = sbi->dir_hashtable;
+               d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart);
+
+               spin_lock(&sbi->dir_hash_lock);
+               hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head);
+               spin_unlock(&sbi->dir_hash_lock);
+       }
 }
 EXPORT_SYMBOL_GPL(fat_attach);
 
@@ -300,6 +327,12 @@ void fat_detach(struct inode *inode)
        MSDOS_I(inode)->i_pos = 0;
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);
+
+       if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
+               spin_lock(&sbi->dir_hash_lock);
+               hlist_del_init(&MSDOS_I(inode)->i_dir_hash);
+               spin_unlock(&sbi->dir_hash_lock);
+       }
 }
 EXPORT_SYMBOL_GPL(fat_detach);
 
@@ -504,6 +537,7 @@ static void init_once(void *foo)
        ei->cache_valid_id = FAT_CACHE_VALID + 1;
        INIT_LIST_HEAD(&ei->cache_lru);
        INIT_HLIST_NODE(&ei->i_fat_hash);
+       INIT_HLIST_NODE(&ei->i_dir_hash);
        inode_init_once(&ei->vfs_inode);
 }
 
@@ -521,6 +555,11 @@ static int __init fat_init_inodecache(void)
 
 static void __exit fat_destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(fat_inode_cachep);
 }
 
@@ -663,125 +702,9 @@ static const struct super_operations fat_sops = {
        .show_options   = fat_show_options,
 };
 
-/*
- * a FAT file handle with fhtype 3 is
- *  0/  i_ino - for fast, reliable lookup if still in the cache
- *  1/  i_generation - to see if i_ino is still valid
- *          bit 0 == 0 iff directory
- *  2/  i_pos(8-39) - if ino has changed, but still in cache
- *  3/  i_pos(4-7)|i_logstart - to semi-verify inode found at i_pos
- *  4/  i_pos(0-3)|parent->i_logstart - maybe used to hunt for the file on disc
- *
- * Hack for NFSv2: Maximum FAT entry number is 28bits and maximum
- * i_pos is 40bits (blocknr(32) + dir offset(8)), so two 4bits
- * of i_logstart is used to store the directory entry offset.
- */
-
-static struct dentry *fat_fh_to_dentry(struct super_block *sb,
-               struct fid *fid, int fh_len, int fh_type)
-{
-       struct inode *inode = NULL;
-       u32 *fh = fid->raw;
-
-       if (fh_len < 5 || fh_type != 3)
-               return NULL;
-
-       inode = ilookup(sb, fh[0]);
-       if (!inode || inode->i_generation != fh[1]) {
-               if (inode)
-                       iput(inode);
-               inode = NULL;
-       }
-       if (!inode) {
-               loff_t i_pos;
-               int i_logstart = fh[3] & 0x0fffffff;
-
-               i_pos = (loff_t)fh[2] << 8;
-               i_pos |= ((fh[3] >> 24) & 0xf0) | (fh[4] >> 28);
-
-               /* try 2 - see if i_pos is in F-d-c
-                * require i_logstart to be the same
-                * Will fail if you truncate and then re-write
-                */
-
-               inode = fat_iget(sb, i_pos);
-               if (inode && MSDOS_I(inode)->i_logstart != i_logstart) {
-                       iput(inode);
-                       inode = NULL;
-               }
-       }
-
-       /*
-        * For now, do nothing if the inode is not found.
-        *
-        * What we could do is:
-        *
-        *      - follow the file starting at fh[4], and record the ".." entry,
-        *        and the name of the fh[2] entry.
-        *      - then follow the ".." file finding the next step up.
-        *
-        * This way we build a path to the root of the tree. If this works, we
-        * lookup the path and so get this inode into the cache.  Finally try
-        * the fat_iget lookup again.  If that fails, then we are totally out
-        * of luck.  But all that is for another day
-        */
-       return d_obtain_alias(inode);
-}
-
-static int
-fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
-{
-       int len = *lenp;
-       struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-       loff_t i_pos;
-
-       if (len < 5) {
-               *lenp = 5;
-               return 255; /* no room */
-       }
-
-       i_pos = fat_i_pos_read(sbi, inode);
-       *lenp = 5;
-       fh[0] = inode->i_ino;
-       fh[1] = inode->i_generation;
-       fh[2] = i_pos >> 8;
-       fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart;
-       fh[4] = (i_pos & 0x0f) << 28;
-       if (parent)
-               fh[4] |= MSDOS_I(parent)->i_logstart;
-       return 3;
-}
-
-static struct dentry *fat_get_parent(struct dentry *child)
-{
-       struct super_block *sb = child->d_sb;
-       struct buffer_head *bh;
-       struct msdos_dir_entry *de;
-       loff_t i_pos;
-       struct dentry *parent;
-       struct inode *inode;
-       int err;
-
-       lock_super(sb);
-
-       err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
-       if (err) {
-               parent = ERR_PTR(err);
-               goto out;
-       }
-       inode = fat_build_inode(sb, de, i_pos);
-       brelse(bh);
-
-       parent = d_obtain_alias(inode);
-out:
-       unlock_super(sb);
-
-       return parent;
-}
-
 static const struct export_operations fat_export_ops = {
-       .encode_fh      = fat_encode_fh,
        .fh_to_dentry   = fat_fh_to_dentry,
+       .fh_to_parent   = fat_fh_to_parent,
        .get_parent     = fat_get_parent,
 };
 
@@ -829,6 +752,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
                seq_puts(m, ",usefree");
        if (opts->quiet)
                seq_puts(m, ",quiet");
+       if (opts->nfs)
+               seq_puts(m, ",nfs");
        if (opts->showexec)
                seq_puts(m, ",showexec");
        if (opts->sys_immutable)
@@ -873,7 +798,7 @@ enum {
        Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
        Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-       Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
+       Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -902,6 +827,7 @@ static const match_table_t fat_tokens = {
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
        {Opt_discard, "discard"},
+       {Opt_nfs, "nfs"},
        {Opt_obsolete, "conv=binary"},
        {Opt_obsolete, "conv=text"},
        {Opt_obsolete, "conv=auto"},
@@ -982,6 +908,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
        opts->numtail = 1;
        opts->usefree = opts->nocase = 0;
        opts->tz_utc = 0;
+       opts->nfs = 0;
        opts->errors = FAT_ERRORS_RO;
        *debug = 0;
 
@@ -1142,6 +1069,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
                case Opt_discard:
                        opts->discard = 1;
                        break;
+               case Opt_nfs:
+                       opts->nfs = 1;
+                       break;
 
                /* obsolete mount options */
                case Opt_obsolete:
@@ -1432,6 +1362,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 
        /* set up enough so that it can read an inode */
        fat_hash_init(sb);
+       dir_hash_init(sb);
        fat_ent_access_init(sb);
 
        /*
@@ -1486,6 +1417,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
        }
        error = -ENOMEM;
        insert_inode_hash(root_inode);
+       fat_attach(root_inode, 0);
        sb->s_root = d_make_root(root_inode);
        if (!sb->s_root) {
                fat_msg(sb, KERN_ERR, "get root inode failed");
@@ -1525,18 +1457,14 @@ static int writeback_inode(struct inode *inode)
 {
 
        int ret;
-       struct address_space *mapping = inode->i_mapping;
-       struct writeback_control wbc = {
-              .sync_mode = WB_SYNC_NONE,
-             .nr_to_write = 0,
-       };
-       /* if we used WB_SYNC_ALL, sync_inode waits for the io for the
-       * inode to finish.  So WB_SYNC_NONE is sent down to sync_inode
+
+       /* If we used wait=1, sync_inode_metadata() would wait for the io on
+       * the inode to finish.  So wait=0 is sent down to sync_inode_metadata()
        * and filemap_fdatawrite is used for the data blocks
        */
-       ret = sync_inode(inode, &wbc);
+       ret = sync_inode_metadata(inode, 0);
        if (!ret)
-              ret = filemap_fdatawrite(mapping);
+               ret = filemap_fdatawrite(inode->i_mapping);
        return ret;
 }
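
sync_inode_metadata() encapsulates exactly the writeback_control that was open-coded here, which is what lets the deleted lines go. For reference, its implementation in fs/fs-writeback.c is essentially:

    int sync_inode_metadata(struct inode *inode, int wait)
    {
            struct writeback_control wbc = {
                    .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
                    .nr_to_write = 0, /* metadata-only */
            };

            return sync_inode(inode, &wbc);
    }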
 
index b0e12bf9f4a1c0fc32dc42facffaf84ff75fe880..c1055e778fff51a980105797ad82ce4c28005691 100644 (file)
@@ -407,7 +407,7 @@ out:
 static int msdos_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
-       struct super_block *sb= inode->i_sb;
+       struct super_block *sb = inode->i_sb;
        struct fat_slot_info sinfo;
        int err;
 
@@ -440,7 +440,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
        struct inode *old_inode, *new_inode;
        struct fat_slot_info old_sinfo, sinfo;
        struct timespec ts;
-       loff_t dotdot_i_pos, new_i_pos;
+       loff_t new_i_pos;
        int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
 
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -456,8 +456,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
        is_dir = S_ISDIR(old_inode->i_mode);
        update_dotdot = (is_dir && old_dir != new_dir);
        if (update_dotdot) {
-               if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-                                        &dotdot_i_pos) < 0) {
+               if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
                        err = -EIO;
                        goto out;
                }
index 6a6d8c0715a1c16daf2b1ac53e8d461eb2253810..e535dd75b986779f8910831c8bafba44ab2a5e71 100644 (file)
@@ -914,7 +914,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode, *new_inode;
        struct fat_slot_info old_sinfo, sinfo;
        struct timespec ts;
-       loff_t dotdot_i_pos, new_i_pos;
+       loff_t new_i_pos;
        int err, is_dir, update_dotdot, corrupt = 0;
        struct super_block *sb = old_dir->i_sb;
 
@@ -929,8 +929,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        is_dir = S_ISDIR(old_inode->i_mode);
        update_dotdot = (is_dir && old_dir != new_dir);
        if (update_dotdot) {
-               if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-                                        &dotdot_i_pos) < 0) {
+               if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
                        err = -EIO;
                        goto out;
                }
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
new file mode 100644 (file)
index 0000000..ef4b5fa
--- /dev/null
@@ -0,0 +1,101 @@
+/* fs/fat/nfs.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/exportfs.h>
+#include "fat.h"
+
+/*
+ * Look up a directory inode given its starting cluster.
+ */
+static struct inode *fat_dget(struct super_block *sb, int i_logstart)
+{
+       struct msdos_sb_info *sbi = MSDOS_SB(sb);
+       struct hlist_head *head;
+       struct hlist_node *_p;
+       struct msdos_inode_info *i;
+       struct inode *inode = NULL;
+
+       head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
+       spin_lock(&sbi->dir_hash_lock);
+       hlist_for_each_entry(i, _p, head, i_dir_hash) {
+               BUG_ON(i->vfs_inode.i_sb != sb);
+               if (i->i_logstart != i_logstart)
+                       continue;
+               inode = igrab(&i->vfs_inode);
+               if (inode)
+                       break;
+       }
+       spin_unlock(&sbi->dir_hash_lock);
+       return inode;
+}
+
+static struct inode *fat_nfs_get_inode(struct super_block *sb,
+                                      u64 ino, u32 generation)
+{
+       struct inode *inode;
+
+       if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
+               return NULL;
+
+       inode = ilookup(sb, ino);
+       if (inode && generation && (inode->i_generation != generation)) {
+               iput(inode);
+               inode = NULL;
+       }
+
+       return inode;
+}
+
+/*
+ * Map an NFS file handle to a corresponding dentry.
+ * The dentry may or may not be connected to the filesystem root.
+ */
+struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                               int fh_len, int fh_type)
+{
+       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                   fat_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a file specified by NFS handle.
+ * This requires that the handle contain the i_ino of the parent.
+ */
+struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+                               int fh_len, int fh_type)
+{
+       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                   fat_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a directory that is not currently connected to
+ * the filesystem root.
+ *
+ * On entry, the caller holds child_dir->d_inode->i_mutex.
+ */
+struct dentry *fat_get_parent(struct dentry *child_dir)
+{
+       struct super_block *sb = child_dir->d_sb;
+       struct buffer_head *bh = NULL;
+       struct msdos_dir_entry *de;
+       struct inode *parent_inode = NULL;
+
+       if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
+               int parent_logstart = fat_get_start(MSDOS_SB(sb), de);
+               parent_inode = fat_dget(sb, parent_logstart);
+       }
+       brelse(bh);
+
+       return d_obtain_alias(parent_inode);
+}
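
Both helpers delegate to the generic INO32 handle code, so a FAT export now hands out plain (i_ino, i_generation) handles instead of the five-word i_pos encoding deleted from inode.c above; the 'nfs' mount option added there additionally keeps the start-cluster hash populated so fat_get_parent() can reconnect directories. Roughly what generic_fh_to_dentry() does once fat_nfs_get_inode() is plugged in (sketch; error handling trimmed):

    static struct dentry *decode(struct super_block *sb, struct fid *fid,
                                 int fh_len, int fh_type)
    {
            struct inode *inode = NULL;

            if (fh_len < 2)
                    return NULL;

            switch (fh_type) {
            case FILEID_INO32_GEN:
            case FILEID_INO32_GEN_PARENT:
                    /* word 0: i_ino, word 1: i_generation */
                    inode = fat_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
                    break;
            }
            return d_obtain_alias(inode);
    }
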
index d4fabd26084ed8340e772e3699faa2e9e239da91..fed2c8afb3a9f401945ca5d8a226a4df5ebb4c9f 100644 (file)
@@ -279,6 +279,11 @@ static void __exit
 vxfs_cleanup(void)
 {
        unregister_filesystem(&vxfs_fs_type);
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(vxfs_inode_cachep);
 }
 
index aba15f1b7ad2974aa85e2c7272afc3d7836a40bc..78d2837bc940292aa757aba68f84fa18fb4057da 100644 (file)
@@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
        .close          = fuse_vma_close,
        .fault          = filemap_fault,
        .page_mkwrite   = fuse_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
index 407392d88aaf8ec6673057ad623856e0fe619f19..11c08189d06b733330830a32f720584cdaeb8ec2 100644 (file)
@@ -1200,6 +1200,12 @@ static void fuse_fs_cleanup(void)
 {
        unregister_filesystem(&fuse_fs_type);
        unregister_fuseblk();
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(fuse_inode_cachep);
 }
 
index 30e21997a1a1323963e6de594482d3a4f9e74878..0def0504afc1816ae40b55de02c07aa68cc3a003 100644 (file)
@@ -492,6 +492,7 @@ out:
 static const struct vm_operations_struct gfs2_vm_ops = {
        .fault = filemap_fault,
        .page_mkwrite = gfs2_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 /**
@@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
                        return error;
        }
        vma->vm_ops = &gfs2_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
 
        return 0;
 }
index 4eb873e0c07b137c0225f30b20fdf78bfb0c0006..941d7a8c2197dd17eb0ab3b7ade3211f75a14a57 100644 (file)
@@ -482,6 +482,12 @@ static int __init init_hfs_fs(void)
 static void __exit exit_hfs_fs(void)
 {
        unregister_filesystem(&hfs_fs_type);
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(hfs_inode_cachep);
 }
 
index fdafb2d71654740776bd1b77931b955da350ecad..811a84d2d9643677832219a0b960c6d42109480e 100644 (file)
@@ -635,6 +635,12 @@ static int __init init_hfsplus_fs(void)
 static void __exit exit_hfsplus_fs(void)
 {
        unregister_filesystem(&hfsplus_fs_type);
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(hfsplus_inode_cachep);
 }
 
index 706a12c083ea726a7a268d647ae266b02a3a2ca7..3cb1da56eb73438bf6137405e7ec9997399e1d96 100644 (file)
@@ -210,6 +210,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(hpfs_inode_cachep);
 }
 
index 8349a899912e5c47ca26c66df1be66c684bdce4d..b00385be5df0a91a3d7d7c343b1f9fc16f27f65f 100644 (file)
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * way when do_mmap_pgoff unwinds (may be important on powerpc
         * and ia64).
         */
-       vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
+       vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &hugetlb_vm_ops;
 
        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
@@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 }
 
 static inline void
-hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
+hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
 {
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
 
-       vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) {
+       vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
                unsigned long v_offset;
 
                /*
                 * Can the expression below overflow on 32-bit arches?
-                * No, because the prio_tree returns us only those vmas
+                * No, because the interval tree returns us only those vmas
                 * which overlap the truncated area starting at pgoff,
                 * and no vma on a 32-bit arch can span beyond the 4GB.
                 */
@@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 
        i_size_write(inode, offset);
        mutex_lock(&mapping->i_mmap_mutex);
-       if (!prio_tree_empty(&mapping->i_mmap))
+       if (!RB_EMPTY_ROOT(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
        mutex_unlock(&mapping->i_mmap_mutex);
        truncate_hugepages(inode, offset);
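
The interval tree keeps the prio_tree's contract of returning only vmas that overlap the range starting at pgoff, so the overflow comment above still holds. The loop body elided by that hunk clamps the unmap start to the truncation point, roughly:

    if (vma->vm_pgoff < pgoff)
            v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
    else
            v_offset = 0;
    unmap_hugepage_range(vma, vma->vm_start + v_offset, vma->vm_end, NULL);

(as in the surrounding hugetlbfs code; shown here only because the hunk cuts it off).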
@@ -1042,6 +1041,11 @@ static int __init init_hugetlbfs_fs(void)
 
 static void __exit exit_hugetlbfs_fs(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(hugetlbfs_inode_cachep);
        kern_unmount(hugetlbfs_vfsmount);
        unregister_filesystem(&hugetlbfs_fs_type);
index ac8d904b3f1624bfa945ba088d2909e740e16a6c..b03c7195724685e74cba0275adaa718755eb5455 100644 (file)
@@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping)
        mutex_init(&mapping->i_mmap_mutex);
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
-       INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+       mapping->i_mmap = RB_ROOT;
        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
 }
 EXPORT_SYMBOL(address_space_init_once);
index 29037c365ba4f4807eca405680def2c78353f966..f94cde4527e880078ed2dd20f1148f0c030eaa7f 100644 (file)
@@ -114,6 +114,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(isofs_inode_cachep);
 }
 
index 1ea349fff68b625389c5647f03094ef56f5e4262..ae81b01e6fd7ebe8291cfcba8b419b245a6a2310 100644 (file)
@@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 }
 
 /* Trivial function to remove the last node in the tree. Which by definition
-   has no right-hand -- so can be removed just by making its only child (if
-   any) take its place under its parent. */
+   has no right-hand child -- so can be removed just by making its left-hand
+   child (if any) take its place under its parent. Since this is only done
+   when we're consuming the whole tree, there's no need to use rb_erase()
+   and let it worry about adjusting colours and balancing the tree. That
+   would just be a waste of time. */
 static void eat_last(struct rb_root *root, struct rb_node *node)
 {
        struct rb_node *parent = rb_parent(node);
@@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node)
                link = &parent->rb_right;
 
        *link = node->rb_left;
-       /* Colour doesn't matter now. Only the parent pointer. */
        if (node->rb_left)
-               node->rb_left->rb_parent_color = node->rb_parent_color;
+               node->rb_left->__rb_parent_color = node->__rb_parent_color;
 }
 
-/* We put this in reverse order, so we can just use eat_last */
+/* We put the version tree in reverse order, so we can use the same eat_last()
+   function that we use to consume the tmpnode tree (tn_root). */
 static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn)
 {
        struct rb_node **link = &ver_root->rb_node;
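
The expanded comment captures why eat_last() can skip rb_erase(): the tree is consumed strictly back-to-front, so the victim never has a right child, and splicing in its left child keeps the structure valid for the remaining deletions. A sketch of the consume loop it pairs with (illustrative only; the real consumers live elsewhere in this file):

    struct rb_node *node;

    while ((node = rb_last(root))) {
            struct jffs2_tmp_dnode_info *tn =
                    rb_entry(node, struct jffs2_tmp_dnode_info, rb);

            eat_last(root, node);   /* O(1), no rebalancing */
            /* ... use tn, then free it ... */
    }
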
index 1224d6b48e7e5270830928e323da026a7fe33c41..d3d8799e2187233e23e1614990443203d4fff1ad 100644 (file)
@@ -422,6 +422,12 @@ static void __exit exit_jffs2_fs(void)
        unregister_filesystem(&jffs2_fs_type);
        jffs2_destroy_slab_caches();
        jffs2_compressors_exit();
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(jffs2_inode_cachep);
 }
 
index 6f4ac1c070f0118d8dbda1a9405a2abd4f6ad426..0161b326e516eeda314db86cf6b4a0cece8683a2 100644 (file)
@@ -944,6 +944,12 @@ static void __exit exit_jfs_fs(void)
        jfs_proc_clean();
 #endif
        unregister_filesystem(&jfs_fs_type);
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(jfs_inode_cachep);
 }
 
index 6984562738d36bc4142a3e0556730ae9e3bf3a57..121bba2cf6f2cff56f7e2f90fcbe204d02e52fd1 100644 (file)
@@ -417,5 +417,10 @@ int logfs_init_inode_cache(void)
 
 void logfs_destroy_inode_cache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(logfs_inode_cache);
 }
index 2a503ad020d5da4bd1de5442e306f5b98a0b02d9..dc8d3629c20a39636f0dfc5f9b462d333fde80c0 100644 (file)
@@ -100,6 +100,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(minix_inode_cachep);
 }
 
index eaa74323663aa2b769971a279045734fdcee7f82..d7e9fe77188a6869073bae3daa8c5a731d2a1edb 100644 (file)
@@ -89,6 +89,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ncp_inode_cachep);
 }
 
index 6a7fcab7ecb3115c7630573c17f4d8285a418591..f692be97676d264d8e9100df4306e9e9fb021a7f 100644 (file)
@@ -578,6 +578,7 @@ out:
 static const struct vm_operations_struct nfs_file_vm_ops = {
        .fault = filemap_fault,
        .page_mkwrite = nfs_vm_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 static int nfs_need_sync_write(struct file *filp, struct inode *inode)
index 9b47610338f59f03f6b4fdc0280d6aa61c266d4f..e4c716d374a86b16352f539a1cdd364c652986ad 100644 (file)
@@ -1571,6 +1571,11 @@ static int __init nfs_init_inodecache(void)
 
 static void nfs_destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(nfs_inode_cachep);
 }
 
index a4d56ac02e6cf075b81dfaee284175d86776219d..185b8e62d639a36e7128aec438e482496d5ebd5b 100644 (file)
@@ -134,13 +134,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct nilfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = nilfs_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
        file_accessed(file);
        vma->vm_ops = &nilfs_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
index 6a10812711c1d37bca6660530cd6c34cbb7b30fb..3c991dc84f2f2df6f8c9dccd922d8edbf1ff32a5 100644 (file)
@@ -1382,6 +1382,12 @@ static void nilfs_segbuf_init_once(void *obj)
 
 static void nilfs_destroy_cachep(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
+
        if (nilfs_inode_cachep)
                kmem_cache_destroy(nilfs_inode_cachep);
        if (nilfs_transaction_cachep)
index 2bc149d6a784e74ba485216d888ccd95f12e9516..fe08d4afa10636c24d1a9085397aa10b5c9d6f80 100644 (file)
@@ -3168,6 +3168,12 @@ static void __exit exit_ntfs_fs(void)
        ntfs_debug("Unregistering NTFS driver.");
 
        unregister_filesystem(&ntfs_fs_type);
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ntfs_big_inode_cache);
        kmem_cache_destroy(ntfs_inode_cache);
        kmem_cache_destroy(ntfs_name_cache);
index 83b6f98e0665433bda36a2f1d4f34a1ce7bec4cd..16b712d260d4fcb628fa67e13f1684b9239698d6 100644 (file)
@@ -691,6 +691,11 @@ static void __exit exit_dlmfs_fs(void)
        flush_workqueue(user_dlm_worker);
        destroy_workqueue(user_dlm_worker);
 
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(dlmfs_inode_cache);
 
        bdi_destroy(&dlmfs_backing_dev_info);
index d150372fd81da7967eb3a8f3d2a3b0d9c8c41c82..47a87dda54ce38a5a911099256775f01ff4eb5f5 100644 (file)
@@ -173,6 +173,7 @@ out:
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
        .fault          = ocfs2_fault,
        .page_mkwrite   = ocfs2_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
        ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
 out:
        vma->vm_ops = &ocfs2_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
index 68f4541c2db98b26a3aba49234c721e56d56e3c3..0e91ec22a940ea1488bab00191b8f54356578bd5 100644 (file)
@@ -1818,6 +1818,11 @@ static int ocfs2_initialize_mem_caches(void)
 
 static void ocfs2_free_mem_caches(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        if (ocfs2_inode_cachep)
                kmem_cache_destroy(ocfs2_inode_cachep);
        ocfs2_inode_cachep = NULL;
index 4a3477949bca6da23a13161708e0ee7168fdb654..2ad080faca34e38d2ac8c5aed09a33135cae7d29 100644 (file)
@@ -463,6 +463,11 @@ static int __init init_openprom_fs(void)
 static void __exit exit_openprom_fs(void)
 {
        unregister_filesystem(&openprom_fs_type);
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(op_inode_cachep);
 }
 
index 1b6c84cbdb732e5684ccaa823548b8780cf1c16d..21fb23050698a0700bf41cd65edac583fbeaa6fe 100644 (file)
@@ -874,111 +874,6 @@ static const struct file_operations proc_environ_operations = {
        .release        = mem_release,
 };
 
-static ssize_t oom_adjust_read(struct file *file, char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-       char buffer[PROC_NUMBUF];
-       size_t len;
-       int oom_adjust = OOM_DISABLE;
-       unsigned long flags;
-
-       if (!task)
-               return -ESRCH;
-
-       if (lock_task_sighand(task, &flags)) {
-               oom_adjust = task->signal->oom_adj;
-               unlock_task_sighand(task, &flags);
-       }
-
-       put_task_struct(task);
-
-       len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
-
-       return simple_read_from_buffer(buf, count, ppos, buffer, len);
-}
-
-static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct task_struct *task;
-       char buffer[PROC_NUMBUF];
-       int oom_adjust;
-       unsigned long flags;
-       int err;
-
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count)) {
-               err = -EFAULT;
-               goto out;
-       }
-
-       err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
-       if (err)
-               goto out;
-       if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-            oom_adjust != OOM_DISABLE) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       task = get_proc_task(file->f_path.dentry->d_inode);
-       if (!task) {
-               err = -ESRCH;
-               goto out;
-       }
-
-       task_lock(task);
-       if (!task->mm) {
-               err = -EINVAL;
-               goto err_task_lock;
-       }
-
-       if (!lock_task_sighand(task, &flags)) {
-               err = -ESRCH;
-               goto err_task_lock;
-       }
-
-       if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-               err = -EACCES;
-               goto err_sighand;
-       }
-
-       /*
-        * Warn that /proc/pid/oom_adj is deprecated, see
-        * Documentation/feature-removal-schedule.txt.
-        */
-       printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
-                 current->comm, task_pid_nr(current), task_pid_nr(task),
-                 task_pid_nr(task));
-       task->signal->oom_adj = oom_adjust;
-       /*
-        * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
-        * value is always attainable.
-        */
-       if (task->signal->oom_adj == OOM_ADJUST_MAX)
-               task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
-       else
-               task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-                                                               -OOM_DISABLE;
-       trace_oom_score_adj_update(task);
-err_sighand:
-       unlock_task_sighand(task, &flags);
-err_task_lock:
-       task_unlock(task);
-       put_task_struct(task);
-out:
-       return err < 0 ? err : count;
-}
-
-static const struct file_operations proc_oom_adjust_operations = {
-       .read           = oom_adjust_read,
-       .write          = oom_adjust_write,
-       .llseek         = generic_file_llseek,
-};
-
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
 {
@@ -1052,15 +947,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
        trace_oom_score_adj_update(task);
-       /*
-        * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
-        * always attainable.
-        */
-       if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-               task->signal->oom_adj = OOM_DISABLE;
-       else
-               task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
-                                                       OOM_SCORE_ADJ_MAX;
+
 err_sighand:
        unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -3084,7 +2971,6 @@ static const struct pid_entry tgid_base_stuff[] = {
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
        INF("oom_score",  S_IRUGO, proc_oom_score),
-       REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3450,7 +3336,6 @@ static const struct pid_entry tid_base_stuff[] = {
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
        INF("oom_score", S_IRUGO, proc_oom_score),
-       REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
index b3647fe6a60870e55f36710497fd9dff583963ee..0d80cef4cfb93ea5bbd423b6cf887039f15b093a 100644 (file)
@@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
                        pde_get(de);
                        spin_unlock(&proc_subdir_lock);
-                       error = -EINVAL;
+                       error = -ENOMEM;
                        inode = proc_get_inode(dir->i_sb, de);
                        goto out_unlock;
                }
@@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
        unsigned int len;
 
        /* make sure name is valid */
-       if (!name || !strlen(name)) goto out;
+       if (!name || !strlen(name))
+               goto out;
 
        if (xlate_proc_name(name, parent, &fn) != 0)
                goto out;
@@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 
        len = strlen(fn);
 
-       ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
-       if (!ent) goto out;
+       ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+       if (!ent)
+               goto out;
 
-       memset(ent, 0, sizeof(struct proc_dir_entry));
        memcpy(ent->name, fn, len + 1);
        ent->namelen = len;
        ent->mode = mode;
        ent->nlink = nlink;
        atomic_set(&ent->count, 1);
-       ent->pde_users = 0;
        spin_lock_init(&ent->pde_unload_lock);
-       ent->pde_unload_completion = NULL;
        INIT_LIST_HEAD(&ent->pde_openers);
- out:
+out:
        return ent;
 }
 
index 7ac817b64a7193b71cf867f72fc0d94d68294c90..3b22bbdee9ec6e8bb9a8b6f19d19bc508e9d2879 100644 (file)
@@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
                return NULL;
        if (inode->i_state & I_NEW) {
                inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-               PROC_I(inode)->fd = 0;
                PROC_I(inode)->pde = de;
 
                if (de->mode) {
index eb7cc91b7258870f89b77a60de4172dd6f1502b6..a781bdf0669464ced3e213faad5ed6ab3a37a06b 100644 (file)
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
        }
 
        rb_link_node(node, parent, p);
+       rb_insert_color(node, &head->parent->root);
        return 0;
 }
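
The one-line fix here is easy to miss but real: rb_link_node() only splices the new node in as a leaf, and until rb_insert_color() runs the red-black invariants are never established, so later inserts and erases can corrupt the tree. The canonical pairing, per Documentation/rbtree.txt (cmp() is a stand-in for the caller's comparison):

    struct rb_node **p = &root->rb_node, *parent = NULL;

    while (*p) {
            parent = *p;
            if (cmp(new, parent) < 0)
                    p = &parent->rb_left;
            else
                    p = &parent->rb_right;
    }
    rb_link_node(new, parent, p);
    rb_insert_color(new, root);     /* the call this hunk adds */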
 
@@ -168,10 +169,8 @@ static void init_header(struct ctl_table_header *head,
        head->node = node;
        if (node) {
                struct ctl_table *entry;
-               for (entry = table; entry->procname; entry++, node++) {
-                       rb_init_node(&node->node);
+               for (entry = table; entry->procname; entry++, node++)
                        node->header = head;
-               }
        }
 }
 
@@ -266,8 +265,7 @@ void sysctl_head_put(struct ctl_table_header *head)
 
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
 {
-       if (!head)
-               BUG();
+       BUG_ON(!head);
        spin_lock(&sysctl_lock);
        if (!use_table(head))
                head = ERR_PTR(-ENOENT);
index 9a2d9fd7cadd2acde77a4e00615f9bbafefc8d96..9889a92d2e01773113a5c7db29975cb47d7dcb1d 100644 (file)
@@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
                if (!*p)
                        continue;
 
-               args[0].to = args[0].from = 0;
+               args[0].to = args[0].from = NULL;
                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_gid:
index 4540b8f76f163fbaaef250b6facf161424a90869..79827ce03e3bc00208aced44caa7ae9fd2d08bd3 100644 (file)
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                "VmPTE:\t%8lu kB\n"
                "VmSwap:\t%8lu kB\n",
                hiwater_vm << (PAGE_SHIFT-10),
-               (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+               total_vm << (PAGE_SHIFT-10),
                mm->locked_vm << (PAGE_SHIFT-10),
                mm->pinned_vm << (PAGE_SHIFT-10),
                hiwater_rss << (PAGE_SHIFT-10),
index 552e994e3aa156dff4f6aa43fa87e3885ee44513..9534b4f765790324e5d571c91c8e1d380c0fadad 100644 (file)
@@ -391,6 +391,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(qnx4_inode_cachep);
 }
 
index 2049c814bda475a7e02e9e4411d54cc74f304660..1b37fff7b5ff00b36a405e24d959775ec5568f20 100644 (file)
@@ -651,6 +651,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(qnx6_inode_cachep);
 }
 
index 7a37dabf5a968b7c8977c2028665f58554987912..1078ae179993bb12f105d39fb1d847eb84c12414 100644 (file)
@@ -608,6 +608,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(reiserfs_inode_cachep);
 }
 
index 77c5f21739837753efdfed09e806e32d1376a219..fd7c5f60b46b84f9cdcba814c2c5e803ef02daa2 100644 (file)
@@ -648,6 +648,11 @@ error_register:
 static void __exit exit_romfs_fs(void)
 {
        unregister_filesystem(&romfs_fs_type);
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(romfs_inode_cachep);
 }
 
index 29cd014ed3a17a776db73b4c1e492f44d78cba4e..260e3928d4f52bb94076e0025f5fd8bef08bb1c3 100644 (file)
@@ -425,6 +425,11 @@ static int __init init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(squashfs_inode_cachep);
 }
 
index 0902cfa6a12efd21e4ebd52a39333b7f9d6270eb..a3bc935069d9d5c643657793602ea42201263ed4 100644 (file)
@@ -307,12 +307,6 @@ void deactivate_locked_super(struct super_block *s)
 
                /* caches are now gone, we can safely kill the shrinker now */
                unregister_shrinker(&s->s_shrink);
-
-               /*
-                * We need to call rcu_barrier so all the delayed rcu free
-                * inodes are flushed before we release the fs module.
-                */
-               rcu_barrier();
                put_filesystem(fs);
                put_super(s);
        } else {
@@ -871,7 +865,7 @@ int get_anon_bdev(dev_t *p)
        else if (error)
                return -EAGAIN;
 
-       if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+       if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
                spin_lock(&unnamed_dev_lock);
                ida_remove(&unnamed_dev_ida, dev);
                if (unnamed_dev_start > dev)
index 80e1e2b18df17f3537050cd7557955a1c1cd6b49..0d0c50bd3321d3e5f54d00a28bccaa27afbb6da4 100644 (file)
@@ -360,5 +360,10 @@ int __init sysv_init_icache(void)
 
 void sysv_destroy_icache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(sysv_inode_cachep);
 }
index ff48c5a853092e92cd6b4ef77362694ce9ba8cb5..5bc77817f382c8d527237f3a91976c4de9bf1873 100644 (file)
@@ -1536,6 +1536,7 @@ out_unlock:
 static const struct vm_operations_struct ubifs_file_vm_ops = {
        .fault        = filemap_fault,
        .page_mkwrite = ubifs_vm_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
index 7d51f2802bda30b73fb1ba87890609c43b9462a5..862e119d7d0c1d26249d910b50967f32f4691909 100644 (file)
@@ -2289,6 +2289,12 @@ static void __exit ubifs_exit(void)
        dbg_debugfs_exit();
        ubifs_compressors_exit();
        unregister_shrinker(&ubifs_shrinker_info);
+
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ubifs_inode_slab);
        unregister_filesystem(&ubifs_fs_type);
 }
index 18fc038a438da4b6bbf58fa73c23c27ecd0cb721..b8d27642ab061a79b3f96b5a4bd90cfe2377d8ea 100644 (file)
@@ -171,6 +171,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(udf_inode_cachep);
 }
 
index 444927e5706b773380d9050213a8179b3d611fd9..f7cfecfe1caba90f9c6dc8d1cde2938e5128cb79 100644 (file)
@@ -1466,6 +1466,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+       /*
+        * Make sure all delayed rcu free inodes are flushed before we
+        * destroy cache.
+        */
+       rcu_barrier();
        kmem_cache_destroy(ufs_inode_cachep);
 }
 
index 1eaeb8be3aaea5d4a034c36949bbca34fca6ceac..aa473fa640a2dd16f56ae76abfb72c1f204ba43b 100644 (file)
@@ -940,7 +940,6 @@ xfs_file_mmap(
        struct vm_area_struct *vma)
 {
        vma->vm_ops = &xfs_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
 
        file_accessed(filp);
        return 0;
@@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = {
 static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = xfs_vm_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
index 001537f92caf10becdaa26ab27ee2f944e489511..e0fd2734189ee561478cb8dfdd427baf6168ecfc 100644 (file)
@@ -1506,6 +1506,11 @@ xfs_init_zones(void)
 STATIC void
 xfs_destroy_zones(void)
 {
+       /*
+        * Make sure all delayed rcu free are flushed before we
+        * destroy caches.
+        */
+       rcu_barrier();
        kmem_zone_destroy(xfs_ili_zone);
        kmem_zone_destroy(xfs_inode_zone);
        kmem_zone_destroy(xfs_efi_zone);
index ff4947b7a9762b6ea414aa8b0159af8abb90e9dc..ebdd74fb8d6a7887006759bc430e61a1a1a1dd98 100644 (file)
@@ -162,6 +162,19 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
 #endif
 
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp);
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t pte_a, pte_t pte_b)
 {
@@ -381,48 +394,59 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 
 #ifndef __HAVE_PFNMAP_TRACKING
 /*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * track_pfn_vma_new is called when a _new_ pfn mapping is being established
- * for physical range indicated by pfn and size.
+ * Interfaces that can be used by architecture code to keep track of
+ * memory type of pfn mappings specified by remap_pfn_range() and
+ * vm_insert_pfn().
+ */
+
+/*
+ * track_pfn_remap is called when a _new_ pfn mapping is being established
+ * by remap_pfn_range() for physical range indicated by pfn and size.
  */
-static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
-                                       unsigned long pfn, unsigned long size)
+static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+                                 unsigned long pfn, unsigned long addr,
+                                 unsigned long size)
 {
        return 0;
 }
 
 /*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * track_pfn_insert is called when a _new_ single pfn is established
+ * by vm_insert_pfn().
+ */
+static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+                                  unsigned long pfn)
+{
+       return 0;
+}
+
+/*
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
  * copied through copy_page_range().
  */
-static inline int track_pfn_vma_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *vma)
 {
        return 0;
 }
 
 /*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack_pfn is called while unmapping a pfnmap for a region.
  * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case size can be zero).
+ * can be for the entire vma (in which case pfn, size are zero).
  */
-static inline void untrack_pfn_vma(struct vm_area_struct *vma,
-                                       unsigned long pfn, unsigned long size)
+static inline void untrack_pfn(struct vm_area_struct *vma,
+                              unsigned long pfn, unsigned long size)
 {
 }
 #else
-extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
-                               unsigned long pfn, unsigned long size);
-extern int track_pfn_vma_copy(struct vm_area_struct *vma);
-extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
-                               unsigned long size);
+extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+                          unsigned long pfn, unsigned long addr,
+                          unsigned long size);
+extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+                           unsigned long pfn);
+extern int track_pfn_copy(struct vm_area_struct *vma);
+extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+                       unsigned long size);
 #endif
 
 #ifdef CONFIG_MMU
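
The renames make each call site self-describing: remap_pfn_range() uses the remap/untrack pair, vm_insert_pfn() the insert hook, and copy_page_range() the copy hook. On the remap side the usage ends up looking roughly like this (heavily abbreviated from mm/memory.c):

    int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t prot)
    {
            int err;

            err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
            if (err)
                    return -EINVAL;

            /* ... walk the page tables and install the ptes;
             * sets err on failure ... */

            if (err)
                    untrack_pfn(vma, pfn, PAGE_ALIGN(size));
            return err;
    }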
index 70cfcb2d63c49d5b5b72f6a5f42455060d89035d..5b08a8540ecfc3e6bcb5bf8b6fbef4d19df93f55 100644 (file)
@@ -86,6 +86,31 @@ static inline int atomic_dec_unless_positive(atomic_t *p)
 }
 #endif
 
+/*
+ * atomic_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable v was not decremented.
+ */
+#ifndef atomic_dec_if_positive
+static inline int atomic_dec_if_positive(atomic_t *v)
+{
+       int c, old, dec;
+       c = atomic_read(v);
+       for (;;) {
+               dec = c - 1;
+               if (unlikely(dec < 0))
+                       break;
+               old = atomic_cmpxchg((v), c, dec);
+               if (likely(old == c))
+                       break;
+               c = old;
+       }
+       return dec;
+}
+#endif
+
 #ifndef CONFIG_ARCH_HAS_ATOMIC_OR
 static inline void atomic_or(int i, atomic_t *v)
 {
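
Note the return convention of atomic_dec_if_positive() above: it reports the old value minus one whether or not it stored, so a negative result means the counter was already at zero and nothing was consumed. A typical gate built on top of it (illustrative helper, not from this patch):

    static int grab_slot(atomic_t *free_slots)
    {
            /* atomic_dec_if_positive() never takes the counter below zero */
            if (atomic_dec_if_positive(free_slots) < 0)
                    return -EBUSY;  /* was already empty; nothing taken */
            return 0;
    }
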
index 2a5073cf548a0d848f99c7810a124bded3313eb9..50f5538a3d07b2f4530c76173049d7dfdb027261 100644 (file)
@@ -624,38 +624,95 @@ static inline void audit_mmap_fd(int fd, int flags)
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
-#define audit_alloc(t) ({ 0; })
-#define audit_free(t) do { ; } while (0)
-#define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(r) do { ; } while (0)
-#define audit_dummy_context() 1
-#define audit_getname(n) do { ; } while (0)
-#define audit_putname(n) do { ; } while (0)
-#define __audit_inode(n,d) do { ; } while (0)
-#define __audit_inode_child(i,p) do { ; } while (0)
-#define audit_inode(n,d) do { (void)(d); } while (0)
-#define audit_inode_child(i,p) do { ; } while (0)
-#define audit_core_dumps(i) do { ; } while (0)
-#define audit_seccomp(i,s,c) do { ; } while (0)
-#define auditsc_get_stamp(c,t,s) (0)
+static inline int audit_alloc(struct task_struct *task)
+{
+       return 0;
+}
+static inline void audit_free(struct task_struct *task)
+{ }
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+                                      unsigned long a1, unsigned long a2,
+                                      unsigned long a3)
+{ }
+static inline void audit_syscall_exit(void *pt_regs)
+{ }
+static inline int audit_dummy_context(void)
+{
+       return 1;
+}
+static inline void audit_getname(const char *name)
+{ }
+static inline void audit_putname(const char *name)
+{ }
+static inline void __audit_inode(const char *name, const struct dentry *dentry)
+{ }
+static inline void __audit_inode_child(const struct dentry *dentry,
+                                       const struct inode *parent)
+{ }
+static inline void audit_inode(const char *name, const struct dentry *dentry)
+{ }
+static inline void audit_inode_child(const struct dentry *dentry,
+                                    const struct inode *parent)
+{ }
+static inline void audit_core_dumps(long signr)
+{ }
+static inline void __audit_seccomp(unsigned long syscall, long signr, int code)
+{ }
+static inline void audit_seccomp(unsigned long syscall, long signr, int code)
+{ }
+static inline int auditsc_get_stamp(struct audit_context *ctx,
+                             struct timespec *t, unsigned int *serial)
+{
+       return 0;
+}
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
-#define audit_log_task_context(b) do { ; } while (0)
-#define audit_log_task_info(b, t) do { ; } while (0)
-#define audit_ipc_obj(i) ((void)0)
-#define audit_ipc_set_perm(q,u,g,m) ((void)0)
-#define audit_bprm(p) ({ 0; })
-#define audit_socketcall(n,a) ((void)0)
-#define audit_fd_pair(n,a) ((void)0)
-#define audit_sockaddr(len, addr) ({ 0; })
-#define audit_mq_open(o,m,a) ((void)0)
-#define audit_mq_sendrecv(d,l,p,t) ((void)0)
-#define audit_mq_notify(d,n) ((void)0)
-#define audit_mq_getsetattr(d,s) ((void)0)
-#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
-#define audit_log_capset(pid, ncr, ocr) ((void)0)
-#define audit_mmap_fd(fd, flags) ((void)0)
-#define audit_ptrace(t) ((void)0)
+static inline void audit_log_task_context(struct audit_buffer *ab)
+{ }
+static inline void audit_log_task_info(struct audit_buffer *ab,
+                                       struct task_struct *tsk)
+{ }
+static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{ }
+static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
+                                       gid_t gid, umode_t mode)
+{ }
+static inline int audit_bprm(struct linux_binprm *bprm)
+{
+       return 0;
+}
+static inline void audit_socketcall(int nargs, unsigned long *args)
+{ }
+static inline void audit_fd_pair(int fd1, int fd2)
+{ }
+static inline int audit_sockaddr(int len, void *addr)
+{
+       return 0;
+}
+static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
+{ }
+static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
+                                    unsigned int msg_prio,
+                                    const struct timespec *abs_timeout)
+{ }
+static inline void audit_mq_notify(mqd_t mqdes,
+                                  const struct sigevent *notification)
+{ }
+static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{ }
+static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
+                                      const struct cred *new,
+                                      const struct cred *old)
+{
+       return 0;
+}
+static inline void audit_log_capset(pid_t pid, const struct cred *new,
+                                  const struct cred *old)
+{ }
+static inline void audit_mmap_fd(int fd, int flags)
+{ }
+static inline void audit_ptrace(struct task_struct *t)
+{ }
 #define audit_n_rules 0
 #define audit_signals 0
 #endif /* CONFIG_AUDITSYSCALL */
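
The bulk conversion from do-nothing macros to empty static inlines is more than churn: a macro stub never evaluates or type-checks its arguments, so code that is only ever compiled with CONFIG_AUDITSYSCALL=n can pass garbage silently and grow config-dependent "unused variable" warnings. A minimal standalone illustration of the difference:

    #include <stdio.h>

    struct event { int id; };

    /* old style: the argument is never even looked at */
    #define log_event_macro(ev) do { ; } while (0)

    /* new style: still generates no code, but the argument is type-checked */
    static inline void log_event_inline(struct event *ev) { (void)ev; }

    int main(void)
    {
            log_event_macro("not an event");        /* accepted silently */
            log_event_inline(&(struct event){ .id = 1 });
            /* log_event_inline("not an event"); would draw a warning */
            printf("both stubs compile away\n");
            return 0;
    }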
@@ -679,7 +736,6 @@ extern void             audit_log_n_hex(struct audit_buffer *ab,
 extern void                audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
-#define audit_log_string(a,b) audit_log_n_string(a, b, strlen(b));
 extern void                audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
@@ -696,7 +752,8 @@ extern void             audit_log_lost(const char *message);
 #ifdef CONFIG_SECURITY
 extern void                audit_log_secctx(struct audit_buffer *ab, u32 secid);
 #else
-#define audit_log_secctx(b,s) do { ; } while (0)
+static inline void         audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{ }
 #endif
 
 extern int                 audit_update_lsm_rules(void);
@@ -708,22 +765,50 @@ extern int  audit_receive_filter(int type, int pid, int uid, int seq,
                                void *data, size_t datasz, uid_t loginuid,
                                u32 sessionid, u32 sid);
 extern int audit_enabled;
-#else
-#define audit_log(c,g,t,f,...) do { ; } while (0)
-#define audit_log_start(c,g,t) ({ NULL; })
-#define audit_log_vformat(b,f,a) do { ; } while (0)
-#define audit_log_format(b,f,...) do { ; } while (0)
-#define audit_log_end(b) do { ; } while (0)
-#define audit_log_n_hex(a,b,l) do { ; } while (0)
-#define audit_log_n_string(a,c,l) do { ; } while (0)
-#define audit_log_string(a,c) do { ; } while (0)
-#define audit_log_n_untrustedstring(a,n,s) do { ; } while (0)
-#define audit_log_untrustedstring(a,s) do { ; } while (0)
-#define audit_log_d_path(b, p, d) do { ; } while (0)
-#define audit_log_key(b, k) do { ; } while (0)
-#define audit_log_link_denied(o, l) do { ; } while (0)
-#define audit_log_secctx(b,s) do { ; } while (0)
+#else /* CONFIG_AUDIT */
+static inline __printf(4, 5)
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
+              const char *fmt, ...)
+{ }
+static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
+                                                  gfp_t gfp_mask, int type)
+{
+       return NULL;
+}
+static inline __printf(2, 3)
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{ }
+static inline void audit_log_end(struct audit_buffer *ab)
+{ }
+static inline void audit_log_n_hex(struct audit_buffer *ab,
+                                  const unsigned char *buf, size_t len)
+{ }
+static inline void audit_log_n_string(struct audit_buffer *ab,
+                                     const char *buf, size_t n)
+{ }
+static inline void audit_log_n_untrustedstring(struct audit_buffer *ab,
+                                              const char *string, size_t n)
+{ }
+static inline void audit_log_untrustedstring(struct audit_buffer *ab,
+                                            const char *string)
+{ }
+static inline void audit_log_d_path(struct audit_buffer *ab,
+                                   const char *prefix,
+                                   const struct path *path)
+{ }
+static inline void audit_log_key(struct audit_buffer *ab, char *key)
+{ }
+static inline void audit_log_link_denied(const char *string,
+                                        const struct path *link)
+{ }
+static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{ }
 #define audit_enabled 0
-#endif
+#endif /* CONFIG_AUDIT */
+static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
+{
+       audit_log_n_string(ab, buf, strlen(buf));
+}
+
 #endif
 #endif
index d708558d96bd83bf0d3442a027c752c25f5243cf..52fb2eb306fff9452960fd86cfb903e64e13e719 100644 (file)
@@ -134,7 +134,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv,
                               struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
-extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
 extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
 
index ef658147e4e8391eec61aa7559a1452fcffe6a79..0e38a1deeb2374f9553b088bd7c43b248da954d2 100644 (file)
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
-                       bool sync, bool *contended);
+                       bool sync, bool *contended, struct page **page);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
@@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync, bool *contended)
+                       bool sync, bool *contended, struct page **page)
 {
        return COMPACT_CONTINUE;
 }
index ba4b85a6d9b8bc71853de37df2a36b865707b745..42f9752a0a404f900569c1f25a6caef55661cf74 100644 (file)
  */
 extern int dump_write(struct file *file, const void *addr, int nr);
 extern int dump_seek(struct file *file, loff_t off);
+#ifdef CONFIG_COREDUMP
+extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
+#else
+static inline void do_coredump(long signr, int exit_code, struct pt_regs *regs) {}
+#endif
 
 #endif /* _LINUX_COREDUMP_H */
index f4bb378ccf6a355bbe49e79f56019f9ef386d1d4..41085d0f3955b3ff247b426de9452b731d00c19b 100644 (file)
@@ -25,6 +25,7 @@
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_DISABLE 4
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
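
EPOLL_CTL_DISABLE is passed as the op argument of epoll_ctl(2). Below is a
hedged userspace sketch of the multi-threaded deletion pattern this opcode is
meant to enable; the return-value semantics are an assumption here, not
something this header confirms:

------------------------------------------------------------------------
#include <sys/epoll.h>

/* One thread wants to free the state behind 'fd' while others may
 * still be handling its events (sketch; error handling omitted). */
if (epoll_ctl(epfd, EPOLL_CTL_DISABLE, fd, NULL) == 0) {
        /* assumed: no event in flight, safe to remove and free */
        epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
} else {
        /* assumed: e.g. EBUSY, another thread saw an event; defer */
}
------------------------------------------------------------------------
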
index aa110476a95be0b1d479555346c560d45ecc0b20..de773217ce87f6d766669d0202651562cfe642a5 100644 (file)
@@ -401,7 +401,7 @@ struct inodes_stat_t {
 #include <linux/cache.h>
 #include <linux/list.h>
 #include <linux/radix-tree.h>
-#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/bug.h>
@@ -669,7 +669,7 @@ struct address_space {
        struct radix_tree_root  page_tree;      /* radix tree of all pages */
        spinlock_t              tree_lock;      /* and lock protecting it */
        unsigned int            i_mmap_writable;/* count VM_SHARED mappings */
-       struct prio_tree_root   i_mmap;         /* tree of private and shared mappings */
+       struct rb_root          i_mmap;         /* tree of private and shared mappings */
        struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
        struct mutex            i_mmap_mutex;   /* protect tree, count, list */
        /* Protected by tree_lock together with the radix tree */
@@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag);
  */
 static inline int mapping_mapped(struct address_space *mapping)
 {
-       return  !prio_tree_empty(&mapping->i_mmap) ||
+       return  !RB_EMPTY_ROOT(&mapping->i_mmap) ||
                !list_empty(&mapping->i_mmap_nonlinear);
 }
 
@@ -2548,6 +2548,8 @@ extern int sb_min_blocksize(struct super_block *, int);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
+extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
+               unsigned long size, pgoff_t pgoff);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
index 4f440b3e89fe7fd7e47ead6c6ed911f173205431..79b8bba193635c94e7318d552c3e5437062bc203 100644 (file)
@@ -88,10 +88,14 @@ struct disk_stats {
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH 64
-#define PARTITION_META_INFO_UUIDLTH    16
+/*
+ * Enough for the string representation of any kind of UUID plus a NUL
+ * terminator.  An EFI UUID is 36 characters; an MSDOS UUID is 11.
+ */
+#define PARTITION_META_INFO_UUIDLTH    37
 
 struct partition_meta_info {
-       u8 uuid[PARTITION_META_INFO_UUIDLTH];   /* always big endian */
+       char uuid[PARTITION_META_INFO_UUIDLTH];
        u8 volname[PARTITION_META_INFO_VOLNAMELTH];
 };
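
With uuid now a NUL-terminated string rather than a 16-byte binary field,
callers can format either flavour directly into it. A sketch using the
kernel's %pU printk extension for the EFI case ('guid' is an illustrative
16-byte UUID, not from this hunk):

------------------------------------------------------------------------
/* 36-character EFI UUID text plus NUL fits PARTITION_META_INFO_UUIDLTH */
snprintf(info->uuid, sizeof(info->uuid), "%pU", &guid);
------------------------------------------------------------------------
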
 
index 4883f393f50a8049cc83298068b5751621a7a8dc..f9bc873ce7d6c353ab9b6fb4a7a9fce75854c8c8 100644 (file)
@@ -35,7 +35,6 @@ struct vm_area_struct;
 #else
 #define ___GFP_NOTRACK         0
 #endif
-#define ___GFP_NO_KSWAPD       0x400000u
 #define ___GFP_OTHER_NODE      0x800000u
 #define ___GFP_WRITE           0x1000000u
 
@@ -90,7 +89,6 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
-#define __GFP_NO_KSWAPD        ((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE    ((__force gfp_t)___GFP_WRITE)   /* Allocator intends to dirty page */
 
@@ -120,8 +118,7 @@ struct vm_area_struct;
                                 __GFP_MOVABLE)
 #define GFP_IOFS       (__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE  (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
-                        __GFP_NO_KSWAPD)
+                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE   (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
index 4c59b11311870e74ebbeb110f77e2017be97c3bb..6ab47af5a849b395440b4aa3a2ba88e641f13a0c 100644 (file)
@@ -11,7 +11,6 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                               unsigned long address, pmd_t *pmd,
                               pmd_t orig_pmd);
-extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
 extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
                                          unsigned long addr,
                                          pmd_t *pmd,
index 255491cf522e1d08ea6d0066fc7872f32817c82c..87259a44c251472595598f0f6ed0852655171554 100644 (file)
 #define IDR_SIZE (1 << IDR_BITS)
 #define IDR_MASK ((1 << IDR_BITS)-1)
 
-#define MAX_ID_SHIFT (sizeof(int)*8 - 1)
-#define MAX_ID_BIT (1U << MAX_ID_SHIFT)
-#define MAX_ID_MASK (MAX_ID_BIT - 1)
+#define MAX_IDR_SHIFT (sizeof(int)*8 - 1)
+#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
+#define MAX_IDR_MASK (MAX_IDR_BIT - 1)
 
 /* Leave the possibility of an incomplete final layer */
-#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
+#define MAX_IDR_LEVEL ((MAX_IDR_SHIFT + IDR_BITS - 1) / IDR_BITS)
 
 /* Number of id_layer structs to leave in free list */
-#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL
+#define MAX_IDR_FREE (MAX_IDR_LEVEL * 2)
 
 struct idr_layer {
        unsigned long            bitmap; /* A zero bit means "space here" */
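
For concreteness, the renamed constants work out as follows on a
configuration with a 32-bit int and IDR_BITS == 5 (an assumption; IDR_BITS
depends on the word size):

------------------------------------------------------------------------
MAX_IDR_SHIFT = sizeof(int)*8 - 1       = 31
MAX_IDR_BIT   = 1U << 31                = 0x80000000
MAX_IDR_MASK  = MAX_IDR_BIT - 1         = 0x7fffffff
MAX_IDR_LEVEL = (31 + 5 - 1) / 5        = 7
MAX_IDR_FREE  = MAX_IDR_LEVEL * 2       = 14
------------------------------------------------------------------------
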
diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h
new file mode 100644 (file)
index 0000000..724556a
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef _LINUX_INTERVAL_TREE_H
+#define _LINUX_INTERVAL_TREE_H
+
+#include <linux/rbtree.h>
+
+struct interval_tree_node {
+       struct rb_node rb;
+       unsigned long start;    /* Start of interval */
+       unsigned long last;     /* Last location _in_ interval */
+       unsigned long __subtree_last;
+};
+
+extern void
+interval_tree_insert(struct interval_tree_node *node, struct rb_root *root);
+
+extern void
+interval_tree_remove(struct interval_tree_node *node, struct rb_root *root);
+
+extern struct interval_tree_node *
+interval_tree_iter_first(struct rb_root *root,
+                        unsigned long start, unsigned long last);
+
+extern struct interval_tree_node *
+interval_tree_iter_next(struct interval_tree_node *node,
+                       unsigned long start, unsigned long last);
+
+#endif /* _LINUX_INTERVAL_TREE_H */
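
A brief caller-side sketch of the API declared above (the interval values
are illustrative):

------------------------------------------------------------------------
struct rb_root root = RB_ROOT;
struct interval_tree_node a = { .start = 10, .last = 19 };
struct interval_tree_node *it;

interval_tree_insert(&a, &root);

/* visit every stored interval overlapping [15, 30]; finds 'a' */
for (it = interval_tree_iter_first(&root, 15, 30); it;
     it = interval_tree_iter_next(it, 15, 30))
        /* use container_of() on 'it' to reach an embedding struct */;

interval_tree_remove(&a, &root);
------------------------------------------------------------------------
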
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
new file mode 100644 (file)
index 0000000..58370e1
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+  Interval Trees
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  include/linux/interval_tree_generic.h
+*/
+
+#include <linux/rbtree_augmented.h>
+
+/*
+ * Template for implementing interval trees
+ *
+ * ITSTRUCT:   struct type of the interval tree nodes
+ * ITRB:       name of struct rb_node field within ITSTRUCT
+ * ITTYPE:     type of the interval endpoints
+ * ITSUBTREE:  name of ITTYPE field within ITSTRUCT holding last-in-subtree
+ * ITSTART(n): start endpoint of ITSTRUCT node n
+ * ITLAST(n):  last endpoint of ITSTRUCT node n
+ * ITSTATIC:   'static' or empty
+ * ITPREFIX:   prefix to use for the inline tree definitions
+ *
+ * Note: before using this, please consider whether the non-generic version
+ * (interval_tree.h) would work for you...
+ */
+
+#define INTERVAL_TREE_DEFINE(ITSTRUCT, ITRB, ITTYPE, ITSUBTREE,                      \
+                            ITSTART, ITLAST, ITSTATIC, ITPREFIX)             \
+                                                                             \
+/* Callbacks for augmented rbtree insert and remove */                       \
+                                                                             \
+static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node)       \
+{                                                                            \
+       ITTYPE max = ITLAST(node), subtree_last;                              \
+       if (node->ITRB.rb_left) {                                             \
+               subtree_last = rb_entry(node->ITRB.rb_left,                   \
+                                       ITSTRUCT, ITRB)->ITSUBTREE;           \
+               if (max < subtree_last)                                       \
+                       max = subtree_last;                                   \
+       }                                                                     \
+       if (node->ITRB.rb_right) {                                            \
+               subtree_last = rb_entry(node->ITRB.rb_right,                  \
+                                       ITSTRUCT, ITRB)->ITSUBTREE;           \
+               if (max < subtree_last)                                       \
+                       max = subtree_last;                                   \
+       }                                                                     \
+       return max;                                                           \
+}                                                                            \
+                                                                             \
+RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB,           \
+                    ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last)    \
+                                                                             \
+/* Insert / remove interval nodes from the tree */                           \
+                                                                             \
+ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root)              \
+{                                                                            \
+       struct rb_node **link = &root->rb_node, *rb_parent = NULL;            \
+       ITTYPE start = ITSTART(node), last = ITLAST(node);                    \
+       ITSTRUCT *parent;                                                     \
+                                                                             \
+       while (*link) {                                                       \
+               rb_parent = *link;                                            \
+               parent = rb_entry(rb_parent, ITSTRUCT, ITRB);                 \
+               if (parent->ITSUBTREE < last)                                 \
+                       parent->ITSUBTREE = last;                             \
+               if (start < ITSTART(parent))                                  \
+                       link = &parent->ITRB.rb_left;                         \
+               else                                                          \
+                       link = &parent->ITRB.rb_right;                        \
+       }                                                                     \
+                                                                             \
+       node->ITSUBTREE = last;                                               \
+       rb_link_node(&node->ITRB, rb_parent, link);                           \
+       rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment);        \
+}                                                                            \
+                                                                             \
+ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root)              \
+{                                                                            \
+       rb_erase_augmented(&node->ITRB, root, &ITPREFIX ## _augment);         \
+}                                                                            \
+                                                                             \
+/*                                                                           \
+ * Iterate over intervals intersecting [start;last]                          \
+ *                                                                           \
+ * Note that a node's interval intersects [start;last] iff:                  \
+ *   Cond1: ITSTART(node) <= last                                            \
+ * and                                                                       \
+ *   Cond2: start <= ITLAST(node)                                            \
+ */                                                                          \
+                                                                             \
+static ITSTRUCT *                                                            \
+ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last)       \
+{                                                                            \
+       while (true) {                                                        \
+               /*                                                            \
+                * Loop invariant: start <= node->ITSUBTREE                   \
+                * (Cond2 is satisfied by one of the subtree nodes)           \
+                */                                                           \
+               if (node->ITRB.rb_left) {                                     \
+                       ITSTRUCT *left = rb_entry(node->ITRB.rb_left,         \
+                                                 ITSTRUCT, ITRB);            \
+                       if (start <= left->ITSUBTREE) {                       \
+                               /*                                            \
+                                * Some nodes in left subtree satisfy Cond2.  \
+                                * Iterate to find the leftmost such node N.  \
+                                * If it also satisfies Cond1, that's the     \
+                                * match we are looking for. Otherwise, there \
+                                * is no matching interval as nodes to the    \
+                                * right of N can't satisfy Cond1 either.     \
+                                */                                           \
+                               node = left;                                  \
+                               continue;                                     \
+                       }                                                     \
+               }                                                             \
+               if (ITSTART(node) <= last) {            /* Cond1 */           \
+                       if (start <= ITLAST(node))      /* Cond2 */           \
+                               return node;    /* node is leftmost match */  \
+                       if (node->ITRB.rb_right) {                            \
+                               node = rb_entry(node->ITRB.rb_right,          \
+                                               ITSTRUCT, ITRB);              \
+                               if (start <= node->ITSUBTREE)                 \
+                                       continue;                             \
+                       }                                                     \
+               }                                                             \
+               return NULL;    /* No match */                                \
+       }                                                                     \
+}                                                                            \
+                                                                             \
+ITSTATIC ITSTRUCT *                                                          \
+ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last)      \
+{                                                                            \
+       ITSTRUCT *node;                                                       \
+                                                                             \
+       if (!root->rb_node)                                                   \
+               return NULL;                                                  \
+       node = rb_entry(root->rb_node, ITSTRUCT, ITRB);                       \
+       if (node->ITSUBTREE < start)                                          \
+               return NULL;                                                  \
+       return ITPREFIX ## _subtree_search(node, start, last);                \
+}                                                                            \
+                                                                             \
+ITSTATIC ITSTRUCT *                                                          \
+ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last)            \
+{                                                                            \
+       struct rb_node *rb = node->ITRB.rb_right, *prev;                      \
+                                                                             \
+       while (true) {                                                        \
+               /*                                                            \
+                * Loop invariants:                                           \
+                *   Cond1: ITSTART(node) <= last                             \
+                *   rb == node->ITRB.rb_right                                \
+                *                                                            \
+                * First, search right subtree if suitable                    \
+                */                                                           \
+               if (rb) {                                                     \
+                       ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB);       \
+                       if (start <= right->ITSUBTREE)                        \
+                               return ITPREFIX ## _subtree_search(right,     \
+                                                               start, last); \
+               }                                                             \
+                                                                             \
+               /* Move up the tree until we come from a node's left child */ \
+               do {                                                          \
+                       rb = rb_parent(&node->ITRB);                          \
+                       if (!rb)                                              \
+                               return NULL;                                  \
+                       prev = &node->ITRB;                                   \
+                       node = rb_entry(rb, ITSTRUCT, ITRB);                  \
+                       rb = node->ITRB.rb_right;                             \
+               } while (prev == rb);                                         \
+                                                                             \
+               /* Check if the node intersects [start;last] */               \
+               if (last < ITSTART(node))               /* !Cond1 */          \
+                       return NULL;                                          \
+               else if (start <= ITLAST(node))         /* Cond2 */           \
+                       return node;                                          \
+       }                                                                     \
+}
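
As a sketch of how the template is meant to be used, the simple
interval_tree.h API above can be generated by instantiating it roughly like
this (the START/LAST helper macros are illustrative names):

------------------------------------------------------------------------
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>

#define START(node) ((node)->start)
#define LAST(node)  ((node)->last)

INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
                     unsigned long, __subtree_last,
                     START, LAST, /* ITSTATIC empty */, interval_tree)
------------------------------------------------------------------------
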
index 19dc455b4f3dd072d0e58abdfed2de6f8ae5727c..569d67d4243ed99c4b896fac81442531b9d05cbc 100644 (file)
@@ -70,8 +70,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_end: ptr to ulong for end pfn of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  *
- * Walks over configured memory ranges.  Available after early_node_map is
- * populated.
+ * Walks over configured memory ranges.
  */
 #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)          \
        for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
index 95b738c7abff9ac1e45a5e78e1daf1437b2283fd..cec56932560823ac6a34ce5c5c21bea24316a9ea 100644 (file)
@@ -188,7 +188,7 @@ struct sp_node {
 
 struct shared_policy {
        struct rb_root root;
-       spinlock_t lock;
+       struct mutex mutex;
 };
 
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
@@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 /* Check if a vma is migratable */
 static inline int vma_migratable(struct vm_area_struct *vma)
 {
-       if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+       if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP))
                return 0;
        /*
         * Migration allocates pages in the highest zone. If we cannot
index 3661c59aa1e9f52149e166032bba6d9f50eaa6f7..36c242e52ef17826d3e0e44ce053a62b77a88987 100644 (file)
 #define RC5T583_GPIO_MON_IOIN  0xAB
 #define RC5T583_GPIO_GPOFUNC   0xAC
 
+/* RTC registers */
+#define RC5T583_RTC_SEC                0xE0
+#define RC5T583_RTC_MIN                0xE1
+#define RC5T583_RTC_HOUR       0xE2
+#define RC5T583_RTC_WDAY       0xE3
+#define RC5T583_RTC_DAY                0xE4
+#define RC5T583_RTC_MONTH      0xE5
+#define RC5T583_RTC_YEAR       0xE6
+#define RC5T583_RTC_ADJ                0xE7
+#define RC5T583_RTC_AW_MIN     0xE8
+#define RC5T583_RTC_AW_HOUR    0xE9
+#define RC5T583_RTC_AW_WEEK    0xEA
+#define RC5T583_RTC_AD_MIN     0xEB
+#define RC5T583_RTC_AD_HOUR    0xEC
+#define RC5T583_RTC_CTL1       0xED
+#define RC5T583_RTC_CTL2       0xEE
+#define RC5T583_RTC_AY_MIN     0xF0
+#define RC5T583_RTC_AY_HOUR    0xF1
+#define RC5T583_RTC_AY_DAY     0xF2
+#define RC5T583_RTC_AY_MONTH   0xF3
+#define RC5T583_RTC_AY_YEAR    0xF4
+
 /* RICOH_RC5T583 IRQ definitions */
 enum {
        RC5T583_IRQ_ONKEY,
index ac772b36a1b169e47e9e5f1cb67e18f687b71b55..02e894f3ff45cee8ebdc244bcdf6dc714b86a309 100644 (file)
  *
  */
 
+/* RTC_CTRL_REG bitfields */
+#define TPS65910_RTC_CTRL_STOP_RTC                     0x01 /* 0=stop, 1=run */
+#define TPS65910_RTC_CTRL_GET_TIME                     0x40
+
+/* RTC_STATUS_REG bitfields */
+#define TPS65910_RTC_STATUS_ALARM               0x40
+
+/* RTC_INTERRUPTS_REG bitfields */
+#define TPS65910_RTC_INTERRUPTS_EVERY           0x03
+#define TPS65910_RTC_INTERRUPTS_IT_ALARM        0x08
 
 /* Register BCK1 (0x80) register description */
 #define BCK1_BCKUP_MASK                                        0xFF
index 311be906b57d8498d5c3529d0e42de0996a0c1ec..bcaab4e6fe913ac6e3f2ef0a6c83ea24d8972867 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/list.h>
 #include <linux/mmzone.h>
 #include <linux/rbtree.h>
-#include <linux/prio_tree.h>
 #include <linux/atomic.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
@@ -21,6 +20,7 @@
 
 struct mempolicy;
 struct anon_vma;
+struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
@@ -70,6 +70,8 @@ extern unsigned int kobjsize(const void *objp);
 /*
  * vm_flags in vm_area_struct, see mm_types.h.
  */
+#define VM_NONE                0x00000000
+
 #define VM_READ                0x00000001      /* currently active flags */
 #define VM_WRITE       0x00000002
 #define VM_EXEC                0x00000004
@@ -82,16 +84,9 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MAYSHARE    0x00000080
 
 #define VM_GROWSDOWN   0x00000100      /* general info on the segment */
-#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
-#define VM_GROWSUP     0x00000200
-#else
-#define VM_GROWSUP     0x00000000
-#define VM_NOHUGEPAGE  0x00000200      /* MADV_NOHUGEPAGE marked this vma */
-#endif
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
 
-#define VM_EXECUTABLE  0x00001000
 #define VM_LOCKED      0x00002000
 #define VM_IO           0x00004000     /* Memory mapped I/O or similar */
 
@@ -101,25 +96,34 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000      /* Count as reserved_vm like IO */
 #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
 #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
-#ifndef CONFIG_TRANSPARENT_HUGEPAGE
-#define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
-#else
-#define VM_HUGEPAGE    0x01000000      /* MADV_HUGEPAGE marked this vma */
-#endif
-#define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
-#define VM_NODUMP      0x04000000      /* Do not include in the core dump */
+#define VM_ARCH_1      0x01000000      /* Architecture-specific flag */
+#define VM_DONTDUMP    0x04000000      /* Do not include in the core dump */
 
-#define VM_CAN_NONLINEAR 0x08000000    /* Has ->fault & does nonlinear pages */
 #define VM_MIXEDMAP    0x10000000      /* Can contain "struct page" and pure PFN pages */
-#define VM_SAO         0x20000000      /* Strong Access Ordering (powerpc) */
-#define VM_PFN_AT_MMAP 0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
+#define VM_HUGEPAGE    0x20000000      /* MADV_HUGEPAGE marked this vma */
+#define VM_NOHUGEPAGE  0x40000000      /* MADV_NOHUGEPAGE marked this vma */
 #define VM_MERGEABLE   0x80000000      /* KSM may merge identical pages */
 
+#if defined(CONFIG_X86)
+# define VM_PAT                VM_ARCH_1       /* PAT reserves whole VMA at once (x86) */
+#elif defined(CONFIG_PPC)
+# define VM_SAO                VM_ARCH_1       /* Strong Access Ordering (powerpc) */
+#elif defined(CONFIG_PARISC)
+# define VM_GROWSUP    VM_ARCH_1
+#elif defined(CONFIG_IA64)
+# define VM_GROWSUP    VM_ARCH_1
+#elif !defined(CONFIG_MMU)
+# define VM_MAPPED_COPY        VM_ARCH_1       /* T if mapped copy of data (nommu mmap) */
+#endif
+
+#ifndef VM_GROWSUP
+# define VM_GROWSUP    VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP      (VM_RAND_READ | VM_SEQ_READ)
 
@@ -143,7 +147,7 @@ extern unsigned int kobjsize(const void *objp);
  * Special vmas that are non-mergable, non-mlock()able.
  * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP)
 
 /*
  * mapping from the currently active vm_flags protection bits (the
@@ -157,24 +161,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_ALLOW_RETRY 0x08    /* Retry fault if blocking */
 #define FAULT_FLAG_RETRY_NOWAIT        0x10    /* Don't drop mmap_sem and wait when retrying */
 #define FAULT_FLAG_KILLABLE    0x20    /* The fault task is in SIGKILL killable region */
-
-/*
- * This interface is used by x86 PAT code to identify a pfn mapping that is
- * linear over entire vma. This is to optimize PAT code that deals with
- * marking the physical region with a particular prot. This is not for generic
- * mm use. Note also that this check will not work if the pfn mapping is
- * linear for a vma starting at physical address 0. In which case PAT code
- * falls back to slow path of reserving physical range page by page.
- */
-static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & VM_PFN_AT_MMAP);
-}
-
-static inline int is_pfn_mapping(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & VM_PFNMAP);
-}
+#define FAULT_FLAG_TRIED       0x40    /* second try */
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -182,8 +169,7 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma)
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
  * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
- * mapping support.
+ * is used, one may implement ->remap_pages to get nonlinear mapping support.
  */
 struct vm_fault {
        unsigned int flags;             /* FAULT_FLAG_xxx flags */
@@ -241,6 +227,9 @@ struct vm_operations_struct {
        int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
                const nodemask_t *to, unsigned long flags);
 #endif
+       /* called by sys_remap_file_pages() to populate non-linear mapping */
+       int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
+                          unsigned long size, pgoff_t pgoff);
 };
 
 struct mmu_gather;
@@ -249,6 +238,18 @@ struct inode;
 #define page_private(page)             ((page)->private)
 #define set_page_private(page, v)      ((page)->private = (v))
 
+/* Valid only while the page is on the free path or in a free_list */
+static inline void set_freepage_migratetype(struct page *page, int migratetype)
+{
+       page->index = migratetype;
+}
+
+/* Valid only while the page is on the free path or in a free_list */
+static inline int get_freepage_migratetype(struct page *page)
+{
+       return page->index;
+}
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -454,6 +455,7 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
+int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function.  Provide a
@@ -1071,7 +1073,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len);
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
                               unsigned long old_len, unsigned long new_len,
                               unsigned long flags, unsigned long new_addr);
@@ -1366,24 +1369,45 @@ extern void zone_pcp_reset(struct zone *zone);
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
 
-/* prio_tree.c */
-void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
-void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
-void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
-       struct prio_tree_iter *iter);
-
-#define vma_prio_tree_foreach(vma, iter, root, begin, end)     \
-       for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;   \
-               (vma = vma_prio_tree_next(vma, iter)); )
+/* interval_tree.c */
+void vma_interval_tree_insert(struct vm_area_struct *node,
+                             struct rb_root *root);
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+                                   struct vm_area_struct *prev,
+                                   struct rb_root *root);
+void vma_interval_tree_remove(struct vm_area_struct *node,
+                             struct rb_root *root);
+struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root,
+                               unsigned long start, unsigned long last);
+struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
+                               unsigned long start, unsigned long last);
+
+#define vma_interval_tree_foreach(vma, root, start, last)              \
+       for (vma = vma_interval_tree_iter_first(root, start, last);     \
+            vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
 static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
                                        struct list_head *list)
 {
-       vma->shared.vm_set.parent = NULL;
-       list_add_tail(&vma->shared.vm_set.list, list);
+       list_add_tail(&vma->shared.nonlinear, list);
 }
 
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+                                  struct rb_root *root);
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+                                  struct rb_root *root);
+struct anon_vma_chain *anon_vma_interval_tree_iter_first(
+       struct rb_root *root, unsigned long start, unsigned long last);
+struct anon_vma_chain *anon_vma_interval_tree_iter_next(
+       struct anon_vma_chain *node, unsigned long start, unsigned long last);
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
+#endif
+
+#define anon_vma_interval_tree_foreach(avc, root, start, last)          \
+       for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
+            avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
+
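
A sketch of a reverse-mapping walk with the new iterator (the locking
follows the i_mmap_mutex rule from the address_space hunk earlier; 'pgoff'
is an illustrative file offset):

------------------------------------------------------------------------
struct vm_area_struct *vma;

mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff)
        /* each file vma whose page-offset range covers pgoff */;
mutex_unlock(&mapping->i_mmap_mutex);
------------------------------------------------------------------------
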
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
@@ -1400,15 +1424,13 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
        struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-       unsigned long addr, unsigned long len, pgoff_t pgoff);
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
 extern void mm_drop_all_locks(struct mm_struct *mm);
 
-/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
-extern void added_exe_file_vma(struct mm_struct *mm);
-extern void removed_exe_file_vma(struct mm_struct *mm);
 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
 extern struct file *get_mm_exe_file(struct mm_struct *mm);
 
index bf7867200b95b488fd960c95f7200af5837286bb..31f8a3af7d942545562afd1fcacf11f52a4f82a0 100644 (file)
@@ -6,7 +6,6 @@
 #include <linux/threads.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/prio_tree.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
@@ -240,18 +239,15 @@ struct vm_area_struct {
 
        /*
         * For areas with an address space and backing store,
-        * linkage into the address_space->i_mmap prio tree, or
-        * linkage to the list of like vmas hanging off its node, or
+        * linkage into the address_space->i_mmap interval tree, or
         * linkage of vma in the address_space->i_mmap_nonlinear list.
         */
        union {
                struct {
-                       struct list_head list;
-                       void *parent;   /* aligns with prio_tree_node parent */
-                       struct vm_area_struct *head;
-               } vm_set;
-
-               struct raw_prio_tree_node prio_tree_node;
+                       struct rb_node rb;
+                       unsigned long rb_subtree_last;
+               } linear;
+               struct list_head nonlinear;
        } shared;
 
        /*
@@ -349,7 +345,6 @@ struct mm_struct {
        unsigned long shared_vm;        /* Shared pages (files) */
        unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE */
        unsigned long stack_vm;         /* VM_GROWSUP/DOWN */
-       unsigned long reserved_vm;      /* VM_RESERVED|VM_IO pages */
        unsigned long def_flags;
        unsigned long nr_ptes;          /* Page table pages */
        unsigned long start_code, end_code, start_data, end_data;
@@ -394,7 +389,6 @@ struct mm_struct {
 
        /* store ref to file /proc/<pid>/exe symlink points to */
        struct file *exe_file;
-       unsigned long num_exe_file_vmas;
 #ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
index 8b74e9b1d0ad53911cd15e31891be149bc7932af..77cec2f45cb77d54c19b049f086d51c2db3a980b 100644 (file)
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags)
 {
        return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
               _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-              _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
               _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
 #endif /* __KERNEL__ */
index 1d1b1e13f79fbc13c0f2abfec525c8026a286b8f..4b7183e98061eae0efe6e54a14221eedbf7b5b1a 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
+#include <linux/srcu.h>
 
 struct mmu_notifier;
 struct mmu_notifier_ops;
@@ -311,14 +312,24 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })
 
+/*
+ * set_pte_at_notify() sets the pte _after_ running the notifier.
+ * It is safe to update the secondary MMUs first because the primary MMU
+ * pte invalidate must already have happened with a ptep_clear_flush()
+ * before set_pte_at_notify() is invoked.  Updating the secondary MMUs first
+ * is required when we change both the protection of the mapping from
+ * read-only to read-write and the pfn (as during copy-on-write page
+ * faults).  Otherwise the old page would remain mapped read-only in the
+ * secondary MMUs after the new page is already writable by some CPU
+ * through the primary MMU.
+ */
 #define set_pte_at_notify(__mm, __address, __ptep, __pte)              \
 ({                                                                     \
        struct mm_struct *___mm = __mm;                                 \
        unsigned long ___address = __address;                           \
        pte_t ___pte = __pte;                                           \
                                                                        \
-       set_pte_at(___mm, ___address, __ptep, ___pte);                  \
        mmu_notifier_change_pte(___mm, ___address, ___pte);             \
+       set_pte_at(___mm, ___address, __ptep, ___pte);                  \
 })
 
 #else /* CONFIG_MMU_NOTIFIER */
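
A condensed sketch of the copy-on-write ordering the comment describes,
roughly as a do_wp_page()-style caller would use it ('new_page' and the
surrounding variables are illustrative):

------------------------------------------------------------------------
/* 1. invalidate the primary MMU pte (and flush the TLB) */
ptep_clear_flush(vma, address, ptep);
/* 2. now the secondary MMUs can be told about the new page before the
 *    primary pte is re-established as writable */
set_pte_at_notify(mm, address, ptep,
                  mk_pte(new_page, vma->vm_page_prot));
------------------------------------------------------------------------
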
index d146ca10c0f52449dd528e9fec61be47e0c85221..5c86e2b33e2db55644ccdaa360c49e9ce394856a 100644 (file)
 #define NBD_SET_SIZE_BLOCKS    _IO( 0xab, 7 )
 #define NBD_DISCONNECT  _IO( 0xab, 8 )
 #define NBD_SET_TIMEOUT _IO( 0xab, 9 )
+#define NBD_SET_FLAGS   _IO( 0xab, 10)
 
 enum {
        NBD_CMD_READ = 0,
        NBD_CMD_WRITE = 1,
-       NBD_CMD_DISC = 2
+       NBD_CMD_DISC = 2,
+       /* there is a gap here to match userspace */
+       NBD_CMD_TRIM = 4
 };
 
+/* values for flags field */
+#define NBD_FLAG_HAS_FLAGS    (1 << 0) /* nbd-server supports flags */
+#define NBD_FLAG_READ_ONLY    (1 << 1) /* device is read-only */
+/* there is a gap here to match userspace */
+#define NBD_FLAG_SEND_TRIM    (1 << 5) /* send trim/discard */
+
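
For context, a hedged sketch of how these flags reach the kernel: the
userspace nbd client receives them during negotiation with the server and
passes them down via the new ioctl (variable names illustrative):

------------------------------------------------------------------------
/* userspace side, after the nbd negotiation handshake */
if (flags & NBD_FLAG_HAS_FLAGS)
        ioctl(nbd_fd, NBD_SET_FLAGS, flags);
------------------------------------------------------------------------
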
 #define nbd_cmd(req) ((req)->cmd[0])
 
 /* userspace doesn't need the nbd_device structure */
@@ -42,10 +51,6 @@ enum {
 #include <linux/wait.h>
 #include <linux/mutex.h>
 
-/* values for flags field */
-#define NBD_READ_ONLY 0x0001
-#define NBD_WRITE_NOCHK 0x0002
-
 struct request;
 
 struct nbd_device {
index 49a3031fda50d1156bf0f67e77fea5ae2161537f..d36a8221f58be3ac51ae37730469c380a433149e 100644 (file)
@@ -1,17 +1,6 @@
 #ifndef __INCLUDE_LINUX_OOM_H
 #define __INCLUDE_LINUX_OOM_H
 
-/*
- * /proc/<pid>/oom_adj is deprecated, see
- * Documentation/feature-removal-schedule.txt.
- *
- * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
- */
-#define OOM_DISABLE (-17)
-/* inclusive */
-#define OOM_ADJUST_MIN (-16)
-#define OOM_ADJUST_MAX 15
-
 /*
  * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for
  * pid.
index 105077aa7685c61dbb66cabc8473302283a0e3a5..76a9539cfd3f629403b66e2daaf9c6b5ffbcef1b 100644 (file)
@@ -6,6 +6,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
                                int migratetype);
+int move_freepages(struct zone *zone,
+                         struct page *start_page, struct page *end_page,
+                         int migratetype);
+
 /*
  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
  * If specified range includes migrate types other than MOVABLE or CMA,
@@ -37,6 +41,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn);
  */
 int set_migratetype_isolate(struct page *page);
 void unset_migratetype_isolate(struct page *page, unsigned migratetype);
-
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+                               int **resultp);
 
 #endif
diff --git a/include/linux/platform_data/lm3630_bl.h b/include/linux/platform_data/lm3630_bl.h
new file mode 100644 (file)
index 0000000..9176dd3
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+* Simple driver for Texas Instruments LM3630 LED Flash driver chip
+* Copyright (C) 2012 Texas Instruments
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License version 2 as
+* published by the Free Software Foundation.
+*
+*/
+
+#ifndef __LINUX_LM3630_H
+#define __LINUX_LM3630_H
+
+#define LM3630_NAME "lm3630_bl"
+
+enum lm3630_pwm_ctrl {
+       PWM_CTRL_DISABLE = 0,
+       PWM_CTRL_BANK_A,
+       PWM_CTRL_BANK_B,
+       PWM_CTRL_BANK_ALL,
+};
+
+enum lm3630_pwm_active {
+       PWM_ACTIVE_HIGH = 0,
+       PWM_ACTIVE_LOW,
+};
+
+enum lm3630_bank_a_ctrl {
+       BANK_A_CTRL_DISABLE = 0x0,
+       BANK_A_CTRL_LED1 = 0x4,
+       BANK_A_CTRL_LED2 = 0x1,
+       BANK_A_CTRL_ALL = 0x5,
+};
+
+enum lm3630_bank_b_ctrl {
+       BANK_B_CTRL_DISABLE = 0,
+       BANK_B_CTRL_LED2,
+};
+
+struct lm3630_platform_data {
+
+       /* maximum brightness */
+       int max_brt_led1;
+       int max_brt_led2;
+
+       /* initial on brightness */
+       int init_brt_led1;
+       int init_brt_led2;
+       enum lm3630_pwm_ctrl pwm_ctrl;
+       enum lm3630_pwm_active pwm_active;
+       enum lm3630_bank_a_ctrl bank_a_ctrl;
+       enum lm3630_bank_b_ctrl bank_b_ctrl;
+       unsigned int pwm_period;
+       void (*pwm_set_intensity) (int brightness, int max_brightness);
+};
+
+#endif /* __LINUX_LM3630_H */
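
A sketch of board code populating this platform data (all values
illustrative):

------------------------------------------------------------------------
static struct lm3630_platform_data lm3630_pdata = {
        .max_brt_led1  = 255,
        .max_brt_led2  = 255,
        .init_brt_led1 = 200,
        .init_brt_led2 = 200,
        .pwm_ctrl      = PWM_CTRL_DISABLE,
        .pwm_active    = PWM_ACTIVE_HIGH,
        .bank_a_ctrl   = BANK_A_CTRL_LED1,
        .bank_b_ctrl   = BANK_B_CTRL_LED2,
};
------------------------------------------------------------------------
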
diff --git a/include/linux/platform_data/lm3639_bl.h b/include/linux/platform_data/lm3639_bl.h
new file mode 100644 (file)
index 0000000..5234cd5
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+* Simple driver for Texas Instruments LM3639 LED Flash driver chip
+* Copyright (C) 2012 Texas Instruments
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License version 2 as
+* published by the Free Software Foundation.
+*
+*/
+
+#ifndef __LINUX_LM3639_H
+#define __LINUX_LM3639_H
+
+#define LM3639_NAME "lm3639_bl"
+
+enum lm3639_pwm {
+       LM3639_PWM_DISABLE = 0x00,
+       LM3639_PWM_EN_ACTLOW = 0x48,
+       LM3639_PWM_EN_ACTHIGH = 0x40,
+};
+
+enum lm3639_strobe {
+       LM3639_STROBE_DISABLE = 0x00,
+       LM3639_STROBE_EN_ACTLOW = 0x10,
+       LM3639_STROBE_EN_ACTHIGH = 0x30,
+};
+
+enum lm3639_txpin {
+       LM3639_TXPIN_DISABLE = 0x00,
+       LM3639_TXPIN_EN_ACTLOW = 0x04,
+       LM3639_TXPIN_EN_ACTHIGH = 0x0C,
+};
+
+enum lm3639_fleds {
+       LM3639_FLED_DIASBLE_ALL = 0x00,
+       LM3639_FLED_EN_1 = 0x40,
+       LM3639_FLED_EN_2 = 0x20,
+       LM3639_FLED_EN_ALL = 0x60,
+};
+
+enum lm3639_bleds {
+       LM3639_BLED_DIASBLE_ALL = 0x00,
+       LM3639_BLED_EN_1 = 0x10,
+       LM3639_BLED_EN_2 = 0x08,
+       LM3639_BLED_EN_ALL = 0x18,
+};
+enum lm3639_bled_mode {
+       LM3639_BLED_MODE_EXPONETIAL = 0x00,
+       LM3639_BLED_MODE_LINEAR = 0x10,
+};
+
+struct lm3639_platform_data {
+       unsigned int max_brt_led;
+       unsigned int init_brt_led;
+
+       /* input pins */
+       enum lm3639_pwm pin_pwm;
+       enum lm3639_strobe pin_strobe;
+       enum lm3639_txpin pin_tx;
+
+       /* output pins */
+       enum lm3639_fleds fled_pins;
+       enum lm3639_bleds bled_pins;
+       enum lm3639_bled_mode bled_mode;
+
+       void (*pwm_set_intensity) (int brightness, int max_brightness);
+       int (*pwm_get_intensity) (void);
+};
+#endif /* __LINUX_LM3639_H */
index cc76f1f18f18fc88b2c8cceb94901eb3976733ae..761f31752367c46bc61b1715ecb019e963663043 100644 (file)
@@ -46,6 +46,8 @@
 #define LP8556_I2C_CONFIG      ((ENABLE_BL << BL_CTL_SHFT) | \
                                (LP8556_I2C_ONLY << BRT_MODE_SHFT))
 #define LP8556_COMB2_CONFIG    (LP8556_COMBINED2 << BRT_MODE_SHFT)
+#define LP8556_FAST_CONFIG     BIT(7) /* use it if EPROMs should be maintained
+                                         when exiting the low power mode */
 
 enum lp855x_chip_id {
        LP8550,
diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h
deleted file mode 100644 (file)
index db04abb..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef _LINUX_PRIO_TREE_H
-#define _LINUX_PRIO_TREE_H
-
-/*
- * K&R 2nd ed. A8.3 somewhat obliquely hints that initial sequences of struct
- * fields with identical types should end up at the same location. We'll use
- * this until we can scrap struct raw_prio_tree_node.
- *
- * Note: all this could be done more elegantly by using unnamed union/struct
- * fields. However, gcc 2.95.3 and apparently also gcc 3.0.4 don't support this
- * language extension.
- */
-
-struct raw_prio_tree_node {
-       struct prio_tree_node   *left;
-       struct prio_tree_node   *right;
-       struct prio_tree_node   *parent;
-};
-
-struct prio_tree_node {
-       struct prio_tree_node   *left;
-       struct prio_tree_node   *right;
-       struct prio_tree_node   *parent;
-       unsigned long           start;
-       unsigned long           last;   /* last location _in_ interval */
-};
-
-struct prio_tree_root {
-       struct prio_tree_node   *prio_tree_node;
-       unsigned short          index_bits;
-       unsigned short          raw;
-               /*
-                * 0: nodes are of type struct prio_tree_node
-                * 1: nodes are of type raw_prio_tree_node
-                */
-};
-
-struct prio_tree_iter {
-       struct prio_tree_node   *cur;
-       unsigned long           mask;
-       unsigned long           value;
-       int                     size_level;
-
-       struct prio_tree_root   *root;
-       pgoff_t                 r_index;
-       pgoff_t                 h_index;
-};
-
-static inline void prio_tree_iter_init(struct prio_tree_iter *iter,
-               struct prio_tree_root *root, pgoff_t r_index, pgoff_t h_index)
-{
-       iter->root = root;
-       iter->r_index = r_index;
-       iter->h_index = h_index;
-       iter->cur = NULL;
-}
-
-#define __INIT_PRIO_TREE_ROOT(ptr, _raw)       \
-do {                                   \
-       (ptr)->prio_tree_node = NULL;   \
-       (ptr)->index_bits = 1;          \
-       (ptr)->raw = (_raw);            \
-} while (0)
-
-#define INIT_PRIO_TREE_ROOT(ptr)       __INIT_PRIO_TREE_ROOT(ptr, 0)
-#define INIT_RAW_PRIO_TREE_ROOT(ptr)   __INIT_PRIO_TREE_ROOT(ptr, 1)
-
-#define INIT_PRIO_TREE_NODE(ptr)                               \
-do {                                                           \
-       (ptr)->left = (ptr)->right = (ptr)->parent = (ptr);     \
-} while (0)
-
-#define INIT_PRIO_TREE_ITER(ptr)       \
-do {                                   \
-       (ptr)->cur = NULL;              \
-       (ptr)->mask = 0UL;              \
-       (ptr)->value = 0UL;             \
-       (ptr)->size_level = 0;          \
-} while (0)
-
-#define prio_tree_entry(ptr, type, member) \
-       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
-
-static inline int prio_tree_empty(const struct prio_tree_root *root)
-{
-       return root->prio_tree_node == NULL;
-}
-
-static inline int prio_tree_root(const struct prio_tree_node *node)
-{
-       return node->parent == node;
-}
-
-static inline int prio_tree_left_empty(const struct prio_tree_node *node)
-{
-       return node->left == node;
-}
-
-static inline int prio_tree_right_empty(const struct prio_tree_node *node)
-{
-       return node->right == node;
-}
-
-
-struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
-                struct prio_tree_node *old, struct prio_tree_node *node);
-struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
-                struct prio_tree_node *node);
-void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node);
-struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter);
-
-#define raw_prio_tree_replace(root, old, node) \
-       prio_tree_replace(root, (struct prio_tree_node *) (old), \
-           (struct prio_tree_node *) (node))
-#define raw_prio_tree_insert(root, node) \
-       prio_tree_insert(root, (struct prio_tree_node *) (node))
-#define raw_prio_tree_remove(root, node) \
-       prio_tree_remove(root, (struct prio_tree_node *) (node))
-
-#endif /* _LINUX_PRIO_TREE_H */
index 033b507b33b17075e5ab5cf287b264d3b9b22f35..0022c1bb1e26398c9db767a8eebc4fa153bff559 100644 (file)
   I know it's not the cleanest way, but in C (not in C++) this is how
   to get performance and genericity...
 
-  Some example of insert and search follows here. The search is a plain
-  normal search over an ordered tree. The insert instead must be implemented
-  in two steps: First, the code must insert the element in order as a red leaf
-  in the tree, and then the support library function rb_insert_color() must
-  be called. Such function will do the not trivial work to rebalance the
-  rbtree, if necessary.
-
------------------------------------------------------------------------
-static inline struct page * rb_search_page_cache(struct inode * inode,
-                                                unsigned long offset)
-{
-       struct rb_node * n = inode->i_rb_page_cache.rb_node;
-       struct page * page;
-
-       while (n)
-       {
-               page = rb_entry(n, struct page, rb_page_cache);
-
-               if (offset < page->offset)
-                       n = n->rb_left;
-               else if (offset > page->offset)
-                       n = n->rb_right;
-               else
-                       return page;
-       }
-       return NULL;
-}
-
-static inline struct page * __rb_insert_page_cache(struct inode * inode,
-                                                  unsigned long offset,
-                                                  struct rb_node * node)
-{
-       struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
-       struct rb_node * parent = NULL;
-       struct page * page;
-
-       while (*p)
-       {
-               parent = *p;
-               page = rb_entry(parent, struct page, rb_page_cache);
-
-               if (offset < page->offset)
-                       p = &(*p)->rb_left;
-               else if (offset > page->offset)
-                       p = &(*p)->rb_right;
-               else
-                       return page;
-       }
-
-       rb_link_node(node, parent, p);
-
-       return NULL;
-}
-
-static inline struct page * rb_insert_page_cache(struct inode * inode,
-                                                unsigned long offset,
-                                                struct rb_node * node)
-{
-       struct page * ret;
-       if ((ret = __rb_insert_page_cache(inode, offset, node)))
-               goto out;
-       rb_insert_color(node, &inode->i_rb_page_cache);
- out:
-       return ret;
-}
------------------------------------------------------------------------
+  See Documentation/rbtree.txt for documentation and samples.
 */
 
 #ifndef        _LINUX_RBTREE_H
@@ -97,63 +32,35 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 
-struct rb_node
-{
-       unsigned long  rb_parent_color;
-#define        RB_RED          0
-#define        RB_BLACK        1
+struct rb_node {
+       unsigned long  __rb_parent_color;
        struct rb_node *rb_right;
        struct rb_node *rb_left;
 } __attribute__((aligned(sizeof(long))));
     /* The alignment might seem pointless, but allegedly CRIS needs it */
 
-struct rb_root
-{
+struct rb_root {
        struct rb_node *rb_node;
 };
 
 
-#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
-#define rb_color(r)   ((r)->rb_parent_color & 1)
-#define rb_is_red(r)   (!rb_color(r))
-#define rb_is_black(r) rb_color(r)
-#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
-#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
-       rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
-}
-static inline void rb_set_color(struct rb_node *rb, int color)
-{
-       rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
-}
+#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 
 #define RB_ROOT        (struct rb_root) { NULL, }
 #define        rb_entry(ptr, type, member) container_of(ptr, type, member)
 
-#define RB_EMPTY_ROOT(root)    ((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node)    (rb_parent(node) == node)
-#define RB_CLEAR_NODE(node)    (rb_set_parent(node, node))
+#define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node)  \
+       ((node)->__rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node)  \
+       ((node)->__rb_parent_color = (unsigned long)(node))
 
-static inline void rb_init_node(struct rb_node *rb)
-{
-       rb->rb_parent_color = 0;
-       rb->rb_right = NULL;
-       rb->rb_left = NULL;
-       RB_CLEAR_NODE(rb);
-}
 
 extern void rb_insert_color(struct rb_node *, struct rb_root *);
 extern void rb_erase(struct rb_node *, struct rb_root *);
 
-typedef void (*rb_augment_f)(struct rb_node *node, void *data);
-
-extern void rb_augment_insert(struct rb_node *node,
-                             rb_augment_f func, void *data);
-extern struct rb_node *rb_augment_erase_begin(struct rb_node *node);
-extern void rb_augment_erase_end(struct rb_node *node,
-                                rb_augment_f func, void *data);
 
 /* Find logical next and previous nodes in a tree */
 extern struct rb_node *rb_next(const struct rb_node *);
@@ -168,7 +75,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
                                struct rb_node ** rb_link)
 {
-       node->rb_parent_color = (unsigned long )parent;
+       node->__rb_parent_color = (unsigned long)parent;
        node->rb_left = node->rb_right = NULL;
 
        *rb_link = node;
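
For reference, since the in-header sample above was dropped in favor of Documentation/rbtree.txt, here is a minimal sketch of the classic search/insert pattern against the slimmed-down API (struct mytype and the my_* helpers are hypothetical names; the shape follows the documentation's sample):

struct mytype {
	struct rb_node node;
	char *keystring;
};

static struct mytype *my_search(struct rb_root *root, const char *string)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct mytype *data = rb_entry(node, struct mytype, node);
		int result = strcmp(string, data->keystring);

		if (result < 0)
			node = node->rb_left;
		else if (result > 0)
			node = node->rb_right;
		else
			return data;
	}
	return NULL;
}

static int my_insert(struct rb_root *root, struct mytype *data)
{
	struct rb_node **new = &root->rb_node, *parent = NULL;

	/* Figure out where to put the new node. */
	while (*new) {
		struct mytype *this = rb_entry(*new, struct mytype, node);
		int result = strcmp(data->keystring, this->keystring);

		parent = *new;
		if (result < 0)
			new = &(*new)->rb_left;
		else if (result > 0)
			new = &(*new)->rb_right;
		else
			return 0;	/* key already present */
	}

	/* Link the new node in, then rebalance/recolor. */
	rb_link_node(&data->node, parent, new);
	rb_insert_color(&data->node, root);
	return 1;
}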
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
new file mode 100644 (file)
index 0000000..214caa3
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree_augmented.h
+*/
+
+#ifndef _LINUX_RBTREE_AUGMENTED_H
+#define _LINUX_RBTREE_AUGMENTED_H
+
+#include <linux/rbtree.h>
+
+/*
+ * Please note - only struct rb_augment_callbacks and the prototypes for
+ * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
+ * The rest are implementation details you are not expected to depend on.
+ *
+ * See Documentation/rbtree.txt for documentation and samples.
+ */
+
+struct rb_augment_callbacks {
+       void (*propagate)(struct rb_node *node, struct rb_node *stop);
+       void (*copy)(struct rb_node *old, struct rb_node *new);
+       void (*rotate)(struct rb_node *old, struct rb_node *new);
+};
+
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+       void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+static inline void
+rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+                   const struct rb_augment_callbacks *augment)
+{
+       __rb_insert_augmented(node, root, augment->rotate);
+}
+
+#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,      \
+                            rbtype, rbaugmented, rbcompute)            \
+static inline void                                                     \
+rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)         \
+{                                                                      \
+       while (rb != stop) {                                            \
+               rbstruct *node = rb_entry(rb, rbstruct, rbfield);       \
+               rbtype augmented = rbcompute(node);                     \
+               if (node->rbaugmented == augmented)                     \
+                       break;                                          \
+               node->rbaugmented = augmented;                          \
+               rb = rb_parent(&node->rbfield);                         \
+       }                                                               \
+}                                                                      \
+static inline void                                                     \
+rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
+{                                                                      \
+       rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);            \
+       rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);            \
+       new->rbaugmented = old->rbaugmented;                            \
+}                                                                      \
+static void                                                            \
+rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)      \
+{                                                                      \
+       rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);            \
+       rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);            \
+       new->rbaugmented = old->rbaugmented;                            \
+       old->rbaugmented = rbcompute(old);                              \
+}                                                                      \
+rbstatic const struct rb_augment_callbacks rbname = {                  \
+       rbname ## _propagate, rbname ## _copy, rbname ## _rotate        \
+};
+
+
+#define        RB_RED          0
+#define        RB_BLACK        1
+
+#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
+
+#define __rb_color(pc)     ((pc) & 1)
+#define __rb_is_black(pc)  __rb_color(pc)
+#define __rb_is_red(pc)    (!__rb_color(pc))
+#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
+#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
+#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+       rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+}
+
+static inline void rb_set_parent_color(struct rb_node *rb,
+                                      struct rb_node *p, int color)
+{
+       rb->__rb_parent_color = (unsigned long)p | color;
+}
+
+static inline void
+__rb_change_child(struct rb_node *old, struct rb_node *new,
+                 struct rb_node *parent, struct rb_root *root)
+{
+       if (parent) {
+               if (parent->rb_left == old)
+                       parent->rb_left = new;
+               else
+                       parent->rb_right = new;
+       } else
+               root->rb_node = new;
+}
+
+extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+       void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
+static __always_inline void
+rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+                  const struct rb_augment_callbacks *augment)
+{
+       struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+       struct rb_node *parent, *rebalance;
+       unsigned long pc;
+
+       if (!tmp) {
+               /*
+                * Case 1: node to erase has no more than 1 child (easy!)
+                *
+                * Note that if there is one child it must be red due to 5)
+                * and node must be black due to 4). We adjust colors locally
+                * so as to bypass __rb_erase_color() later on.
+                */
+               pc = node->__rb_parent_color;
+               parent = __rb_parent(pc);
+               __rb_change_child(node, child, parent, root);
+               if (child) {
+                       child->__rb_parent_color = pc;
+                       rebalance = NULL;
+               } else
+                       rebalance = __rb_is_black(pc) ? parent : NULL;
+               tmp = parent;
+       } else if (!child) {
+               /* Still case 1, but this time the child is node->rb_left */
+               tmp->__rb_parent_color = pc = node->__rb_parent_color;
+               parent = __rb_parent(pc);
+               __rb_change_child(node, tmp, parent, root);
+               rebalance = NULL;
+               tmp = parent;
+       } else {
+               struct rb_node *successor = child, *child2;
+               tmp = child->rb_left;
+               if (!tmp) {
+                       /*
+                        * Case 2: node's successor is its right child
+                        *
+                        *    (n)          (s)
+                        *    / \          / \
+                        *  (x) (s)  ->  (x) (c)
+                        *        \
+                        *        (c)
+                        */
+                       parent = successor;
+                       child2 = successor->rb_right;
+                       augment->copy(node, successor);
+               } else {
+                       /*
+                        * Case 3: node's successor is leftmost under
+                        * node's right child subtree
+                        *
+                        *    (n)          (s)
+                        *    / \          / \
+                        *  (x) (y)  ->  (x) (y)
+                        *      /            /
+                        *    (p)          (p)
+                        *    /            /
+                        *  (s)          (c)
+                        *    \
+                        *    (c)
+                        */
+                       do {
+                               parent = successor;
+                               successor = tmp;
+                               tmp = tmp->rb_left;
+                       } while (tmp);
+                       parent->rb_left = child2 = successor->rb_right;
+                       successor->rb_right = child;
+                       rb_set_parent(child, successor);
+                       augment->copy(node, successor);
+                       augment->propagate(parent, successor);
+               }
+
+               successor->rb_left = tmp = node->rb_left;
+               rb_set_parent(tmp, successor);
+
+               pc = node->__rb_parent_color;
+               tmp = __rb_parent(pc);
+               __rb_change_child(node, successor, tmp, root);
+               if (child2) {
+                       successor->__rb_parent_color = pc;
+                       rb_set_parent_color(child2, parent, RB_BLACK);
+                       rebalance = NULL;
+               } else {
+                       unsigned long pc2 = successor->__rb_parent_color;
+                       successor->__rb_parent_color = pc;
+                       rebalance = __rb_is_black(pc2) ? parent : NULL;
+               }
+               tmp = successor;
+       }
+
+       augment->propagate(tmp, NULL);
+       if (rebalance)
+               __rb_erase_color(rebalance, root, augment->rotate);
+}
+
+#endif /* _LINUX_RBTREE_AUGMENTED_H */
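
A minimal sketch of how the pieces above are meant to fit together, augmenting each node with the maximum of an interval endpoint over its subtree (struct my_node and the my_* names are hypothetical; maintaining the augmented value on the way down mirrors what lib/interval_tree.c does via INTERVAL_TREE_DEFINE):

struct my_node {
	struct rb_node rb;
	unsigned long start, last;
	unsigned long subtree_last;	/* max ->last within this subtree */
};

static inline unsigned long compute_subtree_last(struct my_node *node)
{
	unsigned long max = node->last, tmp;

	if (node->rb.rb_left) {
		tmp = rb_entry(node->rb.rb_left,
			       struct my_node, rb)->subtree_last;
		if (max < tmp)
			max = tmp;
	}
	if (node->rb.rb_right) {
		tmp = rb_entry(node->rb.rb_right,
			       struct my_node, rb)->subtree_last;
		if (max < tmp)
			max = tmp;
	}
	return max;
}

RB_DECLARE_CALLBACKS(static, my_augment_cb, struct my_node, rb,
		     unsigned long, subtree_last, compute_subtree_last)

static void my_insert(struct my_node *node, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *rb_parent = NULL;
	unsigned long start = node->start, last = node->last;
	struct my_node *parent;

	while (*link) {
		rb_parent = *link;
		parent = rb_entry(rb_parent, struct my_node, rb);
		/* Maintain the augmented value on the way down. */
		if (parent->subtree_last < last)
			parent->subtree_last = last;
		if (start < parent->start)
			link = &parent->rb.rb_left;
		else
			link = &parent->rb.rb_right;
	}

	node->subtree_last = last;
	rb_link_node(&node->rb, rb_parent, link);
	rb_insert_augmented(&node->rb, root, &my_augment_cb);
}

/* Removal is symmetric: rb_erase_augmented(&node->rb, root, &my_augment_cb); */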
index a90ebadd9da055bb5130782246872a0ef53d8438..1a7b6c7787a5136c0c5617758c9df71ac0ceaee7 100644 (file)
@@ -30,6 +30,7 @@
 #define RIO_MAX_MPORTS         8
 #define RIO_MAX_MPORT_RESOURCES        16
 #define RIO_MAX_DEV_RESOURCES  16
+#define RIO_MAX_MPORT_NAME     40
 
 #define RIO_GLOBAL_TABLE       0xff    /* Indicates access of a switch's
                                           global routing table if it
@@ -255,7 +256,7 @@ struct rio_mport {
                                 */
        enum rio_phy_type phy_type;     /* RapidIO phy type */
        u32 phys_efptr;
-       unsigned char name[40];
+       unsigned char name[RIO_MAX_MPORT_NAME];
        void *priv;             /* Master port private data */
 #ifdef CONFIG_RAPIDIO_DMA_ENGINE
        struct dma_device       dma;
index 3fce545df394c61b3c8a7f4babfdbbee212131a8..1767045c287294836f274c030cda9dabdbc3f5bb 100644 (file)
@@ -35,16 +35,19 @@ struct anon_vma {
         * anon_vma if they are the last user on release
         */
        atomic_t refcount;
+#ifdef CONFIG_SWAP
+       atomic_t swapra_miss;
+#endif
 
        /*
-        * NOTE: the LSB of the head.next is set by
+        * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
-        * head must only be read/written after taking the above lock
+        * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */
-       struct list_head head;  /* Chain of private "related" vmas */
+       struct rb_root rb_root; /* Interval tree of private "related" vmas */
 };
 
 /*
@@ -57,14 +60,18 @@ struct anon_vma {
  * with a VMA, or the VMAs associated with an anon_vma.
  * The "same_vma" list contains the anon_vma_chains linking
  * all the anon_vmas associated with this VMA.
- * The "same_anon_vma" list contains the anon_vma_chains
+ * The "rb" field indexes on an interval tree the anon_vma_chains
  * which link all the VMAs associated with this anon_vma.
  */
 struct anon_vma_chain {
        struct vm_area_struct *vma;
        struct anon_vma *anon_vma;
        struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */
-       struct list_head same_anon_vma; /* locked by anon_vma->mutex */
+       struct rb_node rb;                      /* locked by anon_vma->mutex */
+       unsigned long rb_subtree_last;
+#ifdef CONFIG_DEBUG_VM_RB
+       unsigned long cached_vma_start, cached_vma_last;
+#endif
 };
 
 #ifdef CONFIG_MMU
@@ -120,7 +127,6 @@ void anon_vma_init(void);   /* create anon_vma_cachep */
 int  anon_vma_prepare(struct vm_area_struct *);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
-void anon_vma_moveto_tail(struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 
 static inline void anon_vma_merge(struct vm_area_struct *vma,
diff --git a/include/linux/rtc-ds2404.h b/include/linux/rtc-ds2404.h
new file mode 100644 (file)
index 0000000..22c5382
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * ds2404.h - platform data structure for the DS2404 RTC.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2012 Sven Schnelle <svens@stackframe.org>
+ */
+
+#ifndef __LINUX_DS2404_H
+#define __LINUX_DS2404_H
+
+struct ds2404_platform_data {
+
+       unsigned int gpio_rst;
+       unsigned int gpio_clk;
+       unsigned int gpio_dq;
+};
+#endif
index f071b3922c67f7a253c0b5f978b4bec1b7a690df..20ec4d3bed733d3818f4bb7f899b0322b253372e 100644 (file)
@@ -276,7 +276,7 @@ static inline bool is_leap_year(unsigned int year)
        return (!(year % 4) && (year % 100)) || !(year % 400);
 }
 
-#ifdef CONFIG_RTC_HCTOSYS
+#ifdef CONFIG_RTC_HCTOSYS_DEVICE
 extern int rtc_hctosys_ret;
 #else
 #define rtc_hctosys_ret -ENODEV
index 46f62fcbc7809012441307235d4bf71577a928e2..9098c813cb25efbb59f4b90133f716ead0284fd7 100644 (file)
@@ -670,7 +670,6 @@ struct signal_struct {
        struct rw_semaphore group_rwsem;
 #endif
 
-       int oom_adj;            /* OOM kill score adjustment (bit shift) */
        int oom_score_adj;      /* OOM kill score adjustment */
        int oom_score_adj_min;  /* OOM kill score adjustment minimum value.
                                 * Only settable by CAP_SYS_RESOURCE. */
index 5088727478fd8ad91bc7e17f9b85252b8ef8e4a2..a520fd70a59f371f40a34f79e883dcf7b32c23f7 100644 (file)
@@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
 
 static inline void timerqueue_init(struct timerqueue_node *node)
 {
-       rb_init_node(&node->node);
+       RB_CLEAR_NODE(&node->node);
 }
 
 static inline void timerqueue_init_head(struct timerqueue_head *head)
index d6fd8e5b14b76c41bfd532c3fa86255e4e92b0f3..9391706e92541d2b5d865610a88525bf8215a599 100644 (file)
@@ -36,7 +36,6 @@
        {(unsigned long)__GFP_RECLAIMABLE,      "GFP_RECLAIMABLE"},     \
        {(unsigned long)__GFP_MOVABLE,          "GFP_MOVABLE"},         \
        {(unsigned long)__GFP_NOTRACK,          "GFP_NOTRACK"},         \
-       {(unsigned long)__GFP_NO_KSWAPD,        "GFP_NO_KSWAPD"},       \
        {(unsigned long)__GFP_OTHER_NODE,       "GFP_OTHER_NODE"}       \
        ) : "GFP_NOWAIT"
 
index 538af702525b2f340a173a05b9f0e57e7d24e56f..c921ecfb81ce7a992d4f2bce6e494d4839e54da6 100644 (file)
@@ -1231,6 +1231,7 @@ config BUG
           Just say Y.
 
 config ELF_CORE
+       depends on COREDUMP
        default y
        bool "Enable ELF core dumps" if EXPERT
        help
index d3f0aeed2d39fe06aa07cb4147f747af8b7597ee..b6aa6324502f23a8c24cd738c472bd0ecc358723 100644 (file)
@@ -69,23 +69,28 @@ __setup("ro", readonly);
 __setup("rw", readwrite);
 
 #ifdef CONFIG_BLOCK
+struct uuidcmp {
+       const char *uuid;
+       int len;
+};
+
 /**
  * match_dev_by_uuid - callback for finding a partition using its uuid
  * @dev:       device passed in by the caller
- * @data:      opaque pointer to a 36 byte char array with a UUID
+ * @data:      opaque pointer to the desired struct uuidcmp to match
  *
  * Returns 1 if the device matches, and 0 otherwise.
  */
 static int match_dev_by_uuid(struct device *dev, void *data)
 {
-       u8 *uuid = data;
+       struct uuidcmp *cmp = data;
        struct hd_struct *part = dev_to_part(dev);
 
        if (!part->info)
                goto no_match;
 
-       if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid)))
-                       goto no_match;
+       if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len))
+               goto no_match;
 
        return 1;
 no_match:
@@ -95,7 +100,7 @@ no_match:
 
 /**
  * devt_from_partuuid - looks up the dev_t of a partition by its UUID
- * @uuid:      min 36 byte char array containing a hex ascii UUID
+ * @uuid:      char array containing ascii UUID
  *
  * The function will return the first partition which contains a matching
  * UUID value in its partition_meta_info struct.  This does not search
@@ -106,38 +111,41 @@ no_match:
  *
  * Returns the matching dev_t on success or 0 on failure.
  */
-static dev_t devt_from_partuuid(char *uuid_str)
+static dev_t devt_from_partuuid(const char *uuid_str)
 {
        dev_t res = 0;
+       struct uuidcmp cmp;
        struct device *dev = NULL;
-       u8 uuid[16];
        struct gendisk *disk;
        struct hd_struct *part;
        int offset = 0;
+       bool clear_root_wait = false;
+       char *slash;
 
-       if (strlen(uuid_str) < 36)
-               goto done;
+       cmp.uuid = uuid_str;
 
+       slash = strchr(uuid_str, '/');
        /* Check for optional partition number offset attributes. */
-       if (uuid_str[36]) {
+       if (slash) {
                char c = 0;
                /* Explicitly fail on poor PARTUUID syntax. */
-               if (sscanf(&uuid_str[36],
-                          "/PARTNROFF=%d%c", &offset, &c) != 1) {
-                       printk(KERN_ERR "VFS: PARTUUID= is invalid.\n"
-                        "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
-                       if (root_wait)
-                               printk(KERN_ERR
-                                    "Disabling rootwait; root= is invalid.\n");
-                       root_wait = 0;
+               if (sscanf(slash + 1,
+                          "PARTNROFF=%d%c", &offset, &c) != 1) {
+                       clear_root_wait = true;
                        goto done;
                }
+               cmp.len = slash - uuid_str;
+       } else {
+               cmp.len = strlen(uuid_str);
        }
 
-       /* Pack the requested UUID in the expected format. */
-       part_pack_uuid(uuid_str, uuid);
+       if (!cmp.len) {
+               clear_root_wait = true;
+               goto done;
+       }
 
-       dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid);
+       dev = class_find_device(&block_class, NULL, &cmp,
+                               &match_dev_by_uuid);
        if (!dev)
                goto done;
 
@@ -158,6 +166,13 @@ static dev_t devt_from_partuuid(char *uuid_str)
 no_offset:
        put_device(dev);
 done:
+       if (clear_root_wait) {
+               pr_err("VFS: PARTUUID= is invalid.\n"
+                      "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
+               if (root_wait)
+                       pr_err("Disabling rootwait; root= is invalid.\n");
+               root_wait = 0;
+       }
        return res;
 }
 #endif
@@ -174,6 +189,10 @@ done:
  *        used when disk name of partitioned disk ends on a digit.
  *     6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
  *        unique id of a partition if the partition table provides it.
+ *        The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
+ *        partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
+ *        filled hex representation of the 32-bit "NT disk signature", and PP
+ *        is a zero-filled hex representation of the 1-based partition number.
  *     7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
  *        a partition with a known unique id.
  *
index b28673087ac006e79f4563b1a06d39eb1b163d37..cf5f30f53f30dadae8a3269b3fcc1a3d7e93effb 100644 (file)
@@ -86,7 +86,6 @@ extern void init_IRQ(void);
 extern void fork_init(unsigned long);
 extern void mca_init(void);
 extern void sbus_init(void);
-extern void prio_tree_init(void);
 extern void radix_tree_init(void);
 #ifndef CONFIG_DEBUG_RODATA
 static inline void mark_rodata_ro(void) { }
@@ -547,7 +546,6 @@ asmlinkage void __init start_kernel(void)
        /* init some links before init_ISA_irqs() */
        early_irq_init();
        init_IRQ();
-       prio_tree_init();
        init_timers();
        hrtimers_init();
        softirq_init();
index 9a08acc9e64923ea8af5bb9286d6f2ddd48c052e..a9d679395e9af6da654fc3f59b6b3d9841619f6a 100644 (file)
@@ -142,7 +142,6 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
                leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
                if (!leaf)
                        return -ENOMEM;
-               rb_init_node(&leaf->rb_node);
                INIT_LIST_HEAD(&leaf->msg_list);
                info->qsize += sizeof(*leaf);
        }
@@ -1013,7 +1012,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 
        if (!info->node_cache && new_leaf) {
                /* Save our speculative allocation into the cache */
-               rb_init_node(&new_leaf->rb_node);
                INIT_LIST_HEAD(&new_leaf->msg_list);
                info->node_cache = new_leaf;
                info->qsize += sizeof(*new_leaf);
@@ -1121,7 +1119,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 
        if (!info->node_cache && new_leaf) {
                /* Save our speculative allocation into the cache */
-               rb_init_node(&new_leaf->rb_node);
                INIT_LIST_HEAD(&new_leaf->msg_list);
                info->node_cache = new_leaf;
                info->qsize += sizeof(*new_leaf);
index 58d31f1c1eb59920a558705b677c8db3ff80b6d9..ebd8fececcfc63820a9904de2cb9a20481e805c4 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -61,8 +61,8 @@
  * - A woken up task may not even touch the semaphore array anymore, it may
  *   have been destroyed already by a semctl(RMID).
  * - The synchronizations between wake-ups due to a timeout/signal and a
- *   wake-up due to a completed semaphore operation is achieved by using an
- *   intermediate state (IN_WAKEUP).
+ *   wake-up due to a completed semaphore operation is achieved by using a
+ *   special wakeup scheme (queuewakeup_wait and support functions)
  * - UNDO values are stored in an array (one per process and per
  *   semaphore array, lazily allocated). For backwards compatibility, multiple
  *   modes for the UNDO variables are supported (per process, per thread)
 #include <asm/uaccess.h>
 #include "util.h"
 
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+       #define SYSVSEM_COMPLETION 1
+#else
+       #define SYSVSEM_CUSTOM 1
+#endif
+
+#ifdef SYSVSEM_COMPLETION
+       /* Using a completion causes some overhead, but avoids a busy loop
+        * that increases the worst case latency.
+        */
+       struct queue_done {
+               struct completion done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               /* no preparation necessary */
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               complete_all(&qd->done);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               init_completion(&qd->done);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               wait_for_completion(&qd->done);
+       }
+
+#elif defined(SYSVSEM_SPINLOCK)
+       /* Note: Spinlocks do not work because:
+        * - lockdep complains [could be fixed]
+        * - only 255 concurrent spin_lock() calls are permitted, then the
+        *   preempt-counter overflows
+        */
+#error SYSVSEM_SPINLOCK is a proof of concept and does not work.
+       struct queue_done {
+               spinlock_t done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               BUG_ON(spin_is_locked(&qd->done));
+               spin_lock(&qd->done);
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               spin_unlock(&qd->done);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               spin_lock_init(&qd->done);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               spin_unlock_wait(&qd->done);
+       }
+#else
+       struct queue_done {
+               atomic_t done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               preempt_disable();
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               preempt_enable();
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               BUG_ON(atomic_read(&qd->done) != 1);
+               atomic_set(&qd->done, 2);
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               BUG_ON(atomic_read(&qd->done) != 2);
+               smp_mb();
+               atomic_set(&qd->done, 1);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               atomic_set(&qd->done, 1);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               while (atomic_read(&qd->done) != 1)
+                       cpu_relax();
+
+               smp_mb();
+       }
+#endif
+
+
 /* One semaphore structure for each semaphore in the system. */
 struct sem {
        int     semval;         /* current value */
@@ -108,6 +237,7 @@ struct sem_queue {
        struct sembuf           *sops;   /* array of pending operations */
        int                     nsops;   /* number of operations */
        int                     alter;   /* does *sops alter the array? */
+       struct queue_done       done;    /* completion synchronization */
 };
 
 /* Each task has a list of undo requests. They are executed automatically
@@ -245,23 +375,27 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  * - queue.status is initialized to -EINTR before blocking.
  * - wakeup is performed by
  *     * unlinking the queue entry from sma->sem_pending
- *     * setting queue.status to IN_WAKEUP
- *       This is the notification for the blocked thread that a
- *       result value is imminent.
+ *     * setting queue.status to the actual result code
+ *       This is the notification for the blocked thread that someone
+ *       (usually: update_queue()) completed the semtimedop() operation.
  *     * call wake_up_process
- *     * set queue.status to the final value.
+ *     * queuewakeup_handsoff(&q->done);
  * - the previously blocked thread checks queue.status:
- *     * if it's IN_WAKEUP, then it must wait until the value changes
- *     * if it's not -EINTR, then the operation was completed by
- *       update_queue. semtimedop can return queue.status without
- *       performing any operation on the sem array.
- *     * otherwise it must acquire the spinlock and check what's up.
+ *     * if it's not -EINTR, then someone completed the operation.
+ *       First, queuewakeup_wait() must be called. Afterwards,
+ *       semtimedop must return queue.status without performing any
+ *       operation on the sem array.
+ *     * otherwise it must acquire the spinlock and repeat the test.
+ *       If it is still -EINTR, then no update_queue() completed the
+ *       operation, thus semtimedop() can proceed normally.
  *
- * The two-stage algorithm is necessary to protect against the following
+ * queuewakeup_wait() is necessary to protect against the following
  * races:
  * - if queue.status is set after wake_up_process, then the woken up idle
  *   thread could race forward and try (and fail) to acquire sma->lock
- *   before update_queue had a chance to set queue.status
+ *   before update_queue had a chance to set queue.status.
+ *   More importantly, it would mean that wake_up_process must be
+ *   called while holding sma->lock, which would reduce scalability.
  * - if queue.status is written before wake_up_process and if the
  *   blocked process is woken up by a signal between writing
  *   queue.status and the wake_up_process, then the woken up
@@ -271,7 +405,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  *   (yes, this happened on s390 with sysv msg).
  *
  */
-#define IN_WAKEUP      1
 
 /**
  * newary - Create a new semaphore set
@@ -461,15 +594,11 @@ undo:
 static void wake_up_sem_queue_prepare(struct list_head *pt,
                                struct sem_queue *q, int error)
 {
-       if (list_empty(pt)) {
-               /*
-                * Hold preempt off so that we don't get preempted and have the
-                * wakee busy-wait until we're scheduled back on.
-                */
-               preempt_disable();
-       }
-       q->status = IN_WAKEUP;
-       q->pid = error;
+       if (list_empty(pt))
+               queuewakeup_prepare();
+
+       queuewakeup_block(&q->done);
+       q->status = error;
 
        list_add_tail(&q->simple_list, pt);
 }
@@ -480,8 +609,8 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
  *
  * Do the actual wake-up.
  * The function is called without any locks held, thus the semaphore array
- * could be destroyed already and the tasks can disappear as soon as the
- * status is set to the actual return code.
+ * could be destroyed already and the tasks can disappear as soon as
+ * queuewakeup_handsoff() is called.
  */
 static void wake_up_sem_queue_do(struct list_head *pt)
 {
@@ -491,12 +620,11 @@ static void wake_up_sem_queue_do(struct list_head *pt)
        did_something = !list_empty(pt);
        list_for_each_entry_safe(q, t, pt, simple_list) {
                wake_up_process(q->sleeper);
-               /* q can disappear immediately after writing q->status. */
-               smp_wmb();
-               q->status = q->pid;
+               /* q can disappear immediately after completing q->done */
+               queuewakeup_handsoff(&q->done);
        }
        if (did_something)
-               preempt_enable();
+               queuewakeup_completed();
 }
 
 static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -1302,33 +1430,6 @@ out:
        return un;
 }
 
-
-/**
- * get_queue_result - Retrieve the result code from sem_queue
- * @q: Pointer to queue structure
- *
- * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
- * q->status, then we must loop until the value is replaced with the final
- * value: This may happen if a task is woken up by an unrelated event (e.g.
- * signal) and in parallel the task is woken up by another task because it got
- * the requested semaphores.
- *
- * The function can be called with or without holding the semaphore spinlock.
- */
-static int get_queue_result(struct sem_queue *q)
-{
-       int error;
-
-       error = q->status;
-       while (unlikely(error == IN_WAKEUP)) {
-               cpu_relax();
-               error = q->status;
-       }
-
-       return error;
-}
-
-
 SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
                unsigned, nsops, const struct timespec __user *, timeout)
 {
@@ -1474,6 +1575,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
        queue.status = -EINTR;
        queue.sleeper = current;
+       queuewakeup_init(&queue.done);
 
 sleep_again:
        current->state = TASK_INTERRUPTIBLE;
@@ -1484,17 +1586,14 @@ sleep_again:
        else
                schedule();
 
-       error = get_queue_result(&queue);
+       error = queue.status;
 
        if (error != -EINTR) {
                /* fast path: update_queue already obtained all requested
-                * resources.
-                * Perform a smp_mb(): User space could assume that semop()
-                * is a memory barrier: Without the mb(), the cpu could
-                * speculatively read in user space stale data that was
-                * overwritten by the previous owner of the semaphore.
+                * resources. Just ensure that update_queue completed
+                * its access to &queue.
                 */
-               smp_mb();
+               queuewakeup_wait(&queue.done);
 
                goto out_free;
        }
@@ -1504,23 +1603,16 @@ sleep_again:
        /*
         * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
         */
-       error = get_queue_result(&queue);
-
-       /*
-        * Array removed? If yes, leave without sem_unlock().
-        */
-       if (IS_ERR(sma)) {
-               goto out_free;
-       }
-
-
-       /*
-        * If queue.status != -EINTR we are woken up by another process.
-        * Leave without unlink_queue(), but with sem_unlock().
-        */
-
+       error = queue.status;
        if (error != -EINTR) {
-               goto out_unlock_free;
+               /* If there is a return code, then we can leave immediately. */
+               if (!IS_ERR(sma)) {
+                       /* sem_lock() succeeded - then unlock */
+                       sem_unlock(sma);
+               }
+               /* Except that we must wait for the hands-off */
+               queuewakeup_wait(&queue.done);
+               goto out_free;
        }
 
        /*
index 37f52f27828df4890a4dce17c2a8402f013b74f8..04ac48f1b52984eb99963122411e289c54fb8ece 100644 (file)
@@ -1159,7 +1159,6 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
        const struct cred *cred;
        char name[sizeof(tsk->comm)];
        struct mm_struct *mm = tsk->mm;
-       struct vm_area_struct *vma;
        char *tty;
 
        if (!ab)
@@ -1193,16 +1192,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 
        if (mm) {
                down_read(&mm->mmap_sem);
-               vma = mm->mmap;
-               while (vma) {
-                       if ((vma->vm_flags & VM_EXECUTABLE) &&
-                           vma->vm_file) {
-                               audit_log_d_path(ab, " exe=",
-                                                &vma->vm_file->f_path);
-                               break;
-                       }
-                       vma = vma->vm_next;
-               }
+               if (mm->exe_file)
+                       audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
                up_read(&mm->mmap_sem);
        }
        audit_log_task_context(ab);
index bc7d6621abe2dd101c7dcbd5e0093702bdaed690..95354abb24a66f34490e4ab576d3e4867b21b8d6 100644 (file)
@@ -3678,7 +3678,7 @@ unlock:
                atomic_inc(&event->mmap_count);
        mutex_unlock(&event->mmap_mutex);
 
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
 
        return ret;
index 1666632e6edfcfc07c867c91d5b7039dc84ce2f6..468d91a7e0b79275009ccafd697effb390673801 100644 (file)
@@ -732,7 +732,6 @@ static struct map_info *
 build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 {
        unsigned long pgoff = offset >> PAGE_SHIFT;
-       struct prio_tree_iter iter;
        struct vm_area_struct *vma;
        struct map_info *curr = NULL;
        struct map_info *prev = NULL;
@@ -741,7 +740,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
  again:
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;
 
index 5a0e74d89a5aa2e459e42679d6105fda07b7e8bd..e7a601961b5453458c904deeb6e95dac84262ddd 100644 (file)
@@ -422,7 +422,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                                mapping->i_mmap_writable++;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
-                       vma_prio_tree_add(tmp, mpnt);
+                       if (unlikely(tmp->vm_flags & VM_NONLINEAR))
+                               vma_nonlinear_insert(tmp,
+                                               &mapping->i_mmap_nonlinear);
+                       else
+                               vma_interval_tree_insert_after(tmp, mpnt,
+                                                       &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        mutex_unlock(&mapping->i_mmap_mutex);
                }
@@ -621,26 +626,6 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas--;
-       if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
-               fput(mm->exe_file);
-               mm->exe_file = NULL;
-       }
-
-}
-
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
        if (new_exe_file)
@@ -648,15 +633,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
        if (mm->exe_file)
                fput(mm->exe_file);
        mm->exe_file = new_exe_file;
-       mm->num_exe_file_vmas = 0;
 }
 
 struct file *get_mm_exe_file(struct mm_struct *mm)
 {
        struct file *exe_file;
 
-       /* We need mmap_sem to protect against races with removal of
-        * VM_EXECUTABLE vmas */
+       /* We need mmap_sem to protect against races with removal of exe_file */
        down_read(&mm->mmap_sem);
        exe_file = mm->exe_file;
        if (exe_file)
@@ -1077,7 +1060,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        init_rwsem(&sig->group_rwsem);
 #endif
 
-       sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
index 0668d58d6413e8eeb8736b234be6d8d7378a4bea..5e4bd7864c5dedf836a7c85cc1ed8e3c4e31e6cb 100644 (file)
@@ -21,7 +21,6 @@
 #include <linux/hardirq.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
-#include <generated/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
 #include <linux/suspend.h>
index 34d45886ee8429acc7197c8ceaca18ef215fb2ae..73f35d4b30b9d22e727265f9ff06cd6250a88a36 100644 (file)
@@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,
        struct resource *parent = root;
        struct resource *conflict;
        struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+       struct resource *next_res = NULL;
 
        if (!res)
                return;
@@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,
        res->end = end;
        res->flags = IORESOURCE_BUSY;
 
-       conflict = __request_resource(parent, res);
-       if (!conflict)
-               return;
+       while (1) {
 
-       /* failed, split and try again */
-       kfree(res);
+               conflict = __request_resource(parent, res);
+               if (!conflict) {
+                       if (!next_res)
+                               break;
+                       res = next_res;
+                       next_res = NULL;
+                       continue;
+               }
 
-       /* conflict covered whole area */
-       if (conflict->start <= start && conflict->end >= end)
-               return;
+               /* conflict covered whole area */
+               if (conflict->start <= res->start &&
+                               conflict->end >= res->end) {
+                       kfree(res);
+                       WARN_ON(next_res);
+                       break;
+               }
+
+               /* failed, split and try again */
+               if (conflict->start > res->start) {
+                       end = res->end;
+                       res->end = conflict->start - 1;
+                       if (conflict->end < end) {
+                               next_res = kzalloc(sizeof(*next_res),
+                                               GFP_ATOMIC);
+                               if (!next_res) {
+                                       kfree(res);
+                                       break;
+                               }
+                               next_res->name = name;
+                               next_res->start = conflict->end + 1;
+                               next_res->end = end;
+                               next_res->flags = IORESOURCE_BUSY;
+                       }
+               } else {
+                       res->start = conflict->end + 1;
+               }
+       }
 
-       if (conflict->start > start)
-               __reserve_region_with_split(root, start, conflict->start-1, name);
-       if (conflict->end < end)
-               __reserve_region_with_split(root, conflict->end+1, end, name);
 }
 
 void __init reserve_region_with_split(struct resource *root,
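
(A worked reading of the rewrite above, with made-up numbers: if [0x100,0x1ff] is requested and [0x140,0x17f] already conflicts, res is trimmed to [0x100,0x13f] and retried on the next loop iteration, while [0x180,0x1ff] is parked in next_res and retried once the first half is placed. The old code handled each fragment with a recursive call, so a heavily fragmented resource map could grow the stack arbitrarily during early boot.)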
index 2c681f11b7d24b9deab0e7b4a7c1498f11a23b16..2ad3f5904bd7f0e11b2b4db927db687083ab86c2 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
+#include <linux/coredump.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ptrace.h>
index 241507f23eca097871bec58f5976476f352f3d75..6fab59a1fc24d965e8c2e936d5e80e91e623592c 100644 (file)
@@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 void kernel_restart(char *cmd)
 {
        kernel_restart_prepare(cmd);
+       disable_nonboot_cpus();
        if (!cmd)
                printk(KERN_EMERG "Restarting system.\n");
        else
index 84c76a34e41c7a01f3464506c5352cf00cf1f8ca..c2a2f8084bad0c95fbfc77ec08b2ee806d338097 100644 (file)
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern int max_threads;
-extern int core_uses_pid;
 extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
 extern char core_pattern[];
 extern unsigned int core_pipe_limit;
+#endif
 extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
@@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
 
 static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COREDUMP
 static int proc_dostring_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
@@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_COREDUMP
        {
                .procname       = "core_uses_pid",
                .data           = &core_uses_pid,
@@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#endif
 #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
@@ -2036,12 +2042,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 
 static void validate_coredump_safety(void)
 {
+#ifdef CONFIG_COREDUMP
        if (suid_dumpable == SUID_DUMPABLE_SAFE &&
            core_pattern[0] != '/' && core_pattern[0] != '|') {
                printk(KERN_WARNING "Unsafe core_pattern used with "\
                        "suid_dumpable=2. Pipe handler or fully qualified "\
                        "core dump path required.\n");
        }
+#endif
 }
 
 static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
@@ -2053,6 +2061,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
        return error;
 }
 
+#ifdef CONFIG_COREDUMP
 static int proc_dostring_coredump(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2061,6 +2070,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
                validate_coredump_safety();
        return error;
 }
+#endif
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
                                     void __user *buffer,
index 123793cd06f931477ae61cb7666a466b616bdfb5..ab558cc66c850803ea5edfbe4e960e56cfdbab7d 100644 (file)
@@ -437,6 +437,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
        na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
                                sizeof(struct cgroupstats));
        if (na == NULL) {
+               nlmsg_free(rep_skb);
                rc = -EMSGSIZE;
                goto err;
        }
index ba744cf80696203b65406c7aef8faae3a23048d0..8d8bebd9ad60cd16b9ef8f0dac0ae2df8a9dfc1f 100644 (file)
@@ -232,7 +232,7 @@ EXPORT_SYMBOL(current_fs_time);
  * Avoid unnecessary multiplications/divisions in the
  * two most common HZ cases:
  */
-inline unsigned int jiffies_to_msecs(const unsigned long j)
+unsigned int jiffies_to_msecs(const unsigned long j)
 {
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +248,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_msecs);
 
-inline unsigned int jiffies_to_usecs(const unsigned long j)
+unsigned int jiffies_to_usecs(const unsigned long j)
 {
 #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
        return (USEC_PER_SEC / HZ) * j;
index eb51d76e058a401477776e279fba95c95a315a4a..04612394c53e2d32e393d8ff6d14e1de6e6a6c39 100644 (file)
@@ -370,7 +370,7 @@ if ($hz eq '--can') {
        }
 
        @val = @{$canned_values{$hz}};
-       if (!defined(@val)) {
+       if (!@val) {
                @val = compute_values($hz);
        }
        output($hz, @val);
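
(The !@val form is needed because defined() on an array has long been deprecated and newer perls warn about or reject it; an array in boolean context is false exactly when it is empty, which is the condition being tested.)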
index 35c4565ee8fa26d5ce1ca1147ac6ccbdf47cf386..6c582e7ec7dd0adf3b8f49199f84d416650b4957 100644 (file)
@@ -196,12 +196,13 @@ config LOCKUP_DETECTOR
          thresholds can be controlled through the sysctl watchdog_thresh.
 
 config HARDLOCKUP_DETECTOR
-       def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \
-                !HAVE_NMI_WATCHDOG
+       def_bool y
+       depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
+       depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 
 config BOOTPARAM_HARDLOCKUP_PANIC
        bool "Panic (Reboot) On Hard Lockups"
-       depends on LOCKUP_DETECTOR
+       depends on HARDLOCKUP_DETECTOR
        help
          Say Y here to enable the kernel to panic on "hard lockups",
          which are bugs that cause the kernel to loop in kernel
@@ -212,7 +213,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC
 
 config BOOTPARAM_HARDLOCKUP_PANIC_VALUE
        int
-       depends on LOCKUP_DETECTOR
+       depends on HARDLOCKUP_DETECTOR
        range 0 1
        default 0 if !BOOTPARAM_HARDLOCKUP_PANIC
        default 1 if BOOTPARAM_HARDLOCKUP_PANIC
@@ -797,6 +798,15 @@ config DEBUG_VM
 
          If unsure, say N.
 
+config DEBUG_VM_RB
+       bool "Debug VM red-black trees"
+       depends on DEBUG_VM
+       help
+         Enable this to turn on more extended checks in the virtual-memory
+         system that may impact performance.
+
+         If unsure, say N.
+
 config DEBUG_VIRTUAL
        bool "Debug VM translations"
        depends on DEBUG_KERNEL && X86
@@ -1281,6 +1291,19 @@ config LATENCYTOP
 source mm/Kconfig.debug
 source kernel/trace/Kconfig
 
+config RBTREE_TEST
+       tristate "Red-Black tree test"
+       depends on m && DEBUG_KERNEL
+       help
+         A benchmark measuring the performance of the rbtree library.
+         Also includes rbtree invariant checks.
+
+config INTERVAL_TREE_TEST
+       tristate "Interval tree test"
+       depends on m && DEBUG_KERNEL
+       help
+         A benchmark measuring the performance of the interval tree library.
+
 config PROVIDE_OHCI1394_DMA_INIT
        bool "Remote debugging over FireWire early on boot"
        depends on PCI && X86
index 42d283edc4d3157cc81f897c51fd4ee20a615b62..3128e357e28649b58ddc05bf4d0802455a10f8fc 100644 (file)
@@ -9,7 +9,7 @@ endif
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
         rbtree.o radix-tree.o dump_stack.o timerqueue.o\
-        idr.o int_sqrt.o extable.o prio_tree.o \
+        idr.o int_sqrt.o extable.o \
         sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
         proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
         is_single_threaded.o plist.o decompress.o
@@ -140,6 +140,11 @@ $(foreach file, $(libfdt_files), \
        $(eval CFLAGS_$(file) = -I$(src)/../scripts/dtc/libfdt))
 lib-$(CONFIG_LIBFDT) += $(libfdt_files)
 
+obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o
+obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o
+
+interval_tree_test-objs := interval_tree_test_main.o interval_tree.o
+
 hostprogs-y    := gen_crc32table
 clean-files    := crc32table.h
 
index 3d766b7f60aba915b9957be45e089e699f164c3e..31a8042772820d695b85dc15edfb33f57666b0db 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/init.h>
 
 #ifndef CONFIG_DECOMPRESS_GZIP
 # define gunzip NULL
 # define unlzo NULL
 #endif
 
-static const struct compress_format {
+struct compress_format {
        unsigned char magic[2];
        const char *name;
        decompress_fn decompressor;
-} compressed_formats[] = {
+};
+
+static const struct compress_format compressed_formats[] __initdata = {
        { {037, 0213}, "gzip", gunzip },
        { {037, 0236}, "gzip", gunzip },
        { {0x42, 0x5a}, "bzip2", bunzip2 },
@@ -45,7 +48,7 @@ static const struct compress_format {
        { {0, 0}, NULL, NULL }
 };
 
-decompress_fn decompress_method(const unsigned char *inbuf, int len,
+decompress_fn __init decompress_method(const unsigned char *inbuf, int len,
                                const char **name)
 {
        const struct compress_format *cf;
index cce4f3cd14b36f1511b7372a5c8a8106b9495437..3657f129d7b8c5db0bee11a94a97d5d4bf251b52 100644 (file)
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -9,6 +9,9 @@ unsigned long gcd(unsigned long a, unsigned long b)
 
        if (a < b)
                swap(a, b);
+
+       if (!b)
+               return a;
        while ((r = a % b) != 0) {
                a = b;
                b = r;
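
A userspace mirror of the fixed function (a sketch, not the kernel build) showing why the guard matters: without it, gcd(12, 0) would evaluate 12 % 0 and fault, whereas gcd(a, 0) is defined to be a.

#include <assert.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	unsigned long r;

	if (a < b) {		/* mirrors swap(a, b) */
		r = a;
		a = b;
		b = r;
	}
	if (!b)			/* the new guard: gcd(a, 0) == a */
		return a;
	while ((r = a % b) != 0) {
		a = b;
		b = r;
	}
	return b;
}

int main(void)
{
	assert(gcd(12, 0) == 12);	/* previously: division by zero */
	assert(gcd(54, 24) == 6);
	return 0;
}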
index 4046e29c0a997bb38fd3892adeb55a4cf4914e31..648239079dd21329f0329ddaf12552677149d846 100644 (file)
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -20,7 +20,7 @@
  * that id to this code and it returns your pointer.
 
  * You can release ids at any time. When all ids are released, most of
- * the memory is returned (we keep IDR_FREE_MAX) in a local pool so we
+ * the memory is returned (we keep MAX_IDR_FREE) in a local pool so we
  * don't need to go to the memory "store" during an id allocate, just
  * so you don't need to be too concerned about locking and conflicts
  * with the slab allocator.
@@ -122,7 +122,7 @@ static void idr_mark_full(struct idr_layer **pa, int id)
  */
 int idr_pre_get(struct idr *idp, gfp_t gfp_mask)
 {
-       while (idp->id_free_cnt < IDR_FREE_MAX) {
+       while (idp->id_free_cnt < MAX_IDR_FREE) {
                struct idr_layer *new;
                new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
                if (new == NULL)
@@ -179,7 +179,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa)
                        sh = IDR_BITS*l;
                        id = ((id >> sh) ^ n ^ m) << sh;
                }
-               if ((id >= MAX_ID_BIT) || (id < 0))
+               if ((id >= MAX_IDR_BIT) || (id < 0))
                        return IDR_NOMORE_SPACE;
                if (l == 0)
                        break;
@@ -223,7 +223,7 @@ build_up:
         * Add a new layer to the top of the tree if the requested
         * id is larger than the currently allocated space.
         */
-       while ((layers < (MAX_LEVEL - 1)) && (id >= (1 << (layers*IDR_BITS)))) {
+       while ((layers < (MAX_IDR_LEVEL - 1)) && (id >= (1 << (layers*IDR_BITS)))) {
                layers++;
                if (!p->count) {
                        /* special case: if the tree is currently empty,
@@ -265,7 +265,7 @@ build_up:
 
 static int idr_get_new_above_int(struct idr *idp, void *ptr, int starting_id)
 {
-       struct idr_layer *pa[MAX_LEVEL];
+       struct idr_layer *pa[MAX_IDR_LEVEL];
        int id;
 
        id = idr_get_empty_slot(idp, starting_id, pa);
@@ -357,7 +357,7 @@ static void idr_remove_warning(int id)
 static void sub_remove(struct idr *idp, int shift, int id)
 {
        struct idr_layer *p = idp->top;
-       struct idr_layer **pa[MAX_LEVEL];
+       struct idr_layer **pa[MAX_IDR_LEVEL];
        struct idr_layer ***paa = &pa[0];
        struct idr_layer *to_free;
        int n;
@@ -402,7 +402,7 @@ void idr_remove(struct idr *idp, int id)
        struct idr_layer *to_free;
 
        /* Mask off upper bits we don't use for the search. */
-       id &= MAX_ID_MASK;
+       id &= MAX_IDR_MASK;
 
        sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
        if (idp->top && idp->top->count == 1 && (idp->layers > 1) &&
@@ -420,7 +420,7 @@ void idr_remove(struct idr *idp, int id)
                to_free->bitmap = to_free->count = 0;
                free_layer(to_free);
        }
-       while (idp->id_free_cnt >= IDR_FREE_MAX) {
+       while (idp->id_free_cnt >= MAX_IDR_FREE) {
                p = get_from_free_list(idp);
                /*
                 * Note: we don't call the rcu callback here, since the only
@@ -451,7 +451,7 @@ void idr_remove_all(struct idr *idp)
        int n, id, max;
        int bt_mask;
        struct idr_layer *p;
-       struct idr_layer *pa[MAX_LEVEL];
+       struct idr_layer *pa[MAX_IDR_LEVEL];
        struct idr_layer **paa = &pa[0];
 
        n = idp->layers * IDR_BITS;
@@ -517,7 +517,7 @@ void *idr_find(struct idr *idp, int id)
        n = (p->layer+1) * IDR_BITS;
 
        /* Mask off upper bits we don't use for the search. */
-       id &= MAX_ID_MASK;
+       id &= MAX_IDR_MASK;
 
        if (id >= (1 << n))
                return NULL;
@@ -555,7 +555,7 @@ int idr_for_each(struct idr *idp,
 {
        int n, id, max, error = 0;
        struct idr_layer *p;
-       struct idr_layer *pa[MAX_LEVEL];
+       struct idr_layer *pa[MAX_IDR_LEVEL];
        struct idr_layer **paa = &pa[0];
 
        n = idp->layers * IDR_BITS;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(idr_for_each);
  */
 void *idr_get_next(struct idr *idp, int *nextidp)
 {
-       struct idr_layer *p, *pa[MAX_LEVEL];
+       struct idr_layer *p, *pa[MAX_IDR_LEVEL];
        struct idr_layer **paa = &pa[0];
        int id = *nextidp;
        int n, max;
@@ -659,7 +659,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 
        n = (p->layer+1) * IDR_BITS;
 
-       id &= MAX_ID_MASK;
+       id &= MAX_IDR_MASK;
 
        if (id >= (1 << n))
                return ERR_PTR(-EINVAL);
@@ -780,7 +780,7 @@ EXPORT_SYMBOL(ida_pre_get);
  */
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
 {
-       struct idr_layer *pa[MAX_LEVEL];
+       struct idr_layer *pa[MAX_IDR_LEVEL];
        struct ida_bitmap *bitmap;
        unsigned long flags;
        int idr_id = starting_id / IDA_BITMAP_BITS;
@@ -793,7 +793,7 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
        if (t < 0)
                return _idr_rc_to_errno(t);
 
-       if (t * IDA_BITMAP_BITS >= MAX_ID_BIT)
+       if (t * IDA_BITMAP_BITS >= MAX_IDR_BIT)
                return -ENOSPC;
 
        if (t != idr_id)
@@ -827,7 +827,7 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
        }
 
        id = idr_id * IDA_BITMAP_BITS + t;
-       if (id >= MAX_ID_BIT)
+       if (id >= MAX_IDR_BIT)
                return -ENOSPC;
 
        __set_bit(t, bitmap->bitmap);
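
The idr.c hunks above are a pure rename: the generic-sounding limit macros (MAX_LEVEL, MAX_ID_BIT, MAX_ID_MASK, IDR_FREE_MAX) become MAX_IDR_LEVEL, MAX_IDR_BIT, MAX_IDR_MASK and MAX_IDR_FREE so they stop colliding with other subsystems' names; no behaviour changes. For context, a minimal sketch of the allocation pattern these paths serve, using the IDR API of this kernel generation (my_idr, my_ptr and the retry label are placeholders, and the retry loop is a common caller idiom, not code from this patch):

	struct idr my_idr;
	int id, err;

	idr_init(&my_idr);
retry:
	if (!idr_pre_get(&my_idr, GFP_KERNEL))	/* stock the per-idr free list */
		return -ENOMEM;
	err = idr_get_new_above(&my_idr, my_ptr, 1, &id);
	if (err == -EAGAIN)
		goto retry;		/* layers were consumed by a racer; refill */
	if (err)
		return err;		/* e.g. -ENOSPC once ids pass MAX_IDR_BIT */

	WARN_ON(idr_find(&my_idr, id) != my_ptr);
	idr_remove(&my_idr, id);
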
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
new file mode 100644 (file)
index 0000000..e6eb406
--- /dev/null
@@ -0,0 +1,10 @@
+#include <linux/init.h>
+#include <linux/interval_tree.h>
+#include <linux/interval_tree_generic.h>
+
+#define START(node) ((node)->start)
+#define LAST(node)  ((node)->last)
+
+INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
+                    unsigned long, __subtree_last,
+                    START, LAST,, interval_tree)
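
lib/interval_tree.c is just an instantiation: INTERVAL_TREE_DEFINE() from <linux/interval_tree_generic.h> expands into insert/remove/iterator functions for the given node type, keyed through the START()/LAST() accessors, with __subtree_last maintained as the augmented per-subtree maximum. Any subsystem can stamp out its own tree the same way; a sketch over a made-up structure (all my_* names are illustrative):

	struct my_range {
		struct rb_node rb;
		unsigned long lo, hi;		/* closed interval [lo, hi] */
		unsigned long __subtree_last;	/* maintained by the template */
	};

	#define MY_START(n) ((n)->lo)
	#define MY_LAST(n)  ((n)->hi)

	INTERVAL_TREE_DEFINE(struct my_range, rb, unsigned long, __subtree_last,
			     MY_START, MY_LAST, static, my_range_tree)
	/* expands to my_range_tree_insert(), my_range_tree_remove(),
	   my_range_tree_iter_first() and my_range_tree_iter_next() */
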
diff --git a/lib/interval_tree_test_main.c b/lib/interval_tree_test_main.c
new file mode 100644 (file)
index 0000000..b259039
--- /dev/null
@@ -0,0 +1,105 @@
+#include <linux/module.h>
+#include <linux/interval_tree.h>
+#include <linux/random.h>
+#include <asm/timex.h>
+
+#define NODES        100
+#define PERF_LOOPS   100000
+#define SEARCHES     100
+#define SEARCH_LOOPS 10000
+
+static struct rb_root root = RB_ROOT;
+static struct interval_tree_node nodes[NODES];
+static u32 queries[SEARCHES];
+
+static struct rnd_state rnd;
+
+static inline unsigned long
+search(unsigned long query, struct rb_root *root)
+{
+       struct interval_tree_node *node;
+       unsigned long results = 0;
+
+       for (node = interval_tree_iter_first(root, query, query); node;
+            node = interval_tree_iter_next(node, query, query))
+               results++;
+       return results;
+}
+
+static void init(void)
+{
+       int i;
+       for (i = 0; i < NODES; i++) {
+               u32 a = prandom32(&rnd), b = prandom32(&rnd);
+               if (a <= b) {
+                       nodes[i].start = a;
+                       nodes[i].last = b;
+               } else {
+                       nodes[i].start = b;
+                       nodes[i].last = a;
+               }
+       }
+       for (i = 0; i < SEARCHES; i++)
+               queries[i] = prandom32(&rnd);
+}
+
+static int interval_tree_test_init(void)
+{
+       int i, j;
+       unsigned long results;
+       cycles_t time1, time2, time;
+
+       printk(KERN_ALERT "interval tree insert/remove");
+
+       prandom32_seed(&rnd, 3141592653589793238ULL);
+       init();
+
+       time1 = get_cycles();
+
+       for (i = 0; i < PERF_LOOPS; i++) {
+               for (j = 0; j < NODES; j++)
+                       interval_tree_insert(nodes + j, &root);
+               for (j = 0; j < NODES; j++)
+                       interval_tree_remove(nodes + j, &root);
+       }
+
+       time2 = get_cycles();
+       time = time2 - time1;
+
+       time = div_u64(time, PERF_LOOPS);
+       printk(" -> %llu cycles\n", (unsigned long long)time);
+
+       printk(KERN_ALERT "interval tree search");
+
+       for (j = 0; j < NODES; j++)
+               interval_tree_insert(nodes + j, &root);
+
+       time1 = get_cycles();
+
+       results = 0;
+       for (i = 0; i < SEARCH_LOOPS; i++)
+               for (j = 0; j < SEARCHES; j++)
+                       results += search(queries[j], &root);
+
+       time2 = get_cycles();
+       time = time2 - time1;
+
+       time = div_u64(time, SEARCH_LOOPS);
+       results = div_u64(results, SEARCH_LOOPS);
+       printk(" -> %llu cycles (%lu results)\n",
+              (unsigned long long)time, results);
+
+       return -EAGAIN; /* failing init makes the module unload immediately */
+}
+
+static void interval_tree_test_exit(void)
+{
+       printk(KERN_ALERT "test exit\n");
+}
+
+module_init(interval_tree_test_init)
+module_exit(interval_tree_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michel Lespinasse");
+MODULE_DESCRIPTION("Interval Tree test");
diff --git a/lib/parser.c b/lib/parser.c
index c43410084838725c18c3542cdd6e48794116d382..52cfa69f73dfe514b7424f0edf1c236a5e0edb59 100644 (file)
@@ -122,13 +122,14 @@ int match_token(char *s, const match_table_t table, substring_t args[])
  *
  * Description: Given a &substring_t and a base, attempts to parse the substring
  * as a number in that base. On success, sets @result to the integer represented
- * by the string and returns 0. Returns either -ENOMEM or -EINVAL on failure.
+ * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
  */
 static int match_number(substring_t *s, int *result, int base)
 {
        char *endp;
        char *buf;
        int ret;
+       long val;
        size_t len = s->to - s->from;
 
        buf = kmalloc(len + 1, GFP_KERNEL);
@@ -136,10 +137,15 @@ static int match_number(substring_t *s, int *result, int base)
                return -ENOMEM;
        memcpy(buf, s->from, len);
        buf[len] = '\0';
-       *result = simple_strtol(buf, &endp, base);
+
        ret = 0;
+       val = simple_strtol(buf, &endp, base);
        if (endp == buf)
                ret = -EINVAL;
+       else if (val < (long)INT_MIN || val > (long)INT_MAX)
+               ret = -ERANGE;
+       else
+               *result = (int) val;
        kfree(buf);
        return ret;
 }
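
match_number() now parses into a long first and rejects values outside [INT_MIN, INT_MAX] with -ERANGE instead of silently truncating to int. Callers reach it through match_int(), match_octal() and match_hex() when tokenizing option strings; a sketch of the usual consumer (the Opt_size token, the "size=%d" pattern and the p variable are invented for illustration):

	enum { Opt_size, Opt_err };
	static const match_table_t tokens = {
		{ Opt_size, "size=%d" },
		{ Opt_err, NULL }
	};

	substring_t args[MAX_OPT_ARGS];
	int size;

	switch (match_token(p, tokens, args)) {
	case Opt_size:
		if (match_int(&args[0], &size))
			return -EINVAL;	/* now also trips on -ERANGE overflow */
		break;
	}
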
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
deleted file mode 100644 (file)
index 8d443af..0000000
+++ /dev/null
@@ -1,466 +0,0 @@
-/*
- * lib/prio_tree.c - priority search tree
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004   Initial version
- */
-
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-
-/*
- * A clever mix of heap and radix trees forms a radix priority search tree (PST)
- * which is useful for storing intervals, e.g, we can consider a vma as a closed
- * interval of file pages [offset_begin, offset_end], and store all vmas that
- * map a file in a PST. Then, using the PST, we can answer a stabbing query,
- * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a
- * given input interval X (a set of consecutive file pages), in "O(log n + m)"
- * time where 'log n' is the height of the PST, and 'm' is the number of stored
- * intervals (vmas) that overlap (map) with the input interval X (the set of
- * consecutive file pages).
- *
- * In our implementation, we store closed intervals of the form [radix_index,
- * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST
- * is designed for storing intervals with unique radix indices, i.e., each
- * interval have different radix_index. However, this limitation can be easily
- * overcome by using the size, i.e., heap_index - radix_index, as part of the
- * index, so we index the tree using [(radix_index,size), heap_index].
- *
- * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit
- * machine, the maximum height of a PST can be 64. We can use a balanced version
- * of the priority search tree to optimize the tree height, but the balanced
- * tree proposed by McCreight is too complex and memory-hungry for our purpose.
- */
-
-/*
- * The following macros are used for implementing prio_tree for i_mmap
- */
-
-#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
-#define VMA_SIZE(vma)    (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma)          ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-
-static void get_index(const struct prio_tree_root *root,
-    const struct prio_tree_node *node,
-    unsigned long *radix, unsigned long *heap)
-{
-       if (root->raw) {
-               struct vm_area_struct *vma = prio_tree_entry(
-                   node, struct vm_area_struct, shared.prio_tree_node);
-
-               *radix = RADIX_INDEX(vma);
-               *heap = HEAP_INDEX(vma);
-       }
-       else {
-               *radix = node->start;
-               *heap = node->last;
-       }
-}
-
-static unsigned long index_bits_to_maxindex[BITS_PER_LONG];
-
-void __init prio_tree_init(void)
-{
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
-               index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
-       index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
-}
-
-/*
- * Maximum heap_index that can be stored in a PST with index_bits bits
- */
-static inline unsigned long prio_tree_maxindex(unsigned int bits)
-{
-       return index_bits_to_maxindex[bits - 1];
-}
-
-static void prio_set_parent(struct prio_tree_node *parent,
-                           struct prio_tree_node *child, bool left)
-{
-       if (left)
-               parent->left = child;
-       else
-               parent->right = child;
-
-       child->parent = parent;
-}
-
-/*
- * Extend a priority search tree so that it can store a node with heap_index
- * max_heap_index. In the worst case, this algorithm takes O((log n)^2).
- * However, this function is used rarely and the common case performance is
- * not bad.
- */
-static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root,
-               struct prio_tree_node *node, unsigned long max_heap_index)
-{
-       struct prio_tree_node *prev;
-
-       if (max_heap_index > prio_tree_maxindex(root->index_bits))
-               root->index_bits++;
-
-       prev = node;
-       INIT_PRIO_TREE_NODE(node);
-
-       while (max_heap_index > prio_tree_maxindex(root->index_bits)) {
-               struct prio_tree_node *tmp = root->prio_tree_node;
-
-               root->index_bits++;
-
-               if (prio_tree_empty(root))
-                       continue;
-
-               prio_tree_remove(root, root->prio_tree_node);
-               INIT_PRIO_TREE_NODE(tmp);
-
-               prio_set_parent(prev, tmp, true);
-               prev = tmp;
-       }
-
-       if (!prio_tree_empty(root))
-               prio_set_parent(prev, root->prio_tree_node, true);
-
-       root->prio_tree_node = node;
-       return node;
-}
-
-/*
- * Replace a prio_tree_node with a new node and return the old node
- */
-struct prio_tree_node *prio_tree_replace(struct prio_tree_root *root,
-               struct prio_tree_node *old, struct prio_tree_node *node)
-{
-       INIT_PRIO_TREE_NODE(node);
-
-       if (prio_tree_root(old)) {
-               BUG_ON(root->prio_tree_node != old);
-               /*
-                * We can reduce root->index_bits here. However, it is complex
-                * and does not help much to improve performance (IMO).
-                */
-               root->prio_tree_node = node;
-       } else
-               prio_set_parent(old->parent, node, old->parent->left == old);
-
-       if (!prio_tree_left_empty(old))
-               prio_set_parent(node, old->left, true);
-
-       if (!prio_tree_right_empty(old))
-               prio_set_parent(node, old->right, false);
-
-       return old;
-}
-
-/*
- * Insert a prio_tree_node @node into a radix priority search tree @root. The
- * algorithm typically takes O(log n) time where 'log n' is the number of bits
- * required to represent the maximum heap_index. In the worst case, the algo
- * can take O((log n)^2) - check prio_tree_expand.
- *
- * If a prior node with same radix_index and heap_index is already found in
- * the tree, then returns the address of the prior node. Otherwise, inserts
- * @node into the tree and returns @node.
- */
-struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root,
-               struct prio_tree_node *node)
-{
-       struct prio_tree_node *cur, *res = node;
-       unsigned long radix_index, heap_index;
-       unsigned long r_index, h_index, index, mask;
-       int size_flag = 0;
-
-       get_index(root, node, &radix_index, &heap_index);
-
-       if (prio_tree_empty(root) ||
-                       heap_index > prio_tree_maxindex(root->index_bits))
-               return prio_tree_expand(root, node, heap_index);
-
-       cur = root->prio_tree_node;
-       mask = 1UL << (root->index_bits - 1);
-
-       while (mask) {
-               get_index(root, cur, &r_index, &h_index);
-
-               if (r_index == radix_index && h_index == heap_index)
-                       return cur;
-
-                if (h_index < heap_index ||
-                   (h_index == heap_index && r_index > radix_index)) {
-                       struct prio_tree_node *tmp = node;
-                       node = prio_tree_replace(root, cur, node);
-                       cur = tmp;
-                       /* swap indices */
-                       index = r_index;
-                       r_index = radix_index;
-                       radix_index = index;
-                       index = h_index;
-                       h_index = heap_index;
-                       heap_index = index;
-               }
-
-               if (size_flag)
-                       index = heap_index - radix_index;
-               else
-                       index = radix_index;
-
-               if (index & mask) {
-                       if (prio_tree_right_empty(cur)) {
-                               INIT_PRIO_TREE_NODE(node);
-                               prio_set_parent(cur, node, false);
-                               return res;
-                       } else
-                               cur = cur->right;
-               } else {
-                       if (prio_tree_left_empty(cur)) {
-                               INIT_PRIO_TREE_NODE(node);
-                               prio_set_parent(cur, node, true);
-                               return res;
-                       } else
-                               cur = cur->left;
-               }
-
-               mask >>= 1;
-
-               if (!mask) {
-                       mask = 1UL << (BITS_PER_LONG - 1);
-                       size_flag = 1;
-               }
-       }
-       /* Should not reach here */
-       BUG();
-       return NULL;
-}
-
-/*
- * Remove a prio_tree_node @node from a radix priority search tree @root. The
- * algorithm takes O(log n) time where 'log n' is the number of bits required
- * to represent the maximum heap_index.
- */
-void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node)
-{
-       struct prio_tree_node *cur;
-       unsigned long r_index, h_index_right, h_index_left;
-
-       cur = node;
-
-       while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) {
-               if (!prio_tree_left_empty(cur))
-                       get_index(root, cur->left, &r_index, &h_index_left);
-               else {
-                       cur = cur->right;
-                       continue;
-               }
-
-               if (!prio_tree_right_empty(cur))
-                       get_index(root, cur->right, &r_index, &h_index_right);
-               else {
-                       cur = cur->left;
-                       continue;
-               }
-
-               /* both h_index_left and h_index_right cannot be 0 */
-               if (h_index_left >= h_index_right)
-                       cur = cur->left;
-               else
-                       cur = cur->right;
-       }
-
-       if (prio_tree_root(cur)) {
-               BUG_ON(root->prio_tree_node != cur);
-               __INIT_PRIO_TREE_ROOT(root, root->raw);
-               return;
-       }
-
-       if (cur->parent->right == cur)
-               cur->parent->right = cur->parent;
-       else
-               cur->parent->left = cur->parent;
-
-       while (cur != node)
-               cur = prio_tree_replace(root, cur->parent, cur);
-}
-
-static void iter_walk_down(struct prio_tree_iter *iter)
-{
-       iter->mask >>= 1;
-       if (iter->mask) {
-               if (iter->size_level)
-                       iter->size_level++;
-               return;
-       }
-
-       if (iter->size_level) {
-               BUG_ON(!prio_tree_left_empty(iter->cur));
-               BUG_ON(!prio_tree_right_empty(iter->cur));
-               iter->size_level++;
-               iter->mask = ULONG_MAX;
-       } else {
-               iter->size_level = 1;
-               iter->mask = 1UL << (BITS_PER_LONG - 1);
-       }
-}
-
-static void iter_walk_up(struct prio_tree_iter *iter)
-{
-       if (iter->mask == ULONG_MAX)
-               iter->mask = 1UL;
-       else if (iter->size_level == 1)
-               iter->mask = 1UL;
-       else
-               iter->mask <<= 1;
-       if (iter->size_level)
-               iter->size_level--;
-       if (!iter->size_level && (iter->value & iter->mask))
-               iter->value ^= iter->mask;
-}
-
-/*
- * Following functions help to enumerate all prio_tree_nodes in the tree that
- * overlap with the input interval X [radix_index, heap_index]. The enumeration
- * takes O(log n + m) time where 'log n' is the height of the tree (which is
- * proportional to # of bits required to represent the maximum heap_index) and
- * 'm' is the number of prio_tree_nodes that overlap the interval X.
- */
-
-static struct prio_tree_node *prio_tree_left(struct prio_tree_iter *iter,
-               unsigned long *r_index, unsigned long *h_index)
-{
-       if (prio_tree_left_empty(iter->cur))
-               return NULL;
-
-       get_index(iter->root, iter->cur->left, r_index, h_index);
-
-       if (iter->r_index <= *h_index) {
-               iter->cur = iter->cur->left;
-               iter_walk_down(iter);
-               return iter->cur;
-       }
-
-       return NULL;
-}
-
-static struct prio_tree_node *prio_tree_right(struct prio_tree_iter *iter,
-               unsigned long *r_index, unsigned long *h_index)
-{
-       unsigned long value;
-
-       if (prio_tree_right_empty(iter->cur))
-               return NULL;
-
-       if (iter->size_level)
-               value = iter->value;
-       else
-               value = iter->value | iter->mask;
-
-       if (iter->h_index < value)
-               return NULL;
-
-       get_index(iter->root, iter->cur->right, r_index, h_index);
-
-       if (iter->r_index <= *h_index) {
-               iter->cur = iter->cur->right;
-               iter_walk_down(iter);
-               return iter->cur;
-       }
-
-       return NULL;
-}
-
-static struct prio_tree_node *prio_tree_parent(struct prio_tree_iter *iter)
-{
-       iter->cur = iter->cur->parent;
-       iter_walk_up(iter);
-       return iter->cur;
-}
-
-static inline int overlap(struct prio_tree_iter *iter,
-               unsigned long r_index, unsigned long h_index)
-{
-       return iter->h_index >= r_index && iter->r_index <= h_index;
-}
-
-/*
- * prio_tree_first:
- *
- * Get the first prio_tree_node that overlaps with the interval [radix_index,
- * heap_index]. Note that always radix_index <= heap_index. We do a pre-order
- * traversal of the tree.
- */
-static struct prio_tree_node *prio_tree_first(struct prio_tree_iter *iter)
-{
-       struct prio_tree_root *root;
-       unsigned long r_index, h_index;
-
-       INIT_PRIO_TREE_ITER(iter);
-
-       root = iter->root;
-       if (prio_tree_empty(root))
-               return NULL;
-
-       get_index(root, root->prio_tree_node, &r_index, &h_index);
-
-       if (iter->r_index > h_index)
-               return NULL;
-
-       iter->mask = 1UL << (root->index_bits - 1);
-       iter->cur = root->prio_tree_node;
-
-       while (1) {
-               if (overlap(iter, r_index, h_index))
-                       return iter->cur;
-
-               if (prio_tree_left(iter, &r_index, &h_index))
-                       continue;
-
-               if (prio_tree_right(iter, &r_index, &h_index))
-                       continue;
-
-               break;
-       }
-       return NULL;
-}
-
-/*
- * prio_tree_next:
- *
- * Get the next prio_tree_node that overlaps with the input interval in iter
- */
-struct prio_tree_node *prio_tree_next(struct prio_tree_iter *iter)
-{
-       unsigned long r_index, h_index;
-
-       if (iter->cur == NULL)
-               return prio_tree_first(iter);
-
-repeat:
-       while (prio_tree_left(iter, &r_index, &h_index))
-               if (overlap(iter, r_index, h_index))
-                       return iter->cur;
-
-       while (!prio_tree_right(iter, &r_index, &h_index)) {
-               while (!prio_tree_root(iter->cur) &&
-                               iter->cur->parent->right == iter->cur)
-                       prio_tree_parent(iter);
-
-               if (prio_tree_root(iter->cur))
-                       return NULL;
-
-               prio_tree_parent(iter);
-       }
-
-       if (overlap(iter, r_index, h_index))
-               return iter->cur;
-
-       goto repeat;
-}
diff --git a/lib/rbtree.c b/lib/rbtree.c
index d4175565dc2cb55e6de609a5b93ca4e3908b66b4..4f56a11d67fa9da105b34337280277d6fe437b2e 100644 (file)
@@ -2,7 +2,8 @@
   Red Black Trees
   (C) 1999  Andrea Arcangeli <andrea@suse.de>
   (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   linux/lib/rbtree.c
 */
 
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
 #include <linux/export.h>
 
-static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
-{
-       struct rb_node *right = node->rb_right;
-       struct rb_node *parent = rb_parent(node);
-
-       if ((node->rb_right = right->rb_left))
-               rb_set_parent(right->rb_left, node);
-       right->rb_left = node;
-
-       rb_set_parent(right, parent);
+/*
+ * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ */
 
-       if (parent)
-       {
-               if (node == parent->rb_left)
-                       parent->rb_left = right;
-               else
-                       parent->rb_right = right;
-       }
-       else
-               root->rb_node = right;
-       rb_set_parent(node, right);
+static inline void rb_set_black(struct rb_node *rb)
+{
+       rb->__rb_parent_color |= RB_BLACK;
 }
 
-static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
 {
-       struct rb_node *left = node->rb_left;
-       struct rb_node *parent = rb_parent(node);
-
-       if ((node->rb_left = left->rb_right))
-               rb_set_parent(left->rb_right, node);
-       left->rb_right = node;
-
-       rb_set_parent(left, parent);
+       return (struct rb_node *)red->__rb_parent_color;
+}
 
-       if (parent)
-       {
-               if (node == parent->rb_right)
-                       parent->rb_right = left;
-               else
-                       parent->rb_left = left;
-       }
-       else
-               root->rb_node = left;
-       rb_set_parent(node, left);
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */
+static inline void
+__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
+                       struct rb_root *root, int color)
+{
+       struct rb_node *parent = rb_parent(old);
+       new->__rb_parent_color = old->__rb_parent_color;
+       rb_set_parent_color(old, new, color);
+       __rb_change_child(old, new, parent, root);
 }
 
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
+static __always_inline void
+__rb_insert(struct rb_node *node, struct rb_root *root,
+           void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-       struct rb_node *parent, *gparent;
-
-       while ((parent = rb_parent(node)) && rb_is_red(parent))
-       {
-               gparent = rb_parent(parent);
-
-               if (parent == gparent->rb_left)
-               {
-                       {
-                               register struct rb_node *uncle = gparent->rb_right;
-                               if (uncle && rb_is_red(uncle))
-                               {
-                                       rb_set_black(uncle);
-                                       rb_set_black(parent);
-                                       rb_set_red(gparent);
-                                       node = gparent;
-                                       continue;
-                               }
+       struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+
+       while (true) {
+               /*
+                * Loop invariant: node is red
+                *
+                * If there is a black parent, we are done.
+                * Otherwise, take some corrective action as we don't
+                * want a red root or two consecutive red nodes.
+                */
+               if (!parent) {
+                       rb_set_parent_color(node, NULL, RB_BLACK);
+                       break;
+               } else if (rb_is_black(parent))
+                       break;
+
+               gparent = rb_red_parent(parent);
+
+               tmp = gparent->rb_right;
+               if (parent != tmp) {    /* parent == gparent->rb_left */
+                       if (tmp && rb_is_red(tmp)) {
+                               /*
+                                * Case 1 - color flips
+                                *
+                                *       G            g
+                                *      / \          / \
+                                *     p   u  -->   P   U
+                                *    /            /
+                                *   n            N
+                                *
+                                * However, since g's parent might be red, and
+                                * 4) does not allow this, we need to recurse
+                                * at g.
+                                */
+                               rb_set_parent_color(tmp, gparent, RB_BLACK);
+                               rb_set_parent_color(parent, gparent, RB_BLACK);
+                               node = gparent;
+                               parent = rb_parent(node);
+                               rb_set_parent_color(node, parent, RB_RED);
+                               continue;
                        }
 
-                       if (parent->rb_right == node)
-                       {
-                               register struct rb_node *tmp;
-                               __rb_rotate_left(parent, root);
-                               tmp = parent;
+                       tmp = parent->rb_right;
+                       if (node == tmp) {
+                               /*
+                                * Case 2 - left rotate at parent
+                                *
+                                *      G             G
+                                *     / \           / \
+                                *    p   U  -->    n   U
+                                *     \           /
+                                *      n         p
+                                *
+                                * This still leaves us in violation of 4), the
+                                * continuation into Case 3 will fix that.
+                                */
+                               parent->rb_right = tmp = node->rb_left;
+                               node->rb_left = parent;
+                               if (tmp)
+                                       rb_set_parent_color(tmp, parent,
+                                                           RB_BLACK);
+                               rb_set_parent_color(parent, node, RB_RED);
+                               augment_rotate(parent, node);
                                parent = node;
-                               node = tmp;
+                               tmp = node->rb_right;
                        }
 
-                       rb_set_black(parent);
-                       rb_set_red(gparent);
-                       __rb_rotate_right(gparent, root);
+                       /*
+                        * Case 3 - right rotate at gparent
+                        *
+                        *        G           P
+                        *       / \         / \
+                        *      p   U  -->  n   g
+                        *     /                 \
+                        *    n                   U
+                        */
+                       gparent->rb_left = tmp;  /* == parent->rb_right */
+                       parent->rb_right = gparent;
+                       if (tmp)
+                               rb_set_parent_color(tmp, gparent, RB_BLACK);
+                       __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+                       augment_rotate(gparent, parent);
+                       break;
                } else {
-                       {
-                               register struct rb_node *uncle = gparent->rb_left;
-                               if (uncle && rb_is_red(uncle))
-                               {
-                                       rb_set_black(uncle);
-                                       rb_set_black(parent);
-                                       rb_set_red(gparent);
-                                       node = gparent;
-                                       continue;
-                               }
+                       tmp = gparent->rb_left;
+                       if (tmp && rb_is_red(tmp)) {
+                               /* Case 1 - color flips */
+                               rb_set_parent_color(tmp, gparent, RB_BLACK);
+                               rb_set_parent_color(parent, gparent, RB_BLACK);
+                               node = gparent;
+                               parent = rb_parent(node);
+                               rb_set_parent_color(node, parent, RB_RED);
+                               continue;
                        }
 
-                       if (parent->rb_left == node)
-                       {
-                               register struct rb_node *tmp;
-                               __rb_rotate_right(parent, root);
-                               tmp = parent;
+                       tmp = parent->rb_left;
+                       if (node == tmp) {
+                               /* Case 2 - right rotate at parent */
+                               parent->rb_left = tmp = node->rb_right;
+                               node->rb_right = parent;
+                               if (tmp)
+                                       rb_set_parent_color(tmp, parent,
+                                                           RB_BLACK);
+                               rb_set_parent_color(parent, node, RB_RED);
+                               augment_rotate(parent, node);
                                parent = node;
-                               node = tmp;
+                               tmp = node->rb_left;
                        }
 
-                       rb_set_black(parent);
-                       rb_set_red(gparent);
-                       __rb_rotate_left(gparent, root);
+                       /* Case 3 - left rotate at gparent */
+                       gparent->rb_right = tmp;  /* == parent->rb_left */
+                       parent->rb_left = gparent;
+                       if (tmp)
+                               rb_set_parent_color(tmp, gparent, RB_BLACK);
+                       __rb_rotate_set_parents(gparent, parent, root, RB_RED);
+                       augment_rotate(gparent, parent);
+                       break;
                }
        }
-
-       rb_set_black(root->rb_node);
 }
-EXPORT_SYMBOL(rb_insert_color);
 
-static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
-                            struct rb_root *root)
+__always_inline void
+__rb_erase_color(struct rb_node *parent, struct rb_root *root,
+       void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-       struct rb_node *other;
-
-       while ((!node || rb_is_black(node)) && node != root->rb_node)
-       {
-               if (parent->rb_left == node)
-               {
-                       other = parent->rb_right;
-                       if (rb_is_red(other))
-                       {
-                               rb_set_black(other);
-                               rb_set_red(parent);
-                               __rb_rotate_left(parent, root);
-                               other = parent->rb_right;
+       struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
+
+       while (true) {
+               /*
+                * Loop invariants:
+                * - node is black (or NULL on first iteration)
+                * - node is not the root (parent is not NULL)
+                * - All leaf paths going through parent and node have a
+                *   black node count that is 1 lower than other leaf paths.
+                */
+               sibling = parent->rb_right;
+               if (node != sibling) {  /* node == parent->rb_left */
+                       if (rb_is_red(sibling)) {
+                               /*
+                                * Case 1 - left rotate at parent
+                                *
+                                *     P               S
+                                *    / \             / \
+                                *   N   s    -->    p   Sr
+                                *      / \         / \
+                                *     Sl  Sr      N   Sl
+                                */
+                               parent->rb_right = tmp1 = sibling->rb_left;
+                               sibling->rb_left = parent;
+                               rb_set_parent_color(tmp1, parent, RB_BLACK);
+                               __rb_rotate_set_parents(parent, sibling, root,
+                                                       RB_RED);
+                               augment_rotate(parent, sibling);
+                               sibling = tmp1;
                        }
-                       if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-                           (!other->rb_right || rb_is_black(other->rb_right)))
-                       {
-                               rb_set_red(other);
-                               node = parent;
-                               parent = rb_parent(node);
-                       }
-                       else
-                       {
-                               if (!other->rb_right || rb_is_black(other->rb_right))
-                               {
-                                       rb_set_black(other->rb_left);
-                                       rb_set_red(other);
-                                       __rb_rotate_right(other, root);
-                                       other = parent->rb_right;
+                       tmp1 = sibling->rb_right;
+                       if (!tmp1 || rb_is_black(tmp1)) {
+                               tmp2 = sibling->rb_left;
+                               if (!tmp2 || rb_is_black(tmp2)) {
+                                       /*
+                                        * Case 2 - sibling color flip
+                                        * (p could be either color here)
+                                        *
+                                        *    (p)           (p)
+                                        *    / \           / \
+                                        *   N   S    -->  N   s
+                                        *      / \           / \
+                                        *     Sl  Sr        Sl  Sr
+                                        *
+                                        * This leaves us violating 5) which
+                                        * can be fixed by flipping p to black
+                                        * if it was red, or by recursing at p.
+                                        * p is red when coming from Case 1.
+                                        */
+                                       rb_set_parent_color(sibling, parent,
+                                                           RB_RED);
+                                       if (rb_is_red(parent))
+                                               rb_set_black(parent);
+                                       else {
+                                               node = parent;
+                                               parent = rb_parent(node);
+                                               if (parent)
+                                                       continue;
+                                       }
+                                       break;
                                }
-                               rb_set_color(other, rb_color(parent));
-                               rb_set_black(parent);
-                               rb_set_black(other->rb_right);
-                               __rb_rotate_left(parent, root);
-                               node = root->rb_node;
-                               break;
-                       }
-               }
-               else
-               {
-                       other = parent->rb_left;
-                       if (rb_is_red(other))
-                       {
-                               rb_set_black(other);
-                               rb_set_red(parent);
-                               __rb_rotate_right(parent, root);
-                               other = parent->rb_left;
+                               /*
+                                * Case 3 - right rotate at sibling
+                                * (p could be either color here)
+                                *
+                                *   (p)           (p)
+                                *   / \           / \
+                                *  N   S    -->  N   Sl
+                                *     / \             \
+                                *    sl  Sr            s
+                                *                       \
+                                *                        Sr
+                                */
+                               sibling->rb_left = tmp1 = tmp2->rb_right;
+                               tmp2->rb_right = sibling;
+                               parent->rb_right = tmp2;
+                               if (tmp1)
+                                       rb_set_parent_color(tmp1, sibling,
+                                                           RB_BLACK);
+                               augment_rotate(sibling, tmp2);
+                               tmp1 = sibling;
+                               sibling = tmp2;
                        }
-                       if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-                           (!other->rb_right || rb_is_black(other->rb_right)))
-                       {
-                               rb_set_red(other);
-                               node = parent;
-                               parent = rb_parent(node);
+                       /*
+                        * Case 4 - left rotate at parent + color flips
+                        * (p and sl could be either color here.
+                        *  After rotation, p becomes black, s acquires
+                        *  p's color, and sl keeps its color)
+                        *
+                        *      (p)             (s)
+                        *      / \             / \
+                        *     N   S     -->   P   Sr
+                        *        / \         / \
+                        *      (sl) sr      N  (sl)
+                        */
+                       parent->rb_right = tmp2 = sibling->rb_left;
+                       sibling->rb_left = parent;
+                       rb_set_parent_color(tmp1, sibling, RB_BLACK);
+                       if (tmp2)
+                               rb_set_parent(tmp2, parent);
+                       __rb_rotate_set_parents(parent, sibling, root,
+                                               RB_BLACK);
+                       augment_rotate(parent, sibling);
+                       break;
+               } else {
+                       sibling = parent->rb_left;
+                       if (rb_is_red(sibling)) {
+                               /* Case 1 - right rotate at parent */
+                               parent->rb_left = tmp1 = sibling->rb_right;
+                               sibling->rb_right = parent;
+                               rb_set_parent_color(tmp1, parent, RB_BLACK);
+                               __rb_rotate_set_parents(parent, sibling, root,
+                                                       RB_RED);
+                               augment_rotate(parent, sibling);
+                               sibling = tmp1;
                        }
-                       else
-                       {
-                               if (!other->rb_left || rb_is_black(other->rb_left))
-                               {
-                                       rb_set_black(other->rb_right);
-                                       rb_set_red(other);
-                                       __rb_rotate_left(other, root);
-                                       other = parent->rb_left;
+                       tmp1 = sibling->rb_left;
+                       if (!tmp1 || rb_is_black(tmp1)) {
+                               tmp2 = sibling->rb_right;
+                               if (!tmp2 || rb_is_black(tmp2)) {
+                                       /* Case 2 - sibling color flip */
+                                       rb_set_parent_color(sibling, parent,
+                                                           RB_RED);
+                                       if (rb_is_red(parent))
+                                               rb_set_black(parent);
+                                       else {
+                                               node = parent;
+                                               parent = rb_parent(node);
+                                               if (parent)
+                                                       continue;
+                                       }
+                                       break;
                                }
-                               rb_set_color(other, rb_color(parent));
-                               rb_set_black(parent);
-                               rb_set_black(other->rb_left);
-                               __rb_rotate_right(parent, root);
-                               node = root->rb_node;
-                               break;
+                               /* Case 3 - left rotate at sibling */
+                               sibling->rb_right = tmp1 = tmp2->rb_left;
+                               tmp2->rb_left = sibling;
+                               parent->rb_left = tmp2;
+                               if (tmp1)
+                                       rb_set_parent_color(tmp1, sibling,
+                                                           RB_BLACK);
+                               augment_rotate(sibling, tmp2);
+                               tmp1 = sibling;
+                               sibling = tmp2;
                        }
+                       /* Case 4 - right rotate at parent + color flips */
+                       parent->rb_left = tmp2 = sibling->rb_right;
+                       sibling->rb_right = parent;
+                       rb_set_parent_color(tmp1, sibling, RB_BLACK);
+                       if (tmp2)
+                               rb_set_parent(tmp2, parent);
+                       __rb_rotate_set_parents(parent, sibling, root,
+                                               RB_BLACK);
+                       augment_rotate(parent, sibling);
+                       break;
                }
        }
-       if (node)
-               rb_set_black(node);
 }
+EXPORT_SYMBOL(__rb_erase_color);
 
-void rb_erase(struct rb_node *node, struct rb_root *root)
-{
-       struct rb_node *child, *parent;
-       int color;
-
-       if (!node->rb_left)
-               child = node->rb_right;
-       else if (!node->rb_right)
-               child = node->rb_left;
-       else
-       {
-               struct rb_node *old = node, *left;
-
-               node = node->rb_right;
-               while ((left = node->rb_left) != NULL)
-                       node = left;
-
-               if (rb_parent(old)) {
-                       if (rb_parent(old)->rb_left == old)
-                               rb_parent(old)->rb_left = node;
-                       else
-                               rb_parent(old)->rb_right = node;
-               } else
-                       root->rb_node = node;
-
-               child = node->rb_right;
-               parent = rb_parent(node);
-               color = rb_color(node);
-
-               if (parent == old) {
-                       parent = node;
-               } else {
-                       if (child)
-                               rb_set_parent(child, parent);
-                       parent->rb_left = child;
-
-                       node->rb_right = old->rb_right;
-                       rb_set_parent(old->rb_right, node);
-               }
-
-               node->rb_parent_color = old->rb_parent_color;
-               node->rb_left = old->rb_left;
-               rb_set_parent(old->rb_left, node);
+/*
+ * Non-augmented rbtree manipulation functions.
+ *
+ * We use dummy augmented callbacks here, and have the compiler optimize them
+ * out of the rb_insert_color() and rb_erase() function definitions.
+ */
 
-               goto color;
-       }
+static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
+static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
+static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
 
-       parent = rb_parent(node);
-       color = rb_color(node);
-
-       if (child)
-               rb_set_parent(child, parent);
-       if (parent)
-       {
-               if (parent->rb_left == node)
-                       parent->rb_left = child;
-               else
-                       parent->rb_right = child;
-       }
-       else
-               root->rb_node = child;
+static const struct rb_augment_callbacks dummy_callbacks = {
+       dummy_propagate, dummy_copy, dummy_rotate
+};
 
- color:
-       if (color == RB_BLACK)
-               __rb_erase_color(child, parent, root);
-}
-EXPORT_SYMBOL(rb_erase);
-
-static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data)
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-       struct rb_node *parent;
-
-up:
-       func(node, data);
-       parent = rb_parent(node);
-       if (!parent)
-               return;
-
-       if (node == parent->rb_left && parent->rb_right)
-               func(parent->rb_right, data);
-       else if (parent->rb_left)
-               func(parent->rb_left, data);
-
-       node = parent;
-       goto up;
+       __rb_insert(node, root, dummy_rotate);
 }
+EXPORT_SYMBOL(rb_insert_color);
 
-/*
- * after inserting @node into the tree, update the tree to account for
- * both the new entry and any damage done by rebalance
- */
-void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data)
+void rb_erase(struct rb_node *node, struct rb_root *root)
 {
-       if (node->rb_left)
-               node = node->rb_left;
-       else if (node->rb_right)
-               node = node->rb_right;
-
-       rb_augment_path(node, func, data);
+       rb_erase_augmented(node, root, &dummy_callbacks);
 }
-EXPORT_SYMBOL(rb_augment_insert);
+EXPORT_SYMBOL(rb_erase);
 
 /*
- * before removing the node, find the deepest node on the rebalance path
- * that will still be there after @node gets removed
+ * Augmented rbtree manipulation functions.
+ *
+ * This instantiates the same __always_inline functions as in the non-augmented
+ * case, but this time with user-defined callbacks.
  */
-struct rb_node *rb_augment_erase_begin(struct rb_node *node)
-{
-       struct rb_node *deepest;
-
-       if (!node->rb_right && !node->rb_left)
-               deepest = rb_parent(node);
-       else if (!node->rb_right)
-               deepest = node->rb_left;
-       else if (!node->rb_left)
-               deepest = node->rb_right;
-       else {
-               deepest = rb_next(node);
-               if (deepest->rb_right)
-                       deepest = deepest->rb_right;
-               else if (rb_parent(deepest) != node)
-                       deepest = rb_parent(deepest);
-       }
-
-       return deepest;
-}
-EXPORT_SYMBOL(rb_augment_erase_begin);
 
-/*
- * after removal, update the tree to account for the removed entry
- * and any rebalance damage.
- */
-void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data)
+void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+       void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-       if (node)
-               rb_augment_path(node, func, data);
+       __rb_insert(node, root, augment_rotate);
 }
-EXPORT_SYMBOL(rb_augment_erase_end);
+EXPORT_SYMBOL(__rb_insert_augmented);
 
 /*
  * This function returns the first node (in sort order) of the tree.
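
Both halves of the augmented API surface in this hunk: __rb_insert_augmented() above, and __rb_erase_color(), which <linux/rbtree_augmented.h> invokes from rb_erase_augmented() once the node is unlinked and a parent needs rebalancing. Users plug in three callbacks, propagate, copy and rotate, bundled in a struct rb_augment_callbacks and usually generated by RB_DECLARE_CALLBACKS() as lib/rbtree_test.c demonstrates below. Wired up by hand the calls look roughly like this (all my_* names are placeholders; a sketch, not code from this patch):

	static const struct rb_augment_callbacks my_callbacks = {
		.propagate = my_propagate, /* refresh stale subtree values up the path */
		.copy      = my_copy,      /* a node was replaced during erase */
		.rotate    = my_rotate     /* fix up the two nodes of one rotation */
	};

	rb_link_node(&node->rb, parent, link);
	rb_insert_augmented(&node->rb, &root, &my_callbacks);
	...
	rb_erase_augmented(&node->rb, &root, &my_callbacks);
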
@@ -387,11 +431,13 @@ struct rb_node *rb_next(const struct rb_node *node)
 {
        struct rb_node *parent;
 
-       if (rb_parent(node) == node)
+       if (RB_EMPTY_NODE(node))
                return NULL;
 
-       /* If we have a right-hand child, go down and then left as far
-          as we can. */
+       /*
+        * If we have a right-hand child, go down and then left as far
+        * as we can.
+        */
        if (node->rb_right) {
                node = node->rb_right; 
                while (node->rb_left)
@@ -399,12 +445,13 @@ struct rb_node *rb_next(const struct rb_node *node)
                return (struct rb_node *)node;
        }
 
-       /* No right-hand children.  Everything down and left is
-          smaller than us, so any 'next' node must be in the general
-          direction of our parent. Go up the tree; any time the
-          ancestor is a right-hand child of its parent, keep going
-          up. First time it's a left-hand child of its parent, said
-          parent is our 'next' node. */
+       /*
+        * No right-hand children. Everything down and left is smaller than us,
+        * so any 'next' node must be in the general direction of our parent.
+        * Go up the tree; any time the ancestor is a right-hand child of its
+        * parent, keep going up. First time it's a left-hand child of its
+        * parent, said parent is our 'next' node.
+        */
        while ((parent = rb_parent(node)) && node == parent->rb_right)
                node = parent;
 
@@ -416,11 +463,13 @@ struct rb_node *rb_prev(const struct rb_node *node)
 {
        struct rb_node *parent;
 
-       if (rb_parent(node) == node)
+       if (RB_EMPTY_NODE(node))
                return NULL;
 
-       /* If we have a left-hand child, go down and then right as far
-          as we can. */
+       /*
+        * If we have a left-hand child, go down and then right as far
+        * as we can.
+        */
        if (node->rb_left) {
                node = node->rb_left; 
                while (node->rb_right)
@@ -428,8 +477,10 @@ struct rb_node *rb_prev(const struct rb_node *node)
                return (struct rb_node *)node;
        }
 
-       /* No left-hand children. Go up till we find an ancestor which
-          is a right-hand child of its parent */
+       /*
+        * No left-hand children. Go up till we find an ancestor which
+        * is a right-hand child of its parent.
+        */
        while ((parent = rb_parent(node)) && node == parent->rb_left)
                node = parent;
 
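
rb_next() and rb_prev() now detect unlinked nodes with RB_EMPTY_NODE(), a node whose parent pointer aims back at itself, instead of the open-coded rb_parent(node) == node test. The traversal idiom built on them is unchanged; for reference (a sketch):

	struct rb_node *rb;

	for (rb = rb_first(&root); rb; rb = rb_next(rb))
		/* visit in ascending key order */;
	for (rb = rb_last(&root); rb; rb = rb_prev(rb))
		/* visit in descending key order */;
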
@@ -443,14 +494,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
        struct rb_node *parent = rb_parent(victim);
 
        /* Set the surrounding nodes to point to the replacement */
-       if (parent) {
-               if (victim == parent->rb_left)
-                       parent->rb_left = new;
-               else
-                       parent->rb_right = new;
-       } else {
-               root->rb_node = new;
-       }
+       __rb_change_child(victim, new, parent, root);
        if (victim->rb_left)
                rb_set_parent(victim->rb_left, new);
        if (victim->rb_right)
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
new file mode 100644 (file)
index 0000000..268b239
--- /dev/null
@@ -0,0 +1,234 @@
+#include <linux/module.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/random.h>
+#include <asm/timex.h>
+
+#define NODES       100
+#define PERF_LOOPS  100000
+#define CHECK_LOOPS 100
+
+struct test_node {
+       struct rb_node rb;
+       u32 key;
+
+       /* following fields used for testing augmented rbtree functionality */
+       u32 val;
+       u32 augmented;
+};
+
+static struct rb_root root = RB_ROOT;
+static struct test_node nodes[NODES];
+
+static struct rnd_state rnd;
+
+static void insert(struct test_node *node, struct rb_root *root)
+{
+       struct rb_node **new = &root->rb_node, *parent = NULL;
+       u32 key = node->key;
+
+       while (*new) {
+               parent = *new;
+               if (key < rb_entry(parent, struct test_node, rb)->key)
+                       new = &parent->rb_left;
+               else
+                       new = &parent->rb_right;
+       }
+
+       rb_link_node(&node->rb, parent, new);
+       rb_insert_color(&node->rb, root);
+}
+
+static inline void erase(struct test_node *node, struct rb_root *root)
+{
+       rb_erase(&node->rb, root);
+}
+
+static inline u32 augment_recompute(struct test_node *node)
+{
+       u32 max = node->val, child_augmented;
+       if (node->rb.rb_left) {
+               child_augmented = rb_entry(node->rb.rb_left, struct test_node,
+                                          rb)->augmented;
+               if (max < child_augmented)
+                       max = child_augmented;
+       }
+       if (node->rb.rb_right) {
+               child_augmented = rb_entry(node->rb.rb_right, struct test_node,
+                                          rb)->augmented;
+               if (max < child_augmented)
+                       max = child_augmented;
+       }
+       return max;
+}
+
+RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
+                    u32, augmented, augment_recompute)
+
+static void insert_augmented(struct test_node *node, struct rb_root *root)
+{
+       struct rb_node **new = &root->rb_node, *rb_parent = NULL;
+       u32 key = node->key;
+       u32 val = node->val;
+       struct test_node *parent;
+
+       while (*new) {
+               rb_parent = *new;
+               parent = rb_entry(rb_parent, struct test_node, rb);
+               if (parent->augmented < val)
+                       parent->augmented = val;
+               if (key < parent->key)
+                       new = &parent->rb.rb_left;
+               else
+                       new = &parent->rb.rb_right;
+       }
+
+       node->augmented = val;
+       rb_link_node(&node->rb, rb_parent, new);
+       rb_insert_augmented(&node->rb, root, &augment_callbacks);
+}
+
+static void erase_augmented(struct test_node *node, struct rb_root *root)
+{
+       rb_erase_augmented(&node->rb, root, &augment_callbacks);
+}
+
+static void init(void)
+{
+       int i;
+       for (i = 0; i < NODES; i++) {
+               nodes[i].key = prandom32(&rnd);
+               nodes[i].val = prandom32(&rnd);
+       }
+}
+
+static bool is_red(struct rb_node *rb)
+{
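+       /* rbtree stores the node color in the low bit of __rb_parent_color: 0 = red, 1 = black */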
+       return !(rb->__rb_parent_color & 1);
+}
+
+static int black_path_count(struct rb_node *rb)
+{
+       int count;
+       for (count = 0; rb; rb = rb_parent(rb))
+               count += !is_red(rb);
+       return count;
+}
+
+static void check(int nr_nodes)
+{
+       struct rb_node *rb;
+       int count = 0;
+       int blacks;
+       u32 prev_key = 0;
+
+       for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+               struct test_node *node = rb_entry(rb, struct test_node, rb);
+               WARN_ON_ONCE(node->key < prev_key);
+               WARN_ON_ONCE(is_red(rb) &&
+                            (!rb_parent(rb) || is_red(rb_parent(rb))));
+               if (!count)
+                       blacks = black_path_count(rb);
+               else
+                       WARN_ON_ONCE((!rb->rb_left || !rb->rb_right) &&
+                                    blacks != black_path_count(rb));
+               prev_key = node->key;
+               count++;
+       }
+       WARN_ON_ONCE(count != nr_nodes);
+}
+
+static void check_augmented(int nr_nodes)
+{
+       struct rb_node *rb;
+
+       check(nr_nodes);
+       for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+               struct test_node *node = rb_entry(rb, struct test_node, rb);
+               WARN_ON_ONCE(node->augmented != augment_recompute(node));
+       }
+}
+
+static int rbtree_test_init(void)
+{
+       int i, j;
+       cycles_t time1, time2, time;
+
+       printk(KERN_ALERT "rbtree testing");
+
+       prandom32_seed(&rnd, 3141592653589793238ULL);
+       init();
+
+       time1 = get_cycles();
+
+       for (i = 0; i < PERF_LOOPS; i++) {
+               for (j = 0; j < NODES; j++)
+                       insert(nodes + j, &root);
+               for (j = 0; j < NODES; j++)
+                       erase(nodes + j, &root);
+       }
+
+       time2 = get_cycles();
+       time = time2 - time1;
+
+       time = div_u64(time, PERF_LOOPS);
+       printk(" -> %llu cycles\n", (unsigned long long)time);
+
+       for (i = 0; i < CHECK_LOOPS; i++) {
+               init();
+               for (j = 0; j < NODES; j++) {
+                       check(j);
+                       insert(nodes + j, &root);
+               }
+               for (j = 0; j < NODES; j++) {
+                       check(NODES - j);
+                       erase(nodes + j, &root);
+               }
+               check(0);
+       }
+
+       printk(KERN_ALERT "augmented rbtree testing");
+
+       init();
+
+       time1 = get_cycles();
+
+       for (i = 0; i < PERF_LOOPS; i++) {
+               for (j = 0; j < NODES; j++)
+                       insert_augmented(nodes + j, &root);
+               for (j = 0; j < NODES; j++)
+                       erase_augmented(nodes + j, &root);
+       }
+
+       time2 = get_cycles();
+       time = time2 - time1;
+
+       time = div_u64(time, PERF_LOOPS);
+       printk(" -> %llu cycles\n", (unsigned long long)time);
+
+       for (i = 0; i < CHECK_LOOPS; i++) {
+               init();
+               for (j = 0; j < NODES; j++) {
+                       check_augmented(j);
+                       insert_augmented(nodes + j, &root);
+               }
+               for (j = 0; j < NODES; j++) {
+                       check_augmented(NODES - j);
+                       erase_augmented(nodes + j, &root);
+               }
+               check_augmented(0);
+       }
+
+       return -EAGAIN; /* Failing the init call unloads the module immediately */
+}
+
+static void rbtree_test_exit(void)
+{
+       printk(KERN_ALERT "test exit\n");
+}
+
+module_init(rbtree_test_init)
+module_exit(rbtree_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michel Lespinasse");
+MODULE_DESCRIPTION("Red Black Tree test");
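
RB_DECLARE_CALLBACKS() above generates the propagate/copy/rotate callbacks
from augment_recompute(); its propagate step is roughly equivalent to this
sketch (macro expansion paraphrased, not verbatim):

static void augment_callbacks_propagate(struct rb_node *rb, struct rb_node *stop)
{
        while (rb != stop) {
                struct test_node *node = rb_entry(rb, struct test_node, rb);
                u32 augmented = augment_recompute(node);
                if (node->augmented == augmented)
                        break;  /* values above this point are already correct */
                node->augmented = augmented;
                rb = rb_parent(&node->rb);
        }
}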
index 0e337541f005d8f29a36434dae555b7477d12164..d4cce101083c4bc11a5ea5d8df7576c042597990 100644 (file)
@@ -174,35 +174,27 @@ char *put_dec_trunc8(char *buf, unsigned r)
        unsigned q;
 
        /* Copy of previous function's body with added early returns */
-       q      = (r * (uint64_t)0x1999999a) >> 32;
-       *buf++ = (r - 10 * q) + '0'; /* 2 */
-       if (q == 0)
-               return buf;
-       r      = (q * (uint64_t)0x1999999a) >> 32;
-       *buf++ = (q - 10 * r) + '0'; /* 3 */
-       if (r == 0)
-               return buf;
-       q      = (r * (uint64_t)0x1999999a) >> 32;
-       *buf++ = (r - 10 * q) + '0'; /* 4 */
-       if (q == 0)
-               return buf;
-       r      = (q * (uint64_t)0x1999999a) >> 32;
-       *buf++ = (q - 10 * r) + '0'; /* 5 */
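+       /* 0x1999999a is ceil(2^32 / 10), so (r * 0x1999999a) >> 32 is r / 10 here */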
+       while (r >= 10000) {
+               q = r + '0';
+               r  = (r * (uint64_t)0x1999999a) >> 32;
+               *buf++ = q - 10*r;
+       }
+
        if (r == 0)
                return buf;
-       q      = (r * 0x199a) >> 16;
-       *buf++ = (r - 10 * q)  + '0'; /* 6 */
+       q      = (r * 0x199a) >> 16;    /* r <= 9999 */
+       *buf++ = (r - 10 * q)  + '0';
        if (q == 0)
                return buf;
-       r      = (q * 0xcd) >> 11;
-       *buf++ = (q - 10 * r)  + '0'; /* 7 */
+       r      = (q * 0xcd) >> 11;      /* q <= 999 */
+       *buf++ = (q - 10 * r)  + '0';
        if (r == 0)
                return buf;
-       q      = (r * 0xcd) >> 11;
-       *buf++ = (r - 10 * q) + '0'; /* 8 */
+       q      = (r * 0xcd) >> 11;      /* r <= 99 */
+       *buf++ = (r - 10 * q) + '0';
        if (q == 0)
                return buf;
-       *buf++ = q + '0'; /* 9 */
+       *buf++ = q + '0';               /* q <= 9 */
        return buf;
 }
 
@@ -243,18 +235,34 @@ char *put_dec(char *buf, unsigned long long n)
 
 /* Second algorithm: valid only for 64-bit long longs */
 
+/* See comment in put_dec_full9 for choice of constants */
 static noinline_for_stack
-char *put_dec_full4(char *buf, unsigned q)
+void put_dec_full4(char *buf, unsigned q)
 {
        unsigned r;
-       r      = (q * 0xcccd) >> 19;
-       *buf++ = (q - 10 * r) + '0';
-       q      = (r * 0x199a) >> 16;
-       *buf++ = (r - 10 * q)  + '0';
+       r      = (q * 0xccd) >> 15;
+       buf[0] = (q - 10 * r) + '0';
+       q      = (r * 0xcd) >> 11;
+       buf[1] = (r - 10 * q)  + '0';
        r      = (q * 0xcd) >> 11;
-       *buf++ = (q - 10 * r)  + '0';
-       *buf++ = r + '0';
-       return buf;
+       buf[2] = (q - 10 * r)  + '0';
+       buf[3] = r + '0';
+}
+
+/*
+ * Call put_dec_full4 on x % 10000, return x / 10000.
+ * The approximation x/10000 == (x * 0x346DC5D7) >> 43
+ * holds for all x < 1,128,869,999.  The largest value this
+ * helper will ever be asked to convert is 1,125,520,955.
+ * (d1 in the put_dec code, assuming n is all-ones).
+ */
+static
+unsigned put_dec_helper4(char *buf, unsigned x)
+{
+       uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
+
+       put_dec_full4(buf, x - q * 10000);
+       return q;
 }
 
 /* Based on code by Douglas W. Jones found at
@@ -276,28 +284,19 @@ char *put_dec(char *buf, unsigned long long n)
        d3  = (h >> 16); /* implicit "& 0xffff" */
 
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
+       q = put_dec_helper4(buf, q);
+
+       q += 7671 * d3 + 9496 * d2 + 6 * d1;
+       q = put_dec_helper4(buf+4, q);
+
+       q += 4749 * d3 + 42 * d2;
+       q = put_dec_helper4(buf+8, q);
 
-       buf = put_dec_full4(buf, q % 10000);
-       q   = q / 10000;
-
-       d1  = q + 7671 * d3 + 9496 * d2 + 6 * d1;
-       buf = put_dec_full4(buf, d1 % 10000);
-       q   = d1 / 10000;
-
-       d2  = q + 4749 * d3 + 42 * d2;
-       buf = put_dec_full4(buf, d2 % 10000);
-       q   = d2 / 10000;
-
-       d3  = q + 281 * d3;
-       if (!d3)
-               goto done;
-       buf = put_dec_full4(buf, d3 % 10000);
-       q   = d3 / 10000;
-       if (!q)
-               goto done;
-       buf = put_dec_full4(buf, q);
- done:
-       while (buf[-1] == '0')
+       q += 281 * d3;
+       buf += 12;
+       if (q)
+               buf = put_dec_trunc8(buf, q);
+       else while (buf[-1] == '0')
                --buf;
 
        return buf;
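
The coefficients above come from writing n = d3*2^48 + d2*2^32 + d1*2^16 + d0
and splitting each power of two into base-10000 groups: 2^16 = 6*10^4 + 5536,
2^32 = 429496*10^4 + 7296 and 2^48 = 28147497671*10^4 + 656, which yields the
5536/7296/656 terms of the lowest group; the quotients then feed the higher
groups (6*d1, 9496*d2, 7671*d3; then 42*d2, 4749*d3; finally 281*d3), exactly
as accumulated into q above.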
@@ -990,7 +989,7 @@ int kptr_restrict __read_mostly;
  * - 'm' For a 6-byte MAC address, it prints the hex address without colons
  * - 'MF' For a 6-byte MAC FDDI address, it prints the address
  *       with a dash-separated hex notation
- * - '[mM]R For a 6-byte MAC address, Reverse order (Bluetooth)
+ * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
  * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
  *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
  *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
@@ -1341,7 +1340,10 @@ qualifier:
  * %pR output the address range in a struct resource with decoded flags
  * %pr output the address range in a struct resource with raw flags
  * %pM output a 6-byte MAC address with colons
+ * %pMR output a 6-byte MAC address with colons in reversed order
+ * %pMF output a 6-byte MAC address with dashes
  * %pm output a 6-byte MAC address without colons
+ * %pmR output a 6-byte MAC address without colons in reversed order
  * %pI4 print an IPv4 address without leading zeros
  * %pi4 print an IPv4 address with leading zeros
  * %pI6 print an IPv6 address with colons
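
As a quick illustration of the MAC specifiers documented above (a sketch;
mac is assumed to be a plain 6-byte array):

        u8 mac[6] = { 0x00, 0x16, 0x3e, 0x12, 0x34, 0x56 };

        printk(KERN_INFO "%pM\n",  mac);        /* 00:16:3e:12:34:56 */
        printk(KERN_INFO "%pMR\n", mac);        /* 56:34:12:3e:16:00 (Bluetooth) */
        printk(KERN_INFO "%pMF\n", mac);        /* 00-16-3e-12-34-56 (FDDI) */
        printk(KERN_INFO "%pm\n",  mac);        /* 00163e123456 */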
index d5c8019c662793886224b872cc494e1ae3d00842..3322342a1ffb0796708e2045571472235e502927 100644 (file)
@@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
        bool "Transparent Hugepage Support"
-       depends on X86 && MMU
+       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select COMPACTION
        help
          Transparent Hugepages allows the kernel to use huge pages and
index 92753e2d82dac41fe9a8c2f4633189ad723bbd07..6b025f80af34c50eb10a424a085a7043156d4370 100644 (file)
@@ -14,9 +14,9 @@ endif
 obj-y                  := filemap.o mempool.o oom_kill.o fadvise.o \
                           maccess.o page_alloc.o page-writeback.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
-                          prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+                          util.o mmzone.o vmstat.o backing-dev.o \
                           mm_init.o mmu_context.o percpu.o slab_common.o \
-                          compaction.o $(mmu-y)
+                          compaction.o interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
index 7fcd3a52e68d4b2a9bfc07c1056b7db3a2b154b1..3bb723201198d0507006feb89e0376c449908b13 100644 (file)
@@ -70,14 +70,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 
                /* async aborts if taking too long or contended */
                if (!cc->sync) {
-                       if (cc->contended)
-                               *cc->contended = true;
+                       cc->contended = true;
                        return false;
                }
 
                cond_resched();
-               if (fatal_signal_pending(current))
-                       return false;
        }
 
        if (!locked)
@@ -91,6 +88,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
        return compact_checklock_irqsave(lock, flags, false, cc);
 }
 
+static void compact_capture_page(struct compact_control *cc)
+{
+       unsigned long flags;
+       int mtype, mtype_low, mtype_high;
+
+       if (!cc->page || *cc->page)
+               return;
+
+       /*
+        * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+        * regardless of the migratetype of the freelist it is captured from.
+        * This is fine because the order for a high-order MIGRATE_MOVABLE
+        * allocation is typically at least a pageblock size and overall
+        * fragmentation is not impaired. Other allocation types must
+        * capture pages from their own migratelist, because otherwise they
+        * could pollute other pageblocks such as MIGRATE_MOVABLE with
+        * difficult-to-move pages, making fragmentation worse overall.
+        */
+       if (cc->migratetype == MIGRATE_MOVABLE) {
+               mtype_low = 0;
+               mtype_high = MIGRATE_PCPTYPES;
+       } else {
+               mtype_low = cc->migratetype;
+               mtype_high = cc->migratetype + 1;
+       }
+
+       /* Speculatively examine the free lists without zone lock */
+       for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+               int order;
+               for (order = cc->order; order < MAX_ORDER; order++) {
+                       struct page *page;
+                       struct free_area *area;
+                       area = &(cc->zone->free_area[order]);
+                       if (list_empty(&area->free_list[mtype]))
+                               continue;
+
+                       /* Take the lock and attempt capture of the page */
+                       if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+                               return;
+                       if (!list_empty(&area->free_list[mtype])) {
+                               page = list_entry(area->free_list[mtype].next,
+                                                       struct page, lru);
+                               if (capture_free_page(page, cc->order, mtype)) {
+                                       spin_unlock_irqrestore(&cc->zone->lock,
+                                                                       flags);
+                                       *cc->page = page;
+                                       return;
+                               }
+                       }
+                       spin_unlock_irqrestore(&cc->zone->lock, flags);
+               }
+       }
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -296,8 +347,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
        /* Time to isolate some pages for migration */
        cond_resched();
-       spin_lock_irqsave(&zone->lru_lock, flags);
-       locked = true;
+       locked = compact_trylock_irqsave(&zone->lru_lock, &flags, cc);
+       if (!locked)
+               return 0;
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
 
@@ -310,7 +362,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
                /* Check if it is ok to still hold the lock */
                locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
                                                                locked, cc);
-               if (!locked)
+               if (!locked || fatal_signal_pending(current))
                        break;
 
                /*
@@ -634,7 +686,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 
        /* Perform the isolation */
        low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
-       if (!low_pfn)
+       if (!low_pfn || cc->contended)
                return ISOLATE_ABORT;
 
        cc->migrate_pfn = low_pfn;
@@ -645,7 +697,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
                            struct compact_control *cc)
 {
-       unsigned int order;
        unsigned long watermark;
 
        if (fatal_signal_pending(current))
@@ -688,14 +739,22 @@ static int compact_finished(struct zone *zone,
                return COMPACT_CONTINUE;
 
        /* Direct compactor: Is a suitable page free? */
-       for (order = cc->order; order < MAX_ORDER; order++) {
-               /* Job done if page is free of the right migratetype */
-               if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
-                       return COMPACT_PARTIAL;
-
-               /* Job done if allocation would set block type */
-               if (order >= pageblock_order && zone->free_area[order].nr_free)
+       if (cc->page) {
+               /* Was a suitable page captured? */
+               if (*cc->page)
                        return COMPACT_PARTIAL;
+       } else {
+               unsigned int order;
+               for (order = cc->order; order < MAX_ORDER; order++) {
+                       struct free_area *area = &zone->free_area[order];
+                       /* Job done if page is free of the right migratetype */
+                       if (!list_empty(&area->free_list[cc->migratetype]))
+                               return COMPACT_PARTIAL;
+
+                       /* Job done if allocation would set block type */
+                       if (order >= pageblock_order && area->nr_free)
+                               return COMPACT_PARTIAL;
+               }
        }
 
        return COMPACT_CONTINUE;
@@ -817,6 +876,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                                goto out;
                        }
                }
+
+               /* Capture a page now if it is a suitable size */
+               compact_capture_page(cc);
        }
 
 out:
@@ -829,8 +891,10 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
                                 int order, gfp_t gfp_mask,
-                                bool sync, bool *contended)
+                                bool sync, bool *contended,
+                                struct page **page)
 {
+       unsigned long ret;
        struct compact_control cc = {
                .nr_freepages = 0,
                .nr_migratepages = 0,
@@ -838,12 +902,14 @@ static unsigned long compact_zone_order(struct zone *zone,
                .migratetype = allocflags_to_migratetype(gfp_mask),
                .zone = zone,
                .sync = sync,
-               .contended = contended,
+               .page = page,
        };
        INIT_LIST_HEAD(&cc.freepages);
        INIT_LIST_HEAD(&cc.migratepages);
 
-       return compact_zone(zone, &cc);
+       ret = compact_zone(zone, &cc);
+       *contended = cc.contended;
+       return ret;
 }
 
 int sysctl_extfrag_threshold = 500;
@@ -855,12 +921,14 @@ int sysctl_extfrag_threshold = 500;
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
  * @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
  *
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync, bool *contended)
+                       bool sync, bool *contended, struct page **page)
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        int may_enter_fs = gfp_mask & __GFP_FS;
@@ -869,11 +937,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
        struct zone *zone;
        int rc = COMPACT_SKIPPED;
 
-       /*
-        * Check whether it is worth even starting compaction. The order check is
-        * made because an assumption is made that the page allocator can satisfy
-        * the "cheaper" orders without taking special steps
-        */
+       /* Check if the GFP flags allow compaction */
        if (!order || !may_enter_fs || !may_perform_io)
                return rc;
 
@@ -885,7 +949,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
                int status;
 
                status = compact_zone_order(zone, order, gfp_mask, sync,
-                                               contended);
+                                               contended, page);
                rc = max(status, rc);
 
                /* If a normal allocation would succeed, stop compacting */
@@ -940,6 +1004,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
        struct compact_control cc = {
                .order = order,
                .sync = false,
+               .page = NULL,
        };
 
        return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1015,7 @@ static int compact_node(int nid)
        struct compact_control cc = {
                .order = -1,
                .sync = true,
+               .page = NULL,
        };
 
        return __compact_pgdat(NODE_DATA(nid), &cc);
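
A direct-compaction caller of the widened try_to_compact_pages() interface
would look roughly like this (a sketch following the page allocator's shape;
the variable names here are assumptions, not verbatim code):

        struct page *captured = NULL;
        bool contended = false;
        unsigned long rc;

        rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
                                  sync_migration, &contended, &captured);
        if (captured)   /* compaction captured a page of the requested order */
                return captured;
        /* if contended is set, async compaction aborted on a contended lock
         * and the caller can back off instead of retrying immediately */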
index 384344575c375e1a1464734670218557c31e0e27..83efee76a5c0b12c577903346eca32f7ca49be18 100644 (file)
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         * Do we have something in the page cache already?
         */
        page = find_get_page(mapping, offset);
-       if (likely(page)) {
+       if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
                /*
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
                do_async_mmap_readahead(vma, ra, file, page, offset);
-       } else {
+       } else if (!page) {
                /* No page in the page cache at all */
                do_sync_mmap_readahead(vma, ra, file, offset);
                count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = filemap_page_mkwrite,
+       .remap_pages    = generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
index 13e013b1270c6c240ba19eadb6e091e8c6afb13d..a52daee11d3feed2bf692a86f9df059f41effe98 100644 (file)
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm;
-       struct prio_tree_iter iter;
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                mm = vma->vm_mm;
                address = vma->vm_start +
                        ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -305,6 +304,7 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
        .fault  = xip_file_fault,
        .page_mkwrite   = filemap_page_mkwrite,
+       .remap_pages = generic_file_remap_pages,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +313,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
        file_accessed(file);
        vma->vm_ops = &xip_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+       vma->vm_flags |= VM_MIXEDMAP;
        return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
index 9ed4fd432467ee45a5310cd152fbba4399cbf722..a96e1b238255cd9d02850c2c578353a903b785e0 100644 (file)
@@ -5,6 +5,7 @@
  *
  * started by Ingo Molnar, Copyright (C) 2002, 2003
  */
+#include <linux/export.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
        return err;
 }
 
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+                            unsigned long size, pgoff_t pgoff)
 {
+       struct mm_struct *mm = vma->vm_mm;
        int err;
 
        do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
                pgoff++;
        } while (size);
 
-        return 0;
-
+       return 0;
 }
+EXPORT_SYMBOL(generic_file_remap_pages);
 
 /**
  * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
                goto out;
 
-       if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+       if (!vma->vm_ops->remap_pages)
                goto out;
 
        if (start < vma->vm_start || start + size > vma->vm_end)
@@ -213,7 +215,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
                vma->vm_flags |= VM_NONLINEAR;
-               vma_prio_tree_remove(vma, &mapping->i_mmap);
+               vma_interval_tree_remove(vma, &mapping->i_mmap);
                vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
                flush_dcache_mmap_unlock(mapping);
                mutex_unlock(&mapping->i_mmap_mutex);
@@ -229,7 +231,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        }
 
        mmu_notifier_invalidate_range_start(mm, start, start + size);
-       err = populate_range(mm, vma, start, size, pgoff);
+       err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
        mmu_notifier_invalidate_range_end(mm, start, start + size);
        if (!err && !(flags & MAP_NONBLOCK)) {
                if (vma->vm_flags & VM_LOCKED) {
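
The userspace ABI is unchanged; the syscall that now dispatches through
vma->vm_ops->remap_pages() is still used like this (a sketch; fd is assumed
to be an open, mmap-able file descriptor):

        #define _GNU_SOURCE
        #include <sys/mman.h>

        char *win = mmap(NULL, 4 * 4096, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);

        /* Rewire the first page of the window to file page 3; the vma
         * becomes VM_NONLINEAR and moves to i_mmap_nonlinear. */
        remap_file_pages(win, 4096, 0, 3, 0);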
index 57c4b93090151f2acbc1271b7b214fe5bc96478c..14bbf6488a9dd299d4cad58ba598e3675b37895a 100644 (file)
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void)
        unsigned long recommended_min;
        extern int min_free_kbytes;
 
-       if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-                     &transparent_hugepage_flags) &&
-           !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-                     &transparent_hugepage_flags))
+       if (!khugepaged_enabled())
                return 0;
 
        for_each_populated_zone(zone)
@@ -139,12 +136,6 @@ static int start_khugepaged(void)
 {
        int err = 0;
        if (khugepaged_enabled()) {
-               int wakeup;
-               if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               mutex_lock(&khugepaged_mutex);
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
@@ -154,16 +145,16 @@ static int start_khugepaged(void)
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
                }
-               wakeup = !list_empty(&khugepaged_scan.mm_head);
-               mutex_unlock(&khugepaged_mutex);
-               if (wakeup)
+
+               if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
 
                set_recommended_min_free_kbytes();
-       } else
-               /* wakeup to exit */
-               wake_up_interruptible(&khugepaged_wait);
-out:
+       } else if (khugepaged_thread) {
+               kthread_stop(khugepaged_thread);
+               khugepaged_thread = NULL;
+       }
+
        return err;
 }
 
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj,
                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
        if (ret > 0) {
-               int err = start_khugepaged();
+               int err;
+
+               mutex_lock(&khugepaged_mutex);
+               err = start_khugepaged();
+               mutex_unlock(&khugepaged_mutex);
+
                if (err)
                        ret = err;
        }
 
-       if (ret > 0 &&
-           (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-                     &transparent_hugepage_flags) ||
-            test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-                     &transparent_hugepage_flags)))
-               set_recommended_min_free_kbytes();
-
        return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -570,8 +559,6 @@ static int __init hugepage_init(void)
 
        start_khugepaged();
 
-       set_recommended_min_free_kbytes();
-
        return 0;
 out:
        hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +598,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-                                struct mm_struct *mm)
-{
-       assert_spin_locked(&mm->page_table_lock);
-
-       /* FIFO */
-       if (!mm->pmd_huge_pte)
-               INIT_LIST_HEAD(&pgtable->lru);
-       else
-               list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-       mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
        if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                 */
                page_add_new_anon_rmap(page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
-               prepare_pmd_huge_pte(pgtable, mm);
+               pgtable_trans_huge_deposit(mm, pgtable);
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm->nr_ptes++;
                spin_unlock(&mm->page_table_lock);
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       prepare_pmd_huge_pte(pgtable, dst_mm);
+       pgtable_trans_huge_deposit(dst_mm, pgtable);
        dst_mm->nr_ptes++;
 
        ret = 0;
@@ -802,25 +776,6 @@ out:
        return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-       pgtable_t pgtable;
-
-       assert_spin_locked(&mm->page_table_lock);
-
-       /* FIFO */
-       pgtable = mm->pmd_huge_pte;
-       if (list_empty(&pgtable->lru))
-               mm->pmd_huge_pte = NULL;
-       else {
-               mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-                                             struct page, lru);
-               list_del(&pgtable->lru);
-       }
-       return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
@@ -876,7 +831,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        pmdp_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
-       pgtable = get_pmd_huge_pte(mm);
+       pgtable = pgtable_trans_huge_withdraw(mm);
        pmd_populate(mm, &_pmd, pgtable);
 
        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1041,7 +996,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
                struct page *page;
                pgtable_t pgtable;
-               pgtable = get_pmd_huge_pte(tlb->mm);
+               pgtable = pgtable_trans_huge_withdraw(tlb->mm);
                page = pmd_page(*pmd);
                pmd_clear(pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
@@ -1358,7 +1313,7 @@ static int __split_huge_page_map(struct page *page,
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
        if (pmd) {
-               pgtable = get_pmd_huge_pte(mm);
+               pgtable = pgtable_trans_huge_withdraw(mm);
                pmd_populate(mm, &_pmd, pgtable);
 
                for (i = 0, haddr = address; i < HPAGE_PMD_NR;
@@ -1406,8 +1361,7 @@ static int __split_huge_page_map(struct page *page,
                 * SMP TLB and finally we write the non-huge version
                 * of the pmd entry with pmd_populate.
                 */
-               set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
-               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+               pmdp_invalidate(vma, address, pmd);
                pmd_populate(mm, pmd, pgtable);
                ret = 1;
        }
@@ -1421,18 +1375,17 @@ static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
 {
        int mapcount, mapcount2;
+       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct anon_vma_chain *avc;
 
        BUG_ON(!PageHead(page));
        BUG_ON(PageTail(page));
 
        mapcount = 0;
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
-               if (addr == -EFAULT)
-                       continue;
                mapcount += __split_huge_page_splitting(page, vma, addr);
        }
        /*
@@ -1453,12 +1406,10 @@ static void __split_huge_page(struct page *page,
        __split_huge_page_refcount(page);
 
        mapcount2 = 0;
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
-               if (addr == -EFAULT)
-                       continue;
                mapcount2 += __split_huge_page_map(page, vma, addr);
        }
        if (mapcount != mapcount2)
@@ -1491,12 +1442,13 @@ out:
        return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
-                  VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
 {
+       struct mm_struct *mm = vma->vm_mm;
+
        switch (advice) {
        case MADV_HUGEPAGE:
                /*
@@ -1504,6 +1456,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
                 */
                if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
                        return -EINVAL;
+               if (mm->def_flags & VM_NOHUGEPAGE)
+                       return -EINVAL;
                *vm_flags &= ~VM_NOHUGEPAGE;
                *vm_flags |= VM_HUGEPAGE;
                /*
@@ -1655,11 +1609,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
        if (vma->vm_ops)
                /* khugepaged not yet working on file or special mappings */
                return 0;
-       /*
-        * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-        * true too, verify it here.
-        */
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
@@ -1834,28 +1784,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
        }
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-                              unsigned long address,
-                              struct page **hpage,
-                              struct vm_area_struct *vma,
-                              int node)
+static void khugepaged_alloc_sleep(void)
 {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd, _pmd;
-       pte_t *pte;
-       pgtable_t pgtable;
-       struct page *new_page;
-       spinlock_t *ptl;
-       int isolated;
-       unsigned long hstart, hend;
+       wait_event_freezable_timeout(khugepaged_wait, false,
+                       msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
 
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
-       up_read(&mm->mmap_sem);
-       VM_BUG_ON(!*hpage);
-       new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+       if (IS_ERR(*hpage)) {
+               if (!*wait)
+                       return false;
+
+               *wait = false;
+               *hpage = NULL;
+               khugepaged_alloc_sleep();
+       } else if (*hpage) {
+               put_page(*hpage);
+               *hpage = NULL;
+       }
+
+       return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+                      struct vm_area_struct *vma, unsigned long address,
+                      int node)
+{
        VM_BUG_ON(*hpage);
        /*
         * Allocate the page while the vma is still valid and under
@@ -1867,7 +1824,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         * mmap_sem in read mode is good idea also to allow greater
         * scalability.
         */
-       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+       *hpage  = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
                                      node, __GFP_OTHER_NODE);
 
        /*
@@ -1875,20 +1832,83 @@ static void collapse_huge_page(struct mm_struct *mm,
         * preparation for taking it in write mode.
         */
        up_read(&mm->mmap_sem);
-       if (unlikely(!new_page)) {
+       if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
-               return;
+               return NULL;
        }
-#endif
 
        count_vm_event(THP_COLLAPSE_ALLOC);
-       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
-               put_page(new_page);
+       return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+       struct page *hpage;
+
+       do {
+               hpage = alloc_hugepage(khugepaged_defrag());
+               if (!hpage) {
+                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+                       if (!*wait)
+                               return NULL;
+
+                       *wait = false;
+                       khugepaged_alloc_sleep();
+               } else
+                       count_vm_event(THP_COLLAPSE_ALLOC);
+       } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+       return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+       if (!*hpage)
+               *hpage = khugepaged_alloc_hugepage(wait);
+
+       if (unlikely(!*hpage))
+               return false;
+
+       return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+                      struct vm_area_struct *vma, unsigned long address,
+                      int node)
+{
+       up_read(&mm->mmap_sem);
+       VM_BUG_ON(!*hpage);
+       return *hpage;
+}
 #endif
+
+static void collapse_huge_page(struct mm_struct *mm,
+                                  unsigned long address,
+                                  struct page **hpage,
+                                  struct vm_area_struct *vma,
+                                  int node)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd, _pmd;
+       pte_t *pte;
+       pgtable_t pgtable;
+       struct page *new_page;
+       spinlock_t *ptl;
+       int isolated;
+       unsigned long hstart, hend;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       /* khugepaged_alloc_page() releases the mmap_sem read lock. */
+       new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+       if (!new_page)
+               return;
+
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
                return;
-       }
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -1913,11 +1933,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
        if (is_vma_temporary_stack(vma))
                goto out;
-       /*
-        * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-        * true too, verify it here.
-        */
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -1971,8 +1987,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        pte_unmap(pte);
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
-       VM_BUG_ON(page_count(pgtable) != 1);
-       VM_BUG_ON(page_mapcount(pgtable) != 0);
 
        _pmd = mk_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1990,12 +2004,11 @@ static void collapse_huge_page(struct mm_struct *mm,
        page_add_new_anon_rmap(new_page, vma, address);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache(vma, address, _pmd);
-       prepare_pmd_huge_pte(pgtable, mm);
+       pgtable_trans_huge_deposit(mm, pgtable);
        spin_unlock(&mm->page_table_lock);
 
-#ifndef CONFIG_NUMA
        *hpage = NULL;
-#endif
+
        khugepaged_pages_collapsed++;
 out_up_write:
        up_write(&mm->mmap_sem);
@@ -2003,9 +2016,6 @@ out_up_write:
 
 out:
        mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
-       put_page(new_page);
-#endif
        goto out_up_write;
 }
 
@@ -2155,12 +2165,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        goto skip;
                if (is_vma_temporary_stack(vma))
                        goto skip;
-               /*
-                * If is_pfn_mapping() is true is_learn_pfn_mapping()
-                * must be true too, verify it here.
-                */
-               VM_BUG_ON(is_linear_pfn_mapping(vma) ||
-                         vma->vm_flags & VM_NO_THP);
+               VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2235,32 +2240,23 @@ static int khugepaged_has_work(void)
 static int khugepaged_wait_event(void)
 {
        return !list_empty(&khugepaged_scan.mm_head) ||
-               !khugepaged_enabled();
+               kthread_should_stop();
 }
 
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
 {
+       struct page *hpage = NULL;
        unsigned int progress = 0, pass_through_head = 0;
        unsigned int pages = khugepaged_pages_to_scan;
+       bool wait = true;
 
        barrier(); /* write khugepaged_pages_to_scan to local stack */
 
        while (progress < pages) {
-               cond_resched();
-
-#ifndef CONFIG_NUMA
-               if (!*hpage) {
-                       *hpage = alloc_hugepage(khugepaged_defrag());
-                       if (unlikely(!*hpage)) {
-                               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-                               break;
-                       }
-                       count_vm_event(THP_COLLAPSE_ALLOC);
-               }
-#else
-               if (IS_ERR(*hpage))
+               if (!khugepaged_prealloc_page(&hpage, &wait))
                        break;
-#endif
+
+               cond_resched();
 
                if (unlikely(kthread_should_stop() || freezing(current)))
                        break;
@@ -2271,73 +2267,32 @@ static void khugepaged_do_scan(struct page **hpage)
                if (khugepaged_has_work() &&
                    pass_through_head < 2)
                        progress += khugepaged_scan_mm_slot(pages - progress,
-                                                           hpage);
+                                                           &hpage);
                else
                        progress = pages;
                spin_unlock(&khugepaged_mm_lock);
        }
-}
-
-static void khugepaged_alloc_sleep(void)
-{
-       wait_event_freezable_timeout(khugepaged_wait, false,
-                       msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
-}
-
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
-{
-       struct page *hpage;
 
-       do {
-               hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage) {
-                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-                       khugepaged_alloc_sleep();
-               } else
-                       count_vm_event(THP_COLLAPSE_ALLOC);
-       } while (unlikely(!hpage) &&
-                likely(khugepaged_enabled()));
-       return hpage;
+       if (!IS_ERR_OR_NULL(hpage))
+               put_page(hpage);
 }
-#endif
 
-static void khugepaged_loop(void)
+static void khugepaged_wait_work(void)
 {
-       struct page *hpage;
+       try_to_freeze();
 
-#ifdef CONFIG_NUMA
-       hpage = NULL;
-#endif
-       while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
-               hpage = khugepaged_alloc_hugepage();
-               if (unlikely(!hpage))
-                       break;
-#else
-               if (IS_ERR(hpage)) {
-                       khugepaged_alloc_sleep();
-                       hpage = NULL;
-               }
-#endif
+       if (khugepaged_has_work()) {
+               if (!khugepaged_scan_sleep_millisecs)
+                       return;
 
-               khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
-               if (hpage)
-                       put_page(hpage);
-#endif
-               try_to_freeze();
-               if (unlikely(kthread_should_stop()))
-                       break;
-               if (khugepaged_has_work()) {
-                       if (!khugepaged_scan_sleep_millisecs)
-                               continue;
-                       wait_event_freezable_timeout(khugepaged_wait, false,
-                           msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
-               } else if (khugepaged_enabled())
-                       wait_event_freezable(khugepaged_wait,
-                                            khugepaged_wait_event());
+               wait_event_freezable_timeout(khugepaged_wait,
+                                            kthread_should_stop(),
+                       msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+               return;
        }
+
+       if (khugepaged_enabled())
+               wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
 static int khugepaged(void *none)
@@ -2347,20 +2302,9 @@ static int khugepaged(void *none)
        set_freezable();
        set_user_nice(current, 19);
 
-       /* serialize with start_khugepaged() */
-       mutex_lock(&khugepaged_mutex);
-
-       for (;;) {
-               mutex_unlock(&khugepaged_mutex);
-               VM_BUG_ON(khugepaged_thread != current);
-               khugepaged_loop();
-               VM_BUG_ON(khugepaged_thread != current);
-
-               mutex_lock(&khugepaged_mutex);
-               if (!khugepaged_enabled())
-                       break;
-               if (unlikely(kthread_should_stop()))
-                       break;
+       while (!kthread_should_stop()) {
+               khugepaged_do_scan();
+               khugepaged_wait_work();
        }
 
        spin_lock(&khugepaged_mm_lock);
@@ -2369,10 +2313,6 @@ static int khugepaged(void *none)
        if (mm_slot)
                collect_mm_slot(mm_slot);
        spin_unlock(&khugepaged_mm_lock);
-
-       khugepaged_thread = NULL;
-       mutex_unlock(&khugepaged_mutex);
-
        return 0;
 }
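
pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() generalize the
per-mm FIFO that the deleted prepare_pmd_huge_pte()/get_pmd_huge_pte() pair
open-coded; the generic deposit side behaves essentially like this sketch,
mirroring the code removed above:

void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
        assert_spin_locked(&mm->page_table_lock);

        /* FIFO: stash the preallocated pte page for a later pmd split */
        if (!mm->pmd_huge_pte)
                INIT_LIST_HEAD(&pgtable->lru);
        else
                list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
        mm->pmd_huge_pte = pgtable;
}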
 
index bc727122dd44de6c4ae9307c618e58ddf4da3c87..8536741f069b2449bbaab9e752431fb9eb4527a6 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
                h->surplus_huge_pages--;
                h->surplus_huge_pages_node[nid]--;
        } else {
+               arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
        }
        spin_unlock(&hugetlb_lock);
@@ -2473,7 +2473,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
        struct hstate *h = hstate_vma(vma);
        struct vm_area_struct *iter_vma;
        struct address_space *mapping;
-       struct prio_tree_iter iter;
        pgoff_t pgoff;
 
        /*
@@ -2490,7 +2489,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         * __unmap_hugepage_range() is called as the lock is already held
         */
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
                /* Do not unmap the current VMA */
                if (iter_vma == vma)
                        continue;
index b8c91b342e244153ec9b24b4673e6dd2575af267..eebbed59b85b26bab16d2d9da47a8d7849ce24ea 100644 (file)
@@ -12,6 +12,7 @@
 #define __MM_INTERNAL_H
 
 #include <linux/mm.h>
+#include <linux/rmap.h>
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
@@ -130,7 +131,8 @@ struct compact_control {
        int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
-       bool *contended;                /* True if a lock was contended */
+       bool contended;                 /* True if a lock was contended */
+       struct page **page;             /* Captured page of requested size, if any */
 };
 
 unsigned long
@@ -340,7 +342,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define ZONE_RECLAIM_FULL      -1
 #define ZONE_RECLAIM_SOME      0
 #define ZONE_RECLAIM_SUCCESS   1
-#endif
 
 extern int hwpoison_filter(struct page *p);
 
@@ -356,3 +357,54 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+                                           struct list_head *page_list);
+
+/*
+ * Unnecessary readahead harms performance, especially for SSD devices, where
+ * large reads are significantly more expensive than small ones.
+ * The helpers below implement simple swap random-access detection: on a swap
+ * page fault, if the page is found in the swap cache, a per-vma counter is
+ * decreased; otherwise a synchronous swapin is needed and the counter is
+ * increased. Swapin will optionally perform readahead only while the counter
+ * stays below a threshold.
+ */
+#ifdef CONFIG_SWAP
+#define SWAPRA_MISS_THRESHOLD  (100)
+#define SWAPRA_MAX_MISS ((SWAPRA_MISS_THRESHOLD) * 10)
+static inline void swap_cache_hit(struct vm_area_struct *vma)
+{
+       if (vma && vma->anon_vma)
+               atomic_dec_if_positive(&vma->anon_vma->swapra_miss);
+}
+
+static inline void swap_cache_miss(struct vm_area_struct *vma)
+{
+       if (!vma || !vma->anon_vma)
+               return;
+       if (atomic_read(&vma->anon_vma->swapra_miss) < SWAPRA_MAX_MISS)
+               atomic_inc(&vma->anon_vma->swapra_miss);
+}
+
+static inline int swap_cache_skip_readahead(struct vm_area_struct *vma)
+{
+       if (!vma || !vma->anon_vma)
+               return 0;
+       return atomic_read(&vma->anon_vma->swapra_miss) >
+               SWAPRA_MISS_THRESHOLD;
+}
+#else
+static inline void swap_cache_hit(struct vm_area_struct *vma)
+{
+}
+
+static inline void swap_cache_miss(struct vm_area_struct *vma)
+{
+}
+
+static inline int swap_cache_skip_readahead(struct vm_area_struct *vma)
+{
+       return 0;
+}
+#endif /* CONFIG_SWAP */
+
+#endif /* __MM_INTERNAL_H */
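
A swap fault path would drive these counters roughly as follows (a sketch of
an assumed do_swap_page() shape, with swap_cache_skip_readahead() consulted
inside swapin_readahead() to fall back to a single-page read):

        page = lookup_swap_cache(entry);
        if (page) {
                swap_cache_hit(vma);    /* earlier readahead paid off */
        } else {
                swap_cache_miss(vma);   /* synchronous swapin needed */
                page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                        vma, address);
        }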
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644 (file)
index 0000000..4a5822a
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+       return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+       return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+                    unsigned long, shared.linear.rb_subtree_last,
+                    vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+                                   struct vm_area_struct *prev,
+                                   struct rb_root *root)
+{
+       struct rb_node **link;
+       struct vm_area_struct *parent;
+       unsigned long last = vma_last_pgoff(node);
+
+       VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+
+       if (!prev->shared.linear.rb.rb_right) {
+               parent = prev;
+               link = &prev->shared.linear.rb.rb_right;
+       } else {
+               parent = rb_entry(prev->shared.linear.rb.rb_right,
+                                 struct vm_area_struct, shared.linear.rb);
+               if (parent->shared.linear.rb_subtree_last < last)
+                       parent->shared.linear.rb_subtree_last = last;
+               while (parent->shared.linear.rb.rb_left) {
+                       parent = rb_entry(parent->shared.linear.rb.rb_left,
+                               struct vm_area_struct, shared.linear.rb);
+                       if (parent->shared.linear.rb_subtree_last < last)
+                               parent->shared.linear.rb_subtree_last = last;
+               }
+               link = &parent->shared.linear.rb.rb_left;
+       }
+
+       node->shared.linear.rb_subtree_last = last;
+       rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+       rb_insert_augmented(&node->shared.linear.rb, root,
+                           &vma_interval_tree_augment);
+}
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+       return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+       return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+                    avc_start_pgoff, avc_last_pgoff,
+                    static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+                                  struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+       node->cached_vma_start = avc_start_pgoff(node);
+       node->cached_vma_last = avc_last_pgoff(node);
+#endif
+       __anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+                                  struct rb_root *root)
+{
+       __anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+                                 unsigned long first, unsigned long last)
+{
+       return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+                                unsigned long first, unsigned long last)
+{
+       return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+       WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+       WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
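
The INTERVAL_TREE_DEFINE instantiations above generate insert/remove plus an iter_first/iter_next pair (compare the __anon_vma_interval_tree wrappers just before this). A hedged sketch of a range walk built on the vma flavour, assuming the rest of this series (mapping->i_mmap as an rb_root, caller holding i_mmap_mutex); demo_walk is a hypothetical helper, not part of the patch:

#include <linux/mm.h>

/* Hypothetical: visit every vma mapping pages [first, last] of a file. */
static void demo_walk(struct address_space *mapping,
                      pgoff_t first, pgoff_t last)
{
        struct vm_area_struct *vma;

        /* Caller is assumed to hold mapping->i_mmap_mutex. */
        vma = vma_interval_tree_iter_first(&mapping->i_mmap, first, last);
        while (vma) {
                /* ... operate on one overlapping vma ... */
                vma = vma_interval_tree_iter_next(vma, first, last);
        }
}
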
index 875734dcd4cb6dbba238f7edc467732ae6a06ea9..a217cc54406065f6cef571bb35f7b19a22e295c8 100644 (file)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
  * - kmemleak_lock (rwlock): protects the object_list modifications and
  *   accesses to the object_tree_root. The object_list is the main list
  *   holding the metadata (struct kmemleak_object) for the allocated memory
- *   blocks. The object_tree_root is a priority search tree used to look-up
+ *   blocks. The object_tree_root is a red black tree used to look-up
  *   metadata based on a pointer to the corresponding memory block.  The
  *   kmemleak_object structures are added to the object_list and
  *   object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/kthread.h>
-#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
  * Structure holding the metadata for each allocated memory block.
  * Modifications to such objects should be made while holding the
  * object->lock. Insertions or deletions from object_list, gray_list or
- * tree_node are already protected by the corresponding locks or mutex (see
+ * rb_node are already protected by the corresponding locks or mutex (see
  * the notes on locking above). These objects are reference-counted
  * (use_count) and freed using the RCU mechanism.
  */
@@ -141,7 +141,7 @@ struct kmemleak_object {
        unsigned long flags;            /* object status flags */
        struct list_head object_list;
        struct list_head gray_list;
-       struct prio_tree_node tree_node;
+       struct rb_node rb_node;
        struct rcu_head rcu;            /* object_list lockless traversal */
        /* object usage count; object freed when use_count == 0 */
        atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
 static LIST_HEAD(object_list);
 /* the list of gray-colored objects (see color_gray comment below) */
 static LIST_HEAD(gray_list);
-/* prio search tree for object boundaries */
-static struct prio_tree_root object_tree_root;
-/* rw_lock protecting the access to object_list and prio_tree_root */
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
 static DEFINE_RWLOCK(kmemleak_lock);
 
 /* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
        trace.entries = object->trace;
 
        pr_notice("Object 0x%08lx (size %zu):\n",
-                 object->tree_node.start, object->size);
+                 object->pointer, object->size);
        pr_notice("  comm \"%s\", pid %d, jiffies %lu\n",
                  object->comm, object->pid, object->jiffies);
        pr_notice("  min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
 }
 
 /*
- * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * Look-up a memory block metadata (kmemleak_object) in the object search
  * tree based on a pointer value. If alias is 0, only values pointing to the
  * beginning of the memory block are allowed. The kmemleak_lock must be held
  * when calling this function.
  */
 static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
 {
-       struct prio_tree_node *node;
-       struct prio_tree_iter iter;
-       struct kmemleak_object *object;
-
-       prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
-       node = prio_tree_next(&iter);
-       if (node) {
-               object = prio_tree_entry(node, struct kmemleak_object,
-                                        tree_node);
-               if (!alias && object->pointer != ptr) {
+       struct rb_node *rb = object_tree_root.rb_node;
+
+       while (rb) {
+               struct kmemleak_object *object =
+                       rb_entry(rb, struct kmemleak_object, rb_node);
+               if (ptr < object->pointer)
+                       rb = object->rb_node.rb_left;
+               else if (object->pointer + object->size <= ptr)
+                       rb = object->rb_node.rb_right;
+               else if (object->pointer == ptr || alias)
+                       return object;
+               else {
                        kmemleak_warn("Found object by alias at 0x%08lx\n",
                                      ptr);
                        dump_object_info(object);
-                       object = NULL;
+                       break;
                }
-       } else
-               object = NULL;
-
-       return object;
+       }
+       return NULL;
 }
 
 /*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
 }
 
 /*
- * Look up an object in the prio search tree and increase its use_count.
+ * Look up an object in the object search tree and increase its use_count.
  */
 static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
 {
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
                                             int min_count, gfp_t gfp)
 {
        unsigned long flags;
-       struct kmemleak_object *object;
-       struct prio_tree_node *node;
+       struct kmemleak_object *object, *parent;
+       struct rb_node **link, *rb_parent;
 
        object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
        if (!object) {
@@ -560,32 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
        /* kernel backtrace */
        object->trace_len = __save_stack_trace(object->trace);
 
-       INIT_PRIO_TREE_NODE(&object->tree_node);
-       object->tree_node.start = ptr;
-       object->tree_node.last = ptr + size - 1;
-
        write_lock_irqsave(&kmemleak_lock, flags);
 
        min_addr = min(min_addr, ptr);
        max_addr = max(max_addr, ptr + size);
-       node = prio_tree_insert(&object_tree_root, &object->tree_node);
-       /*
-        * The code calling the kernel does not yet have the pointer to the
-        * memory block to be able to free it.  However, we still hold the
-        * kmemleak_lock here in case parts of the kernel started freeing
-        * random memory blocks.
-        */
-       if (node != &object->tree_node) {
-               kmemleak_stop("Cannot insert 0x%lx into the object search tree "
-                             "(already existing)\n", ptr);
-               kmem_cache_free(object_cache, object);
-               object = lookup_object(ptr, 1);
-               spin_lock(&object->lock);
-               dump_object_info(object);
-               spin_unlock(&object->lock);
-
-               goto out;
+       link = &object_tree_root.rb_node;
+       rb_parent = NULL;
+       while (*link) {
+               rb_parent = *link;
+               parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+               if (ptr + size <= parent->pointer)
+                       link = &parent->rb_node.rb_left;
+               else if (parent->pointer + parent->size <= ptr)
+                       link = &parent->rb_node.rb_right;
+               else {
+                       kmemleak_stop("Cannot insert 0x%lx into the object "
+                                     "search tree (overlaps existing)\n",
+                                     ptr);
+                       kmem_cache_free(object_cache, object);
+                       object = parent;
+                       spin_lock(&object->lock);
+                       dump_object_info(object);
+                       spin_unlock(&object->lock);
+                       goto out;
+               }
        }
+       rb_link_node(&object->rb_node, rb_parent, link);
+       rb_insert_color(&object->rb_node, &object_tree_root);
+
        list_add_tail_rcu(&object->object_list, &object_list);
 out:
        write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -601,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
        unsigned long flags;
 
        write_lock_irqsave(&kmemleak_lock, flags);
-       prio_tree_remove(&object_tree_root, &object->tree_node);
+       rb_erase(&object->rb_node, &object_tree_root);
        list_del_rcu(&object->object_list);
        write_unlock_irqrestore(&kmemleak_lock, flags);
 
@@ -1767,7 +1769,6 @@ void __init kmemleak_init(void)
 
        object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
        scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
-       INIT_PRIO_TREE_ROOT(&object_tree_root);
 
        if (crt_early_log >= ARRAY_SIZE(early_log))
                pr_warning("Early log buffer exceeded (%d), please increase "
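
For reference, the insertion in create_object() above follows the stock <linux/rbtree.h> idiom that replaces the prio-tree API: descend the rb_node links to find the empty slot, then rb_link_node() plus rb_insert_color(). A self-contained sketch with a hypothetical range-keyed node (demo_* names are illustrative, not kmemleak's):

#include <linux/errno.h>
#include <linux/rbtree.h>

/* Hypothetical node keyed by the address range [start, start + size). */
struct demo_obj {
        struct rb_node rb;
        unsigned long start;
        size_t size;
};

static struct rb_root demo_root = RB_ROOT;

static int demo_insert(struct demo_obj *new)
{
        struct rb_node **link = &demo_root.rb_node, *parent = NULL;

        while (*link) {
                struct demo_obj *obj;

                parent = *link;
                obj = rb_entry(parent, struct demo_obj, rb);
                if (new->start + new->size <= obj->start)
                        link = &parent->rb_left;
                else if (obj->start + obj->size <= new->start)
                        link = &parent->rb_right;
                else
                        return -EEXIST;         /* ranges overlap */
        }
        rb_link_node(&new->rb, parent, link);   /* hook into the slot */
        rb_insert_color(&new->rb, &demo_root);  /* rebalance */
        return 0;
}
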
index 47c885368890741407eb45e8699acb12fc773aca..14ee5cf8a513c7abe24ff47122eec918856a3611 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1469,10 +1469,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                 */
                if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
                                 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-                                VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-                                VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
+                                VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
                        return 0;               /* just ignore the advice */
 
+#ifdef VM_SAO
+               if (*vm_flags & VM_SAO)
+                       return 0;
+#endif
+
                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
                        err = __ksm_enter(mm);
                        if (err)
@@ -1614,7 +1618,8 @@ again:
                struct vm_area_struct *vma;
 
                anon_vma_lock(anon_vma);
-               list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+               anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+                                              0, ULONG_MAX) {
                        vma = vmac->vma;
                        if (rmap_item->address < vma->vm_start ||
                            rmap_item->address >= vma->vm_end)
@@ -1667,7 +1672,8 @@ again:
                struct vm_area_struct *vma;
 
                anon_vma_lock(anon_vma);
-               list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+               anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+                                              0, ULONG_MAX) {
                        vma = vmac->vma;
                        if (rmap_item->address < vma->vm_start ||
                            rmap_item->address >= vma->vm_end)
@@ -1719,7 +1725,8 @@ again:
                struct vm_area_struct *vma;
 
                anon_vma_lock(anon_vma);
-               list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+               anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+                                              0, ULONG_MAX) {
                        vma = vmac->vma;
                        if (rmap_item->address < vma->vm_start ||
                            rmap_item->address >= vma->vm_end)
index 14d260fa0d17939a2279c244df91789cd30720e4..03dfa5c7adb3c41acdf672b6504516073866c996 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_DONTDUMP:
-               new_flags |= VM_NODUMP;
+               new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
-               new_flags &= ~VM_NODUMP;
+               if (new_flags & VM_SPECIAL) {
+                       error = -EINVAL;
+                       goto out;
+               }
+               new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
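
The VM_NODUMP -> VM_DONTDUMP rename is kernel-internal; the userspace interface stays madvise(2) with MADV_DONTDUMP/MADV_DODUMP (available since Linux 3.4). A minimal hypothetical caller:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Exclude the region from core dumps... */
        if (madvise(buf, len, MADV_DONTDUMP))
                perror("madvise(MADV_DONTDUMP)");

        /* ...and opt back in later. */
        if (madvise(buf, len, MADV_DODUMP))
                perror("madvise(MADV_DODUMP)");

        munmap(buf, len);
        return 0;
}
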
index 82aa349d2f7a040b489bee441bf848b61119f788..8fb6510eb8e27bd640d42c8ac5a1bfe5785c86f6 100644 (file)
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -756,7 +756,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
                return ret;
 
        for (i = start_rgn; i < end_rgn; i++)
-               type->regions[i].nid = nid;
+               memblock_set_region_node(&type->regions[i], nid);
 
        memblock_merge_regions(type);
        return 0;
@@ -888,6 +888,11 @@ int __init memblock_is_reserved(phys_addr_t addr)
 
 int __init_memblock memblock_is_memory(phys_addr_t addr)
 {
+
+       if (unlikely(addr < memblock_start_of_DRAM() ||
+               addr >= memblock_end_of_DRAM()))
+               return 0;
+
        return memblock_search(&memblock.memory, addr) != -1;
 }
 
index a6e2141a6610bfd4fab325e343149a25fadc8835..6c5899b9034aa7d769e5fd80e139cad61af104c0 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct anon_vma *av;
+       pgoff_t pgoff;
 
        av = page_lock_anon_vma(page);
        if (av == NULL) /* Not actually mapped anymore */
                return;
 
+       pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        read_lock(&tasklist_lock);
        for_each_process (tsk) {
                struct anon_vma_chain *vmac;
 
                if (!task_early_kill(tsk))
                        continue;
-               list_for_each_entry(vmac, &av->head, same_anon_vma) {
+               anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+                                              pgoff, pgoff) {
                        vma = vmac->vma;
                        if (!page_mapped_in_vma(page, vma))
                                continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
-       struct prio_tree_iter iter;
        struct address_space *mapping = page->mapping;
 
        mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                if (!task_early_kill(tsk))
                        continue;
 
-               vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
                                      pgoff) {
                        /*
                         * Send early kill signal to tasks where a vma covers
index 57361708d1a57d7bc11c8f34d269a35c50317dbb..74691f159aab2eac6b1874d03320399e55d8bf9f 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
-       if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+       if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+                              VM_PFNMAP | VM_MIXEDMAP))) {
                if (!vma->anon_vma)
                        return 0;
        }
@@ -1055,12 +1056,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
-       if (unlikely(is_pfn_mapping(vma))) {
+       if (unlikely(vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
-               ret = track_pfn_vma_copy(vma);
+               ret = track_pfn_copy(vma);
                if (ret)
                        return ret;
        }
@@ -1327,8 +1328,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
        if (vma->vm_file)
                uprobe_munmap(vma, start, end);
 
-       if (unlikely(is_pfn_mapping(vma)))
-               untrack_pfn_vma(vma, 0, 0);
+       if (unlikely(vma->vm_flags & VM_PFNMAP))
+               untrack_pfn(vma, 0, 0);
 
        if (start != end) {
                if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -2085,6 +2086,11 @@ out:
  * ask for a shared writable mapping!
  *
  * The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
  */
 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
@@ -2093,7 +2099,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                return -EFAULT;
        if (!page_count(page))
                return -EINVAL;
-       vma->vm_flags |= VM_INSERTPAGE;
+       if (!(vma->vm_flags & VM_MIXEDMAP)) {
+               BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+               BUG_ON(vma->vm_flags & VM_PFNMAP);
+               vma->vm_flags |= VM_MIXEDMAP;
+       }
        return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
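
A sketch of the usual caller the new comment describes: a hypothetical driver's f_op->mmap handler, where mmap_sem is held for writing so vm_insert_page() may set VM_MIXEDMAP itself (demo_* names are illustrative):

#include <linux/fs.h>
#include <linux/mm.h>

/* demo_page: a hypothetical page the driver allocated at probe time. */
static struct page *demo_page;

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
        /*
         * f_op->mmap runs under mm->mmap_sem held for writing, so
         * vm_insert_page() is free to flip VM_MIXEDMAP on the vma.
         */
        return vm_insert_page(vma, vma->vm_start, demo_page);
}
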
@@ -2162,14 +2172,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
+       if (track_pfn_insert(vma, &pgprot, pfn))
                return -EINVAL;
 
        ret = insert_pfn(vma, addr, pfn, pgprot);
 
-       if (ret)
-               untrack_pfn_vma(vma, pfn, PAGE_SIZE);
-
        return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2297,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *      (accesses can have side effects).
-        *   VM_RESERVED is specified all over the place, because
-        *      in 2.4 it kept swapout's vma scan off this vma; but
-        *      in 2.6 the LRU scan won't even find its pages, so this
-        *      flag means no more than count its pages in reserved_vm,
-        *      and omit it from core dump, even when VM_IO turned off.
         *   VM_PFNMAP tells the core MM that the base pages are just
         *      raw PFN mappings, and do not have a "struct page" associated
         *      with them.
+        *   VM_DONTEXPAND
+        *      Disable vma merging and expanding with mremap().
+        *   VM_DONTDUMP
+        *      Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+        * See vm_normal_page() for details.
         */
-       if (addr == vma->vm_start && end == vma->vm_end) {
+       if (is_cow_mapping(vma->vm_flags)) {
+               if (addr != vma->vm_start || end != vma->vm_end)
+                       return -EINVAL;
                vma->vm_pgoff = pfn;
-               vma->vm_flags |= VM_PFN_AT_MMAP;
-       } else if (is_cow_mapping(vma->vm_flags))
-               return -EINVAL;
-
-       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+       }
 
-       err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
-       if (err) {
-               /*
-                * To indicate that track_pfn related cleanup is not
-                * needed from higher level routine calling unmap_vmas
-                */
-               vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
-               vma->vm_flags &= ~VM_PFN_AT_MMAP;
+       err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+       if (err)
                return -EINVAL;
-       }
+
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
        BUG_ON(addr >= end);
        pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2335,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
        } while (pgd++, addr = next, addr != end);
 
        if (err)
-               untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+               untrack_pfn(vma, pfn, PAGE_ALIGN(size));
 
        return err;
 }
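
With this change a driver no longer sets (or clears on error) the VM_IO/VM_PFNMAP flag family itself; remap_pfn_range() applies VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP internally. A hypothetical mmap handler, where demo_phys is an MMIO base the driver is assumed to own:

#include <linux/fs.h>
#include <linux/mm.h>

static phys_addr_t demo_phys;   /* hypothetical MMIO base address */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        /* The caller only supplies the pfn and protection now. */
        return remap_pfn_range(vma, vma->vm_start,
                               demo_phys >> PAGE_SHIFT,
                               size, vma->vm_page_prot);
}
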
@@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
        zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
 }
 
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
                                            struct zap_details *details)
 {
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        pgoff_t vba, vea, zba, zea;
 
-       vma_prio_tree_foreach(vma, &iter, root,
+       vma_interval_tree_foreach(vma, root,
                        details->first_index, details->last_index) {
 
                vba = vma->vm_pgoff;
@@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
         * across *all* the pages in each nonlinear VMA, not just the pages
         * whose virtual address lies outside the file truncation point.
         */
-       list_for_each_entry(vma, head, shared.vm_set.list) {
+       list_for_each_entry(vma, head, shared.nonlinear) {
                details->nonlinear_vma = vma;
                unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
        }
@@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping,
 
 
        mutex_lock(&mapping->i_mmap_mutex);
-       if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+       if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
@@ -2953,7 +2952,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                goto out_release;
-       }
+       } else if (!(flags & FAULT_FLAG_TRIED))
+               swap_cache_hit(vma);
 
        locked = lock_page_or_retry(page, mm, flags);
 
index 6a5b90d0cfd7cbd61f4bc679bb4313e7a374ac0d..f9ac0955e10a5af0979fde81d7124578c522a317 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -756,13 +756,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
        return 0;
 }
 
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
-       /* This should be improooooved!! */
-       return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
 #define NR_OFFLINE_AT_ONCE_PAGES       (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +806,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                        putback_lru_pages(&source);
                        goto out;
                }
-               /* this function returns # of failed pages */
-               ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+
+               /*
+                * alloc_migrate_target should be improooooved!!
+                * migrate_pages returns # of failed pages.
+                */
+               ret = migrate_pages(&source, alloc_migrate_target, 0,
                                                        true, MIGRATE_SYNC);
                if (ret)
                        putback_lru_pages(&source);
@@ -970,8 +967,13 @@ repeat:
 
        init_per_zone_wmark_min();
 
-       if (!populated_zone(zone))
+       if (!populated_zone(zone)) {
                zone_pcp_reset(zone);
+               mutex_lock(&zonelists_mutex);
+               build_all_zonelists(NULL, NULL);
+               mutex_unlock(&zonelists_mutex);
+       } else
+               zone_pcp_update(zone);
 
        if (!node_present_pages(node)) {
                node_clear_state(node, N_HIGH_MEMORY);
index 4ada3be6e2521278de6da6e110995d63ccd7ed8f..3d64b369180d603225ea92348a6e5d0e10a8a06c 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        return first;
 }
 
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+                                               struct mempolicy *pol)
+{
+       int err;
+       struct mempolicy *old;
+       struct mempolicy *new;
+
+       pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+                vma->vm_start, vma->vm_end, vma->vm_pgoff,
+                vma->vm_ops, vma->vm_file,
+                vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+       new = mpol_dup(pol);
+       if (IS_ERR(new))
+               return PTR_ERR(new);
+
+       if (vma->vm_ops && vma->vm_ops->set_policy) {
+               err = vma->vm_ops->set_policy(vma, new);
+               if (err)
+                       goto err_out;
+       }
+
+       old = vma->vm_policy;
+       vma->vm_policy = new; /* protected by mmap_sem */
+       mpol_put(old);
+
+       return 0;
+ err_out:
+       mpol_put(new);
+       return err;
+}
+
 /* Step 2: apply policy to a range and do splits. */
 static int mbind_range(struct mm_struct *mm, unsigned long start,
                       unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                        if (err)
                                goto out;
                }
-
-               /*
-                * Apply policy to a single VMA. The reference counting of
-                * policy for vma_policy linkages has already been handled by
-                * vma_merge and split_vma as necessary. If this is a shared
-                * policy then ->set_policy will increment the reference count
-                * for an sp node.
-                */
-               pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-                       vma->vm_start, vma->vm_end, vma->vm_pgoff,
-                       vma->vm_ops, vma->vm_file,
-                       vma->vm_ops ? vma->vm_ops->set_policy : NULL);
-               if (vma->vm_ops && vma->vm_ops->set_policy) {
-                       err = vma->vm_ops->set_policy(vma, new_pol);
-                       if (err)
-                               goto out;
-               }
+               err = vma_replace_policy(vma, new_pol);
+               if (err)
+                       goto out;
        }
 
  out:
@@ -1530,8 +1552,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
                                                                        addr);
                        if (vpol)
                                pol = vpol;
-               } else if (vma->vm_policy)
+               } else if (vma->vm_policy) {
                        pol = vma->vm_policy;
+
+                       /*
+                        * shmem_alloc_page() passes MPOL_F_SHARED policy with
+                        * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+                        * count on these policies which will be dropped by
+                        * mpol_cond_put() later
+                        */
+                       if (mpol_needs_cond_ref(pol))
+                               mpol_get(pol);
+               }
        }
        if (!pol)
                pol = &default_policy;
@@ -2061,7 +2093,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2125,36 +2157,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
        if (!sp->root.rb_node)
                return NULL;
-       spin_lock(&sp->lock);
+       mutex_lock(&sp->mutex);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
-       spin_unlock(&sp->lock);
+       mutex_unlock(&sp->mutex);
        return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+       mpol_put(n->policy);
+       kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
        pr_debug("deleting %lx-l%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
-       mpol_put(n->policy);
-       kmem_cache_free(sn_cache, n);
+       sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
 {
-       struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+       struct sp_node *n;
+       struct mempolicy *newpol;
 
+       n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n)
                return NULL;
+
+       newpol = mpol_dup(pol);
+       if (IS_ERR(newpol)) {
+               kmem_cache_free(sn_cache, n);
+               return NULL;
+       }
+       newpol->flags |= MPOL_F_SHARED;
+
        n->start = start;
        n->end = end;
-       mpol_get(pol);
-       pol->flags |= MPOL_F_SHARED;    /* for unref */
-       n->policy = pol;
+       n->policy = newpol;
+
        return n;
 }
 
@@ -2162,10 +2208,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
 {
-       struct sp_node *n, *new2 = NULL;
+       struct sp_node *n;
+       int ret = 0;
 
-restart:
-       spin_lock(&sp->lock);
+       mutex_lock(&sp->mutex);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
@@ -2178,16 +2224,14 @@ restart:
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
+                               struct sp_node *new2;
+                               new2 = sp_alloc(end, n->end, n->policy);
                                if (!new2) {
-                                       spin_unlock(&sp->lock);
-                                       new2 = sp_alloc(end, n->end, n->policy);
-                                       if (!new2)
-                                               return -ENOMEM;
-                                       goto restart;
+                                       ret = -ENOMEM;
+                                       goto out;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
-                               new2 = NULL;
                                break;
                        } else
                                n->end = start;
@@ -2198,12 +2242,9 @@ restart:
        }
        if (new)
                sp_insert(sp, new);
-       spin_unlock(&sp->lock);
-       if (new2) {
-               mpol_put(new2->policy);
-               kmem_cache_free(sn_cache, new2);
-       }
-       return 0;
+out:
+       mutex_unlock(&sp->mutex);
+       return ret;
 }
 
 /**
@@ -2221,7 +2262,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
        int ret;
 
        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
-       spin_lock_init(&sp->lock);
+       mutex_init(&sp->mutex);
 
        if (mpol) {
                struct vm_area_struct pvma;
@@ -2275,7 +2316,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
-               kmem_cache_free(sn_cache, new);
+               sp_free(new);
        return err;
 }
 
@@ -2287,16 +2328,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
        if (!p->root.rb_node)
                return;
-       spin_lock(&p->lock);
+       mutex_lock(&p->mutex);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
-               rb_erase(&n->nd, &p->root);
-               mpol_put(n->policy);
-               kmem_cache_free(sn_cache, n);
+               sp_delete(p, n);
        }
-       spin_unlock(&p->lock);
+       mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
index ef726e8aa8e9ca56c0713abfa4b621f44a210d4c..a948be4b7ba7673aa77cf7b537d8a650cd558a15 100644 (file)
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                goto no_mlock;
 
-       if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+       if (!((vma->vm_flags & VM_DONTEXPAND) ||
                        is_vm_hugetlb_page(vma) ||
                        vma == get_gate_vma(current->mm))) {
 
index ae18a48e7e4e7944af308bbff226217ae7d1601e..a7cc936a16097cac127173f1c01c1265f06f5a3b 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end);
 
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 
        flush_dcache_mmap_lock(mapping);
        if (unlikely(vma->vm_flags & VM_NONLINEAR))
-               list_del_init(&vma->shared.vm_set.list);
+               list_del_init(&vma->shared.nonlinear);
        else
-               vma_prio_tree_remove(vma, &mapping->i_mmap);
+               vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
 }
 
 /*
- * Unlink a file-based vm structure from its prio_tree, to hide
+ * Unlink a file-based vm structure from its interval tree, to hide
  * vma from rmap and vmtruncate before freeing its page tables.
  */
 void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
        might_sleep();
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (vma->vm_file) {
+       if (vma->vm_file)
                fput(vma->vm_file);
-               if (vma->vm_flags & VM_EXECUTABLE)
-                       removed_exe_file_vma(vma->vm_mm);
-       }
        mpol_put(vma_policy(vma));
        kmem_cache_free(vm_area_cachep, vma);
        return next;
@@ -306,7 +297,7 @@ out:
        return retval;
 }
 
-#ifdef DEBUG_MM_RB
+#ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct rb_root *root)
 {
        int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
 {
        int bug = 0;
        int i = 0;
-       struct vm_area_struct *tmp = mm->mmap;
-       while (tmp) {
-               tmp = tmp->vm_next;
+       struct vm_area_struct *vma = mm->mmap;
+       while (vma) {
+               struct anon_vma_chain *avc;
+               list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                       anon_vma_interval_tree_verify(avc);
+               vma = vma->vm_next;
                i++;
        }
        if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
-static struct vm_area_struct *
-find_vma_prepare(struct mm_struct *mm, unsigned long addr,
-               struct vm_area_struct **pprev, struct rb_node ***rb_link,
-               struct rb_node ** rb_parent)
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
 {
-       struct vm_area_struct * vma;
-       struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
+       struct anon_vma_chain *avc;
+
+       list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+               anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+       struct anon_vma_chain *avc;
+
+       list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+               anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+               unsigned long end, struct vm_area_struct **pprev,
+               struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+       struct rb_node **__rb_link, *__rb_parent, *rb_prev;
 
        __rb_link = &mm->mm_rb.rb_node;
        rb_prev = __rb_parent = NULL;
-       vma = NULL;
 
        while (*__rb_link) {
                struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
                vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
 
                if (vma_tmp->vm_end > addr) {
-                       vma = vma_tmp;
-                       if (vma_tmp->vm_start <= addr)
-                               break;
+                       /* Fail if an existing vma overlaps the area */
+                       if (vma_tmp->vm_start < end)
+                               return -ENOMEM;
                        __rb_link = &__rb_parent->rb_left;
                } else {
                        rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
                *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
        *rb_link = __rb_link;
        *rb_parent = __rb_parent;
-       return vma;
+       return 0;
 }
 
 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
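
In sketch form, the update protocol the pre/post helpers above implement (demo_grow and new_end are hypothetical; exclusive mmap_sem and the root anon_vma mutex are assumed held, as in the expand_upwards()/expand_downwards() hunks later in this file):

static void demo_grow(struct vm_area_struct *vma, unsigned long new_end)
{
        anon_vma_interval_tree_pre_update_vma(vma);     /* detach from trees */
        vma->vm_end = new_end;                          /* edit the interval */
        anon_vma_interval_tree_post_update_vma(vma);    /* reinsert */
}
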
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
                if (unlikely(vma->vm_flags & VM_NONLINEAR))
                        vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
                else
-                       vma_prio_tree_insert(vma, &mapping->i_mmap);
+                       vma_interval_tree_insert(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
        }
 }
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree.  It has already been inserted into the prio_tree.
+ * mm's list and rbtree.  It has already been inserted into the interval tree.
  */
 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-       struct vm_area_struct *__vma, *prev;
+       struct vm_area_struct *prev;
        struct rb_node **rb_link, *rb_parent;
 
-       __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
-       BUG_ON(__vma && __vma->vm_start < vma->vm_end);
+       if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+                          &prev, &rb_link, &rb_parent))
+               BUG();
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        mm->map_count++;
 }
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        struct vm_area_struct *next = vma->vm_next;
        struct vm_area_struct *importer = NULL;
        struct address_space *mapping = NULL;
-       struct prio_tree_root *root = NULL;
+       struct rb_root *root = NULL;
        struct anon_vma *anon_vma = NULL;
        struct file *file = vma->vm_file;
        long adjust_next = 0;
@@ -559,7 +583,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                mutex_lock(&mapping->i_mmap_mutex);
                if (insert) {
                        /*
-                        * Put into prio_tree now, so instantiated pages
+                        * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again:                    remove_next = 1 + (end > next->vm_end);
 
        vma_adjust_trans_huge(vma, start, end, adjust_next);
 
-       /*
-        * When changing only vma->vm_end, we don't really need anon_vma
-        * lock. This is a fairly rare case by itself, but the anon_vma
-        * lock may be shared between many sibling processes.  Skipping
-        * the lock for brk adjustments makes a difference sometimes.
-        */
-       if (vma->anon_vma && (importer || start != vma->vm_start)) {
-               anon_vma = vma->anon_vma;
+       anon_vma = vma->anon_vma;
+       if (!anon_vma && adjust_next)
+               anon_vma = next->anon_vma;
+       if (anon_vma) {
+               VM_BUG_ON(adjust_next && next->anon_vma &&
+                         anon_vma != next->anon_vma);
                anon_vma_lock(anon_vma);
+               anon_vma_interval_tree_pre_update_vma(vma);
+               if (adjust_next)
+                       anon_vma_interval_tree_pre_update_vma(next);
        }
 
        if (root) {
                flush_dcache_mmap_lock(mapping);
-               vma_prio_tree_remove(vma, root);
+               vma_interval_tree_remove(vma, root);
                if (adjust_next)
-                       vma_prio_tree_remove(next, root);
+                       vma_interval_tree_remove(next, root);
        }
 
        vma->vm_start = start;
@@ -598,8 +623,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
 
        if (root) {
                if (adjust_next)
-                       vma_prio_tree_insert(next, root);
-               vma_prio_tree_insert(vma, root);
+                       vma_interval_tree_insert(next, root);
+               vma_interval_tree_insert(vma, root);
                flush_dcache_mmap_unlock(mapping);
        }
 
@@ -620,8 +645,12 @@ again:                     remove_next = 1 + (end > next->vm_end);
                __insert_vm_struct(mm, insert);
        }
 
-       if (anon_vma)
+       if (anon_vma) {
+               anon_vma_interval_tree_post_update_vma(vma);
+               if (adjust_next)
+                       anon_vma_interval_tree_post_update_vma(next);
                anon_vma_unlock(anon_vma);
+       }
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -636,8 +665,6 @@ again:                      remove_next = 1 + (end > next->vm_end);
                if (file) {
                        uprobe_munmap(next, next->vm_start, next->vm_end);
                        fput(file);
-                       if (next->vm_flags & VM_EXECUTABLE)
-                               removed_exe_file_vma(mm);
                }
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
                        struct file *file, unsigned long vm_flags)
 {
-       /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
-       if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
+       if (vma->vm_flags ^ vm_flags)
                return 0;
        if (vma->vm_file != file)
                return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
                        mm->exec_vm += pages;
        } else if (flags & stack_flags)
                mm->stack_vm += pages;
-       if (flags & (VM_RESERVED|VM_IO))
-               mm->reserved_vm += pages;
 }
 #endif /* CONFIG_PROC_FS */
 
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
                return 0;
 
        /* Specialty mapping? */
-       if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+       if (vm_flags & VM_PFNMAP)
                return 0;
 
        /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        /* Clear old maps */
        error = -ENOMEM;
 munmap_back:
-       vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-       if (vma && vma->vm_start < addr + len) {
+       if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
@@ -1306,8 +1329,6 @@ munmap_back:
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
-               if (vm_flags & VM_EXECUTABLE)
-                       added_exe_file_vma(mm);
 
                /* Can addr have changed??
                 *
@@ -1758,13 +1779,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
+                               anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
+                               anon_vma_interval_tree_post_update_vma(vma);
                                perf_event_mmap(vma);
                        }
                }
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma);
+       validate_mm(vma->vm_mm);
        return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,14 +1832,17 @@ int expand_downwards(struct vm_area_struct *vma,
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
+                               anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
+                               anon_vma_interval_tree_post_update_vma(vma);
                                perf_event_mmap(vma);
                        }
                }
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma);
+       validate_mm(vma->vm_mm);
        return error;
 }
 
@@ -1989,11 +2016,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
        if (anon_vma_clone(new, vma))
                goto out_free_mpol;
 
-       if (new->vm_file) {
+       if (new->vm_file)
                get_file(new->vm_file);
-               if (vma->vm_flags & VM_EXECUTABLE)
-                       added_exe_file_vma(mm);
-       }
 
        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);
@@ -2011,11 +2035,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
        /* Clean everything up if vma_adjust failed. */
        if (new->vm_ops && new->vm_ops->close)
                new->vm_ops->close(new);
-       if (new->vm_file) {
-               if (vma->vm_flags & VM_EXECUTABLE)
-                       removed_exe_file_vma(mm);
+       if (new->vm_file)
                fput(new->vm_file);
-       }
        unlink_anon_vmas(new);
  out_free_mpol:
        mpol_put(pol);
@@ -2200,8 +2221,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
         * Clear old maps.  this also does some error checking for us
         */
  munmap_back:
-       vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-       if (vma && vma->vm_start < addr + len) {
+       if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
@@ -2315,10 +2335,10 @@ void exit_mmap(struct mm_struct *mm)
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
  * then i_mmap_mutex is taken here.
  */
-int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-       struct vm_area_struct * __vma, * prev;
-       struct rb_node ** rb_link, * rb_parent;
+       struct vm_area_struct *prev;
+       struct rb_node **rb_link, *rb_parent;
 
        /*
         * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2336,8 +2356,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }
-       __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
-       if (__vma && __vma->vm_start < vma->vm_end)
+       if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+                          &prev, &rb_link, &rb_parent))
                return -ENOMEM;
        if ((vma->vm_flags & VM_ACCOUNT) &&
             security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2352,7 +2372,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-       unsigned long addr, unsigned long len, pgoff_t pgoff)
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks)
 {
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
@@ -2371,7 +2392,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                faulted_in_anon_vma = false;
        }
 
-       find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+       if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+               return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                        vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
        if (new_vma) {
@@ -2393,9 +2415,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON(faulted_in_anon_vma);
-                       *vmap = new_vma;
-               } else
-                       anon_vma_moveto_tail(new_vma);
+                       *vmap = vma = new_vma;
+               }
+               *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (new_vma) {
@@ -2410,15 +2432,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                        new_vma->vm_start = addr;
                        new_vma->vm_end = addr + len;
                        new_vma->vm_pgoff = pgoff;
-                       if (new_vma->vm_file) {
+                       if (new_vma->vm_file)
                                get_file(new_vma->vm_file);
-
-                               if (vma->vm_flags & VM_EXECUTABLE)
-                                       added_exe_file_vma(mm);
-                       }
                        if (new_vma->vm_ops && new_vma->vm_ops->open)
                                new_vma->vm_ops->open(new_vma);
                        vma_link(mm, new_vma, prev, rb_link, rb_parent);
+                       *need_rmap_locks = false;
                }
        }
        return new_vma;
@@ -2536,7 +2555,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-       if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+       if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
@@ -2552,7 +2571,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
                 * anon_vma->root->mutex.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
-                                      &anon_vma->root->head.next))
+                                      &anon_vma->root->rb_root.rb_node))
                        BUG();
        }
 }
@@ -2593,7 +2612,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * A single task can't take more than one mm_take_all_locks() in a row
  * or it would deadlock.
  *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
@@ -2640,13 +2659,13 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-       if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+       if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
-                * the vma so the users using the anon_vma->head will
+                * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, head.next
@@ -2654,7 +2673,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                 * anon_vma->root->mutex.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
-                                         &anon_vma->root->head.next))
+                                         &anon_vma->root->rb_root.rb_node))
                        BUG();
                anon_vma_unlock(anon_vma);
        }
index 862b60822d9f8c4761448dee6165974da8405ecb..c297142f0fe6dd096e5d3eb78df66f1d5f71cb46 100644 (file)
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;
 
        /*
-        * RCU here will block mmu_notifier_unregister until
+        * SRCU here will block mmu_notifier_unregister until
         * ->release returns.
         */
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
                /*
                 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 
        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
        spin_unlock(&mm->mmu_notifier_mm->lock);
 
        /*
-        * synchronize_rcu here prevents mmu_notifier_release to
+        * synchronize_srcu here prevents mmu_notifier_release from
         * returning to exit_mmap (which would proceed to free all pages
         * in the mm) until the ->release method returns, if it was
         * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
         * The mmu_notifier_mm can't go away from under us because one
         * mm_count is held by exit_mmap.
         */
-       synchronize_rcu();
+       synchronize_srcu(&srcu);
 }
 
 /*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
-       int young = 0;
+       int young = 0, id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->clear_flush_young)
                        young |= mn->ops->clear_flush_young(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 
        return young;
 }
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
-       int young = 0;
+       int young = 0, id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->test_young) {
                        young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
                                break;
                }
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 
        return young;
 }
@@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->change_pte)
                        mn->ops->change_pte(mn, mm, address, pte);
@@ -138,7 +144,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
                else if (mn->ops->invalidate_page)
                        mn->ops->invalidate_page(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +152,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_page)
                        mn->ops->invalidate_page(mn, mm, address);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +167,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_start)
                        mn->ops->invalidate_range_start(mn, mm, start, end);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +182,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
        struct mmu_notifier *mn;
        struct hlist_node *n;
+       int id;
 
-       rcu_read_lock();
+       id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
                if (mn->ops->invalidate_range_end)
                        mn->ops->invalidate_range_end(mn, mm, start, end);
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&srcu, id);
 }
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,22 +201,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 
        BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
-       ret = -ENOMEM;
-       mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
-       if (unlikely(!mmu_notifier_mm))
-               goto out;
+       /*
+        * Verify that mmu_notifier_init() has already run and the global
+        * srcu is initialized.
+        */
+       BUG_ON(!srcu.per_cpu_ref);
 
        if (take_mmap_sem)
                down_write(&mm->mmap_sem);
        ret = mm_take_all_locks(mm);
        if (unlikely(ret))
-               goto out_cleanup;
+               goto out;
 
        if (!mm_has_notifiers(mm)) {
+               mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
+                                       GFP_KERNEL);
+               if (unlikely(!mmu_notifier_mm)) {
+                       ret = -ENOMEM;
+                       goto out_of_mem;
+               }
                INIT_HLIST_HEAD(&mmu_notifier_mm->list);
                spin_lock_init(&mmu_notifier_mm->lock);
+
                mm->mmu_notifier_mm = mmu_notifier_mm;
-               mmu_notifier_mm = NULL;
        }
        atomic_inc(&mm->mm_count);
 
@@ -223,13 +239,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
        hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
        spin_unlock(&mm->mmu_notifier_mm->lock);
 
+out_of_mem:
        mm_drop_all_locks(mm);
-out_cleanup:
+out:
        if (take_mmap_sem)
                up_write(&mm->mmap_sem);
-       /* kfree() does nothing if mmu_notifier_mm is NULL */
-       kfree(mmu_notifier_mm);
-out:
+
        BUG_ON(atomic_read(&mm->mm_users) <= 0);
        return ret;
 }
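
As a usage sketch (not part of this patch; the ops table and handler below are
invented), a client registers against an mm like this, relying on the path
above to allocate mm->mmu_notifier_mm on first use:

        #include <linux/mmu_notifier.h>
        #include <linux/sched.h>

        static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
        {
                /* tear down secondary TLB / device mappings here */
        }

        static const struct mmu_notifier_ops demo_ops = {
                .release = demo_release,
        };

        static struct mmu_notifier demo_notifier = {
                .ops = &demo_ops,
        };

        static int demo_attach(void)
        {
                /* takes mmap_sem internally; mm_users must be non-zero */
                return mmu_notifier_register(&demo_notifier, current->mm);
        }
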
@@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +302,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 
        if (!hlist_unhashed(&mn->hlist)) {
                /*
-                * RCU here will force exit_mmap to wait ->release to finish
+                * SRCU here will force exit_mmap to wait for ->release to finish
                 * before freeing the pages.
                 */
-               rcu_read_lock();
+               int id;
 
+               id = srcu_read_lock(&srcu);
                /*
                 * exit_mmap will block in mmu_notifier_release to
                 * guarantee ->release is called before freeing the
@@ -299,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
-               rcu_read_unlock();
+               srcu_read_unlock(&srcu, id);
 
                spin_lock(&mm->mmu_notifier_mm->lock);
                hlist_del_rcu(&mn->hlist);
@@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
         * Wait for any running method to finish, of course including
         * ->release if it was run by mmu_notifier_release instead of us.
         */
-       synchronize_rcu();
+       synchronize_srcu(&srcu);
 
        BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
        mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+       return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
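
The RCU-to-SRCU conversion above follows the standard SRCU pattern; a minimal
sketch under invented names (demo_srcu, demo_reader, demo_teardown), showing
the read-side id that must be handed back to srcu_read_unlock():

        #include <linux/srcu.h>

        static struct srcu_struct demo_srcu;    /* init_srcu_struct() once at init */

        static void demo_reader(void)
        {
                int id = srcu_read_lock(&demo_srcu);
                /* read SRCU-protected state; may sleep, unlike plain RCU */
                srcu_read_unlock(&demo_srcu, id);
        }

        static void demo_teardown(void)
        {
                /* unpublish the object first (e.g. hlist_del_rcu()), then: */
                synchronize_srcu(&demo_srcu);   /* waits only on demo_srcu readers */
                /* now the object can be freed */
        }
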
index cc06d0e48d050dc0a2ec47cf62fd8d3b22b4a2be..3b639a4b26bd3a29d3f46318cb4f5be23abb54c6 100644 (file)
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
-               unsigned long new_addr)
+               unsigned long new_addr, bool need_rmap_locks)
 {
        struct address_space *mapping = NULL;
+       struct anon_vma *anon_vma = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
 
-       if (vma->vm_file) {
-               /*
-                * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock truncate_pagecache
-                * out, since it might clean the dst vma before the src vma,
-                * and we propagate stale pages into the dst afterward.
-                */
-               mapping = vma->vm_file->f_mapping;
-               mutex_lock(&mapping->i_mmap_mutex);
+       /*
+        * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+        * locks to ensure that rmap will always observe either the old or the
+        * new ptes. This is the easiest way to avoid races with
+        * truncate_pagecache(), page migration, etc.
+        *
+        * When need_rmap_locks is false, we use other ways to avoid
+        * such races:
+        *
+        * - During exec() shift_arg_pages(), we use a specially tagged vma
+        *   which rmap call sites look for using is_vma_temporary_stack().
+        *
+        * - During mremap(), new_vma is often known to be placed after vma
+        *   in rmap traversal order. This ensures rmap will always observe
+        *   either the old pte, or the new pte, or both (the page table locks
+        *   serialize access to individual ptes, but only rmap traversal
+        *   order guarantees that we won't miss both the old and new ptes).
+        */
+       if (need_rmap_locks) {
+               if (vma->vm_file) {
+                       mapping = vma->vm_file->f_mapping;
+                       mutex_lock(&mapping->i_mmap_mutex);
+               }
+               if (vma->anon_vma) {
+                       anon_vma = vma->anon_vma;
+                       anon_vma_lock(anon_vma);
+               }
        }
 
        /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
+       if (anon_vma)
+               anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
 }
@@ -122,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len)
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks)
 {
        unsigned long extent, next, old_end;
        pmd_t *old_pmd, *new_pmd;
@@ -169,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                               new_vma, new_pmd, new_addr);
+                         new_vma, new_pmd, new_addr, need_rmap_locks);
                need_flush = true;
        }
        if (likely(need_flush))
@@ -193,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long hiwater_vm;
        int split = 0;
        int err;
+       bool need_rmap_locks;
 
        /*
         * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                return err;
 
        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+                          &need_rmap_locks);
        if (!new_vma)
                return -ENOMEM;
 
-       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+                                    need_rmap_locks);
        if (moved_len < old_len) {
-               /*
-                * Before moving the page tables from the new vma to
-                * the old vma, we need to be sure the old vma is
-                * queued after new vma in the same_anon_vma list to
-                * prevent SMP races with rmap_walk (that could lead
-                * rmap_walk to miss some page table).
-                */
-               anon_vma_moveto_tail(vma);
-
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
-               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+                                true);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
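
Recapping the contract these hunks establish (copy_vma() is extended elsewhere
in this series to report, through *need_rmap_locks, whether the new vma can be
guaranteed to follow the old one in rmap traversal order):

        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks);
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len)
                /* the undo swaps src and dst, inverting rmap order: always lock */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true);
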
index 405573010f99a8b0d877dd980e284979ebd2ec69..bd82f6b314114dc937bea707a70072b299860a08 100644 (file)
@@ -162,8 +162,6 @@ unsigned long __init free_all_bootmem(void)
         * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
          *  because in some cases, e.g. when Node0 doesn't have RAM
          *  installed, low ram will be on Node1
-        * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
-        *  will be used instead of only Node0 related
         */
        return free_low_memory_core_early(MAX_NUMNODES);
 }
index d4b0c10872de59d8959262b1daac92d5d60eb80a..28ecc1af704348d28277827b03dfbbb3239a4ec8 100644 (file)
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 
                mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
-               vma_prio_tree_insert(vma, &mapping->i_mmap);
+               vma_interval_tree_insert(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
                mutex_unlock(&mapping->i_mmap_mutex);
        }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 
                mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
-               vma_prio_tree_remove(vma, &mapping->i_mmap);
+               vma_interval_tree_remove(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
                mutex_unlock(&mapping->i_mmap_mutex);
        }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
        kenter("%p", vma);
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (vma->vm_file) {
+       if (vma->vm_file)
                fput(vma->vm_file);
-               if (vma->vm_flags & VM_EXECUTABLE)
-                       removed_exe_file_vma(mm);
-       }
        put_nommu_region(vma->vm_region);
        kmem_cache_free(vm_area_cachep, vma);
 }
@@ -1286,10 +1283,6 @@ unsigned long do_mmap_pgoff(struct file *file,
                get_file(file);
                vma->vm_file = file;
                get_file(file);
-               if (vm_flags & VM_EXECUTABLE) {
-                       added_exe_file_vma(current->mm);
-                       vma->vm_mm = current->mm;
-               }
        }
 
        down_write(&nommu_region_sem);
@@ -1442,8 +1435,6 @@ error:
        kmem_cache_free(vm_region_jar, region);
        if (vma->vm_file)
                fput(vma->vm_file);
-       if (vma->vm_flags & VM_EXECUTABLE)
-               removed_exe_file_vma(vma->vm_mm);
        kmem_cache_free(vm_area_cachep, vma);
        kleave(" = %d", ret);
        return ret;
@@ -1822,7 +1813,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
        if (addr != (pfn << PAGE_SHIFT))
                return -EINVAL;
 
-       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+       vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
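
With VM_RESERVED gone from this path, a driver's ->mmap() handler now states
its intent explicitly via VM_DONTEXPAND | VM_DONTDUMP. A hedged sketch
(demo_mmap() and DEMO_PHYS_BASE are made up for illustration):

        #define DEMO_PHYS_BASE  0x40000000UL    /* invented device address */

        static int demo_mmap(struct file *file, struct vm_area_struct *vma)
        {
                vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
                return remap_pfn_range(vma, vma->vm_start,
                                       DEMO_PHYS_BASE >> PAGE_SHIFT,
                                       vma->vm_end - vma->vm_start,
                                       vma->vm_page_prot);
        }
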
@@ -1963,6 +1954,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+                            unsigned long size, pgoff_t pgoff)
+{
+       BUG();
+       return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, int write)
 {
@@ -2047,7 +2046,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
                                size_t newsize)
 {
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        struct vm_region *region;
        pgoff_t low, high;
        size_t r_size, r_top;
@@ -2059,8 +2057,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
        mutex_lock(&inode->i_mapping->i_mmap_mutex);
 
        /* search for VMAs that fall within the dead zone */
-       vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-                             low, high) {
+       vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
                /* found one - only interested if it's shared out of the page
                 * cache */
                if (vma->vm_flags & VM_SHARED) {
@@ -2076,8 +2073,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
         * we don't check for any regions that start beyond the EOF as there
         * shouldn't be any
         */
-       vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-                             0, ULONG_MAX) {
+       vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+                                 0, ULONG_MAX) {
                if (!(vma->vm_flags & VM_SHARED))
                        continue;
 
index 198600861638b9833ef6603198f980bf2b9fe67f..79e0f3e24831212d7be9635d7ee214755ed22ba3 100644 (file)
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
        task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-               "oom_adj=%d, oom_score_adj=%d\n",
-               current->comm, gfp_mask, order, current->signal->oom_adj,
+               "oom_score_adj=%d\n",
+               current->comm, gfp_mask, order,
                current->signal->oom_score_adj);
        cpuset_print_task_mems_allowed(current);
        task_unlock(current);
index c13ea7538891d85988b029236f11e233632b58ec..bc5229bf7948d10f084c790cd95a7cb16d3b25dc 100644 (file)
@@ -368,8 +368,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
        int nr_pages = 1 << order;
        int bad = 0;
 
-       if (unlikely(compound_order(page) != order) ||
-           unlikely(!PageHead(page))) {
+       if (unlikely(compound_order(page) != order)) {
                bad_page(page);
                bad++;
        }
@@ -672,8 +671,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
                        /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-                       __free_one_page(page, zone, 0, page_private(page));
-                       trace_mm_page_pcpu_drain(page, 0, page_private(page));
+                       __free_one_page(page, zone, 0,
+                               get_freepage_migratetype(page));
+                       trace_mm_page_pcpu_drain(page, 0,
+                               get_freepage_migratetype(page));
                } while (--to_free && --batch_free && !list_empty(list));
        }
        __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -722,6 +723,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 {
        unsigned long flags;
        int wasMlocked = __TestClearPageMlocked(page);
+       int migratetype;
 
        if (!free_pages_prepare(page, order))
                return;
@@ -730,8 +732,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_events(PGFREE, 1 << order);
-       free_one_page(page_zone(page), page, order,
-                                       get_pageblock_migratetype(page));
+       migratetype = get_pageblock_migratetype(page);
+       set_freepage_migratetype(page, migratetype);
+       free_one_page(page_zone(page), page, order, migratetype);
        local_irq_restore(flags);
 }
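
The get/set_freepage_migratetype() helpers are introduced in a header outside
this diff; judging by the set_page_private()/page_private() calls they replace
in the hunks above and below, a plausible shape is (an assumption, not the
patch's definitive definition):

        static inline void set_freepage_migratetype(struct page *page,
                                                    int migratetype)
        {
                set_page_private(page, migratetype);    /* assumed implementation */
        }

        static inline int get_freepage_migratetype(struct page *page)
        {
                return page_private(page);              /* assumed implementation */
        }
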
 
@@ -915,7 +918,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
  * Note that start_page and end_page are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-static int move_freepages(struct zone *zone,
+int move_freepages(struct zone *zone,
                          struct page *start_page, struct page *end_page,
                          int migratetype)
 {
@@ -951,6 +954,7 @@ static int move_freepages(struct zone *zone,
                order = page_order(page);
                list_move(&page->lru,
                          &zone->free_area[order].free_list[migratetype]);
+               set_freepage_migratetype(page, migratetype);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@ -1135,7 +1139,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
                                mt = migratetype;
                }
-               set_page_private(page, mt);
+               set_freepage_migratetype(page, mt);
                list = &page->lru;
        }
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
@@ -1302,7 +1306,7 @@ void free_hot_cold_page(struct page *page, int cold)
                return;
 
        migratetype = get_pageblock_migratetype(page);
-       set_page_private(page, migratetype);
+       set_freepage_migratetype(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
@@ -1380,16 +1384,11 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions except that the page
+ * is required at the given order and is isolated now to prevent races
+ * with parallel allocators.
  */
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
 {
        unsigned int order;
        unsigned long watermark;
@@ -1411,10 +1410,11 @@ int split_free_page(struct page *page)
        rmv_page_order(page);
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
 
-       /* Split into individual pages */
-       set_page_refcounted(page);
-       split_page(page, order);
+       if (alloc_order != order)
+               expand(zone, page, alloc_order, order,
+                       &zone->free_area[order], migratetype);
 
+       /* Set pageblock migratetype if captured page is at least a pageblock */
        if (order >= pageblock_order - 1) {
                struct page *endpage = page + (1 << order) - 1;
                for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1425,35 @@ int split_free_page(struct page *page)
                }
        }
 
-       return 1 << order;
+       return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+       unsigned int order;
+       int nr_pages;
+
+       BUG_ON(!PageBuddy(page));
+       order = page_order(page);
+
+       nr_pages = capture_free_page(page, order, 0);
+       if (!nr_pages)
+               return 0;
+
+       /* Split into individual pages */
+       set_page_refcounted(page);
+       split_page(page, order);
+       return nr_pages;
 }
 
 /*
@@ -2105,7 +2133,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        bool *contended_compaction, bool *deferred_compaction,
        unsigned long *did_some_progress)
 {
-       struct page *page;
+       struct page *page = NULL;
 
        if (!order)
                return NULL;
@@ -2118,10 +2146,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                nodemask, sync_migration,
-                                               contended_compaction);
+                                               contended_compaction, &page);
        current->flags &= ~PF_MEMALLOC;
-       if (*did_some_progress != COMPACT_SKIPPED) {
 
+       /* If compaction captured a page, prep and use it */
+       if (page) {
+               prep_new_page(page, order, gfp_mask);
+               goto got_page;
+       }
+
+       if (*did_some_progress != COMPACT_SKIPPED) {
                /* Page migration frees to the PCP lists but we want merging */
                drain_pages(get_cpu());
                put_cpu();
@@ -2131,6 +2165,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                alloc_flags & ~ALLOC_NO_WATERMARKS,
                                preferred_zone, migratetype);
                if (page) {
+got_page:
                        preferred_zone->compact_considered = 0;
                        preferred_zone->compact_defer_shift = 0;
                        if (order >= preferred_zone->compact_order_failed)
@@ -2362,9 +2397,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                goto nopage;
 
 restart:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
-               wake_all_kswapd(order, zonelist, high_zoneidx,
-                                               zone_idx(preferred_zone));
+       wake_all_kswapd(order, zonelist, high_zoneidx,
+                                       zone_idx(preferred_zone));
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2475,7 @@ rebalance:
         * system then fail the allocation instead of entering direct reclaim.
         */
        if ((deferred_compaction || contended_compaction) &&
-                                               (gfp_mask & __GFP_NO_KSWAPD))
+           (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
                goto nopage;
 
        /* Try direct reclaim and then allocating */
@@ -4879,7 +4913,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
                               zone_movable_pfn[i] << PAGE_SHIFT);
        }
 
-       /* Print out the early_node_map[] */
+       /* Print out the early node map */
        printk("Early memory node ranges\n");
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                printk("  node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,18 +5653,6 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
                                pageblock_nr_pages));
 }
 
-static struct page *
-__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
-                            int **resultp)
-{
-       gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
-       if (PageHighMem(page))
-               gfp_mask |= __GFP_HIGHMEM;
-
-       return alloc_page(gfp_mask);
-}
-
 /* [start, end) must belong to a single zone. */
 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
 {
@@ -5670,8 +5692,10 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
                        break;
                }
 
+               reclaim_clean_pages_from_list(cc.zone, &cc.migratepages);
+
                ret = migrate_pages(&cc.migratepages,
-                                   __alloc_contig_migrate_alloc,
+                                   alloc_migrate_target,
                                    0, false, MIGRATE_SYNC);
        }
 
index 247d1f175739247718d087aee8bba3c810daaa46..49c617e1d1212b55223ef31154013f7345f475bf 100644 (file)
@@ -193,10 +193,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
                        continue;
                }
                page = pfn_to_page(pfn);
-               if (PageBuddy(page))
+               if (PageBuddy(page)) {
+                       /*
+                        * If a race between isolation and allocation happens,
+                        * some free pages could be on the MIGRATE_MOVABLE list
+                        * although the pageblock's migratetype is
+                        * MIGRATE_ISOLATE. Catch it and move the page onto the
+                        * MIGRATE_ISOLATE list.
+                        */
+                       if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
+                               struct page *end_page;
+
+                               end_page = page + (1 << page_order(page)) - 1;
+                               move_freepages(page_zone(page), page, end_page,
+                                               MIGRATE_ISOLATE);
+                       }
                        pfn += 1 << page_order(page);
+               }
                else if (page_count(page) == 0 &&
-                               page_private(page) == MIGRATE_ISOLATE)
+                       get_freepage_migratetype(page) == MIGRATE_ISOLATE)
                        pfn += 1;
                else
                        break;
@@ -233,3 +248,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
        spin_unlock_irqrestore(&zone->lock, flags);
        return ret ? 0 : -EBUSY;
 }
+
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+                                 int **resultp)
+{
+       gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+       if (PageHighMem(page))
+               gfp_mask |= __GFP_HIGHMEM;
+
+       return alloc_page(gfp_mask);
+}
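
A usage sketch mirroring the CMA path updated earlier in this series
(pagelist and err are placeholders): alloc_migrate_target() is handed to
migrate_pages() as the new-page allocator for an isolated range:

        LIST_HEAD(pagelist);            /* pages isolated by the caller */
        int err;

        /* ... isolate pages into pagelist ... */

        err = migrate_pages(&pagelist, alloc_migrate_target,
                            0, false, MIGRATE_SYNC);
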
index 74c0ddaa6fa0df019c590994fa79b136dd1b4abd..e642627da6b75d7c6cf5258a59cfde49dca7c89e 100644 (file)
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+{
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       if (!mm->pmd_huge_pte)
+               INIT_LIST_HEAD(&pgtable->lru);
+       else
+               list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+       mm->pmd_huge_pte = pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+{
+       pgtable_t pgtable;
+
+       assert_spin_locked(&mm->page_table_lock);
+
+       /* FIFO */
+       pgtable = mm->pmd_huge_pte;
+       if (list_empty(&pgtable->lru))
+               mm->pmd_huge_pte = NULL;
+       else {
+               mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+                                             struct page, lru);
+               list_del(&pgtable->lru);
+       }
+       return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                    pmd_t *pmdp)
+{
+       set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+       flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
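
A hypothetical sequence showing how the deposit/withdraw pair above is meant
to be used by THP code (simplified; error handling omitted, and the caller
names are illustrative):

        spin_lock(&mm->page_table_lock);
        pgtable_trans_huge_deposit(mm, pgtable); /* park pte page at collapse time */
        spin_unlock(&mm->page_table_lock);

        /* ... later, when the huge pmd must be split again ... */

        spin_lock(&mm->page_table_lock);
        pgtable = pgtable_trans_huge_withdraw(mm); /* take the parked page back */
        spin_unlock(&mm->page_table_lock);
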
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644 (file)
index 799dcfd..0000000
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * mm/prio_tree.c - priority search tree for mapping->i_mmap
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004   Initial version
- */
-
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-#include <linux/prefetch.h>
-
-/*
- * See lib/prio_tree.c for details on the general radix priority search tree
- * code.
- */
-
-/*
- * The following #defines are mirrored from lib/prio_tree.c. They're only used
- * for debugging, and should be removed (along with the debugging code using
- * them) when switching also VMAs to the regular prio_tree code.
- */
-
-#define RADIX_INDEX(vma)  ((vma)->vm_pgoff)
-#define VMA_SIZE(vma)    (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma)   ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-/*
- * Radix priority search tree for address_space->i_mmap
- *
- * For each vma that map a unique set of file pages i.e., unique [radix_index,
- * heap_index] value, we have a corresponding priority search tree node. If
- * multiple vmas have identical [radix_index, heap_index] value, then one of
- * them is used as a tree node and others are stored in a vm_set list. The tree
- * node points to the first vma (head) of the list using vm_set.head.
- *
- * prio_tree_root
- *      |
- *      A       vm_set.head
- *     / \      /
- *    L   R -> H-I-J-K-M-N-O-P-Q-S
- *    ^   ^    <-- vm_set.list -->
- *  tree nodes
- *
- * We need some way to identify whether a vma is a tree node, head of a vm_set
- * list, or just a member of a vm_set list. We cannot use vm_flags to store
- * such information. The reason is, in the above figure, it is possible that
- * vm_flags' of R and H are covered by the different mmap_sems. When R is
- * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
- * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
- * That's why some trick involving shared.vm_set.parent is used for identifying
- * tree nodes and list head nodes.
- *
- * vma radix priority search tree node rules:
- *
- * vma->shared.vm_set.parent != NULL    ==> a tree node
- *      vma->shared.vm_set.head != NULL ==> list of others mapping same range
- *      vma->shared.vm_set.head == NULL ==> no others map the same range
- *
- * vma->shared.vm_set.parent == NULL
- *     vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
- *     vma->shared.vm_set.head == NULL ==> a list node
- */
-
-/*
- * Add a new vma known to map the same set of pages as the old vma:
- * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
- * Note that it just happens to work correctly on i_mmap_nonlinear too.
- */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
-{
-       /* Leave these BUG_ONs till prio_tree patch stabilizes */
-       BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
-       BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
-
-       vma->shared.vm_set.head = NULL;
-       vma->shared.vm_set.parent = NULL;
-
-       if (!old->shared.vm_set.parent)
-               list_add(&vma->shared.vm_set.list,
-                               &old->shared.vm_set.list);
-       else if (old->shared.vm_set.head)
-               list_add_tail(&vma->shared.vm_set.list,
-                               &old->shared.vm_set.head->shared.vm_set.list);
-       else {
-               INIT_LIST_HEAD(&vma->shared.vm_set.list);
-               vma->shared.vm_set.head = old;
-               old->shared.vm_set.head = vma;
-       }
-}
-
-void vma_prio_tree_insert(struct vm_area_struct *vma,
-                         struct prio_tree_root *root)
-{
-       struct prio_tree_node *ptr;
-       struct vm_area_struct *old;
-
-       vma->shared.vm_set.head = NULL;
-
-       ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
-       if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
-               old = prio_tree_entry(ptr, struct vm_area_struct,
-                                       shared.prio_tree_node);
-               vma_prio_tree_add(vma, old);
-       }
-}
-
-void vma_prio_tree_remove(struct vm_area_struct *vma,
-                         struct prio_tree_root *root)
-{
-       struct vm_area_struct *node, *head, *new_head;
-
-       if (!vma->shared.vm_set.head) {
-               if (!vma->shared.vm_set.parent)
-                       list_del_init(&vma->shared.vm_set.list);
-               else
-                       raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
-       } else {
-               /* Leave this BUG_ON till prio_tree patch stabilizes */
-               BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
-               if (vma->shared.vm_set.parent) {
-                       head = vma->shared.vm_set.head;
-                       if (!list_empty(&head->shared.vm_set.list)) {
-                               new_head = list_entry(
-                                       head->shared.vm_set.list.next,
-                                       struct vm_area_struct,
-                                       shared.vm_set.list);
-                               list_del_init(&head->shared.vm_set.list);
-                       } else
-                               new_head = NULL;
-
-                       raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
-                                       &head->shared.prio_tree_node);
-                       head->shared.vm_set.head = new_head;
-                       if (new_head)
-                               new_head->shared.vm_set.head = head;
-
-               } else {
-                       node = vma->shared.vm_set.head;
-                       if (!list_empty(&vma->shared.vm_set.list)) {
-                               new_head = list_entry(
-                                       vma->shared.vm_set.list.next,
-                                       struct vm_area_struct,
-                                       shared.vm_set.list);
-                               list_del_init(&vma->shared.vm_set.list);
-                               node->shared.vm_set.head = new_head;
-                               new_head->shared.vm_set.head = node;
-                       } else
-                               node->shared.vm_set.head = NULL;
-               }
-       }
-}
-
-/*
- * Helper function to enumerate vmas that map a given file page or a set of
- * contiguous file pages. The function returns vmas that at least map a single
- * page in the given range of contiguous file pages.
- */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
-                                       struct prio_tree_iter *iter)
-{
-       struct prio_tree_node *ptr;
-       struct vm_area_struct *next;
-
-       if (!vma) {
-               /*
-                * First call is with NULL vma
-                */
-               ptr = prio_tree_next(iter);
-               if (ptr) {
-                       next = prio_tree_entry(ptr, struct vm_area_struct,
-                                               shared.prio_tree_node);
-                       prefetch(next->shared.vm_set.head);
-                       return next;
-               } else
-                       return NULL;
-       }
-
-       if (vma->shared.vm_set.parent) {
-               if (vma->shared.vm_set.head) {
-                       next = vma->shared.vm_set.head;
-                       prefetch(next->shared.vm_set.list.next);
-                       return next;
-               }
-       } else {
-               next = list_entry(vma->shared.vm_set.list.next,
-                               struct vm_area_struct, shared.vm_set.list);
-               if (!next->shared.vm_set.head) {
-                       prefetch(next->shared.vm_set.list.next);
-                       return next;
-               }
-       }
-
-       ptr = prio_tree_next(iter);
-       if (ptr) {
-               next = prio_tree_entry(ptr, struct vm_area_struct,
-                                       shared.prio_tree_node);
-               prefetch(next->shared.vm_set.head);
-               return next;
-       } else
-               return NULL;
-}
index 0f3b7cda2a24c5705ea4ad6e7ef127f53fdf3633..42ea4ddd586503ddbe47e03cc73b980a885d9048 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
        avc->vma = vma;
        avc->anon_vma = anon_vma;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
-
-       /*
-        * It's critical to add new vmas to the tail of the anon_vma,
-        * see comment in huge_memory.c:__split_huge_page().
-        */
-       list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+       anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
 }
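
The lookup side paired with this insertion appears repeatedly later in this
file; in outline (handle() is a placeholder), it visits only the vmas whose
file range covers the page, instead of walking every vma on the old list:

        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct anon_vma_chain *avc;

        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                handle(page, vma, vma_address(page, vma));      /* placeholder */
        }
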
 
 /**
@@ -268,51 +263,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
        return -ENOMEM;
 }
 
-/*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
-       struct anon_vma_chain *pavc;
-       struct anon_vma *root = NULL;
-
-       list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
-               struct anon_vma *anon_vma = pavc->anon_vma;
-               VM_BUG_ON(pavc->vma != dst);
-               root = lock_anon_vma_root(root, anon_vma);
-               list_del(&pavc->same_anon_vma);
-               list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
-       }
-       unlock_anon_vma_root(root);
-}
-
 /*
  * Attach vma to its own anon_vma, as well as to the anon_vmas that
  * the corresponding VMA in the parent process is attached to.
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
                struct anon_vma *anon_vma = avc->anon_vma;
 
                root = lock_anon_vma_root(root, anon_vma);
-               list_del(&avc->same_anon_vma);
+               anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
 
                /*
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
-               if (list_empty(&anon_vma->head))
+               if (RB_EMPTY_ROOT(&anon_vma->rb_root))
                        continue;
 
                list_del(&avc->same_vma);
@@ -416,7 +366,10 @@ static void anon_vma_ctor(void *data)
 
        mutex_init(&anon_vma->mutex);
        atomic_set(&anon_vma->refcount, 0);
-       INIT_LIST_HEAD(&anon_vma->head);
+#ifdef CONFIG_SWAP
+       atomic_set(&anon_vma->swapra_miss, 0);
+#endif
+       anon_vma->rb_root = RB_ROOT;
 }
 
 void __init anon_vma_init(void)
@@ -560,22 +513,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
 
 /*
  * At what user virtual address is page expected in @vma?
- * Returns virtual address or -EFAULT if page's index/offset is not
- * within the range mapped the @vma.
  */
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
 {
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       unsigned long address;
 
        if (unlikely(is_vm_hugetlb_page(vma)))
                pgoff = page->index << huge_page_order(page_hstate(page));
-       address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-       if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-               /* page should be within @vma mapping range */
-               return -EFAULT;
-       }
+
+       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+       unsigned long address = __vma_address(page, vma);
+
+       /* page should be within @vma mapping range */
+       VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
        return address;
 }
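
A worked instance of __vma_address()'s arithmetic, with made-up numbers and
4 KiB pages: a page with index 0x10, mapped by a vma with vm_start =
0x700000000000 and vm_pgoff = 0x8, is expected at

        0x700000000000 + ((0x10 - 0x8) << 12) = 0x700000008000

which the new VM_BUG_ON() then checks against the [vm_start, vm_end) range.
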
 
@@ -585,6 +542,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
+       unsigned long address;
        if (PageAnon(page)) {
                struct anon_vma *page__anon_vma = page_anon_vma(page);
                /*
@@ -600,7 +558,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
                        return -EFAULT;
        } else
                return -EFAULT;
-       return vma_address(page, vma);
+       address = __vma_address(page, vma);
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+               return -EFAULT;
+       return address;
 }
 
 /*
@@ -674,8 +635,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
        pte_t *pte;
        spinlock_t *ptl;
 
-       address = vma_address(page, vma);
-       if (address == -EFAULT)         /* out of vma range */
+       address = __vma_address(page, vma);
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                return 0;
        pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
        if (!pte)                       /* the page is not in this mm */
@@ -769,6 +730,7 @@ static int page_referenced_anon(struct page *page,
 {
        unsigned int mapcount;
        struct anon_vma *anon_vma;
+       pgoff_t pgoff;
        struct anon_vma_chain *avc;
        int referenced = 0;
 
@@ -777,11 +739,10 @@ static int page_referenced_anon(struct page *page,
                return referenced;
 
        mapcount = page_mapcount(page);
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
@@ -820,7 +781,6 @@ static int page_referenced_file(struct page *page,
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        int referenced = 0;
 
        /*
@@ -846,10 +806,8 @@ static int page_referenced_file(struct page *page,
         */
        mapcount = page_mapcount(page);
 
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
 {
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        int ret = 0;
 
        BUG_ON(PageAnon(page));
 
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (vma->vm_flags & VM_SHARED) {
                        unsigned long address = vma_address(page, vma);
-                       if (address == -EFAULT)
-                               continue;
                        ret += page_mkclean_one(page, vma, address);
                }
        }
@@ -1492,6 +1447,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
        struct anon_vma *anon_vma;
+       pgoff_t pgoff;
        struct anon_vma_chain *avc;
        int ret = SWAP_AGAIN;
 
@@ -1499,7 +1455,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
        if (!anon_vma)
                return ret;
 
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address;
 
@@ -1516,8 +1473,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
                        continue;
 
                address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                ret = try_to_unmap_one(page, vma, address, flags);
                if (ret != SWAP_AGAIN || !page_mapped(page))
                        break;
@@ -1547,7 +1502,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        int ret = SWAP_AGAIN;
        unsigned long cursor;
        unsigned long max_nl_cursor = 0;
@@ -1555,10 +1509,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
        unsigned int mapcount;
 
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                ret = try_to_unmap_one(page, vma, address, flags);
                if (ret != SWAP_AGAIN || !page_mapped(page))
                        goto out;
@@ -1576,7 +1528,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
                goto out;
 
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-                                               shared.vm_set.list) {
+                                                       shared.nonlinear) {
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
                        max_nl_cursor = cursor;
@@ -1608,7 +1560,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-                                               shared.vm_set.list) {
+                                                       shared.nonlinear) {
                        cursor = (unsigned long) vma->vm_private_data;
                        while ( cursor < max_nl_cursor &&
                                cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1583,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
         * in locked vmas).  Reset cursor on all unreserved nonlinear
         * vmas, now forgetting on which ones it had fallen behind.
         */
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
                vma->vm_private_data = NULL;
 out:
        mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1668,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
                struct vm_area_struct *, unsigned long, void *), void *arg)
 {
        struct anon_vma *anon_vma;
+       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct anon_vma_chain *avc;
        int ret = SWAP_AGAIN;
 
@@ -1729,11 +1682,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
        if (!anon_vma)
                return ret;
        anon_vma_lock(anon_vma);
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                ret = rmap_one(page, vma, address, arg);
                if (ret != SWAP_AGAIN)
                        break;
@@ -1748,16 +1699,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
        int ret = SWAP_AGAIN;
 
        if (!mapping)
                return ret;
        mutex_lock(&mapping->i_mmap_mutex);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+       vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long address = vma_address(page, vma);
-               if (address == -EFAULT)
-                       continue;
                ret = rmap_one(page, vma, address, arg);
                if (ret != SWAP_AGAIN)
                        break;
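
The rmap conversion above repeats one pattern: the interval tree only yields vmas whose file range actually overlaps the page's index, so the old "address == -EFAULT" recheck after vma_address() can go away. A minimal sketch of the resulting walk, same shape as the hunks above (locking and SWAP_* handling as in rmap_walk_file()):

	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;

	mutex_lock(&mapping->i_mmap_mutex);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/* Every vma returned here overlaps pgoff, so vma_address()
		 * cannot fail with -EFAULT any more. */
		unsigned long address = vma_address(page, vma);
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	mutex_unlock(&mapping->i_mmap_mutex);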
index d3752110c8c7ee29b7a0d98947366482984c18ca..217aa9cd59df904e5031cef207052e65aec34900 100644 (file)
@@ -922,6 +922,7 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
        pvma.vm_pgoff = index + info->vfs_inode.i_ino;
        pvma.vm_ops = NULL;
        pvma.vm_policy = spol;
+       pvma.anon_vma = NULL;
        return swapin_readahead(swap, gfp, &pvma, 0);
 }
 
@@ -1339,7 +1340,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
        file_accessed(file);
        vma->vm_ops = &shmem_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
@@ -2643,6 +2643,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
 #endif
+       .remap_pages    = generic_file_remap_pages,
 };
 
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2836,7 +2837,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
                fput(vma->vm_file);
        vma->vm_file = file;
        vma->vm_ops = &shmem_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
        return 0;
 }
 
index 0a6b5671dd96517faf26f615e128375ca55f7cf9..a23b70ff38cb65c3d9d6de3539a93aa0b03756d5 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -803,6 +803,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 }
 
+#if DEBUG
 #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
 
 static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -812,6 +813,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
               function, cachep->name, msg);
        dump_stack();
 }
+#endif
 
 /*
  * By default on NUMA we use alien caches to stage the freeing of
index 77825883298f1f1843e068c2e5ad0d55706cf873..f76c76c7501b5ae47cf0144791a7d1626c65f26d 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
+/*
+ * Order of operations is important: flush the pagevec when it's already
+ * full, not when adding the last page, to make sure that last page is
+ * not added to the LRU directly when passed to this function. Because
+ * mark_page_accessed() (called after this when writing) only activates
+ * pages that are on the LRU, linear writes in subpage chunks would see
+ * every PAGEVEC_SIZE-th page activated, which is unexpected.
+ */
 void __lru_cache_add(struct page *page, enum lru_list lru)
 {
        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 
        page_cache_get(page);
-       if (!pagevec_add(pvec, page))
+       if (!pagevec_space(pvec))
                __pagevec_lru_add(pvec, lru);
+       pagevec_add(pvec, page);
        put_cpu_var(lru_add_pvecs);
 }
 EXPORT_SYMBOL(__lru_cache_add);
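
The comment's point is easiest to see side by side; an illustrative sketch of the old and new orderings (not a further change, just the two behaviours spelled out):

	/* Before: add first, then flush when the add reports the pagevec
	 * is full.  The page that filled the pagevec is already on the
	 * LRU by the time __lru_cache_add() returns, so a following
	 * mark_page_accessed() activates it. */
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec, lru);

	/* After: flush a full pagevec first, then add.  The page just
	 * passed in always ends up in the per-cpu pagevec, never
	 * directly on the LRU. */
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec, lru);
	pagevec_add(pvec, page);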
index 0cb36fb1f61cc539baa143319c40da30ada3d04e..d1f6c2df820e995ffb067597e779c9a491d8bfe7 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/page_cgroup.h>
 
 #include <asm/pgtable.h>
+#include "internal.h"
 
 /*
  * swapper_space is a fiction, retained to simplify the path through
@@ -379,6 +380,10 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
        unsigned long mask = (1UL << page_cluster) - 1;
        struct blk_plug plug;
 
+       swap_cache_miss(vma);
+       if (swap_cache_skip_readahead(vma))
+               goto skip;
+
        /* Read a page_cluster sized and aligned cluster around offset. */
        start_offset = offset & ~mask;
        end_offset = offset | mask;
@@ -397,5 +402,6 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
        blk_finish_plug(&plug);
 
        lru_add_drain();        /* Push any new pages onto the LRU now */
+skip:
        return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
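
swap_cache_miss() and swap_cache_skip_readahead() come from an earlier patch in this series (hence the new "internal.h" include); their bodies are not part of this hunk. Purely as a hypothetical sketch of the idea, with an invented field and threshold that are not the series' actual code:

	/* Hypothetical sketch only: count swap-cache misses on this vma
	 * and skip readahead once misses pile up, on the theory that the
	 * extra pages are not being used.  'swra_misses' and the limit
	 * below are invented for illustration. */
	static inline void swap_cache_miss(struct vm_area_struct *vma)
	{
		if (vma)
			vma->swra_misses++;
	}

	static inline bool swap_cache_skip_readahead(struct vm_area_struct *vma)
	{
		return vma && vma->swra_misses > 64;
	}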
index 2bb90b1d241cc872da1e13dd80b449b2d323e812..8de704679bfc595be27703c081c8b2fc57e36e10 100644 (file)
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                usize -= PAGE_SIZE;
        } while (usize > 0);
 
-       /* Prevent "things" like memory migration? VM_flags need a cleanup... */
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
        return 0;
 }
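
This is the first of many VM_RESERVED conversions in this merge (the selinux and sound hunks below get the same treatment). VM_RESERVED had accumulated several loosely related meanings; the series retires the flag and spells out the two properties such mappings actually rely on:

	vma->vm_flags |= VM_DONTEXPAND	/* mremap() must not grow the vma */
			| VM_DONTDUMP;	/* and it is left out of core dumps */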
index 99b434b674c02b4099bcfe58632ad56e183801fd..f8f56f8bc9c799e197ce1a09414b5bf979d55857 100644 (file)
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
 static unsigned long shrink_page_list(struct list_head *page_list,
                                      struct zone *zone,
                                      struct scan_control *sc,
+                                     enum ttu_flags ttu_flags,
                                      unsigned long *ret_nr_dirty,
-                                     unsigned long *ret_nr_writeback)
+                                     unsigned long *ret_nr_writeback,
+                                     bool force_reclaim)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
        mem_cgroup_uncharge_start();
        while (!list_empty(page_list)) {
-               enum page_references references;
                struct address_space *mapping;
                struct page *page;
                int may_enter_fs;
+               enum page_references references = PAGEREF_RECLAIM;
 
                cond_resched();
 
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        wait_on_page_writeback(page);
                }
 
-               references = page_check_references(page, sc);
+               if (!force_reclaim)
+                       references = page_check_references(page, sc);
+
                switch (references) {
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, TTU_UNMAP)) {
+                       switch (try_to_unmap(page, ttu_flags)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
        return nr_reclaimed;
 }
 
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+                                           struct list_head *page_list)
+{
+       struct scan_control sc = {
+               .gfp_mask = GFP_KERNEL,
+               .priority = DEF_PRIORITY,
+               .may_unmap = 1,
+       };
+       unsigned long ret, dummy1, dummy2;
+       struct page *page, *next;
+       LIST_HEAD(clean_pages);
+
+       list_for_each_entry_safe(page, next, page_list, lru) {
+               if (page_is_file_cache(page) && !PageDirty(page)) {
+                       ClearPageActive(page);
+                       list_move(&page->lru, &clean_pages);
+               }
+       }
+
+       ret = shrink_page_list(&clean_pages, zone, &sc,
+                               TTU_UNMAP|TTU_IGNORE_ACCESS,
+                               &dummy1, &dummy2, true);
+       list_splice(&clean_pages, page_list);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+       return ret;
+}
+
 /*
  * Attempt to remove the specified page from its LRU.  Only take this page
  * if it is of the appropriate PageActive status.  Pages which are being
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        if (nr_taken == 0)
                return 0;
 
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc,
-                                               &nr_dirty, &nr_writeback);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+                                       &nr_dirty, &nr_writeback, false);
 
        spin_lock_irq(&zone->lru_lock);
 
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
        return false;
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * If compaction is deferred for sc->order then scale the number of pages
+ * reclaimed based on the number of consecutive allocation failures
+ */
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+                       struct lruvec *lruvec, struct scan_control *sc)
+{
+       struct zone *zone = lruvec_zone(lruvec);
+
+       if (zone->compact_order_failed <= sc->order)
+               pages_for_compaction <<= zone->compact_defer_shift;
+       return pages_for_compaction;
+}
+#else
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+                       struct lruvec *lruvec, struct scan_control *sc)
+{
+       return pages_for_compaction;
+}
+#endif
+
 /*
  * Reclaim/compaction is used for high-order allocation requests. It reclaims
  * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
+
+       pages_for_compaction = scale_for_compaction(pages_for_compaction,
+                                                   lruvec, sc);
        inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
        if (nr_swap_pages > 0)
                inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
@@ -3101,9 +3157,9 @@ int kswapd_run(int nid)
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
-               printk("Failed to start kswapd on node %d\n",nid);
                pgdat->kswapd = NULL;
-               ret = -1;
+               pr_err("Failed to start kswapd on node %d\n", nid);
+               ret = PTR_ERR(pgdat->kswapd);
        }
        return ret;
 }
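
Two notes on the vmscan changes. reclaim_clean_pages_from_list() is added for the contiguous-allocation path elsewhere in this series, which prefers to discard clean file pages from an isolated list rather than migrate them. And the compaction scaling is easiest to see with numbers; a worked example (values assumed for illustration):

	/* An order-2 request starts from 2UL << 2 = 8 pages. */
	unsigned long pages_for_compaction = 2UL << sc->order;

	/* If this zone keeps failing order-2 compaction and its
	 * compact_defer_shift has climbed to 4, the reclaim target
	 * becomes 8 << 4 = 128 pages before compaction is retried. */
	if (zone->compact_order_failed <= sc->order)
		pages_for_compaction <<= zone->compact_defer_shift;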
index 42119c05e82c023b777298e9f41b6a6ebbade0cb..7a392b9287954aab2025af988cc7c4e68add1e00 100644 (file)
@@ -213,7 +213,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
        kref_init(&req->r_kref);
        init_completion(&req->r_completion);
        init_completion(&req->r_safe_completion);
-       rb_init_node(&req->r_node);
        INIT_LIST_HEAD(&req->r_unsafe_item);
        INIT_LIST_HEAD(&req->r_linger_item);
        INIT_LIST_HEAD(&req->r_linger_osd);
index 6a3ee981931d3c2b4438cff1d220ec2e8039bd7e..afa44595f34868184bef73440f0c68f71c538452 100644 (file)
@@ -209,7 +209,7 @@ endif
 # >$< substitution to preserve $ when reloading .cmd file
 # note: when using inline perl scripts [perl -e '...$$t=1;...']
 # in $(cmd_xxx) double $$ your perl vars
-make-cmd = $(subst \#,\\\#,$(subst $$,$$$$,$(call escsq,$(cmd_$(1)))))
+make-cmd = $(subst \\,\\\\,$(subst \#,\\\#,$(subst $$,$$$$,$(call escsq,$(cmd_$(1))))))
 
 # Find any prerequisites that is newer than target or that does not exist.
 # PHONY targets skipped in both cases.
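
The extra outer substitution in make-cmd matters for commands that contain literal backslashes: cmd_* strings are saved to .cmd files and read back on the next build to decide whether anything changed, and on that round trip a lone backslash (including the one in the already-escaped \# sequence) is consumed by make, so such commands would always compare as changed and force a spurious rebuild. Doubling every backslash before saving keeps the reloaded string byte-identical.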
index ca05ba217f5fd4038bcc511adff8a5b95823fb3e..8e5ac71f42ee8fe99362acc42bfa089e4cb69f80 100755 (executable)
@@ -421,7 +421,7 @@ sub top_of_kernel_tree {
                }
        }
        return 1;
-    }
+}
 
 sub parse_email {
        my ($formatted_email) = @_;
@@ -1386,6 +1386,8 @@ sub process {
        my $in_header_lines = 1;
        my $in_commit_log = 0;          #Scanning lines before patch
 
+       my $non_utf8_charset = 0;
+
        our @report = ();
        our $cnt_lines = 0;
        our $cnt_error = 0;
@@ -1686,10 +1688,17 @@ sub process {
                        $in_commit_log = 1;
                }
 
-# Still not yet in a patch, check for any UTF-8
-               if ($in_commit_log && $realfile =~ /^$/ &&
+# Check if there is UTF-8 in a commit log when a mail header has explicitly
+# declined it, i.e. declared some charset other than UTF-8.
+               if ($in_header_lines &&
+                   $rawline =~ /^Content-Type:.+charset="(.+)".*$/ &&
+                   $1 !~ /utf-8/i) {
+                       $non_utf8_charset = 1;
+               }
+
+               if ($in_commit_log && $non_utf8_charset && $realfile =~ /^$/ &&
                    $rawline =~ /$NON_ASCII_UTF8/) {
-                       CHK("UTF8_BEFORE_PATCH",
+                       WARN("UTF8_BEFORE_PATCH",
                            "8-bit UTF-8 used in possible commit log\n" . $herecurr);
                }
 
@@ -1873,6 +1882,20 @@ sub process {
                            "No space is necessary after a cast\n" . $hereprev);
                }
 
+               if ($realfile =~ m@^(drivers/net/|net/)@ &&
+                   $rawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
+                   $prevrawline =~ /^\+[ \t]*$/) {
+                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
+                            "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
+               }
+
+               if ($realfile =~ m@^(drivers/net/|net/)@ &&
+                   $rawline !~ m@^\+[ \t]*(\/\*|\*\/)@ &&
+                   $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) {
+                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
+                            "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+               }
+
 # check for spaces at the beginning of a line.
 # Exceptions:
 #  1) within comments
@@ -2390,8 +2413,10 @@ sub process {
                        my $orig = $1;
                        my $level = lc($orig);
                        $level = "warn" if ($level eq "warning");
+                       my $level2 = $level;
+                       $level2 = "dbg" if ($level eq "debug");
                        WARN("PREFER_PR_LEVEL",
-                            "Prefer pr_$level(... to printk(KERN_$1, ...\n" . $herecurr);
+                            "Prefer netdev_$level2(netdev, ... then dev_$level2(dev, ... then pr_$level(...  to printk(KERN_$orig ...\n" . $herecurr);
                }
 
                if ($line =~ /\bpr_warning\s*\(/) {
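
Two worked examples of the checkpatch changes above. A patch mail whose headers carry Content-Type: text/plain; charset="iso-8859-1" now sets $non_utf8_charset, so 8-bit UTF-8 later in that commit log draws a WARN; previously the check fired regardless of the declared charset, and only as a CHK. And for printk(KERN_DEBUG ...), $level is "debug" while $level2 maps to "dbg", so the new suggestion reads: Prefer netdev_dbg(netdev, ... then dev_dbg(dev, ... then pr_debug(...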
index d24810fc6af6caf3a15ea3f5a55b33f4f08a667b..fd8fa9aa7c4edd698430a9cb0647a8d26095a9a2 100755 (executable)
@@ -200,7 +200,7 @@ EOF
 syscall_list() {
     grep '^[0-9]' "$1" | sort -n | (
        while read nr abi name entry ; do
-           echo <<EOF
+           cat <<EOF
 #if !defined(__NR_${name}) && !defined(__IGNORE_${name})
 #warning syscall ${name} not implemented
 #endif
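
The one-character checksyscalls.sh fix is more than cosmetic: echo does not read standard input, so the here-document after "echo <<EOF" was consumed by the shell and discarded while echo printed a single empty line, silencing every "#warning syscall ... not implemented" stanza. cat actually copies the here-document to stdout, restoring the warnings.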
index 4b877a92a7ea3dc3a0307f5c5efb9e78c3289b17..44dfc415a379afc9c327388664e949bfcc855bcb 100644 (file)
 static DEFINE_MUTEX(devcgroup_mutex);
 
 /*
- * whitelist locking rules:
+ * exception list locking rules:
  * hold devcgroup_mutex for update/read.
  * hold rcu_read_lock() for read.
  */
 
-struct dev_whitelist_item {
+struct dev_exception_item {
        u32 major, minor;
        short type;
        short access;
@@ -41,7 +41,8 @@ struct dev_whitelist_item {
 
 struct dev_cgroup {
        struct cgroup_subsys_state css;
-       struct list_head whitelist;
+       struct list_head exceptions;
+       bool deny_all;
 };
 
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -74,12 +75,12 @@ static int devcgroup_can_attach(struct cgroup *new_cgrp,
 /*
  * called under devcgroup_mutex
  */
-static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
+static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
 {
-       struct dev_whitelist_item *wh, *tmp, *new;
+       struct dev_exception_item *ex, *tmp, *new;
 
-       list_for_each_entry(wh, orig, list) {
-               new = kmemdup(wh, sizeof(*wh), GFP_KERNEL);
+       list_for_each_entry(ex, orig, list) {
+               new = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
                if (!new)
                        goto free_and_exit;
                list_add_tail(&new->list, dest);
@@ -88,64 +89,60 @@ static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
        return 0;
 
 free_and_exit:
-       list_for_each_entry_safe(wh, tmp, dest, list) {
-               list_del(&wh->list);
-               kfree(wh);
+       list_for_each_entry_safe(ex, tmp, dest, list) {
+               list_del(&ex->list);
+               kfree(ex);
        }
        return -ENOMEM;
 }
 
-/* Stupid prototype - don't bother combining existing entries */
 /*
  * called under devcgroup_mutex
  */
-static int dev_whitelist_add(struct dev_cgroup *dev_cgroup,
-                       struct dev_whitelist_item *wh)
+static int dev_exception_add(struct dev_cgroup *dev_cgroup,
+                            struct dev_exception_item *ex)
 {
-       struct dev_whitelist_item *whcopy, *walk;
+       struct dev_exception_item *excopy, *walk;
 
-       whcopy = kmemdup(wh, sizeof(*wh), GFP_KERNEL);
-       if (!whcopy)
+       excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
+       if (!excopy)
                return -ENOMEM;
 
-       list_for_each_entry(walk, &dev_cgroup->whitelist, list) {
-               if (walk->type != wh->type)
+       list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
+               if (walk->type != ex->type)
                        continue;
-               if (walk->major != wh->major)
+               if (walk->major != ex->major)
                        continue;
-               if (walk->minor != wh->minor)
+               if (walk->minor != ex->minor)
                        continue;
 
-               walk->access |= wh->access;
-               kfree(whcopy);
-               whcopy = NULL;
+               walk->access |= ex->access;
+               kfree(excopy);
+               excopy = NULL;
        }
 
-       if (whcopy != NULL)
-               list_add_tail_rcu(&whcopy->list, &dev_cgroup->whitelist);
+       if (excopy != NULL)
+               list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
        return 0;
 }
 
 /*
  * called under devcgroup_mutex
  */
-static void dev_whitelist_rm(struct dev_cgroup *dev_cgroup,
-                       struct dev_whitelist_item *wh)
+static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
+                            struct dev_exception_item *ex)
 {
-       struct dev_whitelist_item *walk, *tmp;
+       struct dev_exception_item *walk, *tmp;
 
-       list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) {
-               if (walk->type == DEV_ALL)
-                       goto remove;
-               if (walk->type != wh->type)
+       list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) {
+               if (walk->type != ex->type)
                        continue;
-               if (walk->major != ~0 && walk->major != wh->major)
+               if (walk->major != ex->major)
                        continue;
-               if (walk->minor != ~0 && walk->minor != wh->minor)
+               if (walk->minor != ex->minor)
                        continue;
 
-remove:
-               walk->access &= ~wh->access;
+               walk->access &= ~ex->access;
                if (!walk->access) {
                        list_del_rcu(&walk->list);
                        kfree_rcu(walk, rcu);
@@ -153,6 +150,22 @@ remove:
        }
 }
 
+/**
+ * dev_exception_clean - frees all entries of the exception list
+ * @dev_cgroup: dev_cgroup with the exception list to be cleaned
+ *
+ * called under devcgroup_mutex
+ */
+static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
+{
+       struct dev_exception_item *ex, *tmp;
+
+       list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
+               list_del(&ex->list);
+               kfree(ex);
+       }
+}
+
 /*
  * called from kernel/cgroup.c with cgroup_lock() held.
  */
@@ -165,25 +178,17 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
        dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
        if (!dev_cgroup)
                return ERR_PTR(-ENOMEM);
-       INIT_LIST_HEAD(&dev_cgroup->whitelist);
+       INIT_LIST_HEAD(&dev_cgroup->exceptions);
        parent_cgroup = cgroup->parent;
 
-       if (parent_cgroup == NULL) {
-               struct dev_whitelist_item *wh;
-               wh = kmalloc(sizeof(*wh), GFP_KERNEL);
-               if (!wh) {
-                       kfree(dev_cgroup);
-                       return ERR_PTR(-ENOMEM);
-               }
-               wh->minor = wh->major = ~0;
-               wh->type = DEV_ALL;
-               wh->access = ACC_MASK;
-               list_add(&wh->list, &dev_cgroup->whitelist);
-       } else {
+       if (parent_cgroup == NULL)
+               dev_cgroup->deny_all = false;
+       else {
                parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
                mutex_lock(&devcgroup_mutex);
-               ret = dev_whitelist_copy(&dev_cgroup->whitelist,
-                               &parent_dev_cgroup->whitelist);
+               ret = dev_exceptions_copy(&dev_cgroup->exceptions,
+                                         &parent_dev_cgroup->exceptions);
+               dev_cgroup->deny_all = parent_dev_cgroup->deny_all;
                mutex_unlock(&devcgroup_mutex);
                if (ret) {
                        kfree(dev_cgroup);
@@ -197,13 +202,9 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup)
 static void devcgroup_destroy(struct cgroup *cgroup)
 {
        struct dev_cgroup *dev_cgroup;
-       struct dev_whitelist_item *wh, *tmp;
 
        dev_cgroup = cgroup_to_devcgroup(cgroup);
-       list_for_each_entry_safe(wh, tmp, &dev_cgroup->whitelist, list) {
-               list_del(&wh->list);
-               kfree(wh);
-       }
+       dev_exception_clean(dev_cgroup);
        kfree(dev_cgroup);
 }
 
@@ -249,59 +250,87 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
                                struct seq_file *m)
 {
        struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
-       struct dev_whitelist_item *wh;
+       struct dev_exception_item *ex;
        char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
 
        rcu_read_lock();
-       list_for_each_entry_rcu(wh, &devcgroup->whitelist, list) {
-               set_access(acc, wh->access);
-               set_majmin(maj, wh->major);
-               set_majmin(min, wh->minor);
-               seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
+       /*
+        * To preserve compatibility:
+        * - only show the "all devices" entry when the default policy is to allow
+        * - list the exceptions when the default policy is to deny
+        * This way, the file still reads as a "whitelist of devices"
+        */
+       if (devcgroup->deny_all == false) {
+               set_access(acc, ACC_MASK);
+               set_majmin(maj, ~0);
+               set_majmin(min, ~0);
+               seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
                           maj, min, acc);
+       } else {
+               list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
+                       set_access(acc, ex->access);
+                       set_majmin(maj, ex->major);
+                       set_majmin(min, ex->minor);
+                       seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
+                                  maj, min, acc);
+               }
        }
        rcu_read_unlock();
 
        return 0;
 }
 
-/*
- * may_access_whitelist:
- * does the access granted to dev_cgroup c contain the access
- * requested in whitelist item refwh.
- * return 1 if yes, 0 if no.
- * call with devcgroup_mutex held
+/**
+ * may_access - verifies if a new exception is part of what is allowed
+ *             by a dev cgroup based on the default policy +
+ *             exceptions. This is used to make sure a child cgroup
+ *             won't have more privileges than its parent or to
+ *             verify if a certain access is allowed.
+ * @dev_cgroup: dev cgroup to be tested against
+ * @refex: new exception
  */
-static int may_access_whitelist(struct dev_cgroup *c,
-                                      struct dev_whitelist_item *refwh)
+static int may_access(struct dev_cgroup *dev_cgroup,
+                     struct dev_exception_item *refex)
 {
-       struct dev_whitelist_item *whitem;
+       struct dev_exception_item *ex;
+       bool match = false;
 
-       list_for_each_entry(whitem, &c->whitelist, list) {
-               if (whitem->type & DEV_ALL)
-                       return 1;
-               if ((refwh->type & DEV_BLOCK) && !(whitem->type & DEV_BLOCK))
+       list_for_each_entry(ex, &dev_cgroup->exceptions, list) {
+               if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
                        continue;
-               if ((refwh->type & DEV_CHAR) && !(whitem->type & DEV_CHAR))
+               if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
                        continue;
-               if (whitem->major != ~0 && whitem->major != refwh->major)
+               if (ex->major != ~0 && ex->major != refex->major)
                        continue;
-               if (whitem->minor != ~0 && whitem->minor != refwh->minor)
+               if (ex->minor != ~0 && ex->minor != refex->minor)
                        continue;
-               if (refwh->access & (~whitem->access))
+               if (refex->access & (~ex->access))
                        continue;
-               return 1;
+               match = true;
+               break;
        }
+
+       /*
+        * In two cases we'll consider this new exception valid:
+        * - the dev cgroup has its default policy to allow + exception list:
+        *   the new exception should *not* match any of the exceptions
+        *   (!deny_all, !match)
+        * - the dev cgroup has its default policy to deny + exception list:
+        *   the new exception *should* match the exceptions
+        *   (deny_all, match)
+        */
+       if (dev_cgroup->deny_all == match)
+               return 1;
        return 0;
 }
 
 /*
  * parent_has_perm:
- * when adding a new allow rule to a device whitelist, the rule
+ * when adding a new allow rule to a device exception list, the rule
  * must be allowed in the parent device
  */
 static int parent_has_perm(struct dev_cgroup *childcg,
-                                 struct dev_whitelist_item *wh)
+                                 struct dev_exception_item *ex)
 {
        struct cgroup *pcg = childcg->css.cgroup->parent;
        struct dev_cgroup *parent;
@@ -309,17 +338,17 @@ static int parent_has_perm(struct dev_cgroup *childcg,
        if (!pcg)
                return 1;
        parent = cgroup_to_devcgroup(pcg);
-       return may_access_whitelist(parent, wh);
+       return may_access(parent, ex);
 }
 
 /*
- * Modify the whitelist using allow/deny rules.
+ * Modify the exception list using allow/deny rules.
  * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
  * so we can give a container CAP_MKNOD to let it create devices but not
- * modify the whitelist.
+ * modify the exception list.
  * It seems likely we'll want to add a CAP_CONTAINER capability to allow
  * us to also grant CAP_SYS_ADMIN to containers without giving away the
- * device whitelist controls, but for now we'll stick with CAP_SYS_ADMIN
+ * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
  *
  * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
  * new access is only allowed if you're in the top-level cgroup, or your
@@ -331,26 +360,36 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
        const char *b;
        char *endp;
        int count;
-       struct dev_whitelist_item wh;
+       struct dev_exception_item ex;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       memset(&wh, 0, sizeof(wh));
+       memset(&ex, 0, sizeof(ex));
        b = buffer;
 
        switch (*b) {
        case 'a':
-               wh.type = DEV_ALL;
-               wh.access = ACC_MASK;
-               wh.major = ~0;
-               wh.minor = ~0;
-               goto handle;
+               switch (filetype) {
+               case DEVCG_ALLOW:
+                       if (!parent_has_perm(devcgroup, &ex))
+                               return -EPERM;
+                       dev_exception_clean(devcgroup);
+                       devcgroup->deny_all = false;
+                       break;
+               case DEVCG_DENY:
+                       dev_exception_clean(devcgroup);
+                       devcgroup->deny_all = true;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+               return 0;
        case 'b':
-               wh.type = DEV_BLOCK;
+               ex.type = DEV_BLOCK;
                break;
        case 'c':
-               wh.type = DEV_CHAR;
+               ex.type = DEV_CHAR;
                break;
        default:
                return -EINVAL;
@@ -360,10 +399,10 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
                return -EINVAL;
        b++;
        if (*b == '*') {
-               wh.major = ~0;
+               ex.major = ~0;
                b++;
        } else if (isdigit(*b)) {
-               wh.major = simple_strtoul(b, &endp, 10);
+               ex.major = simple_strtoul(b, &endp, 10);
                b = endp;
        } else {
                return -EINVAL;
@@ -374,10 +413,10 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 
        /* read minor */
        if (*b == '*') {
-               wh.minor = ~0;
+               ex.minor = ~0;
                b++;
        } else if (isdigit(*b)) {
-               wh.minor = simple_strtoul(b, &endp, 10);
+               ex.minor = simple_strtoul(b, &endp, 10);
                b = endp;
        } else {
                return -EINVAL;
@@ -387,13 +426,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
        for (b++, count = 0; count < 3; count++, b++) {
                switch (*b) {
                case 'r':
-                       wh.access |= ACC_READ;
+                       ex.access |= ACC_READ;
                        break;
                case 'w':
-                       wh.access |= ACC_WRITE;
+                       ex.access |= ACC_WRITE;
                        break;
                case 'm':
-                       wh.access |= ACC_MKNOD;
+                       ex.access |= ACC_MKNOD;
                        break;
                case '\n':
                case '\0':
@@ -404,15 +443,31 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
                }
        }
 
-handle:
        switch (filetype) {
        case DEVCG_ALLOW:
-               if (!parent_has_perm(devcgroup, &wh))
+               if (!parent_has_perm(devcgroup, &ex))
                        return -EPERM;
-               return dev_whitelist_add(devcgroup, &wh);
+               /*
+                * If the default policy is to allow, try to remove a
+                * matching exception instead, and be silent about it: we
+                * don't want to break compatibility
+                */
+               if (devcgroup->deny_all == false) {
+                       dev_exception_rm(devcgroup, &ex);
+                       return 0;
+               }
+               return dev_exception_add(devcgroup, &ex);
        case DEVCG_DENY:
-               dev_whitelist_rm(devcgroup, &wh);
-               break;
+               /*
+                * If the default policy is to deny, try to remove a
+                * matching exception instead, and be silent about it: we
+                * don't want to break compatibility
+                */
+               if (devcgroup->deny_all == true) {
+                       dev_exception_rm(devcgroup, &ex);
+                       return 0;
+               }
+               return dev_exception_add(devcgroup, &ex);
        default:
                return -EINVAL;
        }
@@ -468,73 +523,71 @@ struct cgroup_subsys devices_subsys = {
        .broken_hierarchy = true,
 };
 
-int __devcgroup_inode_permission(struct inode *inode, int mask)
+/**
+ * __devcgroup_check_permission - checks if an inode operation is permitted
+ * @dev_cgroup: the dev cgroup to be tested against
+ * @type: device type
+ * @major: device major number
+ * @minor: device minor number
+ * @access: combination of ACC_WRITE, ACC_READ and ACC_MKNOD
+ *
+ * returns 0 on success, -EPERM in case the operation is not permitted
+ */
+static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup,
+                                       short type, u32 major, u32 minor,
+                                       short access)
 {
-       struct dev_cgroup *dev_cgroup;
-       struct dev_whitelist_item *wh;
-
-       rcu_read_lock();
+       struct dev_exception_item ex;
+       int rc;
 
-       dev_cgroup = task_devcgroup(current);
+       memset(&ex, 0, sizeof(ex));
+       ex.type = type;
+       ex.major = major;
+       ex.minor = minor;
+       ex.access = access;
 
-       list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
-               if (wh->type & DEV_ALL)
-                       goto found;
-               if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
-                       continue;
-               if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
-                       continue;
-               if (wh->major != ~0 && wh->major != imajor(inode))
-                       continue;
-               if (wh->minor != ~0 && wh->minor != iminor(inode))
-                       continue;
+       rcu_read_lock();
+       rc = may_access(dev_cgroup, &ex);
+       rcu_read_unlock();
 
-               if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
-                       continue;
-               if ((mask & MAY_READ) && !(wh->access & ACC_READ))
-                       continue;
-found:
-               rcu_read_unlock();
-               return 0;
-       }
+       if (!rc)
+               return -EPERM;
 
-       rcu_read_unlock();
+       return 0;
+}
 
-       return -EPERM;
+int __devcgroup_inode_permission(struct inode *inode, int mask)
+{
+       struct dev_cgroup *dev_cgroup = task_devcgroup(current);
+       short type, access = 0;
+
+       if (S_ISBLK(inode->i_mode))
+               type = DEV_BLOCK;
+       if (S_ISCHR(inode->i_mode))
+               type = DEV_CHAR;
+       if (mask & MAY_WRITE)
+               access |= ACC_WRITE;
+       if (mask & MAY_READ)
+               access |= ACC_READ;
+
+       return __devcgroup_check_permission(dev_cgroup, type, imajor(inode),
+                                           iminor(inode), access);
 }
 
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
-       struct dev_cgroup *dev_cgroup;
-       struct dev_whitelist_item *wh;
+       struct dev_cgroup *dev_cgroup = task_devcgroup(current);
+       short type;
 
        if (!S_ISBLK(mode) && !S_ISCHR(mode))
                return 0;
 
-       rcu_read_lock();
-
-       dev_cgroup = task_devcgroup(current);
-
-       list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
-               if (wh->type & DEV_ALL)
-                       goto found;
-               if ((wh->type & DEV_BLOCK) && !S_ISBLK(mode))
-                       continue;
-               if ((wh->type & DEV_CHAR) && !S_ISCHR(mode))
-                       continue;
-               if (wh->major != ~0 && wh->major != MAJOR(dev))
-                       continue;
-               if (wh->minor != ~0 && wh->minor != MINOR(dev))
-                       continue;
-
-               if (!(wh->access & ACC_MKNOD))
-                       continue;
-found:
-               rcu_read_unlock();
-               return 0;
-       }
+       if (S_ISBLK(mode))
+               type = DEV_BLOCK;
+       else
+               type = DEV_CHAR;
 
-       rcu_read_unlock();
+       return __devcgroup_check_permission(dev_cgroup, type, MAJOR(dev),
+                                           MINOR(dev), ACC_MKNOD);
 
-       return -EPERM;
 }
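
The deny_all/match decision in may_access() above collapses to a single comparison; a truth table makes the four cases explicit (restating the code, not changing it):

	/*
	 *   deny_all  match   meaning                              verdict
	 *   false     false   allow by default, no exception hit   allowed
	 *   false     true    allow by default, exception hit      denied
	 *   true      false   deny by default, no exception hit    denied
	 *   true      true    deny by default, exception hit       allowed
	 *
	 * i.e. the access is valid iff deny_all == match:
	 */
	if (dev_cgroup->deny_all == match)
		return 1;
	return 0;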
index 28f911cdd7c79dc292171fea10385adad51a27eb..c5454c0477c346e4d814f5ff209feba86e5b86ad 100644 (file)
@@ -174,7 +174,8 @@ static void sel_netnode_insert(struct sel_netnode *node)
        if (sel_netnode_hash[idx].size == SEL_NETNODE_HASH_BKT_LIMIT) {
                struct sel_netnode *tail;
                tail = list_entry(
-                       rcu_dereference(sel_netnode_hash[idx].list.prev),
+                       rcu_dereference_protected(sel_netnode_hash[idx].list.prev,
+                                                 lockdep_is_held(&sel_netnode_lock)),
                        struct sel_netnode, list);
                list_del_rcu(&tail->list);
                kfree_rcu(tail, rcu);
index 298e695d6822577e80e5a03a3b5b77d78fe9ebc8..c86d018f0e74cdc94732cc40fc1c1c01c69f4df1 100644 (file)
@@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
                        return -EACCES;
        }
 
-       vma->vm_flags |= VM_RESERVED;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &sel_mmap_policy_ops;
 
        return 0;
index 867558c983349d144c5f46a729af55a2a72dbe0a..2952ba576fb9ceed3f64c23663a2b40f3cfbe13e 100644 (file)
@@ -949,18 +949,13 @@ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
 const char *tomoyo_get_exe(void)
 {
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
        const char *cp = NULL;
 
        if (!mm)
                return NULL;
        down_read(&mm->mmap_sem);
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) {
-                       cp = tomoyo_realpath_from_path(&vma->vm_file->f_path);
-                       break;
-               }
-       }
+       if (mm->exe_file)
+               cp = tomoyo_realpath_from_path(&mm->exe_file->f_path);
        up_read(&mm->mmap_sem);
        return cp;
 }
index 53b5ada8f7c36fd5199366662cc0ce5c5233e66c..b312c5d325beedac8308fa18ae5a4b4d6c1d513f 100644 (file)
@@ -3038,7 +3038,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file
                return -EINVAL;
        area->vm_ops = &snd_pcm_vm_ops_status;
        area->vm_private_data = substream;
-       area->vm_flags |= VM_RESERVED;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 }
 
@@ -3075,7 +3075,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file
                return -EINVAL;
        area->vm_ops = &snd_pcm_vm_ops_control;
        area->vm_private_data = substream;
-       area->vm_flags |= VM_RESERVED;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        return 0;
 }
 #else /* ! coherent mmap */
@@ -3169,7 +3169,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = {
 int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream,
                             struct vm_area_struct *area)
 {
-       area->vm_flags |= VM_RESERVED;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 #ifdef ARCH_HAS_DMA_MMAP_COHERENT
        if (!substream->ops->page &&
            substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV)
index c4fd3b1d95927f8eab2b2bfc21d745f3578f94f6..d0323a693ba20f4719731369f85324e9ba582096 100644 (file)
@@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw,
        }
 
        area->vm_ops = &usb_stream_hwdep_vm_ops;
-       area->vm_flags |= VM_RESERVED;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        area->vm_private_data = us122l;
        atomic_inc(&us122l->mmap_count);
 out:
index 04aafb43a13c9fbdc5eca982cdc6d13718d5245d..0b34dbc8f3020436d3e740bb61dea9f648a745ab 100644 (file)
@@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v
                us428->us428ctls_sharedmem->CtlSnapShotLast = -2;
        }
        area->vm_ops = &us428ctls_vm_ops;
-       area->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        area->vm_private_data = hw->private_data;
        return 0;
 }
index 8e40b6e67e9eeeb7ea8a9dd53dd4c73fced97b5c..cc56007791e02192bd6176050a252b1eed2fc53a 100644 (file)
@@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st
                return -ENODEV;
        }
        area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops;
-       area->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
+       area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
        area->vm_private_data = hw->private_data;
        return 0;
 }
index 2a030c5af3aa2062082d87b1604b9980a1006844..9bcdc844b330b2c4f9af7435b054caa7d759dcf0 100644 (file)
@@ -1 +1,2 @@
+#include <stdbool.h>
 #include "../../../../include/linux/rbtree.h"
index 85baf11e2acd7d11aa4990a0f7f53f8d28689a20..43480149119ee773f0a32cc8abe995903609fd11 100644 (file)
@@ -1,4 +1,4 @@
-TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug
+TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug epoll
 
 all:
        for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644 (file)
index 0000000..19806ed
--- /dev/null
@@ -0,0 +1,11 @@
+# Makefile for epoll selftests
+
+all: test_epoll
+%: %.c
+       gcc -pthread -g -o $@ $^
+
+run_tests: all
+       ./test_epoll
+
+clean:
+       $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644 (file)
index 0000000..e0fcff1
--- /dev/null
@@ -0,0 +1,345 @@
+/*
+ *  tools/testing/selftests/epoll/test_epoll.c
+ *
+ *  Copyright 2012 Adobe Systems Incorporated
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Paton J. Lewis <palewis@adobe.com>
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+
+/*
+ * A pointer to an epoll_item_private structure will be stored in the epoll
+ * item's event structure so that we can get access to the epoll_item_private
+ * data after calling epoll_wait:
+ */
+struct epoll_item_private {
+       int index;  /* Position of this struct within the epoll_items array. */
+       int fd;
+       uint32_t events;
+       pthread_mutex_t mutex;  /* Guards the following variables... */
+       int stop;
+       int status;  /* Stores any error encountered while handling item. */
+       /* The following variable allows us to test whether we have encountered
+          a problem while attempting to cancel and delete the associated
+          event. When the test program exits, 'deleted' should be exactly
+          one. If it is greater than one, then the failed test reflects a real
+          world situation where we would have tried to access the epoll item's
+          private data after deleting it: */
+       int deleted;
+};
+
+struct epoll_item_private *epoll_items;
+
+/*
+ * Delete the specified item from the epoll set. In a real-world scenario this
+ * is where we would free the associated data structure, but in this testing
+ * environment we retain the structure so that we can test for double-deletion:
+ */
+void delete_item(int index)
+{
+       __sync_fetch_and_add(&epoll_items[index].deleted, 1);
+}
+
+/*
+ * A pointer to a read_thread_data structure will be passed as the argument to
+ * each read thread:
+ */
+struct read_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the read thread. */
+       int epoll_set;
+};
+
+/*
+ * The function executed by the read threads:
+ */
+void *read_thread_function(void *function_data)
+{
+       struct read_thread_data *thread_data =
+               (struct read_thread_data *)function_data;
+       struct epoll_event event_data;
+       struct epoll_item_private *item_data;
+       char socket_data;
+
+       /* Handle events until we encounter an error or this thread's 'stop'
+          condition is set: */
+       while (1) {
+               int result = epoll_wait(thread_data->epoll_set,
+                                       &event_data,
+                                       1,      /* Number of desired events */
+                                       1000);  /* Timeout in ms */
+               if (result < 0) {
+                       /* Breakpoints signal all threads. Ignore that while
+                          debugging: */
+                       if (errno == EINTR)
+                               continue;
+                       thread_data->status = errno;
+                       return 0;
+               } else if (thread_data->stop)
+                       return 0;
+               else if (result == 0)  /* Timeout */
+                       continue;
+
+               /* We need the mutex here because checking for the stop
+                  condition and re-enabling the epoll item need to be done
+                  together as one atomic operation when EPOLL_CTL_DISABLE is
+                  available: */
+               item_data = (struct epoll_item_private *)event_data.data.ptr;
+               pthread_mutex_lock(&item_data->mutex);
+
+               /* Remove the item from the epoll set if we want to stop
+                  handling that event: */
+               if (item_data->stop)
+                       delete_item(item_data->index);
+               else {
+                       /* Clear the data that was written to the other end of
+                          our non-blocking socket: */
+                       do {
+                               if (read(item_data->fd, &socket_data, 1) < 1) {
+                                       if ((errno == EAGAIN) ||
+                                           (errno == EWOULDBLOCK))
+                                               break;
+                                       else
+                                               goto error_unlock;
+                               }
+                       } while (item_data->events & EPOLLET);
+
+                       /* The item was one-shot, so re-enable it: */
+                       event_data.events = item_data->events;
+                       if (epoll_ctl(thread_data->epoll_set,
+                                                 EPOLL_CTL_MOD,
+                                                 item_data->fd,
+                                                 &event_data) < 0)
+                               goto error_unlock;
+               }
+
+               pthread_mutex_unlock(&item_data->mutex);
+       }
+
+error_unlock:
+       thread_data->status = item_data->status = errno;
+       pthread_mutex_unlock(&item_data->mutex);
+       return 0;
+}
+
+/*
+ * A pointer to a write_thread_data structure will be passed as the argument to
+ * the write thread:
+ */
+struct write_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the write thread. */
+       int n_fds;
+       int *fds;
+};
+
+/*
+ * The function executed by the write thread. It writes a single byte to each
+ * socket in turn until the stop condition for this thread is set. If writing to
+ * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
+ * the moment and just move on to the next socket in the list. We don't care
+ * about the order in which we deliver events to the epoll set. In fact we don't
+ * care about the data we're writing to the pipes at all; we just want to
+ * trigger epoll events:
+ */
+void *write_thread_function(void *function_data)
+{
+       const char data = 'X';
+       int index;
+       struct write_thread_data *thread_data =
+               (struct write_thread_data *)function_data;
+       while (!thread_data->stop)
+               for (index = 0;
+                    !thread_data->stop && (index < thread_data->n_fds);
+                    ++index)
+                       if ((write(thread_data->fds[index], &data, 1) < 1) &&
+                               (errno != EAGAIN) &&
+                               (errno != EWOULDBLOCK)) {
+                               thread_data->status = errno;
+                               return 0;
+                       }
+       return 0;
+}
+
+/*
+ * Arguments are currently ignored:
+ */
+int main(int argc, char **argv)
+{
+       const int n_read_threads = 100;
+       const int n_epoll_items = 500;
+       int index;
+       int epoll_set = epoll_create1(0);
+       struct write_thread_data write_thread_data = {
+               0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
+       };
+       struct read_thread_data *read_thread_data =
+               malloc(n_read_threads * sizeof(struct read_thread_data));
+       pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
+       pthread_t write_thread;
+
+       printf("-----------------\n");
+       printf("Runing test_epoll\n");
+       printf("-----------------\n");
+
+       epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
+
+       if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
+               read_thread_data == 0 || read_threads == 0)
+               goto error;
+
+       if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
+               printf("Error: please run this test on a multi-core system.\n");
+               goto error;
+       }
+
+       /* Create the socket pairs and epoll items: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               int socket_pair[2];
+               struct epoll_event event_data;
+               if (socketpair(AF_UNIX,
+                              SOCK_STREAM | SOCK_NONBLOCK,
+                              0,
+                              socket_pair) < 0)
+                       goto error;
+               write_thread_data.fds[index] = socket_pair[0];
+               epoll_items[index].index = index;
+               epoll_items[index].fd = socket_pair[1];
+               if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
+                       goto error;
+               /* We always use EPOLLONESHOT because this test is currently
+                  structured to demonstrate the need for EPOLL_CTL_DISABLE,
+                  which only produces useful information in the EPOLLONESHOT
+                  case (without EPOLLONESHOT, calling epoll_ctl with
+                  EPOLL_CTL_DISABLE will never return EBUSY). If support for
+                  testing events without EPOLLONESHOT is desired, it should
+                  probably be implemented in a separate unit test. */
+               epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
+               if (index < n_epoll_items / 2)
+                       epoll_items[index].events |= EPOLLET;
+               epoll_items[index].stop = 0;
+               epoll_items[index].status = 0;
+               epoll_items[index].deleted = 0;
+               event_data.events = epoll_items[index].events;
+               event_data.data.ptr = &epoll_items[index];
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_ADD,
+                             epoll_items[index].fd,
+                             &event_data) < 0)
+                       goto error;
+       }
+
+       /* Create and start the read threads: */
+       for (index = 0; index < n_read_threads; ++index) {
+               read_thread_data[index].stop = 0;
+               read_thread_data[index].status = 0;
+               read_thread_data[index].epoll_set = epoll_set;
+               if (pthread_create(&read_threads[index],
+                                  NULL,
+                                  read_thread_function,
+                                  &read_thread_data[index]) != 0)
+                       goto error;
+       }
+
+       if (pthread_create(&write_thread,
+                          NULL,
+                          write_thread_function,
+                          &write_thread_data) != 0)
+               goto error;
+
+       /* Cancel all event pollers: */
+#ifdef EPOLL_CTL_DISABLE
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_DISABLE,
+                             epoll_items[index].fd,
+                             NULL) == 0)
+                       delete_item(index);
+               else if (errno != EBUSY) {
+                       pthread_mutex_unlock(&epoll_items[index].mutex);
+                       goto error;
+               }
+               /* EBUSY means events were being handled; allow the other thread
+                  to delete the item. */
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#else
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+               /* Wait in case a thread running read_thread_function is
+                  currently executing code between epoll_wait and
+                  pthread_mutex_lock with this item. Note that a longer delay
+                  would make double-deletion less likely (at the expense of
+                  performance), but there is no guarantee that any delay would
+                  ever be sufficient. Note also that we delete all event
+                  pollers at once for testing purposes, but in a real-world
+                  environment we are likely to want to be able to cancel event
+                  pollers at arbitrary times. Therefore we can't improve this
+                  situation by just splitting this loop into two loops
+                  (i.e. signal 'stop' for all items, sleep, and then delete all
+                  items). We also can't fix the problem via EPOLL_CTL_DEL
+                  because that command can't prevent the case where some other
+                  thread is executing read_thread_function within the region
+                  mentioned above: */
+               usleep(1);
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               if (!epoll_items[index].deleted)
+                       delete_item(index);
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#endif
+
+       /* Shut down the read threads: */
+       for (index = 0; index < n_read_threads; ++index)
+               __sync_fetch_and_add(&read_thread_data[index].stop, 1);
+       for (index = 0; index < n_read_threads; ++index) {
+               if (pthread_join(read_threads[index], NULL) != 0)
+                       goto error;
+               if (read_thread_data[index].status)
+                       goto error;
+       }
+
+       /* Shut down the write thread: */
+       __sync_fetch_and_add(&write_thread_data.stop, 1);
+       if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
+               goto error;
+
+       /* Check for final error conditions: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               if (epoll_items[index].status != 0)
+                       goto error;
+               if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
+                       goto error;
+       }
+       for (index = 0; index < n_epoll_items; ++index)
+               if (epoll_items[index].deleted != 1) {
+                       printf("Error: item data deleted %1d times.\n",
+                                  epoll_items[index].deleted);
+                       goto error;
+               }
+
+       printf("[PASS]\n");
+       return 0;
+
+ error:
+       printf("[FAIL]\n");
+       return errno;
+}
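
With "epoll" added to the selftests TARGETS list and the Makefile above in place, the test builds and runs via make run_tests from tools/testing/selftests (or ./test_epoll after make inside the epoll directory); it prints [PASS] when every item's 'deleted' count ends up exactly one, and [FAIL] otherwise.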