Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt...
author Ingo Molnar <mingo@elte.hu>
Sun, 9 Jan 2011 09:42:21 +0000 (10:42 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 9 Jan 2011 09:42:21 +0000 (10:42 +0100)
165 files changed:
Documentation/RCU/trace.txt
Documentation/dontdiff
Documentation/kernel-docs.txt
Documentation/kernel-parameters.txt
Documentation/x86/boot.txt
MAINTAINERS
arch/Kconfig
arch/s390/Kconfig
arch/s390/include/asm/mutex.h
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/boot/compressed/head_64.S
arch/x86/include/asm/alternative.h
arch/x86/include/asm/amd_nb.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/bootparam.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/i387.h
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/microcode.h
arch/x86/include/asm/mpspec.h
arch/x86/include/asm/mpspec_def.h
arch/x86/include/asm/mrst-vrtc.h [new file with mode: 0644]
arch/x86/include/asm/mrst.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pci.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/uv/uv_bau.h
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/alternative.c
arch/x86/kernel/amd_nb.c
arch/x86/kernel/apb_timer.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/head32.c
arch/x86/kernel/head_32.S
arch/x86/kernel/microcode_amd.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/reboot_fixups_32.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/trampoline_64.S
arch/x86/kernel/tsc.c
arch/x86/kernel/verify_cpu.S [moved from arch/x86/kernel/verify_cpu_64.S with 65% similarity]
arch/x86/kernel/vmlinux.lds.S
arch/x86/lguest/i386_head.S
arch/x86/mm/Makefile
arch/x86/mm/amdtopology_64.c [moved from arch/x86/mm/k8topology_64.c with 94% similarity]
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/numa_64.c
arch/x86/mm/pageattr.c
arch/x86/mm/setup_nx.c
arch/x86/mm/srat_32.c
arch/x86/mm/srat_64.c
arch/x86/oprofile/op_model_amd.c
arch/x86/pci/Makefile
arch/x86/pci/ce4100.c [new file with mode: 0644]
arch/x86/pci/pcbios.c
arch/x86/platform/Makefile
arch/x86/platform/ce4100/Makefile [new file with mode: 0644]
arch/x86/platform/ce4100/ce4100.c [new file with mode: 0644]
arch/x86/platform/iris/Makefile [new file with mode: 0644]
arch/x86/platform/iris/iris.c [new file with mode: 0644]
arch/x86/platform/mrst/Makefile
arch/x86/platform/mrst/early_printk_mrst.c [moved from arch/x86/kernel/early_printk_mrst.c with 100% similarity]
arch/x86/platform/mrst/mrst.c
arch/x86/platform/mrst/vrtc.c [new file with mode: 0644]
arch/x86/platform/sfi/sfi.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/platform/visws/visws_quirks.c
drivers/acpi/numa.c
drivers/char/agp/amd64-agp.c
drivers/edac/amd64_edac.c
drivers/platform/x86/intel_scu_ipc.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/rtc-mrst.c [new file with mode: 0644]
fs/gfs2/bmap.c
fs/gfs2/glock.c
fs/gfs2/glock.h
fs/gfs2/glops.c
fs/gfs2/incore.h
fs/gfs2/inode.c
fs/gfs2/lock_dlm.c
fs/gfs2/ops_inode.c
fs/gfs2/quota.c
fs/gfs2/rgrp.c
fs/gfs2/rgrp.h
fs/gfs2/xattr.c
fs/proc/base.c
include/linux/completion.h
include/linux/dynamic_debug.h
include/linux/hrtimer.h
include/linux/init_task.h
include/linux/interrupt.h
include/linux/module.h
include/linux/mutex.h
include/linux/rculist.h
include/linux/rcupdate.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/sched.h
include/linux/sfi.h
include/linux/timer.h
include/linux/timerqueue.h [new file with mode: 0644]
include/linux/tracepoint.h
include/linux/workqueue.h
include/trace/define_trace.h
include/trace/events/skb.h
init/Kconfig
kernel/Makefile
kernel/cpu.c
kernel/fork.c
kernel/futex.c
kernel/hrtimer.c
kernel/irq/manage.c
kernel/kthread.c
kernel/lockdep_proc.c
kernel/module.c
kernel/mutex.c
kernel/posix-timers.c
kernel/printk.c
kernel/rcutiny.c
kernel/rcutiny_plugin.h
kernel/rcutorture.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h
kernel/rcutree_trace.c
kernel/sched.c
kernel/sched_autogroup.c [new file with mode: 0644]
kernel/sched_autogroup.h [new file with mode: 0644]
kernel/sched_clock.c
kernel/sched_debug.c
kernel/sched_fair.c
kernel/sched_features.h
kernel/sched_rt.c
kernel/softirq.c
kernel/srcu.c
kernel/sys.c
kernel/sysctl.c
kernel/time/timecompare.c
kernel/time/timekeeping.c
kernel/time/timer_list.c
kernel/timer.c
kernel/trace/Makefile
kernel/trace/trace.c
kernel/trace/trace_selftest.c
kernel/watchdog.c
lib/Makefile
lib/dynamic_debug.c
lib/timerqueue.c [new file with mode: 0644]
scripts/kernel-doc

index a851118775d84c7a1d2356ba6a6c8e6208292887..6a8c73f55b80ca38601ba96f179565fe8b0b7ea0 100644 (file)
@@ -1,18 +1,22 @@
 CONFIG_RCU_TRACE debugfs Files and Formats
 
 
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
 
 
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
 
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
 
 The output of "cat rcu/rcudata" looks as follows:
 
@@ -130,7 +134,8 @@ o   "ci" is the number of RCU callbacks that have been invoked for
        been registered in absence of CPU-hotplug activity.
 
 o      "co" is the number of RCU callbacks that have been orphaned due to
-       this CPU going offline.
+       this CPU going offline.  These orphaned callbacks have been moved
+       to an arbitrarily chosen online CPU.
 
 o      "ca" is the number of RCU callbacks that have been adopted due to
        other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started.  It is
 
 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
 
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
 1/1 .>. 0:127 ^0    
 3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
 0/1 .>. 0:127 ^0    
 0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o  "fqlh" is the number of calls to force_quiescent_state() that
        exited immediately (without even being counted in nfqs above)
        due to contention on ->fqslock.
 
-o      "oqlen" is the number of callbacks on the "orphan" callback
-       list.  RCU callbacks are placed on this list by CPUs going
-       offline, and are "adopted" either by the CPU helping the outgoing
-       CPU or by the next rcu_barrier*() call, whichever comes first.
-
 o      Each element of the form "1/1 0:127 ^0" represents one struct
        rcu_node.  Each line represents one level of the hierarchy, from
        root to leaves.  It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing.  Alert
        readers will note that the rcu "nn" number for a given CPU very
        closely matches the rcu_bh "np" number for that same CPU.  This
        is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o      "qlen" is the number of RCU callbacks currently waiting either
+       for an RCU grace period or waiting to be invoked.  This is the
+       only field present for rcu_sched and rcu_bh, due to the
+       short-circuiting of grace periods in those two cases.
+
+o      "gp" is the number of grace periods that have completed.
+
+o      "g197/p197/c197" displays the grace-period state, with the
+       "g" number being the number of grace periods that have started
+       (mod 256), the "p" number being the number of grace periods
+       that the CPU has responded to (also mod 256), and the "c"
+       number being the number of grace periods that have completed
+       (once again mod 256).
+
+       Why have both "gp" and "g"?  Because the data flowing into
+       "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o      "tasks" is a set of bits.  The first bit is "T" if there are
+       currently tasks that have recently blocked within an RCU
+       read-side critical section, the second bit is "N" if any of the
+       aforementioned tasks are blocking the current RCU grace period,
+       and the third bit is "E" if any of the aforementioned tasks are
+       blocking the current expedited grace period.  Each bit is "."
+       if the corresponding condition does not hold.
+
+o      "ttb" is a single bit.  It is "B" if any of the blocked tasks
+       need to be priority boosted and "." otherwise.
+
+o      "btg" indicates whether boosting has been carried out during
+       the current grace period, with "exp" indicating that boosting
+       is in progress for an expedited grace period, "no" indicating
+       that boosting has not yet started for a normal grace period,
+       "begun" indicating that boosting has begun for a normal grace
+       period, and "done" indicating that boosting has completed for
+       a normal grace period.
+
+o      "ntb" is the total number of tasks subjected to RCU priority boosting
+       since boot.
+
+o      "neb" is the number of expedited grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "nnb" is the number of normal grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o      "bt" is the low-order 12 bits of the value that the jiffies counter
+       will have at the next time that boosting is scheduled to begin.
+
+o      In the line beginning with "normal balk", the fields are as follows:
+
+       o       "nt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+               Note that the system will balk from boosting even if the
+               grace period is overdue when the currently running task
+               is looping within an RCU read-side critical section.
+               There is no point in boosting in this case, because
+               boosting a running task won't make it run any faster.
+
+       o       "gt" is the number of times that the system balked
+               from boosting because, although there were blocked tasks,
+               none of them were preventing the current grace period
+               from completing.
+
+       o       "bt" is the number of times that the system balked
+               from boosting because boosting was already in progress.
+
+       o       "b" is the number of times that the system balked from
+               boosting because boosting had already completed for
+               the grace period in question.
+
+       o       "ny" is the number of times that the system balked from
+               boosting because it was not yet time to start boosting
+               the grace period in question.
+
+       o       "nos" is the number of times that the system balked from
+               boosting for inexplicable ("not otherwise specified")
+               reasons.  This can actually happen due to races involving
+               increments of the jiffies counter.
+
+o      In the line beginning with "exp balk", the fields are as follows:
+
+       o       "bt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+
+       o       "nos" is the number of times that the system balked from
+               boosting for inexplicable ("not otherwise specified")
+               reasons.
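
The "qlen" field above counts callbacks queued through call_rcu() and
its variants.  As a minimal sketch (illustration only, not part of this
patch), a callback registered as follows is counted in "qlen" from the
call_rcu() invocation until the grace period ends and the callback runs:

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct foo {
		int data;
		struct rcu_head rcu;	/* per-object callback handle */
	};

	static void foo_reclaim(struct rcu_head *head)
	{
		/* recover the enclosing object and free it */
		kfree(container_of(head, struct foo, rcu));
	}

	static void foo_release(struct foo *fp)
	{
		/* counted in "qlen" until foo_reclaim() is invoked */
		call_rcu(&fp->rcu, foo_reclaim);
	}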
index d9bcffd594331d7b52a5608269327a7c6642af31..470d3dba1a69aa48c55d98bea3c70d094f458de6 100644 (file)
@@ -62,6 +62,10 @@ aic7*reg_print.c*
 aic7*seq.h*
 aicasm
 aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
 asm-offsets.h
 asm_offsets.h
 autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
 build
 bvmlinux
 bzImage*
+capflags.c
 classlist.h*
 comp*.log
 compile.h*
@@ -94,6 +99,7 @@ devlist.h*
 docproc
 elf2ecoff
 elfconfig.h*
+evergreen_reg_safe.h
 fixdep
 flask.h
 fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
 *_gray256.c
 ihex2fw
 ikconfig.h*
+inat-tables.c
 initramfs_data.cpio
 initramfs_data.cpio.gz
 initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
 kallsyms
 kconfig
 keywords.c
@@ -140,6 +153,7 @@ mkprep
 mktables
 mktree
 modpost
+modules.builtin
 modules.order
 modversions.h*
 ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
 pca200e_ecd.bin2
 piggy.gz
 piggyback
+piggy.S
 pnmtologo
 ppc_defs.h*
 pss_boot.h
 qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
 raid6altivec*.c
 raid6int*.c
 raid6tables.c
 relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
 series
 setup
 setup.bin
@@ -169,6 +192,7 @@ sImage
 sm_tbl*
 split-include
 syscalltab.h
+tables.c
 tags
 tftpboot.img
 timeconst.h
@@ -190,6 +214,7 @@ vmlinux
 vmlinux-*
 vmlinux.aout
 vmlinux.lds
+voffset.h
 vsyscall.lds
 vsyscall_32.lds
 wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
 wakeup.lds
 zImage*
 zconf.hash.c
+zoffset.h
index 715eaaf1519dd25fa0d4011684cd263bc12b2d2f..9a8674629a07598eb553970e1207b31b42ec0316 100644 (file)
        Notes: Further information in
        http://www.oreilly.com/catalog/linuxdrive2/
 
-     * Title: "Linux Device Drivers, 3nd Edition"
+     * Title: "Linux Device Drivers, 3rd Edition"
        Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
        Publisher: O'Reilly & Associates.
        Date: 2005.
        Pages: 600.
        ISBN: 0-13-101908-2
 
-     * Title:  "The  Design  and Implementation of the 4.4 BSD UNIX
-       Operating System"
-       Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
-       John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1996.
-       ISBN: 0-201-54979-4
-
      * Title: "Programming for the real world - POSIX.4"
        Author: Bill O. Gallmeister.
        Publisher: O'Reilly & Associates, Inc..
        POSIX. Good reference.
 
      * Title:  "UNIX  Systems  for  Modern Architectures: Symmetric
-       Multiprocesssing and Caching for Kernel Programmers"
+       Multiprocessing and Caching for Kernel Programmers"
        Author: Curt Schimmel.
        Publisher: Addison Wesley.
        Date: June, 1994.
        Pages: 432.
        ISBN: 0-201-63338-8
 
-     * Title:  "The  Design  and Implementation of the 4.3 BSD UNIX
-       Operating System"
-       Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
-       Karels, John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1989 (reprinted with corrections on October, 1990).
-       ISBN: 0-201-06196-1
-
-     * Title: "The Design of the UNIX Operating System"
-       Author: Maurice J. Bach.
-       Publisher: Prentice Hall.
-       Date: 1986.
-       Pages: 471.
-       ISBN: 0-13-201757-1
-
      MISCELLANEOUS:
 
      * Name: linux/Documentation
index 992cda68fa63b8a2b3a289e67c1d91700f5ddc06..f3dc951e949f04255d90f35b5da4b78c7d015a67 100644 (file)
@@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
        noapic          [SMP,APIC] Tells the kernel to not make use of any
                        IOAPICs that may be present in the system.
 
+       noautogroup     Disable scheduler automatic task group creation.
+
        nobats          [PPC] Do not use BATs for mapping kernel lowmem
                        on "Classic" PPC cores.
 
@@ -2459,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file
                        to facilitate early boot debugging.
                        See also Documentation/trace/events.txt
 
-       tsc=            Disable clocksource-must-verify flag for TSC.
+       tsc=            Disable clocksource stability checks for TSC.
                        Format: <string>
                        [x86] reliable: mark tsc clocksource as reliable, this
-                       disables clocksource verification at runtime.
-                       Used to enable high-resolution timer mode on older
-                       hardware, and in virtualized environment.
+                       disables clocksource verification at runtime, as well
+                       as the stability checks done at bootup. Used to enable
+                       high-resolution timer mode on older hardware, and in
+                       virtualized environments.
                        [x86] noirqtime: Do not use TSC to do irq accounting.
                        Used to run time disable IRQ_TIME_ACCOUNTING on any
                        platforms where RDTSC is slow and this accounting
index 30b43e1b26979cee024aa7636e250520b8ba235f..bdeb81ccb5f61b973280f9704f28a9f9cd8cc2e1 100644 (file)
@@ -600,6 +600,7 @@ Protocol:   2.07+
   0x00000001   lguest
   0x00000002   Xen
   0x00000003   Moorestown MID
+  0x00000004   CE4100 TV Platform
 
 Field name:    hardware_subarch_data
 Type:          write (subarch-dependent)
index b1dda78a1e75fc867be18cfdd24d0a605aaf8f83..c5c7292daba076e439843e3b8c5c09a5d96d2285 100644 (file)
@@ -2812,6 +2812,10 @@ M:       Thomas Gleixner <tglx@linutronix.de>
 S:     Maintained
 F:     Documentation/timers/
 F:     kernel/hrtimer.c
+F:     kernel/time/clockevents.c
+F:     kernel/time/tick*.*
+F:     kernel/time/timer_*.c
+F:     include/linux/clockevents.h
 F:     include/linux/hrtimer.h
 
 HIGH-SPEED SCC DRIVER FOR AX.25
@@ -5142,6 +5146,18 @@ L:       alsa-devel@alsa-project.org (moderated for non-subscribers)
 S:     Supported
 F:     sound/soc/s3c24xx
 
+TIMEKEEPING, NTP
+M:     John Stultz <johnstul@us.ibm.com>
+M:     Thomas Gleixner <tglx@linutronix.de>
+S:     Supported
+F:     include/linux/clocksource.h
+F:     include/linux/time.h
+F:     include/linux/timex.h
+F:     include/linux/timekeeping.h
+F:     kernel/time/clocksource.c
+F:     kernel/time/time*.c
+F:     kernel/time/ntp.c
+
 TLG2300 VIDEO4LINUX-2 DRIVER
 M:     Huang Shijie <shijie8@gmail.com>
 M:     Kang Yong <kangyong@telegent.com>
index 8bf0fa652eb63c57dec1ebfec1a93a407be4ed32..f78c2be4242b437ced3308795952102bf1359763 100644 (file)
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
 config HAVE_ARCH_JUMP_LABEL
        bool
 
+config HAVE_ARCH_MUTEX_CPU_RELAX
+       bool
+
 source "kernel/gcov/Kconfig"
index e0b98e71ff4797e5807f9e5d99f674aeb90456b8..6c6d7b339aae4f49fac84b7b3edc069f4ccf3adf 100644 (file)
@@ -99,6 +99,7 @@ config S390
        select HAVE_KERNEL_LZMA
        select HAVE_KERNEL_LZO
        select HAVE_GET_USER_PAGES_FAST
+       select HAVE_ARCH_MUTEX_CPU_RELAX
        select ARCH_INLINE_SPIN_TRYLOCK
        select ARCH_INLINE_SPIN_TRYLOCK_BH
        select ARCH_INLINE_SPIN_LOCK
index 458c1f7fbc1808d48982aa0c5fe89bfe3df2098c..688271f5f2e452b9951599550f33ed0ddcfe0a7c 100644 (file)
@@ -7,3 +7,5 @@
  */
 
 #include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax() barrier()
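
The generic side of this hook lives in include/linux/mutex.h (also
touched by this merge): on architectures that do not select
HAVE_ARCH_MUTEX_CPU_RELAX it falls back to cpu_relax().  s390's
cpu_relax() yields to the hypervisor, which is too expensive inside the
tight mutex spin loop, hence the barrier() override above.  A
simplified sketch of the pattern (not a verbatim excerpt of
kernel/mutex.c):

	/*
	 * Generic fallback as added to include/linux/mutex.h; s390
	 * selects HAVE_ARCH_MUTEX_CPU_RELAX and supplies barrier()
	 * instead.
	 */
	#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
	#define arch_mutex_cpu_relax()	cpu_relax()
	#endif

	/* busy-wait of the kind used by the adaptive mutex spinner */
	static void spin_until_cleared(volatile int *flag)
	{
		while (*flag)
			arch_mutex_cpu_relax();	/* barrier() on s390 */
	}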
index e330da21b84f0636751b7e18e921ecff9cc31f55..b6fccb07123e206e23a4b73534d4891f66753b90 100644 (file)
@@ -377,6 +377,18 @@ config X86_ELAN
 
          If unsure, choose "PC-compatible" instead.
 
+config X86_INTEL_CE
+       bool "CE4100 TV platform"
+       depends on PCI
+       depends on PCI_GODIRECT
+       depends on X86_32
+       depends on X86_EXTENDED_PLATFORM
+       select X86_REBOOTFIXUPS
+       ---help---
+         Select this option for the Intel CE media processor (CE4100) SOC.
+         This option compiles in support for the CE4100 SOC for set-top
+         boxes and media devices.
+
 config X86_MRST
        bool "Moorestown MID platform"
        depends on PCI
@@ -385,6 +397,10 @@ config X86_MRST
        depends on X86_EXTENDED_PLATFORM
        depends on X86_IO_APIC
        select APB_TIMER
+       select I2C
+       select SPI
+       select INTEL_SCU_IPC
+       select X86_PLATFORM_DEVICES
        ---help---
          Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
          Internet Device (MID) platform. Moorestown consists of two chips:
@@ -466,6 +482,19 @@ config X86_ES7000
          Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
          supposed to run on an IA32-based Unisys ES7000 system.
 
+config X86_32_IRIS
+       tristate "Eurobraille/Iris poweroff module"
+       depends on X86_32
+       ---help---
+         The Iris machines from EuroBraille do not have APM or ACPI support
+         to shut themselves down properly.  A special I/O sequence is
+         needed to do so, which is what this module does at
+         kernel shutdown.
+
+         This is only for Iris machines from EuroBraille.
+
+         If unused, say N.
+
 config SCHED_OMIT_FRAME_POINTER
        def_bool y
        prompt "Single-depth WCHAN output"
@@ -1141,16 +1170,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
        depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
-config K8_NUMA
+config AMD_NUMA
        def_bool y
        prompt "Old style AMD Opteron NUMA detection"
        depends on X86_64 && NUMA && PCI
        ---help---
-         Enable K8 NUMA node topology detection.  You should say Y here if
-         you have a multi processor AMD K8 system. This uses an old
-         method to read the NUMA configuration directly from the builtin
-         Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-         instead, which also takes priority if both are compiled in.
+         Enable AMD NUMA node topology detection.  You should say Y here if
+         you have a multi processor AMD system. This uses an old method to
+         read the NUMA configuration directly from the builtin Northbridge
+         of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+         which also takes priority if both are compiled in.
 
 config X86_64_ACPI_NUMA
        def_bool y
index b59ee765414ea3891d6d4914485ba366fcee5663..45143bbcfe5e487d53e33bddaa7193ab68a5275e 100644 (file)
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
          feature as well as for the change_page_attr() infrastructure.
          If in doubt, say "N"
 
+config DEBUG_SET_MODULE_RONX
+       bool "Set loadable kernel module data as NX and text as RO"
+       depends on MODULES
+       ---help---
+         This option helps catch unintended modifications to loadable
+         kernel modules' text and read-only data. It also prevents execution
+         of module data. Such protection may interfere with run-time code
+         patching and dynamic kernel tracing - and it might also protect
+         against certain classes of kernel exploits.
+         against certain classes of kernel exploits.
+         If in doubt, say "N".
+
 config DEBUG_NX_TEST
        tristate "Testcase for the NX non-executable stack feature"
        depends on DEBUG_KERNEL && m
index 52f85a196fa033df961d20349ce6b7437409e843..35af09d13dc13b5d41ec7e19e066c7b5b676f30a 100644 (file)
@@ -182,7 +182,7 @@ no_longmode:
        hlt
        jmp     1b
 
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
 
        /*
         * Be careful here startup_64 needs to be at a predictable
index 4a2adaa9aefcc1fe2e42f7c01e29cb3ec61f5842..13009d1af99a33e2bbee39fbe80a194fac85ece4 100644 (file)
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
 extern void alternatives_smp_module_del(struct module *mod);
 extern void alternatives_smp_switch(int smp);
 extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
 #else
 static inline void alternatives_smp_module_add(struct module *mod, char *name,
                                               void *locks, void *locks_end,
index c8517f81b21e73f9f2c428a26f2fb8995f73011f..6aee50d655d12f6792e495c5d5d004aaef9377e2 100644 (file)
@@ -3,36 +3,53 @@
 
 #include <linux/pci.h>
 
-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
 struct bootnode;
 
-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);
 
-struct k8_northbridge_info {
+struct amd_northbridge {
+       struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
        u16 num;
-       u8 gart_supported;
-       struct pci_dev **nb_misc;
+       u64 flags;
+       struct amd_northbridge *nb;
 };
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART                    0x1
+#define AMD_NB_L3_INDEX_DISABLE                0x2
 
 #ifdef CONFIG_AMD_NB
 
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
 {
-       return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+       return amd_northbridges.num;
 }
 
-#else
+static inline int amd_nb_has_feature(int feature)
+{
+       return ((amd_northbridges.flags & feature) == feature);
+}
 
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
 {
-       return NULL;
+       return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
 }
+
+#else
+
+#define amd_nb_num(x)          0
+#define amd_nb_has_feature(x)  false
+#define node_to_amd_nb(x)      NULL
+
 #endif
 
 
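A hypothetical caller of the renamed interface (a sketch for
illustration, not code from this commit) would cache the northbridges
once and then iterate them through the new accessors, for example to
read the GART flush word (register 0x9c) of each misc PCI function:

	#include <linux/kernel.h>
	#include <linux/pci.h>
	#include <asm/amd_nb.h>

	static void dump_gart_flush_words(void)
	{
		int i;
		u32 w;

		if (amd_cache_northbridges() < 0 ||
		    !amd_nb_has_feature(AMD_NB_GART))
			return;

		for (i = 0; i < amd_nb_num(); i++) {
			/* 0x9c is the GART flush-word register */
			pci_read_config_dword(node_to_amd_nb(i)->misc,
					      0x9c, &w);
			pr_info("NB %d: flush word 0x%x\n", i, w);
		}
	}
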
index f6ce0bda3b98a74906cb1c8297f4ba150430699d..cf12007796db95f1a48635a55016ff01a0f811c2 100644 (file)
@@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);
 
 /*
  * On 32bit this is mach-xxx local
index a859ca461fb0432585f952e08337610b1165a204..47a30ff8e51782a31f146c78b458ea6a89bcacc3 100644 (file)
 
 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
 #else
 # define MAX_IO_APICS 128
 # define MAX_LOCAL_APIC 32768
index 8e6218550e774b56fd30f3171bc163bfb511b947..c8bfe63a06de289057321af73c114e3521d9088a 100644 (file)
@@ -124,6 +124,7 @@ enum {
        X86_SUBARCH_LGUEST,
        X86_SUBARCH_XEN,
        X86_SUBARCH_MRST,
+       X86_SUBARCH_CE4100,
        X86_NR_SUBARCHS,
 };
 
index 9479a037419fe1358a96cece0d877a269c71e365..0141b234406fb01f8418320ea49c997fdbd14cbb 100644 (file)
@@ -117,6 +117,10 @@ enum fixed_addresses {
        FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
        FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
        __end_of_permanent_fixed_addresses,
+
+#ifdef CONFIG_X86_MRST
+       FIX_LNW_VRTC,
+#endif
        /*
         * 256 temporary boot-time mappings, used by early_ioremap(),
         * before ioremap() is functional.
index 4aa2bb3b242ab76733e0f7e5ba95454471297c1a..ef328901c80240f4a1471d3e4bdd795daffc6621 100644 (file)
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
        int err;
 
        /* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+       asm volatile("1:  fxrstorq %[fx]\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    _ASM_EXTABLE(1b, 3b)
+                    : [err] "=r" (err)
+                    : [fx] "m" (*fx), "0" (0));
+#else
        asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
                     "2:\n"
                     ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
                     _ASM_EXTABLE(1b, 3b)
                     : [err] "=r" (err)
                     : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
        return err;
 }
 
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                return -EFAULT;
 
        /* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+       asm volatile("1:  fxsaveq %[fx]\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    _ASM_EXTABLE(1b, 3b)
+                    : [err] "=r" (err), [fx] "=m" (*fx)
+                    : "0" (0));
+#else
        asm volatile("1:  rex64/fxsave (%[fx])\n\t"
                     "2:\n"
                     ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                     _ASM_EXTABLE(1b, 3b)
                     : [err] "=r" (err), "=m" (*fx)
                     : [fx] "R" (fx), "0" (0));
+#endif
        if (unlikely(err) &&
            __clear_user(fx, sizeof(struct i387_fxsave_struct)))
                err = -EFAULT;
index a6b28d017c2fb9aae9e5f5d2bae03db4dcadbd43..0c5ca4e30d7bda949a3470ad6c89f391d19ac623 100644 (file)
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
                 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);
 
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 
-extern void probe_nr_irqs_gsi(void);
 extern int get_nr_irqs_gsi(void);
-
 extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
 struct mp_ioapic_gsi{
        u32 gsi_base;
@@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void);
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void)  { }
+static inline void ioapic_and_gsi_init(void) { }
 static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void)     { }
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
index c62c13cb9788f0a1ea664fed073ebbca1ed02f15..eb16e94ae04f79927eb2c5afbf7b6ecd849b1621 100644 (file)
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 extern void mcheck_intel_therm_init(void);
 #else
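
A platform driver can hook the new notifier along the following lines
(a sketch; the handler and init names are hypothetical).  The handler
is called from the thermal interrupt, so it must not sleep:

	#include <linux/kernel.h>
	#include <asm/mce.h>

	/* hypothetical handler -- not part of this commit */
	static int example_thermal_notify(__u64 msr_val)
	{
		/* msr_val carries the raw thermal status bits */
		pr_warn("thermal threshold crossed, status=0x%llx\n",
			(unsigned long long)msr_val);
		return 0;
	}

	static int __init example_thermal_init(void)
	{
		platform_thermal_notify = example_thermal_notify;
		return 0;
	}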
index ef51b501e22a6e53bf4ae7e2d9e2566760f72ee1..24215072d0e1e5894d4643634bfe1ef9786eef55 100644 (file)
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
 
 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+       memcpy(to, from, n);
+}
+
 #else
 static inline struct microcode_ops * __init init_amd_microcode(void)
 {
index c82868e9f905f04779778542298ce5560ae2e865..0c90dd9f05053c83591df6e04ec5d7979fee779f 100644 (file)
@@ -5,8 +5,9 @@
 
 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
+#include <asm/apicdef.h>
 
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
                                 int active_high_low);
 #endif /* CONFIG_ACPI */
 
-#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_LOCAL_APIC)
 
 struct physid_mask {
        unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
        test_and_set_bit(physid, (map).mask)
 
 #define physids_and(dst, src1, src2)                                   \
-       bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+       bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_or(dst, src1, src2)                                    \
-       bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+       bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_clear(map)                                     \
-       bitmap_zero((map).mask, MAX_APICS)
+       bitmap_zero((map).mask, MAX_LOCAL_APIC)
 
 #define physids_complement(dst, src)                           \
-       bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+       bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
 
 #define physids_empty(map)                                     \
-       bitmap_empty((map).mask, MAX_APICS)
+       bitmap_empty((map).mask, MAX_LOCAL_APIC)
 
 #define physids_equal(map1, map2)                              \
-       bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+       bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
 
 #define physids_weight(map)                                    \
-       bitmap_weight((map).mask, MAX_APICS)
+       bitmap_weight((map).mask, MAX_LOCAL_APIC)
 
 #define physids_shift_right(d, s, n)                           \
-       bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+       bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 #define physids_shift_left(d, s, n)                            \
-       bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+       bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
        map->mask[0] = physids;
 }
 
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)                                  \
-       ({                                                              \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;         \
-               physid_set(physid, __physid_mask);                      \
-               __physid_mask;                                          \
-       })
-
 static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 {
        physids_clear(*map);
index 4a7f96d7c188edd92387cdec4a3be36e23028f35..c0a955a9a08784f662a071d976ef57bc0637c8c6 100644 (file)
 
 #ifdef CONFIG_X86_32
 # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
 #endif
 
 /* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h
new file mode 100644 (file)
index 0000000..73668ab
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef _MRST_VRTC_H
+#define _MRST_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern unsigned long vrtc_get_time(void);
+extern int vrtc_set_mmss(unsigned long nowtime);
+
+#endif
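
These accessors mirror the classic CMOS RTC helpers, so callers can use
the standard MC146818 register offsets through the vRTC mapping.  A
minimal sketch (the helper name is illustrative; RTC_SECONDS comes from
<linux/mc146818rtc.h>):

	#include <linux/mc146818rtc.h>	/* RTC_SECONDS et al. */
	#include <asm/mrst-vrtc.h>

	static unsigned char vrtc_read_seconds_sketch(void)
	{
		/* same register layout as a legacy CMOS RTC */
		return vrtc_cmos_read(RTC_SECONDS);
	}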
index 4a711a684b174435bd5aae838515a836101eb389..719f00b28ff5358caf87d736ed5b4100dafdce9e 100644 (file)
@@ -14,7 +14,9 @@
 #include <linux/sfi.h>
 
 extern int pci_mrst_init(void);
-int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];
 
 /*
 * Medfield is the follow-up of Moorestown; it combines the two-chip solution into
@@ -50,4 +52,14 @@ extern void mrst_early_console_init(void);
 
 extern struct console early_hsu_console;
 extern void hsu_early_console_init(void);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ       (1024)
+/*#define MRST_VRTC_PGOFFSET   (0xc00) */
+
+extern void mrst_rtc_init(void);
+
 #endif /* _ASM_X86_MRST_H */
index 86030f63ba02cf0947920a2bdd35a1f0d13ea833..4d0dfa0d998e9f80ce244d86e1fd583513aaaaca 100644 (file)
 #define PACKAGE_THERM_INT_LOW_ENABLE           (1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE           (1 << 24)
 
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE    (1 << 15)
+#define THERM_SHIFT_THRESHOLD0        8
+#define THERM_MASK_THRESHOLD0          (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE    (1 << 23)
+#define THERM_SHIFT_THRESHOLD1        16
+#define THERM_MASK_THRESHOLD1          (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0        (1 << 6)
+#define THERM_LOG_THRESHOLD0           (1 << 7)
+#define THERM_STATUS_THRESHOLD1        (1 << 8)
+#define THERM_LOG_THRESHOLD1           (1 << 9)
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING       (1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC               (1ULL << 1)
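
The new shift/mask pairs compose in the usual read-modify-write
pattern.  A hedged sketch of programming threshold 0 in
IA32_THERM_INTERRUPT (the helper name is illustrative; only the
THERM_*THRESHOLD* macros are new in this patch):

	#include <asm/msr.h>

	static void therm_set_threshold0(int temp_offset)
	{
		u32 l, h;

		rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
		l &= ~THERM_MASK_THRESHOLD0;
		l |= (temp_offset << THERM_SHIFT_THRESHOLD0) &
		     THERM_MASK_THRESHOLD0;
		l |= THERM_INT_THRESHOLD0_ENABLE; /* interrupt on crossing */
		wrmsr(MSR_IA32_THERM_INTERRUPT, l, h);
	}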
index ef9975812c77f0702cc9fb2a1fb616b9cb1b8053..7709c12431b8075761ee36aadf22149fe4adfa4b 100644 (file)
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)
 
 static inline void halt(void)
 {
-       PVOP_VCALL0(pv_irq_ops.safe_halt);
+       PVOP_VCALL0(pv_irq_ops.halt);
 }
 
 static inline void wbinvd(void)
index ca0437c714b2aa3c94195a67dfe49e5eeb349b19..6761292296307163a0f5cbb4af1e27642d33da10 100644 (file)
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
 
 #define PCIBIOS_MIN_CARDBUS_IO 0x4000
 
+extern int pcibios_enabled;
 void pcibios_config_init(void);
 struct pci_bus *pcibios_scan_root(int bus);
 
index d6763b139a844243b9fbb8dc620e633fe7b5825a..db8aa19a08a22d35e608625b251bcee7fa5fe1ee 100644 (file)
@@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void);
 static inline void x86_mrst_early_setup(void) { }
 #endif
 
+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
 #ifndef _SETUP
 
 /*
index 42d412fd8b02cdd369b5cc8db5a67aa5cc7a0770..ce1d54c8a433a6b866977beeb9c6681e0db64746 100644 (file)
  * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
 * cpus on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
+#define MAX_CPUS_PER_UVHUB             64
+#define MAX_CPUS_PER_SOCKET            32
+#define UV_ADP_SIZE                    64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS         32 /* hardware-provided max. */
 #define UV_ITEMS_PER_DESCRIPTOR                8
 /* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT             3
-#define UV_CPUS_PER_ACT_STATUS         32
 #define UV_ACT_STATUS_MASK             0x3
 #define UV_ACT_STATUS_SIZE             2
-#define UV_ADP_SIZE                    32
 #define UV_DISTRIBUTION_SIZE           256
 #define UV_SW_ACK_NPENDING             8
 #define UV_NET_ENDPOINT_INTD           0x38
  * number of destination side software ack resources
  */
 #define DEST_NUM_RESOURCES             8
-#define MAX_CPUS_PER_NODE              32
 /*
  * completion statuses for sending a TLB flush message
  */
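
The layout described in the comment above works out as follows: with
UV_ITEMS_PER_DESCRIPTOR (8) descriptors of 64 bytes each, every cpu's
set occupies 512 bytes, so the base of cpu N's set is (illustrative
arithmetic only, not code from this patch):

	static inline unsigned long bau_desc_set_base(unsigned long base,
						      int cpu)
	{
		/* 8 descriptors * 64 bytes = 512 bytes per set */
		return base + cpu * UV_ITEMS_PER_DESCRIPTOR * 64;
	}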
index 1e994754d323f400b85c5d1d7350c6e52a702ff9..34244b2cd880cff373e744ec34193adb8287ab2a 100644 (file)
@@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT)     += doublefault_32.o
 obj-$(CONFIG_KGDB)             += kgdb.o
 obj-$(CONFIG_VM86)             += vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
 
 obj-$(CONFIG_HPET_TIMER)       += hpet.o
 obj-$(CONFIG_APB_TIMER)                += apb_timer.o
index 71232b941b6c9c6409fd14e9479f3d1625eeaeb0..17c8090fabd4703d324240328e2a15a55299bb70 100644 (file)
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 {
        unsigned int ver = 0;
 
+       if (id >= (MAX_LOCAL_APIC-1)) {
+               printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+               return;
+       }
+
        if (!enabled) {
                ++disabled_cpus;
                return;
@@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
        acpi_register_lapic_address(acpi_lapic_addr);
 
        count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-                                     acpi_parse_sapic, MAX_APICS);
+                                     acpi_parse_sapic, MAX_LOCAL_APIC);
 
        if (!count) {
                x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-                                               acpi_parse_x2apic, MAX_APICS);
+                                       acpi_parse_x2apic, MAX_LOCAL_APIC);
                count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-                                             acpi_parse_lapic, MAX_APICS);
+                                       acpi_parse_lapic, MAX_LOCAL_APIC);
        }
        if (!count && !x2count) {
                printk(KERN_ERR PREFIX "No LAPIC entries present\n");
index 553d0b0d639bf4b8ef1eca720e7fe43f0fa1e662..123608531c8f933b819a3fc7c135748137c8eb5b 100644 (file)
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
        mutex_unlock(&smp_alt);
 }
 
+bool skip_smp_alternatives;
 void alternatives_smp_switch(int smp)
 {
        struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
        printk("lockdep: fixing up alternatives.\n");
 #endif
 
-       if (noreplace_smp || smp_alt_once)
+       if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
                return;
        BUG_ON(!smp && (num_online_cpus() > 1));
 
index 8f6463d8ed0de1ebfece6cb1138a15697f657197..affacb5e0065a1392713da260ecf8abfbab2c405 100644 (file)
 
 static u32 *flush_words;
 
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
        { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
        { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
        {}
 };
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
 
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
 
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+                                       struct pci_device_id *ids)
 {
        do {
                dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
                if (!dev)
                        break;
-       } while (!pci_match_id(&k8_nb_ids[0], dev));
+       } while (!pci_match_id(ids, dev));
        return dev;
 }
 
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
 {
-       int i;
-       struct pci_dev *dev;
+       int i = 0;
+       struct amd_northbridge *nb;
+       struct pci_dev *misc;
 
-       if (k8_northbridges.num)
+       if (amd_nb_num())
                return 0;
 
-       dev = NULL;
-       while ((dev = next_k8_northbridge(dev)) != NULL)
-               k8_northbridges.num++;
+       misc = NULL;
+       while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+               i++;
 
-       /* some CPU families (e.g. family 0x11) do not support GART */
-       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-           boot_cpu_data.x86 == 0x15)
-               k8_northbridges.gart_supported = 1;
+       if (i == 0)
+               return 0;
 
-       k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
-                                         sizeof(void *), GFP_KERNEL);
-       if (!k8_northbridges.nb_misc)
+       nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+       if (!nb)
                return -ENOMEM;
 
-       if (!k8_northbridges.num) {
-               k8_northbridges.nb_misc[0] = NULL;
-               return 0;
-       }
+       amd_northbridges.nb = nb;
+       amd_northbridges.num = i;
 
-       if (k8_northbridges.gart_supported) {
-               flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
-                                     GFP_KERNEL);
-               if (!flush_words) {
-                       kfree(k8_northbridges.nb_misc);
-                       return -ENOMEM;
-               }
-       }
+       misc = NULL;
+       for (i = 0; i != amd_nb_num(); i++) {
+               node_to_amd_nb(i)->misc = misc =
+                       next_northbridge(misc, amd_nb_misc_ids);
+       }
+
+       /* some CPU families (e.g. family 0x11) do not support GART */
+       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+           boot_cpu_data.x86 == 0x15)
+               amd_northbridges.flags |= AMD_NB_GART;
+
+       /*
+        * Some CPU families support L3 Cache Index Disable. There are some
+        * limitations because of E382 and E388 on family 0x10.
+        */
+       if (boot_cpu_data.x86 == 0x10 &&
+           boot_cpu_data.x86_model >= 0x8 &&
+           (boot_cpu_data.x86_model > 0x9 ||
+            boot_cpu_data.x86_mask >= 0x1))
+               amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
-       dev = NULL;
-       i = 0;
-       while ((dev = next_k8_northbridge(dev)) != NULL) {
-               k8_northbridges.nb_misc[i] = dev;
-               if (k8_northbridges.gart_supported)
-                       pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
-       }
-       k8_northbridges.nb_misc[i] = NULL;
        return 0;
 }
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
 
 /* Ignores subdevice/subvendor but as far as I can figure out
    they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
 {
        struct pci_device_id *id;
        u32 vendor = device & 0xffff;
        device >>= 16;
-       for (id = k8_nb_ids; id->vendor; id++)
+       for (id = amd_nb_misc_ids; id->vendor; id++)
                if (vendor == id->vendor && device == id->device)
                        return 1;
        return 0;
 }
 
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+       int i;
+
+       if (!amd_nb_has_feature(AMD_NB_GART))
+               return 0;
+
+       flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+       if (!flush_words) {
+               amd_northbridges.flags &= ~AMD_NB_GART;
+               return -ENOMEM;
+       }
+
+       for (i = 0; i != amd_nb_num(); i++)
+               pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+                                     &flush_words[i]);
+
+       return 0;
+}
+
+void amd_flush_garts(void)
 {
        int flushed, i;
        unsigned long flags;
        static DEFINE_SPINLOCK(gart_lock);
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return;
 
        /* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +130,16 @@ void k8_flush_garts(void)
           that it doesn't matter to serialize more. -AK */
        spin_lock_irqsave(&gart_lock, flags);
        flushed = 0;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
-                                      flush_words[i]|1);
+       for (i = 0; i < amd_nb_num(); i++) {
+               pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+                                      flush_words[i] | 1);
                flushed++;
        }
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                u32 w;
                /* Make sure the hardware actually executed the flush*/
                for (;;) {
-                       pci_read_config_dword(k8_northbridges.nb_misc[i],
+                       pci_read_config_dword(node_to_amd_nb(i)->misc,
                                              0x9c, &w);
                        if (!(w & 1))
                                break;
@@ -129,19 +150,23 @@ void k8_flush_garts(void)
        if (!flushed)
                printk("nothing to flush?\n");
 }
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
 
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
 {
        int err = 0;
 
-       err = cache_k8_northbridges();
+       err = amd_cache_northbridges();
 
        if (err < 0)
-               printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+               printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+       if (amd_cache_gart() < 0)
+               printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+                      "GART support disabled.\n");
 
        return err;
 }
 
 /* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
index 92543c73cf8ed8d085dc581fe8171b3bbb6f939e..7c9ab59653e8bc5e229ba9e96734d20d4db50db5 100644 (file)
@@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 
        if (system_state == SYSTEM_BOOTING) {
                irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+               irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
                /* APB timer irqs are set up as mp_irqs, timer is edge type */
                __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
                if (request_irq(adev->irq, apbt_interrupt_handler,
index b3a16e8f0703d47f50a354223bfe8c6e9382126e..dcd7c83e1659212ea5bab9d1ba0b1f0d8e4942c5 100644 (file)
@@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
  * Do an PCI bus scan by hand because we're running before the PCI
  * subsystem.
  *
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
  * generically. It's probably overkill to always scan all slots because
  * the AGP bridges should be always an own bus on the HT hierarchy,
  * but do it here for future safety.
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void)
                dev_limit = bus_dev_ranges[i].dev_limit;
 
                for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                continue;
 
                        ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void)
                dev_limit = bus_dev_ranges[i].dev_limit;
 
                for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                continue;
 
                        ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void)
                dev_limit = bus_dev_ranges[i].dev_limit;
 
                for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                continue;
 
                        iommu_detected = 1;
@@ -518,7 +518,7 @@ out:
                dev_base = bus_dev_ranges[i].dev_base;
                dev_limit = bus_dev_ranges[i].dev_limit;
                for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                continue;
 
                        write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
index fb7657822aadd7cb0954f6483490b5e9a0279f60..879999a5230fc613a0815cd056b4820ebb5cf95a 100644 (file)
@@ -431,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
        reserved = reserve_eilvt_offset(offset, new);
 
        if (reserved != new) {
-               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
-                      "vector 0x%x was already reserved by another core, "
-                      "APIC%lX=0x%x\n",
-                      smp_processor_id(), new, reserved, reg, old);
+               pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+                      "vector 0x%x, but the register is already in use for "
+                      "vector 0x%x on another cpu\n",
+                      smp_processor_id(), reg, offset, new, reserved);
                return -EINVAL;
        }
 
        if (!eilvt_entry_is_changeable(old, new)) {
-               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
-                      "register already in use, APIC%lX=0x%x\n",
-                      smp_processor_id(), new, reg, old);
+               pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+                      "vector 0x%x, but the register is already in use for "
+                      "vector 0x%x on this cpu\n",
+                      smp_processor_id(), reg, offset, new, old);
                return -EBUSY;
        }
 
@@ -1532,13 +1533,60 @@ static int __init detect_init_APIC(void)
        return 0;
 }
 #else
+
+static int apic_verify(void)
+{
+       u32 features, h, l;
+
+       /*
+        * The APIC feature bit should now be enabled
+        * in `cpuid'
+        */
+       features = cpuid_edx(1);
+       if (!(features & (1 << X86_FEATURE_APIC))) {
+               pr_warning("Could not enable APIC!\n");
+               return -1;
+       }
+       set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /* The BIOS may have set up the APIC at some other address */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       if (l & MSR_IA32_APICBASE_ENABLE)
+               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+       pr_info("Found and enabled local APIC!\n");
+       return 0;
+}
+
+int apic_force_enable(void)
+{
+       u32 h, l;
+
+       if (disable_apic)
+               return -1;
+
+       /*
+        * Some BIOSes disable the local APIC in the APIC_BASE
+        * MSR. This can only be done in software for Intel P6 or later
+        * and AMD K7 (Model > 1) or later.
+        */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+               pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+               l &= ~MSR_IA32_APICBASE_BASE;
+               l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+               wrmsr(MSR_IA32_APICBASE, l, h);
+               enabled_via_apicbase = 1;
+       }
+       return apic_verify();
+}
+
 /*
  * Detect and initialize APIC
  */
 static int __init detect_init_APIC(void)
 {
-       u32 h, l, features;
-
        /* Disabled by kernel option? */
        if (disable_apic)
                return -1;
@@ -1568,38 +1616,12 @@ static int __init detect_init_APIC(void)
                                "you can enable it with \"lapic\"\n");
                        return -1;
                }
-               /*
-                * Some BIOSes disable the local APIC in the APIC_BASE
-                * MSR. This can only be done in software for Intel P6 or later
-                * and AMD K7 (Model > 1) or later.
-                */
-               rdmsr(MSR_IA32_APICBASE, l, h);
-               if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-                       pr_info("Local APIC disabled by BIOS -- reenabling.\n");
-                       l &= ~MSR_IA32_APICBASE_BASE;
-                       l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
-                       wrmsr(MSR_IA32_APICBASE, l, h);
-                       enabled_via_apicbase = 1;
-               }
-       }
-       /*
-        * The APIC feature bit should now be enabled
-        * in `cpuid'
-        */
-       features = cpuid_edx(1);
-       if (!(features & (1 << X86_FEATURE_APIC))) {
-               pr_warning("Could not enable APIC!\n");
-               return -1;
+               if (apic_force_enable())
+                       return -1;
+       } else {
+               if (apic_verify())
+                       return -1;
        }
-       set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-       /* The BIOS may have set up the APIC at some other address */
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       if (l & MSR_IA32_APICBASE_ENABLE)
-               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
-       pr_info("Found and enabled local APIC!\n");
 
        apic_pm_activate();
 
@@ -1687,7 +1709,7 @@ void __init init_apic_mappings(void)
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
 
 int __init APIC_init_uniprocessor(void)
 {
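
Both halves of the new apic_verify()/apic_force_enable() split revolve around MSR_IA32_APICBASE: bit 11 is the global enable flag and bits 12 and up hold the physical base address. A worked sketch of decoding the low word, assuming the usual constant values (MSR_IA32_APICBASE_ENABLE == 1 << 11, MSR_IA32_APICBASE_BASE == 0xfffff000 in the low word):

	/* Sketch: decode the low 32 bits of MSR_IA32_APICBASE. */
	static void decode_apicbase(u32 l)
	{
		int enabled = !!(l & (1 << 11));	/* MSR_IA32_APICBASE_ENABLE */
		u32 base    = l & 0xfffff000;		/* MSR_IA32_APICBASE_BASE   */

		/* e.g. l == 0xfee00900: enabled, base 0xfee00000 (the default) */
	}
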
index 16c2db8750a24d84e339d47b1b9c2341cd144b8c..f6cd5b41077034405045fec84fcde39a4b0b3212 100644 (file)
@@ -1933,8 +1933,7 @@ void disable_IO_APIC(void)
  *
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
 {
        union IO_APIC_reg_00 reg_00;
        physid_mask_t phys_id_present_map;
@@ -1943,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void)
        unsigned char old_id;
        unsigned long flags;
 
-       if (acpi_ioapic)
-               return;
-       /*
-        * Don't check I/O APIC IDs for xAPIC systems.  They have
-        * no meaning without the serial APIC bus.
-        */
-       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-               return;
        /*
         * This is broken; anything with a real cpu count has to
         * circumvent this idiocy regardless.
@@ -2005,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void)
                        physids_or(phys_id_present_map, phys_id_present_map, tmp);
                }
 
-
                /*
                 * We need to adjust the IRQ routing table
                 * if the ID changed.
@@ -2041,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void)
                        apic_printk(APIC_VERBOSE, " ok.\n");
        }
 }
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+       if (acpi_ioapic)
+               return;
+       /*
+        * Don't check I/O APIC IDs for xAPIC systems.  They have
+        * no meaning without the serial APIC bus.
+        */
+       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               return;
+       setup_ioapic_ids_from_mpc_nocheck();
+}
 #endif
 
 int no_timer_check __initdata;
@@ -3593,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic)
        return reg_01.bits.entries + 1;
 }
 
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
 {
        int nr;
 
@@ -3910,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
        return res;
 }
 
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
 {
        unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
        struct resource *ioapic_res;
@@ -3948,6 +3952,8 @@ fake_ioapic_page:
                ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
                ioapic_res++;
        }
+
+       probe_nr_irqs_gsi();
 }
 
 void __init ioapic_insert_resources(void)
@@ -4057,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void)
 
        printk(KERN_INFO "Early APIC setup for system timer0\n");
 #ifndef CONFIG_SMP
-       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+       physid_set_mask_of_physid(boot_cpu_physical_apicid,
+                                        &phys_cpu_present_map);
 #endif
        /* Make sure the irq descriptor is set up */
        cfg = alloc_irq_and_cfg_at(0, 0);
index 927902d90fe6123044536253cd8738f3f9815de2..936613e7711354765b5787a8381748cb54940466 100644 (file)
@@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
 static DEFINE_SPINLOCK(uv_nmi_lock);
 
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+       unsigned long val, *mmr;
+
+       mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+       val = *mmr;
+       early_iounmap(mmr, sizeof(*mmr));
+       return val;
+}
+
 static inline bool is_GRU_range(u64 start, u64 end)
 {
        return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
        return is_ISA_range(start, end) || is_GRU_range(start, end);
 }
 
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
 {
        union uvh_node_id_u node_id;
-       unsigned long *mmr;
-
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
-       node_id.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       union uvh_rh_gam_config_mmr_u  m_n_config;
+       int pnode;
 
        /* Currently, all blades have the same revision number */
+       node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+       m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
        uv_min_hub_revision_id = node_id.s.revision;
 
-       return node_id.s.node_id;
+       pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+       return pnode;
 }
 
 static void __init early_get_apic_pnode_shift(void)
 {
-       unsigned long *mmr;
-
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
-       uvh_apicid.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
        if (!uvh_apicid.v)
                /*
                 * Old bios, use default value
@@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void)
 static void __init uv_set_apicid_hibit(void)
 {
        union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
-       unsigned long *mmr;
 
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE |
-               UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
-       apicid_mask.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
        uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
 }
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-       int nodeid;
+       int pnodeid;
 
        if (!strcmp(oem_id, "SGI")) {
-               nodeid = early_get_nodeid();
+               pnodeid = early_get_pnodeid();
                early_get_apic_pnode_shift();
                x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
                x86_platform.nmi_init = uv_nmi_init;
@@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
                        uv_system_type = UV_X2APIC;
                else if (!strcmp(oem_table_id, "UVH")) {
                        __get_cpu_var(x2apic_extra_bits) =
-                               nodeid << (uvh_apicid.s.pnode_shift - 1);
+                               pnodeid << uvh_apicid.s.pnode_shift;
                        uv_system_type = UV_NON_UNIQUE_APIC;
                        uv_set_apicid_hibit();
                        return 1;
@@ -682,27 +684,32 @@ void uv_nmi_init(void)
 void __init uv_system_init(void)
 {
        union uvh_rh_gam_config_mmr_u  m_n_config;
+       union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
        union uvh_node_id_u node_id;
        unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
-       int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+       int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
        int gnode_extra, max_pnode = 0;
        unsigned long mmr_base, present, paddr;
-       unsigned short pnode_mask;
+       unsigned short pnode_mask, pnode_io_mask;
 
        map_low_mmrs();
 
        m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
        m_val = m_n_config.s.m_skt;
        n_val = m_n_config.s.n_skt;
+       mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+       n_io = mmioh.s.n_io;
        mmr_base =
            uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
            ~UV_MMR_ENABLE;
        pnode_mask = (1 << n_val) - 1;
+       pnode_io_mask = (1 << n_io) - 1;
+
        node_id.v = uv_read_local_mmr(UVH_NODE_ID);
        gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
        gnode_upper = ((unsigned long)gnode_extra  << m_val);
-       printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
-                       n_val, m_val, gnode_upper, gnode_extra);
+       printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+                       n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
 
        printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
@@ -735,7 +742,7 @@ void __init uv_system_init(void)
                for (j = 0; j < 64; j++) {
                        if (!test_bit(j, &present))
                                continue;
-                       pnode = (i * 64 + j);
+                       pnode = (i * 64 + j) & pnode_mask;
                        uv_blade_info[blade].pnode = pnode;
                        uv_blade_info[blade].nr_possible_cpus = 0;
                        uv_blade_info[blade].nr_online_cpus = 0;
@@ -756,6 +763,7 @@ void __init uv_system_init(void)
                /*
                 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
                 */
+               uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
                pnode = uv_apicid_to_pnode(apicid);
                blade = boot_pnode_to_blade(pnode);
@@ -772,7 +780,6 @@ void __init uv_system_init(void)
                uv_cpu_hub_info(cpu)->numa_blade_id = blade;
                uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
                uv_cpu_hub_info(cpu)->pnode = pnode;
-               uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
                uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
                uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -796,7 +803,7 @@ void __init uv_system_init(void)
 
        map_gru_high(max_pnode);
        map_mmr_high(max_pnode);
-       map_mmioh_high(max_pnode);
+       map_mmioh_high(max_pnode & pnode_io_mask);
 
        uv_cpu_init();
        uv_scir_register_cpu_notifier();
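
The early_get_nodeid() -> early_get_pnodeid() change stops returning the raw node id: the pnode is now the node id shifted right once and masked to the width n_skt read from UVH_RH_GAM_CONFIG_MMR. A worked sketch under assumed register values:

	/* Sketch: pnode derivation, assuming n_skt was read as 6. */
	static int pnode_from_node_id(int node_id, int n_skt)
	{
		/* node_id == 0x86: (0x86 >> 1) & 0x3f == 0x43 & 0x3f == 0x03 */
		return (node_id >> 1) & ((1 << n_skt) - 1);
	}
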
index 17ad0336621135a2310f3e9a2e5d9bf3d2f668ba..9ecf81f9b90fb0c73416d958b1aa216b17e1ecfa 100644 (file)
@@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx {
 };
 
 struct amd_l3_cache {
-       struct   pci_dev *dev;
-       bool     can_disable;
+       struct   amd_northbridge *nb;
        unsigned indices;
        u8       subcaches[4];
 };
@@ -311,14 +310,12 @@ struct _cache_attr {
 /*
  * L3 cache descriptors
  */
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
        unsigned int sc0, sc1, sc2, sc3;
        u32 val = 0;
 
-       pci_read_config_dword(l3->dev, 0x1C4, &val);
+       pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
 
        /* calculate subcache sizes */
        l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
        l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
-       struct amd_l3_cache *l3;
-       struct pci_dev *dev = node_to_k8_nb_misc(node);
-
-       l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-       if (!l3) {
-               printk(KERN_WARNING "Error allocating L3 struct\n");
-               return NULL;
-       }
-
-       l3->dev = dev;
-
-       amd_calc_l3_indices(l3);
-
-       return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
-                                          int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+                                       int index)
 {
+       static struct amd_l3_cache *__cpuinitdata l3_caches;
        int node;
 
-       if (boot_cpu_data.x86 != 0x10)
-               return;
-
-       if (index < 3)
-               return;
-
-       /* see errata #382 and #388 */
-       if (boot_cpu_data.x86_model < 0x8)
-               return;
-
-       if ((boot_cpu_data.x86_model == 0x8 ||
-            boot_cpu_data.x86_model == 0x9)
-               &&
-            boot_cpu_data.x86_mask < 0x1)
-                       return;
-
-       /* not in virtualized environments */
-       if (k8_northbridges.num == 0)
+       /* only for L3, and not in virtualized environments */
+       if (index < 3 || amd_nb_num() == 0)
                return;
 
        /*
@@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
         * never freed but this is done only on shutdown so it doesn't matter.
         */
        if (!l3_caches) {
-               int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+               int size = amd_nb_num() * sizeof(struct amd_l3_cache);
 
                l3_caches = kzalloc(size, GFP_ATOMIC);
                if (!l3_caches)
@@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
 
        node = amd_get_nb_id(smp_processor_id());
 
-       if (!l3_caches[node]) {
-               l3_caches[node] = amd_init_l3_cache(node);
-               l3_caches[node]->can_disable = true;
+       if (!l3_caches[node].nb) {
+               l3_caches[node].nb = node_to_amd_nb(node);
+               amd_calc_l3_indices(&l3_caches[node]);
        }
 
-       WARN_ON(!l3_caches[node]);
-
-       this_leaf->l3 = l3_caches[node];
+       this_leaf->l3 = &l3_caches[node];
 }
 
 /*
@@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
 {
        unsigned int reg = 0;
 
-       pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+       pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
 
        /* check whether this slot is activated already */
        if (reg & (3UL << 30))
@@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 {
        int index;
 
-       if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+       if (!this_leaf->l3 ||
+           !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
                return -EINVAL;
 
        index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
                if (!l3->subcaches[i])
                        continue;
 
-               pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+               pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
 
                /*
                 * We need to WBINVD on a core on the node containing the L3
@@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
                wbinvd_on_cpu(cpu);
 
                reg |= BIT(31);
-               pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+               pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
        }
 }
 
@@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+       if (!this_leaf->l3 ||
+           !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
                return -EINVAL;
 
        cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 #define STORE_CACHE_DISABLE(slot)                                      \
 static ssize_t                                                         \
 store_cache_disable_##slot(struct _cpuid4_info *this_leaf,             \
-                           const char *buf, size_t count)              \
+                          const char *buf, size_t count)               \
 {                                                                      \
        return store_cache_disable(this_leaf, buf, count, slot);        \
 }
@@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
                show_cache_disable_1, store_cache_disable_1);
 
 #else  /* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
 #endif /* CONFIG_AMD_NB */
 
 static int
@@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                amd_cpuid4(index, &eax, &ebx, &ecx);
-               amd_check_l3_disable(this_leaf, index);
+               amd_init_l3_cache(this_leaf, index);
        } else {
                cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
        }
@@ -983,30 +944,48 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-#define DEFAULT_SYSFS_CACHE_ATTRS      \
-       &type.attr,                     \
-       &level.attr,                    \
-       &coherency_line_size.attr,      \
-       &physical_line_partition.attr,  \
-       &ways_of_associativity.attr,    \
-       &number_of_sets.attr,           \
-       &size.attr,                     \
-       &shared_cpu_map.attr,           \
-       &shared_cpu_list.attr
-
 static struct attribute *default_attrs[] = {
-       DEFAULT_SYSFS_CACHE_ATTRS,
+       &type.attr,
+       &level.attr,
+       &coherency_line_size.attr,
+       &physical_line_partition.attr,
+       &ways_of_associativity.attr,
+       &number_of_sets.attr,
+       &size.attr,
+       &shared_cpu_map.attr,
+       &shared_cpu_list.attr,
        NULL
 };
 
-static struct attribute *default_l3_attrs[] = {
-       DEFAULT_SYSFS_CACHE_ATTRS,
 #ifdef CONFIG_AMD_NB
-       &cache_disable_0.attr,
-       &cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+       static struct attribute **attrs;
+       int n;
+
+       if (attrs)
+               return attrs;
+
+       n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+               n += 2;
+
+       attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+       if (attrs == NULL)
+               return attrs = default_attrs;
+
+       for (n = 0; default_attrs[n]; n++)
+               attrs[n] = default_attrs[n];
+
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+               attrs[n++] = &cache_disable_0.attr;
+               attrs[n++] = &cache_disable_1.attr;
+       }
+
+       return attrs;
+}
 #endif
-       NULL
-};
 
 static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
@@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
                this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-               if (this_leaf->l3 && this_leaf->l3->can_disable)
-                       ktype_cache.default_attrs = default_l3_attrs;
-               else
-                       ktype_cache.default_attrs = default_attrs;
-
+               ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+               if (this_leaf->l3)
+                       ktype_cache.default_attrs = amd_l3_attrs();
+#endif
                retval = kobject_init_and_add(&(this_object->kobj),
                                              &ktype_cache,
                                              per_cpu(ici_cache_kobject, cpu),
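
amd_l3_attrs() replaces the fixed default_l3_attrs[] with a sysfs attribute array built once at runtime and cached in a static pointer, sized according to whether amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE) holds. The same build-once pattern in miniature, with hypothetical names:

	/* Sketch: build a NULL-terminated attribute list once and cache it. */
	static struct attribute **build_attrs_once(struct attribute **base,
						   struct attribute *extra)
	{
		static struct attribute **cached;
		int n;

		if (cached)
			return cached;

		for (n = 0; base[n]; n++)
			;			/* count the base entries */

		cached = kzalloc((n + 2) * sizeof(*cached), GFP_KERNEL);
		if (!cached)
			return base;		/* fall back to the static list */

		for (n = 0; base[n]; n++)
			cached[n] = base[n];
		cached[n] = extra;		/* cached[n + 1] stays NULL */
		return cached;
	}
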
index 80c482382d5c95e06b71ffdf91b6f5d362bf2b45..5bf2fac52aca7771b6b7827117b9d2b2778fd8ad 100644 (file)
@@ -31,8 +31,6 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
-#define PFX               "mce_threshold: "
-#define VERSION           "version 1.1.1"
 #define NR_BANKS          6
 #define NR_BLOCKS         9
 #define THRESHOLD_MAX     0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
        struct list_head        miscj;
 };
 
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
-       .interrupt_enable       = 0,
-       .threshold_limit        = THRESHOLD_MAX,
-};
-
 struct threshold_bank {
        struct kobject          *kobj;
        struct threshold_block  *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
 struct thresh_restart {
        struct threshold_block  *b;
        int                     reset;
+       int                     set_lvt_off;
+       int                     lvt_off;
        u16                     old_limit;
 };
 
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+       int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+       if (apic < 0) {
+               pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+                      b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       if (apic != msr) {
+               pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+                      b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       return 1;
+};
+
 /* must be called with correct cpu affinity */
 /* Called via smp_call_function_single() */
 static void threshold_restart_bank(void *_tr)
 {
        struct thresh_restart *tr = _tr;
-       u32 mci_misc_hi, mci_misc_lo;
+       u32 hi, lo;
 
-       rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       rdmsr(tr->b->address, lo, hi);
 
-       if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+       if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
                tr->reset = 1;  /* limit cannot be lower than err count */
 
        if (tr->reset) {                /* reset err count and overflow bit */
-               mci_misc_hi =
-                   (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+               hi =
+                   (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
                    (THRESHOLD_MAX - tr->b->threshold_limit);
        } else if (tr->old_limit) {     /* change limit w/o reset */
-               int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+               int new_count = (hi & THRESHOLD_MAX) +
                    (tr->old_limit - tr->b->threshold_limit);
 
-               mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+               hi = (hi & ~MASK_ERR_COUNT_HI) |
                    (new_count & THRESHOLD_MAX);
        }
 
+       if (tr->set_lvt_off) {
+               if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+                       /* set new lvt offset */
+                       hi &= ~MASK_LVTOFF_HI;
+                       hi |= tr->lvt_off << 20;
+               }
+       }
+
        tr->b->interrupt_enable ?
-           (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-           (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+           (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+           (hi &= ~MASK_INT_TYPE_HI);
 
-       mci_misc_hi |= MASK_COUNT_EN_HI;
-       wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       hi |= MASK_COUNT_EN_HI;
+       wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+       struct thresh_restart tr = {
+               .b                      = b,
+               .set_lvt_off            = 1,
+               .lvt_off                = offset,
+       };
+
+       b->threshold_limit              = THRESHOLD_MAX;
+       threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+       if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+                                             APIC_EILVT_MSG_FIX, 0))
+               return new;
+
+       return reserved;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
+       struct threshold_block b;
        unsigned int cpu = smp_processor_id();
        u32 low = 0, high = 0, address = 0;
        unsigned int bank, block;
-       struct thresh_restart tr;
-       int lvt_off = -1;
-       u8 offset;
+       int offset = -1;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                        if (shared_bank[bank] && c->cpu_core_id)
                                break;
 #endif
-                       offset = (high & MASK_LVTOFF_HI) >> 20;
-                       if (lvt_off < 0) {
-                               if (setup_APIC_eilvt(offset,
-                                                    THRESHOLD_APIC_VECTOR,
-                                                    APIC_EILVT_MSG_FIX, 0)) {
-                                       pr_err(FW_BUG "cpu %d, failed to "
-                                              "setup threshold interrupt "
-                                              "for bank %d, block %d "
-                                              "(MSR%08X=0x%x%08x)",
-                                              smp_processor_id(), bank, block,
-                                              address, high, low);
-                                       continue;
-                               }
-                               lvt_off = offset;
-                       } else if (lvt_off != offset) {
-                               pr_err(FW_BUG "cpu %d, invalid threshold "
-                                      "interrupt offset %d for bank %d,"
-                                      "block %d (MSR%08X=0x%x%08x)",
-                                      smp_processor_id(), lvt_off, bank,
-                                      block, address, high, low);
-                               continue;
-                       }
-
-                       high &= ~MASK_LVTOFF_HI;
-                       high |= lvt_off << 20;
-                       wrmsr(address, low, high);
+                       offset = setup_APIC_mce(offset,
+                                               (high & MASK_LVTOFF_HI) >> 20);
 
-                       threshold_defaults.address = address;
-                       tr.b = &threshold_defaults;
-                       tr.reset = 0;
-                       tr.old_limit = 0;
-                       threshold_restart_bank(&tr);
+                       memset(&b, 0, sizeof(b));
+                       b.cpu           = cpu;
+                       b.bank          = bank;
+                       b.block         = block;
+                       b.address       = address;
 
+                       mce_threshold_block_init(&b, offset);
                        mce_threshold_vector = amd_threshold_interrupt;
                }
        }
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
 
        b->interrupt_enable = !!new;
 
+       memset(&tr, 0, sizeof(tr));
        tr.b            = b;
-       tr.reset        = 0;
-       tr.old_limit    = 0;
 
        smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
        if (new < 1)
                new = 1;
 
+       memset(&tr, 0, sizeof(tr));
        tr.old_limit = b->threshold_limit;
        b->threshold_limit = new;
        tr.b = b;
-       tr.reset = 0;
 
        smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
                        continue;
                err = threshold_create_bank(cpu, bank);
                if (err)
-                       goto out;
+                       return err;
        }
-out:
+
        return err;
 }
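
lvt_off_valid() cross-checks the LVT offset that the BIOS programmed into the threshold MSR (bits 20-23 of the high word, via MASK_LVTOFF_HI) against the offset actually reserved through setup_APIC_eilvt(). A small sketch of that extraction, assuming MASK_LVTOFF_HI == 0x00F00000:

	/* Sketch: pull the BIOS-programmed LVT offset out of the MSR high word. */
	static int msr_lvt_off(u32 hi)
	{
		/* hi == 0x01100000 -> offset 1 */
		return (hi & 0x00F00000) >> 20;	/* MASK_LVTOFF_HI, assumed value */
	}
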
 
index 4b683267eca5fb982111375ab9e000d7eda3b437..e12246ff5aa6d2683ae8537a92da76e479a26bf7 100644 (file)
@@ -53,8 +53,13 @@ struct thermal_state {
        struct _thermal_state core_power_limit;
        struct _thermal_state package_throttle;
        struct _thermal_state package_power_limit;
+       struct _thermal_state core_thresh0;
+       struct _thermal_state core_thresh1;
 };
 
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+
 static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level)
        return 0;
 }
 
+static int thresh_event_valid(int event)
+{
+       struct _thermal_state *state;
+       unsigned int this_cpu = smp_processor_id();
+       struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+       u64 now = get_jiffies_64();
+
+       state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+
+       if (time_before64(now, state->next_check))
+               return 0;
+
+       state->next_check = now + CHECK_INTERVAL;
+       return 1;
+}
+
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
 static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device);
 #define PACKAGE_THROTTLED      ((__u64)2 << 62)
 #define PACKAGE_POWER_LIMIT    ((__u64)3 << 62)
 
+static void notify_thresholds(__u64 msr_val)
+{
+       /* check whether the interrupt handler is defined;
+        * otherwise simply return
+        */
+       if (!platform_thermal_notify)
+               return;
+
+       /* lower threshold reached */
+       if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
+               platform_thermal_notify(msr_val);
+       /* higher threshold reached */
+       if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+               platform_thermal_notify(msr_val);
+}
+
 /* Thermal transition interrupt handler */
 static void intel_thermal_interrupt(void)
 {
@@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void)
 
        rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
+       /* Check for violation of core thermal thresholds */
+       notify_thresholds(msr_val);
+
        if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
                                THERMAL_THROTTLING_EVENT,
                                CORE_LEVEL) != 0)
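
platform_thermal_notify is a bare function pointer rather than a notifier chain: a platform driver installs its handler by plain assignment, and notify_thresholds() invokes it whenever either threshold log bit is set, rate-limited by thresh_event_valid(). A hedged usage sketch with a hypothetical handler:

	/* Sketch: a hypothetical platform driver hooking the threshold callback. */
	static int my_thermal_handler(__u64 msr_val)
	{
		/* inspect THERM_LOG_THRESHOLD0/1 in msr_val and react */
		return 0;
	}

	static int __init my_thermal_init(void)
	{
		platform_thermal_notify = my_thermal_handler;
		return 0;
	}
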
index 4572f25f93255f8bb4a5e5158d3c9949f912b657..cd28a350f7f933162a6fa9bd0de08c64e22495f2 100644 (file)
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
                if (!strncmp(buf, "xen", 3))
                        early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+#ifdef CONFIG_EARLY_PRINTK_MRST
                if (!strncmp(buf, "mrst", 4)) {
                        mrst_early_console_init();
                        early_console_register(&early_mrst_console, keep);
@@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf)
                        hsu_early_console_init();
                        early_console_register(&early_hsu_console, keep);
                }
-
 #endif
                buf++;
        }
index 3afb33f14d2d2c86a3c961d87aaae531d2631ac8..298448656b6079d074232518cb16e50895b4a5b8 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/module.h>
 
 #include <trace/syscall.h>
 
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
 int ftrace_arch_code_modify_prepare(void)
 {
        set_kernel_text_rw();
+       set_all_modules_text_rw();
        modifying_code = 1;
        return 0;
 }
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
 int ftrace_arch_code_modify_post_process(void)
 {
        modifying_code = 0;
+       set_all_modules_text_ro();
        set_kernel_text_ro();
        return 0;
 }
index 763310165fa0d1b0e4bf1891632a729e289980e8..7f138b3c3c52cf2d6790d8308fdc9a6dae8b41b7 100644 (file)
@@ -61,6 +61,9 @@ void __init i386_start_kernel(void)
        case X86_SUBARCH_MRST:
                x86_mrst_early_setup();
                break;
+       case X86_SUBARCH_CE4100:
+               x86_ce4100_early_setup();
+               break;
        default:
                i386_default_early_setup();
                break;
index c0dbd9ac24f0d5cf7e87f8f0439275656b877f73..9f54b209c3780c3fdef9951368c6edf0cc081c62 100644 (file)
@@ -139,39 +139,6 @@ ENTRY(startup_32)
        movl %eax, pa(olpc_ofw_pgd)
 #endif
 
-#ifdef CONFIG_PARAVIRT
-       /* This is can only trip for a broken bootloader... */
-       cmpw $0x207, pa(boot_params + BP_version)
-       jb default_entry
-
-       /* Paravirt-compatible boot parameters.  Look to see what architecture
-               we're booting under. */
-       movl pa(boot_params + BP_hardware_subarch), %eax
-       cmpl $num_subarch_entries, %eax
-       jae bad_subarch
-
-       movl pa(subarch_entries)(,%eax,4), %eax
-       subl $__PAGE_OFFSET, %eax
-       jmp *%eax
-
-bad_subarch:
-WEAK(lguest_entry)
-WEAK(xen_entry)
-       /* Unknown implementation; there's really
-          nothing we can do at this point. */
-       ud2a
-
-       __INITDATA
-
-subarch_entries:
-       .long default_entry             /* normal x86/PC */
-       .long lguest_entry              /* lguest hypervisor */
-       .long xen_entry                 /* Xen hypervisor */
-       .long default_entry             /* Moorestown MID */
-num_subarch_entries = (. - subarch_entries) / 4
-.previous
-#endif /* CONFIG_PARAVIRT */
-
 /*
  * Initialize page tables.  This creates a PDE and a set of page
  * tables, which are located immediately beyond __brk_base.  The variable
@@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
  *
  * Note that the stack is not yet set up!
  */
-default_entry:
 #ifdef CONFIG_X86_PAE
 
        /*
@@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
        movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
        movl %eax,pa(initial_page_table+0xffc)
 #endif
-       jmp 3f
+
+#ifdef CONFIG_PARAVIRT
+       /* This can only trip for a broken bootloader... */
+       cmpw $0x207, pa(boot_params + BP_version)
+       jb default_entry
+
+       /* Paravirt-compatible boot parameters.  Look to see what architecture
+               we're booting under. */
+       movl pa(boot_params + BP_hardware_subarch), %eax
+       cmpl $num_subarch_entries, %eax
+       jae bad_subarch
+
+       movl pa(subarch_entries)(,%eax,4), %eax
+       subl $__PAGE_OFFSET, %eax
+       jmp *%eax
+
+bad_subarch:
+WEAK(lguest_entry)
+WEAK(xen_entry)
+       /* Unknown implementation; there's really
+          nothing we can do at this point. */
+       ud2a
+
+       __INITDATA
+
+subarch_entries:
+       .long default_entry             /* normal x86/PC */
+       .long lguest_entry              /* lguest hypervisor */
+       .long xen_entry                 /* Xen hypervisor */
+       .long default_entry             /* Moorestown MID */
+num_subarch_entries = (. - subarch_entries) / 4
+.previous
+#else
+       jmp default_entry
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Non-boot CPU entry point; entered from trampoline.S
  * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -282,7 +283,7 @@ ENTRY(startup_32_smp)
        movl %eax,%fs
        movl %eax,%gs
 #endif /* CONFIG_SMP */
-3:
+default_entry:
 
 /*
  *     New page tables may be in 4Mbyte page mode and may
@@ -316,6 +317,10 @@ ENTRY(startup_32_smp)
        subl $0x80000001, %eax
        cmpl $(0x8000ffff-0x80000001), %eax
        ja 6f
+
+       /* Clear bogus XD_DISABLE bits */
+       call verify_cpu
+
        mov $0x80000001, %eax
        cpuid
        /* Execute Disable bit supported? */
@@ -611,6 +616,8 @@ ignore_int:
 #endif
        iret
 
+#include "verify_cpu.S"
+
        __REFDATA
 .align 4
 ENTRY(initial_code)
@@ -622,13 +629,13 @@ ENTRY(initial_code)
 __PAGE_ALIGNED_BSS
        .align PAGE_SIZE_asm
 #ifdef CONFIG_X86_PAE
-ENTRY(initial_pg_pmd)
+initial_pg_pmd:
        .fill 1024*KPMDS,4,0
 #else
 ENTRY(initial_page_table)
        .fill 1024,4,0
 #endif
-ENTRY(initial_pg_fixmap)
+initial_pg_fixmap:
        .fill 1024,4,0
 ENTRY(empty_zero_page)
        .fill 4096,1,0
index ce0cb4721c9ac9eec8869e64c1fcd1a1ef0fd379..0fe6d1a66c38cf0aaea3383ac000eefbb5d34fca 100644 (file)
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
        return 0;
 }
 
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
-       memcpy(to, from, n);
-       return 0;
-}
-
 static void *
 get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 {
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
        u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
        void *mc;
 
-       if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
-               return NULL;
+       get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
 
        if (section_hdr[0] != UCODE_UCODE_TYPE) {
                pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
                return NULL;
        }
 
-       mc = vmalloc(UCODE_MAX_SIZE);
-       if (mc) {
-               memset(mc, 0, UCODE_MAX_SIZE);
-               if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
-                                  total_size)) {
-                       vfree(mc);
-                       mc = NULL;
-               } else
-                       *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
-       }
+       mc = vzalloc(UCODE_MAX_SIZE);
+       if (!mc)
+               return NULL;
+
+       get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
+       *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+
        return mc;
 }
 
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
        unsigned int *buf_pos = (unsigned int *)container_hdr;
        unsigned long size;
 
-       if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
-               return 0;
+       get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
 
        size = buf_pos[2];
 
@@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf)
        }
 
        buf += UCODE_CONTAINER_HEADER_SIZE;
-       if (get_ucode_data(equiv_cpu_table, buf, size)) {
-               vfree(equiv_cpu_table);
-               return 0;
-       }
+       get_ucode_data(equiv_cpu_table, buf, size);
 
        return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
 }
index ba0f0ca9f280bb0473470fdb71a3fa04faec00f0..c01ffa5b9b87e509da797e359577bbb84463ce5f 100644 (file)
@@ -143,7 +143,7 @@ static void flush_gart(void)
 
        spin_lock_irqsave(&iommu_bitmap_lock, flags);
        if (need_flush) {
-               k8_flush_garts();
+               amd_flush_garts();
                need_flush = false;
        }
        spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
 {
        int i;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return;
 
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
 
                enable_gart_translation(dev, __pa(agp_gatt_table));
        }
 
        /* Flush the GART-TLB to remove stale entries */
-       k8_flush_garts();
+       amd_flush_garts();
 }
 
 /*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
        if (!fix_up_north_bridges)
                return;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return;
 
        pr_info("PCI-DMA: Restoring GART aperture settings\n");
 
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
 
                /*
                 * Don't enable translations just yet.  That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
  * Private Northbridge GATT initialization in case we cannot use the
  * AGP driver for some reason.
  */
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
 {
        unsigned aper_size, gatt_size, new_aper_size;
        unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
        aper_size = aper_base = info->aper_size = 0;
        dev = NULL;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               dev = node_to_amd_nb(i)->misc;
                new_aper_base = read_aperture(dev, &new_aper_size);
                if (!new_aper_base)
                        goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
        if (!no_agp)
                return;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return;
 
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                u32 ctl;
 
-               dev = k8_northbridges.nb_misc[i];
+               dev = node_to_amd_nb(i)->misc;
                pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
 
                ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
        unsigned long scratch;
        long i;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return 0;
 
 #ifndef CONFIG_AGP_AMD64
        no_agp = 1;
 #else
        /* Makefile puts PCI initialization via subsys_initcall first. */
-       /* Add other K8 AGP bridge drivers here */
+       /* Add other AMD AGP bridge drivers here */
        no_agp = no_agp ||
                (agp_amd64_init() < 0) ||
                (agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
        if (no_iommu ||
            (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
            !gart_iommu_aperture ||
-           (no_agp && init_k8_gatt(&info) < 0)) {
+           (no_agp && init_amd_gatt(&info) < 0)) {
                if (max_pfn > MAX_DMA32_PFN) {
                        pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
                        pr_warning("falling back to iommu=soft.\n");
index fda313ebbb03dfc98d5d0f953a1b99846e25dc9a..c8e41e90f59ceb9da7be768fcfe197a83602742e 100644 (file)
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
        outb(1, 0x92);
 }
 
+static void ce4100_reset(struct pci_dev *dev)
+{
+       int i;
+
+       for (i = 0; i < 10; i++) {
+               outb(0x2, 0xcf9);
+               udelay(50);
+       }
+}
+
 struct device_fixup {
        unsigned int vendor;
        unsigned int device;
        void (*reboot_fixup)(struct pci_dev *);
 };
 
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100     0x0708
+
 static const struct device_fixup fixups_table[] = {
 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
 { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
 { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
 };
 
 /*
index a0f52af256a037eaca0fce91e00caad47018d65d..d3cfe26c0252ab24aaae1481c8c297ac096e6c6d 100644 (file)
@@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void)
 void __init setup_arch(char **cmdline_p)
 {
        int acpi = 0;
-       int k8 = 0;
+       int amd = 0;
        unsigned long flags;
 
 #ifdef CONFIG_X86_32
@@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
        acpi = acpi_numa_init();
 #endif
 
-#ifdef CONFIG_K8_NUMA
+#ifdef CONFIG_AMD_NUMA
        if (!acpi)
-               k8 = !k8_numa_init(0, max_pfn);
+               amd = !amd_numa_init(0, max_pfn);
 #endif
 
-       initmem_init(0, max_pfn, acpi, k8);
+       initmem_init(0, max_pfn, acpi, amd);
        memblock_find_dma_reserve();
        dma32_reserve_bootmem();
 
@@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
        init_apic_mappings();
-       ioapic_init_mappings();
-
-       /* need to wait for io_apic is mapped */
-       probe_nr_irqs_gsi();
+       ioapic_and_gsi_init();
 
        kvm_guest_init();
 
index 68f61ac632e1d814eaa5ab0604505f9b72fc6f80..ee886fe10ef4eb8515ae1d2431c0ce8ad3e8dd89 100644 (file)
@@ -1161,6 +1161,20 @@ out:
        preempt_enable();
 }
 
+void arch_disable_nonboot_cpus_begin(void)
+{
+       /*
+        * Avoid the SMP alternatives switch during disable_nonboot_cpus().
+        * In the suspend path, we will be back in SMP mode shortly anyway.
+        */
+       skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+       skip_smp_alternatives = false;
+}
+
 void arch_enable_nonboot_cpus_begin(void)
 {
        set_mtrr_aps_delayed_init();
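
The new arch_disable_nonboot_cpus_begin()/_end() hooks bracket the CPU-offline phase of suspend so that SMP-alternatives patching is skipped while the non-boot CPUs go down; the generic hotplug code presumably calls them around the offline loop, roughly:

	/* Sketch of the assumed call order in disable_nonboot_cpus(). */
	arch_disable_nonboot_cpus_begin();	/* skip_smp_alternatives = true  */
	/* ... take each non-boot CPU offline ... */
	arch_disable_nonboot_cpus_end();	/* skip_smp_alternatives = false */
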
index 3af2dff58b213262d403a8f9c1c55f84920b1d8d..075d130efcf9019eb3f4745677e281533335fa11 100644 (file)
@@ -127,7 +127,7 @@ startup_64:
 no_longmode:
        hlt
        jmp no_longmode
-#include "verify_cpu_64.S"
+#include "verify_cpu.S"
 
        # Careful these need to be in the same 64K segment as the above;
 tidt:
index 0c40d8b72416ba2ef7e86bfd812b7bf6f1db2f8f..356a0d455cf997cb1bd586d3a13fd8a7c16d4d3c 100644 (file)
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
 
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                return 0;
+
+       if (tsc_clocksource_reliable)
+               return 0;
        /*
         * Intel systems are normally all synchronized.
         * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
                /* assume multi socket systems are not synchronized: */
                if (num_possible_cpus() > 1)
-                       tsc_unstable = 1;
+                       return 1;
        }
 
-       return tsc_unstable;
+       return 0;
+}
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of one
+ * second to further refine the TSC freq value. Since this is
+ * timer based rather than loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by more than 1% from the
+ * fast early calibration, we throw out the new calibration and
+ * use the early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+       static u64 tsc_start = -1, ref_start;
+       static int hpet;
+       u64 tsc_stop, ref_stop, delta;
+       unsigned long freq;
+
+       /* Don't bother refining TSC on unstable systems */
+       if (check_tsc_unstable())
+               goto out;
+
+       /*
+        * Since the work is started early in boot, we may be
+        * delayed the first time we expire. So requeue the
+        * work once we know timers are working.
+        */
+       if (tsc_start == -1) {
+               /*
+                * Only set hpet once, to avoid mixing hardware
+                * if the hpet becomes enabled later.
+                */
+               hpet = is_hpet_enabled();
+               schedule_delayed_work(&tsc_irqwork, HZ);
+               tsc_start = tsc_read_refs(&ref_start, hpet);
+               return;
+       }
+
+       tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+       /* hpet or pmtimer available? */
+       if (!hpet && !ref_start && !ref_stop)
+               goto out;
+
+       /* Check whether the sampling was disturbed by an SMI */
+       if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+               goto out;
+
+       delta = tsc_stop - tsc_start;
+       delta *= 1000000LL;
+       if (hpet)
+               freq = calc_hpet_ref(delta, ref_start, ref_stop);
+       else
+               freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+       /* Make sure we're within 1% */
+       if (abs(tsc_khz - freq) > tsc_khz/100)
+               goto out;
+
+       tsc_khz = freq;
+       printk(KERN_INFO "Refined TSC clocksource calibration: "
+               "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+                                       (unsigned long)tsc_khz % 1000);
+
+out:
+       clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
 
-static void __init init_tsc_clocksource(void)
+
+static int __init init_tsc_clocksource(void)
 {
+       if (!cpu_has_tsc || tsc_disabled > 0)
+               return 0;
+
        if (tsc_clocksource_reliable)
                clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
        /* lower the rating if we already know its unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
                clocksource_tsc.rating = 0;
                clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
        }
-       clocksource_register_khz(&clocksource_tsc, tsc_khz);
+       schedule_delayed_work(&tsc_irqwork, 0);
+       return 0;
 }
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
 
 void __init tsc_init(void)
 {
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
                mark_tsc_unstable("TSCs unsynchronized");
 
        check_system_tsc_reliable();
-       init_tsc_clocksource();
 }
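
tsc_refine_calibration_work() is a two-phase delayed work item: on its first expiry it records a start sample and requeues itself for one second later; on the second it takes the stop sample and computes the refined frequency. The self-rescheduling skeleton, reduced to its bones (read_sample() and use_delta() are hypothetical stand-ins):

	#include <linux/workqueue.h>

	/* Sketch: a delayed work item that samples twice, one second apart. */
	static void refine_work_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(refine_work, refine_work_fn);

	static void refine_work_fn(struct work_struct *work)
	{
		static u64 start = -1;

		if (start == -1) {			/* first pass */
			schedule_delayed_work(&refine_work, HZ);
			start = read_sample();		/* hypothetical sampler */
			return;				/* come back in one second */
		}
		use_delta(read_sample() - start);	/* second pass */
	}
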
 
similarity index 65%
rename from arch/x86/kernel/verify_cpu_64.S
rename to arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9af28e0c24ee6a7dff4122f73685b..0edefc19a113290792667afe816bfdecaa0c8fc3 100644 (file)
@@ -7,6 +7,7 @@
  *     Copyright (c) 2007  Andi Kleen (ak@suse.de)
  *     Copyright (c) 2007  Eric Biederman (ebiederm@xmission.com)
  *     Copyright (c) 2007  Vivek Goyal (vgoyal@in.ibm.com)
+ *     Copyright (c) 2010  Kees Cook (kees.cook@canonical.com)
  *
  *     This source code is licensed under the GNU General Public License,
  *     Version 2.  See the file COPYING for more details.
 *     This is common code for verifying whether the CPU supports
 *     long mode and SSE. It is not called directly; instead, this
 *     file is included at various places and compiled in that context.
- *     Following are the current usage.
+ *     This file is expected to run in 32bit code.  Currently:
  *
- *     This file is included by both 16bit and 32bit code.
+ *     arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ *     arch/x86/kernel/trampoline_64.S: secondary processor verification
+ *     arch/x86/kernel/head_32.S: processor startup
  *
- *     arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- *     arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- *     arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- *     arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- *     verify_cpu, returns the status of cpu check in register %eax.
+ *     verify_cpu returns the status of long mode and SSE in register %eax.
  *             0: Success    1: Failure
  *
+ *     On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
 *     The caller needs to check the error code and take the
 *     appropriate action: either display a message or halt.
  */
@@ -62,8 +62,41 @@ verify_cpu:
        cmpl    $0x444d4163,%ecx
        jnz     verify_cpu_noamd
        mov     $1,%di                  # cpu is from AMD
+       jmp     verify_cpu_check
 
 verify_cpu_noamd:
+       cmpl    $0x756e6547,%ebx        # GenuineIntel?
+       jnz     verify_cpu_check
+       cmpl    $0x49656e69,%edx
+       jnz     verify_cpu_check
+       cmpl    $0x6c65746e,%ecx
+       jnz     verify_cpu_check
+
+       # only call IA32_MISC_ENABLE when:
+       # family > 6 || (family == 6 && model >= 0xd)
+       movl    $0x1, %eax              # check CPU family and model
+       cpuid
+       movl    %eax, %ecx
+
+       andl    $0x0ff00f00, %eax       # mask family and extended family
+       shrl    $8, %eax
+       cmpl    $6, %eax
+       ja      verify_cpu_clear_xd     # family > 6, ok
+       jb      verify_cpu_check        # family < 6, skip
+
+       andl    $0x000f00f0, %ecx       # mask model and extended model
+       shrl    $4, %ecx
+       cmpl    $0xd, %ecx
+       jb      verify_cpu_check        # family == 6, model < 0xd, skip
+
+verify_cpu_clear_xd:
+       movl    $MSR_IA32_MISC_ENABLE, %ecx
+       rdmsr
+       btrl    $2, %edx                # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+       jnc     verify_cpu_check        # only write MSR if bit was changed
+       wrmsr
+
+verify_cpu_check:
        movl    $0x1,%eax               # Does the cpu have what it takes
        cpuid
        andl    $REQUIRED_MASK0,%edx
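
The family/model gate above is easier to read in C. A hedged sketch
(sketch_clear_xd_disable is a made-up name; cpuid_eax(), rdmsrl() and
wrmsrl() are the kernel's usual helpers, and bit 34 of
MSR_IA32_MISC_ENABLE is the XD_DISABLE flag, i.e. bit 2 of %edx after
rdmsr). The masking mirrors the assembly; it is not a general decoder:

    static void sketch_clear_xd_disable(void)
    {
            u32 eax = cpuid_eax(0x1);
            u32 fam = (eax & 0x0ff00f00) >> 8;  /* family + ext family */
            u32 mod = (eax & 0x000f00f0) >> 4;  /* model + ext model   */
            u64 misc;

            if (fam < 6 || (fam == 6 && mod < 0xd))
                    return;                     /* old CPU, skip MSR   */

            rdmsrl(MSR_IA32_MISC_ENABLE, misc);
            if (misc & (1ULL << 34)) {          /* XD_DISABLE set?     */
                    misc &= ~(1ULL << 34);      /* re-enable NX        */
                    wrmsrl(MSR_IA32_MISC_ENABLE, misc);
            }
    }
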
index e03530aebfd0332635f901afa9f19f00317fa097..bf4700755184e32d4b4e549bd19f4014caa46468 100644 (file)
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
 
 PHDRS {
        text PT_LOAD FLAGS(5);          /* R_E */
-       data PT_LOAD FLAGS(7);          /* RWE */
+       data PT_LOAD FLAGS(6);          /* RW_ */
 #ifdef CONFIG_X86_64
        user PT_LOAD FLAGS(5);          /* R_E */
 #ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
 
        EXCEPTION_TABLE(16) :text = 0x9090
 
+#if defined(CONFIG_DEBUG_RODATA)
+       /* .text should occupy whole number of pages */
+       . = ALIGN(PAGE_SIZE);
+#endif
        X64_ALIGN_DEBUG_RODATA_BEGIN
        RO_DATA(PAGE_SIZE)
        X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
                __bss_start = .;
                *(.bss..page_aligned)
                *(.bss)
-               . = ALIGN(4);
+               . = ALIGN(PAGE_SIZE);
                __bss_stop = .;
        }
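
The FLAGS() arguments above are the standard ELF program-header
permission bits, so changing the data segment from 7 to 6 strips its
execute permission. In C terms:

    #define PF_X 0x1                /* executable */
    #define PF_W 0x2                /* writable   */
    #define PF_R 0x4                /* readable   */

    /* FLAGS(5) == PF_R | PF_X        -> text: read + execute          */
    /* FLAGS(7) == PF_R | PF_W | PF_X -> old data: read/write/execute  */
    /* FLAGS(6) == PF_R | PF_W        -> new data: read/write, no exec */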
 
index e7d5382ef26344534b0883a11d02b19a82d4a9cd..4f420c2f2d5534ea4ac5af20292e25c4346c3cda 100644 (file)
@@ -4,7 +4,6 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>
-#include <asm/pgtable.h>
 
 /*G:020
  * Our story starts with the kernel booting into startup_32 in
@@ -38,113 +37,9 @@ ENTRY(lguest_entry)
        /* Set up the initial stack so we can run C code. */
        movl $(init_thread_union+THREAD_SIZE),%esp
 
-       call init_pagetables
-
        /* Jumps are relative: we're running __PAGE_OFFSET too low. */
        jmp lguest_init+__PAGE_OFFSET
 
-/*
- * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base.  The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- *
- * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they
- * don't have a stack at this point, so we can't just use call and ret.
- */
-init_pagetables:
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-#define pa(X) ((X) - __PAGE_OFFSET)
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = \
-       PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
-#ifdef CONFIG_X86_PAE
-
-       /*
-        * In PAE mode initial_page_table is statically defined to contain
-        * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
-        * entries). The identity mapping is handled by pointing two PGD entries
-        * to the first kernel PMD.
-        *
-        * Note the upper half of each PMD or PTE are always zero at this stage.
-        */
-
-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
-
-       xorl %ebx,%ebx                          /* %ebx is kept at zero */
-
-       movl $pa(__brk_base), %edi
-       movl $pa(initial_pg_pmd), %edx
-       movl $PTE_IDENT_ATTR, %eax
-10:
-       leal PDE_IDENT_ATTR(%edi),%ecx          /* Create PMD entry */
-       movl %ecx,(%edx)                        /* Store PMD entry */
-                                               /* Upper half already zero */
-       addl $8,%edx
-       movl $512,%ecx
-11:
-       stosl
-       xchgl %eax,%ebx
-       stosl
-       xchgl %eax,%ebx
-       addl $0x1000,%eax
-       loop 11b
-
-       /*
-        * End condition: we must map up to the end + MAPPING_BEYOND_END.
-        */
-       movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-       cmpl %ebp,%eax
-       jb 10b
-1:
-       addl $__PAGE_OFFSET, %edi
-       movl %edi, pa(_brk_end)
-       shrl $12, %eax
-       movl %eax, pa(max_pfn_mapped)
-
-       /* Do early initialization of the fixmap area */
-       movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
-       movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
-#else  /* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
-       movl $pa(__brk_base), %edi
-       movl $pa(initial_page_table), %edx
-       movl $PTE_IDENT_ATTR, %eax
-10:
-       leal PDE_IDENT_ATTR(%edi),%ecx          /* Create PDE entry */
-       movl %ecx,(%edx)                        /* Store identity PDE entry */
-       movl %ecx,page_pde_offset(%edx)         /* Store kernel PDE entry */
-       addl $4,%edx
-       movl $1024, %ecx
-11:
-       stosl
-       addl $0x1000,%eax
-       loop 11b
-       /*
-        * End condition: we must map up to the end + MAPPING_BEYOND_END.
-        */
-       movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-       cmpl %ebp,%eax
-       jb 10b
-       addl $__PAGE_OFFSET, %edi
-       movl %edi, pa(_brk_end)
-       shrl $12, %eax
-       movl %eax, pa(max_pfn_mapped)
-
-       /* Do early initialization of the fixmap area */
-       movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
-       movl %eax,pa(initial_page_table+0xffc)
-#endif
-       ret
-
 /*G:055
  * We create a macro which puts the assembler code between lgstart_ and lgend_
  * markers.  These templates are put in the .text section: they can't be
index 55543397a8a795f295ef6e3731e6b9373c953958..09df2f9a3d69ce36a20ec86c82bc9b719d44a1ae 100644 (file)
@@ -23,7 +23,7 @@ mmiotrace-y                   := kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)   += testmmiotrace.o
 
 obj-$(CONFIG_NUMA)             += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA)          += k8topology_64.o
+obj-$(CONFIG_AMD_NUMA)         += amdtopology_64.o
 obj-$(CONFIG_ACPI_NUMA)                += srat_$(BITS).o
 
 obj-$(CONFIG_HAVE_MEMBLOCK)            += memblock.o
similarity index 94%
rename from arch/x86/mm/k8topology_64.c
rename to arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14f6aba0cc36616d85ab882a121685..51fae9cfdecb39ba87149dc76f128f6e12e73540 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
  * Discover the memory map and associated nodes.
  *
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
  *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
@@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void)
 {
        /*
         * need to get the APIC ID of the BSP so can use that to
-        * create apicid_to_node in k8_scan_nodes()
+        * create apicid_to_node in amd_scan_nodes()
         */
 #ifdef CONFIG_X86_MPPARSE
        /*
@@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void)
        early_init_lapic_mapping();
 }
 
-int __init k8_get_nodes(struct bootnode *physnodes)
+int __init amd_get_nodes(struct bootnode *physnodes)
 {
        int i;
        int ret = 0;
@@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes)
        return ret;
 }
 
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long start = PFN_PHYS(start_pfn);
        unsigned long end = PFN_PHYS(end_pfn);
@@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
        return 0;
 }
 
-int __init k8_scan_nodes(void)
+int __init amd_scan_nodes(void)
 {
        unsigned int bits;
        unsigned int cores;
index c0e28a13de7df55c1ee1b173b61fd2d18c50e49c..947f42abe820eed9e47388ff3fcfd6fc937bb96a 100644 (file)
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
        /*
         * We just marked the kernel text read only above, now that
         * we are going to free part of that, we need to make that
-        * writeable first.
+        * writeable and non-executable first.
         */
+       set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
        set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
 
        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
index 0e969f9f401b72bddf403325e2a3a862199ee07b..f89b5bb4e93f82926f339054aa37f5bd4e829216 100644 (file)
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 
 static inline int is_kernel_text(unsigned long addr)
 {
-       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+       if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
                return 1;
        return 0;
 }
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 }
 
+static void mark_nxdata_nx(void)
+{
+       /*
+        * When this is called, init has already been executed and released,
+        * so everything past _etext should be NX.
+        */
+       unsigned long start = PFN_ALIGN(_etext);
+       /*
+        * This comes from the is_kernel_text() upper limit, rounded up to a
+        * full HPAGE where large pages are used:
+        */
+       unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+       if (__supported_pte_mask & _PAGE_NX)
+               printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+       set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
 void mark_rodata_ro(void)
 {
        unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
        printk(KERN_INFO "Testing CPA: write protecting again\n");
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
+       mark_nxdata_nx();
 }
 #endif
 
index 7ffc9b727efdc95ee6748acd1a1b646c8e164ffb..7762a517d69d9233a7a6e419a5eaa9ae6c94ee53 100644 (file)
@@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
 static int __init setup_physnodes(unsigned long start, unsigned long end,
-                                       int acpi, int k8)
+                                       int acpi, int amd)
 {
        int nr_nodes = 0;
        int ret = 0;
@@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
        if (acpi)
                nr_nodes = acpi_get_nodes(physnodes);
 #endif
-#ifdef CONFIG_K8_NUMA
-       if (k8)
-               nr_nodes = k8_get_nodes(physnodes);
+#ifdef CONFIG_AMD_NUMA
+       if (amd)
+               nr_nodes = amd_get_nodes(physnodes);
 #endif
        /*
         * Basic sanity checking on the physical node map: there may be errors
-        * if the SRAT or K8 incorrectly reported the topology or the mem=
+        * if the SRAT or AMD code incorrectly reported the topology or the mem=
         * kernel parameter is used.
         */
        for (i = 0; i < nr_nodes; i++) {
@@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
  * numa=fake command-line option.
  */
 static int __init numa_emulation(unsigned long start_pfn,
-                       unsigned long last_pfn, int acpi, int k8)
+                       unsigned long last_pfn, int acpi, int amd)
 {
        u64 addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = last_pfn << PAGE_SHIFT;
@@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn,
        int num_nodes;
        int i;
 
-       num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
+       num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
        /*
         * If the numa=fake command-line contains a 'M' or 'G', it represents
         * the fixed node size.  Otherwise, if it is just a single number N,
@@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 #endif /* CONFIG_NUMA_EMU */
 
 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
-                               int acpi, int k8)
+                               int acpi, int amd)
 {
        int i;
 
@@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
        nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-       if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
+       if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
@@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
        nodes_clear(node_online_map);
 #endif
 
-#ifdef CONFIG_K8_NUMA
-       if (!numa_off && k8 && !k8_scan_nodes())
+#ifdef CONFIG_AMD_NUMA
+       if (!numa_off && amd && !amd_scan_nodes())
                return;
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
index 532e7933d606fdbdde77aacdb2de746f249d2bf2..8b830ca14ac46c08facc1a848ddcb3c42c0d56cf 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/pfn.h>
 #include <linux/percpu.h>
 #include <linux/gfp.h>
+#include <linux/pci.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                                   unsigned long pfn)
 {
        pgprot_t forbidden = __pgprot(0);
+       pgprot_t required = __pgprot(0);
 
        /*
         * The BIOS area between 640k and 1Mb needs to be executable for
         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
         */
-       if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+       if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_NX;
+#endif
 
        /*
         * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
        if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
                   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_RW;
+       /*
+        * .data and .bss should always be writable.
+        */
+       if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+           within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+               pgprot_val(required) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
        /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+       prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
        return prot;
 }
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 {
        unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
        pte_t new_pte, old_pte, *tmp;
-       pgprot_t old_prot, new_prot;
+       pgprot_t old_prot, new_prot, req_prot;
        int i, do_split = 1;
        unsigned int level;
 
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
         * We are safe now. Check whether the new pgprot is the same:
         */
        old_pte = *kpte;
-       old_prot = new_prot = pte_pgprot(old_pte);
+       old_prot = new_prot = req_prot = pte_pgprot(old_pte);
 
-       pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
-       pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+       pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+       pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 
        /*
         * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
        pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
        cpa->pfn = pfn;
 
-       new_prot = static_protections(new_prot, address, pfn);
+       new_prot = static_protections(req_prot, address, pfn);
 
        /*
         * We need to check the full range, whether
         * static_protection() requires a different pgprot for one of
         * the pages in the range we try to preserve:
         */
-       addr = address + PAGE_SIZE;
-       pfn++;
-       for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
-               pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+       addr = address & pmask;
+       pfn = pte_pfn(old_pte);
+       for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+               pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 
                if (pgprot_val(chk_prot) != pgprot_val(new_prot))
                        goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
         * that we limited the number of possible pages already to
         * the number of pages in the large page.
         */
-       if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+       if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
                /*
                 * The address is aligned and the number of pages
                 * covers the full page.
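
The new "required" mask is applied after "forbidden", so a protection
that static_protections() insists on wins over one the caller tried to
clear. A hedged sketch of just the combine step (combine_prot is a
made-up name; __pgprot() and pgprot_val() are the real accessors):

    static pgprot_t combine_prot(pgprot_t prot, pgprot_t forbidden,
                                 pgprot_t required)
    {
            /* strip bits that must never be set on this range ... */
            prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
            /* ... then force bits that must always be set, e.g.
             * _PAGE_RW on .data/.bss, even if the caller cleared them */
            prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
            return prot;
    }
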
index a3250aa34086fce7d376e9e1e464fa2e996dbb6d..410531d3c292d20cde9b40a487711930eab51eb2 100644 (file)
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
 {
        if (!cpu_has_nx) {
                printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
-                      "missing in CPU or disabled in BIOS!\n");
+                      "missing in CPU!\n");
        } else {
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
                if (disable_nx) {
index a17dffd136c143898e91187cbd39d005b05779b6..f16434568a51da26ea8524ae11fa84f0c7405717 100644 (file)
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
        /* mark this node as "seen" in node bitmap */
        BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
 
+       /* don't need to check apic_id here, because it is always 8 bits */
        apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
 
        printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
index a35cb9d8b0606bc8f7123cd15f0017972a5e8dda..171a0aacb99a0874373619f4fd51ed955e2ddb9e 100644 (file)
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
        }
 
        apic_id = pa->apic_id;
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
        apicid_to_node[apic_id] = node;
        node_set(node, cpu_nodes_parsed);
        acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
                apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
        else
                apic_id = pa->apic_id;
+
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
+
        apicid_to_node[apic_id] = node;
        node_set(node, cpu_nodes_parsed);
        acpi_numa = 1;
index 51104b33fd5140a7ceef1bbfbee3642d0ec2ba61..c3b8e24f2b16f4f6441c286268a61ac6a320b7c3 100644 (file)
@@ -610,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
                ret = setup_ibs_ctl(i);
                if (ret)
                        return ret;
+               pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
                return 0;
        }
 
index effd96e33f16690c3dd318de9886bee75e5fd699..6b8759f7634e661de3983dbc7e6accb26e939a8c 100644 (file)
@@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC)          += olpc.o
 obj-$(CONFIG_PCI_XEN)          += xen.o
 
 obj-y                          += fixup.o
+obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
 obj-$(CONFIG_ACPI)             += acpi.o
 obj-y                          += legacy.o irq.o
 
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
new file mode 100644 (file)
index 0000000..85b68ef
--- /dev/null
@@ -0,0 +1,315 @@
+/*
+ *  GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2010 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *  The full GNU General Public License is included in this distribution
+ *  in the file called LICENSE.GPL.
+ *
+ *  Contact Information:
+ *    Intel Corporation
+ *    2200 Mission College Blvd.
+ *    Santa Clara, CA  97052
+ *
+ * This provides access methods for PCI registers that misbehave on
+ * the CE4100. Each register can be assigned a private init, read and
+ * write routine. The exception to this is the bridge device.  The
+ * bridge device is the only device on bus zero (0) that requires any
+ * fixup, so it is a special case at the moment.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+
+#include <asm/pci_x86.h>
+
+struct sim_reg {
+       u32 value;
+       u32 mask;
+};
+
+struct sim_dev_reg {
+       int dev_func;
+       int reg;
+       void (*init)(struct sim_dev_reg *reg);
+       void (*read)(struct sim_dev_reg *reg, u32 *value);
+       void (*write)(struct sim_dev_reg *reg, u32 value);
+       struct sim_reg sim_reg;
+};
+
+struct sim_reg_op {
+       void (*init)(struct sim_dev_reg *reg);
+       void (*read)(struct sim_dev_reg *reg, u32 value);
+       void (*write)(struct sim_dev_reg *reg, u32 value);
+};
+
+#define MB (1024 * 1024)
+#define KB (1024)
+#define SIZE_TO_MASK(size) (~(size - 1))
+
+#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
+{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
+       {0, SIZE_TO_MASK(size)} },
+
+static void reg_init(struct sim_dev_reg *reg)
+{
+       pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
+                             &reg->sim_reg.value);
+}
+
+static void reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pci_config_lock, flags);
+       *value = reg->sim_reg.value;
+       raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void reg_write(struct sim_dev_reg *reg, u32 value)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pci_config_lock, flags);
+       reg->sim_reg.value = (value & reg->sim_reg.mask) |
+               (reg->sim_reg.value & ~reg->sim_reg.mask);
+       raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void sata_reg_init(struct sim_dev_reg *reg)
+{
+       pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
+                             &reg->sim_reg.value);
+       reg->sim_reg.value += 0x400;
+}
+
+static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+       reg_read(reg, value);
+       if (*value != reg->sim_reg.mask)
+               *value |= 0x100;
+}
+
+void sata_revid_init(struct sim_dev_reg *reg)
+{
+       reg->sim_reg.value = 0x01060100;
+       reg->sim_reg.mask = 0;
+}
+
+static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
+{
+       reg_read(reg, value);
+}
+
+static struct sim_dev_reg bus1_fixups[] = {
+       DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(9, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
+       DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+       DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x8,  0, sata_revid_init, sata_revid_read, 0)
+       DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
+       DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
+};
+
+static void __init init_sim_regs(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+               if (bus1_fixups[i].init)
+                       bus1_fixups[i].init(&bus1_fixups[i]);
+       }
+}
+
+static inline void extract_bytes(u32 *value, int reg, int len)
+{
+       uint32_t mask;
+
+       *value >>= ((reg & 3) * 8);
+       mask = 0xFFFFFFFF >> ((4 - len) * 8);
+       *value &= mask;
+}
+
+int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
+{
+       u32 av_bridge_base, av_bridge_limit;
+       int retval = 0;
+
+       switch (reg) {
+       /* Make BARs appear to not request any memory. */
+       case PCI_BASE_ADDRESS_0:
+       case PCI_BASE_ADDRESS_0 + 1:
+       case PCI_BASE_ADDRESS_0 + 2:
+       case PCI_BASE_ADDRESS_0 + 3:
+               *value = 0;
+               break;
+
+               /* The subordinate bus number register is hardwired
+                * to zero and read-only, so simulate it here.
+                */
+       case PCI_PRIMARY_BUS:
+               if (len == 4)
+                       *value = 0x00010100;
+               break;
+
+       case PCI_SUBORDINATE_BUS:
+               *value = 1;
+               break;
+
+       case PCI_MEMORY_BASE:
+       case PCI_MEMORY_LIMIT:
+               /* Get the A/V bridge base address. */
+               pci_direct_conf1.read(0, 0, devfn,
+                               PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
+
+               av_bridge_limit = av_bridge_base + (512*MB - 1);
+               av_bridge_limit >>= 16;
+               av_bridge_limit &= 0xFFF0;
+
+               av_bridge_base >>= 16;
+               av_bridge_base &= 0xFFF0;
+
+               if (reg == PCI_MEMORY_LIMIT)
+                       *value = av_bridge_limit;
+               else if (len == 2)
+                       *value = av_bridge_base;
+               else
+                       *value = (av_bridge_limit << 16) | av_bridge_base;
+               break;
+               /* Make the prefetchable memory limit smaller than the
+                * prefetchable memory base, so no prefetchable memory
+                * space is claimed.
+                */
+       case PCI_PREF_MEMORY_BASE:
+               *value = 0xFFF0;
+               break;
+       case PCI_PREF_MEMORY_LIMIT:
+               *value = 0x0;
+               break;
+               /* Make the IO limit smaller than the IO base, so no IO space is claimed. */
+       case PCI_IO_BASE:
+               *value = 0xF0;
+               break;
+       case PCI_IO_LIMIT:
+               *value = 0;
+               break;
+       default:
+               retval = 1;
+       }
+       return retval;
+}
+
+static int ce4100_conf_read(unsigned int seg, unsigned int bus,
+                           unsigned int devfn, int reg, int len, u32 *value)
+{
+       int i, retval = 1;
+
+       if (bus == 1) {
+               for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+                       if (bus1_fixups[i].dev_func == devfn &&
+                           bus1_fixups[i].reg == (reg & ~3) &&
+                           bus1_fixups[i].read) {
+                               bus1_fixups[i].read(&(bus1_fixups[i]),
+                                                   value);
+                               extract_bytes(value, reg, len);
+                               return 0;
+                       }
+               }
+       }
+
+       if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
+           !bridge_read(devfn, reg, len, value))
+               return 0;
+
+       return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
+}
+
+static int ce4100_conf_write(unsigned int seg, unsigned int bus,
+                            unsigned int devfn, int reg, int len, u32 value)
+{
+       int i;
+
+       if (bus == 1) {
+               for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+                       if (bus1_fixups[i].dev_func == devfn &&
+                           bus1_fixups[i].reg == (reg & ~3) &&
+                           bus1_fixups[i].write) {
+                               bus1_fixups[i].write(&(bus1_fixups[i]),
+                                                    value);
+                               return 0;
+                       }
+               }
+       }
+
+       /* Discard writes to A/V bridge BAR. */
+       if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
+           ((reg & ~3) == PCI_BASE_ADDRESS_0))
+               return 0;
+
+       return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
+}
+
+struct pci_raw_ops ce4100_pci_conf = {
+       .read = ce4100_conf_read,
+       .write = ce4100_conf_write,
+};
+
+static int __init ce4100_pci_init(void)
+{
+       init_sim_regs();
+       raw_pci_ops = &ce4100_pci_conf;
+       return 0;
+}
+subsys_initcall(ce4100_pci_init);
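
A worked example of how the simulated registers emulate BAR sizing
(values hypothetical): for a 64 KB BAR, SIZE_TO_MASK(64*KB) is
0xFFFF0000, so when the PCI core writes all-ones to size the BAR:

    reg_write(reg, 0xFFFFFFFF);
    /* value = (0xFFFFFFFF & 0xFFFF0000) | (old & 0x0000FFFF)
     *       = 0xFFFF0000 | the BAR's low flag bits */
    reg_read(reg, &v);
    /* the enumerator sees the low address bits stuck at zero and
     * computes size = ~(v & ~0xf) + 1 = 0x10000 = 64 KB */
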
index 2492d165096a2a696cf8332534966cfce2c848a7..a5f7d0d63de0def1481382f785d0fc34d553f0f6 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/uaccess.h>
 #include <asm/pci_x86.h>
 #include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
 
 /* BIOS32 signature: "_32_" */
 #define BIOS32_SIGNATURE       (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
 #define PCIBIOS_HW_TYPE1_SPEC          0x10
 #define PCIBIOS_HW_TYPE2_SPEC          0x20
 
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the executable zone to some pages and make it read-only. But
+ * this may be broken on some BIOSes and is complex to handle with
+ * static_protections. We could make the 0xe0000-0x100000 range rox, but
+ * this can break some ISA mappings.
+ *
+ * So we leave an rw and x hole when pcibios is used. This shouldn't
+ * happen on modern systems with mmconfig, and if you don't want it you
+ * can disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+       pcibios_enabled = 1;
+       set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+       if (__supported_pte_mask & _PAGE_NX)
+               printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
 /*
  * This is the standard structure used to identify the entry point
  * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
                        DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
                                        bios32_entry);
                        bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+                       set_bios_x();
                        if (check_pcibios())
                                return &pci_bios_access;
                }
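
For reference, with the usual x86 values BIOS_BEGIN = 0x000a0000 and
BIOS_END = 0x00100000, the set_memory_x() call in set_bios_x() covers:

    /* (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT
     *   = (0x00100000 - 0x000a0000) >> 12
     *   = 0x60000 >> 12
     *   = 96 pages re-marked executable (and left rw) */
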
index 7bf70b812fa2a436ec8ce1bea270b05af2b2f487..021eee91c0562503dbb68c5bea490674d9db8618 100644 (file)
@@ -1,5 +1,7 @@
 # Platform specific code goes here
+obj-y  += ce4100/
 obj-y  += efi/
+obj-y  += iris/
 obj-y  += mrst/
 obj-y  += olpc/
 obj-y  += scx200/
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile
new file mode 100644 (file)
index 0000000..91fc929
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_INTEL_CE)     += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
new file mode 100644 (file)
index 0000000..d2c0d51
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * Intel CE4100 platform-specific setup code
+ *
+ * (C) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
+
+#include <asm/setup.h>
+#include <asm/io.h>
+
+static int ce4100_i8042_detect(void)
+{
+       return 0;
+}
+
+static void __init sdv_find_smp_config(void)
+{
+}
+
+#ifdef CONFIG_SERIAL_8250
+
+
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+       offset = offset << p->regshift;
+       return readl(p->membase + offset);
+}
+
+/*
+ * The UART Tx interrupts are not set under some conditions and therefore serial
+ * transmission hangs. This is a silicon issue and has not been root caused. The
+ * workaround checks the UART_LSR_THRE and UART_LSR_TEMT bits of the LSR register
+ * in the interrupt handler; if at least one of them is set, the transmit request
+ * is processed. Without this workaround, serial transmission may hang. This
+ * workaround is for erratum number 9 in the B-step errata.
+ */
+
+static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
+{
+       unsigned int ret, ier, lsr;
+
+       if (offset == UART_IIR) {
+               offset = offset << p->regshift;
+               ret = readl(p->membase + offset);
+               if (ret & UART_IIR_NO_INT) {
+                       /* see if the TX interrupt should really have been set */
+                       ier = mem_serial_in(p, UART_IER);
+                       /* see if the UART's XMIT interrupt is enabled */
+                       if (ier & UART_IER_THRI) {
+                               lsr = mem_serial_in(p, UART_LSR);
+                               /* now check to see if the UART should be
+                                  generating an interrupt (but isn't) */
+                               if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
+                                       ret &= ~UART_IIR_NO_INT;
+                       }
+               }
+       } else
+               ret =  mem_serial_in(p, offset);
+       return ret;
+}
+
+static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
+{
+       offset = offset << p->regshift;
+       writel(value, p->membase + offset);
+}
+
+static void ce4100_serial_fixup(int port, struct uart_port *up,
+       unsigned short *capabilites)
+{
+#ifdef CONFIG_EARLY_PRINTK
+       /*
+        * Override the legacy port configuration that comes from
+        * asm/serial.h. Using the ioport driver and then switching to the
+        * PCI memory-mapped driver hangs the IOAPIC.
+        */
+       if (up->iotype !=  UPIO_MEM32) {
+               up->uartclk  = 14745600;
+               up->mapbase = 0xdffe0200;
+               set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
+                               up->mapbase & PAGE_MASK);
+               up->membase =
+                       (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
+               up->membase += up->mapbase & ~PAGE_MASK;
+               up->iotype   = UPIO_MEM32;
+               up->regshift = 2;
+       }
+#endif
+       up->iobase = 0;
+       up->serial_in = ce4100_mem_serial_in;
+       up->serial_out = ce4100_mem_serial_out;
+
+       *capabilites |= (1 << 12);
+}
+
+static __init void sdv_serial_fixup(void)
+{
+       serial8250_set_isa_configurator(ce4100_serial_fixup);
+}
+
+#else
+static inline void sdv_serial_fixup(void) { }
+#endif
+
+static void __init sdv_arch_setup(void)
+{
+       sdv_serial_fixup();
+}
+
+/*
+ * CE4100 specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_ce4100_early_setup(void)
+{
+       x86_init.oem.arch_setup = sdv_arch_setup;
+       x86_platform.i8042_detect = ce4100_i8042_detect;
+       x86_init.resources.probe_roms = x86_init_noop;
+       x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       x86_init.mpparse.find_smp_config = sdv_find_smp_config;
+}
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile
new file mode 100644 (file)
index 0000000..db92198
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_32_IRIS)              += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
new file mode 100644 (file)
index 0000000..1ba7f5e
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Eurobraille/Iris power off support.
+ *
+ * Eurobraille's Iris machine is a PC with no APM or ACPI support.
+ * It is shutdown by a special I/O sequence which this module provides.
+ *
+ *  Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <asm/io.h>
+
+#define IRIS_GIO_BASE          0x340
+#define IRIS_GIO_INPUT         IRIS_GIO_BASE
+#define IRIS_GIO_OUTPUT                (IRIS_GIO_BASE + 1)
+#define IRIS_GIO_PULSE         0x80 /* First byte to send */
+#define IRIS_GIO_REST          0x00 /* Second byte to send */
+#define IRIS_GIO_NODEV         0xff /* Likely not an Iris */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
+MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
+MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
+
+static int force;
+
+module_param(force, bool, 0);
+MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
+
+static void (*old_pm_power_off)(void);
+
+static void iris_power_off(void)
+{
+       outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
+       msleep(850);
+       outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
+}
+
+/*
+ * Before installing the power_off handler, try to make sure the OS is
+ * running on an Iris.  Since Iris does not support DMI, this is done
+ * by reading its input port and seeing whether the read value is
+ * meaningful.
+ */
+static int iris_init(void)
+{
+       unsigned char status;
+       if (force != 1) {
+               printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
+               return -ENODEV;
+       }
+       status = inb(IRIS_GIO_INPUT);
+       if (status == IRIS_GIO_NODEV) {
+               printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
+               return -ENODEV;
+       }
+       old_pm_power_off = pm_power_off;
+       pm_power_off = &iris_power_off;
+       printk(KERN_INFO "Iris power_off handler installed.\n");
+
+       return 0;
+}
+
+static void iris_exit(void)
+{
+       pm_power_off = old_pm_power_off;
+       printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+}
+
+module_init(iris_init);
+module_exit(iris_exit);
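
Given the force check in iris_init() above, the power-off handler is only
installed when the module is loaded with the parameter set, presumably as
"modprobe iris force=1" (the module name is assumed from the Makefile
entry).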
index efbbc552fa953a5bc5e70e8108ec909cb3061252..f61ccdd4934141444f1c8f27d2b3a783f06c37d0 100644 (file)
@@ -1 +1,3 @@
 obj-$(CONFIG_X86_MRST)         += mrst.o
+obj-$(CONFIG_X86_MRST)         += vrtc.o
+obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
index 79ae68154e871fe208ff5fcfd809daf3b03c4ab5..fee0b4914e07ad494f82629ccc10dc0a1980879c 100644 (file)
@@ -9,9 +9,19 @@
  * as published by the Free Software Foundation; version 2
  * of the License.
  */
+
+#define pr_fmt(fmt) "mrst: " fmt
+
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/sfi.h>
+#include <linux/intel_pmic_gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pca953x.h>
+#include <linux/gpio_keys.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
 #include <linux/irq.h>
 #include <linux/module.h>
 
@@ -23,7 +33,9 @@
 #include <asm/mrst.h>
 #include <asm/io.h>
 #include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
 #include <asm/apb_timer.h>
+#include <asm/reboot.h>
 
 /*
  * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
@@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
                memcpy(sfi_mtimer_array, pentry, totallen);
        }
 
-       printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
+       pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
        pentry = sfi_mtimer_array;
        for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
-               printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
+               pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
                        " irq = %d\n", totallen, (u32)pentry->phys_addr,
                        pentry->freq_hz, pentry->irq);
                        if (!pentry->irq)
@@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
                memcpy(sfi_mrtc_array, pentry, totallen);
        }
 
-       printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
+       pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
        pentry = sfi_mrtc_array;
        for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
-               printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
+               pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
                        totallen, (u32)pentry->phys_addr, pentry->irq);
                mp_irq.type = MP_IOAPIC;
                mp_irq.irqtype = mp_INT;
-               mp_irq.irqflag = 0;
+               mp_irq.irqflag = 0xf;   /* level trigger and active low */
                mp_irq.srcbus = 0;
                mp_irq.srcbusirq = pentry->irq; /* IRQ */
                mp_irq.dstapic = MP_APIC_ALL;
@@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
 
 void __init mrst_time_init(void)
 {
+       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
        switch (mrst_timer_options) {
        case MRST_TIMER_APBT_ONLY:
                break;
@@ -224,16 +237,10 @@ void __init mrst_time_init(void)
                return;
        }
        /* we need at least one APB timer */
-       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
        pre_init_apic_IRQ0();
        apbt_time_init();
 }
 
-void __init mrst_rtc_init(void)
-{
-       sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-}
-
 void __cpuinit mrst_arch_setup(void)
 {
        if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
@@ -256,6 +263,17 @@ static int mrst_i8042_detect(void)
        return 0;
 }
 
+/* Reboot and power off are handled by the SCU on a MID device */
+static void mrst_power_off(void)
+{
+       intel_scu_ipc_simple_command(0xf1, 1);
+}
+
+static void mrst_reboot(void)
+{
+       intel_scu_ipc_simple_command(0xf1, 0);
+}
+
 /*
  * Moorestown specific x86_init function overrides and early setup
  * calls.
@@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void)
 
        legacy_pic = &null_legacy_pic;
 
+       /* Moorestown specific power_off/restart method */
+       pm_power_off = mrst_power_off;
+       machine_ops.emergency_restart  = mrst_reboot;
+
        /* Avoid searching for BIOS MP tables */
        x86_init.mpparse.find_smp_config = x86_init_noop;
        x86_init.mpparse.get_smp_config = x86_init_uint_noop;
@@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg)
        return 0;
 }
 __setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * Parse the GPIO table first, since the DEVS table will need it to map
+ * pin names to actual pins.
+ */
+static struct sfi_gpio_table_entry *gpio_table;
+static int gpio_num_entry;
+
+static int __init sfi_parse_gpio(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_gpio_table_entry *pentry;
+       int num, i;
+
+       if (gpio_table)
+               return 0;
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
+       pentry = (struct sfi_gpio_table_entry *)sb->pentry;
+
+       gpio_table = (struct sfi_gpio_table_entry *)
+                               kmalloc(num * sizeof(*pentry), GFP_KERNEL);
+       if (!gpio_table)
+               return -1;
+       memcpy(gpio_table, pentry, num * sizeof(*pentry));
+       gpio_num_entry = num;
+
+       pr_debug("GPIO pin info:\n");
+       for (i = 0; i < num; i++, pentry++)
+               pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
+               " pin = %d\n", i,
+                       pentry->controller_name,
+                       pentry->pin_name,
+                       pentry->pin_no);
+       return 0;
+}
+
+static int get_gpio_by_name(const char *name)
+{
+       struct sfi_gpio_table_entry *pentry = gpio_table;
+       int i;
+
+       if (!pentry)
+               return -1;
+       for (i = 0; i < gpio_num_entry; i++, pentry++) {
+               if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
+                       return pentry->pin_no;
+       }
+       return -1;
+}
+
+/*
+ * This array defines the platform data for the devices that the IAFW
+ * exports through the SFI "DEVS" table; name and type are used to match
+ * a device to its platform data.
+ */
+struct devs_id {
+       char name[SFI_NAME_LEN + 1];
+       u8 type;
+       u8 delay;
+       void *(*get_platform_data)(void *info);
+};
+
+/* the offset for the mapping of global gpio pin to irq */
+#define MRST_IRQ_OFFSET 0x100
+
+static void __init *pmic_gpio_platform_data(void *info)
+{
+       static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
+       int gpio_base = get_gpio_by_name("pmic_gpio_base");
+
+       if (gpio_base == -1)
+               gpio_base = 64;
+       pmic_gpio_pdata.gpio_base = gpio_base;
+       pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
+       pmic_gpio_pdata.gpiointr = 0xffffeff8;
+
+       return &pmic_gpio_pdata;
+}
+
+static void __init *max3111_platform_data(void *info)
+{
+       struct spi_board_info *spi_info = info;
+       int intr = get_gpio_by_name("max3111_int");
+
+       if (intr == -1)
+               return NULL;
+       spi_info->irq = intr + MRST_IRQ_OFFSET;
+       return NULL;
+}
+
+/* we have multiple max7315s on the board ... */
+#define MAX7315_NUM 2
+static void __init *max7315_platform_data(void *info)
+{
+       static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
+       static int nr;
+       struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
+       struct i2c_board_info *i2c_info = info;
+       int gpio_base, intr;
+       char base_pin_name[SFI_NAME_LEN + 1];
+       char intr_pin_name[SFI_NAME_LEN + 1];
+
+       if (nr == MAX7315_NUM) {
+               pr_err("too many max7315s, we only support %d\n",
+                               MAX7315_NUM);
+               return NULL;
+       }
+       /* we have several max7315s on the board; we only need to load
+        * several instances of the same pca953x driver to cover them
+        */
+       strcpy(i2c_info->type, "max7315");
+       if (nr++) {
+               sprintf(base_pin_name, "max7315_%d_base", nr);
+               sprintf(intr_pin_name, "max7315_%d_int", nr);
+       } else {
+               strcpy(base_pin_name, "max7315_base");
+               strcpy(intr_pin_name, "max7315_int");
+       }
+
+       gpio_base = get_gpio_by_name(base_pin_name);
+       intr = get_gpio_by_name(intr_pin_name);
+
+       if (gpio_base == -1)
+               return NULL;
+       max7315->gpio_base = gpio_base;
+       if (intr != -1) {
+               i2c_info->irq = intr + MRST_IRQ_OFFSET;
+               max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
+       } else {
+               i2c_info->irq = -1;
+               max7315->irq_base = -1;
+       }
+       return max7315;
+}
+
+static void __init *emc1403_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr = get_gpio_by_name("thermal_int");
+       int intr2nd = get_gpio_by_name("thermal_alert");
+
+       if (intr == -1 || intr2nd == -1)
+               return NULL;
+
+       i2c_info->irq = intr + MRST_IRQ_OFFSET;
+       intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+
+       return &intr2nd_pdata;
+}
+
+static void __init *lis331dl_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr = get_gpio_by_name("accel_int");
+       int intr2nd = get_gpio_by_name("accel_2");
+
+       if (intr == -1 || intr2nd == -1)
+               return NULL;
+
+       i2c_info->irq = intr + MRST_IRQ_OFFSET;
+       intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+
+       return &intr2nd_pdata;
+}
+
+static void __init *no_platform_data(void *info)
+{
+       return NULL;
+}
+
+static const struct devs_id __initconst device_ids[] = {
+       {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
+       {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
+       {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+       {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+       {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
+       {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
+       {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+       {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+       {},
+};
+
+#define MAX_IPCDEVS    24
+static struct platform_device *ipc_devs[MAX_IPCDEVS];
+static int ipc_next_dev;
+
+#define MAX_SCU_SPI    24
+static struct spi_board_info *spi_devs[MAX_SCU_SPI];
+static int spi_next_dev;
+
+#define MAX_SCU_I2C    24
+static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
+static int i2c_bus[MAX_SCU_I2C];
+static int i2c_next_dev;
+
+static void __init intel_scu_device_register(struct platform_device *pdev)
+{
+       if (ipc_next_dev == MAX_IPCDEVS)
+               pr_err("too many SCU IPC devices");
+       else
+               ipc_devs[ipc_next_dev++] = pdev;
+}
+
+static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
+{
+       struct spi_board_info *new_dev;
+
+       if (spi_next_dev == MAX_SCU_SPI) {
+               pr_err("too many SCU SPI devices");
+               return;
+       }
+
+       new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed spi dev %s\n",
+                       sdev->modalias);
+               return;
+       }
+       memcpy(new_dev, sdev, sizeof(*sdev));
+
+       spi_devs[spi_next_dev++] = new_dev;
+}
+
+static void __init intel_scu_i2c_device_register(int bus,
+                                               struct i2c_board_info *idev)
+{
+       struct i2c_board_info *new_dev;
+
+       if (i2c_next_dev == MAX_SCU_I2C) {
+               pr_err("too many SCU I2C devices");
+               return;
+       }
+
+       new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed i2c dev %s\n",
+                       idev->type);
+               return;
+       }
+       memcpy(new_dev, idev, sizeof(*idev));
+
+       i2c_bus[i2c_next_dev] = bus;
+       i2c_devs[i2c_next_dev++] = new_dev;
+}
+
+/* Called by IPC driver */
+void intel_scu_devices_create(void)
+{
+       int i;
+
+       for (i = 0; i < ipc_next_dev; i++)
+               platform_device_add(ipc_devs[i]);
+
+       for (i = 0; i < spi_next_dev; i++)
+               spi_register_board_info(spi_devs[i], 1);
+
+       for (i = 0; i < i2c_next_dev; i++) {
+               struct i2c_adapter *adapter;
+               struct i2c_client *client;
+
+               adapter = i2c_get_adapter(i2c_bus[i]);
+               if (adapter) {
+                       client = i2c_new_device(adapter, i2c_devs[i]);
+                       if (!client)
+                               pr_err("can't create i2c device %s\n",
+                                       i2c_devs[i]->type);
+               } else
+                       i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
+       }
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_create);
+
+/* Called by IPC driver */
+void intel_scu_devices_destroy(void)
+{
+       int i;
+
+       for (i = 0; i < ipc_next_dev; i++)
+               platform_device_del(ipc_devs[i]);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
+
+static void __init install_irq_resource(struct platform_device *pdev, int irq)
+{
+       /* Single threaded */
+       static struct resource __initdata res = {
+               .name = "IRQ",
+               .flags = IORESOURCE_IRQ,
+       };
+       res.start = irq;
+       platform_device_add_resources(pdev, &res, 1);
+}
+
+static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
+{
+       const struct devs_id *dev = device_ids;
+       void *pdata = NULL;
+
+       while (dev->name[0]) {
+               if (dev->type == SFI_DEV_TYPE_IPC &&
+                       !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) {
+                       pdata = dev->get_platform_data(pdev);
+                       break;
+               }
+               dev++;
+       }
+       pdev->dev.platform_data = pdata;
+       intel_scu_device_register(pdev);
+}
+
+static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
+{
+       const struct devs_id *dev = device_ids;
+       void *pdata = NULL;
+
+       while (dev->name[0]) {
+               if (dev->type == SFI_DEV_TYPE_SPI &&
+                               !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
+                       pdata = dev->get_platform_data(spi_info);
+                       break;
+               }
+               dev++;
+       }
+       spi_info->platform_data = pdata;
+       if (dev->delay)
+               intel_scu_spi_device_register(spi_info);
+       else
+               spi_register_board_info(spi_info, 1);
+}
+
+static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
+{
+       const struct devs_id *dev = device_ids;
+       void *pdata = NULL;
+
+       while (dev->name[0]) {
+               if (dev->type == SFI_DEV_TYPE_I2C &&
+                       !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
+                       pdata = dev->get_platform_data(i2c_info);
+                       break;
+               }
+               dev++;
+       }
+       i2c_info->platform_data = pdata;
+
+       if (dev->delay)
+               intel_scu_i2c_device_register(bus, i2c_info);
+       else
+               i2c_register_board_info(bus, i2c_info, 1);
+}
+
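The three handlers above consult the device_ids[] table (its tail is shown at the top of this section): each row pairs an SFI name with a bus type, a delay flag, and a platform-data callback, and the table ends with an empty-name sentinel. When delay is set, registration is deferred through the intel_scu_*_device_register() helpers until the SCU IPC driver calls intel_scu_devices_create(). A hypothetical row following the field order of the rows shown ("my_sensor" and its callback are made-up names):

	/* name,        type,             delay, get_platform_data */
	{"my_sensor",   SFI_DEV_TYPE_I2C, 1,     &my_sensor_platform_data},
	{},	/* the empty-name sentinel must stay last */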
+static int __init sfi_parse_devs(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_device_table_entry *pentry;
+       struct spi_board_info spi_info;
+       struct i2c_board_info i2c_info;
+       struct platform_device *pdev;
+       int num, i, bus;
+       int ioapic;
+       struct io_apic_irq_attr irq_attr;
+
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
+       pentry = (struct sfi_device_table_entry *)sb->pentry;
+
+       for (i = 0; i < num; i++, pentry++) {
+               if (pentry->irq != (u8)0xff) { /* native RTE case */
+                       /* These SPI2 devices are not exposed to the system as
+                        * PCI devices, but they have separate RTE entries in
+                        * the IOAPIC, so we have to enable them one by one here
+                        */
+                       ioapic = mp_find_ioapic(pentry->irq);
+                       irq_attr.ioapic = ioapic;
+                       irq_attr.ioapic_pin = pentry->irq;
+                       irq_attr.trigger = 1;
+                       irq_attr.polarity = 1;
+                       io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
+               }
+               switch (pentry->type) {
+               case SFI_DEV_TYPE_IPC:
+                       /* ID as IRQ is a hack that will go away */
+                       pdev = platform_device_alloc(pentry->name, pentry->irq);
+                       if (pdev == NULL) {
+                               pr_err("out of memory for SFI platform device '%s'.\n",
+                                                       pentry->name);
+                               continue;
+                       }
+                       install_irq_resource(pdev, pentry->irq);
+                       pr_debug("info[%2d]: IPC bus, name = %16.16s, "
+                               "irq = 0x%2x\n", i, pentry->name, pentry->irq);
+                       sfi_handle_ipc_dev(pdev);
+                       break;
+               case SFI_DEV_TYPE_SPI:
+                       memset(&spi_info, 0, sizeof(spi_info));
+                       strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
+                       spi_info.irq = pentry->irq;
+                       spi_info.bus_num = pentry->host_num;
+                       spi_info.chip_select = pentry->addr;
+                       spi_info.max_speed_hz = pentry->max_freq;
+                       pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
+                               "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
+                               spi_info.bus_num,
+                               spi_info.modalias,
+                               spi_info.irq,
+                               spi_info.max_speed_hz,
+                               spi_info.chip_select);
+                       sfi_handle_spi_dev(&spi_info);
+                       break;
+               case SFI_DEV_TYPE_I2C:
+                       memset(&i2c_info, 0, sizeof(i2c_info));
+                       bus = pentry->host_num;
+                       strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
+                       i2c_info.irq = pentry->irq;
+                       i2c_info.addr = pentry->addr;
+                       pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
+                               "irq = 0x%2x, addr = 0x%x\n", i, bus,
+                               i2c_info.type,
+                               i2c_info.irq,
+                               i2c_info.addr);
+                       sfi_handle_i2c_dev(bus, &i2c_info);
+                       break;
+               case SFI_DEV_TYPE_UART:
+               case SFI_DEV_TYPE_HSI:
+               default:
+                       break;
+               }
+       }
+       return 0;
+}
+
+static int __init mrst_platform_init(void)
+{
+       sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
+       sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
+       return 0;
+}
+arch_initcall(mrst_platform_init);
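sfi_table_parse() locates the firmware table whose four-character signature matches and passes its header to the callback; sfi_parse_devs above (and the sfi_parse_gpio referenced with it) share the same skeleton. A minimal sketch of such a handler, with a hypothetical name and using only the helpers already used above:

	static int __init my_sfi_handler(struct sfi_table_header *table)
	{
		struct sfi_table_simple *sb = (struct sfi_table_simple *)table;
		struct sfi_device_table_entry *pentry;
		int i, num;

		num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
		pentry = (struct sfi_device_table_entry *)sb->pentry;
		for (i = 0; i < num; i++, pentry++)
			pr_debug("SFI dev %16.16s type %d\n",
				 pentry->name, pentry->type);
		return 0;
	}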
+
+/*
+ * We will search for these buttons in the SFI GPIO table (by name)
+ * and register them dynamically. Please add all possible
+ * buttons here; we will shrink the list if no GPIO is found.
+ */
+static struct gpio_keys_button gpio_button[] = {
+       {KEY_POWER,             -1, 1, "power_btn",     EV_KEY, 0, 3000},
+       {KEY_PROG1,             -1, 1, "prog_btn1",     EV_KEY, 0, 20},
+       {KEY_PROG2,             -1, 1, "prog_btn2",     EV_KEY, 0, 20},
+       {SW_LID,                -1, 1, "lid_switch",    EV_SW,  0, 20},
+       {KEY_VOLUMEUP,          -1, 1, "vol_up",        EV_KEY, 0, 20},
+       {KEY_VOLUMEDOWN,        -1, 1, "vol_down",      EV_KEY, 0, 20},
+       {KEY_CAMERA,            -1, 1, "camera_full",   EV_KEY, 0, 20},
+       {KEY_CAMERA_FOCUS,      -1, 1, "camera_half",   EV_KEY, 0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw1",        EV_SW,  0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw2",        EV_SW,  0, 20},
+};
+
+static struct gpio_keys_platform_data mrst_gpio_keys = {
+       .buttons        = gpio_button,
+       .rep            = 1,
+       .nbuttons       = -1, /* filled in after the search */
+};
+
+static struct platform_device pb_device = {
+       .name           = "gpio-keys",
+       .id             = -1,
+       .dev            = {
+               .platform_data  = &mrst_gpio_keys,
+       },
+};
+
+/*
+ * Shrink out the non-existent buttons and register the gpio button
+ * device if any remain
+ */
+static int __init pb_keys_init(void)
+{
+       struct gpio_keys_button *gb = gpio_button;
+       int i, num, good = 0;
+
+       num = ARRAY_SIZE(gpio_button);
+       for (i = 0; i < num; i++) {
+               gb[i].gpio = get_gpio_by_name(gb[i].desc);
+               if (gb[i].gpio == -1)
+                       continue;
+
+               if (i != good)
+                       gb[good] = gb[i];
+               good++;
+       }
+
+       if (good) {
+               mrst_gpio_keys.nbuttons = good;
+               return platform_device_register(&pb_device);
+       }
+       return 0;
+}
+late_initcall(pb_keys_init);
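pb_keys_init() uses the standard in-place compaction idiom: good trails i, each button whose GPIO was found is copied down over the gaps, and the first good entries end up contiguous so nbuttons can simply be set to good. The idiom in isolation (items, n, and keep() are hypothetical):

	int i, good = 0;

	for (i = 0; i < n; i++) {
		if (!keep(&items[i]))
			continue;
		if (i != good)
			items[good] = items[i];	/* slide kept entry down */
		good++;
	}
	/* items[0..good-1] now hold the survivors, in order */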
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
new file mode 100644 (file)
index 0000000..32cd7ed
--- /dev/null
@@ -0,0 +1,165 @@
+/*
+ * vrtc.c: Driver for virtual RTC device on Intel MID platform
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * The vRTC is emulated by the system controller (SCU) firmware; the
+ * real HW RTC is located in the PMIC device. The SCU FW shadows the
+ * PMIC RTC in a memory-mapped IO space that is visible to the host
+ * IA processor.
+ *
+ * This driver is based on the RTC CMOS driver.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+#include <asm/time.h>
+#include <asm/fixmap.h>
+
+static unsigned char __iomem *vrtc_virt_base;
+
+unsigned char vrtc_cmos_read(unsigned char reg)
+{
+       unsigned char retval;
+
+       /* vRTC's registers range from 0x0 to 0xD */
+       if (reg > 0xd || !vrtc_virt_base)
+               return 0xff;
+
+       lock_cmos_prefix(reg);
+       retval = __raw_readb(vrtc_virt_base + (reg << 2));
+       lock_cmos_suffix(reg);
+       return retval;
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_read);
+
+void vrtc_cmos_write(unsigned char val, unsigned char reg)
+{
+       if (reg > 0xd || !vrtc_virt_base)
+               return;
+
+       lock_cmos_prefix(reg);
+       __raw_writeb(val, vrtc_virt_base + (reg << 2));
+       lock_cmos_suffix(reg);
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_write);
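Note the reg << 2 in both accessors: the SCU firmware exposes each byte-wide CMOS register in its own 32-bit slot, so register N sits at byte offset 4*N of the mapped window. A sketch of the mapping as a helper (vrtc_offset() is a hypothetical name; the driver open-codes the shift):

	static inline unsigned int vrtc_offset(unsigned char reg)
	{
		return reg << 2;	/* one 32-bit slot per register */
	}
	/* e.g. vrtc_offset(RTC_YEAR) == 0x24, since RTC_YEAR is register 9 */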
+
+unsigned long vrtc_get_time(void)
+{
+       u8 sec, min, hour, mday, mon;
+       u32 year;
+
+       while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
+               cpu_relax();
+
+       sec = vrtc_cmos_read(RTC_SECONDS);
+       min = vrtc_cmos_read(RTC_MINUTES);
+       hour = vrtc_cmos_read(RTC_HOURS);
+       mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+       mon = vrtc_cmos_read(RTC_MONTH);
+       year = vrtc_cmos_read(RTC_YEAR);
+
+       /* vRTC YEAR reg contains the offset from 1960 */
+       year += 1960;
+
+       printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
+               "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
+
+       return mktime(year, mon, mday, hour, min, sec);
+}
+
+/* Only care about the minutes and seconds */
+int vrtc_set_mmss(unsigned long nowtime)
+{
+       int real_sec, real_min;
+       int vrtc_min;
+
+       vrtc_min = vrtc_cmos_read(RTC_MINUTES);
+
+       real_sec = nowtime % 60;
+       real_min = nowtime / 60;
+       if (((abs(real_min - vrtc_min) + 15)/30) & 1)
+               real_min += 30;
+       real_min %= 60;
+
+       vrtc_cmos_write(real_sec, RTC_SECONDS);
+       vrtc_cmos_write(real_min, RTC_MINUTES);
+       return 0;
+}
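The parity test above preserves any half-hour offset the firmware keeps in the vRTC. Worked example (hypothetical values): if the vRTC runs 30 minutes ahead of system time, say vrtc_min = 40 while nowtime lands at hh:10, then real_min - vrtc_min is congruent to 30 modulo 60, (30 + 15) / 30 = 1 is odd, so 30 is added and minute 40 is written back, keeping the offset while still correcting the seconds. If the two clocks agree to within 15 minutes the quotient is even and the minutes are written through unchanged.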
+
+void __init mrst_rtc_init(void)
+{
+       unsigned long rtc_paddr;
+       void __iomem *virt_base;
+
+       sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+       if (!sfi_mrtc_num)
+               return;
+
+       rtc_paddr = sfi_mrtc_array[0].phys_addr;
+
+       /* vRTC's register address may not be page aligned */
+       set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
+
+       virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
+       virt_base += rtc_paddr & ~PAGE_MASK;
+       vrtc_virt_base = virt_base;
+
+       x86_platform.get_wallclock = vrtc_get_time;
+       x86_platform.set_wallclock = vrtc_set_mmss;
+}
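Because a fixmap slot is page-sized, mrst_rtc_init() maps the page containing the vRTC registers and then adds the sub-page offset back by hand. For instance, with a hypothetical SFI-reported address of 0x0ff00400 and 4 KiB pages, set_fixmap_nocache() maps the page at 0x0ff00000, rtc_paddr & ~PAGE_MASK evaluates to 0x400, and vrtc_virt_base ends up pointing 0x400 bytes into the fixmap slot.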
+
+/*
+ * The Moorestown platform has a memory-mapped virtual RTC device that emulates
+ * the programming interface of the RTC.
+ */
+
+static struct resource vrtc_resources[] = {
+       [0] = {
+               .flags  = IORESOURCE_MEM,
+       },
+       [1] = {
+               .flags  = IORESOURCE_IRQ,
+       }
+};
+
+static struct platform_device vrtc_device = {
+       .name           = "rtc_mrst",
+       .id             = -1,
+       .resource       = vrtc_resources,
+       .num_resources  = ARRAY_SIZE(vrtc_resources),
+};
+
+/* Register the RTC device if appropriate */
+static int __init mrst_device_create(void)
+{
+       /* No Moorestown, no device */
+       if (!mrst_identify_cpu())
+               return -ENODEV;
+       /* No vRTC table entry, no device */
+       if (!sfi_mrtc_num)
+               return -ENODEV;
+
+       /* iomem resource */
+       vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
+       vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
+                               MRST_VRTC_MAP_SZ - 1;
+       /* irq resource */
+       vrtc_resources[1].start = sfi_mrtc_array[0].irq;
+       vrtc_resources[1].end = sfi_mrtc_array[0].irq;
+
+       return platform_device_register(&vrtc_device);
+}
+
+module_init(mrst_device_create);
index dd4c281ffe5720c3ff15f1eceaa09759e17df7d1..ca54875ac795117079b7a9521bfd0bf42bf9f980 100644 (file)
@@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address)
 /* All CPUs enumerated by SFI must be present and enabled */
 static void __cpuinit mp_sfi_register_lapic(u8 id)
 {
-       if (MAX_APICS - id <= 0) {
+       if (MAX_LOCAL_APIC - id <= 0) {
                pr_warning("Processor #%d invalid (max %d)\n",
-                       id, MAX_APICS);
+                       id, MAX_LOCAL_APIC);
                return;
        }
 
index ba9caa808a9c1b42c6a616968c57e96769039314..df58e9cad96ae9441a4f86f22900a6e0bf05aa64 100644 (file)
@@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode)
 
        /*
         * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-        * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
+        * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
         */
        bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
                                * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
 /*
  * initialize the bau_control structure for each cpu
  */
-static void __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs)
 {
        int i;
        int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
        struct bau_control *smaster = NULL;
        struct socket_desc {
                short num_cpus;
-               short cpu_number[16];
+               short cpu_number[MAX_CPUS_PER_SOCKET];
        };
        struct uvhub_desc {
                unsigned short socket_mask;
@@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
                sdp = &bdp->socket[socket];
                sdp->cpu_number[sdp->num_cpus] = cpu;
                sdp->num_cpus++;
+               if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
+                       printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+                       return 1;
+               }
        }
        for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
                if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
                                bcp->uvhub_master = hmaster;
                                bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
                                                blade_processor_id;
+                               if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+                                       printk(KERN_EMERG
+                                               "%d cpus per uvhub invalid\n",
+                                               bcp->uvhub_cpu);
+                                       return 1;
+                               }
                        }
 nextsocket:
                        socket++;
@@ -1595,6 +1605,7 @@ nextsocket:
                bcp->congested_reps = congested_reps;
                bcp->congested_period = congested_period;
        }
+       return 0;
 }
 
 /*
@@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void)
        spin_lock_init(&disable_lock);
        congested_cycles = microsec_2_cycles(congested_response_us);
 
-       uv_init_per_cpu(nuvhubs);
+       if (uv_init_per_cpu(nuvhubs)) {
+               nobau = 1;
+               return 0;
+       }
 
        uv_partition_base_pnode = 0x7fffffff;
        for (uvhub = 0; uvhub < nuvhubs; uvhub++)
index 3371bd053b89f29e14e5f55d88929d2427f123b2..63203767174683b3db1ab410be9263aabe280ab0 100644 (file)
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
        ver = m->apicver;
        if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
                printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
-                       m->apicid, MAX_APICS);
+                       m->apicid, MAX_LOCAL_APIC);
                return;
        }
 
index 5718566e00f9b27573db228ba74559570ba48f89..d9926afec110997b618b70062d50450847d9d1ff 100644 (file)
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 int __init acpi_numa_init(void)
 {
        int ret = 0;
+       int nr_cpu_entries = nr_cpu_ids;
+
+#ifdef CONFIG_X86
+       /*
+        * Do not limit this to the cpu count from NR_CPUS or nr_cpus=,
+        * since SRAT cpu entries may be ordered differently from those
+        * in the MADT. So go over all cpu entries in the SRAT to get
+        * the apicid-to-node mapping.
+        */
+       nr_cpu_entries = MAX_LOCAL_APIC;
+#endif
 
        /* SRAT: Static Resource Affinity Table */
        if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
                acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-                                    acpi_parse_x2apic_affinity, nr_cpu_ids);
+                                    acpi_parse_x2apic_affinity, nr_cpu_entries);
                acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-                                    acpi_parse_processor_affinity, nr_cpu_ids);
+                                    acpi_parse_processor_affinity, nr_cpu_entries);
                ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
                                            acpi_parse_memory_affinity,
                                            NR_NODE_MEMBLKS);
index 42396df555567660d597ac524631e9020edfbea3..9252e85706ef2ce54728a0c8bc366c80c793033c 100644 (file)
@@ -38,7 +38,7 @@ static int agp_bridges_found;
 
 static void amd64_tlbflush(struct agp_memory *temp)
 {
-       k8_flush_garts();
+       amd_flush_garts();
 }
 
 static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
        u32 temp;
        struct aper_size_info_32 *values;
 
-       dev = k8_northbridges.nb_misc[0];
+       dev = node_to_amd_nb(0)->misc;
        if (dev==NULL)
                return 0;
 
@@ -181,16 +181,15 @@ static int amd_8151_configure(void)
        unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
        int i;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return 0;
 
        /* Configure AGP regs in each x86-64 host bridge. */
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                agp_bridge->gart_bus_addr =
-                               amd64_configure(k8_northbridges.nb_misc[i],
-                                               gatt_bus);
+                       amd64_configure(node_to_amd_nb(i)->misc, gatt_bus);
        }
-       k8_flush_garts();
+       amd_flush_garts();
        return 0;
 }
 
@@ -200,11 +199,11 @@ static void amd64_cleanup(void)
        u32 tmp;
        int i;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return;
 
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
                /* disable gart translation */
                pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
                tmp &= ~GARTEN;
@@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
 {
        int i;
 
-       if (cache_k8_northbridges() < 0)
+       if (amd_cache_northbridges() < 0)
                return -ENODEV;
 
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                return -ENODEV;
 
        i = 0;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
                if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
                        dev_err(&dev->dev, "no usable aperture found\n");
 #ifdef __x86_64__
@@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
        }
 
        /* shadow x86-64 registers into ULi registers */
-       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+       pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
                               &httfea);
 
        /* if x86-64 aperture base is beyond 4G, exit here */
@@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev)
        pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
 
        /* shadow x86-64 registers into NVIDIA registers */
-       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+       pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
                               &apbase);
 
        /* if x86-64 aperture base is beyond 4G, exit here */
@@ -778,7 +777,7 @@ int __init agp_amd64_init(void)
                }
 
                /* First check that we have at least one AMD64 NB */
-               if (!pci_dev_present(k8_nb_ids))
+               if (!pci_dev_present(amd_nb_misc_ids))
                        return -ENODEV;
 
                /* Look for any AGP bridge */
index eca9ba193e94a914aa64740f0b4262b8f7c99b81..df211181fca41627cb0bdea2a089cd9d26f7faf4 100644 (file)
@@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void)
 
        opstate_init();
 
-       if (cache_k8_northbridges() < 0)
+       if (amd_cache_northbridges() < 0)
                goto err_ret;
 
        msrs = msrs_alloc();
@@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void)
         * to finish initialization of the MC instances.
         */
        err = -ENODEV;
-       for (nb = 0; nb < k8_northbridges.num; nb++) {
+       for (nb = 0; nb < amd_nb_num(); nb++) {
                if (!pvt_lookup[nb])
                        continue;
 
index 41a9e34899ac5f81da6dde61f6aa2a1d34e2e134..ca35b0ce944a58ca017ff5b5678b73a468d7a42a 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/sfi.h>
 #include <asm/mrst.h>
 #include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
 
 /* IPC defines the following message types */
 #define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
@@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
                iounmap(ipcdev.ipc_base);
                return -ENOMEM;
        }
+
+       intel_scu_devices_create();
+
        return 0;
 }
 
@@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev)
        iounmap(ipcdev.ipc_base);
        iounmap(ipcdev.i2c_base);
        ipcdev.pdev = NULL;
+       intel_scu_devices_destroy();
 }
 
 static const struct pci_device_id pci_ids[] = {
index 2883428d5ac806408b5082221036ffa30915356c..4941cade319f5cef06d508d0b1f1354d951c1034 100644 (file)
@@ -463,6 +463,18 @@ config RTC_DRV_CMOS
          This driver can also be built as a module. If so, the module
          will be called rtc-cmos.
 
+config RTC_DRV_VRTC
+       tristate "Virtual RTC for Moorestown platforms"
+       depends on X86_MRST
+       default y if X86_MRST
+       help
+         Say "yes" here to get direct support for the real time clock
+         found on Moorestown platforms. The vRTC is an emulated RTC that
+         derives its clock source from a real RTC in the PMIC. The
+         MC146818-style programming interface is mostly preserved, but
+         any updates are done via IPC calls to the system controller FW.
+
 config RTC_DRV_DS1216
        tristate "Dallas DS1216"
        depends on SNI_RM
index 4c2832df4697d3cdba4af02990c683dc13118776..2afdaf3ff98660f53c72a189503786ebeac0a27c 100644 (file)
@@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS)    += rtc-cmos.o
 obj-$(CONFIG_RTC_DRV_COH901331)        += rtc-coh901331.o
 obj-$(CONFIG_RTC_DRV_DAVINCI)  += rtc-davinci.o
 obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
+obj-$(CONFIG_RTC_DRV_VRTC)     += rtc-mrst.o
 obj-$(CONFIG_RTC_DRV_DS1216)   += rtc-ds1216.o
 obj-$(CONFIG_RTC_DRV_DS1286)   += rtc-ds1286.o
 obj-$(CONFIG_RTC_DRV_DS1302)   += rtc-ds1302.o
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c
new file mode 100644 (file)
index 0000000..bcd0cf6
--- /dev/null
@@ -0,0 +1,582 @@
+/*
+ * rtc-mrst.c: Driver for Moorestown virtual RTC
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *        Feng Tang (feng.tang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * The vRTC is emulated by the system controller (SCU) firmware; the
+ * real HW RTC is located in the PMIC device. The SCU FW shadows the
+ * PMIC RTC in a memory-mapped IO space that is visible to the host
+ * IA processor.
+ *
+ * This driver is based upon drivers/rtc/rtc-cmos.c
+ */
+
+/*
+ * Note:
+ *  * vRTC only supports binary mode and 24H mode
+ *  * vRTC only supports PIE and AIE, not UIE; its PIE fires only at
+ *    23:59:59 every day, with no support for an adjustable frequency
+ *  * The alarm function is also limited to hr/min/sec.
+ */
+
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+
+#include <asm-generic/rtc.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+
+struct mrst_rtc {
+       struct rtc_device       *rtc;
+       struct device           *dev;
+       int                     irq;
+       struct resource         *iomem;
+
+       u8                      enabled_wake;
+       u8                      suspend_ctrl;
+};
+
+static const char driver_name[] = "rtc_mrst";
+
+#define        RTC_IRQMASK     (RTC_PF | RTC_AF)
+
+static inline int is_intr(u8 rtc_intr)
+{
+       if (!(rtc_intr & RTC_IRQF))
+               return 0;
+       return rtc_intr & RTC_IRQMASK;
+}
+
+/*
+ * rtc_time's year contains the increment over 1900, but vRTC's YEAR
+ * register can't be programmed to a value larger than 0x64, so the
+ * vRTC driver chose to use 1960 (1970 is the UNIX time start point)
+ * as the base, and does the translation at read/write time.
+ *
+ * Why not just use 1970 as the offset? Because using 1960 keeps the
+ * leap-year setting consistent between the vRTC and low-level
+ * physical RTC devices.
+ */
+static int mrst_read_time(struct device *dev, struct rtc_time *time)
+{
+       unsigned long flags;
+
+       if (rtc_is_updating())
+               mdelay(20);
+
+       spin_lock_irqsave(&rtc_lock, flags);
+       time->tm_sec = vrtc_cmos_read(RTC_SECONDS);
+       time->tm_min = vrtc_cmos_read(RTC_MINUTES);
+       time->tm_hour = vrtc_cmos_read(RTC_HOURS);
+       time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+       time->tm_mon = vrtc_cmos_read(RTC_MONTH);
+       time->tm_year = vrtc_cmos_read(RTC_YEAR);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       /* Adjust for the 1960/1900 epoch difference */
+       time->tm_year += 60;
+       time->tm_mon--;
+       return RTC_24H;
+}
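Worked example of the read path: a raw YEAR register value of 51 means 1960 + 51 = 2011, and adding 60 gives tm_year = 111, which is exactly 2011 - 1900 as struct rtc_time expects. mrst_set_time() below performs the inverse, rejecting tm_year outside 70..138 (i.e. 1970..2038) and storing tm_year - 60 into the register.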
+
+static int mrst_set_time(struct device *dev, struct rtc_time *time)
+{
+       int ret;
+       unsigned long flags;
+       unsigned char mon, day, hrs, min, sec;
+       unsigned int yrs;
+
+       yrs = time->tm_year;
+       mon = time->tm_mon + 1;   /* tm_mon starts at zero */
+       day = time->tm_mday;
+       hrs = time->tm_hour;
+       min = time->tm_min;
+       sec = time->tm_sec;
+
+       if (yrs < 70 || yrs > 138)
+               return -EINVAL;
+       yrs -= 60;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       vrtc_cmos_write(yrs, RTC_YEAR);
+       vrtc_cmos_write(mon, RTC_MONTH);
+       vrtc_cmos_write(day, RTC_DAY_OF_MONTH);
+       vrtc_cmos_write(hrs, RTC_HOURS);
+       vrtc_cmos_write(min, RTC_MINUTES);
+       vrtc_cmos_write(sec, RTC_SECONDS);
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME);
+       return ret;
+}
+
+static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char rtc_control;
+
+       if (mrst->irq <= 0)
+               return -EIO;
+
+       /* Basic alarms only support hour, minute, and second fields.
+        * Some also support day and month, for alarms up to a year in
+        * the future.
+        */
+       t->time.tm_mday = -1;
+       t->time.tm_mon = -1;
+       t->time.tm_year = -1;
+
+       /* vRTC only supports binary mode */
+       spin_lock_irq(&rtc_lock);
+       t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
+       t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM);
+       t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM);
+
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       spin_unlock_irq(&rtc_lock);
+
+       t->enabled = !!(rtc_control & RTC_AIE);
+       t->pending = 0;
+
+       return 0;
+}
+
+static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control)
+{
+       unsigned char   rtc_intr;
+
+       /*
+        * NOTE: after changing RTC_xIE bits we always read INTR_FLAGS;
+        * allegedly some older RTCs need that to handle irqs properly
+        */
+       rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS);
+       rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF;
+       if (is_intr(rtc_intr))
+               rtc_update_irq(mrst->rtc, 1, rtc_intr);
+}
+
+static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask)
+{
+       unsigned char   rtc_control;
+
+       /*
+        * Flush any pending IRQ status, notably for update irqs,
+        * before we enable new IRQs
+        */
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       mrst_checkintr(mrst, rtc_control);
+
+       rtc_control |= mask;
+       vrtc_cmos_write(rtc_control, RTC_CONTROL);
+
+       mrst_checkintr(mrst, rtc_control);
+}
+
+static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask)
+{
+       unsigned char   rtc_control;
+
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       rtc_control &= ~mask;
+       vrtc_cmos_write(rtc_control, RTC_CONTROL);
+       mrst_checkintr(mrst, rtc_control);
+}
+
+static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char hrs, min, sec;
+       int ret = 0;
+
+       if (!mrst->irq)
+               return -EIO;
+
+       hrs = t->time.tm_hour;
+       min = t->time.tm_min;
+       sec = t->time.tm_sec;
+
+       spin_lock_irq(&rtc_lock);
+       /* Next rtc irq must not be from previous alarm setting */
+       mrst_irq_disable(mrst, RTC_AIE);
+
+       /* Update alarm */
+       vrtc_cmos_write(hrs, RTC_HOURS_ALARM);
+       vrtc_cmos_write(min, RTC_MINUTES_ALARM);
+       vrtc_cmos_write(sec, RTC_SECONDS_ALARM);
+
+       spin_unlock_irq(&rtc_lock);
+
+       ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM);
+       if (ret)
+               return ret;
+
+       spin_lock_irq(&rtc_lock);
+       if (t->enabled)
+               mrst_irq_enable(mrst, RTC_AIE);
+
+       spin_unlock_irq(&rtc_lock);
+
+       return 0;
+}
+
+static int mrst_irq_set_state(struct device *dev, int enabled)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned long   flags;
+
+       if (!mrst->irq)
+               return -ENXIO;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       if (enabled)
+               mrst_irq_enable(mrst, RTC_PIE);
+       else
+               mrst_irq_disable(mrst, RTC_PIE);
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return 0;
+}
+
+#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
+
+/* Currently, the vRTC doesn't support UIE ON/OFF */
+static int
+mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned long   flags;
+
+       switch (cmd) {
+       case RTC_AIE_OFF:
+       case RTC_AIE_ON:
+               if (!mrst->irq)
+                       return -EINVAL;
+               break;
+       default:
+               /* PIE ON/OFF is handled by mrst_irq_set_state() */
+               return -ENOIOCTLCMD;
+       }
+
+       spin_lock_irqsave(&rtc_lock, flags);
+       switch (cmd) {
+       case RTC_AIE_OFF:       /* alarm off */
+               mrst_irq_disable(mrst, RTC_AIE);
+               break;
+       case RTC_AIE_ON:        /* alarm on */
+               mrst_irq_enable(mrst, RTC_AIE);
+               break;
+       }
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return 0;
+}
+
+#else
+#define        mrst_rtc_ioctl  NULL
+#endif
+
+#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
+
+static int mrst_procfs(struct device *dev, struct seq_file *seq)
+{
+       unsigned char   rtc_control, valid;
+
+       spin_lock_irq(&rtc_lock);
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       valid = vrtc_cmos_read(RTC_VALID);
+       spin_unlock_irq(&rtc_lock);
+
+       return seq_printf(seq,
+                       "periodic_IRQ\t: %s\n"
+                       "alarm\t\t: %s\n"
+                       "BCD\t\t: no\n"
+                       "periodic_freq\t: daily (not adjustable)\n",
+                       (rtc_control & RTC_PIE) ? "on" : "off",
+                       (rtc_control & RTC_AIE) ? "on" : "off");
+}
+
+#else
+#define        mrst_procfs     NULL
+#endif
+
+static const struct rtc_class_ops mrst_rtc_ops = {
+       .ioctl          = mrst_rtc_ioctl,
+       .read_time      = mrst_read_time,
+       .set_time       = mrst_set_time,
+       .read_alarm     = mrst_read_alarm,
+       .set_alarm      = mrst_set_alarm,
+       .proc           = mrst_procfs,
+       .irq_set_state  = mrst_irq_set_state,
+};
+
+static struct mrst_rtc mrst_rtc;
+
+/*
+ * When the vRTC IRQ is captured by the SCU FW, the FW clears the AIE
+ * bit in Reg B, so there is no need for this driver to clear it
+ */
+static irqreturn_t mrst_rtc_irq(int irq, void *p)
+{
+       u8 irqstat;
+
+       spin_lock(&rtc_lock);
+       /* This read will clear all IRQ flags inside Reg C */
+       irqstat = vrtc_cmos_read(RTC_INTR_FLAGS);
+       spin_unlock(&rtc_lock);
+
+       irqstat &= RTC_IRQMASK | RTC_IRQF;
+       if (is_intr(irqstat)) {
+               rtc_update_irq(p, 1, irqstat);
+               return IRQ_HANDLED;
+       }
+       return IRQ_NONE;
+}
+
+static int __init
+vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
+{
+       int retval = 0;
+       unsigned char rtc_control;
+
+       /* There can be only one ... */
+       if (mrst_rtc.dev)
+               return -EBUSY;
+
+       if (!iomem)
+               return -ENODEV;
+
+       iomem = request_mem_region(iomem->start,
+                       iomem->end + 1 - iomem->start,
+                       driver_name);
+       if (!iomem) {
+               dev_dbg(dev, "i/o mem already in use.\n");
+               return -EBUSY;
+       }
+
+       mrst_rtc.irq = rtc_irq;
+       mrst_rtc.iomem = iomem;
+
+       mrst_rtc.rtc = rtc_device_register(driver_name, dev,
+                               &mrst_rtc_ops, THIS_MODULE);
+       if (IS_ERR(mrst_rtc.rtc)) {
+               retval = PTR_ERR(mrst_rtc.rtc);
+               goto cleanup0;
+       }
+
+       mrst_rtc.dev = dev;
+       dev_set_drvdata(dev, &mrst_rtc);
+       rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
+
+       spin_lock_irq(&rtc_lock);
+       mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE);
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       spin_unlock_irq(&rtc_lock);
+
+       if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY)))
+               dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n");
+
+       if (rtc_irq) {
+               retval = request_irq(rtc_irq, mrst_rtc_irq,
+                               IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev),
+                               mrst_rtc.rtc);
+               if (retval < 0) {
+                       dev_dbg(dev, "IRQ %d is already in use, err %d\n",
+                               rtc_irq, retval);
+                       goto cleanup1;
+               }
+       }
+       dev_dbg(dev, "initialised\n");
+       return 0;
+
+cleanup1:
+       mrst_rtc.dev = NULL;
+       rtc_device_unregister(mrst_rtc.rtc);
+cleanup0:
+       release_mem_region(iomem->start, iomem->end + 1 - iomem->start);
+       dev_err(dev, "rtc-mrst: unable to initialise\n");
+       return retval;
+}
+
+static void rtc_mrst_do_shutdown(void)
+{
+       spin_lock_irq(&rtc_lock);
+       mrst_irq_disable(&mrst_rtc, RTC_IRQMASK);
+       spin_unlock_irq(&rtc_lock);
+}
+
+static void __exit rtc_mrst_do_remove(struct device *dev)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       struct resource *iomem;
+
+       rtc_mrst_do_shutdown();
+
+       if (mrst->irq)
+               free_irq(mrst->irq, mrst->rtc);
+
+       rtc_device_unregister(mrst->rtc);
+       mrst->rtc = NULL;
+
+       iomem = mrst->iomem;
+       release_mem_region(iomem->start, iomem->end + 1 - iomem->start);
+       mrst->iomem = NULL;
+
+       mrst->dev = NULL;
+       dev_set_drvdata(dev, NULL);
+}
+
+#ifdef CONFIG_PM
+static int mrst_suspend(struct device *dev, pm_message_t mesg)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char   tmp;
+
+       /* Only the alarm might be a wakeup event source */
+       spin_lock_irq(&rtc_lock);
+       mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL);
+       if (tmp & (RTC_PIE | RTC_AIE)) {
+               unsigned char   mask;
+
+               if (device_may_wakeup(dev))
+                       mask = RTC_IRQMASK & ~RTC_AIE;
+               else
+                       mask = RTC_IRQMASK;
+               tmp &= ~mask;
+               vrtc_cmos_write(tmp, RTC_CONTROL);
+
+               mrst_checkintr(mrst, tmp);
+       }
+       spin_unlock_irq(&rtc_lock);
+
+       if (tmp & RTC_AIE) {
+               mrst->enabled_wake = 1;
+               enable_irq_wake(mrst->irq);
+       }
+
+       dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n",
+                       (tmp & RTC_AIE) ? ", alarm may wake" : "",
+                       tmp);
+
+       return 0;
+}
+
+/*
+ * We want RTC alarms to wake us from the deep power saving state
+ */
+static inline int mrst_poweroff(struct device *dev)
+{
+       return mrst_suspend(dev, PMSG_HIBERNATE);
+}
+
+static int mrst_resume(struct device *dev)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char tmp = mrst->suspend_ctrl;
+
+       /* Re-enable any irqs previously active */
+       if (tmp & RTC_IRQMASK) {
+               unsigned char   mask;
+
+               if (mrst->enabled_wake) {
+                       disable_irq_wake(mrst->irq);
+                       mrst->enabled_wake = 0;
+               }
+
+               spin_lock_irq(&rtc_lock);
+               do {
+                       vrtc_cmos_write(tmp, RTC_CONTROL);
+
+                       mask = vrtc_cmos_read(RTC_INTR_FLAGS);
+                       mask &= (tmp & RTC_IRQMASK) | RTC_IRQF;
+                       if (!is_intr(mask))
+                               break;
+
+                       rtc_update_irq(mrst->rtc, 1, mask);
+                       tmp &= ~RTC_AIE;
+               } while (mask & RTC_AIE);
+               spin_unlock_irq(&rtc_lock);
+       }
+
+       dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp);
+
+       return 0;
+}
+
+#else
+#define        mrst_suspend    NULL
+#define        mrst_resume     NULL
+
+static inline int mrst_poweroff(struct device *dev)
+{
+       return -ENOSYS;
+}
+
+#endif
+
+static int __init vrtc_mrst_platform_probe(struct platform_device *pdev)
+{
+       return vrtc_mrst_do_probe(&pdev->dev,
+                       platform_get_resource(pdev, IORESOURCE_MEM, 0),
+                       platform_get_irq(pdev, 0));
+}
+
+static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev)
+{
+       rtc_mrst_do_remove(&pdev->dev);
+       return 0;
+}
+
+static void vrtc_mrst_platform_shutdown(struct platform_device *pdev)
+{
+       if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev))
+               return;
+
+       rtc_mrst_do_shutdown();
+}
+
+MODULE_ALIAS("platform:vrtc_mrst");
+
+static struct platform_driver vrtc_mrst_platform_driver = {
+       .probe          = vrtc_mrst_platform_probe,
+       .remove         = __exit_p(vrtc_mrst_platform_remove),
+       .shutdown       = vrtc_mrst_platform_shutdown,
+       .driver = {
+               .name           = (char *) driver_name,
+               .suspend        = mrst_suspend,
+               .resume         = mrst_resume,
+       }
+};
+
+static int __init vrtc_mrst_init(void)
+{
+       return platform_driver_register(&vrtc_mrst_platform_driver);
+}
+
+static void __exit vrtc_mrst_exit(void)
+{
+       platform_driver_unregister(&vrtc_mrst_platform_driver);
+}
+
+module_init(vrtc_mrst_init);
+module_exit(vrtc_mrst_exit);
+
+MODULE_AUTHOR("Jacob Pan; Feng Tang");
+MODULE_DESCRIPTION("Driver for Moorestown virtual RTC");
+MODULE_LICENSE("GPL");
index 5476c066d4ee336733445eda2f804561179ecb41..3c4039d5eef12d1b35ffd93c3f1861e43cc9b520 100644 (file)
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        int metadata;
        unsigned int revokes = 0;
        int x;
-       int error;
+       int error = 0;
 
        if (!*top)
                sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
 
-       error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       else if (!sdp->sd_rgrps)
+               error = gfs2_ri_update(ip);
+
        if (error)
                return error;
 
@@ -879,7 +883,8 @@ out_rg_gunlock:
 out_rlist:
        gfs2_rlist_free(&rlist);
 out:
-       gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
        return error;
 }
 
index f92c1770416981df8b625b6f918bac6c061c6e5e..08a8beb152e60d6aa4dd0b38ea852973e99263d4 100644 (file)
@@ -541,21 +541,6 @@ out_locked:
        spin_unlock(&gl->gl_spin);
 }
 
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                unsigned int req_state,
-                                unsigned int flags)
-{
-       int ret = LM_OUT_ERROR;
-
-       if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-               return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
-       if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-               ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-                                                        req_state, flags);
-       return ret;
-}
-
 /**
  * do_xmote - Calls the DLM to change the state of a lock
  * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
 
        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
-       BUG_ON(gl->gl_state == target);
-       BUG_ON(gl->gl_state == gl->gl_target);
+       GLOCK_BUG_ON(gl, gl->gl_state == target);
+       GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
+       gl->gl_req = target;
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
-       ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
-       if (!(ret & LM_OUT_ASYNC)) {
-               finish_xmote(gl, ret);
+       if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+               /* lock_dlm */
+               ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+               GLOCK_BUG_ON(gl, ret);
+       } else { /* lock_nolock */
+               finish_xmote(gl, target);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
-       } else {
-               GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }
+
        spin_lock(&gl->gl_spin);
 }
 
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
 
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
+       struct va_format vaf;
        va_list args;
 
        va_start(args, fmt);
+
        if (seq) {
                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
                seq_printf(seq, gi->string);
        } else {
-               printk(KERN_ERR " ");
-               vprintk(fmt, args);
+               vaf.fmt = fmt;
+               vaf.va = &args;
+
+               printk(KERN_ERR " %pV", &vaf);
        }
+
        va_end(args);
 }
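The %pV printk extension consumes a (format, va_list) pair packaged in a struct va_format, so the fallback path above no longer needs an intermediate buffer. The pattern in isolation (my_printk_err is a hypothetical name):

	static void my_printk_err(const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		printk(KERN_ERR "prefix: %pV", &vaf);	/* expands fmt/args */
		va_end(args);
	}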
 
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
  * @gl: Pointer to the glock
  * @ret: The return value from the dlm
  *
+ * The gl_reply field is updated under the gl_spin lock so that it is
+ * safe for it to share a bitfield word with the other glock state fields.
  */
 
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
 
+       spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
 
        if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-               spin_lock(&gl->gl_spin);
                if (gfs2_should_freeze(gl)) {
                        set_bit(GLF_FROZEN, &gl->gl_flags);
                        spin_unlock(&gl->gl_spin);
                        return;
                }
-               spin_unlock(&gl->gl_spin);
        }
+
+       spin_unlock(&gl->gl_spin);
        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+       smp_wmb();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
        struct task_struct *gh_owner = NULL;
-       char buffer[KSYM_SYMBOL_LEN];
        char flags_buf[32];
 
-       sprint_symbol(buffer, gh->gh_ip);
        if (gh->gh_owner_pid)
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-                 state2str(gh->gh_state),
-                 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                 gh->gh_error, 
-                 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                 gh_owner ? gh_owner->comm : "(ended)", buffer);
+       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+                      state2str(gh->gh_state),
+                      hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+                      gh->gh_error,
+                      gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+                      gh_owner ? gh_owner->comm : "(ended)",
+                      (void *)gh->gh_ip);
        return 0;
 }
 
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
        }
 #endif
 
-       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
-       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
-                                               WQ_FREEZEABLE, 0);
+       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+                                               WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                               0);
        if (IS_ERR(gfs2_delete_workqueue)) {
                destroy_workqueue(glock_workqueue);
                return PTR_ERR(gfs2_delete_workqueue);
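WQ_MEM_RECLAIM (replacing the older WQ_RESCUER spelling) guarantees the workqueue a rescuer thread, so queued glock work can still make forward progress when new worker threads cannot be spawned under memory pressure. The allocation pattern in isolation (hypothetical name; alloc_workqueue() returns NULL on failure):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("my_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!wq)
		return -ENOMEM;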
index db1c26d6d2206c8f9e9b68396380ed8791f3c720..691851ceb6153f59b91cd64d1ce0fecea46904e0 100644 (file)
@@ -87,11 +87,10 @@ enum {
 #define GL_ASYNC               0x00000040
 #define GL_EXACT               0x00000080
 #define GL_SKIP                        0x00000100
-#define GL_ATIME               0x00000200
 #define GL_NOCACHE             0x00000400
   
 /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
  *
  * LM_OUT_ST_MASK
  * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
  * LM_OUT_CANCELED
  * The lock request was canceled.
  *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
  */
 
 #define LM_OUT_ST_MASK         0x00000003
 #define LM_OUT_CANCELED                0x00000008
-#define LM_OUT_ASYNC           0x00000080
-#define LM_OUT_ERROR           0x00000100
+#define LM_OUT_ERROR           0x00000004
 
 /*
  * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
        void (*lm_unmount) (struct gfs2_sbd *sdp);
        void (*lm_withdraw) (struct gfs2_sbd *sdp);
        void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-       unsigned int (*lm_lock) (struct gfs2_glock *gl,
-                                unsigned int req_state, unsigned int flags);
+       int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+                       unsigned int flags);
        void (*lm_cancel) (struct gfs2_glock *gl);
        const match_table_t *lm_tokens;
 };
 
-#define LM_FLAG_TRY            0x00000001
-#define LM_FLAG_TRY_1CB                0x00000002
-#define LM_FLAG_NOEXP          0x00000004
-#define LM_FLAG_ANY            0x00000008
-#define LM_FLAG_PRIORITY       0x00000010
-
-#define GL_ASYNC               0x00000040
-#define GL_EXACT               0x00000080
-#define GL_SKIP                        0x00000100
-#define GL_NOCACHE             0x00000400
-
-#define GLR_TRYFAILED          13
-
 extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
index 0d149dcc04e515adfaaeb632a6677e5e3b555f45..263561bf1a5059b4bf644340faa6a4435c62d14f 100644 (file)
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
 
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-               flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
index 764fbb49efc8e3adbdeda7f83f178b0fd6ea70f8..8d3d2b4a0a7d64431d63edff082cbedbd5b2543b 100644 (file)
@@ -207,12 +207,14 @@ struct gfs2_glock {
 
        spinlock_t gl_spin;
 
-       unsigned int gl_state;
-       unsigned int gl_target;
-       unsigned int gl_reply;
+       /* State fields protected by gl_spin */
+       unsigned int gl_state:2,        /* Current state */
+                    gl_target:2,       /* Target state */
+                    gl_demote_state:2, /* State requested by remote node */
+                    gl_req:2,          /* State in last dlm request */
+                    gl_reply:8;        /* Last reply from the dlm */
+
        unsigned int gl_hash;
-       unsigned int gl_req;
-       unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
        struct list_head gl_holders;
 
index e1213f7f92179aa2472304ff0db4294be66040d8..14e682dbe8bff4bd4063e4a54a5445cbcfe17937 100644 (file)
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
        if (error)
                return error;
 
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               error = vmtruncate(inode, attr->ia_size);
-               if (error)
-                       return error;
-       }
-
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
-
-       gfs2_assert_warn(GFS2_SB(inode), !error);
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
index 1c09425b45fd728ba52c1f5f49c3feac187640a2..6e493aee28f82dfb593574f751ed81025207645c 100644 (file)
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
        return lkf;
 }
 
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
-                             unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+                    unsigned int flags)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-       int error;
        int req;
        u32 lkf;
 
-       gl->gl_req = req_state;
        req = make_mode(req_state);
        lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
 
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
         * Submit the actual lock request.
         */
 
-       error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-       if (error == -EAGAIN)
-               return 0;
-       if (error)
-               return LM_OUT_ERROR;
-       return LM_OUT_ASYNC;
+       return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+                       GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
 }
 
 static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
index 12cbea7502c26040fb90db5750e764bdd831079a..1db6b73432298d4092c0e8684483b8fff29c78ee 100644 (file)
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-       struct buffer_head *dibh;
        u32 ouid, ogid, nuid, ngid;
        int error;
 
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (error)
                goto out_gunlock_q;
 
-       error = gfs2_meta_inode_buffer(ip, &dibh);
+       error = gfs2_setattr_simple(ip, attr);
        if (error)
                goto out_end_trans;
 
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(sdp, !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
                gfs2_quota_change(ip, -blocks, ouid, ogid);
index f606baf9ba7247e9a5fd9ccfb2cc9426019e589e..a689901963dea43c82b6178a4451c09560061e76 100644 (file)
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_limit = qp->qu_limit;
                }
+               if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                       qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_value = qp->qu_value;
+               }
        }
 
        /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
 }
 
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
 
 static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                          struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+       if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+           ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+               fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
        if (fdq->d_fieldmask == 0)
                goto out_i;
 
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
        .get_dqblk      = gfs2_get_dqblk,
        .set_dqblk      = gfs2_set_dqblk,
 };
-
index 33c8407b876f00ceef0741221ebae4ba46ecb426..7293ea27020c680307e0145e863ebbb7eb0d6949 100644 (file)
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-               if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+               if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
  * Returns: 0 on successful update, error code otherwise
  */
 
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
@@ -613,46 +613,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
        return 0;
 }
 
-/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-       struct inode *inode = &ip->i_inode;
-       struct file_ra_state ra_state;
-       struct gfs2_rgrpd *rgd;
-       unsigned int max_data = 0;
-       int error;
-
-       file_ra_state_init(&ra_state, inode->i_mapping);
-       for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-               /* Ignore partials */
-               if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                   i_size_read(inode))
-                       break;
-               error = read_rindex_entry(ip, &ra_state);
-               if (error) {
-                       clear_rgrpdi(sdp);
-                       return error;
-               }
-       }
-       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-               if (rgd->rd_data > max_data)
-                       max_data = rgd->rd_data;
-       sdp->sd_max_rg_data = max_data;
-
-       sdp->sd_rindex_uptodate = 1;
-       return 0;
-}
-
 /**
  * gfs2_rindex_hold - Grab a lock on the rindex
  * @sdp: The GFS2 superblock
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
                        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
-                       error = gfs2_ri_update_special(ip);
+                       error = gfs2_ri_update(ip);
                if (error)
                        return error;
        }
 
+try_again:
        do {
                error = get_local_rgrp(ip, &last_unlinked);
                /* If there is no space, flushing the log may release some */
-               if (error)
+               if (error) {
+                       if (ip == GFS2_I(sdp->sd_rindex) &&
+                           !sdp->sd_rindex_uptodate) {
+                               error = gfs2_ri_update(ip);
+                               if (error)
+                                       return error;
+                               goto try_again;
+                       }
                        gfs2_log_flush(sdp, NULL);
+               }
        } while (error && tries++ < 3);
 
        if (error) {
index 0e35c0466f9a6c5979a3fe8c339def323bc37fad..50c2bb04369c8dd617fed95513461f6dc3651d0d 100644 (file)
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
index 30b58f07c8a6b219fc964efe101ce5f861397885..439b61c03262b767956e23f761b637e0b6905383 100644 (file)
@@ -1296,10 +1296,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-       struct inode *inode = &ip->i_inode;
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_ea_location el;
-       struct buffer_head *dibh;
        int error;
 
        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
        if (error)
                return error;
 
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto out_trans_end;
-
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(GFS2_SB(inode), !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
-out_trans_end:
+       error = gfs2_setattr_simple(ip, attr);
        gfs2_trans_end(sdp);
        return error;
 }
index 182845147fe45bde8f5607a799f23cc1e2818117..08cba2c3b61240e085b9861967af5bd0adb4d227 100644 (file)
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+       struct inode *inode = m->private;
+       struct task_struct *p;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+       proc_sched_autogroup_show_task(p, m);
+
+       put_task_struct(p);
+
+       return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+           size_t count, loff_t *offset)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct task_struct *p;
+       char buffer[PROC_NUMBUF];
+       long nice;
+       int err;
+
+       memset(buffer, 0, sizeof(buffer));
+       if (count > sizeof(buffer) - 1)
+               count = sizeof(buffer) - 1;
+       if (copy_from_user(buffer, buf, count))
+               return -EFAULT;
+
+       err = strict_strtol(strstrip(buffer), 0, &nice);
+       if (err)
+               return -EINVAL;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+
+       err = nice;
+       err = proc_sched_autogroup_set_nice(p, &err);
+       if (err)
+               count = err;
+
+       put_task_struct(p);
+
+       return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+       int ret;
+
+       ret = single_open(filp, sched_autogroup_show, NULL);
+       if (!ret) {
+               struct seq_file *m = filp->private_data;
+
+               m->private = inode;
+       }
+       return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+       .open           = sched_autogroup_open,
+       .read           = seq_read,
+       .write          = sched_autogroup_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
 static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
 {
@@ -2732,6 +2808,9 @@ static const struct pid_entry tgid_base_stuff[] = {
        INF("limits",     S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+       REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
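
The new /proc/<pid>/autogroup file reports the task's autogroup and accepts a nice value on write. A minimal userspace sketch of the interface (the printed format and the value 10 are illustrative):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/self/autogroup", "r+");
        char line[64];

        if (!f)
                return 1;
        if (fgets(line, sizeof(line), f))       /* e.g. "/autogroup-42 nice 0" */
                fputs(line, stdout);
        rewind(f);                              /* reposition before the write */
        fprintf(f, "10\n");                     /* lower this session's group priority */
        fclose(f);
        return 0;
}
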
index 36d57f74cd01c6c126ee2f7c2ea2c98f66868b2f..51494e6b55487f30496c8870165dd75f8ba4c7b1 100644 (file)
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
 extern int wait_for_completion_killable(struct completion *x);
 extern unsigned long wait_for_completion_timeout(struct completion *x,
                                                   unsigned long timeout);
-extern unsigned long wait_for_completion_interruptible_timeout(
-                       struct completion *x, unsigned long timeout);
-extern unsigned long wait_for_completion_killable_timeout(
-                       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_interruptible_timeout(
+       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_killable_timeout(
+       struct completion *x, unsigned long timeout);
 extern bool try_wait_for_completion(struct completion *x);
 extern bool completion_done(struct completion *x);
 
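
Returning long instead of unsigned long lets callers see -ERESTARTSYS as a negative value rather than a huge jiffies count. A sketch of the three-way check a caller makes ('done' is an assumed struct completion):

long ret;

ret = wait_for_completion_interruptible_timeout(&done, msecs_to_jiffies(100));
if (ret < 0)            /* interrupted by a signal (-ERESTARTSYS) */
        return ret;
if (ret == 0)           /* timed out */
        return -ETIMEDOUT;
/* ret > 0: completed with 'ret' jiffies to spare */
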
index a90b3892074a083cd076f695e8fbc4046214c563..1c70028f81f902321ea9e99ccff1619e48911f30 100644 (file)
@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
 extern int ddebug_remove_module(const char *mod_name);
 
 #define dynamic_pr_debug(fmt, ...) do {                                        \
-       __label__ do_printk;                                            \
-       __label__ out;                                                  \
        static struct _ddebug descriptor                                \
        __used                                                          \
        __attribute__((section("__verbose"), aligned(8))) =             \
        { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,            \
                _DPRINTK_FLAGS_DEFAULT };                               \
-       JUMP_LABEL(&descriptor.enabled, do_printk);                     \
-       goto out;                                                       \
-do_printk:                                                             \
-       printk(KERN_DEBUG pr_fmt(fmt),  ##__VA_ARGS__);                 \
-out:   ;                                                               \
+       if (unlikely(descriptor.enabled))                               \
+               printk(KERN_DEBUG pr_fmt(fmt),  ##__VA_ARGS__);         \
        } while (0)
 
 
 #define dynamic_dev_dbg(dev, fmt, ...) do {                            \
-       __label__ do_printk;                                            \
-       __label__ out;                                                  \
        static struct _ddebug descriptor                                \
        __used                                                          \
        __attribute__((section("__verbose"), aligned(8))) =             \
        { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,            \
                _DPRINTK_FLAGS_DEFAULT };                               \
-       JUMP_LABEL(&descriptor.enabled, do_printk);                     \
-       goto out;                                                       \
-do_printk:                                                             \
-       dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);                \
-out:   ;                                                               \
+       if (unlikely(descriptor.enabled))                               \
+               dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);        \
        } while (0)
 
 #else
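
Call sites are unchanged by the JUMP_LABEL removal; pr_debug() still emits a static descriptor into the __verbose section and now gates the printk on a plain unlikely() branch. A sketch of an ordinary call site (frob_device and its argument are assumed names):

/* Expands to dynamic_pr_debug() under CONFIG_DYNAMIC_DEBUG; prints
 * only once this site is enabled via dynamic_debug/control. */
static void frob_device(int id)
{
        pr_debug("frobbing device %d\n", id);
}
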
index fd0c1b857d3dbcd9c074e461ded81b69f3ab897d..330586ffffbbccad534b1f81b7309d766f8d48fb 100644 (file)
@@ -22,7 +22,7 @@
 #include <linux/wait.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
-
+#include <linux/timerqueue.h>
 
 struct hrtimer_clock_base;
 struct hrtimer_cpu_base;
@@ -79,8 +79,8 @@ enum hrtimer_restart {
 
 /**
  * struct hrtimer - the basic hrtimer structure
- * @node:      red black tree node for time ordered insertion
- * @_expires:  the absolute expiry time in the hrtimers internal
+ * @node:      timerqueue node, which also manages node.expires,
+ *             the absolute expiry time in the hrtimer's internal
  *             representation. The time is related to the clock on
  *             which the timer is based. Is setup by adding
  *             slack to the _softexpires value. For non range timers
@@ -101,8 +101,7 @@ enum hrtimer_restart {
  * The hrtimer structure must be initialized by hrtimer_init()
  */
 struct hrtimer {
-       struct rb_node                  node;
-       ktime_t                         _expires;
+       struct timerqueue_node          node;
        ktime_t                         _softexpires;
        enum hrtimer_restart            (*function)(struct hrtimer *);
        struct hrtimer_clock_base       *base;
@@ -141,8 +140,7 @@ struct hrtimer_sleeper {
 struct hrtimer_clock_base {
        struct hrtimer_cpu_base *cpu_base;
        clockid_t               index;
-       struct rb_root          active;
-       struct rb_node          *first;
+       struct timerqueue_head  active;
        ktime_t                 resolution;
        ktime_t                 (*get_time)(void);
        ktime_t                 softirq_time;
@@ -158,7 +156,6 @@ struct hrtimer_clock_base {
  * @lock:              lock protecting the base and associated clock bases
  *                     and timers
  * @clock_base:                array of clock bases for this cpu
- * @curr_timer:                the timer which is executing a callback right now
  * @expires_next:      absolute time of the next event which was scheduled
  *                     via clock_set_next_event()
  * @hres_active:       State of high resolution mode
@@ -184,43 +181,43 @@ struct hrtimer_cpu_base {
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
-       timer->_expires = time;
+       timer->node.expires = time;
        timer->_softexpires = time;
 }
 
 static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
 {
        timer->_softexpires = time;
-       timer->_expires = ktime_add_safe(time, delta);
+       timer->node.expires = ktime_add_safe(time, delta);
 }
 
 static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
 {
        timer->_softexpires = time;
-       timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+       timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
 }
 
 static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
 {
-       timer->_expires.tv64 = tv64;
+       timer->node.expires.tv64 = tv64;
        timer->_softexpires.tv64 = tv64;
 }
 
 static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
 {
-       timer->_expires = ktime_add_safe(timer->_expires, time);
+       timer->node.expires = ktime_add_safe(timer->node.expires, time);
        timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
 }
 
 static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
 {
-       timer->_expires = ktime_add_ns(timer->_expires, ns);
+       timer->node.expires = ktime_add_ns(timer->node.expires, ns);
        timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
 }
 
 static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
 {
-       return timer->_expires;
+       return timer->node.expires;
 }
 
 static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
@@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
 
 static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
 {
-       return timer->_expires.tv64;
+       return timer->node.expires.tv64;
 }
 static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
 {
@@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
 
 static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
 {
-       return ktime_to_ns(timer->_expires);
+       return ktime_to_ns(timer->node.expires);
 }
 
 static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 {
-    return ktime_sub(timer->_expires, timer->base->get_time());
+       return ktime_sub(timer->node.expires, timer->base->get_time());
 }
 
 #ifdef CONFIG_HIGH_RES_TIMERS
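
With the rb_root/first pair replaced by a timerqueue_head, the next-expiring timer on a clock base is the cached leftmost node. A sketch of the lookup ('base' is an assumed struct hrtimer_clock_base pointer):

struct timerqueue_node *node = timerqueue_getnext(&base->active);

if (node) {
        struct hrtimer *timer = container_of(node, struct hrtimer, node);
        ktime_t expires = hrtimer_get_expires(timer);
        /* program the clock event device for 'expires'... */
}
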
index 1f8c06ce0fa66b83760863735eaf1209908205d7..caa151fbebb74c661289a69ffb52762435178d53 100644 (file)
 #include <linux/securebits.h>
 #include <net/net_namespace.h>
 
+#ifdef CONFIG_SMP
+# define INIT_PUSHABLE_TASKS(tsk)                                      \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
+#else
+# define INIT_PUSHABLE_TASKS(tsk)
+#endif
+
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
@@ -83,6 +90,12 @@ extern struct group_info init_groups;
  */
 # define CAP_INIT_BSET  CAP_FULL_SET
 
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST()                                          \
+       .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_TREE_PREEMPT()                                   \
        .rcu_blocked_node = NULL,
@@ -94,7 +107,8 @@ extern struct group_info init_groups;
        .rcu_read_lock_nesting = 0,                                     \
        .rcu_read_unlock_special = 0,                                   \
        .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
-       INIT_TASK_RCU_TREE_PREEMPT()
+       INIT_TASK_RCU_TREE_PREEMPT()                                    \
+       INIT_TASK_RCU_BOOST()
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
@@ -137,7 +151,7 @@ extern struct cred init_cred;
                .nr_cpus_allowed = NR_CPUS,                             \
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
-       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
+       INIT_PUSHABLE_TASKS(tsk)                                        \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
index 79d0c4f6d0719452c20494b1439d0d695e212e90..55e0d4253e4927eb67254f38137b2a9e787afa9d 100644 (file)
@@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
 struct irqaction {
        irq_handler_t handler;
        unsigned long flags;
-       const char *name;
        void *dev_id;
        struct irqaction *next;
        int irq;
-       struct proc_dir_entry *dir;
        irq_handler_t thread_fn;
        struct task_struct *thread;
        unsigned long thread_flags;
-};
+       const char *name;
+       struct proc_dir_entry *dir;
+} ____cacheline_internodealigned_in_smp;
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
 
index 7575bbbdf2a2b8e6a716fb6252c3d8e958756f52..8b17fd8c790d8601f8aff0a33c7b909984e30545 100644 (file)
@@ -308,6 +308,9 @@ struct module
        /* The size of the executable code in each section.  */
        unsigned int init_text_size, core_text_size;
 
+       /* Size of RO sections of the module (text+rodata) */
+       unsigned int init_ro_size, core_ro_size;
+
        /* Arch-specific module values */
        struct mod_arch_specific arch;
 
@@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
 {
        return 0;
 }
-
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
@@ -687,6 +689,13 @@ extern int module_sysfs_initialized;
 
 #define __MODULE_STRING(x) __stringify(x)
 
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+extern void set_all_modules_text_rw(void);
+extern void set_all_modules_text_ro(void);
+#else
+static inline void set_all_modules_text_rw(void) { }
+static inline void set_all_modules_text_ro(void) { }
+#endif
 
 #ifdef CONFIG_GENERIC_BUG
 void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
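
The new helpers let a code patcher lift CONFIG_DEBUG_SET_MODULE_RONX protection across all module text and restore it afterwards. A hedged sketch of the intended bracket (the patching step is a placeholder):

set_all_modules_text_rw();
/* ... rewrite instructions in module .text ... */
set_all_modules_text_ro();
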
index f363bc8fdc74c821c99aa59d5bfcb9554c012c9a..94b48bd40dd735f77963fcd31797d32bb68b3379 100644 (file)
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
+#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
+#define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
 #endif
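
Architectures selecting CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX (s390 in this series) provide their own arch_mutex_cpu_relax(); everyone else falls back to cpu_relax(). A sketch of the sort of owner-spin loop that calls it (the loop condition is illustrative):

while (lock->owner == owner)
        arch_mutex_cpu_relax();         /* arch may yield more cheaply */
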
index f31ef61f1c650b585bd6faf969f7cec754dffe2d..2dea94fc44026a1048f912913be298b8df179873 100644 (file)
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define list_first_entry_rcu(ptr, type, member) \
        list_entry_rcu((ptr)->next, type, member)
 
-#define __list_for_each_rcu(pos, head) \
-       for (pos = rcu_dereference_raw(list_next_rcu(head)); \
-               pos != (head); \
-               pos = rcu_dereference_raw(list_next_rcu((pos)))
-
 /**
  * list_for_each_entry_rcu     -       iterate over rcu list of given type
  * @pos:       the type * to use as a loop cursor.
index 03cda7bed98587b128c5a9953316644a8debb4d2..af5614856285d32e0f07d3ca7e7294b03b9b27b7 100644 (file)
@@ -47,6 +47,8 @@
 extern int rcutorture_runnable; /* for sysctl */
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+#define UINT_CMP_GE(a, b)      (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b)      (UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
 
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
 extern void synchronize_sched(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
 extern int sched_expedited_torture_stats(char *page);
 
 static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
-extern void rcu_init(void);
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern void rcu_check_callbacks(int cpu, int user);
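
UINT_CMP_GE()/UINT_CMP_LT() order 32-bit counters safely across wraparound, mirroring the existing ULONG_CMP_* pair. A worked example:

unsigned int completed = UINT_MAX;      /* counter about to wrap */
unsigned int snap = completed + 2;      /* wraps around to 1 */

/* 1 still compares as "after" UINT_MAX, since the subtraction is
 * modulo 2^32 and checked against UINT_MAX / 2: */
BUG_ON(!UINT_CMP_LT(completed, snap));
BUG_ON(!UINT_CMP_GE(snap, completed));
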
index 13877cb93a6000043f11a6704f2d90b0cc04552d..30ebd7c8d874b4dfeb9c9c9e5c5e857fbb43ab62 100644 (file)
@@ -27,7 +27,9 @@
 
 #include <linux/cache.h>
 
-#define rcu_init_sched()       do { } while (0)
+static inline void rcu_init(void)
+{
+}
 
 #ifdef CONFIG_TINY_RCU
 
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
        synchronize_sched();
 }
 
+static inline void synchronize_sched_expedited(void)
+{
+       synchronize_sched();
+}
+
 #ifdef CONFIG_TINY_RCU
 
 static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
 extern int rcu_scheduler_active __read_mostly;
 extern void rcu_scheduler_starting(void);
-
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
 static inline void rcu_scheduler_starting(void)
 {
 }
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #endif /* __LINUX_RCUTINY_H */
index 95518e6287946177e0eceb5cbf201ebfcaf0e072..3a933482734aeccbafc7a0bb735be11ede47cbd2 100644 (file)
@@ -30,6 +30,7 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
+extern void rcu_init(void);
 extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu);
 extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
 extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
index a99d735db3dfe5ee26fba4aebc5261dedfa74af9..777cd01e240ee0fca7a8b6a76c74137d0dcfaaa9 100644 (file)
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
        spinlock_t lock;
 };
 
+struct autogroup;
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -580,6 +582,9 @@ struct signal_struct {
 
        struct tty_struct *tty; /* NULL if no tty */
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
@@ -1233,13 +1238,18 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
 #endif
 
        struct list_head tasks;
+#ifdef CONFIG_SMP
        struct plist_node pushable_tasks;
+#endif
 
        struct mm_struct *mm, *active_mm;
 #if defined(SPLIT_RSS_COUNTING)
@@ -1763,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #ifdef CONFIG_PREEMPT_RCU
 
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
@@ -1771,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p)
        p->rcu_read_unlock_special = 0;
 #ifdef CONFIG_TREE_PREEMPT_RCU
        p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
        INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
@@ -1876,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
 #endif
 
-extern void sched_idle_next(void);
-
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
 extern void wake_up_idle_cpu(int cpu);
 #else
@@ -1893,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
 extern unsigned int sysctl_sched_child_runs_first;
 
 enum sched_tunable_scaling {
@@ -1910,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *length,
@@ -1935,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
 
 extern unsigned int sysctl_sched_compat_yield;
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1953,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
-extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern int sched_setscheduler(struct task_struct *, int,
+                             const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int,
-                                     struct sched_param *);
+                                     const struct sched_param *);
 extern struct task_struct *idle_task(int cpu);
 extern struct task_struct *curr_task(int cpu);
 extern void set_curr_task(int cpu, struct task_struct *p);
index 7f770c638e99d670840ed99856dce0b07a660f8b..fe817918b30e49ff96648394d99d8fc85eebaac8 100644 (file)
@@ -77,6 +77,8 @@
 #define SFI_OEM_ID_SIZE                6
 #define SFI_OEM_TABLE_ID_SIZE  8
 
+#define SFI_NAME_LEN           16
+
 #define SFI_SYST_SEARCH_BEGIN          0x000E0000
 #define SFI_SYST_SEARCH_END            0x000FFFFF
 
@@ -156,13 +158,13 @@ struct sfi_device_table_entry {
        u16     addr;
        u8      irq;
        u32     max_freq;
-       char    name[16];
+       char    name[SFI_NAME_LEN];
 } __packed;
 
 struct sfi_gpio_table_entry {
-       char    controller_name[16];
+       char    controller_name[SFI_NAME_LEN];
        u16     pin_no;
-       char    pin_name[16];
+       char    pin_name[SFI_NAME_LEN];
 } __packed;
 
 typedef int (*sfi_table_handler) (struct sfi_table_header *table);
index 38cf093ef62c745d9f06e1038127ef61834796a1..6abd9138beda57f7555b96b9fa0d51c60edaa50f 100644 (file)
@@ -24,9 +24,9 @@ struct timer_list {
        int slack;
 
 #ifdef CONFIG_TIMER_STATS
+       int start_pid;
        void *start_site;
        char start_comm[16];
-       int start_pid;
 #endif
 #ifdef CONFIG_LOCKDEP
        struct lockdep_map lockdep_map;
@@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases;
 #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
 #endif
 
+/*
+ * Note that all tvec_bases are 2-byte aligned and the lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
+ */
+#define TBASE_DEFERRABLE_FLAG          (0x1)
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {                \
                .entry = { .prev = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .expires = (_expires),                          \
                .data = (_data),                                \
                .base = &boot_tvec_bases,                       \
+               .slack = -1,                                    \
+               __TIMER_LOCKDEP_MAP_INITIALIZER(                \
+                       __FILE__ ":" __stringify(__LINE__))     \
+       }
+
+#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *)         \
+                 ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
+
+#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
+               .entry = { .prev = TIMER_ENTRY_STATIC },        \
+               .function = (_function),                        \
+               .expires = (_expires),                          \
+               .data = (_data),                                \
+               .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases),  \
                __TIMER_LOCKDEP_MAP_INITIALIZER(                \
                        __FILE__ ":" __stringify(__LINE__))     \
        }
@@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
 
 extern void add_timer(struct timer_list *timer);
 
+extern int try_to_del_timer_sync(struct timer_list *timer);
+
 #ifdef CONFIG_SMP
-  extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
-# define try_to_del_timer_sync(t)      del_timer(t)
 # define del_timer_sync(t)             del_timer(t)
 #endif
 
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
new file mode 100644 (file)
index 0000000..d24aaba
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef _LINUX_TIMERQUEUE_H
+#define _LINUX_TIMERQUEUE_H
+
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+
+
+struct timerqueue_node {
+       struct rb_node node;
+       ktime_t expires;
+};
+
+struct timerqueue_head {
+       struct rb_root head;
+       struct timerqueue_node *next;
+};
+
+
+extern void timerqueue_add(struct timerqueue_head *head,
+                               struct timerqueue_node *node);
+extern void timerqueue_del(struct timerqueue_head *head,
+                               struct timerqueue_node *node);
+extern struct timerqueue_node *timerqueue_iterate_next(
+                                               struct timerqueue_node *node);
+
+/**
+ * timerqueue_getnext - Returns the timer with the earliest expiration time
+ *
+ * @head: head of timerqueue
+ *
+ * Returns a pointer to the timer node that has the
+ * earliest expiration time.
+ */
+static inline
+struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
+{
+       return head->next;
+}
+
+static inline void timerqueue_init(struct timerqueue_node *node)
+{
+       RB_CLEAR_NODE(&node->node);
+}
+
+static inline void timerqueue_init_head(struct timerqueue_head *head)
+{
+       head->head = RB_ROOT;
+       head->next = NULL;
+}
+#endif /* _LINUX_TIMERQUEUE_H */
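
Typical use embeds a timerqueue_node in the client structure and relies on the cached head->next to hand back entries earliest-first. A self-contained sketch:

struct timerqueue_head q;
struct timerqueue_node a, b;

timerqueue_init_head(&q);
timerqueue_init(&a);
timerqueue_init(&b);
a.expires = ktime_set(1, 0);            /* 1 s */
b.expires = ktime_set(0, 500000);       /* 0.5 ms */
timerqueue_add(&q, &a);
timerqueue_add(&q, &b);

BUG_ON(timerqueue_getnext(&q) != &b);   /* earliest node is cached */
timerqueue_del(&q, &b);
BUG_ON(timerqueue_getnext(&q) != &a);   /* next pointer updated on del */
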
index d3e4f87e95c0fa67236f92c2a688fdaa640cfaae..c6814616653b1ff4d385b8b4148c4b2b14847e31 100644 (file)
@@ -32,7 +32,7 @@ struct tracepoint {
        int state;                      /* State. */
        void (*regfunc)(void);
        void (*unregfunc)(void);
-       struct tracepoint_func *funcs;
+       struct tracepoint_func __rcu *funcs;
 } __attribute__((aligned(32)));                /*
                                         * Aligned on 32 bytes because it is
                                         * globally visible and gcc happily
@@ -326,7 +326,7 @@ do_trace:                                                           \
  *             memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
  *             __entry->next_pid       = next->pid;
  *             __entry->next_prio      = next->prio;
- *     )
+ *     ),
  *
  *     *
  *     * Formatted output of a trace record via TP_printk().
index 0c0771f06bfa745e8e4e5add4ec4823cf52eb813..bd257fee60310184b52d0f8c15f1206f6a4f5dad 100644 (file)
@@ -127,12 +127,20 @@ struct execute_work {
        .timer = TIMER_INITIALIZER(NULL, 0, 0),                 \
        }
 
+#define __DEFERRED_WORK_INITIALIZER(n, f) {                    \
+       .work = __WORK_INITIALIZER((n).work, (f)),              \
+       .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0),        \
+       }
+
 #define DECLARE_WORK(n, f)                                     \
        struct work_struct n = __WORK_INITIALIZER(n, f)
 
 #define DECLARE_DELAYED_WORK(n, f)                             \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
 
+#define DECLARE_DEFERRED_WORK(n, f)                            \
+       struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f)
+
 /*
  * initialize a work item's function pointer
  */
index b0b4eb24d592fb1f8ecba11294c10e802ff7cd2b..da39b22636f711548b7e6300a5ebf2b7699ed019 100644 (file)
 #undef CREATE_TRACE_POINTS
 
 #include <linux/stringify.h>
+/*
+ * module.h includes tracepoints, and because ftrace.h
+ * pulls in module.h:
+ *  trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h ->
+ *  linux/ftrace.h -> linux/module.h
+ * we must include module.h here before we play with any of
+ * the TRACE_EVENT() macros, otherwise the tracepoints included
+ * by module.h may break the build.
+ */
+#include <linux/module.h>
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
index 75ce9d500d8e3c62dbfffbc0acafca13d11d90aa..f10293c41b1e8e690706941d12f654550f61b65b 100644 (file)
@@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb,
 
        TP_fast_assign(
                __entry->skbaddr = skb;
-               if (skb) {
-                       __entry->protocol = ntohs(skb->protocol);
-               }
+               __entry->protocol = ntohs(skb->protocol);
                __entry->location = location;
        ),
 
index c9728992a776356e043d21df7b33aa045c2d7904..8dfd094e68753dd919dd50299c82fa7385d6621e 100644 (file)
@@ -393,7 +393,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
        bool "Enable tracing for RCU"
-       depends on TREE_RCU || TREE_PREEMPT_RCU
        help
          This option provides tracing in RCU which presents stats
          in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
          TREE_PREEMPT_RCU implementations, permitting Makefile to
          trivially select kernel/rcutree_trace.c.
 
+config RCU_BOOST
+       bool "Enable RCU priority boosting"
+       depends on RT_MUTEXES && TINY_PREEMPT_RCU
+       default n
+       help
+         This option boosts the priority of preempted RCU readers that
+         block the current preemptible RCU grace period for too long.
+         This option also prevents heavy loads from blocking RCU
+         callback invocation for all flavors of RCU.
+
+         Say Y here if you are working with real-time apps or heavy loads.
+         Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+       int "Real-time priority to boost RCU readers to"
+       range 1 99
+       depends on RCU_BOOST
+       default 1
+       help
+         This option specifies the real-time priority to which preempted
+         RCU readers are to be boosted.  If you are working with CPU-bound
+         real-time applications, you should specify a priority higher than
+         the highest-priority CPU-bound application.
+
+         Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+       int "Milliseconds to delay boosting after RCU grace-period start"
+       range 0 3000
+       depends on RCU_BOOST
+       default 500
+       help
+         This option specifies the time to wait after the beginning of
+         a given grace period before priority-boosting preempted RCU
+         readers blocking that grace period.  Note that any RCU reader
+         blocking an expedited RCU grace period is boosted immediately.
+
+         Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+       int "Microseconds to delay before waiting for readers"
+       range 0 20
+       default 10
+       help
+         This option controls how long SRCU delays before entering its
+         loop waiting on SRCU readers.  The purpose of this loop is
+         to avoid the unconditional context-switch penalty that would
+         otherwise be incurred if there was an active SRCU reader,
+         in a manner similar to adaptive locking schemes.  This should
+         be set to be a bit longer than the common-case SRCU read-side
+         critical-section overhead.
+
+         Accept the default if unsure.
+
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
@@ -741,6 +794,19 @@ config NET_NS
 
 endif # NAMESPACES
 
+config SCHED_AUTOGROUP
+       bool "Automatic process group scheduling"
+       select EVENTFD
+       select CGROUPS
+       select CGROUP_SCHED
+       select FAIR_GROUP_SCHED
+       help
+         This option optimizes the scheduler for common desktop workloads by
+         automatically creating and populating task groups.  This separation
+         of workloads isolates aggressive CPU burners (like build jobs) from
+         desktop applications.  Task group autogeneration is currently based
+         upon task session.
+
 config MM_OWNER
        bool
 
index 0b5ff083fa22fa381a6fc1f092a824f255bda01a..e0f2831634b4c9388e4a1d73d605951253902cb0 100644 (file)
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
index f6e726f184916029e2d1cfdbcd4acb2b26f14e69..156cc555614089345553a6e7710580c4f069be0e 100644 (file)
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-       struct task_struct *caller;
        unsigned long mod;
        void *hcpu;
 };
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
        struct take_cpu_down_param *param = _param;
-       unsigned int cpu = (unsigned long)param->hcpu;
        int err;
 
        /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
 
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-       if (task_cpu(param->caller) == cpu)
-               move_task_off_dead_cpu(cpu, param->caller);
-       /* Force idle task to run as soon as we yield: it should
-          immediately notice cpu is offline and die quickly. */
-       sched_idle_next();
        return 0;
 }
 
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        void *hcpu = (void *)(long)cpu;
        unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
        struct take_cpu_down_param tcd_param = {
-               .caller = current,
                .mod = mod,
                .hcpu = hcpu,
        };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        }
        BUG_ON(cpu_online(cpu));
 
-       /* Wait for it to sleep (leaving idle task). */
+       /*
+        * The migration_call() CPU_DYING callback will have removed all
+        * runnable tasks from the cpu, there's only the idle task left now
+        * that the migration thread is done doing the stop_machine thing.
+        *
+        * Wait for the stop thread to go away.
+        */
        while (!idle_cpu(cpu))
-               yield();
+               cpu_relax();
 
        /* This actually kills the CPU. */
        __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
 int disable_nonboot_cpus(void)
 {
        int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);
+       arch_disable_nonboot_cpus_begin();
 
        printk("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
                }
        }
 
+       arch_disable_nonboot_cpus_end();
+
        if (!error) {
                BUG_ON(num_online_cpus() > 1);
                /* Make sure the CPUs won't be enabled by someone else */
index 5447dc7defa95b8f0e13acb80b45487df7dc2e73..7d164e25b0f0ea42498d748f389824773e2a0c78 100644 (file)
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
 
 static inline void put_signal_struct(struct signal_struct *sig)
 {
-       if (atomic_dec_and_test(&sig->sigcnt))
+       if (atomic_dec_and_test(&sig->sigcnt)) {
+               sched_autogroup_exit(sig);
                free_signal_struct(sig);
+       }
 }
 
 void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        posix_cpu_timers_init_group(sig);
 
        tty_audit_fork(sig);
+       sched_autogroup_fork(sig);
 
        sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
        }
 bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
-               free_signal_struct(p->signal);
+               put_signal_struct(p->signal);
 bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
index 40a8777a27d0d85e173f4b7a3efbd4ecff1c9654..3019b92e691744169b3ac50bb3836afae5ab1085 100644 (file)
@@ -68,6 +68,14 @@ int __read_mostly futex_cmpxchg_enabled;
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED           0x01
+#define FLAGS_CLOCKRT          0x02
+#define FLAGS_HAS_TIMEOUT      0x04
+
 /*
  * Priority Inheritance state:
  */
@@ -123,6 +131,12 @@ struct futex_q {
        u32 bitset;
 };
 
+static const struct futex_q futex_q_init = {
+       /* list gets initialized in queue_me() */
+       .key = FUTEX_KEY_INIT,
+       .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
  * location.  Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
        return 0;
 }
 
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
 {
        drop_futex_key_refs(key);
 }
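
With the FLAGS_* values hoisted to the top of the file, the separate fshared/clockrt arguments collapse into one flags word built once at entry. A sketch of the composition (roughly what the do_futex() entry point does after this change; that conversion is not shown in these hunks):

unsigned int flags = 0;

if (!(op & FUTEX_PRIVATE_FLAG))
        flags |= FLAGS_SHARED;
if (op & FUTEX_CLOCK_REALTIME)
        flags |= FLAGS_CLOCKRT;
/* FLAGS_HAS_TIMEOUT is added on wait paths that carry a timeout */
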
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 /*
  * Wake up waiters matching bitset queued on this futex (uaddr).
  */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        if (!bitset)
                return -EINVAL;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        }
 
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 out:
        return ret;
 }
@@ -917,7 +931,7 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
              int nr_wake, int nr_wake2, int op)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
        int ret, op_ret;
 
 retry:
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -962,11 +976,11 @@ retry_private:
                if (ret)
                        goto out_put_keys;
 
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                        goto retry_private;
 
-               put_futex_key(fshared, &key2);
-               put_futex_key(fshared, &key1);
+               put_futex_key(&key2);
+               put_futex_key(&key1);
                goto retry;
        }
 
@@ -996,9 +1010,9 @@ retry_private:
 
        double_unlock_hb(hb1, hb2);
 out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
 out:
        return ret;
 }
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  * @uaddr1:    source futex user address
- * @fshared:   0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
  * @uaddr2:    target futex user address
  * @nr_wake:   number of waiters to wake (must be 1 for requeue_pi)
  * @nr_requeue:        number of waiters to requeue (0-INT_MAX)
  * @cmpval:    @uaddr1 expected value (or %NULL)
  * @requeue_pi:        if we are attempting to requeue from a non-pi futex to a
- *             pi futex (pi to pi requeue is not supported)
+ *             pi futex (pi to pi requeue is not supported)
  *
  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
  * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  * >=0 - on success, the number of tasks requeued or woken
  *  <0 - on error
  */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                        int nr_wake, int nr_requeue, u32 *cmpval,
-                        int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+                        u32 __user *uaddr2, int nr_wake, int nr_requeue,
+                        u32 *cmpval, int requeue_pi)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
                pi_state = NULL;
        }
 
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1216,11 +1230,11 @@ retry_private:
                        if (ret)
                                goto out_put_keys;
 
-                       if (!fshared)
+                       if (!(flags & FLAGS_SHARED))
                                goto retry_private;
 
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        goto retry;
                }
                if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
                        break;
                case -EFAULT:
                        double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        ret = fault_in_user_writeable(uaddr2);
                        if (!ret)
                                goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
                case -EAGAIN:
                        /* The owner was exiting, try again. */
                        double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        cond_resched();
                        goto retry;
                default:
@@ -1352,9 +1366,9 @@ out_unlock:
                drop_futex_key_refs(&key1);
 
 out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
 out:
        if (pi_state != NULL)
                free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner, int fshared)
+                               struct task_struct *newowner)
 {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
        goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED           0x01
-#define FLAGS_CLOCKRT          0x02
-#define FLAGS_HAS_TIMEOUT      0x04
-
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
  * @uaddr:     user address of the futex
- * @fshared:   whether the futex is shared (1) or not (0)
  * @q:         futex_q (contains pi_state and access to the rt_mutex)
  * @locked:    if the attempt to take the rt_mutex succeeded (1) or not (0)
  *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
  *  0 - success, lock not taken
  * <0 - on error (-EFAULT)
  */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-                      int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
        struct task_struct *owner;
        int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                 * did a lock-steal - fix up the PI-state in that case:
                 */
                if (q->pi_state->owner != current)
-                       ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+                       ret = fixup_pi_state_owner(uaddr, q, current);
                goto out;
        }
 
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                 * lock. Fix the state up.
                 */
                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-               ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+               ret = fixup_pi_state_owner(uaddr, q, owner);
                goto out;
        }
 
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  * futex_wait_setup() - Prepare to wait on a futex
  * @uaddr:     the futex userspace address
  * @val:       the expected value
- * @fshared:   whether the futex is shared (1) or not (0)
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
  * @q:         the associated futex_q
  * @hb:                storage for hash_bucket pointer to be returned to caller
  *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  *  0 - uaddr contains val and hb has been locked
 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
  */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                           struct futex_q *q, struct futex_hash_bucket **hb)
 {
        u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
         * rare, but normal.
         */
 retry:
-       q->key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q->key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
        if (unlikely(ret != 0))
                return ret;
 
@@ -1769,10 +1772,10 @@ retry_private:
                if (ret)
                        goto out;
 
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                        goto retry_private;
 
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
                goto retry;
        }
 
@@ -1783,32 +1786,29 @@ retry_private:
 
 out:
        if (ret)
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
        return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+                     ktime_t *abs_time, u32 bitset)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct restart_block *restart;
        struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
        int ret;
 
        if (!bitset)
                return -EINVAL;
-
-       q.pi_state = NULL;
        q.bitset = bitset;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
 
        if (abs_time) {
                to = &timeout;
 
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                hrtimer_init_sleeper(to, current);
                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                             current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
         * Prepare to wait on uaddr. On success, holds hb lock and increments
         * q.key refs.
         */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
        if (ret)
                goto out;
 
@@ -1852,12 +1852,7 @@ retry:
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
-       restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-       if (fshared)
-               restart->futex.flags |= FLAGS_SHARED;
-       if (clockrt)
-               restart->futex.flags |= FLAGS_CLOCKRT;
+       restart->futex.flags = flags;
 
        ret = -ERESTART_RESTARTBLOCK;
 
@@ -1873,7 +1868,6 @@ out:
 static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = restart->futex.uaddr;
-       int fshared = 0;
        ktime_t t, *tp = NULL;
 
        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
                tp = &t;
        }
        restart->fn = do_no_restart_syscall;
-       if (restart->futex.flags & FLAGS_SHARED)
-               fshared = 1;
-       return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
-                               restart->futex.bitset,
-                               restart->futex.flags & FLAGS_CLOCKRT);
+
+       return (long)futex_wait(uaddr, restart->futex.flags,
+                               restart->futex.val, tp, restart->futex.bitset);
 }
 
 
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
-                        int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+                        ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
        int res, ret;
 
        if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                hrtimer_set_expires(&to->timer, *time);
        }
 
-       q.pi_state = NULL;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
 retry:
-       q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -1941,7 +1929,7 @@ retry_private:
                         * exit to complete.
                         */
                        queue_unlock(&q, hb);
-                       put_futex_key(fshared, &q.key);
+                       put_futex_key(&q.key);
                        cond_resched();
                        goto retry;
                default:
@@ -1971,7 +1959,7 @@ retry_private:
         * Fixup the pi_state owner and possibly acquire the lock if we
         * haven't already.
         */
-       res = fixup_owner(uaddr, fshared, &q, !ret);
+       res = fixup_owner(uaddr, &q, !ret);
        /*
         * If fixup_owner() returned an error, propagate that.  If it acquired
         * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
        queue_unlock(&q, hb);
 
 out_put_key:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
 out:
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
        if (ret)
                goto out_put_key;
 
-       if (!fshared)
+       if (!(flags & FLAGS_SHARED))
                goto retry_private;
 
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
        goto retry;
 }
 
@@ -2020,7 +2008,7 @@ uaddr_faulted:
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2093,14 +2081,14 @@ retry:
 
 out_unlock:
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 
 out:
        return ret;
 
 pi_faulted:
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 
        ret = fault_in_user_writeable(uaddr);
        if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 /**
  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
  * @uaddr:     the futex we initially wait on (non-pi)
- * @fshared:   whether the futexes are shared (1) or not (0).  They must be
+ * @flags:     futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.); both futexes must be
  *             the same type, no requeueing from private to shared, etc.
  * @val:       the expected value of uaddr
  * @abs_time:  absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  *  0 - On success
  * <0 - On error
  */
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                 u32 val, ktime_t *abs_time, u32 bitset,
-                                int clockrt, u32 __user *uaddr2)
+                                u32 __user *uaddr2)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct rt_mutex *pi_mutex = NULL;
        struct futex_hash_bucket *hb;
-       union futex_key key2;
-       struct futex_q q;
+       union futex_key key2 = FUTEX_KEY_INIT;
+       struct futex_q q = futex_q_init;
        int res, ret;
 
        if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
        if (abs_time) {
                to = &timeout;
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                hrtimer_init_sleeper(to, current);
                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                             current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        debug_rt_mutex_init_waiter(&rt_waiter);
        rt_waiter.task = NULL;
 
-       key2 = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out;
 
-       q.pi_state = NULL;
        q.bitset = bitset;
        q.rt_waiter = &rt_waiter;
        q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
         * count.
         */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
        if (ret)
                goto out_key2;
 
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                 */
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
-                       ret = fixup_pi_state_owner(uaddr2, &q, current,
-                                                  fshared);
+                       ret = fixup_pi_state_owner(uaddr2, &q, current);
                        spin_unlock(q.lock_ptr);
                }
        } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
                 */
-               res = fixup_owner(uaddr2, fshared, &q, !ret);
+               res = fixup_owner(uaddr2, &q, !ret);
                /*
                 * If fixup_owner() returned an error, propagate that.  If it
                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        }
 
 out_put_keys:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
 out_key2:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 
 out:
        if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
 {
-       int clockrt, ret = -ENOSYS;
-       int cmd = op & FUTEX_CMD_MASK;
-       int fshared = 0;
+       int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+       unsigned int flags = 0;
 
        if (!(op & FUTEX_PRIVATE_FLAG))
-               fshared = 1;
+               flags |= FLAGS_SHARED;
 
-       clockrt = op & FUTEX_CLOCK_REALTIME;
-       if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
-               return -ENOSYS;
+       if (op & FUTEX_CLOCK_REALTIME) {
+               flags |= FLAGS_CLOCKRT;
+               if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+                       return -ENOSYS;
+       }
 
        switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAIT_BITSET:
-               ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+               ret = futex_wait(uaddr, flags, val, timeout, val3);
                break;
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAKE_BITSET:
-               ret = futex_wake(uaddr, fshared, val, val3);
+               ret = futex_wake(uaddr, flags, val, val3);
                break;
        case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
                break;
        case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
-                                   0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
                break;
        case FUTEX_WAKE_OP:
-               ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+               ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
                break;
        case FUTEX_LOCK_PI:
                if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+                       ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
                break;
        case FUTEX_UNLOCK_PI:
                if (futex_cmpxchg_enabled)
-                       ret = futex_unlock_pi(uaddr, fshared);
+                       ret = futex_unlock_pi(uaddr, flags);
                break;
        case FUTEX_TRYLOCK_PI:
                if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+                       ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
                break;
        case FUTEX_WAIT_REQUEUE_PI:
                val3 = FUTEX_BITSET_MATCH_ANY;
-               ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
-                                           clockrt, uaddr2);
+               ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+                                           uaddr2);
                break;
        case FUTEX_CMP_REQUEUE_PI:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
-                                   1);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
                break;
        default:
                ret = -ENOSYS;
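
The common thread through all of the futex.c hunks above: the separate fshared and clockrt ints collapse into a single unsigned int flags bitmask, decoded once at the top of do_futex() and passed down unchanged (the FLAGS_* defines deleted earlier are relocated, not dropped; they now sit above the functions that consume them, outside this excerpt). A minimal user-space sketch of that translation, with the FLAGS_* values taken from the removed defines and the FUTEX_* constants as found in <linux/futex.h>:

#include <stdio.h>

#ifndef FUTEX_PRIVATE_FLAG              /* values from <linux/futex.h> */
#define FUTEX_PRIVATE_FLAG      128
#define FUTEX_CLOCK_REALTIME    256
#define FUTEX_WAIT_BITSET       9
#endif

#define FLAGS_SHARED            0x01    /* the defines removed above */
#define FLAGS_CLOCKRT           0x02
#define FLAGS_HAS_TIMEOUT       0x04

static unsigned int futex_op_to_flags(int op, int has_timeout)
{
        unsigned int flags = 0;

        if (!(op & FUTEX_PRIVATE_FLAG))         /* default is shared */
                flags |= FLAGS_SHARED;
        if (op & FUTEX_CLOCK_REALTIME)
                flags |= FLAGS_CLOCKRT;
        if (has_timeout)
                flags |= FLAGS_HAS_TIMEOUT;
        return flags;
}

int main(void)
{
        /* a shared FUTEX_WAIT_BITSET with a CLOCK_REALTIME timeout */
        unsigned int f = futex_op_to_flags(FUTEX_WAIT_BITSET |
                                           FUTEX_CLOCK_REALTIME, 1);

        printf("shared=%d clockrt=%d timeout=%d\n",
               !!(f & FLAGS_SHARED), !!(f & FLAGS_CLOCKRT),
               !!(f & FLAGS_HAS_TIMEOUT));
        return 0;
}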
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf854898d889a6a645e44febdd526f..f2429fc3438c4f1c2094e59fe54415dc30e4bb51 100644 (file)
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
                struct hrtimer *timer;
+               struct timerqueue_node *next;
 
-               if (!base->first)
+               next = timerqueue_getnext(&base->active);
+               if (!next)
                        continue;
-               timer = rb_entry(base->first, struct hrtimer, node);
+               timer = container_of(next, struct hrtimer, node);
+
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                /*
                 * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
 static int enqueue_hrtimer(struct hrtimer *timer,
                           struct hrtimer_clock_base *base)
 {
-       struct rb_node **link = &base->active.rb_node;
-       struct rb_node *parent = NULL;
-       struct hrtimer *entry;
-       int leftmost = 1;
-
        debug_activate(timer);
 
-       /*
-        * Find the right place in the rbtree:
-        */
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct hrtimer, node);
-               /*
-                * We dont care about collisions. Nodes with
-                * the same expiry time stay together.
-                */
-               if (hrtimer_get_expires_tv64(timer) <
-                               hrtimer_get_expires_tv64(entry)) {
-                       link = &(*link)->rb_left;
-               } else {
-                       link = &(*link)->rb_right;
-                       leftmost = 0;
-               }
-       }
-
-       /*
-        * Insert the timer to the rbtree and check whether it
-        * replaces the first pending timer
-        */
-       if (leftmost)
-               base->first = &timer->node;
+       timerqueue_add(&base->active, &timer->node);
 
-       rb_link_node(&timer->node, parent, link);
-       rb_insert_color(&timer->node, &base->active);
        /*
         * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
         * state of a possibly running callback.
         */
        timer->state |= HRTIMER_STATE_ENQUEUED;
 
-       return leftmost;
+       return (&timer->node == base->active.next);
 }
 
 /*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
        if (!(timer->state & HRTIMER_STATE_ENQUEUED))
                goto out;
 
-       /*
-        * Remove the timer from the rbtree and replace the first
-        * entry pointer if necessary.
-        */
-       if (base->first == &timer->node) {
-               base->first = rb_next(&timer->node);
+       if (&timer->node == timerqueue_getnext(&base->active)) {
 #ifdef CONFIG_HIGH_RES_TIMERS
                /* Reprogram the clock event device. if enabled */
                if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
                }
 #endif
        }
-       rb_erase(&timer->node, &base->active);
+       timerqueue_del(&base->active, &timer->node);
 out:
        timer->state = newstate;
 }
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
        if (!hrtimer_hres_active()) {
                for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
                        struct hrtimer *timer;
+                       struct timerqueue_node *next;
 
-                       if (!base->first)
+                       next = timerqueue_getnext(&base->active);
+                       if (!next)
                                continue;
 
-                       timer = rb_entry(base->first, struct hrtimer, node);
+                       timer = container_of(next, struct hrtimer, node);
                        delta.tv64 = hrtimer_get_expires_tv64(timer);
                        delta = ktime_sub(delta, base->get_time());
                        if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 
        timer->base = &cpu_base->clock_base[clock_id];
        hrtimer_init_timer_hres(timer);
+       timerqueue_init(&timer->node);
 
 #ifdef CONFIG_TIMER_STATS
        timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
 
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                ktime_t basenow;
-               struct rb_node *node;
+               struct timerqueue_node *node;
 
                basenow = ktime_add(now, base->offset);
 
-               while ((node = base->first)) {
+               while ((node = timerqueue_getnext(&base->active))) {
                        struct hrtimer *timer;
 
-                       timer = rb_entry(node, struct hrtimer, node);
+                       timer = container_of(node, struct hrtimer, node);
 
                        /*
                         * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
  */
 void hrtimer_run_queues(void)
 {
-       struct rb_node *node;
+       struct timerqueue_node *node;
        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        struct hrtimer_clock_base *base;
        int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
 
        for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
                base = &cpu_base->clock_base[index];
-
-               if (!base->first)
+               if (!timerqueue_getnext(&base->active))
                        continue;
 
                if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
 
                raw_spin_lock(&cpu_base->lock);
 
-               while ((node = base->first)) {
+               while ((node = timerqueue_getnext(&base->active))) {
                        struct hrtimer *timer;
 
-                       timer = rb_entry(node, struct hrtimer, node);
+                       timer = container_of(node, struct hrtimer, node);
                        if (base->softirq_time.tv64 <=
                                        hrtimer_get_expires_tv64(timer))
                                break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
        raw_spin_lock_init(&cpu_base->lock);
 
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                cpu_base->clock_base[i].cpu_base = cpu_base;
+               timerqueue_init_head(&cpu_base->clock_base[i].active);
+       }
 
        hrtimer_init_hres(cpu_base);
 }
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
 {
        struct hrtimer *timer;
-       struct rb_node *node;
+       struct timerqueue_node *node;
 
-       while ((node = rb_first(&old_base->active))) {
-               timer = rb_entry(node, struct hrtimer, node);
+       while ((node = timerqueue_getnext(&old_base->active))) {
+               timer = container_of(node, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_deactivate(timer);
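
The conversion above replaces every open-coded rbtree walk in hrtimer.c with the timerqueue helpers: the timerqueue head caches the leftmost (earliest-expiring) node, so peeking at the next timer becomes an O(1) pointer read instead of an rb_first() descent, and the hrtimer is recovered with container_of() because struct hrtimer now embeds a struct timerqueue_node. A condensed sketch of the lookup pattern the hunks repeat (kernel context, not a standalone program; peek_earliest is a name invented for this sketch):

#include <linux/hrtimer.h>      /* assumed to pull in <linux/timerqueue.h> */

/* O(1) peek at the earliest-expiring timer on a clock base */
static struct hrtimer *peek_earliest(struct hrtimer_clock_base *base)
{
        struct timerqueue_node *next = timerqueue_getnext(&base->active);

        return next ? container_of(next, struct hrtimer, node) : NULL;
}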
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952e0afb0489017c265a943a4a7d464..91a5fa25054e1d14d62339749f3229fae49f3766 100644 (file)
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
  */
 static int irq_thread(void *data)
 {
-       struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+       static struct sched_param param = {
+               .sched_priority = MAX_USER_RT_PRIO/2,
+       };
        struct irqaction *action = data;
        struct irq_desc *desc = irq_to_desc(action->irq);
        int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2e11acad4d866ab31224a5a5cab16..5355cfd44a3fd21cd767c13d053410ec338f2ede 100644 (file)
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
        wait_for_completion(&create.done);
 
        if (!IS_ERR(create.result)) {
-               struct sched_param param = { .sched_priority = 0 };
+               static struct sched_param param = { .sched_priority = 0 };
                va_list args;
 
                va_start(args, namefmt);
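
This hunk and the irq_thread() one above make the same change for the same reason: the struct sched_param only seeds the scheduler call, is never written afterward, and declaring it static moves it off the small fixed-size kernel stack into .data, where every invocation shares one copy. A user-space analogue of the pattern, with an illustrative priority (setting SCHED_FIFO needs CAP_SYS_NICE or an rtprio rlimit):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void *worker(void *arg)
{
        /* static const: one shared read-only copy, nothing on the stack */
        static const struct sched_param param = { .sched_priority = 50 };
        int err = pthread_setschedparam(pthread_self(), SCHED_FIFO, &param);

        if (err)
                fprintf(stderr, "pthread_setschedparam: %s\n", strerror(err));
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
}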
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 59b76c8ce9d7172e8176f355da9719495077a133..1969d2fc4b36328cf48798620506ddcd0ec330d0 100644 (file)
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                namelen += 2;
 
        for (i = 0; i < LOCKSTAT_POINTS; i++) {
-               char sym[KSYM_SYMBOL_LEN];
                char ip[32];
 
                if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                if (!i)
                        seq_line(m, '-', 40-namelen, namelen);
 
-               sprint_symbol(sym, class->contention_point[i]);
                snprintf(ip, sizeof(ip), "[<%p>]",
                                (void *)class->contention_point[i]);
-               seq_printf(m, "%40s %14lu %29s %s\n", name,
-                               stats->contention_point[i],
-                               ip, sym);
+               seq_printf(m, "%40s %14lu %29s %pS\n",
+                          name, stats->contention_point[i],
+                          ip, (void *)class->contention_point[i]);
        }
        for (i = 0; i < LOCKSTAT_POINTS; i++) {
-               char sym[KSYM_SYMBOL_LEN];
                char ip[32];
 
                if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                if (!i)
                        seq_line(m, '-', 40-namelen, namelen);
 
-               sprint_symbol(sym, class->contending_point[i]);
                snprintf(ip, sizeof(ip), "[<%p>]",
                                (void *)class->contending_point[i]);
-               seq_printf(m, "%40s %14lu %29s %s\n", name,
-                               stats->contending_point[i],
-                               ip, sym);
+               seq_printf(m, "%40s %14lu %29s %pS\n",
+                          name, stats->contending_point[i],
+                          ip, (void *)class->contending_point[i]);
        }
        if (i) {
                seq_puts(m, "\n");
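
Both loops shed a KSYM_SYMBOL_LEN (several hundred byte) buffer from the stack: the kernel's vsprintf resolves a text address to symbol+offset itself via the %pS extension, so the sprint_symbol() round-trip is unnecessary. A side-by-side sketch of the two idioms (kernel context; show_point is a name invented here):

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void show_point(unsigned long addr)
{
        char sym[KSYM_SYMBOL_LEN];

        sprint_symbol(sym, addr);               /* old: temp stack buffer */
        printk(KERN_DEBUG "%s\n", sym);

        printk(KERN_DEBUG "%pS\n", (void *)addr); /* new: vsprintf resolves */
}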
diff --git a/kernel/module.c b/kernel/module.c
index d190664f25ff3fa10dca29f37b483f08ad07eae1..34e00b708fad2c79b260ab3d8d4cc199cece8eca 100644 (file)
@@ -56,6 +56,7 @@
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
 #include <linux/jump_label.h>
+#include <linux/pfn.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 #define ARCH_SHF_SMALL 0
 #endif
 
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ */
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/*
+ * Given BASE and SIZE this macro calculates the number of pages the
+ * memory region occupies.
+ */
+#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ?                \
+               (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
+                        PFN_DOWN((unsigned long)BASE) + 1)     \
+               : (0UL))
+
 /* If this is set, the section belongs in the init part of the module */
 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
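
MOD_NUMBER_OF_PAGES counts every page the region touches, including partial first and last pages, which is why it takes PFN_DOWN of the last byte rather than PFN_UP of the end. A quick user-space check of the arithmetic (PAGE_SHIFT and PFN_DOWN are re-declared here only for the standalone build; the kernel gets them from <linux/pfn.h>, included above, and the addresses are made up):

#include <stdio.h>

#define PAGE_SHIFT      12                      /* 4 KiB pages, assumed */
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)

#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ?         \
                (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
                         PFN_DOWN((unsigned long)(BASE)) + 1)   \
                : (0UL))

int main(void)
{
        /* 0x300 bytes starting 0x100 short of a page boundary: 2 pages */
        printf("%lu\n", MOD_NUMBER_OF_PAGES(0x1f00UL, 0x300UL));
        /* an empty region occupies no pages at all */
        printf("%lu\n", MOD_NUMBER_OF_PAGES(0x1f00UL, 0UL));
        return 0;
}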
 
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
        return 0;
 }
 
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ */
+void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+{
+       unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
+       unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+
+       if (end_pfn > begin_pfn)
+               set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+}
+
+static void set_section_ro_nx(void *base,
+                       unsigned long text_size,
+                       unsigned long ro_size,
+                       unsigned long total_size)
+{
+       /* begin and end PFNs of the current subsection */
+       unsigned long begin_pfn;
+       unsigned long end_pfn;
+
+       /*
+        * Set RO for module text and RO-data:
+        * - Always protect first page.
+        * - Do not protect last partial page.
+        */
+       if (ro_size > 0)
+               set_page_attributes(base, base + ro_size, set_memory_ro);
+
+       /*
+        * Set NX permissions for module data:
+        * - Do not protect first partial page.
+        * - Always protect last page.
+        */
+       if (total_size > text_size) {
+               begin_pfn = PFN_UP((unsigned long)base + text_size);
+               end_pfn = PFN_UP((unsigned long)base + total_size);
+               if (end_pfn > begin_pfn)
+                       set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+       }
+}
+
+/* Setting memory back to RW+NX before releasing it */
+void unset_section_ro_nx(struct module *mod, void *module_region)
+{
+       unsigned long total_pages;
+
+       if (mod->module_core == module_region) {
+               /* Set core as NX+RW */
+               total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
+               set_memory_nx((unsigned long)mod->module_core, total_pages);
+               set_memory_rw((unsigned long)mod->module_core, total_pages);
+
+       } else if (mod->module_init == module_region) {
+               /* Set init as NX+RW */
+               total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
+               set_memory_nx((unsigned long)mod->module_init, total_pages);
+               set_memory_rw((unsigned long)mod->module_init, total_pages);
+       }
+}
+
+/* Iterate through all modules and set each module's text as RW */
+void set_all_modules_text_rw()
+{
+       struct module *mod;
+
+       mutex_lock(&module_mutex);
+       list_for_each_entry_rcu(mod, &modules, list) {
+               if ((mod->module_core) && (mod->core_text_size)) {
+                       set_page_attributes(mod->module_core,
+                                               mod->module_core + mod->core_text_size,
+                                               set_memory_rw);
+               }
+               if ((mod->module_init) && (mod->init_text_size)) {
+                       set_page_attributes(mod->module_init,
+                                               mod->module_init + mod->init_text_size,
+                                               set_memory_rw);
+               }
+       }
+       mutex_unlock(&module_mutex);
+}
+
+/* Iterate through all modules and set each module's text as RO */
+void set_all_modules_text_ro()
+{
+       struct module *mod;
+
+       mutex_lock(&module_mutex);
+       list_for_each_entry_rcu(mod, &modules, list) {
+               if ((mod->module_core) && (mod->core_text_size)) {
+                       set_page_attributes(mod->module_core,
+                                               mod->module_core + mod->core_text_size,
+                                               set_memory_ro);
+               }
+               if ((mod->module_init) && (mod->init_text_size)) {
+                       set_page_attributes(mod->module_init,
+                                               mod->module_init + mod->init_text_size,
+                                               set_memory_ro);
+               }
+       }
+       mutex_unlock(&module_mutex);
+}
+#else
+static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
+static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
+#endif
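
Taken together with the debug_align() padding added to layout_sections() further down, the functions above carve a module's core allocation into three page-aligned regions with progressively weaker protections; a sketch of the resulting layout (sizes illustrative):

/*
 *   module_core                                              core_size
 *   |-- text: RO+X ----|-- rodata: RO+NX ----|-- data: RW+NX --|
 *   0              core_text_size        core_ro_size
 *
 * applied at load time by:
 *   set_section_ro_nx(mod->module_core, mod->core_text_size,
 *                     mod->core_ro_size, mod->core_size);
 */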
+
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
        destroy_params(mod->kp, mod->num_kp);
 
        /* This may be NULL, but that's OK */
+       unset_section_ro_nx(mod, mod->module_init);
        module_free(mod, mod->module_init);
        kfree(mod->args);
        percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
        lockdep_free_key_range(mod->module_core, mod->core_size);
 
        /* Finally, free the core (containing the module structure) */
+       unset_section_ro_nx(mod, mod->module_core);
        module_free(mod, mod->module_core);
 
 #ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
                        s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
                        DEBUGP("\t%s\n", name);
                }
-               if (m == 0)
+               switch (m) {
+               case 0: /* executable */
+                       mod->core_size = debug_align(mod->core_size);
                        mod->core_text_size = mod->core_size;
+                       break;
+               case 1: /* RO: text and ro-data */
+                       mod->core_size = debug_align(mod->core_size);
+                       mod->core_ro_size = mod->core_size;
+                       break;
+               case 3: /* whole core */
+                       mod->core_size = debug_align(mod->core_size);
+                       break;
+               }
        }
 
        DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
                                         | INIT_OFFSET_MASK);
                        DEBUGP("\t%s\n", sname);
                }
-               if (m == 0)
+               switch (m) {
+               case 0: /* executable */
+                       mod->init_size = debug_align(mod->init_size);
                        mod->init_text_size = mod->init_size;
+                       break;
+               case 1: /* RO: text and ro-data */
+                       mod->init_size = debug_align(mod->init_size);
+                       mod->init_ro_size = mod->init_size;
+                       break;
+               case 3: /* whole init */
+                       mod->init_size = debug_align(mod->init_size);
+                       break;
+               }
        }
 }
 
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
        blocking_notifier_call_chain(&module_notify_list,
                        MODULE_STATE_COMING, mod);
 
+       /* Set RO and NX regions for core */
+       set_section_ro_nx(mod->module_core,
+                               mod->core_text_size,
+                               mod->core_ro_size,
+                               mod->core_size);
+
+       /* Set RO and NX regions for init */
+       set_section_ro_nx(mod->module_init,
+                               mod->init_text_size,
+                               mod->init_ro_size,
+                               mod->init_size);
+
        do_mod_ctors(mod);
        /* Start the module */
        if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
        mod->symtab = mod->core_symtab;
        mod->strtab = mod->core_strtab;
 #endif
+       unset_section_ro_nx(mod, mod->module_init);
        module_free(mod, mod->module_init);
        mod->module_init = NULL;
        mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f509ee3f9d8a665bc4d3b78a27f74..a5889fb28ecff33eaf5fae64c9d2a50ca03cb2f7 100644 (file)
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 * memory barriers as we'll eventually observe the right
                 * values at the cost of a few extra spins.
                 */
-               cpu_relax();
+               arch_mutex_cpu_relax();
        }
 #endif
        spin_lock_mutex(&lock->wait_lock, flags);
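
This one-liner pairs with the arch/Kconfig and arch/s390 entries in this merge's file list: the adaptive spin in __mutex_lock_common() used to call cpu_relax(), which on s390 yields the CPU to the hypervisor, far too costly for a short owner-spin. The likely shape of the hook, sketched from those file names rather than quoted from the patch:

/* <linux/mutex.h>: default for every other architecture (sketch) */
#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
#define arch_mutex_cpu_relax()  cpu_relax()
#endif

/* arch/s390/include/asm/mutex.h (sketch): compiler barrier, no yield */
#define arch_mutex_cpu_relax()  barrier()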
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9ca4973f736d53b04bf4eea9373ce635cf7098c3..93bd2eb2bc53efe76dd120501b0cbda115b71bfd 100644 (file)
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
 
 static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
 
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+
+#define lock_timer(tid, flags)                                            \
+({     struct k_itimer *__timr;                                           \
+       __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags));  \
+       __timr;                                                            \
+})
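
The wrapper changes nothing for callers; the statement expression plus __cond_lock() only teaches sparse that a non-NULL return comes back holding it_lock, so lock-balance checking works across the call. A hypothetical caller, purely for illustration:

static int demo_timer_op(timer_t tid)
{
        unsigned long flags;
        struct k_itimer *timr = lock_timer(tid, &flags);

        if (!timr)
                return -EINVAL;
        /* ... work on timr with it_lock held, irq state saved in flags ... */
        unlock_timer(timr, flags);
        return 0;
}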
 
 static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
 {
@@ -619,7 +625,7 @@ out:
  * the find to the timer lock.  To avoid a deadlock, the timer id MUST
  * be released without holding the timer lock.
  */
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
 {
        struct k_itimer *timr;
        /*
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498844c113cecc9792eabd063e1d87b..ab3ffc5b3b64613507134573dbb94af132c4adff 100644 (file)
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
 
 void printk_tick(void)
 {
-       if (__get_cpu_var(printk_pending)) {
-               __get_cpu_var(printk_pending) = 0;
+       if (__this_cpu_read(printk_pending)) {
+               __this_cpu_write(printk_pending, 0);
                wake_up_interruptible(&log_wait);
        }
 }
 
 int printk_needs_cpu(int cpu)
 {
-       if (unlikely(cpu_is_offline(cpu)))
+       if (cpu_is_offline(cpu))
                printk_tick();
-       return per_cpu(printk_pending, cpu);
+       return __this_cpu_read(printk_pending);
 }
 
 void wake_up_klogd(void)
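
The __this_cpu_read()/__this_cpu_write() forms operate on the current CPU's instance without first materializing its address, which on x86 compiles down to a single %gs-relative access; __get_cpu_var() by contrast computes a pointer every time. A condensed sketch of the idiom (kernel context; the demo_* names are invented):

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_pending);

static void demo_mark(void)
{
        __this_cpu_write(demo_pending, 1);      /* one %gs-relative store */
}

static int demo_test_and_clear(void)
{
        if (!__this_cpu_read(demo_pending))
                return 0;
        __this_cpu_write(demo_pending, 0);
        return 1;
}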
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735342acb10bc3e3ae787e62ade34f1d5955..0344937247495d69b3ef5255ace0d94b2250fac2 100644 (file)
 #include <linux/time.h>
 #include <linux/cpu.h>
 
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-       struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
-       struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
-       struct rcu_head **curtail;      /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
-       .donetail       = &rcu_sched_ctrlblk.rcucblist,
-       .curtail        = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-       .donetail       = &rcu_bh_ctrlblk.rcucblist,
-       .curtail        = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
 
 /* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
 static void __call_rcu(struct rcu_head *head,
                       void (*func)(struct rcu_head *rcu),
                       struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
 {
        if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
            rcu_qsctr_help(&rcu_bh_ctrlblk))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
 }
 
 /*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
 void rcu_bh_qs(int cpu)
 {
        if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
 }
 
 /*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
 }
 
 /*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
+ * whose grace period has elapsed.
  */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
        struct rcu_head *next, *list;
        unsigned long flags;
+       RCU_TRACE(int cb_count = 0);
 
        /* If no RCU callbacks ready to invoke, just return. */
        if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
                next = list->next;
                prefetch(next);
                debug_rcu_head_unqueue(list);
+               local_bh_disable();
                list->func(list);
+               local_bh_enable();
                list = next;
+               RCU_TRACE(cb_count++);
        }
+       RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
 }
 
 /*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed.  It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
  */
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
 {
-       __rcu_process_callbacks(&rcu_sched_ctrlblk);
-       __rcu_process_callbacks(&rcu_bh_ctrlblk);
-       rcu_preempt_process_callbacks();
+       unsigned long work;
+       unsigned long morework;
+       unsigned long flags;
+
+       for (;;) {
+               wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+               morework = rcu_boost();
+               local_irq_save(flags);
+               work = have_rcu_kthread_work;
+               have_rcu_kthread_work = morework;
+               local_irq_restore(flags);
+               if (work) {
+                       rcu_process_callbacks(&rcu_sched_ctrlblk);
+                       rcu_process_callbacks(&rcu_bh_ctrlblk);
+                       rcu_preempt_process_callbacks();
+               }
+               schedule_timeout_interruptible(1); /* Leave CPU for others. */
+       }
+
+       return 0;  /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       have_rcu_kthread_work = 1;
+       wake_up(&rcu_kthread_wq);
+       local_irq_restore(flags);
 }
 
 /*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
        local_irq_save(flags);
        *rcp->curtail = head;
        rcp->curtail = &head->next;
+       RCU_TRACE(rcp->qlen++);
        local_irq_restore(flags);
 }
 
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
 {
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+       struct sched_param sp;
+
+       rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+       sp.sched_priority = RCU_BOOST_PRIO;
+       sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+       return 0;
 }
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f745ffa1f4535c69467ea59704e2ddbe97..015abaea962ad4087130014506b72dc19b33b43d 100644 (file)
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt)        stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+       struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
+       struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
+       struct rcu_head **curtail;      /* ->next pointer of last CB. */
+       RCU_TRACE(long qlen);           /* Number of pending CBs. */
+};
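
RCU_TRACE() makes the tracing fields and their updates vanish entirely when CONFIG_RCU_TRACE is off, so the qlen bookkeeping costs TINY builds nothing. A standalone illustration (the CONFIG knob is faked with a plain #define, and the semicolon is placed inside the macro argument for a clean standalone build; the kernel writes the field as RCU_TRACE(long qlen); with the semicolon outside):

#include <stdio.h>

/* #define CONFIG_RCU_TRACE */  /* uncomment to compile the counter in */
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt
#else
#define RCU_TRACE(stmt)
#endif

struct ctrlblk {
        int pending;
        RCU_TRACE(long qlen;)   /* field disappears when tracing is off */
};

int main(void)
{
        struct ctrlblk b = { 0 };

        b.pending = 1;
        RCU_TRACE(b.qlen++;)    /* statement likewise compiles away */
        printf("sizeof(struct ctrlblk) = %zu\n", sizeof(b));
        return 0;
}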
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+       .donetail       = &rcu_sched_ctrlblk.rcucblist,
+       .curtail        = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+       .donetail       = &rcu_bh_ctrlblk.rcucblist,
+       .curtail        = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
 #ifdef CONFIG_TINY_PREEMPT_RCU
 
 #include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
        struct list_head *gp_tasks;
                                /* Pointer to the first task blocking the */
                                /*  current grace period, or NULL if there */
-                               /*  is not such task. */
+                               /*  is no such task. */
        struct list_head *exp_tasks;
                                /* Pointer to first task blocking the */
                                /*  current expedited grace period, or NULL */
                                /*  if there is no such task.  If there */
                                /*  is no current expedited grace period, */
                                /*  then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+       struct list_head *boost_tasks;
+                               /* Pointer to first task that needs to be */
+                               /*  priority-boosted, or NULL if no priority */
+                               /*  boosting is needed.  If there is no */
+                               /*  current or expedited grace period, there */
+                               /*  can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
        u8 gpnum;               /* Current grace period. */
        u8 gpcpu;               /* Last grace period blocked by the CPU. */
        u8 completed;           /* Last grace period completed. */
                                /*  If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+       s8 boosted_this_gp;     /* Has boosting already happened? */
+       unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+       unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+       unsigned long n_tasks_boosted;
+       unsigned long n_exp_boosts;
+       unsigned long n_normal_boosts;
+       unsigned long n_normal_balk_blkd_tasks;
+       unsigned long n_normal_balk_gp_tasks;
+       unsigned long n_normal_balk_boost_tasks;
+       unsigned long n_normal_balk_boosted;
+       unsigned long n_normal_balk_notyet;
+       unsigned long n_normal_balk_nos;
+       unsigned long n_exp_balk_blkd_tasks;
+       unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
 };
 
 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -121,6 +183,210 @@ static int rcu_preempt_gp_in_progress(void)
        return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
 }
 
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, returning
+ * NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+       struct list_head *np;
+
+       np = t->rcu_node_entry.next;
+       if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+               np = NULL;
+       return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistics for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+       seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+                  rcu_preempt_ctrlblk.rcb.qlen,
+                  rcu_preempt_ctrlblk.n_grace_periods,
+                  rcu_preempt_ctrlblk.gpnum,
+                  rcu_preempt_ctrlblk.gpcpu,
+                  rcu_preempt_ctrlblk.completed,
+                  "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+                  "N."[!rcu_preempt_ctrlblk.gp_tasks],
+                  "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+       seq_printf(m, "             ttb=%c btg=",
+                  "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+       switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+       case -1:
+               seq_puts(m, "exp");
+               break;
+       case 0:
+               seq_puts(m, "no");
+               break;
+       case 1:
+               seq_puts(m, "begun");
+               break;
+       case 2:
+               seq_puts(m, "done");
+               break;
+       default:
+               seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+       }
+       seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+                  rcu_preempt_ctrlblk.n_tasks_boosted,
+                  rcu_preempt_ctrlblk.n_exp_boosts,
+                  rcu_preempt_ctrlblk.n_normal_boosts,
+                  (int)(jiffies & 0xffff),
+                  (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+       seq_printf(m, "             %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+                  "normal balk",
+                  rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_boosted,
+                  rcu_preempt_ctrlblk.n_normal_balk_notyet,
+                  rcu_preempt_ctrlblk.n_normal_balk_nos);
+       seq_printf(m, "             exp balk: bt=%lu nos=%lu\n",
+                  rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+                  rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+       unsigned long flags;
+       struct rt_mutex mtx;
+       struct list_head *np;
+       struct task_struct *t;
+
+       if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+               return 0;  /* Nothing to boost. */
+       raw_local_irq_save(flags);
+       rcu_preempt_ctrlblk.boosted_this_gp++;
+       t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+                        rcu_node_entry);
+       np = rcu_next_node_entry(t);
+       rt_mutex_init_proxy_locked(&mtx, t);
+       t->rcu_boost_mutex = &mtx;
+       t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+       raw_local_irq_restore(flags);
+       rt_mutex_lock(&mtx);
+       RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+       rcu_preempt_ctrlblk.boosted_this_gp++;
+       rt_mutex_unlock(&mtx);
+       return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
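
The trick in rcu_boost() above is priority inheritance by proxy: the rt_mutex is initialized as though the preempted reader already owned it, so when the SCHED_FIFO rcu_kthread then blocks on it, the PI machinery boosts the reader until its matching rcu_read_unlock() releases the mutex (see the RCU_READ_UNLOCK_BOOSTED handling added to rcu_read_unlock_special() below). As a timeline:

/*
 *   rcu_kthread (SCHED_FIFO)              preempted reader t
 *   ------------------------              ------------------
 *   rt_mutex_init_proxy_locked(&mtx, t)   (t now "owns" mtx)
 *   t->rcu_boost_mutex = &mtx
 *   rt_mutex_lock(&mtx)  ---blocks--->    PI boosts t's priority
 *                                         ... reader section ends ...
 *                                         rcu_read_unlock_special():
 *                                           rt_mutex_unlock(&mtx), deboost
 *   rt_mutex_lock() returns
 *   rt_mutex_unlock(&mtx)
 */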
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them.  If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1.  Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+       if (!rcu_preempt_blocked_readers_cgp()) {
+               RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+               return 0;
+       }
+       if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+           rcu_preempt_ctrlblk.boost_tasks == NULL &&
+           rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+           ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+               rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+               invoke_rcu_kthread();
+               RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+       } else
+               RCU_TRACE(rcu_initiate_boost_trace());
+       return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+               rcu_preempt_ctrlblk.boost_tasks =
+                       rcu_preempt_ctrlblk.blkd_tasks.next;
+               rcu_preempt_ctrlblk.boosted_this_gp = -1;
+               invoke_rcu_kthread();
+               RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+       } else
+               RCU_TRACE(rcu_initiate_exp_boost_trace());
+       raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+       rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+       if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+               rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+       return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
 /*
  * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  * that this just means that the task currently running on the CPU is
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
        rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
        current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 
+       /* If there is no GP then there is nothing more to do.  */
+       if (!rcu_preempt_gp_in_progress())
+               return;
        /*
-        * If there is no GP, or if blocked readers are still blocking GP,
-        * then there is nothing more to do.
+        * Check up on boosting.  If there are no readers blocking the
+        * current grace period, leave.
         */
-       if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+       if (rcu_initiate_boost())
                return;
 
        /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
        if (!rcu_preempt_blocked_readers_any())
                rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
 
-       /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+       /* If there are done callbacks, cause them to be invoked. */
        if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
 }
 
 /*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
 
                /* Official start of GP. */
                rcu_preempt_ctrlblk.gpnum++;
+               RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
 
                /* Any blocked RCU readers block new GP. */
                if (rcu_preempt_blocked_readers_any())
                        rcu_preempt_ctrlblk.gp_tasks =
                                rcu_preempt_ctrlblk.blkd_tasks.next;
 
+               /* Set up for RCU priority boosting. */
+               rcu_preempt_boost_start_gp();
+
                /* If there is no running reader, CPU is done with GP. */
                if (!rcu_preempt_running_reader())
                        rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
                 */
                empty = !rcu_preempt_blocked_readers_cgp();
                empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
-               np = t->rcu_node_entry.next;
-               if (np == &rcu_preempt_ctrlblk.blkd_tasks)
-                       np = NULL;
+               np = rcu_next_node_entry(t);
                list_del(&t->rcu_node_entry);
                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
                        rcu_preempt_ctrlblk.gp_tasks = np;
                if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
                        rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+                       rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
                INIT_LIST_HEAD(&t->rcu_node_entry);
 
                /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
                if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
                        rcu_report_exp_done();
        }
+#ifdef CONFIG_RCU_BOOST
+       /* Unboost self if was boosted. */
+       if (special & RCU_READ_UNLOCK_BOOSTED) {
+               t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+               rt_mutex_unlock(t->rcu_boost_mutex);
+               t->rcu_boost_mutex = NULL;
+       }
+#endif /* #ifdef CONFIG_RCU_BOOST */
        local_irq_restore(flags);
 }
 
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
                rcu_preempt_cpu_qs();
        if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
            rcu_preempt_ctrlblk.rcb.donetail)
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
        if (rcu_preempt_gp_in_progress() &&
            rcu_cpu_blocking_cur_gp() &&
            rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
 
 /*
  * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
  * handle that case.  Of course, it is invoked for all flavors of
  * RCU, but RCU callbacks can appear only on one of the lists, and
  * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
  */
 static void rcu_preempt_process_callbacks(void)
 {
-       __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+       rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
 }
 
 /*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
        local_irq_save(flags);
        *rcu_preempt_ctrlblk.nexttail = head;
        rcu_preempt_ctrlblk.nexttail = &head->next;
+       RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
        rcu_preempt_start_gp();  /* checks to see if GP needed. */
        local_irq_restore(flags);
 }
@@ -532,6 +816,8 @@ void synchronize_rcu_expedited(void)
 
        /* Wait for tail of ->blkd_tasks list to drain. */
-       if (rcu_preempted_readers_exp())
+       if (rcu_preempted_readers_exp()) {
+               rcu_initiate_expedited_boost();
                wait_event(sync_rcu_preempt_exp_wq,
                           !rcu_preempted_readers_exp());
+       }
 
@@ -572,6 +857,27 @@ void exit_rcu(void)
 
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
 /*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
 #include <linux/kernel_stat.h>
 
 /*
  * During boot, we forgive RCU lockdep issues.  After this function is
  * invoked, we start taking RCU lockdep issues seriously.
  */
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
 {
        WARN_ON(nr_context_switches() > 0);
        rcu_scheduler_active = 1;
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+       if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+               rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+       else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+               rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+       else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+               rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+       else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+               rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+       else
+               rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+       if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+               rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+       else
+               rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       rcp->qlen -= n;
+       raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+       show_tiny_preempt_stats(m);
+       seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+       seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+       return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+       .owner = THIS_MODULE,
+       .open = show_tiny_stats_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+       struct dentry *retval;
+
+       rcudir = debugfs_create_dir("rcu", NULL);
+       if (!rcudir)
+               goto free_out;
+       retval = debugfs_create_file("rcudata", 0444, rcudir,
+                                    NULL, &show_tiny_stats_fops);
+       if (!retval)
+               goto free_out;
+       return 0;
+free_out:
+       debugfs_remove_recursive(rcudir);
+       return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+       debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
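With CONFIG_RCU_TRACE=y and debugfs mounted at the conventional /sys/kernel/debug, the file created above appears as rcu/rcudata, and its lines follow directly from the seq_printf() formats. A minimal userspace reader, as a sketch:

    #include <stdio.h>

    int main(void)
    {
            char buf[256];
            FILE *f = fopen("/sys/kernel/debug/rcu/rcudata", "r");

            if (f == NULL) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(buf, sizeof(buf), f))
                    fputs(buf, stdout);  /* e.g. "rcu_sched: qlen: 0" */
            fclose(f);
            return 0;
    }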
index 9d8e8fb2515f4e4801c214841a7f8c95b8b45ffe..89613f97ff264e35cac497419bd0a4dac798ce5f 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <asm/byteorder.h>
+#include <linux/sched.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1;     /* RCU readers from irq (timers). */
 static int fqs_duration = 0;   /* Duration of bursts (us), 0 to disable. */
 static int fqs_holdoff = 0;    /* Hold time within burst (us). */
 static int fqs_stutter = 3;    /* Wait time between bursts (s). */
+static int test_boost = 1;     /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
 module_param(fqs_stutter, int, 0444);
 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
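Since these are all 0444 module parameters, they are fixed at load time; for example, modprobe rcutorture test_boost=2 test_boost_interval=7 test_boost_duration=4 forces boost testing even for flavors whose can_boost is clear (the values shown are just the defaults above, used here as a usage illustration).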
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
 static struct task_struct *stutter_task;
 static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
 static long n_rcu_torture_timers;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime;  /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex);             /* protect setting boost_starttime */
+                                       /*  and boost task create/destroy. */
+
 /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
 
 #define FULLSTOP_DONTSTOP 0    /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
        void (*fqs)(void);
        int (*stats)(char *page);
        int irq_capable;
+       int can_boost;
        char *name;
 };
 
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
        .fqs            = rcu_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
        .name           = "rcu"
 };
 
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
        .fqs            = rcu_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
        .name           = "rcu_sync"
 };
 
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
        .fqs            = rcu_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
        .name           = "rcu_expedited"
 };
 
@@ -683,6 +714,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
        .name           = "sched_expedited"
 };
 
+/*
+ * RCU torture priority-boost testing.  Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked.  If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+       struct rcu_head rcu;
+       int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+       struct rcu_boost_inflight *rbip =
+               container_of(head, struct rcu_boost_inflight, rcu);
+
+       smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+       rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+       unsigned long call_rcu_time;
+       unsigned long endtime;
+       unsigned long oldstarttime;
+       struct rcu_boost_inflight rbi = { .inflight = 0 };
+       struct sched_param sp;
+
+       VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+       /* Set real-time priority. */
+       sp.sched_priority = 1;
+       if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+               VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+               n_rcu_torture_boost_rterror++;
+       }
+
+       /* Each pass through the following loop does one boost-test cycle. */
+       do {
+               /* Wait for the next test interval. */
+               oldstarttime = boost_starttime;
+               while (jiffies - oldstarttime > ULONG_MAX / 2) {
+                       schedule_timeout_uninterruptible(1);
+                       rcu_stutter_wait("rcu_torture_boost");
+                       if (kthread_should_stop() ||
+                           fullstop != FULLSTOP_DONTSTOP)
+                               goto checkwait;
+               }
+
+               /* Do one boost-test interval. */
+               endtime = oldstarttime + test_boost_duration * HZ;
+               call_rcu_time = jiffies;
+               while (jiffies - endtime > ULONG_MAX / 2) {
+                       /* If we don't have a callback in flight, post one. */
+                       if (!rbi.inflight) {
+                               smp_mb(); /* RCU core before ->inflight = 1. */
+                               rbi.inflight = 1;
+                               call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+                               if (jiffies - call_rcu_time >
+                                        test_boost_duration * HZ - HZ / 2) {
+                                       VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+                                       n_rcu_torture_boost_failure++;
+                               }
+                               call_rcu_time = jiffies;
+                       }
+                       cond_resched();
+                       rcu_stutter_wait("rcu_torture_boost");
+                       if (kthread_should_stop() ||
+                           fullstop != FULLSTOP_DONTSTOP)
+                               goto checkwait;
+               }
+
+               /*
+                * Set the start time of the next test interval.
+                * Yes, this is vulnerable to long delays, but such
+                * delays simply cause a false negative for the next
+                * interval.  Besides, we are running at RT priority,
+                * so delays should be relatively rare.
+                */
+               while (oldstarttime == boost_starttime) {
+                       if (mutex_trylock(&boost_mutex)) {
+                               boost_starttime = jiffies +
+                                                 test_boost_interval * HZ;
+                               n_rcu_torture_boosts++;
+                               mutex_unlock(&boost_mutex);
+                               break;
+                       }
+                       schedule_timeout_uninterruptible(1);
+               }
+
+               /* Go do the stutter. */
+checkwait:     rcu_stutter_wait("rcu_torture_boost");
+       } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+       /* Clean up and exit. */
+       VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+       rcutorture_shutdown_absorb("rcu_torture_boost");
+       while (!kthread_should_stop() || rbi.inflight)
+               schedule_timeout_uninterruptible(1);
+       smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+       return 0;
+}
+
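The two wait loops above open-code a wrap-safe time test: jiffies - x > ULONG_MAX / 2 is true while x still lies in the future, even across jiffies wraparound. The stock helper expresses the same thing; not_yet() is an illustrative wrapper, a sketch only:

    #include <linux/jiffies.h>
    #include <linux/types.h>

    /* Equivalent to the open-coded comparisons in rcu_torture_boost(). */
    static inline bool not_yet(unsigned long when)
    {
            return time_before(jiffies, when);  /* wrap-safe */
    }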
 /*
  * RCU torture force-quiescent-state kthread.  Repeatedly induces
  * bursts of calls to force_quiescent_state(), increasing the probability
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
        cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt],
                       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-                      "rtmbe: %d nt: %ld",
+                      "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+                      "rtbf: %ld rtb: %ld nt: %ld",
                       rcu_torture_current,
                       rcu_torture_current_version,
                       list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
                       atomic_read(&n_rcu_torture_alloc_fail),
                       atomic_read(&n_rcu_torture_free),
                       atomic_read(&n_rcu_torture_mberror),
+                      n_rcu_torture_boost_ktrerror,
+                      n_rcu_torture_boost_rterror,
+                      n_rcu_torture_boost_allocerror,
+                      n_rcu_torture_boost_afferror,
+                      n_rcu_torture_boost_failure,
+                      n_rcu_torture_boosts,
                       n_rcu_torture_timers);
-       if (atomic_read(&n_rcu_torture_mberror) != 0)
+       if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+           n_rcu_torture_boost_ktrerror != 0 ||
+           n_rcu_torture_boost_rterror != 0 ||
+           n_rcu_torture_boost_allocerror != 0 ||
+           n_rcu_torture_boost_afferror != 0 ||
+           n_rcu_torture_boost_failure != 0)
                cnt += sprintf(&page[cnt], " !!!");
        cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
        if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
 }
 
 static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
 {
        printk(KERN_ALERT "%s" TORTURE_FLAG
                "--- %s: nreaders=%d nfakewriters=%d "
                "stat_interval=%d verbose=%d test_no_idle_hz=%d "
                "shuffle_interval=%d stutter=%d irqreader=%d "
-               "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+               "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+               "test_boost=%d/%d test_boost_interval=%d "
+               "test_boost_duration=%d\n",
                torture_type, tag, nrealreaders, nfakewriters,
                stat_interval, verbose, test_no_idle_hz, shuffle_interval,
-               stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+               stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+               test_boost, cur_ops->can_boost,
+               test_boost_interval, test_boost_duration);
 }
 
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
        .notifier_call = rcutorture_shutdown_notify,
 };
 
+static void rcutorture_booster_cleanup(int cpu)
+{
+       struct task_struct *t;
+
+       if (boost_tasks[cpu] == NULL)
+               return;
+       mutex_lock(&boost_mutex);
+       VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+       t = boost_tasks[cpu];
+       boost_tasks[cpu] = NULL;
+       mutex_unlock(&boost_mutex);
+
+       /* This must be outside of the mutex, otherwise deadlock! */
+       kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+       int retval;
+
+       if (boost_tasks[cpu] != NULL)
+               return 0;  /* Already created, nothing more to do. */
+
+       /* Don't allow time recalculation while creating a new task. */
+       mutex_lock(&boost_mutex);
+       VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+       boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+                                         "rcu_torture_boost");
+       if (IS_ERR(boost_tasks[cpu])) {
+               retval = PTR_ERR(boost_tasks[cpu]);
+               VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+               n_rcu_torture_boost_ktrerror++;
+               boost_tasks[cpu] = NULL;
+               mutex_unlock(&boost_mutex);
+               return retval;
+       }
+       kthread_bind(boost_tasks[cpu], cpu);
+       wake_up_process(boost_tasks[cpu]);
+       mutex_unlock(&boost_mutex);
+       return 0;
+}
+
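Note the create/bind/wake sequence above in place of kthread_run(): binding while the thread is still stopped guarantees the booster never runs on the wrong CPU. The same pattern in miniature (fn, cpu, and the name format are illustrative):

    #include <linux/kthread.h>

    static int fn(void *unused);  /* thread function, illustrative */

    static void start_pinned(int cpu)
    {
            struct task_struct *t = kthread_create(fn, NULL, "example/%d", cpu);

            if (!IS_ERR(t)) {
                    kthread_bind(t, cpu);   /* bind while still stopped */
                    wake_up_process(t);     /* runs already pinned */
            }
    }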
+static int rcutorture_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+       long cpu = (long)hcpu;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               (void)rcutorture_booster_init(cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+               rcutorture_booster_cleanup(cpu);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+       .notifier_call = rcutorture_cpu_notify,
+};
+
 static void
 rcu_torture_cleanup(void)
 {
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
        }
        fullstop = FULLSTOP_RMMOD;
        mutex_unlock(&fullstop_mutex);
-       unregister_reboot_notifier(&rcutorture_nb);
+       unregister_reboot_notifier(&rcutorture_shutdown_nb);
        if (stutter_task) {
                VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
                kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
                kthread_stop(fqs_task);
        }
        fqs_task = NULL;
+       if ((test_boost == 1 && cur_ops->can_boost) ||
+           test_boost == 2) {
+               unregister_cpu_notifier(&rcutorture_cpu_nb);
+               for_each_possible_cpu(i)
+                       rcutorture_booster_cleanup(i);
+       }
 
        /* Wait for all RCU callbacks to fire.  */
 
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
        if (cur_ops->cleanup)
                cur_ops->cleanup();
        if (atomic_read(&n_rcu_torture_error))
-               rcu_torture_print_module_parms("End of test: FAILURE");
+               rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
        else
-               rcu_torture_print_module_parms("End of test: SUCCESS");
+               rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
 }
 
 static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
                nrealreaders = nreaders;
        else
                nrealreaders = 2 * num_online_cpus();
-       rcu_torture_print_module_parms("Start of test");
+       rcu_torture_print_module_parms(cur_ops, "Start of test");
        fullstop = FULLSTOP_DONTSTOP;
 
        /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
        atomic_set(&n_rcu_torture_free, 0);
        atomic_set(&n_rcu_torture_mberror, 0);
        atomic_set(&n_rcu_torture_error, 0);
+       n_rcu_torture_boost_ktrerror = 0;
+       n_rcu_torture_boost_rterror = 0;
+       n_rcu_torture_boost_allocerror = 0;
+       n_rcu_torture_boost_afferror = 0;
+       n_rcu_torture_boost_failure = 0;
+       n_rcu_torture_boosts = 0;
        for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
                atomic_set(&rcu_torture_wcount[i], 0);
        for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
                        goto unwind;
                }
        }
-       register_reboot_notifier(&rcutorture_nb);
+       if (test_boost_interval < 1)
+               test_boost_interval = 1;
+       if (test_boost_duration < 2)
+               test_boost_duration = 2;
+       if ((test_boost == 1 && cur_ops->can_boost) ||
+           test_boost == 2) {
+               int retval;
+
+               boost_starttime = jiffies + test_boost_interval * HZ;
+               register_cpu_notifier(&rcutorture_cpu_nb);
+               for_each_possible_cpu(i) {
+                       if (cpu_is_offline(i))
+                               continue;  /* Heuristic: CPU can go offline. */
+                       retval = rcutorture_booster_init(i);
+                       if (retval < 0) {
+                               firsterr = retval;
+                               goto unwind;
+                       }
+               }
+       }
+       register_reboot_notifier(&rcutorture_shutdown_nb);
        mutex_unlock(&fullstop_mutex);
        return 0;
 
index ccdc04c479815addc8dbacea69643174a4636670..d0ddfea6579d027809cfb0bce885289bac0f957e 100644 (file)
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
        .gpnum = -300, \
        .completed = -300, \
        .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-       .orphan_cbs_list = NULL, \
-       .orphan_cbs_tail = &structname.orphan_cbs_list, \
-       .orphan_qlen = 0, \
        .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
        .n_force_qs = 0, \
        .n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 {
        if (rdp->gpnum != rnp->gpnum) {
-               rdp->qs_pending = 1;
-               rdp->passed_quiesc = 0;
+               /*
+                * If the current grace period is waiting for this CPU,
+                * set up to detect a quiescent state, otherwise don't
+                * go looking for one.
+                */
                rdp->gpnum = rnp->gpnum;
+               if (rnp->qsmask & rdp->grpmask) {
+                       rdp->qs_pending = 1;
+                       rdp->passed_quiesc = 0;
+               } else
+                       rdp->qs_pending = 0;
        }
 }
 
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 
                /* Remember that we saw this grace-period completion. */
                rdp->completed = rnp->completed;
+
+               /*
+                * If we were in an extended quiescent state, we may have
+                * missed some grace periods that other CPUs handled on
+                * our behalf. Catch up with this state to avoid noting
+                * spurious new grace periods.  If another grace period
+                * has started, then rnp->gpnum will have advanced, so
+                * we will detect this later on.
+                */
+               if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+                       rdp->gpnum = rdp->completed;
+
+               /*
+                * If RCU does not need a quiescent state from this CPU,
+                * then make sure that this CPU doesn't go looking for one.
+                */
+               if ((rnp->qsmask & rdp->grpmask) == 0)
+                       rdp->qs_pending = 0;
        }
 }
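ULONG_CMP_LT() here, like ULONG_CMP_GE() used elsewhere in this series, is a wrap-safe comparison for free-running unsigned long counters such as ->gpnum and ->completed. Assuming the rcupdate.h definitions of this era (quoted from memory, so treat as a sketch):

    #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 <  (a) - (b))

With these, ->gpnum is pulled forward only when it genuinely lags ->completed in modular arithmetic, not when it has legitimately advanced past it.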
 
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU.  The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first.  Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to an online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
  */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
 {
        int i;
+       /* current DYING CPU is cleared in the cpu_online_mask */
+       int receive_cpu = cpumask_any(cpu_online_mask);
        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+       struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
 
        if (rdp->nxtlist == NULL)
                return;  /* irqs disabled, so comparison is stable. */
-       raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
-       *rsp->orphan_cbs_tail = rdp->nxtlist;
-       rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+       *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+       receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+       receive_rdp->qlen += rdp->qlen;
+       receive_rdp->n_cbs_adopted += rdp->qlen;
+       rdp->n_cbs_orphaned += rdp->qlen;
+
        rdp->nxtlist = NULL;
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp->nxttail[i] = &rdp->nxtlist;
-       rsp->orphan_qlen += rdp->qlen;
-       rdp->n_cbs_orphaned += rdp->qlen;
        rdp->qlen = 0;
-       raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
-       unsigned long flags;
-       struct rcu_data *rdp;
-
-       raw_spin_lock_irqsave(&rsp->onofflock, flags);
-       rdp = this_cpu_ptr(rsp->rda);
-       if (rsp->orphan_cbs_list == NULL) {
-               raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-               return;
-       }
-       *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
-       rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
-       rdp->qlen += rsp->orphan_qlen;
-       rdp->n_cbs_adopted += rsp->orphan_qlen;
-       rsp->orphan_cbs_list = NULL;
-       rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
-       rsp->orphan_qlen = 0;
-       raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
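The splice above uses the standard RCU callback-list representation: a singly linked list plus a pointer to its final ->next field, so an entire list can be appended in O(1). The same operation in miniature (struct cblist and splice_cbs are illustrative names):

    #include <linux/rcupdate.h>

    struct cblist {
            struct rcu_head *head;
            struct rcu_head **tail;  /* &last->next, or &head when empty */
    };

    static void splice_cbs(struct cblist *dst, struct cblist *src)
    {
            if (src->head == NULL)
                    return;              /* nothing to move */
            *dst->tail = src->head;      /* hook src onto dst's tail */
            dst->tail = src->tail;       /* dst now ends where src did */
            src->head = NULL;            /* leave src empty... */
            src->tail = &src->head;      /* ...and self-consistent */
    }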
 
 /*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
                rcu_report_exp_rnp(rsp, rnp);
-
-       rcu_adopt_orphan_cbs(rsp);
 }
 
 /*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
 {
 }
 
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
         */
        local_irq_save(flags);
        rdp = this_cpu_ptr(rsp->rda);
-       rcu_process_gp_end(rsp, rdp);
-       check_for_new_grace_period(rsp, rdp);
 
        /* Add the callback to our list. */
        *rdp->nxttail[RCU_NEXT_TAIL] = head;
        rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 
-       /* Start a new grace period if one not already started. */
-       if (!rcu_gp_in_progress(rsp)) {
-               unsigned long nestflag;
-               struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-               raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-               rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
-       }
-
        /*
         * Force the grace period if too many callbacks or too long waiting.
         * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
         * is the only one waiting for a grace period to complete.
         */
        if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-               rdp->blimit = LONG_MAX;
-               if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-                   *rdp->nxttail[RCU_DONE_TAIL] != head)
-                       force_quiescent_state(rsp, 0);
-               rdp->n_force_qs_snap = rsp->n_force_qs;
-               rdp->qlen_last_fqs_check = rdp->qlen;
+
+               /* Are we ignoring a completed grace period? */
+               rcu_process_gp_end(rsp, rdp);
+               check_for_new_grace_period(rsp, rdp);
+
+               /* Start a new grace period if one not already started. */
+               if (!rcu_gp_in_progress(rsp)) {
+                       unsigned long nestflag;
+                       struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+                       raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+                       rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock */
+               } else {
+                       /* Give the grace period a kick. */
+                       rdp->blimit = LONG_MAX;
+                       if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                           *rdp->nxttail[RCU_DONE_TAIL] != head)
+                               force_quiescent_state(rsp, 0);
+                       rdp->n_force_qs_snap = rsp->n_force_qs;
+                       rdp->qlen_last_fqs_check = rdp->qlen;
+               }
        } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
         * decrement rcu_barrier_cpu_count -- otherwise the first CPU
         * might complete its grace period before all of the other CPUs
         * did their increment, causing this function to return too
-        * early.
+        * early.  Note that on_each_cpu() disables irqs, which prevents
+        * any CPUs from coming online or going offline until each online
+        * CPU has queued its RCU-barrier callback.
         */
        atomic_set(&rcu_barrier_cpu_count, 1);
-       preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
-       rcu_adopt_orphan_cbs(rsp);
        on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
-       preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
        if (atomic_dec_and_test(&rcu_barrier_cpu_count))
                complete(&rcu_barrier_completion);
        wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
        case CPU_DYING:
        case CPU_DYING_FROZEN:
                /*
-                * preempt_disable() in _rcu_barrier() prevents stop_machine(),
-                * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
-                * returns, all online cpus have queued rcu_barrier_func().
-                * The dying CPU clears its cpu_online_mask bit and
-                * moves all of its RCU callbacks to ->orphan_cbs_list
-                * in the context of stop_machine(), so subsequent calls
-                * to _rcu_barrier() will adopt these callbacks and only
-                * then queue rcu_barrier_func() on all remaining CPUs.
+                * The whole machine is "stopped" except this CPU, so we can
+                * touch any data without introducing corruption. We send the
+                * dying CPU's callbacks to an arbitrarily chosen online CPU.
                 */
-               rcu_send_cbs_to_orphanage(&rcu_bh_state);
-               rcu_send_cbs_to_orphanage(&rcu_sched_state);
-               rcu_preempt_send_cbs_to_orphanage();
+               rcu_send_cbs_to_online(&rcu_bh_state);
+               rcu_send_cbs_to_online(&rcu_sched_state);
+               rcu_preempt_send_cbs_to_online();
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
        int i;
 
-       for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+       for (i = NUM_RCU_LVLS - 1; i > 0; i--)
                rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+       rsp->levelspread[0] = RCU_FANOUT_LEAF;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
index 91d4170c5c13afd2e8997bd59b28e7cc2a4385e8..e8f057e44e3ee00466e840593983ca5062302545 100644 (file)
 /*
  * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
  * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
  */
 #define MAX_RCU_LVLS 4
-#define RCU_FANOUT           (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ        (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE              (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH     (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF       16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1         (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2         (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3         (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4         (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
 #  define NUM_RCU_LVLS       1
 #  define NUM_RCU_LVL_0              1
 #  define NUM_RCU_LVL_1              (NR_CPUS)
 #  define NUM_RCU_LVL_2              0
 #  define NUM_RCU_LVL_3              0
 #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
 #  define NUM_RCU_LVLS       2
 #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
 #  define NUM_RCU_LVL_2              (NR_CPUS)
 #  define NUM_RCU_LVL_3              0
 #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
 #  define NUM_RCU_LVLS       3
 #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_3              NR_CPUS
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_3              (NR_CPUS)
 #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
 #  define NUM_RCU_LVLS       4
 #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_3              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_4              NR_CPUS
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_3              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_4              (NR_CPUS)
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
 
 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
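A worked instance of the new geometry: with CONFIG_RCU_FANOUT=64 and NR_CPUS=4096, the leaf level is capped at RCU_FANOUT_LEAF=16, so RCU_FANOUT_1=16, RCU_FANOUT_2=16*64=1024, and RCU_FANOUT_3=1024*64=65536. Because 4096 <= RCU_FANOUT_3, this yields NUM_RCU_LVLS=3 with NUM_RCU_LVL_1=DIV_ROUND_UP(4096,1024)=4 and NUM_RCU_LVL_2=DIV_ROUND_UP(4096,16)=256: one root fanning out to 4 interior nodes, 256 leaf nodes, and 16 CPUs per leaf.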
@@ -203,8 +208,8 @@ struct rcu_data {
        long            qlen_last_fqs_check;
                                        /* qlen at last check for QS forcing */
        unsigned long   n_cbs_invoked;  /* count of RCU cbs invoked. */
-       unsigned long   n_cbs_orphaned; /* RCU cbs sent to orphanage. */
-       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from orphanage. */
+       unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */
        unsigned long   n_force_qs_snap;
                                        /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
        /* End of fields guarded by root rcu_node's lock. */
 
        raw_spinlock_t onofflock;               /* exclude on/offline and */
-                                               /*  starting new GP.  Also */
-                                               /*  protects the following */
-                                               /*  orphan_cbs fields. */
-       struct rcu_head *orphan_cbs_list;       /* list of rcu_head structs */
-                                               /*  orphaned by all CPUs in */
-                                               /*  a given leaf rcu_node */
-                                               /*  going offline. */
-       struct rcu_head **orphan_cbs_tail;      /* And tail pointer. */
-       long orphan_qlen;                       /* Number of orphaned cbs. */
+                                               /*  starting new GP. */
        raw_spinlock_t fqslock;                 /* Only one task forcing */
                                                /*  quiescent states. */
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
 static void rcu_needs_cpu_flush(void);
 
index 71a4147473f95f51d2b2e88db4c14372dafe375f..a3638710dc67f4627f5cdb88e1cafb43b500d24a 100644 (file)
@@ -25,6 +25,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/stop_machine.h>
 
 /*
  * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 }
 
 /*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from the dying CPU to an online CPU.
  */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
 {
-       rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+       rcu_send_cbs_to_online(&rcu_preempt_state);
 }
 
 /*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 /*
  * Because there is no preemptable RCU, there are no callbacks to move.
  */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
 {
 }
 
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
 
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+       cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+       /*
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.
+        *
+        * In the current initial implementation of cpu_stop, the
+        * above condition is already met when the control reaches
+        * this point and the following smp_mb() is not strictly
+        * necessary.  Do smp_mb() anyway for documentation and
+        * robustness against future implementation changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word.  Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs.  If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period.  We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done.  If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot.  In this case, our work is
+ * done for us, and we can simply return.  Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+       int firstsnap, s, snap, trycount = 0;
+
+       /* Note that atomic_inc_return() implies full memory barrier. */
+       firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+       get_online_cpus();
+
+       /*
+        * Each pass through the following loop attempts to force a
+        * context switch on each CPU.
+        */
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
+               put_online_cpus();
+
+               /* No joy, try again later.  Or just synchronize_sched(). */
+               if (trycount++ < 10)
+                       udelay(trycount * num_online_cpus());
+               else {
+                       synchronize_sched();
+                       return;
+               }
+
+               /* Check to see if someone else did our work for us. */
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;
+               }
+
+               /*
+                * Refetching sync_sched_expedited_started allows later
+                * callers to piggyback on our grace period.  We subtract
+                * 1 to get the same token that the last incrementer got.
+                * We retry after they started, so our grace period works
+                * for them, and they started after our first try, so their
+                * grace period works for us.
+                */
+               get_online_cpus();
+               snap = atomic_read(&sync_sched_expedited_started) - 1;
+               smp_mb(); /* ensure read is before try_stop_cpus(). */
+       }
+
+       /*
+        * Everyone up to our most recent fetch is covered by our grace
+        * period.  Update the counter, but only if our work is still
+        * relevant -- which it won't be if someone who started later
+        * than we did beat us to the punch.
+        */
+       do {
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       break;
+               }
+       } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+       put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
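A hypothetical caller, matching the ordering that the "ensure test happens before caller kfree" barriers above protect; struct foo and foo_del() are illustrative, and readers are assumed to traverse the list under rcu_read_lock_sched():

    #include <linux/rculist.h>
    #include <linux/slab.h>

    struct foo {
            struct list_head list;
    };

    static void foo_del(struct foo *p)
    {
            list_del_rcu(&p->list);         /* unlink from new readers */
            synchronize_sched_expedited();  /* every CPU context-switches */
            kfree(p);                       /* no reader can still hold p */
    }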
 #if !defined(CONFIG_RCU_FAST_NO_HZ)
 
 /*
index d15430b9d122f4d619e76fb6b5069aa1f494a575..c8e97853b970f71ad662732ef46da011cf46ac1d 100644 (file)
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 
        gpnum = rsp->gpnum;
        seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
-                     "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+                     "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
                   rsp->completed, gpnum, rsp->signaled,
                   (long)(rsp->jiffies_force_qs - jiffies),
                   (int)(jiffies & 0xffff),
                   rsp->n_force_qs, rsp->n_force_qs_ngp,
                   rsp->n_force_qs - rsp->n_force_qs_ngp,
-                  rsp->n_force_qs_lh, rsp->orphan_qlen);
+                  rsp->n_force_qs_lh);
        for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
                if (rnp->level != level) {
                        seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
 
 static struct dentry *rcudir;
 
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
 {
        struct dentry *retval;
 
@@ -337,14 +337,14 @@ free_out:
        return 1;
 }
 
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
 {
        debugfs_remove_recursive(rcudir);
 }
 
 
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
 
 MODULE_AUTHOR("Paul E. McKenney");
 MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
index c68cead94dd76942beeffa932498d0ae5a2cfe41..04949089e7601ccd2a9b82f0f30c5905cbc9777b 100644 (file)
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
+#include <asm/mutex.h>
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
+
+       atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
 };
 
 #define root_task_group init_task_group
 
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return list_empty(&root_task_group.children);
-}
-#endif
-
 # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
 
 /*
@@ -342,6 +341,7 @@ struct cfs_rq {
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
+       int on_list;
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
@@ -360,14 +360,17 @@ struct cfs_rq {
        unsigned long h_load;
 
        /*
-        * this cpu's part of tg->shares
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
         */
-       unsigned long shares;
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
 
-       /*
-        * load.weight at the time we set shares
-        */
-       unsigned long rq_weight;
+       unsigned long load_contribution;
 #endif
 #endif
 };
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+       struct task_group *tg;
        struct cgroup_subsys_state *css;
 
        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                        lockdep_is_held(&task_rq(p)->lock));
-       return container_of(css, struct task_group, css);
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -792,20 +798,6 @@ late_initcall(sched_init_debug);
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
 /*
  * period over which we average the RT time consumption, measured
  * in ms.
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
        lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
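update_load_set() complements the existing update_load_add()/update_load_sub() helpers by pinning a load_weight to an absolute value; zeroing inv_weight makes the cached fixed-point inverse be recomputed lazily on next use. A one-line usage sketch (se being some sched_entity):

    update_load_set(&se->load, NICE_0_LOAD);  /* absolute weight, inverse recomputed lazily */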
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                   unsigned long sd_shares,
-                                   unsigned long sd_rq_weight,
-                                   unsigned long *usd_rq_weight)
-{
-       unsigned long shares, rq_weight;
-       int boost = 0;
-
-       rq_weight = usd_rq_weight[cpu];
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       /*
-        *             \Sum_j shares_j * rq_weight_i
-        * shares_i =  -----------------------------
-        *                  \Sum_j rq_weight_j
-        */
-       shares = (sd_shares * rq_weight) / sd_rq_weight;
-       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-       if (abs(shares - tg->se[cpu]->load.weight) >
-                       sysctl_sched_shares_thresh) {
-               struct rq *rq = cpu_rq(cpu);
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               __set_se_shares(tg->se[cpu], shares);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-       unsigned long *usd_rq_weight;
-       struct sched_domain *sd = data;
-       unsigned long flags;
-       int i;
-
-       if (!tg->se[0])
-               return 0;
-
-       local_irq_save(flags);
-       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-       for_each_cpu(i, sched_domain_span(sd)) {
-               weight = tg->cfs_rq[i]->load.weight;
-               usd_rq_weight[i] = weight;
-
-               rq_weight += weight;
-               /*
-                * If there are currently no tasks on the cpu pretend there
-                * is one of average load so that when a new task gets to
-                * run here it will not get delayed by group starvation.
-                */
-               if (!weight)
-                       weight = NICE_0_LOAD;
-
-               sum_weight += weight;
-               shares += tg->cfs_rq[i]->shares;
-       }
-
-       if (!rq_weight)
-               rq_weight = sum_weight;
-
-       if ((!shares && rq_weight) || shares > tg->shares)
-               shares = tg->shares;
-
-       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-               shares = tg->shares;
-
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
-
 /*
  * Compute the cpu's hierarchical load factor for each task group.
  * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                load = cpu_rq(cpu)->load.weight;
        } else {
                load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->cfs_rq[cpu]->shares;
+               load *= tg->se[cpu]->load.weight;
                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
        }
 
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
        return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
-{
-       s64 elapsed;
-       u64 now;
-
-       if (root_task_group_empty())
-               return;
-
-       now = local_clock();
-       elapsed = now - sd->last_update;
-
-       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-               sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, sd);
-       }
-}
-
 static void update_h_load(long cpu)
 {
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
 #endif
 
 #ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-       cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
 {
-       struct rq *rq = task_rq(p);
-
        /*
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                return dest_cpu;
 
        /* No more Mr. Nice Guy. */
-       if (unlikely(dest_cpu >= nr_cpu_ids)) {
-               dest_cpu = cpuset_cpus_allowed_fallback(p);
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, cpu);
-               }
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
        }
 
        return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
 
        put_cpu();
 }
@@ -3549,7 +3418,7 @@ void sched_exec(void)
         * select_task_rq() can race against ->cpus_allowed
         */
        if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
                struct migration_arg arg = { p, dest_cpu };
 
                task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                if (task_thread_info(rq->curr) != owner || need_resched())
                        return 0;
 
-               cpu_relax();
+               arch_mutex_cpu_relax();
        }
 
        return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
  * This waits for either a completion of a specific task to be signaled or for a
  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
  */
-unsigned long __sched
+long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                                          unsigned long timeout)
 {
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
  * signaled or for a specified timeout to expire. It can be
  * interrupted by a kill signal. The timeout is in jiffies.
  */
-unsigned long __sched
+long __sched
 wait_for_completion_killable_timeout(struct completion *x,
                                     unsigned long timeout)
 {
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
 }
 
 static int __sched_setscheduler(struct task_struct *p, int policy,
-                               struct sched_param *param, bool user)
+                               const struct sched_param *param, bool user)
 {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
  * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+                      const struct sched_param *param)
 {
        return __sched_setscheduler(p, policy, param, true);
 }
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  * but our caller might not have that capability.
  */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              struct sched_param *param)
+                              const struct sched_param *param)
 {
        return __sched_setscheduler(p, policy, param, false);
 }
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
        unsigned state;
 
        state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
        if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
        SET_SYSCTL(sched_min_granularity);
        SET_SYSCTL(sched_latency);
        SET_SYSCTL(sched_wakeup_granularity);
-       SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -5830,7 +5698,7 @@ again:
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, dest_cpu)) {
+       if (migrate_task(p, rq)) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
  */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
 {
-       struct rq *rq = cpu_rq(dead_cpu);
-       int needs_cpu, uninitialized_var(dest_cpu);
-       unsigned long flags;
+       struct mm_struct *mm = current->active_mm;
 
-       local_irq_save(flags);
+       BUG_ON(cpu_online(smp_processor_id()));
 
-       raw_spin_lock(&rq->lock);
-       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-       if (needs_cpu)
-               dest_cpu = select_fallback_rq(dead_cpu, p);
-       raw_spin_unlock(&rq->lock);
-       /*
-        * It can only fail if we race with set_cpus_allowed(),
-        * in which case the racer should migrate the task anyway.
-        */
-       if (needs_cpu)
-               __migrate_task(p, dead_cpu, dest_cpu);
-       local_irq_restore(flags);
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
 }
 
 /*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-       unsigned long flags;
 
-       local_irq_save(flags);
-       double_rq_lock(rq_src, rq_dest);
        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
        rq_src->nr_uninterruptible = 0;
-       double_rq_unlock(rq_src, rq_dest);
-       local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-
-       do_each_thread(t, p) {
-               if (p == current)
-                       continue;
-
-               if (task_cpu(p) == src_cpu)
-                       move_task_off_dead_cpu(src_cpu, p);
-       } while_each_thread(t, p);
-
-       read_unlock(&tasklist_lock);
 }
 
 /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
  */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
 {
-       int this_cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(this_cpu);
-       struct task_struct *p = rq->idle;
-       unsigned long flags;
-
-       /* cpu has to be offline */
-       BUG_ON(cpu_online(this_cpu));
-
-       /*
-        * Strictly not necessary since rest of the CPUs are stopped by now
-        * and interrupts disabled on the current cpu.
-        */
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-       activate_task(rq, p, 0);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
 }
 
 /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq; sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible; we hold the required locks anyway
+ * because of lock validation efforts.
  */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
 {
        struct rq *rq = cpu_rq(dead_cpu);
-
-       /* Must be exiting, otherwise would be on tasklist. */
-       BUG_ON(!p->exit_state);
-
-       /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->state == TASK_DEAD);
-
-       get_task_struct(p);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
 
        /*
-        * Drop lock around migration; if someone else moves it,
-        * that's OK. No task can be added to this CPU, so iteration is
-        * fine.
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code;
+        * either way we should never end up calling schedule() until we're
+        * done here.
         */
-       raw_spin_unlock_irq(&rq->lock);
-       move_task_off_dead_cpu(dead_cpu, p);
-       raw_spin_lock_irq(&rq->lock);
-
-       put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next;
+       rq->stop = NULL;
 
        for ( ; ; ) {
-               if (!rq->nr_running)
+               /*
+                * There's this thread running; bail when it's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
                        break;
+
                next = pick_next_task(rq);
-               if (!next)
-                       break;
+               BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
-               migrate_dead(dead_cpu, next);
 
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
        }
-}
 
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       rq->stop = stop;
 }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        unsigned long flags;
        struct rq *rq = cpu_rq(cpu);
 
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
 
        case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               migrate_live_tasks(cpu);
-               /* Idle task back to normal (off runqueue, low prio) */
-               raw_spin_lock_irq(&rq->lock);
-               deactivate_task(rq, rq->idle, 0);
-               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-               rq->idle->sched_class = &idle_sched_class;
-               migrate_dead_tasks(cpu);
-               raw_spin_unlock_irq(&rq->lock);
-               migrate_nr_uninterruptible(rq);
-               BUG_ON(rq->nr_running != 0);
-               calc_global_load_remove(rq);
-               break;
-
        case CPU_DYING:
-       case CPU_DYING_FROZEN:
                /* Update our root-domain */
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
                raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
                break;
 #endif
        }
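The mask in the switch above works because each CPU_*_FROZEN notifier value is, by convention, the plain value with the CPU_TASKS_FROZEN bit ORed in, so a single case label now covers both the normal and the suspend/resume variant. A quick standalone check of the idea (the constants are copied into local defines here; they mirror include/linux/cpu.h of this era):

#include <stdio.h>

#define CPU_ONLINE		0x0002
#define CPU_TASKS_FROZEN	0x0010
#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)

int main(void)
{
	unsigned long action = CPU_ONLINE_FROZEN;

	/* prints 1: the frozen variant folds into the plain case label */
	printf("%d\n", (action & ~CPU_TASKS_FROZEN) == CPU_ONLINE);
	return 0;
}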
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                struct sched_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
        tg->cfs_rq[cpu] = cfs_rq;
        init_cfs_rq(cfs_rq, rq);
        cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
        tg->se[cpu] = se;
        /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
 
        se->my_q = cfs_rq;
-       se->load.weight = tg->shares;
-       se->load.inv_weight = 0;
+       update_load_set(&se->load, 0);
        se->parent = parent;
 }
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                struct sched_rt_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
        tg->rt_se[cpu] = rt_se;
        if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
 #ifdef CONFIG_CGROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&init_task_group.children);
-
+       autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                           __alignof__(unsigned long));
-#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
 
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
 #ifdef CONFIG_FAIR_GROUP_SCHED
                init_task_group.shares = init_task_group_load;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
                /*
                 * How much cpu bandwidth does init_task_group get?
                 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
                 * We achieve this by letting init_task_group's tasks sit
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                 */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                if (!se)
                        goto err_free_rq;
 
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
        }
 
        return 1;
@@ -8497,15 +8271,21 @@ err:
        return 0;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+        * Only empty task groups can be destroyed; so we can speculatively
+        * check on_list without danger of it being re-added.
+        */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
 
        return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
 err:
        return 0;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        return 1;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 {
        struct task_group *tg;
        unsigned long flags;
-       int i;
 
        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
        if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                goto err;
 
        spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
        list_add_rcu(&tg->list, &task_groups);
 
        WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
        unsigned long flags;
        int i;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_CGROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       int on_rq;
-
-       on_rq = se->on_rq;
-       if (on_rq)
-               dequeue_entity(cfs_rq, se, 0);
-
-       se->load.weight = shares;
-       se->load.inv_weight = 0;
-
-       if (on_rq)
-               enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
 static DEFINE_MUTEX(shares_mutex);
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        if (tg->shares == shares)
                goto done;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
        tg->shares = shares;
        for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
-               set_se_shares(tg->se[i], shares);
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se), 0);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
 
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
 done:
        mutex_unlock(&shares_mutex);
        return 0;
@@ -9532,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
 };
 #endif /* CONFIG_CGROUP_CPUACCT */
 
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-       barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-       int snap, trycount = 0;
-
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644 (file)
index 0000000..c80fedc
--- /dev/null
@@ -0,0 +1,238 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+       autogroup_default.tg = &init_task_group;
+       init_task_group.autogroup = &autogroup_default;
+       kref_init(&autogroup_default.kref);
+       init_rwsem(&autogroup_default.lock);
+       init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+       kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+       struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+       sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+       kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+       kref_get(&ag->kref);
+       return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+       struct autogroup *ag;
+       unsigned long flags;
+
+       if (!lock_task_sighand(p, &flags))
+               return autogroup_kref_get(&autogroup_default);
+
+       ag = autogroup_kref_get(p->signal->autogroup);
+       unlock_task_sighand(p, &flags);
+
+       return ag;
+}
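The helpers above are the stock kref idiom: every pointer handed out carries a counted reference, and the final put runs the destructor. A minimal sketch of the same pairing (kernel context; the obj struct and obj_* names are made up for illustration):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct obj {
	struct kref kref;
};

static void obj_release(struct kref *kref)
{
	/* called exactly once, by whoever drops the last reference */
	kfree(container_of(kref, struct obj, kref));
}

static struct obj *obj_get(struct obj *o)
{
	kref_get(&o->kref);	/* +1 for the new user */
	return o;
}

static void obj_put(struct obj *o)
{
	kref_put(&o->kref, obj_release);	/* -1; may free */
}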
+
+static inline struct autogroup *autogroup_create(void)
+{
+       struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+       struct task_group *tg;
+
+       if (!ag)
+               goto out_fail;
+
+       tg = sched_create_group(&init_task_group);
+
+       if (IS_ERR(tg))
+               goto out_free;
+
+       kref_init(&ag->kref);
+       init_rwsem(&ag->lock);
+       ag->id = atomic_inc_return(&autogroup_seq_nr);
+       ag->tg = tg;
+       tg->autogroup = ag;
+
+       return ag;
+
+out_free:
+       kfree(ag);
+out_fail:
+       if (printk_ratelimit()) {
+               printk(KERN_WARNING "autogroup_create: %s failure.\n",
+                       ag ? "sched_create_group()" : "kmalloc()");
+       }
+
+       return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+       if (tg != &root_task_group)
+               return false;
+
+       if (p->sched_class != &fair_sched_class)
+               return false;
+
+       /*
+        * We can only assume the task group can't go away on us if
+        * autogroup_move_group() can see us on ->thread_group list.
+        */
+       if (p->flags & PF_EXITING)
+               return false;
+
+       return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+       if (enabled && task_wants_autogroup(p, tg))
+               return p->signal->autogroup->tg;
+
+       return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+       struct autogroup *prev;
+       struct task_struct *t;
+       unsigned long flags;
+
+       BUG_ON(!lock_task_sighand(p, &flags));
+
+       prev = p->signal->autogroup;
+       if (prev == ag) {
+               unlock_task_sighand(p, &flags);
+               return;
+       }
+
+       p->signal->autogroup = autogroup_kref_get(ag);
+
+       t = p;
+       do {
+               sched_move_task(t);
+       } while_each_thread(p, t);
+
+       unlock_task_sighand(p, &flags);
+       autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+       struct autogroup *ag = autogroup_create();
+
+       autogroup_move_group(p, ag);
+       /* drop extra reference added by autogroup_create() */
+       autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+       autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+       sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+       autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+       sysctl_sched_autogroup_enabled = 0;
+
+       return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
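Given the __setup() hook above, autogrouping can be switched off from the boot line; at runtime the same flag should also be reachable as a sysctl (kernel.sched_autogroup_enabled, wired up elsewhere in this series). A bootloader entry might look like this (paths hypothetical):

	linux /boot/vmlinuz-2.6.37 root=/dev/sda1 ro quiet noautogroup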
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+       static unsigned long next = INITIAL_JIFFIES;
+       struct autogroup *ag;
+       int err;
+
+       if (*nice < -20 || *nice > 19)
+               return -EINVAL;
+
+       err = security_task_setnice(current, *nice);
+       if (err)
+               return err;
+
+       if (*nice < 0 && !can_nice(current, *nice))
+               return -EPERM;
+
+       /* this is a heavy operation taking global locks. */
+       if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+               return -EAGAIN;
+
+       next = HZ / 10 + jiffies;
+       ag = autogroup_task_get(p);
+
+       down_write(&ag->lock);
+       err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+       if (!err)
+               ag->nice = *nice;
+       up_write(&ag->lock);
+
+       autogroup_kref_put(ag);
+
+       return err;
+}
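The next/jiffies dance above is the usual open-coded ratelimit: remember the earliest permitted time and refuse with -EAGAIN until it passes, so unprivileged callers can trigger the heavy path at most about ten times a second. The pattern in isolation (kernel context; ratelimited_op is a made-up name):

#include <linux/jiffies.h>
#include <linux/errno.h>

static unsigned long next_allowed = INITIAL_JIFFIES;

static int ratelimited_op(void)
{
	if (time_before(jiffies, next_allowed))
		return -EAGAIN;		/* too soon; caller may retry */

	next_allowed = jiffies + HZ / 10;	/* at most ~10 calls/sec */
	/* ... the expensive work goes here ... */
	return 0;
}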
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+       struct autogroup *ag = autogroup_task_get(p);
+
+       down_read(&ag->lock);
+       seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+       up_read(&ag->lock);
+
+       autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644 (file)
index 0000000..5358e24
--- /dev/null
@@ -0,0 +1,32 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+       struct kref             kref;
+       struct task_group       *tg;
+       struct rw_semaphore     lock;
+       unsigned long           id;
+       int                     nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) {  }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb15a871a362255498fadf90e357c57..9d8af0b3fb64544d9ca7076f3478d2239b46540e 100644 (file)
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 }
 EXPORT_SYMBOL_GPL(sched_clock);
 
-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 __read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b6a8b4ac48891a988c025a8b07ed5..1dfae3d014b5934eba4b3be25bfbad746196b4b7 100644 (file)
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
-               struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
        struct sched_entity *se = tg->se[cpu];
        if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
 
-#ifdef CONFIG_CGROUP_SCHED
-       {
-               char path[64];
-
-               rcu_read_lock();
-               cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
-               rcu_read_unlock();
-               SEQ_printf(m, " %s", path);
-       }
-#endif
        SEQ_printf(m, "\n");
 }
 
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
        read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
-#if defined(CONFIG_CGROUP_SCHED) && \
-       (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
-       /* may be NULL if the underlying cgroup isn't fully-created yet */
-       if (!tg->css.cgroup) {
-               buf[0] = '\0';
-               return;
-       }
-       cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
        s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        struct sched_entity *last;
        unsigned long flags;
 
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = cfs_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                        SPLIT_NS(cfs_rq->exec_clock));
 
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        spread0 = min_vruntime - rq0_min_vruntime;
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
                        SPLIT_NS(spread0));
-       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
-       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
        SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
                        cfs_rq->nr_spread_over);
+       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-       SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
+                       SPLIT_NS(cfs_rq->load_avg));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
+                       SPLIT_NS(cfs_rq->load_period));
+       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
+                       cfs_rq->load_contribution);
+       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
+                       atomic_read(&cfs_rq->tg->load_weight));
 #endif
+
        print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = rt_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
        SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
 
 #define P(x) \
        SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 #undef P
 }
 
+extern __read_mostly int sched_clock_running;
+
 static void print_cpu(struct seq_file *m, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
 
 static int sched_debug_show(struct seq_file *m, void *v)
 {
-       u64 now = ktime_to_ns(ktime_get());
+       u64 ktime, sched_clk, cpu_clk;
+       unsigned long flags;
        int cpu;
 
-       SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+       local_irq_save(flags);
+       ktime = ktime_to_ns(ktime_get());
+       sched_clk = sched_clock();
+       cpu_clk = local_clock();
+       local_irq_restore(flags);
+
+       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
 
-       SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+       SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+       SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+       PN(ktime);
+       PN(sched_clk);
+       PN(cpu_clk);
+       P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+       P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+       SEQ_printf(m, "\n");
+       SEQ_printf(m, "sysctl_sched\n");
 
 #define P(x) \
        SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
 #define PN(x) \
        SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-       P(jiffies);
        PN(sysctl_sched_latency);
        PN(sysctl_sched_min_granularity);
        PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676bd87a6e5b3be513d27028e518796..c62ebae65cf0c5e5d1628b0368692a94cda37568 100644 (file)
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
        return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list) {
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
+
+               cfs_rq->on_list = 1;
+       }
+}
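The branch above maintains one invariant for walks of rq->leaf_cfs_rq_list: a child cfs_rq is always visited before its parent. Because entities are enqueued bottom-up, only two cases exist: the parent is already queued (insert at the head, ahead of it) or it is not (insert at the tail, so the parent lands behind us later). A toy userspace trace of the rule, with made-up group names:

#include <stdio.h>
#include <string.h>

static const char *list[8];
static int n;

static int on_list(const char *s)
{
	int i;

	for (i = 0; i < n; i++)
		if (!strcmp(list[i], s))
			return 1;
	return 0;
}

static void add_leaf(const char *self, const char *parent)
{
	if (parent && on_list(parent)) {
		/* parent already queued: head, so we stay ahead of it */
		memmove(&list[1], &list[0], n * sizeof(list[0]));
		list[0] = self;
	} else {
		/* parent not queued yet: tail, it will land after us */
		list[n] = self;
	}
	n++;
}

int main(void)
{
	int i;

	add_leaf("grandchild", "child");	/* child absent -> tail */
	add_leaf("child", "root");		/* root absent  -> tail */
	add_leaf("root", NULL);			/* root group   -> tail */
	for (i = 0; i < n; i++)
		printf("%s\n", list[i]);	/* children precede parents */
	return 0;
}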
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->on_list) {
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+               cfs_rq->on_list = 0;
+       }
+}
+
 /* Iterate through all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
        return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
        WRT_SYSCTL(sched_min_granularity);
        WRT_SYSCTL(sched_latency);
        WRT_SYSCTL(sched_wakeup_granularity);
-       WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
        return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                list_add(&se->group_node, &cfs_rq->tasks);
        }
        cfs_rq->nr_running++;
-       se->on_rq = 1;
 }
 
 static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                list_del_init(&se->group_node);
        }
        cfs_rq->nr_running--;
-       se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load;
+
+       if (!cfs_rq)
+               return;
+
+       load = cfs_rq->load.weight;
+
+       now = rq_of(cfs_rq)->clock;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
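The closing while loop is what makes the window exponential rather than a plain moving average: every full period that elapses folds the history, halving the weight of everything older. A standalone sketch of the fold with made-up numbers (nanoseconds, matching sysctl_sched_shares_window above):

#include <stdio.h>

int main(void)
{
	unsigned long long period = 10000000ULL;	/* 10 ms window     */
	unsigned long long load_period = 37000000ULL;	/* accumulated time */
	unsigned long long load_avg = 480000000ULL;	/* accumulated load */

	while (load_period > period) {
		load_period /= 2;	/* two folds: 37M -> 18.5M -> 9.25M */
		load_avg /= 2;		/* history decays with the window   */
	}
	printf("period=%llu avg=%llu\n", load_period, load_avg);
	return 0;
}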
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long load_weight, load, shares;
+
+       if (!cfs_rq)
+               return;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se)
+               return;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
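With hypothetical numbers, the proportion above assigns this cpu its slice of the group's total shares and clamps the result (MIN_SHARES mirrors the kernel's floor, assumed to be 2 here):

#include <stdio.h>

#define MIN_SHARES 2	/* assumed floor, as in the kernel constant */

int main(void)
{
	long tg_shares = 1024;		/* group's total shares       */
	long load = 2048;		/* this cpu's cfs_rq weight   */
	long load_weight = 8192;	/* group-wide weight estimate */
	long shares = tg_shares * load / load_weight;	/* 256 */

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	printf("shares = %ld\n", shares);
	return 0;
}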
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, se->load.weight);
        account_entity_enqueue(cfs_rq, se);
 
        if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        check_spread(cfs_rq, se);
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1)
+               list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
        account_entity_dequeue(cfs_rq, se);
        update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq, 0);
 
        /*
         * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         */
        update_curr(cfs_rq);
 
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
        /*
         * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                flags = ENQUEUE_WAKEUP;
        }
 
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
        hrtick_update(rq);
 }
 
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
+
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
                flags |= DEQUEUE_SLEEP;
        }
 
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
        hrtick_update(rq);
 }
 
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip; some tracing showed us that when balancing
- * between cgroup:/ and cgroup:/foo the number of affine wakeups increased
- * significantly. Therefore try to bias the error in the direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-               long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
        if (!tg->parent)
                return wl;
 
-       /*
-        * By not taking the decrease of shares on the other cpu into
-        * account our error leans towards reducing the affine wakeups.
-        */
-       if (!wl && sched_feat(ASYM_EFF_LOAD))
-               return wl;
-
        for_each_sched_entity(se) {
                long S, rw, s, a, b;
-               long more_w;
-
-               /*
-                * Instead of using this increment, also add the difference
-                * between when the shares were last updated and now.
-                */
-               more_w = se->my_q->load.weight - se->my_q->rq_weight;
-               wl += more_w;
-               wg += more_w;
 
                S = se->my_q->tg->shares;
-               s = se->my_q->shares;
-               rw = se->my_q->rq_weight;
+               s = se->load.weight;
+               rw = se->my_q->load.weight;
 
                a = S*(rw + wl);
                b = S*rw + s*wg;
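The a and b terms can be recovered by assuming the group's shares S are split in proportion to runqueue weight; the hunk shown here ends before the final update, so the following is a reconstruction of the algebra rather than the diff's own text. If this entity currently holds s = S \cdot rw / W of the shares (W being the group-wide weight), then after adding wl on this cpu and wg group-wide:

	s' = \frac{S (rw + wl)}{W + wg}
	   = \frac{S (rw + wl)}{S\,rw/s + wg}
	   = \frac{s\,S (rw + wl)}{S\,rw + s\,wg}
	   = \frac{s\,a}{b}

so the weight change propagated to the next level up is s' - s = s(a - b)/b.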
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                        sd = tmp;
        }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (sched_feat(LB_SHARES_UPDATE)) {
-               /*
-                * Pick the largest domain to update shares over
-                */
-               tmp = sd;
-               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-                       tmp = affine_sd;
-
-               if (tmp) {
-                       raw_spin_unlock(&rq->lock);
-                       update_shares(tmp);
-                       raw_spin_lock(&rq->lock);
-               }
-       }
-#endif
-
        if (affine_sd) {
                if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                        return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq, 0);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+static void update_shares(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               update_shares_cpu(cfs_rq->tg, cpu);
+       rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        schedstat_inc(sd, lb_count[idle]);
 
 redo:
-       update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
 
@@ -3174,8 +3381,6 @@ out_one_pinned:
        else
                ld_moved = 0;
 out:
-       if (ld_moved)
-               update_shares(sd);
        return ld_moved;
 }
 
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
         */
        raw_spin_unlock(&this_rq->lock);
 
+       update_shares(this_cpu);
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        int update_next_balance = 0;
        int need_serialize;
 
+       update_shares(cpu);
+
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
index 185f920ec1a2e923b0d966f787c610ff26a7b6cb..68e69acc29b9570b10ecf8a5892ea62c48700c5c 100644 (file)
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)
 
 /*
  * Spin-wait on mutex acquisition when the mutex owner is running on
index bea7d79f7e9ca958bba514cbd8eb48ceab47bab3..c914ec747ca6709e25a177eb3c4152c75cb40aee 100644 (file)
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
        return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 }
 
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_add_rcu(&rt_rq->leaf_rt_rq_list,
+                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
        list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
        return ktime_to_ns(def_rt_bandwidth.rt_period);
 }
 
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
        for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
 
+       if (!rt_rq->rt_nr_running)
+               list_add_leaf_rt_rq(rt_rq);
+
        if (head)
                list_add(&rt_se->run_list, queue);
        else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
                __clear_bit(rt_se_prio(rt_se), array->bitmap);
 
        dec_rt_tasks(rt_se, rt_rq);
+       if (!rt_rq->rt_nr_running)
+               list_del_leaf_rt_rq(rt_rq);
 }
 
 /*
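
The pattern here is the empty/non-empty transition: an rt_rq is published on the per-rq leaf list when its first task is enqueued and unpublished when its last task leaves, so for_each_leaf_rt_rq() only ever visits runqueues with work on them. A generic sketch with hypothetical names:

    struct myq {                            /* hypothetical queue type */
            unsigned int     nr_running;
            struct list_head leaf;
    };
    static LIST_HEAD(leaf_list);

    static void my_enqueue(struct myq *q)
    {
            if (!q->nr_running)             /* empty -> non-empty: publish */
                    list_add_rcu(&q->leaf, &leaf_list);
            q->nr_running++;
    }

    static void my_dequeue(struct myq *q)
    {
            q->nr_running--;
            if (!q->nr_running)             /* non-empty -> empty: unpublish */
                    list_del_rcu(&q->leaf);
    }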
index 18f4be0d5fe0bbf853935972d9b441e95bc61c5a..d4d918a91881407acd8abbde6691f77197cd013c 100644 (file)
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                             cpumask_any(cpu_online_mask));
        case CPU_DEAD:
        case CPU_DEAD_FROZEN: {
-               struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+               static struct sched_param param = {
+                       .sched_priority = MAX_RT_PRIO-1
+               };
 
                p = per_cpu(ksoftirqd, hotcpu);
                per_cpu(ksoftirqd, hotcpu) = NULL;
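
sched_setscheduler() only reads the sched_param it is handed, so the struct need not live on the stack; making it static moves it into .data and trims the stack frame. The same change recurs in trace_selftest.c and watchdog.c further down. The pattern, as a sketch:

    /* sketch: the scheduler copies *param, so a static object is safe */
    static int my_kthread(void *unused)
    {
            static struct sched_param param = {
                    .sched_priority = MAX_RT_PRIO - 1
            };

            sched_setscheduler(current, SCHED_FIFO, &param);
            /* ... */
            return 0;
    }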
index c71e075005368eceff3aab4340f94beca4aee249..98d8c1e80edbcb106ba8e87c34777459aa4eff55 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/srcu.h>
 
 static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
         * all srcu_read_lock() calls using the old counters have completed.
         * Their corresponding critical sections might well be still
         * executing, but the srcu_read_lock() primitives themselves
-        * will have finished executing.
+        * will have finished executing.  We initially give readers
+        * an arbitrarily chosen 10 microseconds to get out of their
+        * SRCU read-side critical sections, then loop waiting 1/HZ
+        * seconds per iteration.
         */
 
+       if (srcu_readers_active_idx(sp, idx))
+               udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
        while (srcu_readers_active_idx(sp, idx))
                schedule_timeout_interruptible(1);
 
index 7f5a0cd296a96ca44e43f0db028026094dbbb57a..2745dcdb6c6c5756a7bafd19e6497c31cc077d60 100644 (file)
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
        err = session;
 out:
        write_unlock_irq(&tasklist_lock);
-       if (err > 0)
+       if (err > 0) {
                proc_sid_connector(group_leader);
+               sched_autogroup_create_attach(group_leader);
+       }
        return err;
 }
 
index 46404414d8a7d4b187903906927157e1f092020a..ae5cbb1e3ced15b8cc2e00b052496953709cf4c2 100644 (file)
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns;                       /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -304,15 +302,6 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
-       {
-               .procname       = "sched_shares_ratelimit",
-               .data           = &sysctl_sched_shares_ratelimit,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = sched_proc_update_handler,
-               .extra1         = &min_sched_shares_ratelimit,
-               .extra2         = &max_sched_shares_ratelimit,
-       },
        {
                .procname       = "sched_tunable_scaling",
                .data           = &sysctl_sched_tunable_scaling,
@@ -322,14 +311,6 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_sched_tunable_scaling,
                .extra2         = &max_sched_tunable_scaling,
        },
-       {
-               .procname       = "sched_shares_thresh",
-               .data           = &sysctl_sched_shares_thresh,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-       },
        {
                .procname       = "sched_migration_cost",
                .data           = &sysctl_sched_migration_cost,
@@ -351,6 +332,13 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "sched_shares_window",
+               .data           = &sysctl_sched_shares_window,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        {
                .procname       = "timer_migration",
                .data           = &sysctl_timer_migration,
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+#ifdef CONFIG_SCHED_AUTOGROUP
+       {
+               .procname       = "sched_autogroup_enabled",
+               .data           = &sysctl_sched_autogroup_enabled,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
 #ifdef CONFIG_PROVE_LOCKING
        {
                .procname       = "prove_locking",
index ac38fbb176ccd0bb598b1eaaa7f2a703b17ec565..a9ae369925ce14fa4cf7ca1674d9f4903a6f4a24 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
+#include <linux/kernel.h>
 
 /*
  * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
        int index;
        int num_samples = sync->num_samples;
 
-       if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+       if (num_samples > ARRAY_SIZE(buffer)) {
                samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
                if (!samples) {
                        samples = buffer;
-                       num_samples = sizeof(buffer)/sizeof(buffer[0]);
+                       num_samples = ARRAY_SIZE(buffer);
                }
        } else {
                samples = buffer;
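
ARRAY_SIZE() from <linux/kernel.h> (hence the added include above) replaces the open-coded division, and the kernel's definition also breaks the build if its argument is a pointer rather than a true array. For example:

    #include <linux/kernel.h>

    static int buf[32];
    static const size_t n = ARRAY_SIZE(buf);  /* 32, computed at compile time */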
index 49010d822f725b47726742fa7e1b45aad076ef90..5bb86da8200373a2e6cd64fdcbe0355f43f5a27f 100644 (file)
@@ -32,6 +32,8 @@ struct timekeeper {
        cycle_t cycle_interval;
        /* Number of clock shifted nano seconds in one NTP interval. */
        u64     xtime_interval;
+       /* shifted nano seconds left over when rounding cycle_interval */
+       s64     xtime_remainder;
        /* Raw nano seconds accumulated per NTP interval. */
        u32     raw_interval;
 
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
 static void timekeeper_setup_internals(struct clocksource *clock)
 {
        cycle_t interval;
-       u64 tmp;
+       u64 tmp, ntpinterval;
 
        timekeeper.clock = clock;
        clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
+       ntpinterval = tmp;
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
 
        /* Go back from cycles -> shifted ns */
        timekeeper.xtime_interval = (u64) interval * clock->mult;
+       timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
        timekeeper.raw_interval =
                ((u64) interval * clock->mult) >> clock->shift;
 
@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
 
        /* Accumulate error between NTP and clock interval */
        timekeeper.ntp_error += tick_length << shift;
-       timekeeper.ntp_error -= timekeeper.xtime_interval <<
+       timekeeper.ntp_error -=
+           (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
                                (timekeeper.ntp_error_shift + shift);
 
        return offset;
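
Concretely: cycle_interval is the NTP tick length rounded to a whole number of clocksource cycles, so converting it back to shifted nanoseconds (xtime_interval) can differ slightly from the true ntpinterval, in either direction since the rounding is to nearest. If, say, ntpinterval is 10,000,005 shifted-ns and interval * mult comes to 10,000,000, the 5 shifted-ns remainder previously fed ntp_error every tick and had to be steered back out; stashing it in xtime_remainder and subtracting (xtime_interval + xtime_remainder) makes each accumulation step cancel the full tick_length exactly.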
index ab8f5e33fa92c76db813d1419e6a339f3a7aca52..32a19f9397fc347c3144a01e142308c026f49c70 100644 (file)
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
 {
        struct hrtimer *timer, tmp;
        unsigned long next = 0, i;
-       struct rb_node *curr;
+       struct timerqueue_node *curr;
        unsigned long flags;
 
 next_one:
        i = 0;
        raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
 
-       curr = base->first;
+       curr = timerqueue_getnext(&base->active);
        /*
         * Crude but we have to do this O(N*N) thing, because
         * we have to unlock the base when printing:
         */
        while (curr && i < next) {
-               curr = rb_next(curr);
+               curr = timerqueue_iterate_next(curr);
                i++;
        }
 
        if (curr) {
 
-               timer = rb_entry(curr, struct hrtimer, node);
+               timer = container_of(curr, struct hrtimer, node);
                tmp = *timer;
                raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
 
index 353b9227c2ecfe11793a17b0a41f534ebdbd14f8..43ca9936f2d06a2cba572f1c877406f9281e68a3 100644 (file)
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG          (0x1)
-
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
 
 static inline void timer_set_deferrable(struct timer_list *timer)
 {
-       timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
-                                      TBASE_DEFERRABLE_FLAG));
+       timer->base = TBASE_MAKE_DEFERRED(timer->base);
 }
 
 static inline void
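
TBASE_MAKE_DEFERRED() — presumably defined next to wherever TBASE_DEFERRABLE_FLAG moved — packages the trick the removed comment described: tvec_base is at least 2-byte aligned, so bit 0 of a pointer to it is free to carry the deferrable flag. In isolation the idiom looks like:

    /* sketch of pointer LSB tagging on a suitably aligned type */
    #define TAG             0x1UL
    #define tag_ptr(p)      ((void *)((unsigned long)(p) | TAG))
    #define is_tagged(p)    ((unsigned long)(p) & TAG)
    #define untag_ptr(p)    ((void *)((unsigned long)(p) & ~TAG))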
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
-
-static inline void set_running_timer(struct tvec_base *base,
-                                       struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
-       base->running_timer = timer;
-#endif
-}
-
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
        unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
 }
 EXPORT_SYMBOL(del_timer);
 
-#ifdef CONFIG_SMP
 /**
  * try_to_del_timer_sync - Try to deactivate a timer
  * @timer: the timer to deactivate
  *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
@@ -973,6 +948,7 @@ out:
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
+#ifdef CONFIG_SMP
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Synchronization rules: Callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
+ * hardirq contexts. The caller must not hold locks which would prevent
  * completion of the timer's handler. The timer's handler must not call
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
 int del_timer_sync(struct timer_list *timer)
 {
 #ifdef CONFIG_LOCKDEP
-       unsigned long flags;
-
-       local_irq_save(flags);
+       local_bh_disable();
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
-       local_irq_restore(flags);
+       local_bh_enable();
 #endif
-
+       /*
+        * Don't call this from hardirq context: waiting there for
+        * the handler to finish can deadlock.
+        */
+       WARN_ON(in_irq());
        for (;;) {
                int ret = try_to_del_timer_sync(timer);
                if (ret >= 0)
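
Two changes meet here: try_to_del_timer_sync() is no longer SMP-only (the #ifdef moves below it), and del_timer_sync() now asserts its context rule instead of merely documenting it. The deadlock WARN_ON(in_irq()) guards against is local to one CPU:

    /* hypothetical deadlock on a single CPU:
     *
     *   timer softirq:  my_timer_fn() is mid-execution ...
     *   hardirq fires:  ... and calls del_timer_sync(&my_timer),
     *                   which spins forever: the handler cannot
     *                   resume until the hardirq returns.
     */

The lockdep acquire/release pair flags lock-ordering versions of the same problem at annotation time; switching it from local_irq_save() to local_bh_disable() models the softirq context timer handlers actually run in.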
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
 
                        timer_stats_account_timer(timer);
 
-                       set_running_timer(base, timer);
+                       base->running_timer = timer;
                        detach_timer(timer, 1);
 
                        spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
                        spin_lock_irq(&base->lock);
                }
        }
-       set_running_timer(base, NULL);
+       base->running_timer = NULL;
        spin_unlock_irq(&base->lock);
 }
 
@@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
  */
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
-       struct tvec_base *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __this_cpu_read(tvec_bases);
        unsigned long expires;
 
        /*
@@ -1298,7 +1276,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-       struct tvec_base *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __this_cpu_read(tvec_bases);
 
        hrtimer_run_pending();
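
__this_cpu_read() is the cheaper accessor for this pattern: on x86 it compiles to a single %gs-relative load, whereas __get_cpu_var() first computes the per-cpu address. Preemption must already be disabled, which holds in these softirq paths. Usage sketch:

    static DEFINE_PER_CPU(struct tvec_base *, demo_bases);  /* hypothetical */

    static void demo(void)
    {
            struct tvec_base *base = __this_cpu_read(demo_bases);
            /* ... use base ... */
    }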
 
index 53f338190b260df929d3ef6020d2bad817fcc73f..761c510a06c5989ac7e03be55991a1ac346b4db2 100644 (file)
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
index f8cf959bad456dead3a5dec2fdd1a21e384f22bb..dc53ecb8058919ed329b02c5644c7f23449cce7b 100644 (file)
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 
        __this_cpu_inc(user_stack_count);
 
-
-
        event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                          sizeof(*entry), flags, pc);
        if (!event)
-               return;
+               goto out_drop_count;
        entry   = ring_buffer_event_data(event);
 
        entry->tgid             = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
        if (!filter_check_discard(call, entry, buffer, event))
                ring_buffer_unlock_commit(buffer, event);
 
+ out_drop_count:
        __this_cpu_dec(user_stack_count);
-
  out:
        preempt_enable();
 }
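
The bug fixed here: the early return after a failed trace_buffer_lock_reserve() skipped __this_cpu_dec(), permanently wedging user_stack_count on that CPU. The fix restores the usual reverse-order goto unwind; generically, with hypothetical helpers:

    static DEFINE_PER_CPU(int, demo_count);
    static bool demo_reserve(void);         /* hypothetical reservation step */

    static int demo(void)
    {
            int err = 0;

            __this_cpu_inc(demo_count);
            if (!demo_reserve()) {
                    err = -ENOMEM;
                    goto out_drop_count;    /* undo in reverse order */
            }
            /* ... commit ... */
    out_drop_count:
            __this_cpu_dec(demo_count);
            return err;
    }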
index 155a415b3209c0c4e65936be1b593678aea42d27..562c56e048fdbc34b18dded38cd21ccb3cd87f08 100644 (file)
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
 static int trace_wakeup_test_thread(void *data)
 {
        /* Make this a RT thread, doesn't need to be too high */
-       struct sched_param param = { .sched_priority = 5 };
+       static struct sched_param param = { .sched_priority = 5 };
        struct completion *x = data;
 
        sched_setscheduler(current, SCHED_FIFO, &param);
index aaa8dae0823619222010875d9fdc6f5dac182d2d..6e7b575ac33cf2dcba3f9dc749f7039e6805a3f0 100644 (file)
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+       static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
        struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
        sched_setscheduler(current, SCHED_FIFO, &param);
index e6a3763b82126729ecad6636caec9686cd7dec5f..9e2db72d128e6f22ec560ff7f04584777c293dd7 100644 (file)
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
-        rbtree.o radix-tree.o dump_stack.o \
+        rbtree.o radix-tree.o dump_stack.o timerqueue.o \
         idr.o int_sqrt.o extable.o prio_tree.o \
         sha1.o irq_regs.o reciprocal_div.o argv_split.o \
         proportions.o prio_heap.o ratelimit.o show_mem.o \
index 3094318bfea70abd29ffdfdedca291e86020ee04..b335acb43be2cae4a4257cc9c8eb1ba5d914ce8b 100644 (file)
@@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query,
                        else if (!dp->flags)
                                dt->num_enabled++;
                        dp->flags = newflags;
-                       if (newflags) {
-                               jump_label_enable(&dp->enabled);
-                       } else {
-                               jump_label_disable(&dp->enabled);
-                       }
+                       if (newflags)
+                               dp->enabled = 1;
+                       else
+                               dp->enabled = 0;
                        if (verbose)
                                printk(KERN_INFO
                                        "ddebug: changed %s:%d [%s]%s %s\n",
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
new file mode 100644 (file)
index 0000000..e3a1050
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ *  Generic Timer-queue
+ *
+ *  Manages a simple queue of timers, ordered by expiration time.
+ *  Uses an rbtree for quick insertion and ordered expiration.
+ *
+ *  NOTE: All of the following functions need to be serialized
+ *  to avoid races. No locking is done by this library code.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/timerqueue.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+/**
+ * timerqueue_add - Adds timer to timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be added
+ *
+ * Adds the timer node to the timerqueue, sorted by the
+ * node's expires value.
+ */
+void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+       struct rb_node **p = &head->head.rb_node;
+       struct rb_node *parent = NULL;
+       struct timerqueue_node  *ptr;
+
+       /* Make sure we don't add nodes that are already added */
+       WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+
+       while (*p) {
+               parent = *p;
+               ptr = rb_entry(parent, struct timerqueue_node, node);
+               if (node->expires.tv64 < ptr->expires.tv64)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&node->node, parent, p);
+       rb_insert_color(&node->node, &head->head);
+
+       if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+               head->next = node;
+}
+EXPORT_SYMBOL_GPL(timerqueue_add);
+
+/**
+ * timerqueue_del - Removes a timer from the timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be removed
+ *
+ * Removes the timer node from the timerqueue.
+ */
+void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+       WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+
+       /* update next pointer */
+       if (head->next == node) {
+               struct rb_node *rbn = rb_next(&node->node);
+
+               head->next = rbn ?
+                       rb_entry(rbn, struct timerqueue_node, node) : NULL;
+       }
+       rb_erase(&node->node, &head->head);
+       RB_CLEAR_NODE(&node->node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_del);
+
+/**
+ * timerqueue_iterate_next - Returns the timer after the provided timer
+ *
+ * @node: Pointer to a timer.
+ *
+ * Provides the timer that is after the given node. This is used, when
+ * necessary, to iterate through the list of timers in a timer list
+ * without modifying the list.
+ */
+struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
+{
+       struct rb_node *next;
+
+       if (!node)
+               return NULL;
+       next = rb_next(&node->node);
+       if (!next)
+               return NULL;
+       return container_of(next, struct timerqueue_node, node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
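
Together with the companion header's static inlines (timerqueue_init(), timerqueue_init_head(), timerqueue_getnext() — assumed to come from the same commit), typical usage of the new API:

    #include <linux/timerqueue.h>

    static struct timerqueue_head q;
    static struct timerqueue_node a, b;

    static void demo(void)
    {
            timerqueue_init_head(&q);
            timerqueue_init(&a);
            timerqueue_init(&b);

            a.expires = ktime_set(1, 0);        /* 1 s    */
            b.expires = ktime_set(0, 500000);   /* 500 us */
            timerqueue_add(&q, &a);
            timerqueue_add(&q, &b);

            /* head->next caches the earliest deadline: O(1) peek */
            WARN_ON(timerqueue_getnext(&q) != &b);

            timerqueue_del(&q, &b);  /* next is re-derived via rb_next() */
    }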
index 39580a5dc5df6083a5766ae8853c6610e417474f..9f85012acf0d2fb749a4ded8bb962d514d524c03 100755 (executable)
@@ -155,6 +155,8 @@ use strict;
 # '@parameter' - name of a parameter
 # '%CONST' - name of a constant.
 
+## initialize global state
+
 my $errors = 0;
 my $warnings = 0;
 my $anon_struct_union = 0;
@@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1",
                        $type_param, "\$1" );
 my $blankline_list = "";
 
-sub usage {
-    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
-    print "         [ -no-doc-sections ]\n";
-    print "         [ -function funcname [ -function funcname ...] ]\n";
-    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
-    print "         c source file(s) > outputfile\n";
-    print "         -v : verbose output, more warnings & other info listed\n";
-    exit 1;
-}
-
 # read arguments
 if ($#ARGV == -1) {
     usage();
 }
 
+my $kernelversion;
+my $dohighlight = "";
+
 my $verbose = 0;
 my $output_mode = "man";
 my $no_doc_sections = 0;
@@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June',
                'November', 'December')[(localtime)[4]] .
   " " . ((localtime)[5]+1900);
 
-# Essentially these are globals
+# Essentially these are globals.
 # They probably want to be tidied up, made more localised or something.
 # CAVEAT EMPTOR!  Some of the others I localised may not want to be, which
 # could cause "use of undefined value" or other bugs.
@@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) {
     }
 }
 
+# main program flow continues near EOF
+
+sub usage {
+    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+    print "         [ -no-doc-sections ]\n";
+    print "         [ -function funcname [ -function funcname ...] ]\n";
+    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+    print "         c source file(s) > outputfile\n";
+    print "         -v : verbose output, more warnings & other info listed\n";
+    exit 1;
+}
+
 # get kernel version from env
 sub get_kernel_version() {
     my $version = 'unknown kernel version';
@@ -362,15 +369,6 @@ sub get_kernel_version() {
     }
     return $version;
 }
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
-    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
 
 ##
 # dumps section contents to arrays/hashes intended for that purpose.
@@ -1851,34 +1849,6 @@ sub dump_function($$) {
                       });
 }
 
-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
-       my ($relname, $absname);
-       while(<SOURCE_MAP>) {
-               chop();
-               ($relname, $absname) = (split())[0..1];
-               $relname =~ s:^/+::;
-               $source_map{$relname} = $absname;
-       }
-       close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
-    chomp;
-    process_file($_);
-}
-if ($verbose && $errors) {
-  print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
-  print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
 sub reset_state {
     $function = "";
     %constants = ();
@@ -2285,3 +2255,39 @@ sub process_file($) {
        }
     }
 }
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+       my ($relname, $absname);
+       while(<SOURCE_MAP>) {
+               chop();
+               ($relname, $absname) = (split())[0..1];
+               $relname =~ s:^/+::;
+               $source_map{$relname} = $absname;
+       }
+       close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+    chomp;
+    process_file($_);
+}
+if ($verbose && $errors) {
+  print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+  print STDERR "$warnings warnings\n";
+}
+
+exit($errors);