Merge remote-tracking branch 'tip/auto-latest'
author    Thierry Reding <treding@nvidia.com>
Thu, 24 Oct 2013 12:39:48 +0000 (14:39 +0200)
committer Thierry Reding <treding@nvidia.com>
Thu, 24 Oct 2013 12:51:22 +0000 (14:51 +0200)
Conflicts:
arch/h8300/include/asm/Kbuild
include/linux/wait.h
tools/perf/config/Makefile
tools/perf/config/feature-tests.mak

445 files changed:
Documentation/DocBook/device-drivers.tmpl
Documentation/DocBook/genericirq.tmpl
Documentation/RCU/checklist.txt
Documentation/RCU/stallwarn.txt
Documentation/efi-stub.txt [moved from Documentation/x86/efi-stub.txt with 100% similarity]
Documentation/kernel-parameters.txt
Documentation/kernel-per-CPU-kthreads.txt
Documentation/lockstat.txt
Documentation/sysctl/kernel.txt
MAINTAINERS
arch/Kconfig
arch/alpha/include/asm/Kbuild
arch/arc/include/asm/Kbuild
arch/arm/Kconfig
arch/arm/boot/dts/zynq-7000.dtsi
arch/arm/include/asm/Kbuild
arch/arm/include/asm/arch_timer.h
arch/arm/include/uapi/asm/hwcap.h
arch/arm/kernel/arch_timer.c
arch/arm/kernel/setup.c
arch/arm/mach-msm/timer.c
arch/arm/mach-zynq/Kconfig
arch/arm64/Kconfig
arch/arm64/include/asm/Kbuild
arch/arm64/include/asm/arch_timer.h
arch/arm64/include/asm/hwcap.h
arch/arm64/include/uapi/asm/hwcap.h
arch/arm64/kernel/setup.c
arch/arm64/kernel/time.c
arch/avr32/include/asm/Kbuild
arch/blackfin/include/asm/Kbuild
arch/c6x/include/asm/Kbuild
arch/cris/include/asm/Kbuild
arch/frv/include/asm/Kbuild
arch/hexagon/include/asm/Kbuild
arch/ia64/include/asm/Kbuild
arch/ia64/include/asm/io.h
arch/ia64/kernel/efi.c
arch/m32r/include/asm/Kbuild
arch/m68k/include/asm/Kbuild
arch/metag/include/asm/Kbuild
arch/metag/include/asm/topology.h
arch/metag/kernel/irq.c
arch/microblaze/include/asm/Kbuild
arch/mips/include/asm/Kbuild
arch/mips/kernel/rtlx.c
arch/mips/mm/init.c
arch/mn10300/include/asm/Kbuild
arch/openrisc/include/asm/Kbuild
arch/parisc/include/asm/Kbuild
arch/parisc/kernel/irq.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/Kbuild
arch/powerpc/kernel/irq.c
arch/s390/include/asm/Kbuild
arch/s390/kernel/irq.c
arch/score/include/asm/Kbuild
arch/sh/include/asm/Kbuild
arch/sh/kernel/irq.c
arch/sparc/include/asm/Kbuild
arch/sparc/kernel/irq_64.c
arch/tile/include/asm/Kbuild
arch/um/include/asm/Kbuild
arch/unicore32/include/asm/Kbuild
arch/x86/Kconfig
arch/x86/boot/Makefile
arch/x86/boot/boot.h
arch/x86/boot/compressed/Makefile
arch/x86/boot/compressed/aslr.c [new file with mode: 0644]
arch/x86/boot/compressed/cmdline.c
arch/x86/boot/compressed/cpuflags.c [new file with mode: 0644]
arch/x86/boot/compressed/eboot.c
arch/x86/boot/compressed/eboot.h
arch/x86/boot/compressed/head_32.S
arch/x86/boot/compressed/head_64.S
arch/x86/boot/compressed/misc.c
arch/x86/boot/compressed/misc.h
arch/x86/boot/compressed/mkpiggy.c
arch/x86/boot/cpucheck.c
arch/x86/boot/cpuflags.c [new file with mode: 0644]
arch/x86/boot/cpuflags.h [new file with mode: 0644]
arch/x86/boot/tools/build.c
arch/x86/include/asm/archrandom.h
arch/x86/include/asm/atomic.h
arch/x86/include/asm/atomic64_64.h
arch/x86/include/asm/bitops.h
arch/x86/include/asm/calling.h
arch/x86/include/asm/intel-mid.h [new file with mode: 0644]
arch/x86/include/asm/intel_mid_vrtc.h [moved from arch/x86/include/asm/mrst-vrtc.h with 81% similarity]
arch/x86/include/asm/local.h
arch/x86/include/asm/misc.h [new file with mode: 0644]
arch/x86/include/asm/mrst.h [deleted file]
arch/x86/include/asm/page_64_types.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/preempt.h [new file with mode: 0644]
arch/x86/include/asm/rmwcc.h [new file with mode: 0644]
arch/x86/include/asm/setup.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/uaccess_64.h
arch/x86/include/asm/uv/uv.h
arch/x86/include/asm/uv/uv_hub.h
arch/x86/include/asm/uv/uv_mmrs.h
arch/x86/include/uapi/asm/bootparam.h
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/apb_timer.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event.h
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/perf_event_intel_ds.c
arch/x86/kernel/cpu/perf_event_intel_lbr.c
arch/x86/kernel/cpu/rdrand.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/head32.c
arch/x86/kernel/i386_ksyms_32.c
arch/x86/kernel/i8259.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/msr.c
arch/x86/kernel/preempt.S [new file with mode: 0644]
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/reboot.c
arch/x86/kernel/rtc.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/lib/Makefile
arch/x86/lib/misc.c [new file with mode: 0644]
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/pci/Makefile
arch/x86/pci/intel_mid_pci.c [moved from arch/x86/pci/mrst.c with 96% similarity]
arch/x86/platform/Makefile
arch/x86/platform/efi/efi.c
arch/x86/platform/geode/alix.c
arch/x86/platform/geode/geos.c
arch/x86/platform/geode/net5501.c
arch/x86/platform/intel-mid/Makefile [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/Makefile [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_bma023.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_emc1403.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_ipc.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_ipc.h [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_lis331.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_max3111.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_max7315.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic.h [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c [new file with mode: 0644]
arch/x86/platform/intel-mid/device_libs/platform_tca6416.c [new file with mode: 0644]
arch/x86/platform/intel-mid/early_printk_intel_mid.c [moved from arch/x86/platform/mrst/early_printk_mrst.c with 97% similarity]
arch/x86/platform/intel-mid/intel-mid.c [new file with mode: 0644]
arch/x86/platform/intel-mid/intel_mid_vrtc.c [moved from arch/x86/platform/mrst/vrtc.c with 90% similarity]
arch/x86/platform/intel-mid/sfi.c [new file with mode: 0644]
arch/x86/platform/mrst/Makefile [deleted file]
arch/x86/platform/mrst/mrst.c [deleted file]
arch/x86/platform/uv/Makefile
arch/x86/platform/uv/uv_nmi.c [new file with mode: 0644]
arch/x86/tools/relocs.c
arch/xtensa/include/asm/Kbuild
drivers/acpi/processor_idle.c
drivers/clocksource/Kconfig
drivers/clocksource/arm_arch_timer.c
drivers/clocksource/arm_global_timer.c
drivers/clocksource/bcm2835_timer.c
drivers/clocksource/clksrc-dbx500-prcmu.c
drivers/clocksource/clksrc-of.c
drivers/clocksource/dw_apb_timer_of.c
drivers/clocksource/mxs_timer.c
drivers/clocksource/nomadik-mtu.c
drivers/clocksource/samsung_pwm_timer.c
drivers/clocksource/tcb_clksrc.c
drivers/clocksource/tegra20_timer.c
drivers/clocksource/time-armada-370-xp.c
drivers/clocksource/timer-prima2.c
drivers/clocksource/vf_pit_timer.c
drivers/clocksource/vt8500_timer.c
drivers/firmware/efi/efi-stub-helper.c [new file with mode: 0644]
drivers/firmware/efi/efi.c
drivers/firmware/efi/efivars.c
drivers/gpu/drm/gma500/mdfld_dsi_output.h
drivers/gpu/drm/gma500/oaktrail_device.c
drivers/gpu/drm/gma500/oaktrail_lvds.c
drivers/idle/intel_idle.c
drivers/platform/x86/intel_scu_ipc.c
drivers/rtc/interface.c
drivers/rtc/rtc-mrst.c
drivers/rtc/rtc-pl031.c
drivers/watchdog/intel_scu_watchdog.c
fs/exec.c
fs/proc/array.c
include/asm-generic/preempt.h [new file with mode: 0644]
include/clocksource/arm_arch_timer.h
include/linux/clockchips.h
include/linux/clocksource.h
include/linux/efi.h
include/linux/hardirq.h
include/linux/interrupt.h
include/linux/kdb.h
include/linux/kgdb.h
include/linux/mempolicy.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/page-flags-layout.h
include/linux/perf_event.h
include/linux/preempt.h
include/linux/rculist.h
include/linux/rcupdate.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/sched.h
include/linux/sched/sysctl.h
include/linux/sched_clock.h
include/linux/sfi.h
include/linux/stop_machine.h
include/linux/thread_info.h
include/linux/topology.h
include/linux/tty.h
include/linux/uaccess.h
include/linux/wait.h
include/trace/events/rcu.h
include/trace/events/sched.h
include/uapi/linux/perf_event.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/bounds.c
kernel/context_tracking.c
kernel/cpu.c
kernel/cpu/idle.c
kernel/debug/debug_core.c
kernel/debug/debug_core.h
kernel/debug/kdb/kdb_debugger.c
kernel/debug/kdb/kdb_main.c
kernel/events/core.c
kernel/fork.c
kernel/hung_task.c
kernel/lockdep.c
kernel/lockdep_proc.c
kernel/mutex.c
kernel/rcu/Makefile [new file with mode: 0644]
kernel/rcu/rcu.h [moved from kernel/rcu.h with 94% similarity]
kernel/rcu/srcu.c [moved from kernel/srcu.c with 100% similarity]
kernel/rcu/tiny.c [moved from kernel/rcutiny.c with 90% similarity]
kernel/rcu/tiny_plugin.h [moved from kernel/rcutiny_plugin.h with 100% similarity]
kernel/rcu/torture.c [moved from kernel/rcutorture.c with 99% similarity]
kernel/rcu/tree.c [moved from kernel/rcutree.c with 94% similarity]
kernel/rcu/tree.h [moved from kernel/rcutree.h with 99% similarity]
kernel/rcu/tree_plugin.h [moved from kernel/rcutree_plugin.h with 97% similarity]
kernel/rcu/tree_trace.c [moved from kernel/rcutree_trace.c with 99% similarity]
kernel/rcu/update.c [moved from kernel/rcupdate.c with 97% similarity]
kernel/sched/core.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/idle_task.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stats.h
kernel/sched/stop_task.c
kernel/smp.c
kernel/softirq.c
kernel/stop_machine.c
kernel/sysctl.c
kernel/time/Kconfig
kernel/time/clocksource.c
kernel/time/ntp.c
kernel/time/sched_clock.c
kernel/time/tick-broadcast.c
kernel/time/timer_stats.c
kernel/timer.c
kernel/wait.c
lib/locking-selftest.c
lib/smp_processor_id.c
mm/huge_memory.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/mm_init.c
mm/mmzone.c
mm/mprotect.c
mm/page_alloc.c
net/irda/af_irda.c
net/netfilter/ipvs/ip_vs_sync.c
tools/lib/traceevent/Makefile
tools/perf/.gitignore
tools/perf/Documentation/Makefile
tools/perf/Documentation/perf-buildid-cache.txt
tools/perf/Documentation/perf-kvm.txt
tools/perf/Documentation/perf-lock.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-timechart.txt
tools/perf/Documentation/perf-top.txt
tools/perf/Documentation/perf-trace.txt
tools/perf/Makefile
tools/perf/Makefile.perf [new file with mode: 0644]
tools/perf/arch/x86/include/perf_regs.h
tools/perf/arch/x86/util/unwind.c
tools/perf/bash_completion
tools/perf/bench/mem-memcpy-arch.h
tools/perf/bench/mem-memcpy.c
tools/perf/bench/mem-memset-arch.h
tools/perf/bench/mem-memset.c
tools/perf/bench/numa.c
tools/perf/bench/sched-pipe.c
tools/perf/builtin-annotate.c
tools/perf/builtin-bench.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-buildid-list.c
tools/perf/builtin-diff.c
tools/perf/builtin-evlist.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-lock.c
tools/perf/builtin-mem.c
tools/perf/builtin-probe.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/config/Makefile
tools/perf/config/feature-checks/Makefile [new file with mode: 0644]
tools/perf/config/feature-checks/test-all.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-backtrace.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-bionic.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-cplus-demangle.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-dwarf.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-fortify-source.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-glibc.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-gtk2-infobar.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-gtk2.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-hello.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libaudit.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libbfd.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libelf-getphdrnum.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libelf-mmap.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libelf.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libnuma.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libperl.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libpython-version.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libpython.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libslang.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libunwind-debug-frame.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-libunwind.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-on-exit.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-stackprotector-all.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-stackprotector.c [new file with mode: 0644]
tools/perf/config/feature-checks/test-volatile-register-var.c [new file with mode: 0644]
tools/perf/config/feature-tests.mak [deleted file]
tools/perf/config/utilities.mak
tools/perf/perf.c
tools/perf/perf.h
tools/perf/tests/dso-data.c
tools/perf/tests/hists_link.c
tools/perf/tests/perf-record.c
tools/perf/tests/task-exit.c
tools/perf/ui/browsers/annotate.c
tools/perf/ui/gtk/annotate.c
tools/perf/ui/gtk/browser.c
tools/perf/ui/gtk/gtk.h
tools/perf/ui/gtk/util.c
tools/perf/ui/setup.c
tools/perf/ui/ui.h
tools/perf/util/PERF-VERSION-GEN
tools/perf/util/annotate.c
tools/perf/util/annotate.h
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/callchain.h
tools/perf/util/data.c [new file with mode: 0644]
tools/perf/util/data.h [new file with mode: 0644]
tools/perf/util/dso.c
tools/perf/util/dso.h
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/generate-cmdlist.sh
tools/perf/util/header.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/include/dwarf-regs.h
tools/perf/util/include/linux/compiler.h
tools/perf/util/intlist.c
tools/perf/util/intlist.h
tools/perf/util/machine.c
tools/perf/util/machine.h
tools/perf/util/map.c
tools/perf/util/map.h
tools/perf/util/parse-events.c
tools/perf/util/parse-events.l
tools/perf/util/path.c
tools/perf/util/perf_regs.h
tools/perf/util/pmu.c
tools/perf/util/pmu.h
tools/perf/util/probe-event.c
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/python.c
tools/perf/util/rblist.c
tools/perf/util/rblist.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/srcline.c [new file with mode: 0644]
tools/perf/util/symbol-elf.c
tools/perf/util/symbol-minimal.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/top.h
tools/perf/util/trace-event-parse.c
tools/perf/util/trace-event.h
tools/perf/util/unwind.h
tools/perf/util/util.c
tools/perf/util/util.h
tools/scripts/Makefile.include

diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index fe397f90a34f50153f5936cfa13635ed7bc85b7b..6c9d9d37c83a30e24cce3ed25abf484e1bbd18f1 100644 (file)
@@ -87,7 +87,10 @@ X!Iinclude/linux/kobject.h
 !Ekernel/printk/printk.c
 !Ekernel/panic.c
 !Ekernel/sys.c
-!Ekernel/rcupdate.c
+!Ekernel/rcu/srcu.c
+!Ekernel/rcu/tree.c
+!Ekernel/rcu/tree_plugin.h
+!Ekernel/rcu/update.c
      </sect1>
 
      <sect1><title>Device Resource Management</title>
diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl
index d16d21b7a3b7aec2884c2183005c337b3d075002..46347f6033539e4b6a207cb5640c335158b6dd3c 100644 (file)
@@ -87,7 +87,7 @@
   <chapter id="rationale">
     <title>Rationale</title>
        <para>
-       The original implementation of interrupt handling in Linux is using
+       The original implementation of interrupt handling in Linux uses
        the __do_IRQ() super-handler, which is able to deal with every
        type of interrupt logic.
        </para>
        </itemizedlist>
        </para>
        <para>
-       This split implementation of highlevel IRQ handlers allows us to
+       This split implementation of high-level IRQ handlers allows us to
        optimize the flow of the interrupt handling for each specific
-       interrupt type. This reduces complexity in that particular codepath
+       interrupt type. This reduces complexity in that particular code path
        and allows the optimized handling of a given type.
        </para>
        <para>
        The original general IRQ implementation used hw_interrupt_type
        structures and their ->ack(), ->end() [etc.] callbacks to
        differentiate the flow control in the super-handler. This leads to
-       a mix of flow logic and lowlevel hardware logic, and it also leads
-       to unnecessary code duplication: for example in i386, there is a
-       ioapic_level_irq and a ioapic_edge_irq irq-type which share many
-       of the lowlevel details but have different flow handling.
+       a mix of flow logic and low-level hardware logic, and it also leads
+       to unnecessary code duplication: for example in i386, there is an
+       ioapic_level_irq and an ioapic_edge_irq IRQ-type which share many
+       of the low-level details but have different flow handling.
        </para>
        <para>
        A more natural abstraction is the clean separation of the
        <para>
        Analysing a couple of architectures' IRQ subsystem implementations
        reveals that most of them can use a generic set of 'irq flow'
-       methods and only need to add the chip level specific code.
+       methods and only need to add the chip-level specific code.
        The separation is also valuable for (sub)architectures
-       which need specific quirks in the irq flow itself but not in the
-       chip-details - and thus provides a more transparent IRQ subsystem
+       which need specific quirks in the IRQ flow itself but not in the
+       chip details - and thus provides a more transparent IRQ subsystem
        design.
        </para>
        <para>
-       Each interrupt descriptor is assigned its own highlevel flow
+       Each interrupt descriptor is assigned its own high-level flow
        handler, which is normally one of the generic
-       implementations. (This highlevel flow handler implementation also
+       implementations. (This high-level flow handler implementation also
        makes it simple to provide demultiplexing handlers which can be
        found in embedded platforms on various architectures.)
        </para>
        <para>
        The separation makes the generic interrupt handling layer more
        flexible and extensible. For example, a (sub)architecture can
-       use a generic irq-flow implementation for 'level type' interrupts
+       use a generic IRQ-flow implementation for 'level type' interrupts
        and add a (sub)architecture specific 'edge type' implementation.
        </para>
        <para>
     <para>
        There are three main levels of abstraction in the interrupt code:
        <orderedlist>
-         <listitem><para>Highlevel driver API</para></listitem>
-         <listitem><para>Highlevel IRQ flow handlers</para></listitem>
-         <listitem><para>Chiplevel hardware encapsulation</para></listitem>
+         <listitem><para>High-level driver API</para></listitem>
+         <listitem><para>High-level IRQ flow handlers</para></listitem>
+         <listitem><para>Chip-level hardware encapsulation</para></listitem>
        </orderedlist>
     </para>
     <sect1 id="Interrupt_control_flow">
        which are assigned to this interrupt.
        </para>
        <para>
-       Whenever an interrupt triggers, the lowlevel arch code calls into
-       the generic interrupt code by calling desc->handle_irq().
-       This highlevel IRQ handling function only uses desc->irq_data.chip
+       Whenever an interrupt triggers, the low-level architecture code calls
+       into the generic interrupt code by calling desc->handle_irq().
+       This high-level IRQ handling function only uses desc->irq_data.chip
        primitives referenced by the assigned chip descriptor structure.
        </para>
     </sect1>
     <sect1 id="Highlevel_Driver_API">
-       <title>Highlevel Driver API</title>
+       <title>High-level Driver API</title>
        <para>
-         The highlevel Driver API consists of following functions:
+         The high-level Driver API consists of the following functions:
          <itemizedlist>
          <listitem><para>request_irq()</para></listitem>
          <listitem><para>free_irq()</para></listitem>
        </para>
     </sect1>
     <sect1 id="Highlevel_IRQ_flow_handlers">
-       <title>Highlevel IRQ flow handlers</title>
+       <title>High-level IRQ flow handlers</title>
        <para>
          The generic layer provides a set of pre-defined irq-flow methods:
          <itemizedlist>
          <listitem><para>handle_edge_eoi_irq</para></listitem>
          <listitem><para>handle_bad_irq</para></listitem>
          </itemizedlist>
-         The interrupt flow handlers (either predefined or architecture
+         The interrupt flow handlers (either pre-defined or architecture
          specific) are assigned to specific interrupts by the architecture
          either during bootup or during device initialization.
        </para>
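
A minimal, hypothetical sketch of the high-level driver API covered by
the hunk above; the device name, IRQ number, and empty handler body are
illustrative assumptions, while request_irq(), free_irq(), irqreturn_t,
and IRQ_HANDLED are the real interfaces the document lists:

#include <linux/interrupt.h>

#define MY_DEV_IRQ	42	/* assumed interrupt line, for illustration */

static irqreturn_t my_dev_handler(int irq, void *dev_id)
{
	/* Service the (hypothetical) device here. */
	return IRQ_HANDLED;
}

static int my_dev_setup(void *my_dev)
{
	/* my_dev is handed back to the handler as dev_id. */
	return request_irq(MY_DEV_IRQ, my_dev_handler, 0, "my_dev", my_dev);
}

static void my_dev_teardown(void *my_dev)
{
	free_irq(MY_DEV_IRQ, my_dev);
}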
@@ -297,7 +297,7 @@ desc->irq_data.chip->irq_unmask();
                <para>
                handle_fasteoi_irq provides a generic implementation
                for interrupts, which only need an EOI at the end of
-               the handler
+               the handler.
                </para>
                <para>
                The following control flow is implemented (simplified excerpt):
@@ -394,7 +394,7 @@ if (desc->irq_data.chip->irq_eoi)
        The generic functions are intended for 'clean' architectures and chips,
        which have no platform-specific IRQ handling quirks. If an architecture
        needs to implement quirks on the 'flow' level then it can do so by
-       overriding the highlevel irq-flow handler.
+       overriding the high-level irq-flow handler.
        </para>
        </sect2>
        <sect2 id="Delayed_interrupt_disable">
@@ -419,9 +419,9 @@ if (desc->irq_data.chip->irq_eoi)
        </sect2>
     </sect1>
     <sect1 id="Chiplevel_hardware_encapsulation">
-       <title>Chiplevel hardware encapsulation</title>
+       <title>Chip-level hardware encapsulation</title>
        <para>
-       The chip level hardware descriptor structure irq_chip
+       The chip-level hardware descriptor structure irq_chip
        contains all the direct chip relevant functions, which
        can be utilized by the irq flow implementations.
          <itemizedlist>
@@ -429,14 +429,14 @@ if (desc->irq_data.chip->irq_eoi)
          <listitem><para>irq_mask_ack() - Optional, recommended for performance</para></listitem>
          <listitem><para>irq_mask()</para></listitem>
          <listitem><para>irq_unmask()</para></listitem>
-         <listitem><para>irq_eoi() - Optional, required for eoi flow handlers</para></listitem>
+         <listitem><para>irq_eoi() - Optional, required for EOI flow handlers</para></listitem>
          <listitem><para>irq_retrigger() - Optional</para></listitem>
          <listitem><para>irq_set_type() - Optional</para></listitem>
          <listitem><para>irq_set_wake() - Optional</para></listitem>
          </itemizedlist>
        These primitives are strictly intended to mean what they say: ack means
        ACK, masking means masking of an IRQ line, etc. It is up to the flow
-       handler(s) to use these basic units of lowlevel functionality.
+       handler(s) to use these basic units of low-level functionality.
        </para>
     </sect1>
   </chapter>
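
As a hedged sketch of the chip-level encapsulation above, a skeletal
irq_chip might wire the listed callbacks to an invented controller;
struct irq_chip, struct irq_data, and the .irq_ack/.irq_mask/.irq_unmask
fields are real kernel interfaces, while the my_chip_* bodies are
placeholders:

#include <linux/irq.h>

static void my_chip_irq_ack(struct irq_data *d)
{
	/* Would write the hypothetical controller's ACK register. */
}

static void my_chip_irq_mask(struct irq_data *d)
{
	/* Would set the mask bit for d->hwirq. */
}

static void my_chip_irq_unmask(struct irq_data *d)
{
	/* Would clear the mask bit for d->hwirq. */
}

static struct irq_chip my_chip = {
	.name		= "my-chip",
	.irq_ack	= my_chip_irq_ack,
	.irq_mask	= my_chip_irq_mask,
	.irq_unmask	= my_chip_irq_unmask,
	/* .irq_eoi, .irq_retrigger, .irq_set_type and .irq_set_wake
	 * are optional, as the list above notes. */
};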
@@ -445,7 +445,7 @@ if (desc->irq_data.chip->irq_eoi)
      <title>__do_IRQ entry point</title>
      <para>
        The original implementation __do_IRQ() was an alternative entry
-       point for all types of interrupts. It not longer exists.
+       point for all types of interrupts. It no longer exists.
      </para>
      <para>
        This handler turned out to be not suitable for all
@@ -468,11 +468,11 @@ if (desc->irq_data.chip->irq_eoi)
   <chapter id="genericchip">
      <title>Generic interrupt chip</title>
      <para>
-       To avoid copies of identical implementations of irq chips the
+       To avoid copies of identical implementations of IRQ chips the
        core provides a configurable generic interrupt chip
        implementation. Developers should check carefully whether
        the generic chip fits their needs before implementing the same
-       functionality slightly different themself.
+       functionality slightly differently themselves.
      </para>
 !Ekernel/irq/generic-chip.c
   </chapter>
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 7703ec73a9bbb225a45258c4cde11c8cc9dd236f..91266193b8f49e840709db77d75f8357428b2806 100644 (file)
@@ -202,8 +202,8 @@ over a rather long period of time, but improvements are always welcome!
        updater uses call_rcu_sched() or synchronize_sched(), then
        the corresponding readers must disable preemption, possibly
        by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
-       If the updater uses synchronize_srcu() or call_srcu(),
-       the the corresponding readers must use srcu_read_lock() and
+       If the updater uses synchronize_srcu() or call_srcu(), then
+       the corresponding readers must use srcu_read_lock() and
        srcu_read_unlock(), and with the same srcu_struct.  The rules for
        the expedited primitives are the same as for their non-expedited
        counterparts.  Mixing things up will result in confusion and
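
A minimal sketch of the SRCU pairing rule stated above, assuming a
hypothetical my_srcu domain; DEFINE_SRCU(), srcu_read_lock(),
srcu_read_unlock(), and synchronize_srcu() are the real primitives the
text names:

#include <linux/srcu.h>

DEFINE_SRCU(my_srcu);	/* hypothetical SRCU domain */

static void my_reader(void)
{
	int idx = srcu_read_lock(&my_srcu);
	/* ... read the my_srcu-protected data here ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void my_updater(void)
{
	/* ... unlink the old data ... */
	synchronize_srcu(&my_srcu);	/* waits only for my_srcu readers */
	/* ... now safe to free the old data ... */
}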
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 8e9359de1d28b2e845d25425be72451b3f6be4f7..6f3a0057548ec0b3c13d02fe5d82ea7b0bf2ece4 100644 (file)
@@ -12,12 +12,12 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
        This kernel configuration parameter defines the period of time
        that RCU will wait from the beginning of a grace period until it
        issues an RCU CPU stall warning.  This time period is normally
-       sixty seconds.
+       21 seconds.
 
        This configuration parameter may be changed at runtime via the
        /sys/module/rcutree/parameters/rcu_cpu_stall_timeout, however
        this parameter is checked only at the beginning of a cycle.
-       So if you are 30 seconds into a 70-second stall, setting this
+       So if you are 10 seconds into a 40-second stall, setting this
        sysfs parameter to (say) five will shorten the timeout for the
        -next- stall, or the following warning for the current stall
        (assuming the stall lasts long enough).  It will not affect the
@@ -32,7 +32,7 @@ CONFIG_RCU_CPU_STALL_VERBOSE
        also dump the stacks of any tasks that are blocking the current
        RCU-preempt grace period.
 
-RCU_CPU_STALL_INFO
+CONFIG_RCU_CPU_STALL_INFO
 
        This kernel configuration parameter causes the stall warning to
        print out additional per-CPU diagnostic information, including
@@ -43,7 +43,8 @@ RCU_STALL_DELAY_DELTA
        Although the lockdep facility is extremely useful, it does add
        some overhead.  Therefore, under CONFIG_PROVE_RCU, the
        RCU_STALL_DELAY_DELTA macro allows five extra seconds before
-       giving an RCU CPU stall warning message.
+       giving an RCU CPU stall warning message.  (This is a cpp
+       macro, not a kernel configuration parameter.)
 
 RCU_STALL_RAT_DELAY
 
@@ -52,7 +53,8 @@ RCU_STALL_RAT_DELAY
        However, if the offending CPU does not detect its own stall in
        the number of jiffies specified by RCU_STALL_RAT_DELAY, then
        some other CPU will complain.  This delay is normally set to
-       two jiffies.
+       two jiffies.  (This is a cpp macro, not a kernel configuration
+       parameter.)
 
 When a CPU detects that it is stalling, it will print a message similar
 to the following:
@@ -86,7 +88,12 @@ printing, there will be a spurious stall-warning message:
 
 INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
 
-This is rare, but does happen from time to time in real life.
+This is rare, but does happen from time to time in real life.  It is also
+possible for a zero-jiffy stall to be flagged in this case, depending
+on how the stall warning and the grace-period initialization happen to
+interact.  Please note that it is not possible to entirely eliminate this
+sort of false positive without resorting to things like stop_machine(),
+which is overkill for this sort of problem.
 
 If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set,
 more information is printed with the stall-warning message, for example:
@@ -216,4 +223,5 @@ that portion of the stack which remains the same from trace to trace.
 If you can reliably trigger the stall, ftrace can be quite helpful.
 
 RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
-and with RCU's event tracing.
+and with RCU's event tracing.  For information on RCU's event tracing,
+see include/trace/events/rcu.h.
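
As a concrete illustration of the runtime tuning described above
(assuming the sysfs path quoted in the text), raising the timeout to
300 seconds for subsequent stall warnings would be:

# echo 300 > /sys/module/rcutree/parameters/rcu_cpu_stall_timeout

As noted, the new value is checked only at the beginning of the next
cycle.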
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fcbb736d55feb439c1152be71355d5ab79f8edd2..17431e91a1b1280036771c667f4c4dec33c08ad0 100644 (file)
@@ -1975,6 +1975,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        noapic          [SMP,APIC] Tells the kernel to not make use of any
                        IOAPICs that may be present in the system.
 
+       nokaslr         [X86]
+                       Disable kernel base offset ASLR (Address Space
+                       Layout Randomization) if built into the kernel.
+
        noautogroup     Disable scheduler automatic task group creation.
 
        nobats          [PPC] Do not use BATs for mapping kernel lowmem
@@ -2599,7 +2603,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        ramdisk_size=   [RAM] Sizes of RAM disks in kilobytes
                        See Documentation/blockdev/ramdisk.txt.
 
-       rcu_nocbs=      [KNL,BOOT]
+       rcu_nocbs=      [KNL]
                        In kernels built with CONFIG_RCU_NOCB_CPU=y, set
                        the specified list of CPUs to be no-callback CPUs.
                        Invocation of these CPUs' RCU callbacks will
@@ -2612,7 +2616,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        real-time workloads.  It can also improve energy
                        efficiency for asymmetric multiprocessors.
 
-       rcu_nocb_poll   [KNL,BOOT]
+       rcu_nocb_poll   [KNL]
                        Rather than requiring that offloaded CPUs
                        (specified by rcu_nocbs= above) explicitly
                        awaken the corresponding "rcuoN" kthreads,
@@ -2623,126 +2627,145 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        energy efficiency by requiring that the kthreads
                        periodically wake up to do the polling.
 
-       rcutree.blimit= [KNL,BOOT]
+       rcutree.blimit= [KNL]
                        Set maximum number of finished RCU callbacks to process
                        in one batch.
 
-       rcutree.fanout_leaf=    [KNL,BOOT]
+       rcutree.rcu_fanout_leaf= [KNL]
                        Increase the number of CPUs assigned to each
                        leaf rcu_node structure.  Useful for very large
                        systems.
 
-       rcutree.jiffies_till_first_fqs= [KNL,BOOT]
+       rcutree.jiffies_till_first_fqs= [KNL]
                        Set delay from grace-period initialization to
                        first attempt to force quiescent states.
                        Units are jiffies, minimum value is zero,
                        and maximum value is HZ.
 
-       rcutree.jiffies_till_next_fqs= [KNL,BOOT]
+       rcutree.jiffies_till_next_fqs= [KNL]
                        Set delay between subsequent attempts to force
                        quiescent states.  Units are jiffies, minimum
                        value is one, and maximum value is HZ.
 
-       rcutree.qhimark=        [KNL,BOOT]
+       rcutree.qhimark= [KNL]
                        Set threshold of queued
                        RCU callbacks over which batch limiting is disabled.
 
-       rcutree.qlowmark=       [KNL,BOOT]
+       rcutree.qlowmark= [KNL]
                        Set threshold of queued RCU callbacks below which
                        batch limiting is re-enabled.
 
-       rcutree.rcu_cpu_stall_suppress= [KNL,BOOT]
-                       Suppress RCU CPU stall warning messages.
-
-       rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
-                       Set timeout for RCU CPU stall warning messages.
-
-       rcutree.rcu_idle_gp_delay=      [KNL,BOOT]
+       rcutree.rcu_idle_gp_delay= [KNL]
                        Set wakeup interval for idle CPUs that have
                        RCU callbacks (RCU_FAST_NO_HZ=y).
 
-       rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
+       rcutree.rcu_idle_lazy_gp_delay= [KNL]
                        Set wakeup interval for idle CPUs that have
                        only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
                        Lazy RCU callbacks are those which RCU can
                        prove do nothing more than free memory.
 
-       rcutorture.fqs_duration= [KNL,BOOT]
+       rcutorture.fqs_duration= [KNL]
                        Set duration of force_quiescent_state bursts.
 
-       rcutorture.fqs_holdoff= [KNL,BOOT]
+       rcutorture.fqs_holdoff= [KNL]
                        Set holdoff time within force_quiescent_state bursts.
 
-       rcutorture.fqs_stutter= [KNL,BOOT]
+       rcutorture.fqs_stutter= [KNL]
                        Set wait time between force_quiescent_state bursts.
 
-       rcutorture.irqreader= [KNL,BOOT]
-                       Test RCU readers from irq handlers.
+       rcutorture.gp_exp= [KNL]
+                       Use expedited update-side primitives.
+
+       rcutorture.gp_normal= [KNL]
+                       Use normal (non-expedited) update-side primitives.
+                       If both gp_exp and gp_normal are set, do both.
+                       If neither gp_exp nor gp_normal are set, still
+                       do both.
 
-       rcutorture.n_barrier_cbs= [KNL,BOOT]
+       rcutorture.n_barrier_cbs= [KNL]
                        Set callbacks/threads for rcu_barrier() testing.
 
-       rcutorture.nfakewriters= [KNL,BOOT]
+       rcutorture.nfakewriters= [KNL]
                        Set number of concurrent RCU writers.  These just
                        stress RCU, they don't participate in the actual
                        test, hence the "fake".
 
-       rcutorture.nreaders= [KNL,BOOT]
+       rcutorture.nreaders= [KNL]
                        Set number of RCU readers.
 
-       rcutorture.onoff_holdoff= [KNL,BOOT]
+       rcutorture.object_debug= [KNL]
+                       Enable debug-object double-call_rcu() testing.
+
+       rcutorture.onoff_holdoff= [KNL]
                        Set time (s) after boot for CPU-hotplug testing.
 
-       rcutorture.onoff_interval= [KNL,BOOT]
+       rcutorture.onoff_interval= [KNL]
                        Set time (s) between CPU-hotplug operations, or
                        zero to disable CPU-hotplug testing.
 
-       rcutorture.shuffle_interval= [KNL,BOOT]
+       rcutorture.rcutorture_runnable= [BOOT]
+                       Start rcutorture running at boot time.
+
+       rcutorture.shuffle_interval= [KNL]
                        Set task-shuffle interval (s).  Shuffling tasks
                        allows some CPUs to go into dyntick-idle mode
                        during the rcutorture test.
 
-       rcutorture.shutdown_secs= [KNL,BOOT]
+       rcutorture.shutdown_secs= [KNL]
                        Set time (s) after boot system shutdown.  This
                        is useful for hands-off automated testing.
 
-       rcutorture.stall_cpu= [KNL,BOOT]
+       rcutorture.stall_cpu= [KNL]
                        Duration of CPU stall (s) to test RCU CPU stall
                        warnings, zero to disable.
 
-       rcutorture.stall_cpu_holdoff= [KNL,BOOT]
+       rcutorture.stall_cpu_holdoff= [KNL]
                        Time to wait (s) after boot before inducing stall.
 
-       rcutorture.stat_interval= [KNL,BOOT]
+       rcutorture.stat_interval= [KNL]
                        Time (s) between statistics printk()s.
 
-       rcutorture.stutter= [KNL,BOOT]
+       rcutorture.stutter= [KNL]
                        Time (s) to stutter testing, for example, specifying
                        five seconds causes the test to run for five seconds,
                        wait for five seconds, and so on.  This tests RCU's
                        ability to transition abruptly to and from idle.
 
-       rcutorture.test_boost= [KNL,BOOT]
+       rcutorture.test_boost= [KNL]
                        Test RCU priority boosting?  0=no, 1=maybe, 2=yes.
                        "Maybe" means test if the RCU implementation
                        under test supports RCU priority boosting.
 
-       rcutorture.test_boost_duration= [KNL,BOOT]
+       rcutorture.test_boost_duration= [KNL]
                        Duration (s) of each individual boost test.
 
-       rcutorture.test_boost_interval= [KNL,BOOT]
+       rcutorture.test_boost_interval= [KNL]
                        Interval (s) between each boost test.
 
-       rcutorture.test_no_idle_hz= [KNL,BOOT]
+       rcutorture.test_no_idle_hz= [KNL]
                        Test RCU's dyntick-idle handling.  See also the
                        rcutorture.shuffle_interval parameter.
 
-       rcutorture.torture_type= [KNL,BOOT]
+       rcutorture.torture_type= [KNL]
                        Specify the RCU implementation to test.
 
-       rcutorture.verbose= [KNL,BOOT]
+       rcutorture.verbose= [KNL]
                        Enable additional printk() statements.
 
+       rcupdate.rcu_expedited= [KNL]
+                       Use expedited grace-period primitives, for
+                       example, synchronize_rcu_expedited() instead
+                       of synchronize_rcu().  This reduces latency,
+                       but can increase CPU utilization, degrade
+                       real-time latency, and degrade energy efficiency.
+
+       rcupdate.rcu_cpu_stall_suppress= [KNL]
+                       Suppress RCU CPU stall warning messages.
+
+       rcupdate.rcu_cpu_stall_timeout= [KNL]
+                       Set timeout for RCU CPU stall warning messages.
+
        rdinit=         [KNL]
                        Format: <full_path>
                        Run specified binary instead of /init from the ramdisk,
@@ -3471,11 +3494,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        default x2apic cluster mode on platforms
                        supporting x2apic.
 
-       x86_mrst_timer= [X86-32,APBT]
-                       Choose timer option for x86 Moorestown MID platform.
+       x86_intel_mid_timer= [X86-32,APBT]
+                       Choose timer option for x86 Intel MID platform.
                        Two valid options are apbt timer only and lapic timer
                        plus one apbt timer for broadcast timer.
-                       x86_mrst_timer=apbt_only | lapic_and_apbt
+                       x86_intel_mid_timer=apbt_only | lapic_and_apbt
 
        xen_emul_unplug=                [HW,X86,XEN]
                        Unplug Xen emulated devices
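
As a hypothetical illustration (not a recommendation), a boot command
line that offloads RCU callbacks from CPUs 1-7 and polls for them would
combine two of the parameters documented above:

	rcu_nocbs=1-7 rcu_nocb_poll

This assumes a kernel built with CONFIG_RCU_NOCB_CPU=y, as the
rcu_nocbs= entry requires.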
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 32351bfabf2038617c35326f4545ac9f33b02073..827104fb9364cf60df2359e45c3336f61d4568c5 100644 (file)
@@ -181,12 +181,17 @@ To reduce its OS jitter, do any of the following:
                make sure that this is safe on your particular system.
        d.      It is not possible to entirely get rid of OS jitter
                from vmstat_update() on CONFIG_SMP=y systems, but you
-               can decrease its frequency by writing a large value to
-               /proc/sys/vm/stat_interval.  The default value is HZ,
-               for an interval of one second.  Of course, larger values
-               will make your virtual-memory statistics update more
-               slowly.  Of course, you can also run your workload at
-               a real-time priority, thus preempting vmstat_update().
+               can decrease its frequency by writing a large value
+               to /proc/sys/vm/stat_interval.  The default value is
+               HZ, for an interval of one second.  Of course, larger
+               values will make your virtual-memory statistics update
+               more slowly.  Of course, you can also run your workload
+               at a real-time priority, thus preempting vmstat_update(),
+               but if your workload is CPU-bound, this is a bad idea.
+               However, there is an RFC patch from Christoph Lameter
+               (based on an earlier one from Gilad Ben-Yossef) that
+               reduces or even eliminates vmstat overhead for some
+               workloads at https://lkml.org/lkml/2013/9/4/379.
        e.      If running on high-end powerpc servers, build with
                CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
                daemon from running on each CPU every second or so.
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index dd2f7b26ca3077737dd93ee97217248216b1878d..72d010689751b3cc7c60349342ede7cd3f8faf49 100644 (file)
@@ -46,16 +46,14 @@ With these hooks we provide the following statistics:
  contentions       - number of lock acquisitions that had to wait
  wait time min     - shortest (non-0) time we ever had to wait for a lock
            max     - longest time we ever had to wait for a lock
-           total   - total time we spend waiting on this lock
+          total   - total time we spend waiting on this lock
+          avg     - average time spent waiting on this lock
  acq-bounces       - number of lock acquisitions that involved x-cpu data
  acquisitions      - number of times we took the lock
  hold time min     - shortest (non-0) time we ever held the lock
-           max     - longest time we ever held the lock
-           total   - total time this lock was held
-
-From these number various other statistics can be derived, such as:
-
- hold time average = hold time total / acquisitions
+          max     - longest time we ever held the lock
+          total   - total time this lock was held
+          avg     - average time this lock was held
 
 These numbers are gathered per lock class, per read/write state (when
 applicable).
@@ -84,37 +82,38 @@ Look at the current lock statistics:
 
 # less /proc/lock_stat
 
-01 lock_stat version 0.3
-02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-03                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+01 lock_stat version 0.4
+02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+03                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
+04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 05
-06                          &mm->mmap_sem-W:           233            538 18446744073708       22924.27      607243.51           1342          45806           1.71        8595.89     1180582.34
-07                          &mm->mmap_sem-R:           205            587 18446744073708       28403.36      731975.00           1940         412426           0.58      187825.45     6307502.88
-08                          ---------------
-09                            &mm->mmap_sem            487          [<ffffffff8053491f>] do_page_fault+0x466/0x928
-10                            &mm->mmap_sem            179          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
-11                            &mm->mmap_sem            279          [<ffffffff80210a57>] sys_mmap+0x75/0xce
-12                            &mm->mmap_sem             76          [<ffffffff802a490b>] sys_munmap+0x32/0x59
-13                          ---------------
-14                            &mm->mmap_sem            270          [<ffffffff80210a57>] sys_mmap+0x75/0xce
-15                            &mm->mmap_sem            431          [<ffffffff8053491f>] do_page_fault+0x466/0x928
-16                            &mm->mmap_sem            138          [<ffffffff802a490b>] sys_munmap+0x32/0x59
-17                            &mm->mmap_sem            145          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+06                         &mm->mmap_sem-W:            46             84           0.26         939.10       16371.53         194.90          47291        2922365           0.16     2220301.69 17464026916.32        5975.99
+07                         &mm->mmap_sem-R:            37            100           1.31      299502.61      325629.52        3256.30         212344       34316685           0.10        7744.91    95016910.20           2.77
+08                         ---------------
+09                           &mm->mmap_sem              1          [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
+10                           &mm->mmap_sem             96          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+11                           &mm->mmap_sem             34          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+12                           &mm->mmap_sem             17          [<ffffffff81127e71>] vm_munmap+0x41/0x80
+13                         ---------------
+14                           &mm->mmap_sem              1          [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
+15                           &mm->mmap_sem             60          [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
+16                           &mm->mmap_sem             41          [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+17                           &mm->mmap_sem             68          [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
 18
-19 ...............................................................................................................................................................................................
+19.............................................................................................................................................................................................................................
 20
-21                              dcache_lock:           621            623           0.52         118.26        1053.02           6745          91930           0.29         316.29      118423.41
-22                              -----------
-23                              dcache_lock            179          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
-24                              dcache_lock            113          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
-25                              dcache_lock             99          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
-26                              dcache_lock            104          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
-27                              -----------
-28                              dcache_lock            192          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
-29                              dcache_lock             98          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
-30                              dcache_lock             72          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
-31                              dcache_lock            112          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
+21                         unix_table_lock:           110            112           0.21          49.24         163.91           1.46          21094          66312           0.12         624.42       31589.81           0.48
+22                         ---------------
+23                         unix_table_lock             45          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+24                         unix_table_lock             47          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+25                         unix_table_lock             15          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+26                         unix_table_lock              5          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+27                         ---------------
+28                         unix_table_lock             39          [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+29                         unix_table_lock             49          [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+30                         unix_table_lock             20          [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+31                         unix_table_lock              4          [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+
 
 This excerpt shows the first two lock class statistics. Line 01 shows the
 output version - each time the format changes this will be updated. Lines 02-04
@@ -131,30 +130,30 @@ The integer part of the time values is in us.
 
 Dealing with nested locks, subclasses may appear:
 
-32...............................................................................................................................................................................................
+32...........................................................................................................................................................................................................................
 33
-34                               &rq->lock:         13128          13128           0.43         190.53      103881.26          97454        3453404           0.00         401.11    13224683.11
+34                               &rq->lock:       13128          13128           0.43         190.53      103881.26           7.91          97454        3453404           0.00         401.11    13224683.11           3.82
 35                               ---------
-36                               &rq->lock            645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-37                               &rq->lock            297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-38                               &rq->lock            360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
-39                               &rq->lock            428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+36                               &rq->lock          645          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+37                               &rq->lock          297          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+38                               &rq->lock          360          [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+39                               &rq->lock          428          [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
 40                               ---------
-41                               &rq->lock             77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-42                               &rq->lock            174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-43                               &rq->lock           4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-44                               &rq->lock            893          [<ffffffff81340524>] schedule+0x157/0x7b8
+41                               &rq->lock           77          [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+42                               &rq->lock          174          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+43                               &rq->lock         4715          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+44                               &rq->lock          893          [<ffffffff81340524>] schedule+0x157/0x7b8
 45
-46...............................................................................................................................................................................................
+46...........................................................................................................................................................................................................................
 47
-48                             &rq->lock/1:         11526          11488           0.33         388.73      136294.31          21461          38404           0.00          37.93      109388.53
+48                             &rq->lock/1:        1526          11488           0.33         388.73      136294.31          11.86          21461          38404           0.00          37.93      109388.53           2.84
 49                             -----------
-50                             &rq->lock/1          11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+50                             &rq->lock/1        11526          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
 51                             -----------
-52                             &rq->lock/1           5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-53                             &rq->lock/1           1224          [<ffffffff81340524>] schedule+0x157/0x7b8
-54                             &rq->lock/1           4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-55                             &rq->lock/1            181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+52                             &rq->lock/1         5645          [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+53                             &rq->lock/1         1224          [<ffffffff81340524>] schedule+0x157/0x7b8
+54                             &rq->lock/1         4336          [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+55                             &rq->lock/1          181          [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
 
 Line 48 shows statistics for the second subclass (/1) of &rq->lock class
 (subclass starts from 0), since in this case, as line 50 suggests,
@@ -163,16 +162,16 @@ double_rq_lock actually acquires a nested lock of two spinlocks.
 View the top contending locks:
 
 # grep : /proc/lock_stat | head
-              &inode->i_data.tree_lock-W:            15          21657           0.18     1093295.30 11547131054.85             58          10415           0.16          87.51        6387.60
-              &inode->i_data.tree_lock-R:             0              0           0.00           0.00           0.00          23302         231198           0.25           8.45       98023.38
-                             dcache_lock:          1037           1161           0.38          45.32         774.51           6611         243371           0.15         306.48       77387.24
-                         &inode->i_mutex:           161            286 18446744073709       62882.54     1244614.55           3653          20598 18446744073709       62318.60     1693822.74
-                         &zone->lru_lock:            94             94           0.53           7.33          92.10           4366          32690           0.29          59.81       16350.06
-              &inode->i_data.i_mmap_mutex:            79             79           0.40           3.77          53.03          11779          87755           0.28         116.93       29898.44
-                        &q->__queue_lock:            48             50           0.52          31.62          86.31            774          13131           0.17         113.08       12277.52
-                        &rq->rq_lock_key:            43             47           0.74          68.50         170.63           3706          33929           0.22         107.99       17460.62
-                      &rq->rq_lock_key#2:            39             46           0.75           6.68          49.03           2979          32292           0.17         125.17       17137.63
-                         tasklist_lock-W:            15             15           1.45          10.87          32.70           1201           7390           0.58          62.55       13648.47
+                       clockevents_lock:       2926159        2947636           0.15       46882.81  1784540466.34         605.41        3381345        3879161           0.00        2260.97    53178395.68          13.71
+                    tick_broadcast_lock:        346460         346717           0.18        2257.43    39364622.71         113.54        3642919        4242696           0.00        2263.79    49173646.60          11.59
+                 &mapping->i_mmap_mutex:        203896         203899           3.36      645530.05 31767507988.39      155800.21        3361776        8893984           0.17        2254.15    14110121.02           1.59
+                              &rq->lock:        135014         136909           0.18         606.09      842160.68           6.15        1540728       10436146           0.00         728.72    17606683.41           1.69
+              &(&zone->lru_lock)->rlock:         93000          94934           0.16          59.18      188253.78           1.98        1199912        3809894           0.15         391.40     3559518.81           0.93
+                        tasklist_lock-W:         40667          41130           0.23        1189.42      428980.51          10.43         270278         510106           0.16         653.51     3939674.91           7.72
+                        tasklist_lock-R:         21298          21305           0.20        1310.05      215511.12          10.12         186204         241258           0.14        1162.33     1179779.23           4.89
+                             rcu_node_1:         47656          49022           0.16         635.41      193616.41           3.95         844888        1865423           0.00         764.26     1656226.96           0.89
+       &(&dentry->d_lockref.lock)->rlock:         39791          40179           0.15        1302.08       88851.96           2.21        2790851       12527025           0.10        1910.75     3379714.27           0.27
+                             rcu_node_0:         29203          30064           0.16         786.55     1555573.00          51.74          88963         244254           0.00         398.87      428872.51           1.76
 
 Clear the statistics:
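
(For reference, the unchanged lockstat.txt text that follows this hunk
clears the counters by writing zero to the proc file:

# echo 0 > /proc/lock_stat
)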
 
index 9d4c1d18ad447e0db3dbbbc9776a29e4c93450c0..4273b2d71a278cf26d3d381192c1de8ac4d7a340 100644 (file)
@@ -355,6 +355,82 @@ utilize.
 
 ==============================================================
 
+numa_balancing
+
+Enables/disables automatic page-fault-based NUMA memory balancing, which
+moves memory to the nodes that access it most often. On NUMA machines,
+there is a performance penalty if remote memory is accessed by a CPU.
+When this feature is enabled the kernel samples which task or thread is
+accessing memory by periodically unmapping pages and later trapping a
+page fault. At the time of the page fault, it is determined whether the
+data being accessed should be migrated to a local memory node.
+
+The unmapping of pages and trapping of faults incurs additional overhead
+that ideally is offset by improved memory locality, but there is no
+universal guarantee. If the target workload is already bound to NUMA
+nodes then this feature should be disabled. Otherwise, if the system
+overhead from the feature is too high, the rate at which the kernel
+samples for NUMA hinting faults may be controlled by the
+numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
+numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb,
+numa_balancing_settle_count and numa_balancing_migrate_deferred sysctls.
+
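(As a usage illustration, not part of this patch: the switch can be
flipped from userspace by writing to the corresponding proc file. A
minimal C sketch, assuming a kernel built with CONFIG_NUMA_BALANCING
and a privileged caller:

#include <stdio.h>

int main(void)
{
        /* 1 enables automatic NUMA balancing, 0 disables it */
        FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

        if (!f) {
                perror("numa_balancing");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}
)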
+==============================================================
+
+numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
+numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
+
+Automatic NUMA balancing scans a task's address space and unmaps pages to
+detect whether pages are properly placed or whether the data should be
+migrated to a memory node local to where the task is running. Every "scan
+delay" interval, the task scans the next "scan size" number of pages in
+its address space. When the end of the address space is reached the
+scanner restarts from the beginning.
+
+In combination, the "scan delay" and "scan size" determine the scan rate.
+When "scan delay" decreases, the scan rate increases.  The scan delay and
+hence the scan rate of every task is adaptive and depends on historical
+behaviour. If pages are properly placed then the scan delay increases,
+otherwise the scan delay decreases.  The "scan size" is not adaptive but
+the higher the "scan size", the higher the scan rate.
+
+Higher scan rates incur higher system overhead as page faults must be
+trapped and potentially data must be migrated. However, the higher the
+scan rate, the more quickly a task's memory is migrated to a local node
+when the workload pattern changes, minimising the performance impact of
+remote memory accesses. These sysctls control the thresholds for scan
+delays and the number of pages scanned.
+
+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
+scan a task's virtual memory. It effectively controls the maximum
+scanning rate for each task.
+
+numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
+when it initially forks.
+
+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
+scan a task's virtual memory. It effectively controls the minimum
+scanning rate for each task.
+
+numa_balancing_scan_size_mb is how many megabytes worth of pages are
+scanned in a given scan.
+
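(To make the relationship concrete with illustrative numbers, not
defaults asserted by this patch: if numa_balancing_scan_period_min_ms
were 1000 and numa_balancing_scan_size_mb were 256, a task's address
space could be scanned at most 256MB per second; halving the minimum
period or doubling the scan size doubles that ceiling.)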
+numa_balancing_settle_count is how many scan periods must complete before
+the scheduler's load balancer stops pushing the task towards a preferred
+node. This gives the scheduler a chance to place the task on an
+alternative node if the preferred node is overloaded.
+
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is
+shared with other tasks. This reduces page migration overhead, and
+determines how much stronger the "move task near its memory" scheduler
+policy becomes, versus the "move memory near its task" memory management
+policy, for workloads with shared memory.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
index 10a90130579c8f603ff9998e82dca556ee847744..316572ad49f8a8dd02ab5b2c601aaf5938542f11 100644 (file)
@@ -6971,7 +6971,7 @@ M:        "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 S:     Supported
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
 F:     Documentation/RCU/torture.txt
-F:     kernel/rcutorture.c
+F:     kernel/rcu/torture.c
 
 RDC R-321X SoC
 M:     Florian Fainelli <florian@openwrt.org>
@@ -6998,8 +6998,9 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
 F:     Documentation/RCU/
 X:     Documentation/RCU/torture.txt
 F:     include/linux/rcu*
-F:     kernel/rcu*
-X:     kernel/rcutorture.c
+X:     include/linux/srcu.h
+F:     kernel/rcu/
+X:     kernel/rcu/torture.c
 
 REAL TIME CLOCK (RTC) SUBSYSTEM
 M:     Alessandro Zummo <a.zummo@towertech.it>
@@ -7324,6 +7325,8 @@ S:        Maintained
 F:     kernel/sched/
 F:     include/linux/sched.h
 F:     include/uapi/linux/sched.h
+F:     kernel/wait.c
+F:     include/linux/wait.h
 
 SCORE ARCHITECTURE
 M:     Chen Liqin <liqin.linux@gmail.com>
@@ -7687,8 +7690,8 @@ M:        "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 W:     http://www.rdrop.com/users/paulmck/RCU/
 S:     Supported
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
-F:     include/linux/srcu*
-F:     kernel/srcu*
+F:     include/linux/srcu.h
+F:     kernel/rcu/srcu.c
 
 SMACK SECURITY MODULE
 M:     Casey Schaufler <casey@schaufler-ca.com>
index af2cc6eabcc781c4e8f7ee2067de75844ed5874d..ded747c7b74c32e66fa681d7e710528c23421f9e 100644 (file)
@@ -353,6 +353,18 @@ config HAVE_CONTEXT_TRACKING
 config HAVE_VIRT_CPU_ACCOUNTING
        bool
 
+config HAVE_VIRT_CPU_ACCOUNTING_GEN
+       bool
+       default y if 64BIT
+       help
+         With VIRT_CPU_ACCOUNTING_GEN, cputime_t becomes 64-bit.
+         Before enabling this option, arch code must be audited
+         to ensure there are no races in concurrent read/write of
+         cputime_t. For example, reading/writing 64-bit cputime_t on
+         some 32-bit arches may require multiple accesses, so proper
+         locking is needed to protect against concurrent accesses.
+
+
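(As an illustration of the race the help text warns about, and not code
from this merge: on 32-bit, loading a 64-bit cputime_t may take two word
accesses, so one conventional fix is a seqcount-protected read:

static seqcount_t cputime_seq;  /* seqcount_init() during setup */
static u64 cputime_ns;

static u64 cputime_read(void)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqcount_begin(&cputime_seq);
                val = cputime_ns;       /* may tear without the seqcount */
        } while (read_seqcount_retry(&cputime_seq, seq));

        return val;
}

The writer side would bump the seqcount around its update.)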
 config HAVE_IRQ_TIME_ACCOUNTING
        bool
        help
@@ -390,6 +402,16 @@ config HAVE_UNDERSCORE_SYMBOL_PREFIX
          Some architectures generate an _ in front of C symbols; things like
          module loading and assembly files need to know about this.
 
+config HAVE_IRQ_EXIT_ON_IRQ_STACK
+       bool
+       help
+         The architecture executes not only the irq handler on the irq
+         stack but also irq_exit(). This way we can process softirqs on
+         this irq stack instead of switching to a new one when we call
+         __do_softirq() at the end of a hardirq.
+         This spares a stack switch and improves cache usage for softirq
+         processing.
+
 #
 # ABI hall of shame
 #
index a6e85f448c1c0f3723eb6e4b91363191d4789b3c..f01fb505ad52ffac8bb98c2619dfd2d7f2c204ab 100644 (file)
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 
 generic-y += exec.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index d8dd660898b9b214927c903e1198e36026513984..5943f7f9d32550313a5ef767e29ead6e64f5e3f6 100644 (file)
@@ -46,3 +46,4 @@ generic-y += ucontext.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index b6a708ef6067124e3b8b7e3d0cc8aebccb72787d..5e27ab643757e84871ef6616b5ae8acb100d72d7 100644 (file)
@@ -58,6 +58,7 @@ config ARM
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UID16
+       select HAVE_VIRT_CPU_ACCOUNTING_GEN
        select IRQ_FORCED_THREADING
        select KTIME_SCALAR
        select MODULES_USE_ELF_REL
index e32b92b949d2fe762d52ed40e315421786b16113..e7f73b2e45501772b94ce8489ea8e67f4e8c3ac0 100644 (file)
                        };
                };
 
+               global_timer: timer@f8f00200 {
+                       compatible = "arm,cortex-a9-global-timer";
+                       reg = <0xf8f00200 0x20>;
+                       interrupts = <1 11 0x301>;
+                       interrupt-parent = <&intc>;
+                       clocks = <&clkc 4>;
+               };
+
                ttc0: ttc0@f8001000 {
                        interrupt-parent = <&intc>;
                        interrupts = < 0 10 4 0 11 4 0 12 4 >;
index a6395c0277152f74645b8fceb572abc9b1c5a2db..c38b58c8020215f54af8140e97de8467ad3c3c16 100644 (file)
@@ -33,3 +33,4 @@ generic-y += termios.h
 generic-y += timex.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
+generic-y += preempt.h
index 5665134bfa3ef28043fb7a41c9843cfa8669f465..0704e0cf557146da78c086079daeec8c0112d326 100644 (file)
@@ -87,17 +87,43 @@ static inline u64 arch_counter_get_cntvct(void)
        return cval;
 }
 
-static inline void arch_counter_set_user_access(void)
+static inline u32 arch_timer_get_cntkctl(void)
 {
        u32 cntkctl;
-
        asm volatile("mrc p15, 0, %0, c14, c1, 0" : "=r" (cntkctl));
+       return cntkctl;
+}
 
-       /* disable user access to everything */
-       cntkctl &= ~((3 << 8) | (7 << 0));
-
+static inline void arch_timer_set_cntkctl(u32 cntkctl)
+{
        asm volatile("mcr p15, 0, %0, c14, c1, 0" : : "r" (cntkctl));
 }
+
+static inline void arch_counter_set_user_access(void)
+{
+       u32 cntkctl = arch_timer_get_cntkctl();
+
+       /* Disable user access to both physical/virtual counters/timers */
+       /* Also disable virtual event stream */
+       cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN
+                       | ARCH_TIMER_USR_VT_ACCESS_EN
+                       | ARCH_TIMER_VIRT_EVT_EN
+                       | ARCH_TIMER_USR_VCT_ACCESS_EN
+                       | ARCH_TIMER_USR_PCT_ACCESS_EN);
+       arch_timer_set_cntkctl(cntkctl);
+}
+
+static inline void arch_timer_evtstrm_enable(int divider)
+{
+       u32 cntkctl = arch_timer_get_cntkctl();
+       cntkctl &= ~ARCH_TIMER_EVT_TRIGGER_MASK;
+       /* Set the divider and enable virtual event stream */
+       cntkctl |= (divider << ARCH_TIMER_EVT_TRIGGER_SHIFT)
+                       | ARCH_TIMER_VIRT_EVT_EN;
+       arch_timer_set_cntkctl(cntkctl);
+       elf_hwcap |= HWCAP_EVTSTRM;
+}
+
 #endif
 
 #endif
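(The divider passed to arch_timer_evtstrm_enable() selects which counter
bit generates wakeup events. Elsewhere in this series the clocksource
driver derives it from the counter frequency; roughly, assuming the
series' 10 kHz ARCH_TIMER_EVT_STREAM_FREQ target:

        int evt_stream_div, pos;

        /* find the closest power of two to the divisor */
        evt_stream_div = arch_timer_rate / ARCH_TIMER_EVT_STREAM_FREQ;
        pos = fls(evt_stream_div);
        if (pos > 1 && !(evt_stream_div & (1 << (pos - 2))))
                pos--;
        arch_timer_evtstrm_enable(min(pos, 15));

The divider is capped at 15 because CNTKCTL reserves four bits for it.)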
index 6d34d080372abe0fd14769c7c6e7360aece5dfdd..7dcc10d6725352b9b1d35f7c5583187cb3a744c1 100644 (file)
@@ -26,5 +26,6 @@
 #define HWCAP_VFPD32   (1 << 19)       /* set if VFP has 32 regs (not 16) */
 #define HWCAP_IDIV     (HWCAP_IDIVA | HWCAP_IDIVT)
 #define HWCAP_LPAE     (1 << 20)
+#define HWCAP_EVTSTRM  (1 << 21)
 
 #endif /* _UAPI__ASMARM_HWCAP_H */
index 221f07b11ccb0ad4c79cdcbb65072ffb2b822b0a..1791f12c180bbe3264bc6c48849f1f6ed4aaecde 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/errno.h>
-#include <linux/sched_clock.h>
 
 #include <asm/delay.h>
 
@@ -22,13 +21,6 @@ static unsigned long arch_timer_read_counter_long(void)
        return arch_timer_read_counter();
 }
 
-static u32 sched_clock_mult __read_mostly;
-
-static unsigned long long notrace arch_timer_sched_clock(void)
-{
-       return arch_timer_read_counter() * sched_clock_mult;
-}
-
 static struct delay_timer arch_delay_timer;
 
 static void __init arch_timer_delay_timer_register(void)
@@ -48,11 +40,5 @@ int __init arch_timer_arch_init(void)
 
        arch_timer_delay_timer_register();
 
-       /* Cache the sched_clock multiplier to save a divide in the hot path. */
-       sched_clock_mult = NSEC_PER_SEC / arch_timer_rate;
-       sched_clock_func = arch_timer_sched_clock;
-       pr_info("sched_clock: ARM arch timer >56 bits at %ukHz, resolution %uns\n",
-               arch_timer_rate / 1000, sched_clock_mult);
-
        return 0;
 }
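(The multiplier-based sched_clock removed here is superseded by the
generic sched_clock framework; the clocksource driver side of this
series registers the 56-bit counter instead, roughly:

        sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate);

so the arch code no longer needs its own hot-path multiply.)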
index 6b4ce802ac4ea3e2d88bbe04adce6713651fb066..ffb5809efc626b6c6eb61a0d30fc03104ec8ec39 100644 (file)
@@ -981,6 +981,7 @@ static const char *hwcap_str[] = {
        "idivt",
        "vfpd32",
        "lpae",
+       "evtstrm",
        NULL
 };
 
index 696fb73296d0e7bd0d4419e5363c496794a73079..1e9c3383daba7e6995f301527eb480b5ebdb8662 100644 (file)
@@ -274,7 +274,6 @@ static void __init msm_dt_timer_init(struct device_node *np)
                pr_err("Unknown frequency\n");
                return;
        }
-       of_node_put(np);
 
        event_base = base + 0x4;
        sts_base = base + 0x88;
index 04f8a4a6e755e79b4c519cdbb0e487e6fcb99b78..6b04260aa142da12e290b20aab00a35cc105d364 100644 (file)
@@ -13,5 +13,6 @@ config ARCH_ZYNQ
        select HAVE_SMP
        select SPARSE_IRQ
        select CADENCE_TTC_TIMER
+       select ARM_GLOBAL_TIMER
        help
          Support for Xilinx Zynq ARM Cortex A9 Platform
index c04454876bcbe6520a52ada910a7d193ff884e85..35fd0eb572703c23c627f56e6597aec81438a041 100644 (file)
@@ -14,6 +14,7 @@ config ARM64
        select GENERIC_IOMAP
        select GENERIC_IRQ_PROBE
        select GENERIC_IRQ_SHOW
+       select GENERIC_SCHED_CLOCK
        select GENERIC_SMP_IDLE_THREAD
        select GENERIC_TIME_VSYSCALL
        select HARDIRQS_SW_RESEND
index 79a642d199f204247d7232caa69dbb9d78a18a79..519f89f5b6a3de345e05ee79a9d1b66116131e77 100644 (file)
@@ -50,3 +50,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index c9f1d2816c2bb9086be3b2e001c5d81921a076bf..9400596a0f3972a91a8858da56e9d7097e4b9319 100644 (file)
@@ -92,19 +92,49 @@ static inline u32 arch_timer_get_cntfrq(void)
        return val;
 }
 
-static inline void arch_counter_set_user_access(void)
+static inline u32 arch_timer_get_cntkctl(void)
 {
        u32 cntkctl;
-
-       /* Disable user access to the timers and the physical counter. */
        asm volatile("mrs       %0, cntkctl_el1" : "=r" (cntkctl));
-       cntkctl &= ~((3 << 8) | (1 << 0));
+       return cntkctl;
+}
 
-       /* Enable user access to the virtual counter and frequency. */
-       cntkctl |= (1 << 1);
+static inline void arch_timer_set_cntkctl(u32 cntkctl)
+{
        asm volatile("msr       cntkctl_el1, %0" : : "r" (cntkctl));
 }
 
+static inline void arch_counter_set_user_access(void)
+{
+       u32 cntkctl = arch_timer_get_cntkctl();
+
+       /* Disable user access to the timers and the physical counter */
+       /* Also disable virtual event stream */
+       cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN
+                       | ARCH_TIMER_USR_VT_ACCESS_EN
+                       | ARCH_TIMER_VIRT_EVT_EN
+                       | ARCH_TIMER_USR_PCT_ACCESS_EN);
+
+       /* Enable user access to the virtual counter */
+       cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+
+       arch_timer_set_cntkctl(cntkctl);
+}
+
+static inline void arch_timer_evtstrm_enable(int divider)
+{
+       u32 cntkctl = arch_timer_get_cntkctl();
+       cntkctl &= ~ARCH_TIMER_EVT_TRIGGER_MASK;
+       /* Set the divider and enable virtual event stream */
+       cntkctl |= (divider << ARCH_TIMER_EVT_TRIGGER_SHIFT)
+                       | ARCH_TIMER_VIRT_EVT_EN;
+       arch_timer_set_cntkctl(cntkctl);
+       elf_hwcap |= HWCAP_EVTSTRM;
+#ifdef CONFIG_COMPAT
+       compat_elf_hwcap |= COMPAT_HWCAP_EVTSTRM;
+#endif
+}
+
 static inline u64 arch_counter_get_cntvct(void)
 {
        u64 cval;
index e2950b098e76f68803559ca264d7bed94ffa4789..6cddbb0c9f5459cff851101fd3010ad74882a1ef 100644 (file)
@@ -30,6 +30,7 @@
 #define COMPAT_HWCAP_IDIVA     (1 << 17)
 #define COMPAT_HWCAP_IDIVT     (1 << 18)
 #define COMPAT_HWCAP_IDIV      (COMPAT_HWCAP_IDIVA|COMPAT_HWCAP_IDIVT)
+#define COMPAT_HWCAP_EVTSTRM   (1 << 21)
 
 #ifndef __ASSEMBLY__
 /*
  * instruction set this cpu supports.
  */
 #define ELF_HWCAP              (elf_hwcap)
-#define COMPAT_ELF_HWCAP       (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
-                                COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
-                                COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\
-                                COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
-                                COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV)
+
+#ifdef CONFIG_COMPAT
+#define COMPAT_ELF_HWCAP       (compat_elf_hwcap)
+extern unsigned int compat_elf_hwcap;
+#endif
 
 extern unsigned long elf_hwcap;
 #endif
index eea497578b877194473f457d8ef827aba18e0943..9b12476e9c8567545c493704075f56c8643bbae6 100644 (file)
@@ -21,6 +21,7 @@
  */
 #define HWCAP_FP               (1 << 0)
 #define HWCAP_ASIMD            (1 << 1)
+#define HWCAP_EVTSTRM          (1 << 2)
 
 
 #endif /* _UAPI__ASM_HWCAP_H */
index a4ed2d3e4de9e158ad66d44c370ed519355d3902..c7ad57b4391893f386c1a6b6fd871498125c90a2 100644 (file)
@@ -60,6 +60,16 @@ EXPORT_SYMBOL(processor_id);
 unsigned long elf_hwcap __read_mostly;
 EXPORT_SYMBOL_GPL(elf_hwcap);
 
+#ifdef CONFIG_COMPAT
+#define COMPAT_ELF_HWCAP_DEFAULT       \
+                               (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
+                                COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
+                                COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\
+                                COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
+                                COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV)
+unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
+#endif
+
 static const char *cpu_name;
 static const char *machine_name;
 phys_addr_t __fdt_pointer __initdata;
@@ -252,6 +262,7 @@ subsys_initcall(topology_init);
 static const char *hwcap_str[] = {
        "fp",
        "asimd",
+       "evtstrm",
        NULL
 };
 
index 03dc3718eb136d24db7295133709a4a9e92c21b5..29c39d5d77e31983d49ff2754b0413cd5bd48ff8 100644 (file)
@@ -61,13 +61,6 @@ unsigned long profile_pc(struct pt_regs *regs)
 EXPORT_SYMBOL(profile_pc);
 #endif
 
-static u64 sched_clock_mult __read_mostly;
-
-unsigned long long notrace sched_clock(void)
-{
-       return arch_timer_read_counter() * sched_clock_mult;
-}
-
 void __init time_init(void)
 {
        u32 arch_timer_rate;
@@ -78,9 +71,6 @@ void __init time_init(void)
        if (!arch_timer_rate)
                panic("Unable to initialise architected timer.\n");
 
-       /* Cache the sched_clock multiplier to save a divide in the hot path. */
-       sched_clock_mult = NSEC_PER_SEC / arch_timer_rate;
-
        /* Calibrate the delay loop directly */
        lpj_fine = arch_timer_rate / HZ;
 }
index fd7980743890b2ebcf5ff9b5ca40fc4ef83de220..658001b52400583ed3bcc1867f70f99ef9c97cb5 100644 (file)
@@ -7,6 +7,7 @@ generic-y       += div64.h
 generic-y       += emergency-restart.h
 generic-y      += exec.h
 generic-y       += futex.h
+generic-y      += preempt.h
 generic-y       += irq_regs.h
 generic-y      += param.h
 generic-y       += local.h
index 127826f8a375ab9d7f6b4d9a9cdd3bed641534f6..f2b43474b0e2f13134b48e860d420b8f144a0a73 100644 (file)
@@ -44,3 +44,4 @@ generic-y += ucontext.h
 generic-y += unaligned.h
 generic-y += user.h
 generic-y += xor.h
+generic-y += preempt.h
index e49f918531ad443671b6226a9f0f951f60156b7f..fc0b3c356027d26cbf46cc10ca0fc0056cb4992f 100644 (file)
@@ -56,3 +56,4 @@ generic-y += ucontext.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index c8325455520ef932fcbbb4ce44834d3618568a4e..b06caf649a954a6c1678b192dc947adeed692c5e 100644 (file)
@@ -11,3 +11,4 @@ generic-y += module.h
 generic-y += trace_clock.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index c5d76702830603f4d2fbde47e26412c179acfde6..74742dc6a3daabd3ec3a4b2c9c5af03f90d7685c 100644 (file)
@@ -2,3 +2,4 @@
 generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 1da17caac23c9b353c19b84263905c8c75a1d4d4..67c3450309b7454881d09a162ecb2e5123b4395b 100644 (file)
@@ -53,3 +53,4 @@ generic-y += types.h
 generic-y += ucontext.h
 generic-y += unaligned.h
 generic-y += xor.h
+generic-y += preempt.h
index a3456f34f6722ec4d20f20232970182323a9c972..f93ee087e8fe622dcaf9e409910075eb541a73bf 100644 (file)
@@ -3,4 +3,5 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += kvm_para.h
 generic-y += trace_clock.h
+generic-y += preempt.h
 generic-y += vtime.h
\ No newline at end of file
index 74a7cc3293bc3aa279c735e4144a8cd7439d0172..0d2bcb37ec353019ad24b33d61ba67a80e687dbd 100644 (file)
@@ -424,6 +424,7 @@ extern void __iomem * ioremap(unsigned long offset, unsigned long size);
 extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
 extern void iounmap (volatile void __iomem *addr);
 extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size);
+#define early_memremap(phys_addr, size)        early_ioremap(phys_addr, size)
 extern void early_iounmap (volatile void __iomem *addr, unsigned long size);
 static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size)
 {
index 51bce594eb83582daac87f3853235db70bce3b85..da5b462e6de6c9e91c18d42756ff482bb9f70505 100644 (file)
 
 #define EFI_DEBUG      0
 
+static __initdata unsigned long palo_phys;
+
+static __initdata efi_config_table_type_t arch_tables[] = {
+       {PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID, "PALO", &palo_phys},
+       {NULL_GUID, NULL, 0},
+};
+
 extern efi_status_t efi_call_phys (void *, ...);
 
-struct efi efi;
-EXPORT_SYMBOL(efi);
 static efi_runtime_services_t *runtime;
 static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
 
@@ -423,9 +428,9 @@ static u8 __init palo_checksum(u8 *buffer, u32 length)
  * Parse and handle PALO table which is published at:
  * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf
  */
-static void __init handle_palo(unsigned long palo_phys)
+static void __init handle_palo(unsigned long phys_addr)
 {
-       struct palo_table *palo = __va(palo_phys);
+       struct palo_table *palo = __va(phys_addr);
        u8  checksum;
 
        if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) {
@@ -467,12 +472,10 @@ void __init
 efi_init (void)
 {
        void *efi_map_start, *efi_map_end;
-       efi_config_table_t *config_tables;
        efi_char16_t *c16;
        u64 efi_desc_size;
        char *cp, vendor[100] = "unknown";
        int i;
-       unsigned long palo_phys;
 
        /*
         * It's too early to be able to use the standard kernel command line
@@ -514,8 +517,6 @@ efi_init (void)
                       efi.systab->hdr.revision >> 16,
                       efi.systab->hdr.revision & 0xffff);
 
-       config_tables = __va(efi.systab->tables);
-
        /* Show what we know for posterity */
        c16 = __va(efi.systab->fw_vendor);
        if (c16) {
@@ -528,43 +529,10 @@ efi_init (void)
               efi.systab->hdr.revision >> 16,
               efi.systab->hdr.revision & 0xffff, vendor);
 
-       efi.mps        = EFI_INVALID_TABLE_ADDR;
-       efi.acpi       = EFI_INVALID_TABLE_ADDR;
-       efi.acpi20     = EFI_INVALID_TABLE_ADDR;
-       efi.smbios     = EFI_INVALID_TABLE_ADDR;
-       efi.sal_systab = EFI_INVALID_TABLE_ADDR;
-       efi.boot_info  = EFI_INVALID_TABLE_ADDR;
-       efi.hcdp       = EFI_INVALID_TABLE_ADDR;
-       efi.uga        = EFI_INVALID_TABLE_ADDR;
-
        palo_phys      = EFI_INVALID_TABLE_ADDR;
 
-       for (i = 0; i < (int) efi.systab->nr_tables; i++) {
-               if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-                       efi.mps = config_tables[i].table;
-                       printk(" MPS=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-                       efi.acpi20 = config_tables[i].table;
-                       printk(" ACPI 2.0=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-                       efi.acpi = config_tables[i].table;
-                       printk(" ACPI=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-                       efi.smbios = config_tables[i].table;
-                       printk(" SMBIOS=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
-                       efi.sal_systab = config_tables[i].table;
-                       printk(" SALsystab=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-                       efi.hcdp = config_tables[i].table;
-                       printk(" HCDP=0x%lx", config_tables[i].table);
-               } else if (efi_guidcmp(config_tables[i].guid,
-                        PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID) == 0) {
-                       palo_phys = config_tables[i].table;
-                       printk(" PALO=0x%lx", config_tables[i].table);
-               }
-       }
-       printk("\n");
+       if (efi_config_init(arch_tables) != 0)
+               return;
 
        if (palo_phys != EFI_INVALID_TABLE_ADDR)
                handle_palo(palo_phys);
index bebdc36ebb0a85551c19a07f7caacb0155eecb0e..2b58c5f0bc3848ad1d13eb8afc357bebff9484a6 100644 (file)
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += module.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 09d77a862da3d961029bdbd260557a33f472523f..a5d27f272a59f105f3b34103fdf8fd6b488c4f2c 100644 (file)
@@ -31,3 +31,4 @@ generic-y += trace_clock.h
 generic-y += types.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += preempt.h
index 6ae0ccb632cb8a8eaa2aee67c219112bf2526f4a..84d0c1d6b9b30fd4afe4f9ad2a38c712154667d2 100644 (file)
@@ -52,3 +52,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index 23f5118f58db97bde7fd6d345ac273e81af50496..8e9c0b3b9691f3746f0ce7efec4acffd0ebdfe68 100644 (file)
@@ -26,6 +26,8 @@
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
        .nr_balance_failed      = 0,                    \
+       .max_newidle_lb_cost    = 0,                    \
+       .next_decay_max_lb_cost = jiffies,              \
 }
 
 #define cpu_to_node(cpu)       ((void)(cpu), 0)
index 2a2c9d55187e075375e892abbc4e23139f63bf86..3b4b7f6c0950ef89d975dc4c1e1845333a6d0fe9 100644 (file)
@@ -159,44 +159,30 @@ void irq_ctx_exit(int cpu)
 
 extern asmlinkage void __do_softirq(void);
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       unsigned long flags;
        struct thread_info *curctx;
        union irq_ctx *irqctx;
        u32 *isp;
 
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending()) {
-               curctx = current_thread_info();
-               irqctx = softirq_ctx[smp_processor_id()];
-               irqctx->tinfo.task = curctx->task;
-
-               /* build the stack frame on the softirq stack */
-               isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info));
-
-               asm volatile (
-                       "MOV   D0.5,%0\n"
-                       "SWAP  A0StP,D0.5\n"
-                       "CALLR D1RtP,___do_softirq\n"
-                       "MOV   A0StP,D0.5\n"
-                       :
-                       : "r" (isp)
-                       : "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4",
-                         "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP",
-                         "D0.5"
-                       );
-               /*
-                * Shouldn't happen, we returned above if in_interrupt():
-                */
-               WARN_ON_ONCE(softirq_count());
-       }
-
-       local_irq_restore(flags);
+       curctx = current_thread_info();
+       irqctx = softirq_ctx[smp_processor_id()];
+       irqctx->tinfo.task = curctx->task;
+
+       /* build the stack frame on the softirq stack */
+       isp = (u32 *) ((char *)irqctx + sizeof(struct thread_info));
+
+       asm volatile (
+               "MOV   D0.5,%0\n"
+               "SWAP  A0StP,D0.5\n"
+               "CALLR D1RtP,___do_softirq\n"
+               "MOV   A0StP,D0.5\n"
+               :
+               : "r" (isp)
+               : "memory", "cc", "D1Ar1", "D0Ar2", "D1Ar3", "D0Ar4",
+                 "D1Ar5", "D0Ar6", "D0Re0", "D1Re0", "D0.4", "D1RtP",
+                 "D0.5"
+               );
 }
 #endif
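(For context: the per-arch do_softirq() bodies removed in this and the
following hunks migrate into generic code. With this series,
kernel/softirq.c performs the in_interrupt() and pending checks itself
and only asks the arch to switch stacks, roughly:

asmlinkage void do_softirq(void)
{
        unsigned long flags;

        if (in_interrupt())
                return;

        local_irq_save(flags);
        if (local_softirq_pending())
                do_softirq_own_stack(); /* arch-provided stack switch */
        local_irq_restore(flags);
}

which is why each architecture now provides only do_softirq_own_stack().)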
 
index d3c51a6a601dd2017c5ce6a03b86aae95cfe78b7..ce0bbf8f5640e78127f89616188a59be1bb5ba8b 100644 (file)
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += syscalls.h
+generic-y += preempt.h
index 454ddf9bb76f8a5fc5660bc6ab02c4d5380f7593..1acbb8b77a71be49cd16c0967efcac851339f4e3 100644 (file)
@@ -11,5 +11,6 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += trace_clock.h
+generic-y += preempt.h
 generic-y += ucontext.h
 generic-y += xor.h
index d763f11e35e2e9812a87c58e67f81759bb3dfd17..2c12ea1668d13df21b8908abc7fbc4f80d60678c 100644 (file)
@@ -172,8 +172,9 @@ int rtlx_open(int index, int can_sleep)
        if (rtlx == NULL) {
                if( (p = vpe_get_shared(tclimit)) == NULL) {
                    if (can_sleep) {
-                       __wait_event_interruptible(channel_wqs[index].lx_queue,
-                               (p = vpe_get_shared(tclimit)), ret);
+                       ret = __wait_event_interruptible(
+                                       channel_wqs[index].lx_queue,
+                                       (p = vpe_get_shared(tclimit)));
                        if (ret)
                                goto out_fail;
                    } else {
@@ -263,11 +264,10 @@ unsigned int rtlx_read_poll(int index, int can_sleep)
        /* data available to read? */
        if (chan->lx_read == chan->lx_write) {
                if (can_sleep) {
-                       int ret = 0;
-
-                       __wait_event_interruptible(channel_wqs[index].lx_queue,
+                       int ret = __wait_event_interruptible(
+                               channel_wqs[index].lx_queue,
                                (chan->lx_read != chan->lx_write) ||
-                               sp_stopping, ret);
+                               sp_stopping);
                        if (ret)
                                return ret;
 
@@ -440,14 +440,13 @@ static ssize_t file_write(struct file *file, const char __user * buffer,
 
        /* any space left... */
        if (!rtlx_write_poll(minor)) {
-               int ret = 0;
+               int ret;
 
                if (file->f_flags & O_NONBLOCK)
                        return -EAGAIN;
 
-               __wait_event_interruptible(channel_wqs[minor].rt_queue,
-                                          rtlx_write_poll(minor),
-                                          ret);
+               ret = __wait_event_interruptible(channel_wqs[minor].rt_queue,
+                                          rtlx_write_poll(minor));
                if (ret)
                        return ret;
        }
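(The conversions in this file follow an include/linux/wait.h interface
change merged here: __wait_event_interruptible() now returns the error
code instead of storing it through a third macro argument. In sketch
form:

        /* before this merge: */
        int ret = 0;
        __wait_event_interruptible(wq, condition, ret);

        /* after: */
        int ret = __wait_event_interruptible(wq, condition);
)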
index e205ef598e97bcc16becade0f627397a157987e1..12156176c7caa937ff5abcea16b249b95c748cb1 100644 (file)
@@ -124,7 +124,7 @@ void *kmap_coherent(struct page *page, unsigned long addr)
 
        BUG_ON(Page_dcache_dirty(page));
 
-       inc_preempt_count();
+       pagefault_disable();
        idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
 #ifdef CONFIG_MIPS_MT_SMTC
        idx += FIX_N_COLOURS * smp_processor_id() +
@@ -193,8 +193,7 @@ void kunmap_coherent(void)
        write_c0_entryhi(old_ctx);
        EXIT_CRITICAL(flags);
 #endif
-       dec_preempt_count();
-       preempt_check_resched();
+       pagefault_enable();
 }
 
 void copy_user_highpage(struct page *to, struct page *from,
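(The replacement is behaviour-preserving while naming the intent: in this
era pagefault_disable() is itself built on the preempt count. A sketch of
the include/linux/uaccess.h definition:

static inline void pagefault_disable(void)
{
        inc_preempt_count();
        /* make sure the store is issued before a page fault can hit */
        barrier();
}

pagefault_enable() mirrors it and re-checks for pending preemption.)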
index c5d76702830603f4d2fbde47e26412c179acfde6..74742dc6a3daabd3ec3a4b2c9c5af03f90d7685c 100644 (file)
@@ -2,3 +2,4 @@
 generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 195653e851da2a460fcafeca692af220bcef58d1..78405625e799016b46647db52711c77f5b7ef224 100644 (file)
@@ -67,3 +67,4 @@ generic-y += ucontext.h
 generic-y += user.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
+generic-y += preempt.h
index ff4c9faed546da39c89d4bd6f1a851376a667040..a603b9ebe54ce38c19328ac936059618771a07c0 100644 (file)
@@ -4,3 +4,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
          div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \
          poll.h xor.h clkdev.h exec.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 2e6443b1e9228426ba94d8602c8956c5daec93c2..ef592768529951154f725474abee9127556dced0 100644 (file)
@@ -499,22 +499,9 @@ static void execute_on_irq_stack(void *func, unsigned long param1)
        *irq_stack_in_use = 1;
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       __u32 pending;
-       unsigned long flags;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       pending = local_softirq_pending();
-
-       if (pending)
-               execute_on_irq_stack(__do_softirq, 0);
-
-       local_irq_restore(flags);
+       execute_on_irq_stack(__do_softirq, 0);
 }
 #endif /* CONFIG_IRQSTACKS */
 
index e2e03a6d060f4543e76c8bf4c34b6a5886822fe3..58736860cffee48fbc4908a48fcb2e6a1c444c39 100644 (file)
@@ -139,6 +139,7 @@ config PPC
        select OLD_SIGSUSPEND
        select OLD_SIGACTION if PPC32
        select HAVE_DEBUG_STACKOVERFLOW
+       select HAVE_IRQ_EXIT_ON_IRQ_STACK
 
 config GENERIC_CSUM
        def_bool CPU_LITTLE_ENDIAN
index 704e6f10ae806e0e5702b056cfb4a231542ebfba..d8f9d2f18a23bc932cb4562ac00dc26b1257238b 100644 (file)
@@ -2,4 +2,5 @@
 generic-y += clkdev.h
 generic-y += rwsem.h
 generic-y += trace_clock.h
+generic-y += preempt.h
 generic-y += vtime.h
\ No newline at end of file
index c7cb8c232d2f4fdedf9129ec691d8471736e2fc7..ba0165615215577be84a8a576283a5fd6df12e51 100644 (file)
@@ -594,7 +594,7 @@ void irq_ctx_init(void)
        }
 }
 
-static inline void do_softirq_onstack(void)
+void do_softirq_own_stack(void)
 {
        struct thread_info *curtp, *irqtp;
 
@@ -612,21 +612,6 @@ static inline void do_softirq_onstack(void)
                set_bits(irqtp->flags, &curtp->flags);
 }
 
-void do_softirq(void)
-{
-       unsigned long flags;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending())
-               do_softirq_onstack();
-
-       local_irq_restore(flags);
-}
-
 irq_hw_number_t virq_to_hw(unsigned int virq)
 {
        struct irq_data *irq_data = irq_get_irq_data(virq);
index f313f9cbcf44fa410c88e85b17642a865bf39a74..7a5288f3479ae66fe330587516fb6c74240798a2 100644 (file)
@@ -2,3 +2,4 @@
 
 generic-y += clkdev.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 8ac2097f13d4e99eeaf913baca503f4532e3a055..bb27a262c44aa8076dfe7eb9cb5a243d8c315079 100644 (file)
@@ -157,39 +157,29 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 /*
  * Switch to the asynchronous interrupt stack for softirq execution.
  */
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       unsigned long flags, old, new;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending()) {
-               /* Get current stack pointer. */
-               asm volatile("la %0,0(15)" : "=a" (old));
-               /* Check against async. stack address range. */
-               new = S390_lowcore.async_stack;
-               if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
-                       /* Need to switch to the async. stack. */
-                       new -= STACK_FRAME_OVERHEAD;
-                       ((struct stack_frame *) new)->back_chain = old;
-
-                       asm volatile("   la    15,0(%0)\n"
-                                    "   basr  14,%2\n"
-                                    "   la    15,0(%1)\n"
-                                    : : "a" (new), "a" (old),
-                                        "a" (__do_softirq)
-                                    : "0", "1", "2", "3", "4", "5", "14",
-                                      "cc", "memory" );
-               } else {
-                       /* We are already on the async stack. */
-                       __do_softirq();
-               }
+       unsigned long old, new;
+
+       /* Get current stack pointer. */
+       asm volatile("la %0,0(15)" : "=a" (old));
+       /* Check against async. stack address range. */
+       new = S390_lowcore.async_stack;
+       if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
+               /* Need to switch to the async. stack. */
+               new -= STACK_FRAME_OVERHEAD;
+               ((struct stack_frame *) new)->back_chain = old;
+               asm volatile("   la    15,0(%0)\n"
+                            "   basr  14,%2\n"
+                            "   la    15,0(%1)\n"
+                            : : "a" (new), "a" (old),
+                                "a" (__do_softirq)
+                            : "0", "1", "2", "3", "4", "5", "14",
+                              "cc", "memory" );
+       } else {
+               /* We are already on the async stack. */
+               __do_softirq();
        }
-
-       local_irq_restore(flags);
 }
 
 /*
index e1c7bb999b06af31fe908089c3ff8e65484e4f82..f3414ade77a37c2628f2fa104efccdd1ec0bbc23 100644 (file)
@@ -4,3 +4,4 @@ header-y +=
 generic-y += clkdev.h
 generic-y += trace_clock.h
 generic-y += xor.h
+generic-y += preempt.h
index 280bea9e5e2be362a7ff32a36cdd966b5f38ffe6..231efbb6810825f636c8cde163ebada4ef783098 100644 (file)
@@ -34,3 +34,4 @@ generic-y += termios.h
 generic-y += trace_clock.h
 generic-y += ucontext.h
 generic-y += xor.h
+generic-y += preempt.h
index 063af10ff3c1eb9e76c4724d3977b6bd9421b9fe..0833736afa3238185196bb10e408ebf9328e35ea 100644 (file)
@@ -149,47 +149,32 @@ void irq_ctx_exit(int cpu)
        hardirq_ctx[cpu] = NULL;
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       unsigned long flags;
        struct thread_info *curctx;
        union irq_ctx *irqctx;
        u32 *isp;
 
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending()) {
-               curctx = current_thread_info();
-               irqctx = softirq_ctx[smp_processor_id()];
-               irqctx->tinfo.task = curctx->task;
-               irqctx->tinfo.previous_sp = current_stack_pointer;
-
-               /* build the stack frame on the softirq stack */
-               isp = (u32 *)((char *)irqctx + sizeof(*irqctx));
-
-               __asm__ __volatile__ (
-                       "mov    r15, r9         \n"
-                       "jsr    @%0             \n"
-                       /* switch to the softirq stack */
-                       " mov   %1, r15         \n"
-                       /* restore the thread stack */
-                       "mov    r9, r15         \n"
-                       : /* no outputs */
-                       : "r" (__do_softirq), "r" (isp)
-                       : "memory", "r0", "r1", "r2", "r3", "r4",
-                         "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
-               );
-
-               /*
-                * Shouldn't happen, we returned above if in_interrupt():
-                */
-               WARN_ON_ONCE(softirq_count());
-       }
-
-       local_irq_restore(flags);
+       curctx = current_thread_info();
+       irqctx = softirq_ctx[smp_processor_id()];
+       irqctx->tinfo.task = curctx->task;
+       irqctx->tinfo.previous_sp = current_stack_pointer;
+
+       /* build the stack frame on the softirq stack */
+       isp = (u32 *)((char *)irqctx + sizeof(*irqctx));
+
+       __asm__ __volatile__ (
+               "mov    r15, r9         \n"
+               "jsr    @%0             \n"
+               /* switch to the softirq stack */
+               " mov   %1, r15         \n"
+               /* restore the thread stack */
+               "mov    r9, r15         \n"
+               : /* no outputs */
+               : "r" (__do_softirq), "r" (isp)
+               : "memory", "r0", "r1", "r2", "r3", "r4",
+                 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
+       );
 }
 #else
 static inline void handle_one_irq(unsigned int irq)
index 7e4a97fbded412158d845926026987290907902c..bf390667657ae0777365424b4c675fac496e9462 100644 (file)
@@ -16,3 +16,4 @@ generic-y += serial.h
 generic-y += trace_clock.h
 generic-y += types.h
 generic-y += word-at-a-time.h
+generic-y += preempt.h
index d4840cec2c55c156203463ec27e3d8d716f0e309..666193f4e8bb414fcf64e45a15f89ebf0ab2f628 100644 (file)
@@ -698,30 +698,19 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
        set_irq_regs(old_regs);
 }
 
-void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       unsigned long flags;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
+       void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 
-       if (local_softirq_pending()) {
-               void *orig_sp, *sp = softirq_stack[smp_processor_id()];
-
-               sp += THREAD_SIZE - 192 - STACK_BIAS;
-
-               __asm__ __volatile__("mov %%sp, %0\n\t"
-                                    "mov %1, %%sp"
-                                    : "=&r" (orig_sp)
-                                    : "r" (sp));
-               __do_softirq();
-               __asm__ __volatile__("mov %0, %%sp"
-                                    : : "r" (orig_sp));
-       }
+       sp += THREAD_SIZE - 192 - STACK_BIAS;
 
-       local_irq_restore(flags);
+       __asm__ __volatile__("mov %%sp, %0\n\t"
+                            "mov %1, %%sp"
+                            : "=&r" (orig_sp)
+                            : "r" (sp));
+       __do_softirq();
+       __asm__ __volatile__("mov %0, %%sp"
+                            : : "r" (orig_sp));
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
index 664d6ad23f80626a62e17dbfae3c3b82cd0f6bb0..22f3bd147fa72456dc7a906cd5700acecde6026f 100644 (file)
@@ -38,3 +38,4 @@ generic-y += termios.h
 generic-y += trace_clock.h
 generic-y += types.h
 generic-y += xor.h
+generic-y += preempt.h
index b30f34a79882c4413e77de1495bba161ed61a9af..fdde187e608771c8fa7c5d3a4c37fdbd4d96bdef 100644 (file)
@@ -3,3 +3,4 @@ generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h
 generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
 generic-y += switch_to.h clkdev.h
 generic-y += trace_clock.h
+generic-y += preempt.h
index 89d8b6c4e39a09d5739bf215f4ce4c663a9f2ef3..00045cbe5c6336bb9e410e7d9e9bb2ea2bb8ed0c 100644 (file)
@@ -60,3 +60,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
+generic-y += preempt.h
index e0836de76f3c90dcb57ec0e7bbfb293cb7d0ca99..591b19ec8af8216954888958c315195e3fe14ce4 100644 (file)
@@ -124,6 +124,7 @@ config X86
        select COMPAT_OLD_SIGACTION if IA32_EMULATION
        select RTC_LIB
        select HAVE_DEBUG_STACKOVERFLOW
+       select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
 
 config INSTRUCTION_DECODER
        def_bool y
@@ -753,20 +754,25 @@ config DMI
          BIOS code.
 
 config GART_IOMMU
-       bool "GART IOMMU support" if EXPERT
-       default y
+       bool "Old AMD GART IOMMU support"
        select SWIOTLB
        depends on X86_64 && PCI && AMD_NB
        ---help---
-         Support for full DMA access of devices with 32bit memory access only
-         on systems with more than 3GB. This is usually needed for USB,
-         sound, many IDE/SATA chipsets and some other devices.
-         Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
-         based hardware IOMMU and a software bounce buffer based IOMMU used
-         on Intel systems and as fallback.
-         The code is only active when needed (enough memory and limited
-         device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
-         too.
+         Provides a driver for older AMD Athlon64/Opteron/Turion/Sempron
+         GART based hardware IOMMUs.
+
+         The GART supports full DMA access for devices with 32-bit access
+         limitations, on systems with more than 3 GB. This is usually needed
+         for USB, sound, many IDE/SATA chipsets and some other devices.
+
+         Newer systems typically have a modern AMD IOMMU, supported via
+         the CONFIG_AMD_IOMMU=y config option.
+
+         In normal configurations this driver is only active when needed:
+         there's more than 3 GB of memory and the system contains a
+         32-bit limited device.
+
+         If unsure, say Y.
 
 config CALGARY_IOMMU
        bool "IBM Calgary IOMMU support"
@@ -1591,7 +1597,7 @@ config EFI_STUB
           This kernel feature allows a bzImage to be loaded directly
          by EFI firmware without the use of a bootloader.
 
-         See Documentation/x86/efi-stub.txt for more information.
+         See Documentation/efi-stub.txt for more information.
 
 config SECCOMP
        def_bool y
@@ -1720,16 +1726,56 @@ config RELOCATABLE
 
          Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
          it has been loaded at and the compile time physical address
-         (CONFIG_PHYSICAL_START) is ignored.
+         (CONFIG_PHYSICAL_START) is used as the minimum location.
 
-# Relocation on x86-32 needs some additional build support
+config RANDOMIZE_BASE
+       bool "Randomize the address of the kernel image"
+       depends on RELOCATABLE
+       depends on !HIBERNATION
+       default n
+       ---help---
+          Randomizes the physical and virtual address at which the
+          kernel image is decompressed, as a security feature that
+          deters exploit attempts relying on knowledge of the location
+          of kernel internals.
+
+          Entropy is generated using the RDRAND instruction if it
+          is supported.  If not, then RDTSC is used, if supported. If
+          neither RDRAND nor RDTSC is supported, then no randomness
+          is introduced.
+
+          The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET,
+          and aligned according to PHYSICAL_ALIGN.
+
+config RANDOMIZE_BASE_MAX_OFFSET
+       hex "Maximum ASLR offset allowed"
+       depends on RANDOMIZE_BASE
+       range 0x0 0x20000000 if X86_32
+       default "0x20000000" if X86_32
+       range 0x0 0x40000000 if X86_64
+       default "0x40000000" if X86_64
+       ---help---
+        Determines the maximal offset in bytes that will be applied to the
+        kernel when Address Space Layout Randomization (ASLR) is active.
+        Must be less than or equal to the actual physical memory on the
+        system. This must be a multiple of CONFIG_PHYSICAL_ALIGN.
+
+        On 32-bit this is limited to 512MiB.
+
+        On 64-bit this is limited by how the kernel fixmap page table is
+        positioned, so this cannot be larger than 1GiB currently. Normally
+        there is a 512MiB to 1.5GiB split between kernel and modules. When
+        this is raised above the 512MiB default, the modules area will
+        shrink to compensate, up to the current maximum 1GiB to 1GiB split.
+
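(Worked example: with the new x86-64 defaults in this patch,
RANDOMIZE_BASE_MAX_OFFSET=0x40000000 and PHYSICAL_ALIGN=0x200000, the
decompressor's slots[] array added below holds 0x40000000 / 0x200000 =
512 candidate load addresses to pick from.)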
+# Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
        def_bool y
-       depends on X86_32 && RELOCATABLE
+       depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
 
 config PHYSICAL_ALIGN
        hex "Alignment value to which kernel should be aligned"
-       default "0x1000000"
+       default "0x200000"
        range 0x2000 0x1000000 if X86_32
        range 0x200000 0x1000000 if X86_64
        ---help---
index 379814bc41e3a956dc037a5f1d4ca23709854fbc..28f7db7993f5015f27b724bbad43c89c2ec0bb73 100644 (file)
@@ -20,7 +20,7 @@ targets               := vmlinux.bin setup.bin setup.elf bzImage
 targets                += fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-                := compressed
 
-setup-y                += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o
+setup-y                += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
 setup-y                += early_serial_console.o edd.o header.o main.o mca.o memory.o
 setup-y                += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
 setup-y                += video-mode.o version.o
@@ -71,7 +71,8 @@ GCOV_PROFILE := n
 $(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
-cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/zoffset.h > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+                              $(obj)/zoffset.h $@
 
 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
        $(call if_changed,image)
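(With this change, tools/build writes bzImage itself rather than relying
on shell redirection; the output path becomes the fourth argument, so a
hypothetical expanded invocation looks like:

        arch/x86/boot/tools/build setup.bin vmlinux.bin zoffset.h bzImage
)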
index ef72baeff48475cd1ce9583880c33b9970b1b556..50f8c5e0f37e1d2dd7030da9d72c5be21e2ee282 100644 (file)
@@ -26,9 +26,8 @@
 #include <asm/boot.h>
 #include <asm/setup.h>
 #include "bitops.h"
-#include <asm/cpufeature.h>
-#include <asm/processor-flags.h>
 #include "ctype.h"
+#include "cpuflags.h"
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -307,14 +306,7 @@ static inline int cmdline_find_option_bool(const char *option)
        return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
-
 /* cpu.c, cpucheck.c */
-struct cpu_features {
-       int level;              /* Family, or 64 for x86-64 */
-       int model;
-       u32 flags[NCAPINTS];
-};
-extern struct cpu_features cpu;
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
 int validate_cpu(void);
 
index dcd90df10ab4e271dd6ee0e3ed737015326efa18..ae8b5dbbd8c5c401ff63e6a188aac29e0b197ca1 100644 (file)
@@ -27,7 +27,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
 
 VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
        $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
-       $(obj)/piggy.o
+       $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
new file mode 100644 (file)
index 0000000..0595798
--- /dev/null
@@ -0,0 +1,267 @@
+#include "misc.h"
+
+#ifdef CONFIG_RANDOMIZE_BASE
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820.h>
+
+#define I8254_PORT_CONTROL     0x43
+#define I8254_PORT_COUNTER0    0x40
+#define I8254_CMD_READBACK     0xC0
+#define I8254_SELECT_COUNTER0  0x02
+#define I8254_STATUS_NOTREADY  0x40
+static inline u16 i8254(void)
+{
+       u16 status, timer;
+
+       do {
+               outb(I8254_PORT_CONTROL,
+                    I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+               status = inb(I8254_PORT_COUNTER0);
+               timer  = inb(I8254_PORT_COUNTER0);
+               timer |= inb(I8254_PORT_COUNTER0) << 8;
+       } while (status & I8254_STATUS_NOTREADY);
+
+       return timer;
+}
+
+static unsigned long get_random_long(void)
+{
+       unsigned long random;
+
+       if (has_cpuflag(X86_FEATURE_RDRAND)) {
+               debug_putstr("KASLR using RDRAND...\n");
+               if (rdrand_long(&random))
+                       return random;
+       }
+
+       if (has_cpuflag(X86_FEATURE_TSC)) {
+               uint32_t raw;
+
+               debug_putstr("KASLR using RDTSC...\n");
+               rdtscl(raw);
+
+               /* Only use the low bits of rdtsc. */
+               random = raw & 0xffff;
+       } else {
+               debug_putstr("KASLR using i8254...\n");
+               random = i8254();
+       }
+
+       /* Extend timer bits poorly... */
+       random |= (random << 16);
+#ifdef CONFIG_X86_64
+       random |= (random << 32);
+#endif
+       return random;
+}
+
+struct mem_vector {
+       unsigned long start;
+       unsigned long size;
+};
+
+#define MEM_AVOID_MAX 5
+struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
+{
+       /* Item at least partially before region. */
+       if (item->start < region->start)
+               return false;
+       /* Item at least partially after region. */
+       if (item->start + item->size > region->start + region->size)
+               return false;
+       return true;
+}
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+       /* Item one is entirely before item two. */
+       if (one->start + one->size <= two->start)
+               return false;
+       /* Item one is entirely after item two. */
+       if (one->start >= two->start + two->size)
+               return false;
+       return true;
+}
+
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+                          unsigned long output, unsigned long output_size)
+{
+       u64 initrd_start, initrd_size;
+       u64 cmd_line, cmd_line_size;
+       unsigned long unsafe, unsafe_len;
+       char *ptr;
+
+       /*
+        * Avoid the region that is unsafe to overlap during
+        * decompression (see calculations at top of misc.c).
+        */
+       unsafe_len = (output_size >> 12) + 32768 + 18;
+       unsafe = (unsigned long)input + input_size - unsafe_len;
+       mem_avoid[0].start = unsafe;
+       mem_avoid[0].size = unsafe_len;
+
+       /* Avoid initrd. */
+       initrd_start  = (u64)real_mode->ext_ramdisk_image << 32;
+       initrd_start |= real_mode->hdr.ramdisk_image;
+       initrd_size  = (u64)real_mode->ext_ramdisk_size << 32;
+       initrd_size |= real_mode->hdr.ramdisk_size;
+       mem_avoid[1].start = initrd_start;
+       mem_avoid[1].size = initrd_size;
+
+       /* Avoid kernel command line. */
+       cmd_line  = (u64)real_mode->ext_cmd_line_ptr << 32;
+       cmd_line |= real_mode->hdr.cmd_line_ptr;
+       /* Calculate size of cmd_line. */
+       ptr = (char *)(unsigned long)cmd_line;
+       for (cmd_line_size = 0; ptr[cmd_line_size++]; )
+               ;
+       mem_avoid[2].start = cmd_line;
+       mem_avoid[2].size = cmd_line_size;
+
+       /* Avoid heap memory. */
+       mem_avoid[3].start = (unsigned long)free_mem_ptr;
+       mem_avoid[3].size = BOOT_HEAP_SIZE;
+
+       /* Avoid stack memory. */
+       mem_avoid[4].start = (unsigned long)free_mem_end_ptr;
+       mem_avoid[4].size = BOOT_STACK_SIZE;
+}
+
+/* Does this memory vector overlap a known avoided area? */
+bool mem_avoid_overlap(struct mem_vector *img)
+{
+       int i;
+
+       for (i = 0; i < MEM_AVOID_MAX; i++) {
+               if (mem_overlaps(img, &mem_avoid[i]))
+                       return true;
+       }
+
+       return false;
+}
+
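+/* Candidate load addresses, one per CONFIG_PHYSICAL_ALIGN-aligned slot. */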
+unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / CONFIG_PHYSICAL_ALIGN];
+unsigned long slot_max = 0;
+
+static void slots_append(unsigned long addr)
+{
+       /* Overflowing the slots list should be impossible. */
+       if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET /
+                       CONFIG_PHYSICAL_ALIGN)
+               return;
+
+       slots[slot_max++] = addr;
+}
+
+static unsigned long slots_fetch_random(void)
+{
+       /* Handle case of no slots stored. */
+       if (slot_max == 0)
+               return 0;
+
+       return slots[get_random_long() % slot_max];
+}
+
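+/* Append each aligned, non-avoided slot within a usable e820 RAM region. */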
+static void process_e820_entry(struct e820entry *entry,
+                              unsigned long minimum,
+                              unsigned long image_size)
+{
+       struct mem_vector region, img;
+
+       /* Skip non-RAM entries. */
+       if (entry->type != E820_RAM)
+               return;
+
+       /* Ignore entries entirely above our maximum. */
+       if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+               return;
+
+       /* Ignore entries entirely below our minimum. */
+       if (entry->addr + entry->size < minimum)
+               return;
+
+       region.start = entry->addr;
+       region.size = entry->size;
+
+       /* Potentially raise address to minimum location. */
+       if (region.start < minimum)
+               region.start = minimum;
+
+       /* Potentially raise address to meet alignment requirements. */
+       region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+       /* Did we raise the address above the bounds of this e820 region? */
+       if (region.start > entry->addr + entry->size)
+               return;
+
+       /* Reduce size by any delta from the original address. */
+       region.size -= region.start - entry->addr;
+
+       /* Reduce maximum size to fit end of image within maximum limit. */
+       if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET)
+               region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start;
+
+       /* Walk each aligned slot and check for avoided areas. */
+       for (img.start = region.start, img.size = image_size ;
+            mem_contains(&region, &img) ;
+            img.start += CONFIG_PHYSICAL_ALIGN) {
+               if (mem_avoid_overlap(&img))
+                       continue;
+               slots_append(img.start);
+       }
+}
+
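+/* Build the slot list from the e820 map, then choose one slot at random. */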
+static unsigned long find_random_addr(unsigned long minimum,
+                                     unsigned long size)
+{
+       int i;
+       unsigned long addr;
+
+       /* Make sure minimum is aligned. */
+       minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+       /* Verify potential e820 positions, appending to slots list. */
+       for (i = 0; i < real_mode->e820_entries; i++) {
+               process_e820_entry(&real_mode->e820_map[i], minimum, size);
+       }
+
+       return slots_fetch_random();
+}
+
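+/* Returns a randomized load address, or the original 'output' on any failure. */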
+unsigned char *choose_kernel_location(unsigned char *input,
+                                     unsigned long input_size,
+                                     unsigned char *output,
+                                     unsigned long output_size)
+{
+       unsigned long choice = (unsigned long)output;
+       unsigned long random;
+
+       if (cmdline_find_option_bool("nokaslr")) {
+               debug_putstr("KASLR disabled...\n");
+               goto out;
+       }
+
+       /* Record the various known unsafe memory ranges. */
+       mem_avoid_init((unsigned long)input, input_size,
+                      (unsigned long)output, output_size);
+
+       /* Walk e820 and find a random address. */
+       random = find_random_addr(choice, output_size);
+       if (!random) {
+               debug_putstr("KASLR could not find suitable E820 region...\n");
+               goto out;
+       }
+
+       /* Always enforce the minimum. */
+       if (random < choice)
+               goto out;
+
+       choice = random;
+out:
+       return (unsigned char *)choice;
+}
+
+#endif /* CONFIG_RANDOMIZE_BASE */
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index bffd73b45b1f27a4dc6cdd2ab6ee5451cd1bbd0d..b68e3033e6b9bd6fa76f89e3fbaa034823790ad7 100644 (file)
@@ -1,6 +1,6 @@
 #include "misc.h"
 
-#ifdef CONFIG_EARLY_PRINTK
+#if defined(CONFIG_EARLY_PRINTK) || defined(CONFIG_RANDOMIZE_BASE)
 
 static unsigned long fs;
 static inline void set_fs(unsigned long seg)
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644 (file)
index 0000000..aa31346
--- /dev/null
@@ -0,0 +1,13 @@
+#ifdef CONFIG_RANDOMIZE_BASE
+
+#include "../cpuflags.c"
+
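+/* Run CPU feature detection (once) and test a single feature bit. */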
+bool has_cpuflag(int flag)
+{
+       get_cpuflags();
+
+       return test_bit(flag, cpu.flags);
+}
+
+#endif /* CONFIG_RANDOMIZE_BASE */
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index b7388a425f0994ba87a30a27fe8ba57d7e3e5457..a7677babf946dc406735d94512eca4765b31569d 100644 (file)
 
 static efi_system_table_t *sys_table;
 
-static void efi_char16_printk(efi_char16_t *str)
-{
-       struct efi_simple_text_output_protocol *out;
-
-       out = (struct efi_simple_text_output_protocol *)sys_table->con_out;
-       efi_call_phys2(out->output_string, out, str);
-}
-
-static void efi_printk(char *str)
-{
-       char *s8;
-
-       for (s8 = str; *s8; s8++) {
-               efi_char16_t ch[2] = { 0 };
-
-               ch[0] = *s8;
-               if (*s8 == '\n') {
-                       efi_char16_t nl[2] = { '\r', 0 };
-                       efi_char16_printk(nl);
-               }
-
-               efi_char16_printk(ch);
-       }
-}
-
-static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
-                             unsigned long *desc_size)
-{
-       efi_memory_desc_t *m = NULL;
-       efi_status_t status;
-       unsigned long key;
-       u32 desc_version;
-
-       *map_size = sizeof(*m) * 32;
-again:
-       /*
-        * Add an additional efi_memory_desc_t because we're doing an
-        * allocation which may be in a new descriptor region.
-        */
-       *map_size += sizeof(*m);
-       status = efi_call_phys3(sys_table->boottime->allocate_pool,
-                               EFI_LOADER_DATA, *map_size, (void **)&m);
-       if (status != EFI_SUCCESS)
-               goto fail;
-
-       status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
-                               m, &key, desc_size, &desc_version);
-       if (status == EFI_BUFFER_TOO_SMALL) {
-               efi_call_phys1(sys_table->boottime->free_pool, m);
-               goto again;
-       }
-
-       if (status != EFI_SUCCESS)
-               efi_call_phys1(sys_table->boottime->free_pool, m);
 
-fail:
-       *map = m;
-       return status;
-}
-
-/*
- * Allocate at the highest possible address that is not above 'max'.
- */
-static efi_status_t high_alloc(unsigned long size, unsigned long align,
-                             unsigned long *addr, unsigned long max)
-{
-       unsigned long map_size, desc_size;
-       efi_memory_desc_t *map;
-       efi_status_t status;
-       unsigned long nr_pages;
-       u64 max_addr = 0;
-       int i;
-
-       status = __get_map(&map, &map_size, &desc_size);
-       if (status != EFI_SUCCESS)
-               goto fail;
-
-       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-again:
-       for (i = 0; i < map_size / desc_size; i++) {
-               efi_memory_desc_t *desc;
-               unsigned long m = (unsigned long)map;
-               u64 start, end;
-
-               desc = (efi_memory_desc_t *)(m + (i * desc_size));
-               if (desc->type != EFI_CONVENTIONAL_MEMORY)
-                       continue;
-
-               if (desc->num_pages < nr_pages)
-                       continue;
+#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
 
-               start = desc->phys_addr;
-               end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
 
-               if ((start + size) > end || (start + size) > max)
-                       continue;
-
-               if (end - size > max)
-                       end = max;
-
-               if (round_down(end - size, align) < start)
-                       continue;
-
-               start = round_down(end - size, align);
-
-               /*
-                * Don't allocate at 0x0. It will confuse code that
-                * checks pointers against NULL.
-                */
-               if (start == 0x0)
-                       continue;
-
-               if (start > max_addr)
-                       max_addr = start;
-       }
-
-       if (!max_addr)
-               status = EFI_NOT_FOUND;
-       else {
-               status = efi_call_phys4(sys_table->boottime->allocate_pages,
-                                       EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
-                                       nr_pages, &max_addr);
-               if (status != EFI_SUCCESS) {
-                       max = max_addr;
-                       max_addr = 0;
-                       goto again;
-               }
-
-               *addr = max_addr;
-       }
-
-free_pool:
-       efi_call_phys1(sys_table->boottime->free_pool, map);
-
-fail:
-       return status;
-}
-
-/*
- * Allocate at the lowest possible address.
- */
-static efi_status_t low_alloc(unsigned long size, unsigned long align,
-                             unsigned long *addr)
-{
-       unsigned long map_size, desc_size;
-       efi_memory_desc_t *map;
-       efi_status_t status;
-       unsigned long nr_pages;
-       int i;
-
-       status = __get_map(&map, &map_size, &desc_size);
-       if (status != EFI_SUCCESS)
-               goto fail;
-
-       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-       for (i = 0; i < map_size / desc_size; i++) {
-               efi_memory_desc_t *desc;
-               unsigned long m = (unsigned long)map;
-               u64 start, end;
-
-               desc = (efi_memory_desc_t *)(m + (i * desc_size));
-
-               if (desc->type != EFI_CONVENTIONAL_MEMORY)
-                       continue;
-
-               if (desc->num_pages < nr_pages)
-                       continue;
-
-               start = desc->phys_addr;
-               end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
-
-               /*
-                * Don't allocate at 0x0. It will confuse code that
-                * checks pointers against NULL. Skip the first 8
-                * bytes so we start at a nice even number.
-                */
-               if (start == 0x0)
-                       start += 8;
-
-               start = round_up(start, align);
-               if ((start + size) > end)
-                       continue;
-
-               status = efi_call_phys4(sys_table->boottime->allocate_pages,
-                                       EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
-                                       nr_pages, &start);
-               if (status == EFI_SUCCESS) {
-                       *addr = start;
-                       break;
-               }
-       }
-
-       if (i == map_size / desc_size)
-               status = EFI_NOT_FOUND;
-
-free_pool:
-       efi_call_phys1(sys_table->boottime->free_pool, map);
-fail:
-       return status;
-}
-
-static void low_free(unsigned long size, unsigned long addr)
-{
-       unsigned long nr_pages;
-
-       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-       efi_call_phys2(sys_table->boottime->free_pages, addr, nr_pages);
-}
 
 static void find_bits(unsigned long mask, u8 *pos, u8 *size)
 {
@@ -624,242 +420,6 @@ void setup_graphics(struct boot_params *boot_params)
        }
 }
 
-struct initrd {
-       efi_file_handle_t *handle;
-       u64 size;
-};
-
-/*
- * Check the cmdline for a LILO-style initrd= arguments.
- *
- * We only support loading an initrd from the same filesystem as the
- * kernel image.
- */
-static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
-                                   struct setup_header *hdr)
-{
-       struct initrd *initrds;
-       unsigned long initrd_addr;
-       efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
-       u64 initrd_total;
-       efi_file_io_interface_t *io;
-       efi_file_handle_t *fh;
-       efi_status_t status;
-       int nr_initrds;
-       char *str;
-       int i, j, k;
-
-       initrd_addr = 0;
-       initrd_total = 0;
-
-       str = (char *)(unsigned long)hdr->cmd_line_ptr;
-
-       j = 0;                  /* See close_handles */
-
-       if (!str || !*str)
-               return EFI_SUCCESS;
-
-       for (nr_initrds = 0; *str; nr_initrds++) {
-               str = strstr(str, "initrd=");
-               if (!str)
-                       break;
-
-               str += 7;
-
-               /* Skip any leading slashes */
-               while (*str == '/' || *str == '\\')
-                       str++;
-
-               while (*str && *str != ' ' && *str != '\n')
-                       str++;
-       }
-
-       if (!nr_initrds)
-               return EFI_SUCCESS;
-
-       status = efi_call_phys3(sys_table->boottime->allocate_pool,
-                               EFI_LOADER_DATA,
-                               nr_initrds * sizeof(*initrds),
-                               &initrds);
-       if (status != EFI_SUCCESS) {
-               efi_printk("Failed to alloc mem for initrds\n");
-               goto fail;
-       }
-
-       str = (char *)(unsigned long)hdr->cmd_line_ptr;
-       for (i = 0; i < nr_initrds; i++) {
-               struct initrd *initrd;
-               efi_file_handle_t *h;
-               efi_file_info_t *info;
-               efi_char16_t filename_16[256];
-               unsigned long info_sz;
-               efi_guid_t info_guid = EFI_FILE_INFO_ID;
-               efi_char16_t *p;
-               u64 file_sz;
-
-               str = strstr(str, "initrd=");
-               if (!str)
-                       break;
-
-               str += 7;
-
-               initrd = &initrds[i];
-               p = filename_16;
-
-               /* Skip any leading slashes */
-               while (*str == '/' || *str == '\\')
-                       str++;
-
-               while (*str && *str != ' ' && *str != '\n') {
-                       if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16))
-                               break;
-
-                       if (*str == '/') {
-                               *p++ = '\\';
-                               *str++;
-                       } else {
-                               *p++ = *str++;
-                       }
-               }
-
-               *p = '\0';
-
-               /* Only open the volume once. */
-               if (!i) {
-                       efi_boot_services_t *boottime;
-
-                       boottime = sys_table->boottime;
-
-                       status = efi_call_phys3(boottime->handle_protocol,
-                                       image->device_handle, &fs_proto, &io);
-                       if (status != EFI_SUCCESS) {
-                               efi_printk("Failed to handle fs_proto\n");
-                               goto free_initrds;
-                       }
-
-                       status = efi_call_phys2(io->open_volume, io, &fh);
-                       if (status != EFI_SUCCESS) {
-                               efi_printk("Failed to open volume\n");
-                               goto free_initrds;
-                       }
-               }
-
-               status = efi_call_phys5(fh->open, fh, &h, filename_16,
-                                       EFI_FILE_MODE_READ, (u64)0);
-               if (status != EFI_SUCCESS) {
-                       efi_printk("Failed to open initrd file: ");
-                       efi_char16_printk(filename_16);
-                       efi_printk("\n");
-                       goto close_handles;
-               }
-
-               initrd->handle = h;
-
-               info_sz = 0;
-               status = efi_call_phys4(h->get_info, h, &info_guid,
-                                       &info_sz, NULL);
-               if (status != EFI_BUFFER_TOO_SMALL) {
-                       efi_printk("Failed to get initrd info size\n");
-                       goto close_handles;
-               }
-
-grow:
-               status = efi_call_phys3(sys_table->boottime->allocate_pool,
-                                       EFI_LOADER_DATA, info_sz, &info);
-               if (status != EFI_SUCCESS) {
-                       efi_printk("Failed to alloc mem for initrd info\n");
-                       goto close_handles;
-               }
-
-               status = efi_call_phys4(h->get_info, h, &info_guid,
-                                       &info_sz, info);
-               if (status == EFI_BUFFER_TOO_SMALL) {
-                       efi_call_phys1(sys_table->boottime->free_pool, info);
-                       goto grow;
-               }
-
-               file_sz = info->file_size;
-               efi_call_phys1(sys_table->boottime->free_pool, info);
-
-               if (status != EFI_SUCCESS) {
-                       efi_printk("Failed to get initrd info\n");
-                       goto close_handles;
-               }
-
-               initrd->size = file_sz;
-               initrd_total += file_sz;
-       }
-
-       if (initrd_total) {
-               unsigned long addr;
-
-               /*
-                * Multiple initrd's need to be at consecutive
-                * addresses in memory, so allocate enough memory for
-                * all the initrd's.
-                */
-               status = high_alloc(initrd_total, 0x1000,
-                                  &initrd_addr, hdr->initrd_addr_max);
-               if (status != EFI_SUCCESS) {
-                       efi_printk("Failed to alloc highmem for initrds\n");
-                       goto close_handles;
-               }
-
-               /* We've run out of free low memory. */
-               if (initrd_addr > hdr->initrd_addr_max) {
-                       efi_printk("We've run out of free low memory\n");
-                       status = EFI_INVALID_PARAMETER;
-                       goto free_initrd_total;
-               }
-
-               addr = initrd_addr;
-               for (j = 0; j < nr_initrds; j++) {
-                       u64 size;
-
-                       size = initrds[j].size;
-                       while (size) {
-                               u64 chunksize;
-                               if (size > EFI_READ_CHUNK_SIZE)
-                                       chunksize = EFI_READ_CHUNK_SIZE;
-                               else
-                                       chunksize = size;
-                               status = efi_call_phys3(fh->read,
-                                                       initrds[j].handle,
-                                                       &chunksize, addr);
-                               if (status != EFI_SUCCESS) {
-                                       efi_printk("Failed to read initrd\n");
-                                       goto free_initrd_total;
-                               }
-                               addr += chunksize;
-                               size -= chunksize;
-                       }
-
-                       efi_call_phys1(fh->close, initrds[j].handle);
-               }
-
-       }
-
-       efi_call_phys1(sys_table->boottime->free_pool, initrds);
-
-       hdr->ramdisk_image = initrd_addr;
-       hdr->ramdisk_size = initrd_total;
-
-       return status;
-
-free_initrd_total:
-       low_free(initrd_total, initrd_addr);
-
-close_handles:
-       for (k = j; k < i; k++)
-               efi_call_phys1(fh->close, initrds[k].handle);
-free_initrds:
-       efi_call_phys1(sys_table->boottime->free_pool, initrds);
-fail:
-       hdr->ramdisk_image = 0;
-       hdr->ramdisk_size = 0;
-
-       return status;
-}
 
 /*
  * Because the x86 boot code expects to be passed a boot_params we
@@ -875,14 +435,15 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
        struct efi_info *efi;
        efi_loaded_image_t *image;
        void *options;
-       u32 load_options_size;
        efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
        int options_size = 0;
        efi_status_t status;
-       unsigned long cmdline;
+       char *cmdline_ptr;
        u16 *s2;
        u8 *s1;
        int i;
+       unsigned long ramdisk_addr;
+       unsigned long ramdisk_size;
 
        sys_table = _table;
 
@@ -893,13 +454,14 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
        status = efi_call_phys3(sys_table->boottime->handle_protocol,
                                handle, &proto, (void *)&image);
        if (status != EFI_SUCCESS) {
-               efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
+               efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
                return NULL;
        }
 
-       status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+       status = efi_low_alloc(sys_table, 0x4000, 1,
+                              (unsigned long *)&boot_params);
        if (status != EFI_SUCCESS) {
-               efi_printk("Failed to alloc lowmem for boot params\n");
+               efi_printk(sys_table, "Failed to alloc lowmem for boot params\n");
                return NULL;
        }
 
@@ -926,40 +488,11 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
        hdr->type_of_loader = 0x21;
 
        /* Convert unicode cmdline to ascii */
-       options = image->load_options;
-       load_options_size = image->load_options_size / 2; /* ASCII */
-       cmdline = 0;
-       s2 = (u16 *)options;
-
-       if (s2) {
-               while (*s2 && *s2 != '\n' && options_size < load_options_size) {
-                       s2++;
-                       options_size++;
-               }
-
-               if (options_size) {
-                       if (options_size > hdr->cmdline_size)
-                               options_size = hdr->cmdline_size;
-
-                       options_size++; /* NUL termination */
-
-                       status = low_alloc(options_size, 1, &cmdline);
-                       if (status != EFI_SUCCESS) {
-                               efi_printk("Failed to alloc mem for cmdline\n");
-                               goto fail;
-                       }
-
-                       s1 = (u8 *)(unsigned long)cmdline;
-                       s2 = (u16 *)options;
-
-                       for (i = 0; i < options_size - 1; i++)
-                               *s1++ = *s2++;
-
-                       *s1 = '\0';
-               }
-       }
-
-       hdr->cmd_line_ptr = cmdline;
+       cmdline_ptr = efi_convert_cmdline_to_ascii(sys_table, image,
+                                                  &options_size);
+       if (!cmdline_ptr)
+               goto fail;
+       hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
 
        hdr->ramdisk_image = 0;
        hdr->ramdisk_size = 0;
@@ -969,96 +502,66 @@ struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
 
        memset(sdt, 0, sizeof(*sdt));
 
-       status = handle_ramdisks(image, hdr);
+       status = handle_cmdline_files(sys_table, image,
+                                     (char *)(unsigned long)hdr->cmd_line_ptr,
+                                     "initrd=", hdr->initrd_addr_max,
+                                     &ramdisk_addr, &ramdisk_size);
        if (status != EFI_SUCCESS)
                goto fail2;
+       hdr->ramdisk_image = ramdisk_addr;
+       hdr->ramdisk_size = ramdisk_size;
 
        return boot_params;
 fail2:
-       if (options_size)
-               low_free(options_size, hdr->cmd_line_ptr);
+       efi_free(sys_table, options_size, hdr->cmd_line_ptr);
 fail:
-       low_free(0x4000, (unsigned long)boot_params);
+       efi_free(sys_table, 0x4000, (unsigned long)boot_params);
        return NULL;
 }
 
-static efi_status_t exit_boot(struct boot_params *boot_params,
-                             void *handle)
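+/* Chain a SETUP_E820_EXT node onto the end of the setup_data list. */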
+static void add_e820ext(struct boot_params *params,
+                       struct setup_data *e820ext, u32 nr_entries)
 {
-       struct efi_info *efi = &boot_params->efi_info;
-       struct e820entry *e820_map = &boot_params->e820_map[0];
-       struct e820entry *prev = NULL;
-       unsigned long size, key, desc_size, _size;
-       efi_memory_desc_t *mem_map;
+       struct setup_data *data;
        efi_status_t status;
-       __u32 desc_version;
-       bool called_exit = false;
-       u8 nr_entries;
-       int i;
-
-       size = sizeof(*mem_map) * 32;
-
-again:
-       size += sizeof(*mem_map) * 2;
-       _size = size;
-       status = low_alloc(size, 1, (unsigned long *)&mem_map);
-       if (status != EFI_SUCCESS)
-               return status;
-
-get_map:
-       status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
-                               mem_map, &key, &desc_size, &desc_version);
-       if (status == EFI_BUFFER_TOO_SMALL) {
-               low_free(_size, (unsigned long)mem_map);
-               goto again;
-       }
+       unsigned long size;
 
-       if (status != EFI_SUCCESS)
-               goto free_mem_map;
+       e820ext->type = SETUP_E820_EXT;
+       e820ext->len = nr_entries * sizeof(struct e820entry);
+       e820ext->next = 0;
 
-       memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
-       efi->efi_systab = (unsigned long)sys_table;
-       efi->efi_memdesc_size = desc_size;
-       efi->efi_memdesc_version = desc_version;
-       efi->efi_memmap = (unsigned long)mem_map;
-       efi->efi_memmap_size = size;
-
-#ifdef CONFIG_X86_64
-       efi->efi_systab_hi = (unsigned long)sys_table >> 32;
-       efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
-#endif
+       data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
 
-       /* Might as well exit boot services now */
-       status = efi_call_phys2(sys_table->boottime->exit_boot_services,
-                               handle, key);
-       if (status != EFI_SUCCESS) {
-               /*
-                * ExitBootServices() will fail if any of the event
-                * handlers change the memory map. In which case, we
-                * must be prepared to retry, but only once so that
-                * we're guaranteed to exit on repeated failures instead
-                * of spinning forever.
-                */
-               if (called_exit)
-                       goto free_mem_map;
+       while (data && data->next)
+               data = (struct setup_data *)(unsigned long)data->next;
 
-               called_exit = true;
-               goto get_map;
-       }
+       if (data)
+               data->next = (unsigned long)e820ext;
+       else
+               params->hdr.setup_data = (unsigned long)e820ext;
+}
 
-       /* Historic? */
-       boot_params->alt_mem_k = 32 * 1024;
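+/* Convert the EFI memory map to e820, spilling overflow into e820ext. */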
+static efi_status_t setup_e820(struct boot_params *params,
+                              struct setup_data *e820ext, u32 e820ext_size)
+{
+       struct e820entry *e820_map = &params->e820_map[0];
+       struct efi_info *efi = &params->efi_info;
+       struct e820entry *prev = NULL;
+       u32 nr_entries;
+       u32 nr_desc;
+       int i;
 
-       /*
-        * Convert the EFI memory map to E820.
-        */
        nr_entries = 0;
-       for (i = 0; i < size / desc_size; i++) {
+       nr_desc = efi->efi_memmap_size / efi->efi_memdesc_size;
+
+       for (i = 0; i < nr_desc; i++) {
                efi_memory_desc_t *d;
                unsigned int e820_type = 0;
-               unsigned long m = (unsigned long)mem_map;
+               unsigned long m = efi->efi_memmap;
 
-               d = (efi_memory_desc_t *)(m + (i * desc_size));
+               d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
                switch (d->type) {
                case EFI_RESERVED_TYPE:
                case EFI_RUNTIME_SERVICES_CODE:
@@ -1095,61 +596,152 @@ get_map:
 
                /* Merge adjacent mappings */
                if (prev && prev->type == e820_type &&
-                   (prev->addr + prev->size) == d->phys_addr)
+                   (prev->addr + prev->size) == d->phys_addr) {
                        prev->size += d->num_pages << 12;
-               else {
-                       e820_map->addr = d->phys_addr;
-                       e820_map->size = d->num_pages << 12;
-                       e820_map->type = e820_type;
-                       prev = e820_map++;
-                       nr_entries++;
+                       continue;
+               }
+
+               if (nr_entries == ARRAY_SIZE(params->e820_map)) {
+                       u32 need = (nr_desc - i) * sizeof(struct e820entry) +
+                                  sizeof(struct setup_data);
+
+                       if (!e820ext || e820ext_size < need)
+                               return EFI_BUFFER_TOO_SMALL;
+
+                       /* boot_params map full, switch to e820 extended */
+                       e820_map = (struct e820entry *)e820ext->data;
                }
+
+               e820_map->addr = d->phys_addr;
+               e820_map->size = d->num_pages << PAGE_SHIFT;
+               e820_map->type = e820_type;
+               prev = e820_map++;
+               nr_entries++;
        }
 
-       boot_params->e820_entries = nr_entries;
+       if (nr_entries > ARRAY_SIZE(params->e820_map)) {
+               u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_map);
+
+               add_e820ext(params, e820ext, nr_e820ext);
+               nr_entries -= nr_e820ext;
+       }
+
+       params->e820_entries = (u8)nr_entries;
 
        return EFI_SUCCESS;
+}
+
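+/* Size and allocate a setup_data buffer for nr_desc extended e820 entries. */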
+static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
+                                 u32 *e820ext_size)
+{
+       efi_status_t status;
+       unsigned long size;
+
+       size = sizeof(struct setup_data) +
+               sizeof(struct e820entry) * nr_desc;
+
+       if (*e820ext) {
+               efi_call_phys1(sys_table->boottime->free_pool, *e820ext);
+               *e820ext = NULL;
+               *e820ext_size = 0;
+       }
+
+       status = efi_call_phys3(sys_table->boottime->allocate_pool,
+                               EFI_LOADER_DATA, size, e820ext);
+
+       if (status == EFI_SUCCESS)
+               *e820ext_size = size;
 
-free_mem_map:
-       low_free(_size, (unsigned long)mem_map);
        return status;
 }
 
-static efi_status_t relocate_kernel(struct setup_header *hdr)
+static efi_status_t exit_boot(struct boot_params *boot_params,
+                             void *handle)
 {
-       unsigned long start, nr_pages;
+       struct efi_info *efi = &boot_params->efi_info;
+       unsigned long map_sz, key, desc_size;
+       efi_memory_desc_t *mem_map;
+       struct setup_data *e820ext;
+       __u32 e820ext_size;
+       __u32 nr_desc, prev_nr_desc;
        efi_status_t status;
+       __u32 desc_version;
+       bool called_exit = false;
+       u8 nr_entries;
+       int i;
 
-       /*
-        * The EFI firmware loader could have placed the kernel image
-        * anywhere in memory, but the kernel has various restrictions
-        * on the max physical address it can run at. Attempt to move
-        * the kernel to boot_params.pref_address, or as low as
-        * possible.
-        */
-       start = hdr->pref_address;
-       nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+       nr_desc = 0;
+       e820ext = NULL;
+       e820ext_size = 0;
 
-       status = efi_call_phys4(sys_table->boottime->allocate_pages,
-                               EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
-                               nr_pages, &start);
-       if (status != EFI_SUCCESS) {
-               status = low_alloc(hdr->init_size, hdr->kernel_alignment,
-                                  &start);
+get_map:
+       status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size,
+                                   &desc_version, &key);
+
+       if (status != EFI_SUCCESS)
+               return status;
+
+       prev_nr_desc = nr_desc;
+       nr_desc = map_sz / desc_size;
+       if (nr_desc > prev_nr_desc &&
+           nr_desc > ARRAY_SIZE(boot_params->e820_map)) {
+               u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map);
+
+               status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size);
                if (status != EFI_SUCCESS)
-                       efi_printk("Failed to alloc mem for kernel\n");
+                       goto free_mem_map;
+
+               efi_call_phys1(sys_table->boottime->free_pool, mem_map);
+               goto get_map; /* Allocated memory, get map again */
        }
 
-       if (status == EFI_SUCCESS)
-               memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
-                      hdr->init_size);
+       memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+       efi->efi_systab = (unsigned long)sys_table;
+       efi->efi_memdesc_size = desc_size;
+       efi->efi_memdesc_version = desc_version;
+       efi->efi_memmap = (unsigned long)mem_map;
+       efi->efi_memmap_size = map_sz;
+
+#ifdef CONFIG_X86_64
+       efi->efi_systab_hi = (unsigned long)sys_table >> 32;
+       efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+#endif
 
-       hdr->pref_address = hdr->code32_start;
-       hdr->code32_start = (__u32)start;
+       /* Might as well exit boot services now */
+       status = efi_call_phys2(sys_table->boottime->exit_boot_services,
+                               handle, key);
+       if (status != EFI_SUCCESS) {
+               /*
+                * ExitBootServices() will fail if any of the event
+                * handlers change the memory map. In which case, we
+                * must be prepared to retry, but only once so that
+                * we're guaranteed to exit on repeated failures instead
+                * of spinning forever.
+                */
+               if (called_exit)
+                       goto free_mem_map;
 
+               called_exit = true;
+               efi_call_phys1(sys_table->boottime->free_pool, mem_map);
+               goto get_map;
+       }
+
+       /* Historic? */
+       boot_params->alt_mem_k = 32 * 1024;
+
+       status = setup_e820(boot_params, e820ext, e820ext_size);
+       if (status != EFI_SUCCESS)
+               return status;
+
+       return EFI_SUCCESS;
+
+free_mem_map:
+       efi_call_phys1(sys_table->boottime->free_pool, mem_map);
        return status;
 }
 
+
 /*
  * On success we return a pointer to a boot_params structure, and NULL
  * on failure.
@@ -1157,7 +748,7 @@ static efi_status_t relocate_kernel(struct setup_header *hdr)
 struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
                             struct boot_params *boot_params)
 {
-       struct desc_ptr *gdt, *idt;
+       struct desc_ptr *gdt;
        efi_loaded_image_t *image;
        struct setup_header *hdr = &boot_params->hdr;
        efi_status_t status;
@@ -1177,37 +768,33 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
                                EFI_LOADER_DATA, sizeof(*gdt),
                                (void **)&gdt);
        if (status != EFI_SUCCESS) {
-               efi_printk("Failed to alloc mem for gdt structure\n");
+               efi_printk(sys_table, "Failed to alloc mem for gdt structure\n");
                goto fail;
        }
 
        gdt->size = 0x800;
-       status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
-       if (status != EFI_SUCCESS) {
-               efi_printk("Failed to alloc mem for gdt\n");
-               goto fail;
-       }
-
-       status = efi_call_phys3(sys_table->boottime->allocate_pool,
-                               EFI_LOADER_DATA, sizeof(*idt),
-                               (void **)&idt);
+       status = efi_low_alloc(sys_table, gdt->size, 8,
+                          (unsigned long *)&gdt->address);
        if (status != EFI_SUCCESS) {
-               efi_printk("Failed to alloc mem for idt structure\n");
+               efi_printk(sys_table, "Failed to alloc mem for gdt\n");
                goto fail;
        }
 
-       idt->size = 0;
-       idt->address = 0;
-
        /*
         * If the kernel isn't already loaded at the preferred load
         * address, relocate it.
         */
        if (hdr->pref_address != hdr->code32_start) {
-               status = relocate_kernel(hdr);
-
+               unsigned long bzimage_addr = hdr->code32_start;
+               status = efi_relocate_kernel(sys_table, &bzimage_addr,
+                                            hdr->init_size, hdr->init_size,
+                                            hdr->pref_address,
+                                            hdr->kernel_alignment);
                if (status != EFI_SUCCESS)
                        goto fail;
+
+               hdr->pref_address = hdr->code32_start;
+               hdr->code32_start = bzimage_addr;
        }
 
        status = exit_boot(boot_params, handle);
@@ -1267,10 +854,8 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
        desc->base2 = 0x00;
 #endif /* CONFIG_X86_64 */
 
-       asm volatile ("lidt %0" : : "m" (*idt));
-       asm volatile ("lgdt %0" : : "m" (*gdt));
-
        asm volatile("cli");
+       asm volatile ("lgdt %0" : : "m" (*gdt));
 
        return boot_params;
 fail:
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index e5b0a8f91c5f129b556398b299886c1760a846ff..81b6b652b46a948440601964e4e0f114563f4f5d 100644 (file)
@@ -11,9 +11,6 @@
 
 #define DESC_TYPE_CODE_DATA    (1 << 0)
 
-#define EFI_PAGE_SIZE          (1UL << EFI_PAGE_SHIFT)
-#define EFI_READ_CHUNK_SIZE    (1024 * 1024)
-
 #define EFI_CONSOLE_OUT_DEVICE_GUID    \
        EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \
                  0x3f, 0xc1, 0x4d)
@@ -62,10 +59,4 @@ struct efi_uga_draw_protocol {
        void *blt;
 };
 
-struct efi_simple_text_output_protocol {
-       void *reset;
-       void *output_string;
-       void *test_string;
-};
-
 #endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 5d6f6891b1881ee812c060bc730915d73d4d7f65..9116aac232c746ae4f5262508fba437db9fbb431 100644 (file)
@@ -117,9 +117,11 @@ preferred_addr:
        addl    %eax, %ebx
        notl    %eax
        andl    %eax, %ebx
-#else
-       movl    $LOAD_PHYSICAL_ADDR, %ebx
+       cmpl    $LOAD_PHYSICAL_ADDR, %ebx
+       jge     1f
 #endif
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
+1:
 
        /* Target address to relocate to for decompression */
        addl    $z_extract_offset, %ebx
@@ -191,14 +193,14 @@ relocated:
        leal    boot_heap(%ebx), %eax
        pushl   %eax            /* heap area */
        pushl   %esi            /* real mode pointer */
-       call    decompress_kernel
+       call    decompress_kernel /* returns kernel location in %eax */
        addl    $24, %esp
 
 /*
  * Jump to the decompressed kernel.
  */
        xorl    %ebx, %ebx
-       jmp     *%ebp
+       jmp     *%eax
 
 /*
  * Stack and heap for uncompression
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index c337422b575de190280334ef5cd61214441644f9..c5c1ae0997e7b223128b009a220b899e22ffeb25 100644 (file)
@@ -94,9 +94,11 @@ ENTRY(startup_32)
        addl    %eax, %ebx
        notl    %eax
        andl    %eax, %ebx
-#else
-       movl    $LOAD_PHYSICAL_ADDR, %ebx
+       cmpl    $LOAD_PHYSICAL_ADDR, %ebx
+       jge     1f
 #endif
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
+1:
 
        /* Target address to relocate to for decompression */
        addl    $z_extract_offset, %ebx
@@ -269,9 +271,11 @@ preferred_addr:
        addq    %rax, %rbp
        notq    %rax
        andq    %rax, %rbp
-#else
-       movq    $LOAD_PHYSICAL_ADDR, %rbp
+       cmpq    $LOAD_PHYSICAL_ADDR, %rbp
+       jge     1f
 #endif
+       movq    $LOAD_PHYSICAL_ADDR, %rbp
+1:
 
        /* Target address to relocate to for decompression */
        leaq    z_extract_offset(%rbp), %rbx
@@ -339,13 +343,13 @@ relocated:
        movl    $z_input_len, %ecx      /* input_len */
        movq    %rbp, %r8               /* output target address */
        movq    $z_output_len, %r9      /* decompressed length */
-       call    decompress_kernel
+       call    decompress_kernel       /* returns kernel location in %rax */
        popq    %rsi
 
 /*
  * Jump to the decompressed kernel.
  */
-       jmp     *%rbp
+       jmp     *%rax
 
        .code32
 no_longmode:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 434f077d2c4d7b664216e6adaa22ac44ea51786a..196eaf373a06ad89c11bc9133a5c97b0e5ebc6d9 100644 (file)
@@ -112,14 +112,8 @@ struct boot_params *real_mode;             /* Pointer to real-mode data */
 void *memset(void *s, int c, size_t n);
 void *memcpy(void *dest, const void *src, size_t n);
 
-#ifdef CONFIG_X86_64
-#define memptr long
-#else
-#define memptr unsigned
-#endif
-
-static memptr free_mem_ptr;
-static memptr free_mem_end_ptr;
+memptr free_mem_ptr;
+memptr free_mem_end_ptr;
 
 static char *vidmem;
 static int vidport;
@@ -395,7 +389,7 @@ static void parse_elf(void *output)
        free(phdrs);
 }
 
-asmlinkage void decompress_kernel(void *rmode, memptr heap,
+asmlinkage void *decompress_kernel(void *rmode, memptr heap,
                                  unsigned char *input_data,
                                  unsigned long input_len,
                                  unsigned char *output,
@@ -422,6 +416,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
        free_mem_ptr     = heap;        /* Heap */
        free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+       output = choose_kernel_location(input_data, input_len,
+                                       output, output_len);
+
+       /* Validate memory location choices. */
        if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
                error("Destination address inappropriately aligned");
 #ifdef CONFIG_X86_64
@@ -441,5 +439,5 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
        parse_elf(output);
        handle_relocations(output, output_len);
        debug_putstr("done.\nBooting the kernel.\n");
-       return;
+       return output;
 }
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 674019d8e2355901b69de5c55cfb92669c5628be..24e3e569a13ce9bf1ba620811ff4b09d6fa6e32d 100644 (file)
 #define BOOT_BOOT_H
 #include "../ctype.h"
 
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
 /* misc.c */
+extern memptr free_mem_ptr;
+extern memptr free_mem_end_ptr;
 extern struct boot_params *real_mode;          /* Pointer to real-mode data */
 void __putstr(const char *s);
 #define error_putstr(__x)  __putstr(__x)
@@ -39,23 +47,40 @@ static inline void debug_putstr(const char *s)
 
 #endif
 
-#ifdef CONFIG_EARLY_PRINTK
-
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
+#endif
 
-/* early_serial_console.c */
-extern int early_serial_base;
-void console_init(void);
 
+#if CONFIG_RANDOMIZE_BASE
+/* aslr.c */
+unsigned char *choose_kernel_location(unsigned char *input,
+                                     unsigned long input_size,
+                                     unsigned char *output,
+                                     unsigned long output_size);
+/* cpuflags.c */
+bool has_cpuflag(int flag);
 #else
+static inline
+unsigned char *choose_kernel_location(unsigned char *input,
+                                     unsigned long input_size,
+                                     unsigned char *output,
+                                     unsigned long output_size)
+{
+       return output;
+}
+#endif
 
+#ifdef CONFIG_EARLY_PRINTK
 /* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+#else
 static const int early_serial_base;
 static inline void console_init(void)
 { }
-
 #endif
 
 #endif
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 958a641483dd5fa3817eebae63170da86e36e2b1..b669ab65bf6cf2e6973d6dc50cb693a1a8568324 100644 (file)
@@ -36,11 +36,12 @@ int main(int argc, char *argv[])
        uint32_t olen;
        long ilen;
        unsigned long offs;
-       FILE *f;
+       FILE *f = NULL;
+       int retval = 1;
 
        if (argc < 2) {
                fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
-               return 1;
+               goto bail;
        }
 
        /* Get the information for the compressed kernel image first */
@@ -48,7 +49,7 @@ int main(int argc, char *argv[])
        f = fopen(argv[1], "r");
        if (!f) {
                perror(argv[1]);
-               return 1;
+               goto bail;
        }
 
 
@@ -58,12 +59,11 @@ int main(int argc, char *argv[])
 
        if (fread(&olen, sizeof(olen), 1, f) != 1) {
                perror(argv[1]);
-               return 1;
+               goto bail;
        }
 
        ilen = ftell(f);
        olen = get_unaligned_le32(&olen);
-       fclose(f);
 
        /*
         * Now we have the input (compressed) and output (uncompressed)
@@ -91,5 +91,9 @@ int main(int argc, char *argv[])
        printf(".incbin \"%s\"\n", argv[1]);
        printf("input_data_end:\n");
 
-       return 0;
+       retval = 0;
+bail:
+       if (f)
+               fclose(f);
+       return retval;
 }
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4d3ff037201ffbee0bb468e04d069b02909d094d..100a9a10076a649e7e7008a3579867391ca16fa4 100644 (file)
@@ -28,8 +28,6 @@
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
 
-struct cpu_features cpu;
-static u32 cpu_vendor[3];
 static u32 err_flags[NCAPINTS];
 
 static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
@@ -69,92 +67,8 @@ static int is_transmeta(void)
               cpu_vendor[2] == A32('M', 'x', '8', '6');
 }
 
-static int has_fpu(void)
-{
-       u16 fcw = -1, fsw = -1;
-       u32 cr0;
-
-       asm("movl %%cr0,%0" : "=r" (cr0));
-       if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
-               cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
-               asm volatile("movl %0,%%cr0" : : "r" (cr0));
-       }
-
-       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-                    : "+m" (fsw), "+m" (fcw));
-
-       return fsw == 0 && (fcw & 0x103f) == 0x003f;
-}
-
-static int has_eflag(u32 mask)
-{
-       u32 f0, f1;
-
-       asm("pushfl ; "
-           "pushfl ; "
-           "popl %0 ; "
-           "movl %0,%1 ; "
-           "xorl %2,%1 ; "
-           "pushl %1 ; "
-           "popfl ; "
-           "pushfl ; "
-           "popl %1 ; "
-           "popfl"
-           : "=&r" (f0), "=&r" (f1)
-           : "ri" (mask));
-
-       return !!((f0^f1) & mask);
-}
-
-static void get_flags(void)
-{
-       u32 max_intel_level, max_amd_level;
-       u32 tfms;
-
-       if (has_fpu())
-               set_bit(X86_FEATURE_FPU, cpu.flags);
-
-       if (has_eflag(X86_EFLAGS_ID)) {
-               asm("cpuid"
-                   : "=a" (max_intel_level),
-                     "=b" (cpu_vendor[0]),
-                     "=d" (cpu_vendor[1]),
-                     "=c" (cpu_vendor[2])
-                   : "a" (0));
-
-               if (max_intel_level >= 0x00000001 &&
-                   max_intel_level <= 0x0000ffff) {
-                       asm("cpuid"
-                           : "=a" (tfms),
-                             "=c" (cpu.flags[4]),
-                             "=d" (cpu.flags[0])
-                           : "a" (0x00000001)
-                           : "ebx");
-                       cpu.level = (tfms >> 8) & 15;
-                       cpu.model = (tfms >> 4) & 15;
-                       if (cpu.level >= 6)
-                               cpu.model += ((tfms >> 16) & 0xf) << 4;
-               }
-
-               asm("cpuid"
-                   : "=a" (max_amd_level)
-                   : "a" (0x80000000)
-                   : "ebx", "ecx", "edx");
-
-               if (max_amd_level >= 0x80000001 &&
-                   max_amd_level <= 0x8000ffff) {
-                       u32 eax = 0x80000001;
-                       asm("cpuid"
-                           : "+a" (eax),
-                             "=c" (cpu.flags[6]),
-                             "=d" (cpu.flags[1])
-                           : : "ebx");
-               }
-       }
-}
-
 /* Returns a bitmask of which words we have error bits in */
-static int check_flags(void)
+static int check_cpuflags(void)
 {
        u32 err;
        int i;
@@ -187,8 +101,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
        if (has_eflag(X86_EFLAGS_AC))
                cpu.level = 4;
 
-       get_flags();
-       err = check_flags();
+       get_cpuflags();
+       err = check_cpuflags();
 
        if (test_bit(X86_FEATURE_LM, cpu.flags))
                cpu.level = 64;
@@ -207,8 +121,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
                eax &= ~(1 << 15);
                asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
-               get_flags();    /* Make sure it really did something */
-               err = check_flags();
+               get_cpuflags(); /* Make sure it really did something */
+               err = check_cpuflags();
        } else if (err == 0x01 &&
                   !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
                   is_centaur() && cpu.model >= 6) {
@@ -223,7 +137,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
                asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
                set_bit(X86_FEATURE_CX8, cpu.flags);
-               err = check_flags();
+               err = check_cpuflags();
        } else if (err == 0x01 && is_transmeta()) {
                /* Transmeta might have masked feature bits in word 0 */
 
@@ -238,7 +152,7 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
                    : : "ecx", "ebx");
                asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
 
-               err = check_flags();
+               err = check_cpuflags();
        }
 
        if (err_flags_ptr)
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644 (file)
index 0000000..a9fcb7c
--- /dev/null
@@ -0,0 +1,107 @@
+#include <linux/types.h>
+#include "bitops.h"
+
+#include <asm/processor-flags.h>
+#include <asm/required-features.h>
+#include <asm/msr-index.h>
+#include "cpuflags.h"
+
+struct cpu_features cpu;
+u32 cpu_vendor[3];
+
+static bool loaded_flags;
+
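+/* Detect an FPU by clearing CR0.EM/TS and checking fninit's default state. */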
+static int has_fpu(void)
+{
+       u16 fcw = -1, fsw = -1;
+       unsigned long cr0;
+
+       asm volatile("mov %%cr0,%0" : "=r" (cr0));
+       if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
+               cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
+               asm volatile("mov %0,%%cr0" : : "r" (cr0));
+       }
+
+       asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+                    : "+m" (fsw), "+m" (fcw));
+
+       return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
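+/* Return whether the given EFLAGS bit can be toggled (e.g. X86_EFLAGS_ID). */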
+int has_eflag(unsigned long mask)
+{
+       unsigned long f0, f1;
+
+       asm volatile("pushf     \n\t"
+                    "pushf     \n\t"
+                    "pop %0    \n\t"
+                    "mov %0,%1 \n\t"
+                    "xor %2,%1 \n\t"
+                    "push %1   \n\t"
+                    "popf      \n\t"
+                    "pushf     \n\t"
+                    "pop %1    \n\t"
+                    "popf"
+                    : "=&r" (f0), "=&r" (f1)
+                    : "ri" (mask));
+
+       return !!((f0^f1) & mask);
+}
+
+/* Handle x86_32 PIC using ebx. */
+#if defined(__i386__) && defined(__PIC__)
+# define EBX_REG "=r"
+#else
+# define EBX_REG "=b"
+#endif
+
+static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+       asm volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif  \n\t"
+                    "cpuid                                     \n\t"
+                    ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif  \n\t"
+                   : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
+                   : "a" (id)
+       );
+}
+
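+/* Populate 'cpu' (level, model, feature words); cached after the first call. */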
+void get_cpuflags(void)
+{
+       u32 max_intel_level, max_amd_level;
+       u32 tfms;
+       u32 ignored;
+
+       if (loaded_flags)
+               return;
+       loaded_flags = true;
+
+       if (has_fpu())
+               set_bit(X86_FEATURE_FPU, cpu.flags);
+
+       if (has_eflag(X86_EFLAGS_ID)) {
+               cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
+                     &cpu_vendor[1]);
+
+               if (max_intel_level >= 0x00000001 &&
+                   max_intel_level <= 0x0000ffff) {
+                       cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
+                             &cpu.flags[0]);
+                       cpu.level = (tfms >> 8) & 15;
+                       cpu.model = (tfms >> 4) & 15;
+                       if (cpu.level >= 6)
+                               cpu.model += ((tfms >> 16) & 0xf) << 4;
+               }
+
+               cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
+                     &ignored);
+
+               if (max_amd_level >= 0x80000001 &&
+                   max_amd_level <= 0x8000ffff) {
+                       cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
+                             &cpu.flags[1]);
+               }
+       }
+}
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644 (file)
index 0000000..ea97697
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef BOOT_CPUFLAGS_H
+#define BOOT_CPUFLAGS_H
+
+#include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
+
+struct cpu_features {
+       int level;              /* Family, or 64 for x86-64 */
+       int model;
+       u32 flags[NCAPINTS];
+};
+
+extern struct cpu_features cpu;
+extern u32 cpu_vendor[3];
+
+int has_eflag(unsigned long mask);
+void get_cpuflags(void);
+
+#endif /* BOOT_CPUFLAGS_H */
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index c941d6a8887f8099cb31edabd9a6be80d11b8a1d..8e15b22391fc40ad9e0a981ef3176d76643e7c45 100644 (file)
@@ -5,14 +5,15 @@
  */
 
 /*
- * This file builds a disk-image from two different files:
+ * This file builds a disk-image from three different files:
  *
  * - setup: 8086 machine code, sets up system parm
  * - system: 80386 code for actual system
+ * - zoffset.h: header with ZO_* defines
  *
- * It does some checking that all files are of the correct type, and
- * just writes the result to stdout, removing headers and padding to
- * the right amount. It also writes some system data to stderr.
+ * It does some checking that all files are of the correct type, and writes
+ * the result to the specified destination, removing headers and padding to
+ * the right amount. It also writes some system data to stdout.
  */
 
 /*
@@ -136,7 +137,7 @@ static void die(const char * str, ...)
 
 static void usage(void)
 {
-       die("Usage: build setup system [zoffset.h] [> image]");
+       die("Usage: build setup system zoffset.h image");
 }
 
 #ifdef CONFIG_EFI_STUB
@@ -265,7 +266,7 @@ int main(int argc, char ** argv)
        int c;
        u32 sys_size;
        struct stat sb;
-       FILE *file;
+       FILE *file, *dest;
        int fd;
        void *kernel;
        u32 crc = 0xffffffffUL;
@@ -280,10 +281,13 @@ int main(int argc, char ** argv)
        startup_64 = 0x200;
 #endif
 
-       if (argc == 4)
-               parse_zoffset(argv[3]);
-       else if (argc != 3)
+       if (argc != 5)
                usage();
+       parse_zoffset(argv[3]);
+
+       dest = fopen(argv[4], "w");
+       if (!dest)
+               die("Unable to write `%s': %m", argv[4]);
 
        /* Copy the setup code */
        file = fopen(argv[1], "r");
@@ -318,7 +322,7 @@ int main(int argc, char ** argv)
        /* Set the default root device */
        put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
 
-       fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
+       printf("Setup is %d bytes (padded to %d bytes).\n", c, i);
 
        /* Open and stat the kernel file */
        fd = open(argv[2], O_RDONLY);
@@ -327,7 +331,7 @@ int main(int argc, char ** argv)
        if (fstat(fd, &sb))
                die("Unable to stat `%s': %m", argv[2]);
        sz = sb.st_size;
-       fprintf (stderr, "System is %d kB\n", (sz+1023)/1024);
+       printf("System is %d kB\n", (sz+1023)/1024);
        kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
        if (kernel == MAP_FAILED)
                die("Unable to mmap '%s': %m", argv[2]);
@@ -348,27 +352,31 @@ int main(int argc, char ** argv)
 #endif
 
        crc = partial_crc32(buf, i, crc);
-       if (fwrite(buf, 1, i, stdout) != i)
+       if (fwrite(buf, 1, i, dest) != i)
                die("Writing setup failed");
 
        /* Copy the kernel code */
        crc = partial_crc32(kernel, sz, crc);
-       if (fwrite(kernel, 1, sz, stdout) != sz)
+       if (fwrite(kernel, 1, sz, dest) != sz)
                die("Writing kernel failed");
 
        /* Add padding leaving 4 bytes for the checksum */
        while (sz++ < (sys_size*16) - 4) {
                crc = partial_crc32_one('\0', crc);
-               if (fwrite("\0", 1, 1, stdout) != 1)
+               if (fwrite("\0", 1, 1, dest) != 1)
                        die("Writing padding failed");
        }
 
        /* Write the CRC */
-       fprintf(stderr, "CRC %x\n", crc);
+       printf("CRC %x\n", crc);
        put_unaligned_le32(crc, buf);
-       if (fwrite(buf, 1, 4, stdout) != 4)
+       if (fwrite(buf, 1, 4, dest) != 4)
                die("Writing CRC failed");
 
+       /* Catch any delayed write failures */
+       if (fclose(dest))
+               die("Writing image failed");
+
        close(fd);
 
        /* Everything is OK */
index 0d9ec770f2f8770f8e5a320300ea47f3bf39b9bb..e6a92455740e6a5dfc64c5b1a1193a693594feab 100644 (file)
 
 #ifdef CONFIG_ARCH_RANDOM
 
+/* Use this instead of arch_get_random_long() before alternatives have run. */
+static inline int rdrand_long(unsigned long *v)
+{
+       int ok;
+       asm volatile("1: " RDRAND_LONG "\n\t"
+                    "jc 2f\n\t"
+                    "decl %0\n\t"
+                    "jnz 1b\n\t"
+                    "2:"
+                    : "=r" (ok), "=a" (*v)
+                    : "0" (RDRAND_RETRY_LOOPS));
+       return ok;
+}
+
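
The retry loop above reads as the following C-level sketch (illustration only; rdrand_step() is a hypothetical stand-in for the RDRAND_LONG instruction plus its carry-flag test, and the real code must stay in asm so it works before alternatives patching):

        int ok = RDRAND_RETRY_LOOPS;
        do {
                if (rdrand_step(v))     /* CF=1: *v holds a random value */
                        break;          /* the "jc 2f" path */
        } while (--ok);                 /* "decl %0; jnz 1b" */
        return ok;                      /* non-zero iff RDRAND succeeded */
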
 #define GET_RANDOM(name, type, rdrand, nop)                    \
 static inline int name(type *v)                                        \
 {                                                              \
@@ -68,6 +82,13 @@ GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3);
 
 #endif /* CONFIG_X86_64 */
 
+#else
+
+static inline int rdrand_long(unsigned long *v)
+{
+       return 0;
+}
+
 #endif  /* CONFIG_ARCH_RANDOM */
 
 extern void x86_init_rdrand(struct cpuinfo_x86 *c);
index 722aa3b0462448fc5442cd4c72a485fc23cbf5f8..da31c8b8a92ddd0459bc1b6a0a7058f4d1f93766 100644 (file)
@@ -6,6 +6,7 @@
 #include <asm/processor.h>
 #include <asm/alternative.h>
 #include <asm/cmpxchg.h>
+#include <asm/rmwcc.h>
 
 /*
  * Atomic operations that C can't guarantee us.  Useful for
@@ -76,12 +77,7 @@ static inline void atomic_sub(int i, atomic_t *v)
  */
 static inline int atomic_sub_and_test(int i, atomic_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
-                    : "+m" (v->counter), "=qm" (c)
-                    : "ir" (i) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, i, "%0", "e");
 }
 
 /**
@@ -118,12 +114,7 @@ static inline void atomic_dec(atomic_t *v)
  */
 static inline int atomic_dec_and_test(atomic_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "decl %0; sete %1"
-                    : "+m" (v->counter), "=qm" (c)
-                    : : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
 }
 
 /**
@@ -136,12 +127,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
  */
 static inline int atomic_inc_and_test(atomic_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "incl %0; sete %1"
-                    : "+m" (v->counter), "=qm" (c)
-                    : : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
 }
 
 /**
@@ -155,12 +141,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
  */
 static inline int atomic_add_negative(int i, atomic_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
-                    : "+m" (v->counter), "=qm" (c)
-                    : "ir" (i) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, i, "%0", "s");
 }
 
 /**
index 0e1cbfc8ee0633267969cb6ff7b2050607b922c1..3f065c985aeed272d120e3cec19f04b9ee9f5254 100644 (file)
@@ -72,12 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
  */
 static inline int atomic64_sub_and_test(long i, atomic64_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
-                    : "=m" (v->counter), "=qm" (c)
-                    : "er" (i), "m" (v->counter) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, i, "%0", "e");
 }
 
 /**
@@ -116,12 +111,7 @@ static inline void atomic64_dec(atomic64_t *v)
  */
 static inline int atomic64_dec_and_test(atomic64_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "decq %0; sete %1"
-                    : "=m" (v->counter), "=qm" (c)
-                    : "m" (v->counter) : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e");
 }
 
 /**
@@ -134,12 +124,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v)
  */
 static inline int atomic64_inc_and_test(atomic64_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "incq %0; sete %1"
-                    : "=m" (v->counter), "=qm" (c)
-                    : "m" (v->counter) : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e");
 }
 
 /**
@@ -153,12 +138,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v)
  */
 static inline int atomic64_add_negative(long i, atomic64_t *v)
 {
-       unsigned char c;
-
-       asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
-                    : "=m" (v->counter), "=qm" (c)
-                    : "er" (i), "m" (v->counter) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, i, "%0", "s");
 }
 
 /**
index 41639ce8fd634a4e8436a062006b763702749fc1..6d76d093598930ce24476118fd68d690bb78035c 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/compiler.h>
 #include <asm/alternative.h>
+#include <asm/rmwcc.h>
 
 #if BITS_PER_LONG == 32
 # define _BITOPS_LONG_SHIFT 5
@@ -204,12 +205,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
  */
 static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
 {
-       int oldbit;
-
-       asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
-                    "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
-
-       return oldbit;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, nr, "%0", "c");
 }
 
 /**
@@ -255,13 +251,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
  */
 static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
 {
-       int oldbit;
-
-       asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
-                    "sbb %0,%0"
-                    : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
-
-       return oldbit;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, nr, "%0", "c");
 }
 
 /**
@@ -314,13 +304,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr)
  */
 static inline int test_and_change_bit(long nr, volatile unsigned long *addr)
 {
-       int oldbit;
-
-       asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
-                    "sbb %0,%0"
-                    : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
-
-       return oldbit;
+       GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, nr, "%0", "c");
 }
 
 static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr)
index 0fa67503391257d89d1b55016c07e97fba7fa667..cb4c73bfeb48169056001d1c8426492a3ad4af79 100644 (file)
@@ -48,6 +48,8 @@ For 32-bit we have the following conventions - kernel is built with
 
 #include <asm/dwarf2.h>
 
+#ifdef CONFIG_X86_64
+
 /*
  * 64-bit system call stack frame layout defines and helpers,
  * for assembly code:
@@ -192,3 +194,51 @@ For 32-bit we have the following conventions - kernel is built with
        .macro icebp
        .byte 0xf1
        .endm
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * For 32-bit, simplified versions of SAVE_ALL/RESTORE_ALL. These differ
+ * from the entry_32.S versions in that they do not change the segment
+ * registers, so they are only suitable for in-kernel use, not when
+ * transitioning from or to user space. The resulting stack frame is not
+ * a standard pt_regs frame. The main use case is calling C code from
+ * assembler when all the registers need to be preserved.
+ */
+
+       .macro SAVE_ALL
+       pushl_cfi %eax
+       CFI_REL_OFFSET eax, 0
+       pushl_cfi %ebp
+       CFI_REL_OFFSET ebp, 0
+       pushl_cfi %edi
+       CFI_REL_OFFSET edi, 0
+       pushl_cfi %esi
+       CFI_REL_OFFSET esi, 0
+       pushl_cfi %edx
+       CFI_REL_OFFSET edx, 0
+       pushl_cfi %ecx
+       CFI_REL_OFFSET ecx, 0
+       pushl_cfi %ebx
+       CFI_REL_OFFSET ebx, 0
+       .endm
+
+       .macro RESTORE_ALL
+       popl_cfi %ebx
+       CFI_RESTORE ebx
+       popl_cfi %ecx
+       CFI_RESTORE ecx
+       popl_cfi %edx
+       CFI_RESTORE edx
+       popl_cfi %esi
+       CFI_RESTORE esi
+       popl_cfi %edi
+       CFI_RESTORE edi
+       popl_cfi %ebp
+       CFI_RESTORE ebp
+       popl_cfi %eax
+       CFI_RESTORE eax
+       .endm
+
+#endif /* CONFIG_X86_64 */
+
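
A minimal usage sketch for the 32-bit macros, with a hypothetical C hook as the callee (illustration only):

        SAVE_ALL                /* preserve all GPRs, including caller-saved */
        call do_trace_hook      /* hypothetical C function; may clobber eax/ecx/edx */
        RESTORE_ALL             /* pop the GPRs back in reverse order */
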
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
new file mode 100644 (file)
index 0000000..459769d
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+ * intel-mid.h: Intel MID specific setup code
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _ASM_X86_INTEL_MID_H
+#define _ASM_X86_INTEL_MID_H
+
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+
+extern int intel_mid_pci_init(void);
+extern int get_gpio_by_name(const char *name);
+extern void intel_scu_device_register(struct platform_device *pdev);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mtmr(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];
+
+/*
+ * This defines the array of device platform data that the IAFW firmware
+ * exports through the SFI "DEVS" table; the name and type fields are used
+ * to match a device with its platform data.
+ */
+struct devs_id {
+       char name[SFI_NAME_LEN + 1];
+       u8 type;
+       u8 delay;
+       void *(*get_platform_data)(void *info);
+       /* Custom handler for devices */
+       void (*device_handler)(struct sfi_device_table_entry *pentry,
+                               struct devs_id *dev);
+};
+
+#define sfi_device(i)   \
+       static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \
+       __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i
+
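
A minimal sketch of how a platform file would hook into this table; the "accel" name and the callback are invented for illustration, while SFI_DEV_TYPE_I2C comes from linux/sfi.h:

        static void *accel_platform_data(void *info)
        {
                /* would build and return a driver-specific pdata struct */
                return NULL;
        }

        static const struct devs_id accel_dev_id __initconst = {
                .name = "accel",
                .type = SFI_DEV_TYPE_I2C,
                .delay = 0,
                .get_platform_data = &accel_platform_data,
        };

        sfi_device(accel_dev_id);
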
+/*
+ * Medfield is the follow-up to Moorestown; it combines the two-chip
+ * solution into one and also adds always-on, constant TSC and LAPIC
+ * timers. Medfield is the platform name and Penwell is the chip name;
+ * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
+ * identified via MSRs.
+ */
+enum intel_mid_cpu_type {
+       /* 1 was Moorestown */
+       INTEL_MID_CPU_CHIP_PENWELL = 2,
+};
+
+extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
+
+#ifdef CONFIG_X86_INTEL_MID
+
+static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
+{
+       return __intel_mid_cpu_chip;
+}
+
+static inline bool intel_mid_has_msic(void)
+{
+       return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL);
+}
+
+#else /* !CONFIG_X86_INTEL_MID */
+
+#define intel_mid_identify_cpu()    (0)
+#define intel_mid_has_msic()    (0)
+
+#endif /* !CONFIG_X86_INTEL_MID */
+
+enum intel_mid_timer_options {
+       INTEL_MID_TIMER_DEFAULT,
+       INTEL_MID_TIMER_APBT_ONLY,
+       INTEL_MID_TIMER_LAPIC_APBT,
+};
+
+extern enum intel_mid_timer_options intel_mid_timer_options;
+
+/*
+ * Penwell uses a spread-spectrum clock, so the frequency is not exactly
+ * the value the MSR reports according to the SDM.
+ */
+#define PENWELL_FSB_FREQ_83SKU         83200
+#define PENWELL_FSB_FREQ_100SKU        99840
+
+#define SFI_MTMR_MAX_NUM 8
+#define SFI_MRTC_MAX   8
+
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(const char *);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ       (1024)
+/*#define MRST_VRTC_PGOFFSET   (0xc00) */
+
+extern void intel_mid_rtc_init(void);
+
+/* the offset for the mapping of global gpio pin to irq */
+#define INTEL_MID_IRQ_OFFSET 0x100
+
+#endif /* _ASM_X86_INTEL_MID_H */
similarity index 81%
rename from arch/x86/include/asm/mrst-vrtc.h
rename to arch/x86/include/asm/intel_mid_vrtc.h
index 1e69a75412a4038ae335897515beadc271235850..86ff4685c4095cd9071832139c4534dfd1186c91 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _MRST_VRTC_H
-#define _MRST_VRTC_H
+#ifndef _INTEL_MID_VRTC_H
+#define _INTEL_MID_VRTC_H
 
 extern unsigned char vrtc_cmos_read(unsigned char reg);
 extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
index 2d89e3980cbdbab0f76f7801e07d6be2d8ffb48a..5b23e605e707cb21e254f7ce964029a48b69ee54 100644 (file)
@@ -52,12 +52,7 @@ static inline void local_sub(long i, local_t *l)
  */
 static inline int local_sub_and_test(long i, local_t *l)
 {
-       unsigned char c;
-
-       asm volatile(_ASM_SUB "%2,%0; sete %1"
-                    : "+m" (l->a.counter), "=qm" (c)
-                    : "ir" (i) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, i, "%0", "e");
 }
 
 /**
@@ -70,12 +65,7 @@ static inline int local_sub_and_test(long i, local_t *l)
  */
 static inline int local_dec_and_test(local_t *l)
 {
-       unsigned char c;
-
-       asm volatile(_ASM_DEC "%0; sete %1"
-                    : "+m" (l->a.counter), "=qm" (c)
-                    : : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e");
 }
 
 /**
@@ -88,12 +78,7 @@ static inline int local_dec_and_test(local_t *l)
  */
 static inline int local_inc_and_test(local_t *l)
 {
-       unsigned char c;
-
-       asm volatile(_ASM_INC "%0; sete %1"
-                    : "+m" (l->a.counter), "=qm" (c)
-                    : : "memory");
-       return c != 0;
+       GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e");
 }
 
 /**
@@ -107,12 +92,7 @@ static inline int local_inc_and_test(local_t *l)
  */
 static inline int local_add_negative(long i, local_t *l)
 {
-       unsigned char c;
-
-       asm volatile(_ASM_ADD "%2,%0; sets %1"
-                    : "+m" (l->a.counter), "=qm" (c)
-                    : "ir" (i) : "memory");
-       return c;
+       GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, i, "%0", "s");
 }
 
 /**
diff --git a/arch/x86/include/asm/misc.h b/arch/x86/include/asm/misc.h
new file mode 100644 (file)
index 0000000..475f5bb
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_MISC_H
+#define _ASM_X86_MISC_H
+
+int num_digits(int val);
+
+#endif /* _ASM_X86_MISC_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
deleted file mode 100644 (file)
index fc18bf3..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * mrst.h: Intel Moorestown platform specific setup code
- *
- * (C) Copyright 2009 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#ifndef _ASM_X86_MRST_H
-#define _ASM_X86_MRST_H
-
-#include <linux/sfi.h>
-
-extern int pci_mrst_init(void);
-extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
-extern int sfi_mrtc_num;
-extern struct sfi_rtc_table_entry sfi_mrtc_array[];
-
-/*
- * Medfield is the follow-up of Moorestown, it combines two chip solution into
- * one. Other than that it also added always-on and constant tsc and lapic
- * timers. Medfield is the platform name, and the chip name is called Penwell
- * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
- * identified via MSRs.
- */
-enum mrst_cpu_type {
-       /* 1 was Moorestown */
-       MRST_CPU_CHIP_PENWELL = 2,
-};
-
-extern enum mrst_cpu_type __mrst_cpu_chip;
-
-#ifdef CONFIG_X86_INTEL_MID
-
-static inline enum mrst_cpu_type mrst_identify_cpu(void)
-{
-       return __mrst_cpu_chip;
-}
-
-#else /* !CONFIG_X86_INTEL_MID */
-
-#define mrst_identify_cpu()    (0)
-
-#endif /* !CONFIG_X86_INTEL_MID */
-
-enum mrst_timer_options {
-       MRST_TIMER_DEFAULT,
-       MRST_TIMER_APBT_ONLY,
-       MRST_TIMER_LAPIC_APBT,
-};
-
-extern enum mrst_timer_options mrst_timer_options;
-
-/*
- * Penwell uses spread spectrum clock, so the freq number is not exactly
- * the same as reported by MSR based on SDM.
- */
-#define PENWELL_FSB_FREQ_83SKU         83200
-#define PENWELL_FSB_FREQ_100SKU        99840
-
-#define SFI_MTMR_MAX_NUM 8
-#define SFI_MRTC_MAX   8
-
-extern struct console early_mrst_console;
-extern void mrst_early_console_init(void);
-
-extern struct console early_hsu_console;
-extern void hsu_early_console_init(const char *);
-
-extern void intel_scu_devices_create(void);
-extern void intel_scu_devices_destroy(void);
-
-/* VRTC timer */
-#define MRST_VRTC_MAP_SZ       (1024)
-/*#define MRST_VRTC_PGOFFSET   (0xc00) */
-
-extern void mrst_rtc_init(void);
-
-#endif /* _ASM_X86_MRST_H */
index 43dcd804ebd5732aea40415e6f4a419e363701c9..8de6d9cf3b954a7f31bcb756c92c20cc08db5759 100644 (file)
 #define __VIRTUAL_MASK_SHIFT   47
 
 /*
- * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
- * arch/x86/kernel/head_64.S), and it is mapped here:
+ * Kernel image size is limited to 1GiB due to the fixmap living in the
+ * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
+ * 512MiB by default, leaving 1.5GiB for modules once the page tables
+ * are fully set up. If kernel ASLR is configured, it can extend the
+ * kernel page table mapping, reducing the size of the modules area.
  */
-#define KERNEL_IMAGE_SIZE      (512 * 1024 * 1024)
+#define KERNEL_IMAGE_SIZE_DEFAULT      (512 * 1024 * 1024)
+#if defined(CONFIG_RANDOMIZE_BASE) && \
+       CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT
+#define KERNEL_IMAGE_SIZE   CONFIG_RANDOMIZE_BASE_MAX_OFFSET
+#else
+#define KERNEL_IMAGE_SIZE      KERNEL_IMAGE_SIZE_DEFAULT
+#endif
 
 #endif /* _ASM_X86_PAGE_64_DEFS_H */
index 2d883440cb9a282f948d7062684c52789e95d90f..c883bf726398a2556d50b984736ad068bc0dd44c 100644 (file)
@@ -58,7 +58,7 @@ typedef struct { pteval_t pte; } pte_t;
 #define VMALLOC_START    _AC(0xffffc90000000000, UL)
 #define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
 #define VMEMMAP_START   _AC(0xffffea0000000000, UL)
-#define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
+#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
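
With the default 512MiB image size the new expression reproduces the removed constant; a quick check, assuming __START_KERNEL_map is 0xffffffff80000000 as defined elsewhere for x86-64:

        MODULES_VADDR = __START_KERNEL_map + KERNEL_IMAGE_SIZE
                      = 0xffffffff80000000 + 0x20000000
                      = 0xffffffffa0000000      /* the literal removed above */
        MODULES_LEN   = 0xffffffffff000000 - 0xffffffffa0000000
                      = 0x5f000000              /* roughly 1.5GiB for modules */
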
 
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644 (file)
index 0000000..8729723
--- /dev/null
@@ -0,0 +1,100 @@
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <asm/rmwcc.h>
+#include <asm/percpu.h>
+#include <linux/thread_info.h>
+
+DECLARE_PER_CPU(int, __preempt_count);
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
+static __always_inline int preempt_count(void)
+{
+       return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void preempt_count_set(int pc)
+{
+       __this_cpu_write_4(__preempt_count, pc);
+}
+
+/*
+ * must be macros to avoid header recursion hell
+ */
+#define task_preempt_count(p) \
+       (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
+
+#define init_task_preempt_count(p) do { \
+       task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
+} while (0)
+
+#define init_idle_preempt_count(p, cpu) do { \
+       task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
+       per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+} while (0)
+
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+       __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+       __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+       return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+}
+
+/*
+ * The various preempt_count add/sub methods
+ */
+
+static __always_inline void __preempt_count_add(int val)
+{
+       __this_cpu_add_4(__preempt_count, val);
+}
+
+static __always_inline void __preempt_count_sub(int val)
+{
+       __this_cpu_add_4(__preempt_count, -val);
+}
+
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+       GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
+}
+
+/*
+ * Returns true when we need to resched and can (barring IRQ state).
+ */
+static __always_inline bool should_resched(void)
+{
+       return unlikely(!__this_cpu_read_4(__preempt_count));
+}
+
+#ifdef CONFIG_PREEMPT
+  extern asmlinkage void ___preempt_schedule(void);
+# define __preempt_schedule() asm ("call ___preempt_schedule")
+  extern asmlinkage void preempt_schedule(void);
+# ifdef CONFIG_CONTEXT_TRACKING
+    extern asmlinkage void ___preempt_schedule_context(void);
+#   define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
+# endif
+#endif
+
+#endif /* __ASM_PREEMPT_H */
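
A sketch of the intended consumer, modeled on how the generic preempt_enable() combines these helpers once this header is wired up (simplified; the name is hypothetical and the real macro also handles tracing and the compile-time preemption modes):

        #define my_preempt_enable() \
        do { \
                barrier(); \
                if (unlikely(__preempt_count_dec_and_test())) \
                        __preempt_schedule(); \
        } while (0)
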
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
new file mode 100644 (file)
index 0000000..1ff990f
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef _ASM_X86_RMWcc
+#define _ASM_X86_RMWcc
+
+#ifdef CC_HAVE_ASM_GOTO
+
+#define __GEN_RMWcc(fullop, var, cc, ...)                              \
+do {                                                                   \
+       asm_volatile_goto (fullop "; j" cc " %l[cc_label]"              \
+                       : : "m" (var), ## __VA_ARGS__                   \
+                       : "memory" : cc_label);                         \
+       return 0;                                                       \
+cc_label:                                                              \
+       return 1;                                                       \
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc)                             \
+       __GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, val, arg0, cc)                       \
+       __GEN_RMWcc(op " %1, " arg0, var, cc, "er" (val))
+
+#else /* !CC_HAVE_ASM_GOTO */
+
+#define __GEN_RMWcc(fullop, var, cc, ...)                              \
+do {                                                                   \
+       char c;                                                         \
+       asm volatile (fullop "; set" cc " %1"                           \
+                       : "+m" (var), "=qm" (c)                         \
+                       : __VA_ARGS__ : "memory");                      \
+       return c != 0;                                                  \
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc)                             \
+       __GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, val, arg0, cc)                       \
+       __GEN_RMWcc(op " %2, " arg0, var, cc, "er" (val))
+
+#endif /* CC_HAVE_ASM_GOTO */
+
+#endif /* _ASM_X86_RMWcc */
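
For reference, a hand expansion of GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e") in the asm-goto case, which is what atomic_dec_and_test() above now compiles down to (whitespace adjusted):

        do {
                asm_volatile_goto(LOCK_PREFIX "decl %0; je %l[cc_label]"
                                : : "m" (v->counter)
                                : "memory" : cc_label);
                return 0;
        cc_label:
                return 1;
        } while (0);
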
index 347555492dad198b9f643812222aa5a218b54d6c..59bcf4e22418573b4a9d98ada8a6ee45cf12565a 100644 (file)
@@ -51,9 +51,9 @@ extern void i386_reserve_resources(void);
 extern void setup_default_timer_irq(void);
 
 #ifdef CONFIG_X86_INTEL_MID
-extern void x86_mrst_early_setup(void);
+extern void x86_intel_mid_early_setup(void);
 #else
-static inline void x86_mrst_early_setup(void) { }
+static inline void x86_intel_mid_early_setup(void) { }
 #endif
 
 #ifdef CONFIG_X86_INTEL_CE
index 27811190cbd70e5787263e6f064069776064fe3f..c46a46be1ec699c0a92180c4f584da150fd7f5eb 100644 (file)
@@ -28,8 +28,7 @@ struct thread_info {
        __u32                   flags;          /* low level flags */
        __u32                   status;         /* thread synchronous flags */
        __u32                   cpu;            /* current CPU */
-       int                     preempt_count;  /* 0 => preemptable,
-                                                  <0 => BUG */
+       int                     saved_preempt_count;
        mm_segment_t            addr_limit;
        struct restart_block    restart_block;
        void __user             *sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
        .exec_domain    = &default_exec_domain, \
        .flags          = 0,                    \
        .cpu            = 0,                    \
-       .preempt_count  = INIT_PREEMPT_COUNT,   \
+       .saved_preempt_count = INIT_PREEMPT_COUNT,      \
        .addr_limit     = KERNEL_DS,            \
        .restart_block = {                      \
                .fn = do_no_restart_syscall,    \
index 4f7923dd00079d0a582cb5467e05e43774001c44..64476bb2a1465dcf6ce15dc2a939ad1fb54fe7bf 100644 (file)
@@ -77,11 +77,10 @@ int copy_to_user(void __user *dst, const void *src, unsigned size)
 }
 
 static __always_inline __must_check
-int __copy_from_user(void *dst, const void __user *src, unsigned size)
+int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
 
-       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
@@ -121,11 +120,17 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 }
 
 static __always_inline __must_check
-int __copy_to_user(void __user *dst, const void *src, unsigned size)
+int __copy_from_user(void *dst, const void __user *src, unsigned size)
+{
+       might_fault();
+       return __copy_from_user_nocheck(dst, src, size);
+}
+
+static __always_inline __must_check
+int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
 {
        int ret = 0;
 
-       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@ -164,6 +169,13 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
        }
 }
 
+static __always_inline __must_check
+int __copy_to_user(void __user *dst, const void *src, unsigned size)
+{
+       might_fault();
+       return __copy_to_user_nocheck(dst, src, size);
+}
+
 static __always_inline __must_check
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
@@ -220,13 +232,13 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
-       return copy_user_generic(dst, (__force const void *)src, size);
+       return __copy_from_user_nocheck(dst, (__force const void *)src, size);
 }
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 {
-       return copy_user_generic((__force void *)dst, src, size);
+       return __copy_to_user_nocheck((__force void *)dst, src, size);
 }
 
 extern long __copy_user_nocache(void *dst, const void __user *src,
index 062921ef34e9136100d3b4820826642dd45f223b..8b1283daa332747eec92851dba4933666fab84ee 100644 (file)
@@ -12,7 +12,15 @@ extern enum uv_system_type get_uv_system_type(void);
 extern int is_uv_system(void);
 extern void uv_cpu_init(void);
 extern void uv_nmi_init(void);
+extern void uv_register_nmi_notifier(void);
 extern void uv_system_init(void);
+extern void (*uv_trace_nmi_func)(unsigned int reason, struct pt_regs *regs);
+extern void (*uv_trace_func)(const char *f, const int l, const char *fmt, ...);
+#define uv_trace(fmt, ...)                                             \
+do {                                                                   \
+       if (unlikely(uv_trace_func))                                    \
+               (uv_trace_func)(__func__, __LINE__, fmt, ##__VA_ARGS__);\
+} while (0)
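
A one-line usage sketch (the message is illustrative); the call is a no-op until a tracer assigns uv_trace_func:

        uv_trace("NMI received on cpu %d\n", smp_processor_id());
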
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
                                                 struct mm_struct *mm,
                                                 unsigned long start,
@@ -25,6 +33,8 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
 static inline int is_uv_system(void)   { return 0; }
 static inline void uv_cpu_init(void)   { }
 static inline void uv_system_init(void)        { }
+static inline void uv_trace(void *fmt, ...)    { }
+static inline void uv_register_nmi_notifier(void) { }
 static inline const struct cpumask *
 uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
                    unsigned long start, unsigned long end, unsigned int cpu)
index 2c32df95bb7834737a2a7e440d29ebbea408fa82..a30836c8ac4da3cc4899164475fa32b9fa5c2839 100644 (file)
@@ -502,8 +502,8 @@ struct uv_blade_info {
        unsigned short  nr_online_cpus;
        unsigned short  pnode;
        short           memory_nid;
-       spinlock_t      nmi_lock;
-       unsigned long   nmi_count;
+       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
+       unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
 };
 extern struct uv_blade_info *uv_blade_info;
 extern short *uv_node_to_blade;
@@ -576,6 +576,59 @@ static inline int uv_num_possible_blades(void)
        return uv_possible_blades;
 }
 
+/* Per Hub NMI support */
+extern void uv_nmi_setup(void);
+
+/* The BMC sets a bit in this MMR before sending an NMI */
+#define UVH_NMI_MMR            UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR      UVH_SCRATCH5_ALIAS
+#define UVH_NMI_MMR_SHIFT      63
+#define        UVH_NMI_MMR_TYPE        "SCRATCH5"
+
+/* Newer SMM NMI handler, not present in all systems */
+#define UVH_NMI_MMRX           UVH_EVENT_OCCURRED0
+#define UVH_NMI_MMRX_CLEAR     UVH_EVENT_OCCURRED0_ALIAS
+#define UVH_NMI_MMRX_SHIFT     (is_uv1_hub() ? \
+                                       UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT :\
+                                       UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT)
+#define        UVH_NMI_MMRX_TYPE       "EXTIO_INT0"
+
+/* Non-zero indicates newer SMM NMI handler present */
+#define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST
+
+/* Indicates to BIOS that we want to use the newer SMM NMI handler */
+#define UVH_NMI_MMRX_REQ       UVH_SCRATCH5_ALIAS_2
+#define UVH_NMI_MMRX_REQ_SHIFT 62
+
+struct uv_hub_nmi_s {
+       raw_spinlock_t  nmi_lock;
+       atomic_t        in_nmi;         /* flag this node in UV NMI IRQ */
+       atomic_t        cpu_owner;      /* last locker of this struct */
+       atomic_t        read_mmr_count; /* count of MMR reads */
+       atomic_t        nmi_count;      /* count of true UV NMIs */
+       unsigned long   nmi_value;      /* last value read from NMI MMR */
+};
+
+struct uv_cpu_nmi_s {
+       struct uv_hub_nmi_s     *hub;
+       atomic_t                state;
+       atomic_t                pinging;
+       int                     queries;
+       int                     pings;
+};
+
+DECLARE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi);
+#define uv_cpu_nmi                     (__get_cpu_var(__uv_cpu_nmi))
+#define uv_hub_nmi                     (uv_cpu_nmi.hub)
+#define uv_cpu_nmi_per(cpu)            (per_cpu(__uv_cpu_nmi, cpu))
+#define uv_hub_nmi_per(cpu)            (uv_cpu_nmi_per(cpu).hub)
+
+/* uv_cpu_nmi_states */
+#define        UV_NMI_STATE_OUT                0
+#define        UV_NMI_STATE_IN                 1
+#define        UV_NMI_STATE_DUMP               2
+#define        UV_NMI_STATE_DUMP_DONE          3
+
 /* Update SCIR state */
 static inline void uv_set_scir_bits(unsigned char value)
 {
index bd5f80e58a236b08961d898f37004670a52e9e02..e42249bcf7e1068ffa0d0f58e3450dd55cf2a9f1 100644 (file)
@@ -460,6 +460,23 @@ union uvh_event_occurred0_u {
 #define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
 
 
+/* ========================================================================= */
+/*                         UVH_EXTIO_INT0_BROADCAST                          */
+/* ========================================================================= */
+#define UVH_EXTIO_INT0_BROADCAST 0x61448UL
+#define UVH_EXTIO_INT0_BROADCAST_32 0x3f0
+
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT           0
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK           0x0000000000000001UL
+
+union uvh_extio_int0_broadcast_u {
+       unsigned long   v;
+       struct uvh_extio_int0_broadcast_s {
+               unsigned long   enable:1;                       /* RW */
+               unsigned long   rsvd_1_63:63;
+       } s;
+};
+
 /* ========================================================================= */
 /*                         UVH_GR0_TLB_INT0_CONFIG                           */
 /* ========================================================================= */
@@ -2605,6 +2622,20 @@ union uvh_scratch5_u {
        } s;
 };
 
+/* ========================================================================= */
+/*                            UVH_SCRATCH5_ALIAS                             */
+/* ========================================================================= */
+#define UVH_SCRATCH5_ALIAS 0x2d0208UL
+#define UVH_SCRATCH5_ALIAS_32 0x780
+
+
+/* ========================================================================= */
+/*                           UVH_SCRATCH5_ALIAS_2                            */
+/* ========================================================================= */
+#define UVH_SCRATCH5_ALIAS_2 0x2d0210UL
+#define UVH_SCRATCH5_ALIAS_2_32 0x788
+
+
 /* ========================================================================= */
 /*                          UVXH_EVENT_OCCURRED2                             */
 /* ========================================================================= */
index c15ddaf907107134d6cd2f8d86f554e510a6f848..9c3733c5f8f795c8fa08ef58f5c279e75060cfcd 100644 (file)
@@ -158,7 +158,7 @@ enum {
        X86_SUBARCH_PC = 0,
        X86_SUBARCH_LGUEST,
        X86_SUBARCH_XEN,
-       X86_SUBARCH_MRST,
+       X86_SUBARCH_INTEL_MID,
        X86_SUBARCH_CE4100,
        X86_NR_SUBARCHS,
 };
index a5408b965c9d810b19551ee42d0cd0ba16eda385..9b0a34e2cd793dcb6cfe911a99d457b102296966 100644 (file)
@@ -36,6 +36,8 @@ obj-y                 += tsc.o io_delay.o rtc.o
 obj-y                  += pci-iommu_table.o
 obj-y                  += resource.o
 
+obj-$(CONFIG_PREEMPT)  += preempt.o
+
 obj-y                          += process.o
 obj-y                          += i387.o xsave.o
 obj-y                          += ptrace.o
index 15e8563e5c244e0712c9696a3367067044a300b3..df94598ad05a845902e9897214cdceacc779a80d 100644 (file)
@@ -402,17 +402,6 @@ void alternatives_enable_smp(void)
 {
        struct smp_alt_module *mod;
 
-#ifdef CONFIG_LOCKDEP
-       /*
-        * Older binutils section handling bug prevented
-        * alternatives-replacement from working reliably.
-        *
-        * If this still occurs then you should see a hang
-        * or crash shortly after this line:
-        */
-       pr_info("lockdep: fixing up alternatives\n");
-#endif
-
        /* Why bother if there are no other CPUs? */
        BUG_ON(num_possible_cpus() == 1);
 
index c9876efecafb53870def4d713292b715d0c9e4ef..af5b08ab3b712836e09d7eb9f6cd79919035d22f 100644 (file)
@@ -40,7 +40,7 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/time.h>
 
 #define APBT_CLOCKEVENT_RATING         110
@@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void)
 
        adev->num = smp_processor_id();
        adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
-               mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+               intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
                APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
                adev_virt_addr(adev), 0, apbt_freq);
        /* Firmware does EOI handling for us. */
        adev->timer->eoi = NULL;
 
-       if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+       if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
                global_clock_event = &adev->timer->ced;
                printk(KERN_DEBUG "%s clockevent registered as global\n",
                       global_clock_event->name);
@@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-       if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+       if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
                !apb_timer_block_enabled)
                return 0;
        /* This notifier should be called after workqueue is ready */
@@ -340,7 +340,7 @@ void __init apbt_time_init(void)
        }
 #ifdef CONFIG_SMP
        /* kernel cmdline disable apb timer, so we will use lapic timers */
-       if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+       if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
                printk(KERN_INFO "apbt: disabled per cpu timer\n");
                return;
        }
index a419814cea575f9e2cf183772a9e99c59e097a11..ad0dc0428baf8cc4550f339f78e4552ef40020fb 100644 (file)
 #include <asm/x86_init.h>
 #include <asm/nmi.h>
 
-/* BMC sets a bit this MMR non-zero before sending an NMI */
-#define UVH_NMI_MMR                            UVH_SCRATCH5
-#define UVH_NMI_MMR_CLEAR                      (UVH_NMI_MMR + 8)
-#define UV_NMI_PENDING_MASK                    (1UL << 63)
-DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
-
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 #define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
@@ -58,7 +52,6 @@ int uv_min_hub_revision_id;
 EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
 unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
-static DEFINE_SPINLOCK(uv_nmi_lock);
 
 static struct apic apic_x2apic_uv_x;
 
@@ -847,68 +840,6 @@ void uv_cpu_init(void)
                set_x2apic_extra_bits(uv_hub_info->pnode);
 }
 
-/*
- * When NMI is received, print a stack trace.
- */
-int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
-{
-       unsigned long real_uv_nmi;
-       int bid;
-
-       /*
-        * Each blade has an MMR that indicates when an NMI has been sent
-        * to cpus on the blade. If an NMI is detected, atomically
-        * clear the MMR and update a per-blade NMI count used to
-        * cause each cpu on the blade to notice a new NMI.
-        */
-       bid = uv_numa_blade_id();
-       real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
-
-       if (unlikely(real_uv_nmi)) {
-               spin_lock(&uv_blade_info[bid].nmi_lock);
-               real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
-               if (real_uv_nmi) {
-                       uv_blade_info[bid].nmi_count++;
-                       uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
-               }
-               spin_unlock(&uv_blade_info[bid].nmi_lock);
-       }
-
-       if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
-               return NMI_DONE;
-
-       __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
-
-       /*
-        * Use a lock so only one cpu prints at a time.
-        * This prevents intermixed output.
-        */
-       spin_lock(&uv_nmi_lock);
-       pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
-       dump_stack();
-       spin_unlock(&uv_nmi_lock);
-
-       return NMI_HANDLED;
-}
-
-void uv_register_nmi_notifier(void)
-{
-       if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
-               printk(KERN_WARNING "UV NMI handler failed to register\n");
-}
-
-void uv_nmi_init(void)
-{
-       unsigned int value;
-
-       /*
-        * Unmask NMI on all cpus
-        */
-       value = apic_read(APIC_LVT1) | APIC_DM_NMI;
-       value &= ~APIC_LVT_MASKED;
-       apic_write(APIC_LVT1, value);
-}
-
 void __init uv_system_init(void)
 {
        union uvh_rh_gam_config_mmr_u  m_n_config;
@@ -1046,6 +977,7 @@ void __init uv_system_init(void)
        map_mmr_high(max_pnode);
        map_mmioh_high(min_pnode, max_pnode);
 
+       uv_nmi_setup();
        uv_cpu_init();
        uv_scir_register_cpu_notifier();
        uv_register_nmi_notifier();
index 28610822fb3cc4be10a04c098ee91b6a8b8ab78f..9f6b9341950f7895b247b1c320cc0f9cd75cda9d 100644 (file)
@@ -32,7 +32,6 @@ void common(void) {
        OFFSET(TI_flags, thread_info, flags);
        OFFSET(TI_status, thread_info, status);
        OFFSET(TI_addr_limit, thread_info, addr_limit);
-       OFFSET(TI_preempt_count, thread_info, preempt_count);
 
        BLANK();
        OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
index 2793d1f095a2d865c23f14449944d322377f3dba..5223fe6dec7bbfe055989935b00ca258a1ba1f81 100644 (file)
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
index cc16faae0538431073b01dfa0bd5c96ab4383ae1..fd00bb29425d4da50194241c6ae3835775e12e6e 100644 (file)
@@ -163,6 +163,11 @@ struct cpu_hw_events {
        u64                             intel_ctrl_host_mask;
        struct perf_guest_switch_msr    guest_switch_msrs[X86_PMC_IDX_MAX];
 
+       /*
+        * Intel checkpoint mask
+        */
+       u64                             intel_cp_status;
+
        /*
         * manage shared (per-core, per-cpu) registers
         * used on Intel NHM/WSM/SNB
@@ -440,6 +445,7 @@ struct x86_pmu {
        int             lbr_nr;                    /* hardware stack size */
        u64             lbr_sel_mask;              /* LBR_SELECT valid bits */
        const int       *lbr_sel_map;              /* lbr_select mappings */
+       bool            lbr_double_abort;          /* duplicated lbr aborts */
 
        /*
         * Extra registers for events
index f31a1655d1ff5bd602239e211b14bdd28d95a79a..0fa4f242f0504ad53297360966e957b84a0edab8 100644 (file)
@@ -190,9 +190,9 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
        EVENT_EXTRA_END
 };
 
-EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads,      mem_ld_nhm,     "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads,      mem_ld_snb,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_snb,     "event=0xcd,umask=0x2");
 
 struct attribute *nhm_events_attrs[] = {
        EVENT_PTR(mem_ld_nhm),
@@ -1184,6 +1184,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
        wrmsrl(hwc->config_base, ctrl_val);
 }
 
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+       return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
 static void intel_pmu_disable_event(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
@@ -1197,6 +1202,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
        cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
        cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+       cpuc->intel_cp_status &= ~(1ull << hwc->idx);
 
        /*
         * must disable before any actual event
@@ -1271,6 +1277,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
        if (event->attr.exclude_guest)
                cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
 
+       if (unlikely(event_is_checkpointed(event)))
+               cpuc->intel_cp_status |= (1ull << hwc->idx);
+
        if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
                intel_pmu_enable_fixed(hwc);
                return;
@@ -1289,6 +1298,17 @@ static void intel_pmu_enable_event(struct perf_event *event)
 int intel_pmu_save_and_restart(struct perf_event *event)
 {
        x86_perf_event_update(event);
+       /*
+        * For a checkpointed counter always reset back to 0.  This
+        * avoids a situation where the counter overflows, aborts the
+        * transaction and is then reset to a value shortly before the
+        * overflow point, only to overflow and abort again.
+        */
+       if (unlikely(event_is_checkpointed(event))) {
+               /* No race with NMIs because the counter should not be armed */
+               wrmsrl(event->hw.event_base, 0);
+               local64_set(&event->hw.prev_count, 0);
+       }
        return x86_perf_event_set_period(event);
 }
 
@@ -1372,6 +1392,13 @@ again:
                x86_pmu.drain_pebs(regs);
        }
 
+       /*
+        * Checkpointed counters can lead to 'spurious' PMIs because the
+        * rollback caused by the PMI will have cleared the overflow status
+        * bit. Therefore always force-probe these counters.
+        */
+       status |= cpuc->intel_cp_status;
+
        for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_event *event = cpuc->events[bit];
 
@@ -1837,6 +1864,20 @@ static int hsw_hw_config(struct perf_event *event)
              event->attr.precise_ip > 0))
                return -EOPNOTSUPP;
 
+       if (event_is_checkpointed(event)) {
+               /*
+                * Sampling of checkpointed events can cause situations where
+                * the CPU constantly aborts because of an overflow, which is
+                * then checkpointed back and ignored. Forbid checkpointing
+                * for sampling.
+                *
+                * But still allow a long sampling period, so that perf stat
+                * from KVM works.
+                */
+               if (event->attr.sample_period > 0 &&
+                   event->attr.sample_period < 0x7fffffff)
+                       return -EOPNOTSUPP;
+       }
        return 0;
 }
 
@@ -2182,10 +2223,36 @@ static __init void intel_nehalem_quirk(void)
        }
 }
 
-EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
+EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start,       tx_start,       "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit,      tx_commit,      "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort,       tx_abort,       "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity,    tx_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict,    tx_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start,       el_start,       "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit,      el_commit,      "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort,       el_abort,       "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity,    el_capacity,    "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict,    el_conflict,    "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t,       cycles_t,       "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct,      cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
 
 static struct attribute *hsw_events_attrs[] = {
+       EVENT_PTR(tx_start),
+       EVENT_PTR(tx_commit),
+       EVENT_PTR(tx_abort),
+       EVENT_PTR(tx_capacity),
+       EVENT_PTR(tx_conflict),
+       EVENT_PTR(el_start),
+       EVENT_PTR(el_commit),
+       EVENT_PTR(el_abort),
+       EVENT_PTR(el_capacity),
+       EVENT_PTR(el_conflict),
+       EVENT_PTR(cycles_t),
+       EVENT_PTR(cycles_ct),
        EVENT_PTR(mem_ld_hsw),
        EVENT_PTR(mem_st_hsw),
        NULL
@@ -2452,6 +2519,7 @@ __init int intel_pmu_init(void)
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
                x86_pmu.cpu_events = hsw_events_attrs;
+               x86_pmu.lbr_double_abort = true;
                pr_cont("Haswell events, ");
                break;
 
index ab3ba1c1b7dd2c425dd5edf4c2b5091d76355b34..c1760ff3c757e35ae804bf6a9625ae701a358e83 100644 (file)
@@ -12,6 +12,7 @@
 
 #define BTS_BUFFER_SIZE                (PAGE_SIZE << 4)
 #define PEBS_BUFFER_SIZE       PAGE_SIZE
+#define PEBS_FIXUP_SIZE                PAGE_SIZE
 
 /*
  * pebs_record_32 for p4 and core not supported
@@ -182,18 +183,32 @@ struct pebs_record_nhm {
  * Same as pebs_record_nhm, with two additional fields.
  */
 struct pebs_record_hsw {
-       struct pebs_record_nhm nhm;
-       /*
-        * Real IP of the event. In the Intel documentation this
-        * is called eventingrip.
-        */
-       u64 real_ip;
-       /*
-        * TSX tuning information field: abort cycles and abort flags.
-        */
-       u64 tsx_tuning;
+       u64 flags, ip;
+       u64 ax, bx, cx, dx;
+       u64 si, di, bp, sp;
+       u64 r8,  r9,  r10, r11;
+       u64 r12, r13, r14, r15;
+       u64 status, dla, dse, lat;
+       u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+       struct {
+               u32 cycles_last_block     : 32,
+                   hle_abort             : 1,
+                   rtm_abort             : 1,
+                   instruction_abort     : 1,
+                   non_instruction_abort : 1,
+                   retry                 : 1,
+                   data_conflict         : 1,
+                   capacity_writes       : 1,
+                   capacity_reads        : 1;
+       };
+       u64         value;
 };
 
+#define PEBS_HSW_TSX_FLAGS     0xff00000000ULL
+
 void init_debug_store_on_cpu(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -214,12 +229,14 @@ void fini_debug_store_on_cpu(int cpu)
        wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
 }
 
+static DEFINE_PER_CPU(void *, insn_buffer);
+
 static int alloc_pebs_buffer(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
        int node = cpu_to_node(cpu);
        int max, thresh = 1; /* always use a single PEBS record */
-       void *buffer;
+       void *buffer, *ibuffer;
 
        if (!x86_pmu.pebs)
                return 0;
@@ -228,6 +245,19 @@ static int alloc_pebs_buffer(int cpu)
        if (unlikely(!buffer))
                return -ENOMEM;
 
+       /*
+        * HSW+ already provides us with the eventing IP, so there is no
+        * need to allocate this buffer in that case.
+        */
+       if (x86_pmu.intel_cap.pebs_format < 2) {
+               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!ibuffer) {
+                       kfree(buffer);
+                       return -ENOMEM;
+               }
+               per_cpu(insn_buffer, cpu) = ibuffer;
+       }
+
        max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
 
        ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -248,6 +278,9 @@ static void release_pebs_buffer(int cpu)
        if (!ds || !x86_pmu.pebs)
                return;
 
+       kfree(per_cpu(insn_buffer, cpu));
+       per_cpu(insn_buffer, cpu) = NULL;
+
        kfree((void *)(unsigned long)ds->pebs_buffer_base);
        ds->pebs_buffer_base = 0;
 }
@@ -715,6 +748,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
        unsigned long old_to, to = cpuc->lbr_entries[0].to;
        unsigned long ip = regs->ip;
        int is_64bit = 0;
+       void *kaddr;
 
        /*
         * We don't need to fixup if the PEBS assist is fault like
@@ -738,7 +772,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
         * unsigned math, either ip is before the start (impossible) or
         * the basic block is larger than 1 page (sanity)
         */
-       if ((ip - to) > PAGE_SIZE)
+       if ((ip - to) > PEBS_FIXUP_SIZE)
                return 0;
 
        /*
@@ -749,29 +783,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
                return 1;
        }
 
+       if (!kernel_ip(ip)) {
+               int size, bytes;
+               u8 *buf = this_cpu_read(insn_buffer);
+
+               size = ip - to; /* Must fit our buffer, see above */
+               bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+               if (bytes != size)
+                       return 0;
+
+               kaddr = buf;
+       } else {
+               kaddr = (void *)to;
+       }
+
        do {
                struct insn insn;
-               u8 buf[MAX_INSN_SIZE];
-               void *kaddr;
 
                old_to = to;
-               if (!kernel_ip(ip)) {
-                       int bytes, size = MAX_INSN_SIZE;
-
-                       bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-                       if (bytes != size)
-                               return 0;
-
-                       kaddr = buf;
-               } else
-                       kaddr = (void *)to;
 
 #ifdef CONFIG_X86_64
                is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
 #endif
                insn_init(&insn, kaddr, is_64bit);
                insn_get_length(&insn);
+
                to += insn.length;
+               kaddr += insn.length;
        } while (to < ip);
 
        if (to == ip) {
@@ -786,16 +824,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
        return 0;
 }
 
+static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+{
+       if (pebs->tsx_tuning) {
+               union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+               return tsx.cycles_last_block;
+       }
+       return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+{
+       u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+       /* For RTM XABORTs also log the abort code from AX */
+       if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+               txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+       return txn;
+}
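
For a concrete feel of the packing above, here is a worked example with invented register values; PEBS_HSW_TSX_FLAGS and the PERF_TXN_* constants are the ones already used in the function above:

	/* Invented values: tsx_tuning = 0x0000000200000000 gives flag
	 * bits 0x2 after the shift; ax = 0x7f000001 is an XABORT
	 * (bit 0 set) carrying abort code 0x7f in bits 31:24. */
	static u64 demo_hsw_transaction(void)
	{
		u64 tsx_tuning = 0x0000000200000000ULL;
		u64 ax = 0x7f000001;
		u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;

		if ((txn & PERF_TXN_TRANSACTION) && (ax & 1))
			txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;

		return txn;	/* 0x7f00000002: flags low, abort code at bit 32 */
	}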
+
 static void __intel_pmu_pebs_event(struct perf_event *event,
                                   struct pt_regs *iregs, void *__pebs)
 {
        /*
-        * We cast to pebs_record_nhm to get the load latency data
-        * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
+        * We cast to the biggest pebs_record but are careful not to
+        * unconditionally access the 'extra' entries.
         */
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct pebs_record_nhm *pebs = __pebs;
-       struct pebs_record_hsw *pebs_hsw = __pebs;
+       struct pebs_record_hsw *pebs = __pebs;
        struct perf_sample_data data;
        struct pt_regs regs;
        u64 sample_type;
@@ -854,7 +910,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
        regs.sp = pebs->sp;
 
        if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-               regs.ip = pebs_hsw->real_ip;
+               regs.ip = pebs->real_ip;
                regs.flags |= PERF_EFLAGS_EXACT;
        } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
                regs.flags |= PERF_EFLAGS_EXACT;
@@ -862,9 +918,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
                regs.flags &= ~PERF_EFLAGS_EXACT;
 
        if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
-               x86_pmu.intel_cap.pebs_format >= 1)
+           x86_pmu.intel_cap.pebs_format >= 1)
                data.addr = pebs->dla;
 
+       if (x86_pmu.intel_cap.pebs_format >= 2) {
+               /* Only set the TSX weight when there is no memory weight. */
+               if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+                       data.weight = intel_hsw_weight(pebs);
+
+               if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+                       data.txn = intel_hsw_transaction(pebs);
+       }
+
        if (has_branch_stack(event))
                data.br_stack = &cpuc->lbr_stack;
 
@@ -913,17 +978,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
        __intel_pmu_pebs_event(event, iregs, at);
 }
 
-static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
-                                       void *top)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        struct debug_store *ds = cpuc->ds;
        struct perf_event *event = NULL;
+       void *at, *top;
        u64 status = 0;
        int bit;
 
+       if (!x86_pmu.pebs_active)
+               return;
+
+       at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
        ds->pebs_index = ds->pebs_buffer_base;
 
+       if (unlikely(at > top))
+               return;
+
+       /*
+        * Should not happen; we program the threshold at 1 and do not
+        * set a reset value.
+        */
+       WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
+                 "Unexpected number of pebs records %ld\n",
+                 (long)(top - at) / x86_pmu.pebs_record_size);
+
        for (; at < top; at += x86_pmu.pebs_record_size) {
                struct pebs_record_nhm *p = at;
 
@@ -951,61 +1033,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
        }
 }
 
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct pebs_record_nhm *at, *top;
-       int n;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
-       ds->pebs_index = ds->pebs_buffer_base;
-
-       n = top - at;
-       if (n <= 0)
-               return;
-
-       /*
-        * Should not happen, we program the threshold at 1 and do not
-        * set a reset value.
-        */
-       WARN_ONCE(n > x86_pmu.max_pebs_events,
-                 "Unexpected number of pebs records %d\n", n);
-
-       return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
-static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs)
-{
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct debug_store *ds = cpuc->ds;
-       struct pebs_record_hsw *at, *top;
-       int n;
-
-       if (!x86_pmu.pebs_active)
-               return;
-
-       at  = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base;
-       top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index;
-
-       n = top - at;
-       if (n <= 0)
-               return;
-       /*
-        * Should not happen, we program the threshold at 1 and do not
-        * set a reset value.
-        */
-       WARN_ONCE(n > x86_pmu.max_pebs_events,
-                 "Unexpected number of pebs records %d\n", n);
-
-       return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
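
The Haswell copy of the drain routine just removed is not replaced: both PEBS formats are now drained by the nhm routine above, which strides through the DS area in units of x86_pmu.pebs_record_size and unconditionally touches only the header fields the two layouts share. The walk reduces to this pattern (a sketch; base/top stand for pebs_buffer_base/pebs_index):

	/* One walk serves both record layouts: the stride is whatever
	 * record size this CPU writes; only the common nhm-style prefix
	 * (flags/ip/status/...) is accessed unconditionally. */
	void *at;

	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
		struct pebs_record_nhm *p = at;

		/* p->status says which enabled counters this record is for */
	}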
 /*
  * BTS, PEBS probe and setup
  */
@@ -1040,7 +1067,7 @@ void intel_ds_init(void)
                case 2:
                        pr_cont("PEBS fmt2%c, ", pebs_type);
                        x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
-                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw;
+                       x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
                        break;
 
                default:
index d5be06a5005e99eb9ac8dbe808f565013bddc929..90ee6c1d0542664dbb9955ff2570e9f338e076e6 100644 (file)
@@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
        int lbr_format = x86_pmu.intel_cap.lbr_format;
        u64 tos = intel_pmu_lbr_tos();
        int i;
+       int out = 0;
 
        for (i = 0; i < x86_pmu.lbr_nr; i++) {
                unsigned long lbr_idx = (tos - i) & mask;
@@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
                }
                from = (u64)((((s64)from) << skip) >> skip);
 
-               cpuc->lbr_entries[i].from       = from;
-               cpuc->lbr_entries[i].to         = to;
-               cpuc->lbr_entries[i].mispred    = mis;
-               cpuc->lbr_entries[i].predicted  = pred;
-               cpuc->lbr_entries[i].in_tx      = in_tx;
-               cpuc->lbr_entries[i].abort      = abort;
-               cpuc->lbr_entries[i].reserved   = 0;
+               /*
+                * Some CPUs report duplicated abort records,
+                * with the second entry not having an abort bit set.
+                * Skip them here. This loop runs backwards,
+                * so we need to undo the previous record.
+                * If the abort happened just outside the window,
+                * the extra entry cannot be removed.
+                */
+               if (abort && x86_pmu.lbr_double_abort && out > 0)
+                       out--;
+
+               cpuc->lbr_entries[out].from      = from;
+               cpuc->lbr_entries[out].to        = to;
+               cpuc->lbr_entries[out].mispred   = mis;
+               cpuc->lbr_entries[out].predicted = pred;
+               cpuc->lbr_entries[out].in_tx     = in_tx;
+               cpuc->lbr_entries[out].abort     = abort;
+               cpuc->lbr_entries[out].reserved  = 0;
+               out++;
        }
-       cpuc->lbr_stack.nr = i;
+       cpuc->lbr_stack.nr = out;
 }
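
The rewritten loop above is a classic in-place compaction: the read index i walks the raw hardware entries while the write index out trails it, and on a genuine abort the record emitted just before (the bogus duplicate) is retracted. Stripped of the LBR details, and with a hypothetical entry type, the idea is:

	/* Compact in place, retracting the duplicate written just before
	 * each real abort record (the real code guards this with the
	 * lbr_double_abort quirk flag). */
	int i, out = 0;

	for (i = 0; i < nr; i++) {
		if (entry[i].abort && out > 0)
			out--;			/* undo the duplicated record */
		entry[out++] = entry[i];
	}
	nr = out;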
 
 void intel_pmu_lbr_read(void)
index 88db010845cb26a2594170acc2387cee75a20bbc..384df5105fbc9883626ec5482151babd43ce482a 100644 (file)
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
 }
 __setup("nordrand", x86_rdrand_setup);
 
-/* We can't use arch_get_random_long() here since alternatives haven't run */
-static inline int rdrand_long(unsigned long *v)
-{
-       int ok;
-       asm volatile("1: " RDRAND_LONG "\n\t"
-                    "jc 2f\n\t"
-                    "decl %0\n\t"
-                    "jnz 1b\n\t"
-                    "2:"
-                    : "=r" (ok), "=a" (*v)
-                    : "0" (RDRAND_RETRY_LOOPS));
-       return ok;
-}
-
 /*
  * Force a reseed cycle; we are architecturally guaranteed a reseed
  * after no more than 512 128-bit chunks of random data.  This also
index d15f575a861b5ad9f2d8a5043370c92b9b47fd0a..38ca398fb78740fa6e72717300d6752ea2714afe 100644 (file)
@@ -14,7 +14,7 @@
 #include <xen/hvc-console.h>
 #include <asm/pci-direct.h>
 #include <asm/fixmap.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/pgtable.h>
 #include <linux/usb/ehci_def.h>
 
index f0dcb0ceb6a2eda24d298b8a05650374e753bcf6..fd1bc1b15e6d8ebfc0f9e98a025e2ed65601c7fa 100644 (file)
@@ -362,12 +362,9 @@ END(ret_from_exception)
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
        DISABLE_INTERRUPTS(CLBR_ANY)
-       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
-       jnz restore_all
 need_resched:
-       movl TI_flags(%ebp), %ecx       # need_resched set ?
-       testb $_TIF_NEED_RESCHED, %cl
-       jz restore_all
+       cmpl $0,PER_CPU_VAR(__preempt_count)
+       jnz restore_all
        testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
        jz restore_all
        call preempt_schedule_irq
index b077f4cc225a29764e89389e099fbd0ae2006707..603be7c70675fc7146a8dc08117b9d1b682dc77c 100644 (file)
@@ -1103,10 +1103,8 @@ retint_signal:
        /* Returning to kernel space. Check if we need preemption */
        /* rcx:  threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-       cmpl $0,TI_preempt_count(%rcx)
+       cmpl $0,PER_CPU_VAR(__preempt_count)
        jnz  retint_restore_args
-       bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
-       jnc  retint_restore_args
        bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
        jnc  retint_restore_args
        call preempt_schedule_irq
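
Both entry-code hunks above replace a two-step "preempt_count zero? need_resched set?" test with a single cmpl against the per-CPU __preempt_count. That works because the need-resched condition is folded, inverted, into the top bit of the count: the stored value is zero only when no preempt-disables are outstanding and a reschedule is wanted. Roughly (a sketch of the accompanying asm/preempt.h logic, not quoted from this diff):

	#define PREEMPT_NEED_RESCHED	0x80000000

	/* the bit is kept INVERTED: clearing it means "resched needed",
	 * so a plain test for zero checks both conditions at once */
	static inline void set_preempt_need_resched(void)
	{
		this_cpu_and(__preempt_count, ~PREEMPT_NEED_RESCHED);
	}

	static inline bool should_resched(void)
	{
		return unlikely(!this_cpu_read(__preempt_count));
	}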
@@ -1342,7 +1340,7 @@ bad_gs:
        .previous
 
 /* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
        CFI_STARTPROC
        pushq_cfi %rbp
        CFI_REL_OFFSET rbp,0
@@ -1359,7 +1357,7 @@ ENTRY(call_softirq)
        decl PER_CPU_VAR(irq_count)
        ret
        CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
 zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
index 06f87bece92a72c0b205f8fa0ba02c63a3ffb86a..c61a14a4a3109f92ee63bb4df2ba470540c1d266 100644 (file)
@@ -35,8 +35,8 @@ asmlinkage void __init i386_start_kernel(void)
 
        /* Call the subarch specific early setup function */
        switch (boot_params.hdr.hardware_subarch) {
-       case X86_SUBARCH_MRST:
-               x86_mrst_early_setup();
+       case X86_SUBARCH_INTEL_MID:
+               x86_intel_mid_early_setup();
                break;
        case X86_SUBARCH_CE4100:
                x86_ce4100_early_setup();
index 0fa69127209a4110f14f2e6ff0692a1ae863f1b3..05fd74f537d62122ade73f53dad17c97346c7a80 100644 (file)
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
 
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
index 9a5c460404dca563a1e5ac32aeeb90ba2cdaafbf..2e977b5d61ddee4e00d599f04236d70e8714fee0 100644 (file)
@@ -312,8 +312,7 @@ static void init_8259A(int auto_eoi)
         */
        outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
 
-       /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
-          to 0x20-0x27 on i386 */
+       /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
        outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
 
        /* 8259A-1 (the master) has a slave on IR2 */
index 4186755f1d7cf350f82540fdb76f731f58b455c3..d7fcbedc9c43fa9659b7f2f2c687b8402a826dd2 100644 (file)
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
        irqctx->tinfo.task = curctx->tinfo.task;
        irqctx->tinfo.previous_esp = current_stack_pointer;
 
-       /* Copy the preempt_count so that the [soft]irq checks work. */
-       irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
-
        if (unlikely(overflow))
                call_on_stack(print_stack_overflow, isp);
 
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
                                               THREAD_SIZE_ORDER));
        memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
        irqctx->tinfo.cpu               = cpu;
-       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
 
        per_cpu(hardirq_ctx, cpu) = irqctx;
@@ -149,35 +145,21 @@ void irq_ctx_init(int cpu)
               cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
 {
-       unsigned long flags;
        struct thread_info *curctx;
        union irq_ctx *irqctx;
        u32 *isp;
 
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-
-       if (local_softirq_pending()) {
-               curctx = current_thread_info();
-               irqctx = __this_cpu_read(softirq_ctx);
-               irqctx->tinfo.task = curctx->task;
-               irqctx->tinfo.previous_esp = current_stack_pointer;
-
-               /* build the stack frame on the softirq stack */
-               isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
+       curctx = current_thread_info();
+       irqctx = __this_cpu_read(softirq_ctx);
+       irqctx->tinfo.task = curctx->task;
+       irqctx->tinfo.previous_esp = current_stack_pointer;
 
-               call_on_stack(__do_softirq, isp);
-               /*
-                * Shouldn't happen, we returned above if in_interrupt():
-                */
-               WARN_ON_ONCE(softirq_count());
-       }
+       /* build the stack frame on the softirq stack */
+       isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
 
-       local_irq_restore(flags);
+       call_on_stack(__do_softirq, isp);
 }
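
The deleted bookkeeping has not vanished: it moves into generic code, which now does the in_interrupt()/pending checks itself and calls the new per-arch do_softirq_own_stack() hook only when there is real work. The generic side of the interface looks roughly like this (a sketch of the kernel/softirq.c counterpart, not part of this hunk):

	asmlinkage void do_softirq(void)
	{
		__u32 pending;
		unsigned long flags;

		if (in_interrupt())
			return;

		local_irq_save(flags);
		pending = local_softirq_pending();
		if (pending)
			do_softirq_own_stack();	/* arch switches stacks and
						   runs __do_softirq() there */
		local_irq_restore(flags);
	}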
 
 bool handle_irq(unsigned irq, struct pt_regs *regs)
index d04d3ecded6299ab2abb7183089908439c2a0e6a..4d1c746892eb9a9aae2e38dedfec88868f347cff 100644 (file)
@@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
        generic_handle_irq_desc(irq, desc);
        return true;
 }
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
-       __u32 pending;
-       unsigned long flags;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(flags);
-       pending = local_softirq_pending();
-       /* Switch to interrupt stack */
-       if (pending) {
-               call_softirq();
-               WARN_ON_ONCE(softirq_count());
-       }
-       local_irq_restore(flags);
-}
index 88458faea2f8f0ace5ae6d2b02a451bf43a2d9af..05266b5aae22916e864bb21e1399193ff253374e 100644 (file)
@@ -46,7 +46,7 @@ static struct class *msr_class;
 static loff_t msr_seek(struct file *file, loff_t offset, int orig)
 {
        loff_t ret;
-       struct inode *inode = file->f_mapping->host;
+       struct inode *inode = file_inode(file);
 
        mutex_lock(&inode->i_mutex);
        switch (orig) {
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644 (file)
index 0000000..ca7f0d5
--- /dev/null
@@ -0,0 +1,25 @@
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/calling.h>
+
+ENTRY(___preempt_schedule)
+       CFI_STARTPROC
+       SAVE_ALL
+       call preempt_schedule
+       RESTORE_ALL
+       ret
+       CFI_ENDPROC
+
+#ifdef CONFIG_CONTEXT_TRACKING
+
+ENTRY(___preempt_schedule_context)
+       CFI_STARTPROC
+       SAVE_ALL
+       call preempt_schedule_context
+       RESTORE_ALL
+       ret
+       CFI_ENDPROC
+
+#endif
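
These thunks let the scheduler be called from sites that cannot honour the normal C ABI: SAVE_ALL/RESTORE_ALL preserve every caller register around preempt_schedule(). On the C side they plug into the new per-CPU preempt count roughly as follows (a sketch assuming the 3.13-era preempt.h interfaces):

	/* x86 wraps the thunk so no registers are clobbered at the call site */
	#define __preempt_schedule()	asm ("call ___preempt_schedule")

	/* preempt_enable() then becomes a decrement-and-test plus, when the
	 * count hits zero with a resched pending, one cheap call */
	#define preempt_enable() \
	do { \
		barrier(); \
		if (unlikely(preempt_count_dec_and_test())) \
			__preempt_schedule(); \
	} while (0)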
index c83516be1052552adefc6324ee8b0ecd4a1b1ac2..3fb8d95ab8b5ea3635ddb1f0d9f9c12e3a348285 100644 (file)
@@ -391,9 +391,9 @@ static void amd_e400_idle(void)
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
-                local_irq_disable();
-                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-                local_irq_enable();
+               local_irq_disable();
+               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+               local_irq_enable();
        } else
                default_idle();
 }
index 884f98f69354c18393755a23bc89edd74f1051aa..c2ec1aa6d45467bddd5f4c09189e4160b4fb2476 100644 (file)
@@ -291,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                set_iopl_mask(next->iopl);
 
+       /*
+        * If it were not for PREEMPT_ACTIVE we could guarantee that the
+        * preempt_count of all tasks was equal here and this would not be
+        * needed.
+        */
+       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
        /*
         * Now maybe handle debug registers and/or IO bitmaps
         */
index bb1dc51bab05649c4eb2e48ddbb23840c4a598a8..45ab4d6fc8a7af0409c56b0f8cd1170124b57858 100644 (file)
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);
 
+       /*
+        * If it were not for PREEMPT_ACTIVE we could guarantee that the
+        * preempt_count of all tasks was equal here and this would not be
+        * needed.
+        */
+       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
        this_cpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);
index 7e920bff99a34b260e1ea338c3fc7dee44ba0e67..6bb3f95037c65e06fabd02307272ff02a087b490 100644 (file)
@@ -136,194 +136,193 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
  * This is a single dmi_table handling all reboot quirks.
  */
 static struct dmi_system_id __initdata reboot_dmi_table[] = {
-       {       /* Handle problems with rebooting on Dell E520's */
-               .callback = set_bios_reboot,
-               .ident = "Dell E520",
+
+       /* Acer */
+       {       /* Handle reboot issue on Acer Aspire one */
+               .callback = set_kbd_reboot,
+               .ident = "Acer Aspire One A110",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
                },
        },
-       {       /* Handle problems with rebooting on Dell 1300's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 1300",
+
+       /* Apple */
+       {       /* Handle problems with rebooting on Apple MacBook5 */
+               .callback = set_pci_reboot,
+               .ident = "Apple MacBook5",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
                },
        },
-       {       /* Handle problems with rebooting on Dell 300's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 300",
+       {       /* Handle problems with rebooting on Apple MacBookPro5 */
+               .callback = set_pci_reboot,
+               .ident = "Apple MacBookPro5",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
                },
        },
-       {       /* Handle problems with rebooting on Dell Optiplex 745's SFF */
-               .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 745",
+       {       /* Handle problems with rebooting on Apple Macmini3,1 */
+               .callback = set_pci_reboot,
+               .ident = "Apple Macmini3,1",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
                },
        },
-       {       /* Handle problems with rebooting on Dell Optiplex 745's DFF */
-               .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 745",
+       {       /* Handle problems with rebooting on the iMac9,1. */
+               .callback = set_pci_reboot,
+               .ident = "Apple iMac9,1",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
                },
        },
-       {       /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+
+       /* ASUS */
+       {       /* Handle problems with rebooting on ASUS P4S800 */
                .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 745",
+               .ident = "ASUS P4S800",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+                       DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+                       DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
                },
        },
-       {       /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+
+       /* Dell */
+       {       /* Handle problems with rebooting on Dell DXP061 */
                .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 330",
+               .ident = "Dell DXP061",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
                },
        },
-       {       /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+       {       /* Handle problems with rebooting on Dell E520's */
                .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 360",
+               .ident = "Dell E520",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
                },
        },
-       {       /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
-               .callback = set_bios_reboot,
-               .ident = "Dell OptiPlex 760",
+       {       /* Handle problems with rebooting on the Latitude E5420. */
+               .callback = set_pci_reboot,
+               .ident = "Dell Latitude E5420",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
-                       DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
                },
        },
-       {       /* Handle problems with rebooting on Dell 2400's */
-               .callback = set_bios_reboot,
-               .ident = "Dell PowerEdge 2400",
+       {       /* Handle problems with rebooting on the Latitude E6320. */
+               .callback = set_pci_reboot,
+               .ident = "Dell Latitude E6320",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
                },
        },
-       {       /* Handle problems with rebooting on Dell T5400's */
-               .callback = set_bios_reboot,
-               .ident = "Dell Precision T5400",
+       {       /* Handle problems with rebooting on the Latitude E6420. */
+               .callback = set_pci_reboot,
+               .ident = "Dell Latitude E6420",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
                },
        },
-       {       /* Handle problems with rebooting on Dell T7400's */
+       {       /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
                .callback = set_bios_reboot,
-               .ident = "Dell Precision T7400",
+               .ident = "Dell OptiPlex 330",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
                },
        },
-       {       /* Handle problems with rebooting on HP laptops */
+       {       /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
                .callback = set_bios_reboot,
-               .ident = "HP Compaq Laptop",
+               .ident = "Dell OptiPlex 360",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
                },
        },
-       {       /* Handle problems with rebooting on Dell XPS710 */
+       {       /* Handle problems with rebooting on Dell Optiplex 745's SFF */
                .callback = set_bios_reboot,
-               .ident = "Dell XPS710",
+               .ident = "Dell OptiPlex 745",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
                },
        },
-       {       /* Handle problems with rebooting on Dell DXP061 */
+       {       /* Handle problems with rebooting on Dell Optiplex 745's DFF */
                .callback = set_bios_reboot,
-               .ident = "Dell DXP061",
+               .ident = "Dell OptiPlex 745",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
                },
        },
-       {       /* Handle problems with rebooting on Sony VGN-Z540N */
+       {       /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
                .callback = set_bios_reboot,
-               .ident = "Sony VGN-Z540N",
+               .ident = "Dell OptiPlex 745",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
                },
        },
-       {       /* Handle problems with rebooting on ASUS P4S800 */
+       {       /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
                .callback = set_bios_reboot,
-               .ident = "ASUS P4S800",
-               .matches = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
-                       DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
-               },
-       },
-
-       {       /* Handle reboot issue on Acer Aspire one */
-               .callback = set_kbd_reboot,
-               .ident = "Acer Aspire One A110",
+               .ident = "Dell OptiPlex 760",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
                },
        },
-       {       /* Handle problems with rebooting on Apple MacBook5 */
+       {       /* Handle problems with rebooting on the OptiPlex 990. */
                .callback = set_pci_reboot,
-               .ident = "Apple MacBook5",
+               .ident = "Dell OptiPlex 990",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
                },
        },
-       {       /* Handle problems with rebooting on Apple MacBookPro5 */
-               .callback = set_pci_reboot,
-               .ident = "Apple MacBookPro5",
+       {       /* Handle problems with rebooting on Dell 300's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 300",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
                },
        },
-       {       /* Handle problems with rebooting on Apple Macmini3,1 */
-               .callback = set_pci_reboot,
-               .ident = "Apple Macmini3,1",
+       {       /* Handle problems with rebooting on Dell 1300's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 1300",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
                },
        },
-       {       /* Handle problems with rebooting on the iMac9,1. */
-               .callback = set_pci_reboot,
-               .ident = "Apple iMac9,1",
+       {       /* Handle problems with rebooting on Dell 2400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell PowerEdge 2400",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
                },
        },
-       {       /* Handle problems with rebooting on the Latitude E6320. */
+       {       /* Handle problems with rebooting on the Dell PowerEdge C6100. */
                .callback = set_pci_reboot,
-               .ident = "Dell Latitude E6320",
+               .ident = "Dell PowerEdge C6100",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
                },
        },
        {       /* Handle problems with rebooting on the Latitude E5410. */
@@ -334,54 +333,59 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
                },
        },
-       {       /* Handle problems with rebooting on the Latitude E5420. */
+       {       /* Handle problems with rebooting on the Precision M6600. */
                .callback = set_pci_reboot,
-               .ident = "Dell Latitude E5420",
+               .ident = "Dell Precision M6600",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
                },
        },
-       {       /* Handle problems with rebooting on the Latitude E6420. */
-               .callback = set_pci_reboot,
-               .ident = "Dell Latitude E6420",
+       {       /* Handle problems with rebooting on Dell T5400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell Precision T5400",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
                },
        },
-       {       /* Handle problems with rebooting on the OptiPlex 990. */
-               .callback = set_pci_reboot,
-               .ident = "Dell OptiPlex 990",
+       {       /* Handle problems with rebooting on Dell T7400's */
+               .callback = set_bios_reboot,
+               .ident = "Dell Precision T7400",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
                },
        },
-       {       /* Handle problems with rebooting on the Precision M6600. */
-               .callback = set_pci_reboot,
-               .ident = "Dell Precision M6600",
+       {       /* Handle problems with rebooting on Dell XPS710 */
+               .callback = set_bios_reboot,
+               .ident = "Dell XPS710",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
                },
        },
-       {       /* Handle problems with rebooting on the Dell PowerEdge C6100. */
-               .callback = set_pci_reboot,
-               .ident = "Dell PowerEdge C6100",
+
+       /* Hewlett-Packard */
+       {       /* Handle problems with rebooting on HP laptops */
+               .callback = set_bios_reboot,
+               .ident = "HP Compaq Laptop",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
                },
        },
-       {       /* Some C6100 machines were shipped with vendor being 'Dell'. */
-               .callback = set_pci_reboot,
-               .ident = "Dell PowerEdge C6100",
+
+       /* Sony */
+       {       /* Handle problems with rebooting on Sony VGN-Z540N */
+               .callback = set_bios_reboot,
+               .ident = "Sony VGN-Z540N",
                .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+                       DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
                },
        },
+
        { }
 };
 
@@ -535,10 +539,13 @@ static void native_machine_emergency_restart(void)
 
                case BOOT_CF9_COND:
                        if (port_cf9_safe) {
-                               u8 cf9 = inb(0xcf9) & ~6;
+                               u8 reboot_code = reboot_mode == REBOOT_WARM ?
+                                       0x06 : 0x0E;
+                               u8 cf9 = inb(0xcf9) & ~reboot_code;
                                outb(cf9|2, 0xcf9); /* Request hard reset */
                                udelay(50);
-                               outb(cf9|6, 0xcf9); /* Actually do the reset */
+                               /* Actually do the reset */
+                               outb(cf9|reboot_code, 0xcf9);
                                udelay(50);
                        }
                        reboot_type = BOOT_KBD;
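
Port 0xCF9 is the PIIX/ICH-style Reset Control register: bit 1 selects a hard reset, the 0-to-1 edge on bit 2 actually fires it, and bit 3 escalates to a full power cycle. So REBOOT_WARM keeps the old 0x06 behaviour while cold reboots now use 0x0E. Flattened, the sequence is (bit meanings per common Intel chipset datasheets; treat as illustrative):

	u8 code = (reboot_mode == REBOOT_WARM) ? 0x06 : 0x0E;
	u8 base = inb(0xcf9) & ~code;	/* clear the bits we are about to pulse */

	outb(base | 0x02, 0xcf9);	/* bit 1: request a hard (not soft) reset */
	udelay(50);
	outb(base | code, 0xcf9);	/* 0->1 edge on bit 2 fires the reset;
					   bit 3, if set, makes it a power cycle */
	udelay(50);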
index 0aa29394ed6fe8dadda70aeefcf7bb2d3462e6ec..e35cb18b8a004ce9975dac10f04308181c8472ae 100644 (file)
@@ -12,7 +12,7 @@
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
 #include <asm/time.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/rtc.h>
 
 #ifdef CONFIG_X86_32
@@ -189,7 +189,7 @@ static __init int add_rtc_cmos(void)
                return 0;
 
        /* Intel MID platforms don't have ioport rtc */
-       if (mrst_identify_cpu())
+       if (intel_mid_identify_cpu())
                return -ENODEV;
 
        platform_device_register(&rtc_device);
index f0de6294b95573bce74a61ff3b6fed04443f434e..1708862fc40dd0fde25ff5f47057cf67db8d8091 100644 (file)
@@ -823,6 +823,20 @@ static void __init trim_low_memory_range(void)
        memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
 }
        
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+       pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
+                "(relocation range: 0x%lx-0x%lx)\n",
+                (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
+                __START_KERNEL_map, MODULES_VADDR-1);
+
+       return 0;
+}
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1242,3 +1256,15 @@ void __init i386_reserve_resources(void)
 }
 
 #endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+       .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list,
+                                       &kernel_offset_notifier);
+       return 0;
+}
+__initcall(register_kernel_offset_dumper);
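
The registration above is the stock panic-notifier pattern and is reusable by any code that wants one line in the panic output; a minimal skeleton of the same pattern (every name here is invented for illustration):

	#include <linux/notifier.h>
	#include <linux/kernel.h>

	static int my_panic_note(struct notifier_block *self,
				 unsigned long event, void *data)
	{
		pr_emerg("mydrv: last known state ...\n");
		return 0;
	}

	static struct notifier_block my_panic_nb = {
		.notifier_call = my_panic_note,
	};

	static int __init my_register(void)
	{
		atomic_notifier_chain_register(&panic_notifier_list,
					       &my_panic_nb);
		return 0;
	}
	__initcall(my_register);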
index e73b3f53310c7663b2c37a3cffd57c0cad974a3c..85dc05a3aa02b3251b64ae0406b95716c714cb85 100644 (file)
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
-
 #include <asm/smpboot_hooks.h>
 #include <asm/i8259.h>
-
 #include <asm/realmode.h>
+#include <asm/misc.h>
 
 /* State of each CPU */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
@@ -627,22 +626,46 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
        return (send_status | accept_status);
 }
 
+void smp_announce(void)
+{
+       int num_nodes = num_online_nodes();
+
+       printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
+              num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
+}
+
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
        static int current_node = -1;
        int node = early_cpu_to_node(cpu);
-       int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS);
+       static int width, node_width;
+
+       if (!width)
+               width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+       if (!node_width)
+               node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+       if (cpu == 1)
+               printk(KERN_INFO "x86: Booting SMP configuration:\n");
 
        if (system_state == SYSTEM_BOOTING) {
                if (node != current_node) {
                        if (current_node > (-1))
-                               pr_cont(" OK\n");
+                               pr_cont("\n");
                        current_node = node;
-                       pr_info("Booting Node %3d, Processors ", node);
+
+                       printk(KERN_INFO ".... node %*s#%d, CPUs:  ",
+                              node_width - num_digits(node), " ", node);
                }
-               pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ? " OK\n" : "");
-               return;
+
+               /* Add padding for the BSP */
+               if (cpu == 1)
+                       pr_cont("%*s", width + 1, " ");
+
+               pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+
        } else
                pr_info("Booting Node %d Processor %d APIC 0x%x\n",
                        node, cpu, apicid);
index 8c8093b146ca7cc565c48cb7ba66ca89deb06f9a..729aa779ff7523fda90b0031e9dab09316a5beae 100644 (file)
@@ -88,7 +88,7 @@ static inline void conditional_sti(struct pt_regs *regs)
 
 static inline void preempt_conditional_sti(struct pt_regs *regs)
 {
-       inc_preempt_count();
+       preempt_count_inc();
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();
 }
@@ -103,7 +103,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
        if (regs->flags & X86_EFLAGS_IF)
                local_irq_disable();
-       dec_preempt_count();
+       preempt_count_dec();
 }
 
 static int __kprobes
index 10c4f3006afd3471142ceb8f217fefaba90591d7..da6b35a9826017a04298ecfca3ae7496be4598cb 100644 (file)
@@ -199,6 +199,15 @@ SECTIONS
                __x86_cpu_dev_end = .;
        }
 
+#ifdef CONFIG_X86_INTEL_MID
+       .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+                                                               LOAD_OFFSET) {
+               __x86_intel_mid_dev_start = .;
+               *(.x86_intel_mid_dev.init)
+               __x86_intel_mid_dev_end = .;
+       }
+#endif
+
        /*
         * start address and size of operations which during runtime
         * can be patched with virtualization friendly instructions or
index b014d9414d085611db5d46cc8ee797d39a97ba90..040681928e9d971670eb1bab49b1c86e115dd55d 100644 (file)
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
index 96b2c6697c9d9b5813bc82f01745ea2838b25aa1..992d63bb154f0e7bc54fe26f549e40d6ae724bd2 100644 (file)
@@ -16,7 +16,7 @@ clean-files := inat-tables.c
 
 obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 
-lib-y := delay.o
+lib-y := delay.o misc.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
new file mode 100644 (file)
index 0000000..76b373a
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Count the digits of @val including a possible sign.
+ *
+ * (Typed on and submitted from hpa's mobile phone.)
+ */
+int num_digits(int val)
+{
+       int m = 10;
+       int d = 1;
+
+       if (val < 0) {
+               d++;
+               val = -val;
+       }
+
+       while (val >= m) {
+               m *= 10;
+               d++;
+       }
+       return d;
+}
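
A few spot checks of the helper, worked out by hand against the loop above; the SMP boot announcer later in this series uses it to compute column widths:

	num_digits(0);		/* -> 1: 0 < 10, the loop never runs */
	num_digits(9999);	/* -> 4 */
	num_digits(-42);	/* -> 3: the minus sign counts as a digit */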
index 04664cdb7fda3df55f3a989d2c4186aaa780dbc8..ce32017c5e38dc1b3ba14f08c6a82a13113d73de 100644 (file)
@@ -399,8 +399,25 @@ static unsigned long __init init_range_memory_mapping(
        return mapped_ram_size;
 }
 
-/* (PUD_SHIFT-PMD_SHIFT)/2 */
-#define STEP_SIZE_SHIFT 5
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+       /*
+        * Why we shift by 5, and why 'step_size << 5' cannot overflow:
+        *
+        * The initial mapped size is PMD_SIZE (2M).
+        * We cannot set step_size to PUD_SIZE (1G) yet.
+        * In the worst case, when we cross the 1G boundary and
+        * PG_LEVEL_2M is not set, we need 1+1+512 pages (2M + 8k)
+        * to map a 1G range with PTEs, so use 5 as the shift for now.
+        *
+        * Overflow is not a concern: on 32-bit, when step_size is 0,
+        * round_down() returns 0 for start, and that turns it into
+        * 0x100000000ULL.
+        */
+       return step_size << 5;
+}
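
Starting from the initial PMD_SIZE step used by init_mem_mapping(), each call multiplies the step by 32, so the mapped-range schedule grows very quickly; an illustrative trace:

	unsigned long step = PMD_SIZE;	/* 0x00200000:  2M */

	step = get_new_step_size(step);	/* 0x04000000: 64M */
	step = get_new_step_size(step);	/* 0x80000000:  2G */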
+
 void __init init_mem_mapping(void)
 {
        unsigned long end, real_end, start, last_start;
@@ -449,7 +466,7 @@ void __init init_mem_mapping(void)
                min_pfn_mapped = last_start >> PAGE_SHIFT;
                /* only increase step_size after big range get mapped */
                if (new_mapped_ram_size > mapped_ram_size)
-                       step_size <<= STEP_SIZE_SHIFT;
+                       step_size = get_new_step_size(step_size);
                mapped_ram_size += new_mapped_ram_size;
        }
 
index 4287f1ffba7ef2e42cc68839a38c3ecc0d6b1304..5bdc5430597cfea9056e08edb7902a098f28535c 100644 (file)
@@ -806,6 +806,9 @@ void __init mem_init(void)
        BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);
 #undef high_memory
 #undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+       BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
 
 #ifdef CONFIG_HIGHMEM
        BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
index ee0af58ca5bd7e5e75ae22885a887ff86381fe46..e063eed0f9128bde4e721264e9bbdc776688e615 100644 (file)
@@ -18,7 +18,7 @@ obj-$(CONFIG_X86_VISWS)               += visws.o
 obj-$(CONFIG_X86_NUMAQ)                += numaq_32.o
 obj-$(CONFIG_X86_NUMACHIP)     += numachip.o
 
-obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
+obj-$(CONFIG_X86_INTEL_MID)    += intel_mid_pci.o
 
 obj-y                          += common.o early.o
 obj-y                          += bus_numa.o
similarity index 96%
rename from arch/x86/pci/mrst.c
rename to arch/x86/pci/intel_mid_pci.c
index 903fded507869b45cc7b304e9b9bb0169db7b00a..51384ca727ad1ac4636fa7ce32efb5730ed7579d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Moorestown PCI support
+ * Intel MID PCI support
  *   Copyright (c) 2008 Intel Corporation
  *     Jesse Barnes <jesse.barnes@intel.com>
  *
@@ -150,12 +150,12 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
         * shim. Therefore, use the header type in shim instead.
         */
        if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
-               return 0;
+               return false;
        if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
                                || devfn == PCI_DEVFN(0, 0)
                                || devfn == PCI_DEVFN(3, 0)))
-               return 1;
-       return 0; /* Langwell on others */
+               return true;
+       return false; /* Langwell on others */
 }
 
 static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
@@ -205,7 +205,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
                               where, size, value);
 }
 
-static int mrst_pci_irq_enable(struct pci_dev *dev)
+static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
        u8 pin;
        struct io_apic_irq_attr irq_attr;
@@ -225,23 +225,23 @@ static int mrst_pci_irq_enable(struct pci_dev *dev)
        return 0;
 }
 
-struct pci_ops pci_mrst_ops = {
+struct pci_ops intel_mid_pci_ops = {
        .read = pci_read,
        .write = pci_write,
 };
 
 /**
- * pci_mrst_init - installs pci_mrst_ops
+ * intel_mid_pci_init - installs intel_mid_pci_ops
  *
  * Moorestown has an interesting PCI implementation (see above).
  * Called when the early platform detection installs it.
  */
-int __init pci_mrst_init(void)
+int __init intel_mid_pci_init(void)
 {
        pr_info("Intel MID platform detected, using MID PCI ops\n");
        pci_mmcfg_late_init();
-       pcibios_enable_irq = mrst_pci_irq_enable;
-       pci_root_ops = pci_mrst_ops;
+       pcibios_enable_irq = intel_mid_pci_irq_enable;
+       pci_root_ops = intel_mid_pci_ops;
        pci_soc_mode = 1;
        /* Continue with standard init */
        return 1;
index 01e0231a113e9375e0e4f6c44ca17aac83d3eda4..20342d4c82cec9784c3e9149cd984aa635792610 100644 (file)
@@ -4,7 +4,7 @@ obj-y   += efi/
 obj-y  += geode/
 obj-y  += goldfish/
 obj-y  += iris/
-obj-y  += mrst/
+obj-y  += intel-mid/
 obj-y  += olpc/
 obj-y  += scx200/
 obj-y  += sfi/
index c7e22ab29a5a2eb6ce3dd75f0fd3e0b26be2d61b..92c02344a060f2dacc7997185d6fd6bc04ed225e 100644 (file)
 
 static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
 
-struct efi __read_mostly efi = {
-       .mps        = EFI_INVALID_TABLE_ADDR,
-       .acpi       = EFI_INVALID_TABLE_ADDR,
-       .acpi20     = EFI_INVALID_TABLE_ADDR,
-       .smbios     = EFI_INVALID_TABLE_ADDR,
-       .sal_systab = EFI_INVALID_TABLE_ADDR,
-       .boot_info  = EFI_INVALID_TABLE_ADDR,
-       .hcdp       = EFI_INVALID_TABLE_ADDR,
-       .uga        = EFI_INVALID_TABLE_ADDR,
-       .uv_systab  = EFI_INVALID_TABLE_ADDR,
-};
-EXPORT_SYMBOL(efi);
-
 struct efi_memory_map memmap;
 
 static struct efi efi_phys __initdata;
@@ -80,6 +67,13 @@ static efi_system_table_t efi_systab __initdata;
 
 unsigned long x86_efi_facility;
 
+static __initdata efi_config_table_type_t arch_tables[] = {
+#ifdef CONFIG_X86_UV
+       {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab},
+#endif
+       {NULL_GUID, NULL, NULL},
+};
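
Each entry pairs a GUID with a label and a destination pointer: the now-generic efi_config_init() scans the firmware's configuration tables and, on a GUID match, stores the table address through .ptr and logs .name. The entry type is roughly (a sketch of the 3.13-era include/linux/efi.h definition):

	typedef struct {
		efi_guid_t guid;
		const char *name;
		unsigned long *ptr;
	} efi_config_table_type_t;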
+
 /*
  * Returns 1 if 'facility' is enabled, 0 otherwise.
  */
@@ -399,6 +393,8 @@ int __init efi_memblock_x86_reserve_range(void)
 
        memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
 
+       efi.memmap = &memmap;
+
        return 0;
 }
 
@@ -578,80 +574,6 @@ static int __init efi_systab_init(void *phys)
        return 0;
 }
 
-static int __init efi_config_init(u64 tables, int nr_tables)
-{
-       void *config_tables, *tablep;
-       int i, sz;
-
-       if (efi_enabled(EFI_64BIT))
-               sz = sizeof(efi_config_table_64_t);
-       else
-               sz = sizeof(efi_config_table_32_t);
-
-       /*
-        * Let's see what config tables the firmware passed to us.
-        */
-       config_tables = early_ioremap(tables, nr_tables * sz);
-       if (config_tables == NULL) {
-               pr_err("Could not map Configuration table!\n");
-               return -ENOMEM;
-       }
-
-       tablep = config_tables;
-       pr_info("");
-       for (i = 0; i < efi.systab->nr_tables; i++) {
-               efi_guid_t guid;
-               unsigned long table;
-
-               if (efi_enabled(EFI_64BIT)) {
-                       u64 table64;
-                       guid = ((efi_config_table_64_t *)tablep)->guid;
-                       table64 = ((efi_config_table_64_t *)tablep)->table;
-                       table = table64;
-#ifdef CONFIG_X86_32
-                       if (table64 >> 32) {
-                               pr_cont("\n");
-                               pr_err("Table located above 4GB, disabling EFI.\n");
-                               early_iounmap(config_tables,
-                                             efi.systab->nr_tables * sz);
-                               return -EINVAL;
-                       }
-#endif
-               } else {
-                       guid = ((efi_config_table_32_t *)tablep)->guid;
-                       table = ((efi_config_table_32_t *)tablep)->table;
-               }
-               if (!efi_guidcmp(guid, MPS_TABLE_GUID)) {
-                       efi.mps = table;
-                       pr_cont(" MPS=0x%lx ", table);
-               } else if (!efi_guidcmp(guid, ACPI_20_TABLE_GUID)) {
-                       efi.acpi20 = table;
-                       pr_cont(" ACPI 2.0=0x%lx ", table);
-               } else if (!efi_guidcmp(guid, ACPI_TABLE_GUID)) {
-                       efi.acpi = table;
-                       pr_cont(" ACPI=0x%lx ", table);
-               } else if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) {
-                       efi.smbios = table;
-                       pr_cont(" SMBIOS=0x%lx ", table);
-#ifdef CONFIG_X86_UV
-               } else if (!efi_guidcmp(guid, UV_SYSTEM_TABLE_GUID)) {
-                       efi.uv_systab = table;
-                       pr_cont(" UVsystab=0x%lx ", table);
-#endif
-               } else if (!efi_guidcmp(guid, HCDP_TABLE_GUID)) {
-                       efi.hcdp = table;
-                       pr_cont(" HCDP=0x%lx ", table);
-               } else if (!efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID)) {
-                       efi.uga = table;
-                       pr_cont(" UGA=0x%lx ", table);
-               }
-               tablep += sz;
-       }
-       pr_cont("\n");
-       early_iounmap(config_tables, efi.systab->nr_tables * sz);
-       return 0;
-}
-
 static int __init efi_runtime_init(void)
 {
        efi_runtime_services_t *runtime;
@@ -745,7 +667,7 @@ void __init efi_init(void)
                efi.systab->hdr.revision >> 16,
                efi.systab->hdr.revision & 0xffff, vendor);
 
-       if (efi_config_init(efi.systab->tables, efi.systab->nr_tables))
+       if (efi_config_init(arch_tables))
                return;
 
        set_bit(EFI_CONFIG_TABLES, &x86_efi_facility);
@@ -816,34 +738,6 @@ static void __init runtime_code_page_mkexec(void)
        }
 }
 
-/*
- * We can't ioremap data in EFI boot services RAM, because we've already mapped
- * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
- * callable after efi_enter_virtual_mode and before efi_free_boot_services.
- */
-void __iomem *efi_lookup_mapped_addr(u64 phys_addr)
-{
-       void *p;
-       if (WARN_ON(!memmap.map))
-               return NULL;
-       for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-               efi_memory_desc_t *md = p;
-               u64 size = md->num_pages << EFI_PAGE_SHIFT;
-               u64 end = md->phys_addr + size;
-               if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
-                   md->type != EFI_BOOT_SERVICES_CODE &&
-                   md->type != EFI_BOOT_SERVICES_DATA)
-                       continue;
-               if (!md->virt_addr)
-                       continue;
-               if (phys_addr >= md->phys_addr && phys_addr < end) {
-                       phys_addr += md->virt_addr - md->phys_addr;
-                       return (__force void __iomem *)(unsigned long)phys_addr;
-               }
-       }
-       return NULL;
-}
-
 void efi_memory_uc(u64 addr, unsigned long size)
 {
        unsigned long page_shift = 1UL << EFI_PAGE_SHIFT;
index 90e23e7679a5b9062a34dd266610f1662b426bbb..76b6632d3143352a8161e64721a1a10af884c0bf 100644 (file)
@@ -98,7 +98,7 @@ static struct platform_device alix_leds_dev = {
        .dev.platform_data = &alix_leds_data,
 };
 
-static struct __initdata platform_device *alix_devs[] = {
+static struct platform_device *alix_devs[] __initdata = {
        &alix_buttons_dev,
        &alix_leds_dev,
 };
index c2e6d53558be4dc9f5412efa8daaf0221138770a..aa733fba2471247cd838c86a2ca39de28d997996 100644 (file)
@@ -87,7 +87,7 @@ static struct platform_device geos_leds_dev = {
        .dev.platform_data = &geos_leds_data,
 };
 
-static struct __initdata platform_device *geos_devs[] = {
+static struct platform_device *geos_devs[] __initdata = {
        &geos_buttons_dev,
        &geos_leds_dev,
 };
index 646e3b5b4bb6af37766f0fa81625b7d7f876d4d2..927e38c0089fac8947de59eab9a0b029d122b696 100644 (file)
@@ -78,7 +78,7 @@ static struct platform_device net5501_leds_dev = {
        .dev.platform_data = &net5501_leds_data,
 };
 
-static struct __initdata platform_device *net5501_devs[] = {
+static struct platform_device *net5501_devs[] __initdata = {
        &net5501_buttons_dev,
        &net5501_leds_dev,
 };
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
new file mode 100644 (file)
index 0000000..01cc29e
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o
+obj-$(CONFIG_X86_INTEL_MID) += intel_mid_vrtc.o
+obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
+# SFI specific code
+ifdef CONFIG_X86_INTEL_MID
+obj-$(CONFIG_SFI) += sfi.o device_libs/
+endif
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
new file mode 100644 (file)
index 0000000..097e7a7
--- /dev/null
@@ -0,0 +1,22 @@
+# IPC Devices
+obj-y += platform_ipc.o
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
+obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o
+obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o
+obj-$(subst m,y,$(CONFIG_GPIO_INTEL_PMIC)) += platform_pmic_gpio.o
+obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o
+# I2C Devices
+obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
+obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
+obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o
+obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o
+obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
+obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
+obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
+# SPI Devices
+obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o
+# MISC Devices
+obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
new file mode 100644 (file)
index 0000000..0ae7f2a
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * platform_bma023.c: bma023 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <asm/intel-mid.h>
+
+static const struct devs_id bma023_dev_id __initconst = {
+       .name = "bma023",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+};
+
+sfi_device(bma023_dev_id);
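
platform_bma023.c is the simplest of the new device libraries: no platform-data callback, just a table entry. The sfi_device() registration macro itself is defined in <asm/intel-mid.h>; judging from the __x86_intel_mid_dev_start/end markers consumed by get_device_id() in sfi.c further down, it works by placing a pointer to the devs_id in a dedicated linker section that the SFI parser walks at boot. A hypothetical userspace analogue of that linker-section table pattern (on ELF targets, GNU ld synthesizes __start_/__stop_ symbols for sections whose names are valid C identifiers):

    #include <stdio.h>

    struct dev_entry { const char *name; };

    /* Each "registration" drops a pointer into the custom section. */
    #define register_dev(d) \
            static const struct dev_entry *const __entry_##d \
            __attribute__((used, section("dev_table"))) = &d

    static const struct dev_entry bma023 = { .name = "bma023" };
    register_dev(bma023);

    extern const struct dev_entry *const __start_dev_table[];
    extern const struct dev_entry *const __stop_dev_table[];

    int main(void)
    {
            const struct dev_entry *const *p;

            for (p = __start_dev_table; p < __stop_dev_table; p++)
                    printf("registered: %s\n", (*p)->name);
            return 0;
    }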
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
new file mode 100644 (file)
index 0000000..0d942c1
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * platform_emc1403.c: emc1403 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <asm/intel-mid.h>
+
+static void __init *emc1403_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr = get_gpio_by_name("thermal_int");
+       int intr2nd = get_gpio_by_name("thermal_alert");
+
+       if (intr == -1 || intr2nd == -1)
+               return NULL;
+
+       i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+       intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
+
+       return &intr2nd_pdata;
+}
+
+static const struct devs_id emc1403_dev_id __initconst = {
+       .name = "emc1403",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+       .get_platform_data = &emc1403_platform_data,
+};
+
+sfi_device(emc1403_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
new file mode 100644 (file)
index 0000000..a013a48
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * platform_gpio_keys.c: gpio_keys platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/input.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_keys.h>
+#include <linux/platform_device.h>
+#include <asm/intel-mid.h>
+
+#define DEVICE_NAME "gpio-keys"
+
+/*
+ * We look these buttons up in the SFI GPIO table by name and register
+ * them dynamically, so list every possible button here; entries whose
+ * GPIO is not found are dropped at init time.
+ */
+static struct gpio_keys_button gpio_button[] = {
+       {KEY_POWER,             -1, 1, "power_btn",     EV_KEY, 0, 3000},
+       {KEY_PROG1,             -1, 1, "prog_btn1",     EV_KEY, 0, 20},
+       {KEY_PROG2,             -1, 1, "prog_btn2",     EV_KEY, 0, 20},
+       {SW_LID,                -1, 1, "lid_switch",    EV_SW,  0, 20},
+       {KEY_VOLUMEUP,          -1, 1, "vol_up",        EV_KEY, 0, 20},
+       {KEY_VOLUMEDOWN,        -1, 1, "vol_down",      EV_KEY, 0, 20},
+       {KEY_CAMERA,            -1, 1, "camera_full",   EV_KEY, 0, 20},
+       {KEY_CAMERA_FOCUS,      -1, 1, "camera_half",   EV_KEY, 0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw1",        EV_SW,  0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw2",        EV_SW,  0, 20},
+};
+
+static struct gpio_keys_platform_data gpio_keys = {
+       .buttons        = gpio_button,
+       .rep            = 1,
+       .nbuttons       = -1, /* will fill it after search */
+};
+
+static struct platform_device pb_device = {
+       .name           = DEVICE_NAME,
+       .id             = -1,
+       .dev            = {
+               .platform_data  = &gpio_keys,
+       },
+};
+
+/*
+ * Drop the buttons whose GPIOs do not exist and register the gpio
+ * button device if any remain.
+ */
+static int __init pb_keys_init(void)
+{
+       struct gpio_keys_button *gb = gpio_button;
+       int i, num, good = 0;
+
+       num = ARRAY_SIZE(gpio_button);
+       for (i = 0; i < num; i++) {
+               gb[i].gpio = get_gpio_by_name(gb[i].desc);
+               pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
+                                       gb[i].gpio);
+               if (gb[i].gpio == -1)
+                       continue;
+
+               if (i != good)
+                       gb[good] = gb[i];
+               good++;
+       }
+
+       if (good) {
+               gpio_keys.nbuttons = good;
+               return platform_device_register(&pb_device);
+       }
+       return 0;
+}
+late_initcall(pb_keys_init);
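
The loop in pb_keys_init() is a stable in-place filter: scan the array once, copy each surviving entry down over the gaps, and let the write cursor (good) become the new length. The same idiom in a self-contained sketch:

    #include <stdio.h>

    /* Keep only non-negative values, preserving order; returns new length. */
    static int compact(int *v, int n)
    {
            int i, good = 0;

            for (i = 0; i < n; i++) {
                    if (v[i] < 0)           /* the "no GPIO found" case */
                            continue;
                    if (i != good)
                            v[good] = v[i];
                    good++;
            }
            return good;
    }

    int main(void)
    {
            int v[] = { 5, -1, 7, -1, -1, 2 };
            int i, n = compact(v, 6);

            for (i = 0; i < n; i++)
                    printf("%d ", v[i]);    /* prints: 5 7 2 */
            printf("\n");
            return 0;
    }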
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.c b/arch/x86/platform/intel-mid/device_libs/platform_ipc.c
new file mode 100644 (file)
index 0000000..a84b73d
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * platform_ipc.c: IPC platform library file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/sfi.h>
+#include <linux/gpio.h>
+#include <asm/intel-mid.h>
+#include "platform_ipc.h"
+
+void __init ipc_device_handler(struct sfi_device_table_entry *pentry,
+                               struct devs_id *dev)
+{
+       struct platform_device *pdev;
+       void *pdata = NULL;
+       static struct resource res __initdata = {
+               .name = "IRQ",
+               .flags = IORESOURCE_IRQ,
+       };
+
+       pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
+               pentry->name, pentry->irq);
+
+       /*
+        * We need to call the platform init of IPC devices to fill the
+        * msic_pdata structure. It will be used in msic_init for
+        * initialization.
+        */
+       if (dev != NULL)
+               pdata = dev->get_platform_data(pentry);
+
+       /*
+        * On Medfield the platform device creation is handled by the MSIC
+        * MFD driver so we don't need to do it here.
+        */
+       if (intel_mid_has_msic())
+               return;
+
+       pdev = platform_device_alloc(pentry->name, 0);
+       if (pdev == NULL) {
+               pr_err("out of memory for SFI platform device '%s'.\n",
+                       pentry->name);
+               return;
+       }
+       res.start = pentry->irq;
+       platform_device_add_resources(pdev, &res, 1);
+
+       pdev->dev.platform_data = pdata;
+       intel_scu_device_register(pdev);
+}
+
+static const struct devs_id pmic_audio_dev_id __initconst = {
+       .name = "pmic_audio",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(pmic_audio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_ipc.h b/arch/x86/platform/intel-mid/device_libs/platform_ipc.h
new file mode 100644 (file)
index 0000000..8f568dd
--- /dev/null
@@ -0,0 +1,17 @@
+/*
+ * platform_ipc.h: IPC platform library header file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_IPC_H_
+#define _PLATFORM_IPC_H_
+
+extern void __init ipc_device_handler(struct sfi_device_table_entry *pentry,
+                       struct devs_id *dev) __attribute__((weak));
+#endif
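
Note the weak attribute on the declaration: it turns uses of ipc_device_handler() into weak references, so a configuration that links a device library without platform_ipc.o still links, with the symbol resolving to NULL (callers that might run in such a configuration would need to check it first). A minimal sketch of weak-reference behavior, assuming a GCC/ELF toolchain:

    #include <stdio.h>

    /* Weak reference: resolves to NULL if no definition is linked in. */
    extern void handler(int irq) __attribute__((weak));

    int main(void)
    {
            if (handler)
                    handler(7);
            else
                    printf("no handler linked in\n");
            return 0;
    }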
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
new file mode 100644 (file)
index 0000000..15278c1
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * platform_lis331.c: lis331 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <asm/intel-mid.h>
+
+static void __init *lis331dl_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr = get_gpio_by_name("accel_int");
+       int intr2nd = get_gpio_by_name("accel_2");
+
+       if (intr == -1 || intr2nd == -1)
+               return NULL;
+
+       i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+       intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
+
+       return &intr2nd_pdata;
+}
+
+static const struct devs_id lis331dl_dev_id __initconst = {
+       .name = "i2c_accel",
+       .type = SFI_DEV_TYPE_I2C,
+       .get_platform_data = &lis331dl_platform_data,
+};
+
+sfi_device(lis331dl_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c
new file mode 100644 (file)
index 0000000..afd1df9
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * platform_max3111.c: max3111 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/spi/spi.h>
+#include <asm/intel-mid.h>
+
+static void __init *max3111_platform_data(void *info)
+{
+       struct spi_board_info *spi_info = info;
+       int intr = get_gpio_by_name("max3111_int");
+
+       spi_info->mode = SPI_MODE_0;
+       if (intr == -1)
+               return NULL;
+       spi_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+       return NULL;
+}
+
+static const struct devs_id max3111_dev_id __initconst = {
+       .name = "spi_max3111",
+       .type = SFI_DEV_TYPE_SPI,
+       .get_platform_data = &max3111_platform_data,
+};
+
+sfi_device(max3111_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
new file mode 100644 (file)
index 0000000..94ade10
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * platform_max7315.c: max7315 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/platform_data/pca953x.h>
+#include <asm/intel-mid.h>
+
+#define MAX7315_NUM 2
+
+static void __init *max7315_platform_data(void *info)
+{
+       static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
+       static int nr;
+       struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
+       struct i2c_board_info *i2c_info = info;
+       int gpio_base, intr;
+       char base_pin_name[SFI_NAME_LEN + 1];
+       char intr_pin_name[SFI_NAME_LEN + 1];
+
+       if (nr == MAX7315_NUM) {
+               pr_err("too many max7315s, we only support %d\n",
+                               MAX7315_NUM);
+               return NULL;
+       }
+       /*
+        * We may have several max7315 expanders on the board; loading
+        * multiple instances of the same pca953x driver covers them.
+        */
+       strcpy(i2c_info->type, "max7315");
+       if (nr++) {
+               sprintf(base_pin_name, "max7315_%d_base", nr);
+               sprintf(intr_pin_name, "max7315_%d_int", nr);
+       } else {
+               strcpy(base_pin_name, "max7315_base");
+               strcpy(intr_pin_name, "max7315_int");
+       }
+
+       gpio_base = get_gpio_by_name(base_pin_name);
+       intr = get_gpio_by_name(intr_pin_name);
+
+       if (gpio_base == -1)
+               return NULL;
+       max7315->gpio_base = gpio_base;
+       if (intr != -1) {
+               i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+               max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+       } else {
+               i2c_info->irq = -1;
+               max7315->irq_base = -1;
+       }
+       return max7315;
+}
+
+static const struct devs_id max7315_dev_id __initconst = {
+       .name = "i2c_max7315",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+       .get_platform_data = &max7315_platform_data,
+};
+
+static const struct devs_id max7315_2_dev_id __initconst = {
+       .name = "i2c_max7315_2",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+       .get_platform_data = &max7315_platform_data,
+};
+
+sfi_device(max7315_dev_id);
+sfi_device(max7315_2_dev_id);
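
The nr++ branch above is easy to misread, so a worked example of the pin names it generates: for the first expander nr is 0, the post-increment leaves the condition false, and the plain names are used; for the second, nr is already 1, and it becomes 2 inside the branch, yielding the _2 names that pair with the i2c_max7315_2 SFI entry. In sketch form:

    #include <stdio.h>

    int main(void)
    {
            char base[32], intr[32];
            int nr = 0, i;

            for (i = 0; i < 2; i++) {
                    if (nr++) {
                            sprintf(base, "max7315_%d_base", nr);
                            sprintf(intr, "max7315_%d_int", nr);
                    } else {
                            sprintf(base, "max7315_base");
                            sprintf(intr, "max7315_int");
                    }
                    printf("%s / %s\n", base, intr);
            }
            /*
             * prints: max7315_base / max7315_int
             *         max7315_2_base / max7315_2_int
             */
            return 0;
    }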
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
new file mode 100644 (file)
index 0000000..dd28d63
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * platform_mpu3050.c: mpu3050 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <asm/intel-mid.h>
+
+static void *mpu3050_platform_data(void *info)
+{
+       struct i2c_board_info *i2c_info = info;
+       int intr = get_gpio_by_name("mpu3050_int");
+
+       if (intr == -1)
+               return NULL;
+
+       i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+       return NULL;
+}
+
+static const struct devs_id mpu3050_dev_id __initconst = {
+       .name = "mpu3050",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+       .get_platform_data = &mpu3050_platform_data,
+};
+
+sfi_device(mpu3050_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
new file mode 100644 (file)
index 0000000..9f4a775
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * platform_msic.c: MSIC platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/intel-mid.h>
+#include "platform_msic.h"
+
+struct intel_msic_platform_data msic_pdata;
+
+static struct resource msic_resources[] = {
+       {
+               .start  = INTEL_MSIC_IRQ_PHYS_BASE,
+               .end    = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1,
+               .flags  = IORESOURCE_MEM,
+       },
+};
+
+static struct platform_device msic_device = {
+       .name           = "intel_msic",
+       .id             = -1,
+       .dev            = {
+               .platform_data  = &msic_pdata,
+       },
+       .num_resources  = ARRAY_SIZE(msic_resources),
+       .resource       = msic_resources,
+};
+
+static int msic_scu_status_change(struct notifier_block *nb,
+                                 unsigned long code, void *data)
+{
+       if (code == SCU_DOWN) {
+               platform_device_unregister(&msic_device);
+               return 0;
+       }
+
+       return platform_device_register(&msic_device);
+}
+
+static int __init msic_init(void)
+{
+       static struct notifier_block msic_scu_notifier = {
+               .notifier_call  = msic_scu_status_change,
+       };
+
+       /*
+        * We need to be sure that the SCU IPC is ready before the MSIC
+        * device can be registered.
+        */
+       if (intel_mid_has_msic())
+               intel_scu_notifier_add(&msic_scu_notifier);
+
+       return 0;
+}
+arch_initcall(msic_init);
+
+/**
+ * msic_generic_platform_data - sets generic platform data for the block
+ * @info: pointer to the SFI device table entry for this block
+ * @block: MSIC block
+ *
+ * Sets the IRQ number from the SFI table entry for the given device in
+ * the MSIC platform data.
+ */
+void *msic_generic_platform_data(void *info, enum intel_msic_block block)
+{
+       struct sfi_device_table_entry *entry = info;
+
+       BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST);
+       msic_pdata.irq[block] = entry->irq;
+
+       return NULL;
+}
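
Two details worth calling out in platform_msic.c: msic_generic_platform_data() only records the SFI-provided IRQ for the given MSIC block (its NULL return means the child device carries no private platform data), and the SCU notifier keeps registration of the intel_msic device in step with SCU availability, unregistering on SCU_DOWN and registering again otherwise. A stripped-down sketch of that register/unregister shape:

    #include <stdio.h>

    enum { SCU_DOWN, SCU_AVAILABLE };

    static int registered;

    /* Mirrors msic_scu_status_change(): device presence tracks SCU state. */
    static int scu_event(unsigned long code)
    {
            if (code == SCU_DOWN) {
                    registered = 0;         /* platform_device_unregister() */
                    return 0;
            }
            registered = 1;                 /* platform_device_register() */
            return 0;
    }

    int main(void)
    {
            scu_event(SCU_AVAILABLE);
            printf("registered = %d\n", registered);        /* 1 */
            scu_event(SCU_DOWN);
            printf("registered = %d\n", registered);        /* 0 */
            return 0;
    }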
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
new file mode 100644 (file)
index 0000000..917eb56
--- /dev/null
@@ -0,0 +1,19 @@
+/*
+ * platform_msic.h: MSIC platform data header file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_MSIC_H_
+#define _PLATFORM_MSIC_H_
+
+extern struct intel_msic_platform_data msic_pdata;
+
+extern void *msic_generic_platform_data(void *info,
+                       enum intel_msic_block block) __attribute__((weak));
+#endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
new file mode 100644 (file)
index 0000000..2962939
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * platform_msic_audio.c: MSIC audio platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void *msic_audio_platform_data(void *info)
+{
+       struct platform_device *pdev;
+
+       pdev = platform_device_register_simple("sst-platform", -1, NULL, 0);
+
+       if (IS_ERR(pdev)) {
+               pr_err("failed to create audio platform device\n");
+               return NULL;
+       }
+
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO);
+}
+
+static const struct devs_id msic_audio_dev_id __initconst = {
+       .name = "msic_audio",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_audio_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_audio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
new file mode 100644 (file)
index 0000000..f446c33
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * platform_msic_battery.c: MSIC battery platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void __init *msic_battery_platform_data(void *info)
+{
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY);
+}
+
+static const struct devs_id msic_battery_dev_id __initconst = {
+       .name = "msic_battery",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_battery_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_battery_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
new file mode 100644 (file)
index 0000000..2a4f7b1
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * platform_msic_gpio.c: MSIC GPIO platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void __init *msic_gpio_platform_data(void *info)
+{
+       static struct intel_msic_gpio_pdata msic_gpio_pdata;
+
+       int gpio = get_gpio_by_name("msic_gpio_base");
+
+       if (gpio < 0)
+               return NULL;
+
+       msic_gpio_pdata.gpio_base = gpio;
+       msic_pdata.gpio = &msic_gpio_pdata;
+
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO);
+}
+
+static const struct devs_id msic_gpio_dev_id __initconst = {
+       .name = "msic_gpio",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_gpio_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_gpio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
new file mode 100644 (file)
index 0000000..6497111
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * platform_msic_ocd.c: MSIC OCD platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void __init *msic_ocd_platform_data(void *info)
+{
+       static struct intel_msic_ocd_pdata msic_ocd_pdata;
+       int gpio;
+
+       gpio = get_gpio_by_name("ocd_gpio");
+
+       if (gpio < 0)
+               return NULL;
+
+       msic_ocd_pdata.gpio = gpio;
+       msic_pdata.ocd = &msic_ocd_pdata;
+
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
+}
+
+static const struct devs_id msic_ocd_dev_id __initconst = {
+       .name = "msic_ocd",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_ocd_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_ocd_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
new file mode 100644 (file)
index 0000000..83a3459
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * platform_msic_power_btn.c: MSIC power btn platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void __init *msic_power_btn_platform_data(void *info)
+{
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN);
+}
+
+static const struct devs_id msic_power_btn_dev_id __initconst = {
+       .name = "msic_power_btn",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_power_btn_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_power_btn_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
new file mode 100644 (file)
index 0000000..a351878
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * platform_msic_thermal.c: msic_thermal platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/input.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+#include "platform_ipc.h"
+
+static void __init *msic_thermal_platform_data(void *info)
+{
+       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
+}
+
+static const struct devs_id msic_thermal_dev_id __initconst = {
+       .name = "msic_thermal",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &msic_thermal_platform_data,
+       .device_handler = &ipc_device_handler,
+};
+
+sfi_device(msic_thermal_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
new file mode 100644 (file)
index 0000000..d87182a
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * platform_pmic_gpio.c: PMIC GPIO platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/intel_pmic_gpio.h>
+#include <asm/intel-mid.h>
+
+#include "platform_ipc.h"
+
+static void __init *pmic_gpio_platform_data(void *info)
+{
+       static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
+       int gpio_base = get_gpio_by_name("pmic_gpio_base");
+
+       if (gpio_base == -1)
+               gpio_base = 64;
+       pmic_gpio_pdata.gpio_base = gpio_base;
+       pmic_gpio_pdata.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+       pmic_gpio_pdata.gpiointr = 0xffffeff8;
+
+       return &pmic_gpio_pdata;
+}
+
+static const struct devs_id pmic_gpio_spi_dev_id __initconst = {
+       .name = "pmic_gpio",
+       .type = SFI_DEV_TYPE_SPI,
+       .delay = 1,
+       .get_platform_data = &pmic_gpio_platform_data,
+};
+
+static const struct devs_id pmic_gpio_ipc_dev_id __initconst = {
+       .name = "pmic_gpio",
+       .type = SFI_DEV_TYPE_IPC,
+       .delay = 1,
+       .get_platform_data = &pmic_gpio_platform_data,
+       .device_handler = &ipc_device_handler
+};
+
+sfi_device(pmic_gpio_spi_dev_id);
+sfi_device(pmic_gpio_ipc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
new file mode 100644 (file)
index 0000000..740fc75
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * platform_tc35876x.c: tc35876x platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/i2c/tc35876x.h>
+#include <asm/intel-mid.h>
+
+/* tc35876x DSI_LVDS bridge chip and panel platform data */
+static void *tc35876x_platform_data(void *data)
+{
+       static struct tc35876x_platform_data pdata;
+
+       /* gpio pins set to -1 will not be used by the driver */
+       pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
+       pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
+       pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
+
+       return &pdata;
+}
+
+static const struct devs_id tc35876x_dev_id __initconst = {
+       .name = "i2c_disp_brig",
+       .type = SFI_DEV_TYPE_I2C,
+       .get_platform_data = &tc35876x_platform_data,
+};
+
+sfi_device(tc35876x_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
new file mode 100644 (file)
index 0000000..22881c9
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * platform_tca6416.c: tca6416 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/platform_data/pca953x.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <asm/intel-mid.h>
+
+#define TCA6416_NAME   "tca6416"
+#define TCA6416_BASE   "tca6416_base"
+#define TCA6416_INTR   "tca6416_int"
+
+static void *tca6416_platform_data(void *info)
+{
+       static struct pca953x_platform_data tca6416;
+       struct i2c_board_info *i2c_info = info;
+       int gpio_base, intr;
+       char base_pin_name[SFI_NAME_LEN + 1];
+       char intr_pin_name[SFI_NAME_LEN + 1];
+
+       strcpy(i2c_info->type, TCA6416_NAME);
+       strcpy(base_pin_name, TCA6416_BASE);
+       strcpy(intr_pin_name, TCA6416_INTR);
+
+       gpio_base = get_gpio_by_name(base_pin_name);
+       intr = get_gpio_by_name(intr_pin_name);
+
+       if (gpio_base == -1)
+               return NULL;
+       tca6416.gpio_base = gpio_base;
+       if (intr != -1) {
+               i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+               tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+       } else {
+               i2c_info->irq = -1;
+               tca6416.irq_base = -1;
+       }
+       return &tca6416;
+}
+
+static const struct devs_id tca6416_dev_id __initconst = {
+       .name = "tca6416",
+       .type = SFI_DEV_TYPE_I2C,
+       .delay = 1,
+       .get_platform_data = &tca6416_platform_data,
+};
+
+sfi_device(tca6416_dev_id);
similarity index 97%
rename from arch/x86/platform/mrst/early_printk_mrst.c
rename to arch/x86/platform/intel-mid/early_printk_intel_mid.c
index 028454f0c3a51ac86a7105d24d433e197621a95d..4f702f554f6e011cfcb248af5d1088526a714e6a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * early_printk_mrst.c - early consoles for Intel MID platforms
+ * early_printk_intel_mid.c - early consoles for Intel MID platforms
  *
  * Copyright (c) 2008-2010, Intel Corporation
  *
@@ -27,7 +27,7 @@
 
 #include <asm/fixmap.h>
 #include <asm/pgtable.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 
 #define MRST_SPI_TIMEOUT               0x200000
 #define MRST_REGBASE_SPI0              0xff128000
@@ -152,7 +152,7 @@ void mrst_early_console_init(void)
        spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
        freq = 100000000 / (spi0_cdiv + 1);
 
-       if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+       if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL)
                mrst_spi_paddr = MRST_REGBASE_SPI1;
 
        pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
@@ -213,13 +213,14 @@ static void early_mrst_spi_putc(char c)
        }
 
        if (!timeout)
-               pr_warning("MRST earlycon: timed out\n");
+               pr_warn("MRST earlycon: timed out\n");
        else
                max3110_write_data(c);
 }
 
 /* Early SPI only uses polling mode */
-static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+static void early_mrst_spi_write(struct console *con, const char *str,
+                                       unsigned n)
 {
        int i;
 
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
new file mode 100644 (file)
index 0000000..523a1c8
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * intel-mid.c: Intel MID platform setup code
+ *
+ * (C) Copyright 2008, 2012 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#define pr_fmt(fmt) "intel_mid: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+
+#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/apb_timer.h>
+#include <asm/reboot.h>
+
+/*
+ * The clockevent devices on Moorestown/Medfield can be APBT or LAPIC clocks.
+ * The cmdline option x86_intel_mid_timer can be used to override the
+ * configuration and prefer one over the other.
+ * At runtime, there are basically three timer configurations:
+ * 1. per-cpu APBT clock only
+ * 2. per-cpu always-on LAPIC clocks only (Penwell/Medfield only)
+ * 3. per-cpu LAPIC clock (C3STOP) and one APBT clock, with broadcast.
+ *
+ * By default (without the cmdline option), platform code first detects the
+ * cpu type to see whether we are on Lincroft or Penwell, then sets up the
+ * LAPIC or APBT clocks accordingly; i.e. by default Medfield uses
+ * configuration #2 and Moorestown uses #1. Configuration #3 is supported
+ * but not recommended on Medfield.
+ *
+ * Rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on, ARAT) ----- 150
+ */
+
+enum intel_mid_timer_options intel_mid_timer_options;
+
+enum intel_mid_cpu_type __intel_mid_cpu_chip;
+EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
+
+static void intel_mid_power_off(void)
+{
+}
+
+static void intel_mid_reboot(void)
+{
+       intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
+}
+
+static unsigned long __init intel_mid_calibrate_tsc(void)
+{
+       unsigned long fast_calibrate;
+       u32 lo, hi, ratio, fsb;
+
+       rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+       pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
+       ratio = (hi >> 8) & 0x1f;
+       pr_debug("ratio is %d\n", ratio);
+       if (!ratio) {
+               pr_err("read a zero ratio, should be incorrect!\n");
+               pr_err("force tsc ratio to 16 ...\n");
+               ratio = 16;
+       }
+       rdmsr(MSR_FSB_FREQ, lo, hi);
+       if ((lo & 0x7) == 0x7)
+               fsb = PENWELL_FSB_FREQ_83SKU;
+       else
+               fsb = PENWELL_FSB_FREQ_100SKU;
+       fast_calibrate = ratio * fsb;
+       pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
+       lapic_timer_frequency = fsb * 1000 / HZ;
+       /* mark tsc clocksource as reliable */
+       set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+       return fast_calibrate;
+}
+
+static void __init intel_mid_time_init(void)
+{
+       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+       switch (intel_mid_timer_options) {
+       case INTEL_MID_TIMER_APBT_ONLY:
+               break;
+       case INTEL_MID_TIMER_LAPIC_APBT:
+               x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+               x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+               break;
+       default:
+               if (!boot_cpu_has(X86_FEATURE_ARAT))
+                       break;
+               x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+               x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+               return;
+       }
+       /* we need at least one APB timer */
+       pre_init_apic_IRQ0();
+       apbt_time_init();
+}
+
+static void __cpuinit intel_mid_arch_setup(void)
+{
+       if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
+               __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+       else {
+               pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
+                       boot_cpu_data.x86, boot_cpu_data.x86_model);
+               __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+       }
+}
+
+/* MID systems don't have an i8042 controller */
+static int intel_mid_i8042_detect(void)
+{
+       return 0;
+}
+
+/*
+ * Moorestown has neither an external NMI source nor port 0x61 to report
+ * NMI status. The only possible NMI sources are the PMU, as a result of
+ * the NMI watchdog or lock debugging. Reading I/O port 0x61 returns 0xff,
+ * which would mislead the NMI handler.
+ */
+static unsigned char intel_mid_get_nmi_reason(void)
+{
+       return 0;
+}
+
+/*
+ * Moorestown specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_intel_mid_early_setup(void)
+{
+       x86_init.resources.probe_roms = x86_init_noop;
+       x86_init.resources.reserve_resources = x86_init_noop;
+
+       x86_init.timers.timer_init = intel_mid_time_init;
+       x86_init.timers.setup_percpu_clockev = x86_init_noop;
+
+       x86_init.irqs.pre_vector_init = x86_init_noop;
+
+       x86_init.oem.arch_setup = intel_mid_arch_setup;
+
+       x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
+
+       x86_platform.calibrate_tsc = intel_mid_calibrate_tsc;
+       x86_platform.i8042_detect = intel_mid_i8042_detect;
+       x86_init.timers.wallclock_init = intel_mid_rtc_init;
+       x86_platform.get_nmi_reason = intel_mid_get_nmi_reason;
+
+       x86_init.pci.init = intel_mid_pci_init;
+       x86_init.pci.fixup_irqs = x86_init_noop;
+
+       legacy_pic = &null_legacy_pic;
+
+       pm_power_off = intel_mid_power_off;
+       machine_ops.emergency_restart  = intel_mid_reboot;
+
+       /* Avoid searching for BIOS MP tables */
+       x86_init.mpparse.find_smp_config = x86_init_noop;
+       x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       set_bit(MP_BUS_ISA, mp_bus_not_pci);
+}
+
+/*
+ * If the user does not want to use the per-CPU APB timer, just give it a
+ * lower rating than the local APIC timer and skip the late per-CPU timer
+ * init.
+ */
+static inline int __init setup_x86_intel_mid_timer(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       if (strcmp("apbt_only", arg) == 0)
+               intel_mid_timer_options = INTEL_MID_TIMER_APBT_ONLY;
+       else if (strcmp("lapic_and_apbt", arg) == 0)
+               intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT;
+       else {
+               pr_warn("X86 INTEL_MID timer option %s not recognised"
+                          " use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
+                          arg);
+               return -EINVAL;
+       }
+       return 0;
+}
+__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer);
+
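
For reference, the timer override described in the comment block at the top of this file is selected from the kernel command line; for example (illustrative bootloader arguments), either of:

    x86_intel_mid_timer=apbt_only
    x86_intel_mid_timer=lapic_and_apbt

Anything else passed to the option is rejected with -EINVAL after the warning above, and the default CPU-based detection applies.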
similarity index 90%
rename from arch/x86/platform/mrst/vrtc.c
rename to arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 5e355b134ba49043b63d9acda6e549fa3d6420e1..4762cff7facd0293c63317144118195b453de2f9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * vrtc.c: Driver for virtual RTC device on Intel MID platform
+ * intel_mid_vrtc.c: Driver for virtual RTC device on Intel MID platform
  *
  * (C) Copyright 2009 Intel Corporation
  *
@@ -23,8 +23,8 @@
 #include <linux/sfi.h>
 #include <linux/platform_device.h>
 
-#include <asm/mrst.h>
-#include <asm/mrst-vrtc.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
 #include <asm/time.h>
 #include <asm/fixmap.h>
 
@@ -79,7 +79,7 @@ void vrtc_get_time(struct timespec *now)
        /* vRTC YEAR reg contains the offset to 1972 */
        year += 1972;
 
-       printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
+       pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
                "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
 
        now->tv_sec = mktime(year, mon, mday, hour, min, sec);
@@ -109,15 +109,14 @@ int vrtc_set_mmss(const struct timespec *now)
                vrtc_cmos_write(tm.tm_sec, RTC_SECONDS);
                spin_unlock_irqrestore(&rtc_lock, flags);
        } else {
-               printk(KERN_ERR
-                      "%s: Invalid vRTC value: write of %lx to vRTC failed\n",
+               pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
                        __FUNCTION__, now->tv_sec);
                retval = -EINVAL;
        }
        return retval;
 }
 
-void __init mrst_rtc_init(void)
+void __init intel_mid_rtc_init(void)
 {
        unsigned long vrtc_paddr;
 
@@ -155,10 +154,10 @@ static struct platform_device vrtc_device = {
 };
 
 /* Register the RTC device if appropriate */
-static int __init mrst_device_create(void)
+static int __init intel_mid_device_create(void)
 {
        /* No Moorestown, no device */
-       if (!mrst_identify_cpu())
+       if (!intel_mid_identify_cpu())
                return -ENODEV;
        /* No timer, no device */
        if (!sfi_mrtc_num)
@@ -175,4 +174,4 @@ static int __init mrst_device_create(void)
        return platform_device_register(&vrtc_device);
 }
 
-module_init(mrst_device_create);
+module_init(intel_mid_device_create);
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
new file mode 100644 (file)
index 0000000..c84c1ca
--- /dev/null
@@ -0,0 +1,488 @@
+/*
+ * sfi.c: Intel MID SFI initialization code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/intel_pmic_gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/i2c.h>
+#include <linux/skbuff.h>
+#include <linux/gpio.h>
+#include <linux/gpio_keys.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/mmc/core.h>
+#include <linux/mmc/card.h>
+#include <linux/blkdev.h>
+
+#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/apb_timer.h>
+#include <asm/reboot.h>
+
+#define        SFI_SIG_OEM0    "OEM0"
+#define MAX_IPCDEVS    24
+#define MAX_SCU_SPI    24
+#define MAX_SCU_I2C    24
+
+static struct platform_device *ipc_devs[MAX_IPCDEVS];
+static struct spi_board_info *spi_devs[MAX_SCU_SPI];
+static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
+static struct sfi_gpio_table_entry *gpio_table;
+static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+static int ipc_next_dev;
+static int spi_next_dev;
+static int i2c_next_dev;
+static int i2c_bus[MAX_SCU_I2C];
+static int gpio_num_entry;
+static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
+int sfi_mrtc_num;
+int sfi_mtimer_num;
+
+struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
+EXPORT_SYMBOL_GPL(sfi_mrtc_array);
+
+struct blocking_notifier_head intel_scu_notifier =
+                       BLOCKING_NOTIFIER_INIT(intel_scu_notifier);
+EXPORT_SYMBOL_GPL(intel_scu_notifier);
+
+#define intel_mid_sfi_get_pdata(dev, priv)     \
+       ((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL)
+
+/* parse all the mtimer info to a static mtimer array */
+int __init sfi_parse_mtmr(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_timer_table_entry *pentry;
+       struct mpc_intsrc mp_irq;
+       int totallen;
+
+       sb = (struct sfi_table_simple *)table;
+       if (!sfi_mtimer_num) {
+               sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
+                                       struct sfi_timer_table_entry);
+               pentry = (struct sfi_timer_table_entry *) sb->pentry;
+               totallen = sfi_mtimer_num * sizeof(*pentry);
+               memcpy(sfi_mtimer_array, pentry, totallen);
+       }
+
+       pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
+       pentry = sfi_mtimer_array;
+       for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
+               pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
+                       totallen, (u32)pentry->phys_addr,
+                       pentry->freq_hz, pentry->irq);
+               if (!pentry->irq)
+                       continue;
+               mp_irq.type = MP_INTSRC;
+               mp_irq.irqtype = mp_INT;
+               /* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+               mp_irq.irqflag = 5;
+               mp_irq.srcbus = MP_BUS_ISA;
+               mp_irq.srcbusirq = pentry->irq; /* IRQ */
+               mp_irq.dstapic = MP_APIC_ALL;
+               mp_irq.dstirq = pentry->irq;
+               mp_save_irq(&mp_irq);
+       }
+
+       return 0;
+}
+
+struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
+{
+       int i;
+       if (hint < sfi_mtimer_num) {
+               if (!sfi_mtimer_usage[hint]) {
+                       pr_debug("hint taken for timer %d irq %d\n",
+                               hint, sfi_mtimer_array[hint].irq);
+                       sfi_mtimer_usage[hint] = 1;
+                       return &sfi_mtimer_array[hint];
+               }
+       }
+       /* take the first timer available */
+       for (i = 0; i < sfi_mtimer_num; i++) {
+               if (!sfi_mtimer_usage[i]) {
+                       sfi_mtimer_usage[i] = 1;
+                       return &sfi_mtimer_array[i];
+               }
+       }
+       return NULL;
+}
+
+void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
+{
+       int i;
+       for (i = 0; i < sfi_mtimer_num; i++) {
+               if (mtmr->irq == sfi_mtimer_array[i].irq) {
+                       sfi_mtimer_usage[i] = 0;
+                       return;
+               }
+       }
+}
+
+/* parse all the mrtc info to a global mrtc array */
+int __init sfi_parse_mrtc(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_rtc_table_entry *pentry;
+       struct mpc_intsrc mp_irq;
+
+       int totallen;
+
+       sb = (struct sfi_table_simple *)table;
+       if (!sfi_mrtc_num) {
+               sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
+                                               struct sfi_rtc_table_entry);
+               pentry = (struct sfi_rtc_table_entry *)sb->pentry;
+               totallen = sfi_mrtc_num * sizeof(*pentry);
+               memcpy(sfi_mrtc_array, pentry, totallen);
+       }
+
+       pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
+       pentry = sfi_mrtc_array;
+       for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
+               pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
+                       totallen, (u32)pentry->phys_addr, pentry->irq);
+               mp_irq.type = MP_INTSRC;
+               mp_irq.irqtype = mp_INT;
+               mp_irq.irqflag = 0xf;   /* level trigger and active low */
+               mp_irq.srcbus = MP_BUS_ISA;
+               mp_irq.srcbusirq = pentry->irq; /* IRQ */
+               mp_irq.dstapic = MP_APIC_ALL;
+               mp_irq.dstirq = pentry->irq;
+               mp_save_irq(&mp_irq);
+       }
+       return 0;
+}
+
+
+/*
+ * Parse the GPIO table first, since the DEVS table will need it to map
+ * pin names to actual pin numbers.
+ */
+static int __init sfi_parse_gpio(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_gpio_table_entry *pentry;
+       int num, i;
+
+       if (gpio_table)
+               return 0;
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
+       pentry = (struct sfi_gpio_table_entry *)sb->pentry;
+
+       gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL);
+       if (!gpio_table)
+               return -ENOMEM;
+       memcpy(gpio_table, pentry, num * sizeof(*pentry));
+       gpio_num_entry = num;
+
+       pr_debug("GPIO pin info:\n");
+       for (i = 0; i < num; i++, pentry++)
+               pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
+               " pin = %d\n", i,
+                       pentry->controller_name,
+                       pentry->pin_name,
+                       pentry->pin_no);
+       return 0;
+}
+
+int get_gpio_by_name(const char *name)
+{
+       struct sfi_gpio_table_entry *pentry = gpio_table;
+       int i;
+
+       if (!pentry)
+               return -1;
+       for (i = 0; i < gpio_num_entry; i++, pentry++) {
+               if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
+                       return pentry->pin_no;
+       }
+       return -1;
+}
+
+void __init intel_scu_device_register(struct platform_device *pdev)
+{
+       if (ipc_next_dev == MAX_IPCDEVS)
+               pr_err("too many SCU IPC devices");
+       else
+               ipc_devs[ipc_next_dev++] = pdev;
+}
+
+static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
+{
+       struct spi_board_info *new_dev;
+
+       if (spi_next_dev == MAX_SCU_SPI) {
+               pr_err("too many SCU SPI devices");
+               return;
+       }
+
+       new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed spi dev %s\n",
+                       sdev->modalias);
+               return;
+       }
+       memcpy(new_dev, sdev, sizeof(*sdev));
+
+       spi_devs[spi_next_dev++] = new_dev;
+}
+
+static void __init intel_scu_i2c_device_register(int bus,
+                                               struct i2c_board_info *idev)
+{
+       struct i2c_board_info *new_dev;
+
+       if (i2c_next_dev == MAX_SCU_I2C) {
+               pr_err("too many SCU I2C devices");
+               return;
+       }
+
+       new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed i2c dev %s\n",
+                       idev->type);
+               return;
+       }
+       memcpy(new_dev, idev, sizeof(*idev));
+
+       i2c_bus[i2c_next_dev] = bus;
+       i2c_devs[i2c_next_dev++] = new_dev;
+}
+
+/* Called by IPC driver */
+void intel_scu_devices_create(void)
+{
+       int i;
+
+       for (i = 0; i < ipc_next_dev; i++)
+               platform_device_add(ipc_devs[i]);
+
+       for (i = 0; i < spi_next_dev; i++)
+               spi_register_board_info(spi_devs[i], 1);
+
+       for (i = 0; i < i2c_next_dev; i++) {
+               struct i2c_adapter *adapter;
+               struct i2c_client *client;
+
+               adapter = i2c_get_adapter(i2c_bus[i]);
+               if (adapter) {
+                       client = i2c_new_device(adapter, i2c_devs[i]);
+                       if (!client)
+                               pr_err("can't create i2c device %s\n",
+                                       i2c_devs[i]->type);
+               } else {
+                       i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
+               }
+       }
+       intel_scu_notifier_post(SCU_AVAILABLE, NULL);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_create);
+
+/* Called by IPC driver */
+void intel_scu_devices_destroy(void)
+{
+       int i;
+
+       intel_scu_notifier_post(SCU_DOWN, NULL);
+
+       for (i = 0; i < ipc_next_dev; i++)
+               platform_device_del(ipc_devs[i]);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
+
+static void __init install_irq_resource(struct platform_device *pdev, int irq)
+{
+       /* Single threaded */
+       static struct resource res __initdata = {
+               .name = "IRQ",
+               .flags = IORESOURCE_IRQ,
+       };
+       res.start = irq;
+       platform_device_add_resources(pdev, &res, 1);
+}
+
+static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
+                                       struct devs_id *dev)
+{
+       struct platform_device *pdev;
+       void *pdata = NULL;
+
+       pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
+               pentry->name, pentry->irq);
+       pdata = intel_mid_sfi_get_pdata(dev, pentry);
+
+       pdev = platform_device_alloc(pentry->name, 0);
+       if (pdev == NULL) {
+               pr_err("out of memory for SFI platform device '%s'.\n",
+                       pentry->name);
+               return;
+       }
+       install_irq_resource(pdev, pentry->irq);
+
+       pdev->dev.platform_data = pdata;
+       platform_device_add(pdev);
+}
+
+static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
+                                       struct devs_id *dev)
+{
+       struct spi_board_info spi_info;
+       void *pdata = NULL;
+
+       memset(&spi_info, 0, sizeof(spi_info));
+       strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
+       spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
+       spi_info.bus_num = pentry->host_num;
+       spi_info.chip_select = pentry->addr;
+       spi_info.max_speed_hz = pentry->max_freq;
+       pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n",
+               spi_info.bus_num,
+               spi_info.modalias,
+               spi_info.irq,
+               spi_info.max_speed_hz,
+               spi_info.chip_select);
+
+       pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
+
+       spi_info.platform_data = pdata;
+       if (dev->delay)
+               intel_scu_spi_device_register(&spi_info);
+       else
+               spi_register_board_info(&spi_info, 1);
+}
+
+static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
+                                       struct devs_id *dev)
+{
+       struct i2c_board_info i2c_info;
+       void *pdata = NULL;
+
+       memset(&i2c_info, 0, sizeof(i2c_info));
+       strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
+       i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
+       i2c_info.addr = pentry->addr;
+       pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n",
+               pentry->host_num,
+               i2c_info.type,
+               i2c_info.irq,
+               i2c_info.addr);
+       pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
+       i2c_info.platform_data = pdata;
+
+       if (dev->delay)
+               intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
+       else
+               i2c_register_board_info(pentry->host_num, &i2c_info, 1);
+}
+
+extern struct devs_id *const __x86_intel_mid_dev_start[],
+                     *const __x86_intel_mid_dev_end[];
+
+static struct devs_id __init *get_device_id(u8 type, char *name)
+{
+       struct devs_id *const *dev_table;
+
+       for (dev_table = __x86_intel_mid_dev_start;
+                       dev_table < __x86_intel_mid_dev_end; dev_table++) {
+               struct devs_id *dev = *dev_table;
+               if (dev->type == type &&
+                       !strncmp(dev->name, name, SFI_NAME_LEN)) {
+                       return dev;
+               }
+       }
+
+       return NULL;
+}
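
get_device_id() walks a linker-built array bounded by
__x86_intel_mid_dev_start/end, so board files register entries by dropping
pointers into a dedicated section instead of editing a central table
(contrast the fixed device_ids[] array in the deleted mrst.c further
below). A sketch of what such a registration could look like; the macro
name and section are assumptions here, not quoted from this diff:

	/* Assumed registration macro; the real name/section may differ. */
	#define sfi_device(i)						\
		static const struct devs_id *const __intel_mid_device_##i \
		__used __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i

	static struct devs_id example_dev_id = {
		.name = "example_dev",	/* matched against the SFI DEVS entry */
		.type = SFI_DEV_TYPE_I2C,
		.delay = 1,		/* defer until the SCU is available */
	};
	sfi_device(example_dev_id);
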
+
+static int __init sfi_parse_devs(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_device_table_entry *pentry;
+       struct devs_id *dev = NULL;
+       int num, i;
+       int ioapic;
+       struct io_apic_irq_attr irq_attr;
+
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
+       pentry = (struct sfi_device_table_entry *)sb->pentry;
+
+       for (i = 0; i < num; i++, pentry++) {
+               int irq = pentry->irq;
+
+               if (irq != (u8)0xff) { /* native RTE case */
+                       /*
+                        * These SPI2 devices are not exposed to the system as
+                        * PCI devices, but they have a separate RTE entry in
+                        * the IOAPIC, so we have to enable them one by one
+                        * here.
+                        */
+                       ioapic = mp_find_ioapic(irq);
+                       irq_attr.ioapic = ioapic;
+                       irq_attr.ioapic_pin = irq;
+                       irq_attr.trigger = 1;
+                       irq_attr.polarity = 1;
+                       io_apic_set_pci_routing(NULL, irq, &irq_attr);
+               } else
+                       irq = 0; /* No irq */
+
+               dev = get_device_id(pentry->type, pentry->name);
+
+               if (!dev)
+                       continue;
+
+               if (dev->device_handler) {
+                       dev->device_handler(pentry, dev);
+               } else {
+                       switch (pentry->type) {
+                       case SFI_DEV_TYPE_IPC:
+                               sfi_handle_ipc_dev(pentry, dev);
+                               break;
+                       case SFI_DEV_TYPE_SPI:
+                               sfi_handle_spi_dev(pentry, dev);
+                               break;
+                       case SFI_DEV_TYPE_I2C:
+                               sfi_handle_i2c_dev(pentry, dev);
+                               break;
+                       case SFI_DEV_TYPE_UART:
+                       case SFI_DEV_TYPE_HSI:
+                       default:
+                               break;
+                       }
+               }
+       }
+       return 0;
+}
+
+static int __init intel_mid_platform_init(void)
+{
+       sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
+       sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
+       return 0;
+}
+arch_initcall(intel_mid_platform_init);
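
arch_initcall() places intel_mid_platform_init() at initcall level 3,
ahead of subsys (4) and device (6) initcalls, so both SFI tables are
parsed before ordinary drivers probe; GPIO is parsed first because
sfi_parse_devs() needs get_gpio_by_name() when building platform data.
For reference, the level mapping (paraphrased from include/linux/init.h;
check your tree) is:

	#define core_initcall(fn)	__define_initcall(fn, 1)
	#define arch_initcall(fn)	__define_initcall(fn, 3)
	#define subsys_initcall(fn)	__define_initcall(fn, 4)
	#define device_initcall(fn)	__define_initcall(fn, 6)
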
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
deleted file mode 100644 (file)
index af1da7e..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_X86_INTEL_MID)    += mrst.o
-obj-$(CONFIG_X86_INTEL_MID)    += vrtc.o
-obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)   += early_printk_mrst.o
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
deleted file mode 100644 (file)
index 3ca5957..0000000
+++ /dev/null
@@ -1,1052 +0,0 @@
-/*
- * mrst.c: Intel Moorestown platform specific setup code
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Jacob Pan (jacob.jun.pan@intel.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#define pr_fmt(fmt) "mrst: " fmt
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/intel_pmic_gpio.h>
-#include <linux/spi/spi.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <linux/mfd/intel_msic.h>
-#include <linux/gpio.h>
-#include <linux/i2c/tc35876x.h>
-
-#include <asm/setup.h>
-#include <asm/mpspec_def.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-#include <asm/mrst.h>
-#include <asm/mrst-vrtc.h>
-#include <asm/io.h>
-#include <asm/i8259.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/apb_timer.h>
-#include <asm/reboot.h>
-
-/*
- * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
- * cmdline option x86_mrst_timer can be used to override the configuration
- * to prefer one or the other.
- * at runtime, there are basically three timer configurations:
- * 1. per cpu apbt clock only
- * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
- * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
- *
- * by default (without cmdline option), platform code first detects cpu type
- * to see if we are on lincroft or penwell, then set up both lapic or apbt
- * clocks accordingly.
- * i.e. by default, medfield uses configuration #2, moorestown uses #1.
- * config #3 is supported but not recommended on medfield.
- *
- * rating and feature summary:
- * lapic (with C3STOP) --------- 100
- * apbt (always-on) ------------ 110
- * lapic (always-on,ARAT) ------ 150
- */
-
-enum mrst_timer_options mrst_timer_options;
-
-static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
-static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
-enum mrst_cpu_type __mrst_cpu_chip;
-EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
-
-int sfi_mtimer_num;
-
-struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
-EXPORT_SYMBOL_GPL(sfi_mrtc_array);
-int sfi_mrtc_num;
-
-static void mrst_power_off(void)
-{
-}
-
-static void mrst_reboot(void)
-{
-       intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
-}
-
-/* parse all the mtimer info to a static mtimer array */
-static int __init sfi_parse_mtmr(struct sfi_table_header *table)
-{
-       struct sfi_table_simple *sb;
-       struct sfi_timer_table_entry *pentry;
-       struct mpc_intsrc mp_irq;
-       int totallen;
-
-       sb = (struct sfi_table_simple *)table;
-       if (!sfi_mtimer_num) {
-               sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
-                                       struct sfi_timer_table_entry);
-               pentry = (struct sfi_timer_table_entry *) sb->pentry;
-               totallen = sfi_mtimer_num * sizeof(*pentry);
-               memcpy(sfi_mtimer_array, pentry, totallen);
-       }
-
-       pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
-       pentry = sfi_mtimer_array;
-       for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
-               pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
-                       " irq = %d\n", totallen, (u32)pentry->phys_addr,
-                       pentry->freq_hz, pentry->irq);
-                       if (!pentry->irq)
-                               continue;
-                       mp_irq.type = MP_INTSRC;
-                       mp_irq.irqtype = mp_INT;
-/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
-                       mp_irq.irqflag = 5;
-                       mp_irq.srcbus = MP_BUS_ISA;
-                       mp_irq.srcbusirq = pentry->irq; /* IRQ */
-                       mp_irq.dstapic = MP_APIC_ALL;
-                       mp_irq.dstirq = pentry->irq;
-                       mp_save_irq(&mp_irq);
-       }
-
-       return 0;
-}
-
-struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
-{
-       int i;
-       if (hint < sfi_mtimer_num) {
-               if (!sfi_mtimer_usage[hint]) {
-                       pr_debug("hint taken for timer %d irq %d\n",\
-                               hint, sfi_mtimer_array[hint].irq);
-                       sfi_mtimer_usage[hint] = 1;
-                       return &sfi_mtimer_array[hint];
-               }
-       }
-       /* take the first timer available */
-       for (i = 0; i < sfi_mtimer_num;) {
-               if (!sfi_mtimer_usage[i]) {
-                       sfi_mtimer_usage[i] = 1;
-                       return &sfi_mtimer_array[i];
-               }
-               i++;
-       }
-       return NULL;
-}
-
-void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
-{
-       int i;
-       for (i = 0; i < sfi_mtimer_num;) {
-               if (mtmr->irq == sfi_mtimer_array[i].irq) {
-                       sfi_mtimer_usage[i] = 0;
-                       return;
-               }
-               i++;
-       }
-}
-
-/* parse all the mrtc info to a global mrtc array */
-int __init sfi_parse_mrtc(struct sfi_table_header *table)
-{
-       struct sfi_table_simple *sb;
-       struct sfi_rtc_table_entry *pentry;
-       struct mpc_intsrc mp_irq;
-
-       int totallen;
-
-       sb = (struct sfi_table_simple *)table;
-       if (!sfi_mrtc_num) {
-               sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
-                                               struct sfi_rtc_table_entry);
-               pentry = (struct sfi_rtc_table_entry *)sb->pentry;
-               totallen = sfi_mrtc_num * sizeof(*pentry);
-               memcpy(sfi_mrtc_array, pentry, totallen);
-       }
-
-       pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
-       pentry = sfi_mrtc_array;
-       for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
-               pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
-                       totallen, (u32)pentry->phys_addr, pentry->irq);
-               mp_irq.type = MP_INTSRC;
-               mp_irq.irqtype = mp_INT;
-               mp_irq.irqflag = 0xf;   /* level trigger and active low */
-               mp_irq.srcbus = MP_BUS_ISA;
-               mp_irq.srcbusirq = pentry->irq; /* IRQ */
-               mp_irq.dstapic = MP_APIC_ALL;
-               mp_irq.dstirq = pentry->irq;
-               mp_save_irq(&mp_irq);
-       }
-       return 0;
-}
-
-static unsigned long __init mrst_calibrate_tsc(void)
-{
-       unsigned long fast_calibrate;
-       u32 lo, hi, ratio, fsb;
-
-       rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-       pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
-       ratio = (hi >> 8) & 0x1f;
-       pr_debug("ratio is %d\n", ratio);
-       if (!ratio) {
-               pr_err("read a zero ratio, should be incorrect!\n");
-               pr_err("force tsc ratio to 16 ...\n");
-               ratio = 16;
-       }
-       rdmsr(MSR_FSB_FREQ, lo, hi);
-       if ((lo & 0x7) == 0x7)
-               fsb = PENWELL_FSB_FREQ_83SKU;
-       else
-               fsb = PENWELL_FSB_FREQ_100SKU;
-       fast_calibrate = ratio * fsb;
-       pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
-       lapic_timer_frequency = fsb * 1000 / HZ;
-       /* mark tsc clocksource as reliable */
-       set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
-       
-       if (fast_calibrate)
-               return fast_calibrate;
-
-       return 0;
-}
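
The calibration above is simply ratio * FSB clock. Assuming the SKU
constants are expressed in kHz (e.g. PENWELL_FSB_FREQ_100SKU ~ 99840 kHz),
a ratio of 16 gives 16 * 99840 kHz = 1597440 kHz, i.e. a TSC of roughly
1.6 GHz, and lapic_timer_frequency = 99840 * 1000 / HZ local-APIC ticks
per jiffy.
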
-
-static void __init mrst_time_init(void)
-{
-       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
-       switch (mrst_timer_options) {
-       case MRST_TIMER_APBT_ONLY:
-               break;
-       case MRST_TIMER_LAPIC_APBT:
-               x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
-               x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
-               break;
-       default:
-               if (!boot_cpu_has(X86_FEATURE_ARAT))
-                       break;
-               x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
-               x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
-               return;
-       }
-       /* we need at least one APB timer */
-       pre_init_apic_IRQ0();
-       apbt_time_init();
-}
-
-static void mrst_arch_setup(void)
-{
-       if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
-               __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
-       else {
-               pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
-                       boot_cpu_data.x86, boot_cpu_data.x86_model);
-               __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
-       }
-}
-
-/* MID systems don't have i8042 controller */
-static int mrst_i8042_detect(void)
-{
-       return 0;
-}
-
-/*
- * Moorestown does not have external NMI source nor port 0x61 to report
- * NMI status. The possible NMI sources are from pmu as a result of NMI
- * watchdog or lock debug. Reading io port 0x61 results in 0xff which
- * misled NMI handler.
- */
-static unsigned char mrst_get_nmi_reason(void)
-{
-       return 0;
-}
-
-/*
- * Moorestown specific x86_init function overrides and early setup
- * calls.
- */
-void __init x86_mrst_early_setup(void)
-{
-       x86_init.resources.probe_roms = x86_init_noop;
-       x86_init.resources.reserve_resources = x86_init_noop;
-
-       x86_init.timers.timer_init = mrst_time_init;
-       x86_init.timers.setup_percpu_clockev = x86_init_noop;
-
-       x86_init.irqs.pre_vector_init = x86_init_noop;
-
-       x86_init.oem.arch_setup = mrst_arch_setup;
-
-       x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
-
-       x86_platform.calibrate_tsc = mrst_calibrate_tsc;
-       x86_platform.i8042_detect = mrst_i8042_detect;
-       x86_init.timers.wallclock_init = mrst_rtc_init;
-       x86_platform.get_nmi_reason = mrst_get_nmi_reason;
-
-       x86_init.pci.init = pci_mrst_init;
-       x86_init.pci.fixup_irqs = x86_init_noop;
-
-       legacy_pic = &null_legacy_pic;
-
-       /* Moorestown specific power_off/restart method */
-       pm_power_off = mrst_power_off;
-       machine_ops.emergency_restart  = mrst_reboot;
-
-       /* Avoid searching for BIOS MP tables */
-       x86_init.mpparse.find_smp_config = x86_init_noop;
-       x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-       set_bit(MP_BUS_ISA, mp_bus_not_pci);
-}
-
-/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (strcmp("apbt_only", arg) == 0)
-               mrst_timer_options = MRST_TIMER_APBT_ONLY;
-       else if (strcmp("lapic_and_apbt", arg) == 0)
-               mrst_timer_options = MRST_TIMER_LAPIC_APBT;
-       else {
-               pr_warning("X86 MRST timer option %s not recognised"
-                          " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-                          arg);
-               return -EINVAL;
-       }
-       return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
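
Since setup_x86_mrst_timer() is wired to the kernel command line via
__setup(), the override described in the comment block at the top of this
file is selected by booting with either:

	x86_mrst_timer=apbt_only
	x86_mrst_timer=lapic_and_apbt

Any other value falls through to the pr_warning() and leaves
mrst_timer_options at its default, auto-detected setting.
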
-
-/*
- * Parsing GPIO table first, since the DEVS table will need this table
- * to map the pin name to the actual pin.
- */
-static struct sfi_gpio_table_entry *gpio_table;
-static int gpio_num_entry;
-
-static int __init sfi_parse_gpio(struct sfi_table_header *table)
-{
-       struct sfi_table_simple *sb;
-       struct sfi_gpio_table_entry *pentry;
-       int num, i;
-
-       if (gpio_table)
-               return 0;
-       sb = (struct sfi_table_simple *)table;
-       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
-       pentry = (struct sfi_gpio_table_entry *)sb->pentry;
-
-       gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL);
-       if (!gpio_table)
-               return -1;
-       memcpy(gpio_table, pentry, num * sizeof(*pentry));
-       gpio_num_entry = num;
-
-       pr_debug("GPIO pin info:\n");
-       for (i = 0; i < num; i++, pentry++)
-               pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
-               " pin = %d\n", i,
-                       pentry->controller_name,
-                       pentry->pin_name,
-                       pentry->pin_no);
-       return 0;
-}
-
-static int get_gpio_by_name(const char *name)
-{
-       struct sfi_gpio_table_entry *pentry = gpio_table;
-       int i;
-
-       if (!pentry)
-               return -1;
-       for (i = 0; i < gpio_num_entry; i++, pentry++) {
-               if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
-                       return pentry->pin_no;
-       }
-       return -1;
-}
-
-/*
- * Here defines the array of devices platform data that IAFW would export
- * through SFI "DEVS" table, we use name and type to match the device and
- * its platform data.
- */
-struct devs_id {
-       char name[SFI_NAME_LEN + 1];
-       u8 type;
-       u8 delay;
-       void *(*get_platform_data)(void *info);
-};
-
-/* the offset for the mapping of global gpio pin to irq */
-#define MRST_IRQ_OFFSET 0x100
-
-static void __init *pmic_gpio_platform_data(void *info)
-{
-       static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
-       int gpio_base = get_gpio_by_name("pmic_gpio_base");
-
-       if (gpio_base == -1)
-               gpio_base = 64;
-       pmic_gpio_pdata.gpio_base = gpio_base;
-       pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
-       pmic_gpio_pdata.gpiointr = 0xffffeff8;
-
-       return &pmic_gpio_pdata;
-}
-
-static void __init *max3111_platform_data(void *info)
-{
-       struct spi_board_info *spi_info = info;
-       int intr = get_gpio_by_name("max3111_int");
-
-       spi_info->mode = SPI_MODE_0;
-       if (intr == -1)
-               return NULL;
-       spi_info->irq = intr + MRST_IRQ_OFFSET;
-       return NULL;
-}
-
-/* we have multiple max7315 on the board ... */
-#define MAX7315_NUM 2
-static void __init *max7315_platform_data(void *info)
-{
-       static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
-       static int nr;
-       struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
-       struct i2c_board_info *i2c_info = info;
-       int gpio_base, intr;
-       char base_pin_name[SFI_NAME_LEN + 1];
-       char intr_pin_name[SFI_NAME_LEN + 1];
-
-       if (nr == MAX7315_NUM) {
-               pr_err("too many max7315s, we only support %d\n",
-                               MAX7315_NUM);
-               return NULL;
-       }
-       /* we have several max7315 on the board, we only need load several
-        * instances of the same pca953x driver to cover them
-        */
-       strcpy(i2c_info->type, "max7315");
-       if (nr++) {
-               sprintf(base_pin_name, "max7315_%d_base", nr);
-               sprintf(intr_pin_name, "max7315_%d_int", nr);
-       } else {
-               strcpy(base_pin_name, "max7315_base");
-               strcpy(intr_pin_name, "max7315_int");
-       }
-
-       gpio_base = get_gpio_by_name(base_pin_name);
-       intr = get_gpio_by_name(intr_pin_name);
-
-       if (gpio_base == -1)
-               return NULL;
-       max7315->gpio_base = gpio_base;
-       if (intr != -1) {
-               i2c_info->irq = intr + MRST_IRQ_OFFSET;
-               max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
-       } else {
-               i2c_info->irq = -1;
-               max7315->irq_base = -1;
-       }
-       return max7315;
-}
-
-static void *tca6416_platform_data(void *info)
-{
-       static struct pca953x_platform_data tca6416;
-       struct i2c_board_info *i2c_info = info;
-       int gpio_base, intr;
-       char base_pin_name[SFI_NAME_LEN + 1];
-       char intr_pin_name[SFI_NAME_LEN + 1];
-
-       strcpy(i2c_info->type, "tca6416");
-       strcpy(base_pin_name, "tca6416_base");
-       strcpy(intr_pin_name, "tca6416_int");
-
-       gpio_base = get_gpio_by_name(base_pin_name);
-       intr = get_gpio_by_name(intr_pin_name);
-
-       if (gpio_base == -1)
-               return NULL;
-       tca6416.gpio_base = gpio_base;
-       if (intr != -1) {
-               i2c_info->irq = intr + MRST_IRQ_OFFSET;
-               tca6416.irq_base = gpio_base + MRST_IRQ_OFFSET;
-       } else {
-               i2c_info->irq = -1;
-               tca6416.irq_base = -1;
-       }
-       return &tca6416;
-}
-
-static void *mpu3050_platform_data(void *info)
-{
-       struct i2c_board_info *i2c_info = info;
-       int intr = get_gpio_by_name("mpu3050_int");
-
-       if (intr == -1)
-               return NULL;
-
-       i2c_info->irq = intr + MRST_IRQ_OFFSET;
-       return NULL;
-}
-
-static void __init *emc1403_platform_data(void *info)
-{
-       static short intr2nd_pdata;
-       struct i2c_board_info *i2c_info = info;
-       int intr = get_gpio_by_name("thermal_int");
-       int intr2nd = get_gpio_by_name("thermal_alert");
-
-       if (intr == -1 || intr2nd == -1)
-               return NULL;
-
-       i2c_info->irq = intr + MRST_IRQ_OFFSET;
-       intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
-
-       return &intr2nd_pdata;
-}
-
-static void __init *lis331dl_platform_data(void *info)
-{
-       static short intr2nd_pdata;
-       struct i2c_board_info *i2c_info = info;
-       int intr = get_gpio_by_name("accel_int");
-       int intr2nd = get_gpio_by_name("accel_2");
-
-       if (intr == -1 || intr2nd == -1)
-               return NULL;
-
-       i2c_info->irq = intr + MRST_IRQ_OFFSET;
-       intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
-
-       return &intr2nd_pdata;
-}
-
-static void __init *no_platform_data(void *info)
-{
-       return NULL;
-}
-
-static struct resource msic_resources[] = {
-       {
-               .start  = INTEL_MSIC_IRQ_PHYS_BASE,
-               .end    = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1,
-               .flags  = IORESOURCE_MEM,
-       },
-};
-
-static struct intel_msic_platform_data msic_pdata;
-
-static struct platform_device msic_device = {
-       .name           = "intel_msic",
-       .id             = -1,
-       .dev            = {
-               .platform_data  = &msic_pdata,
-       },
-       .num_resources  = ARRAY_SIZE(msic_resources),
-       .resource       = msic_resources,
-};
-
-static inline bool mrst_has_msic(void)
-{
-       return mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL;
-}
-
-static int msic_scu_status_change(struct notifier_block *nb,
-                                 unsigned long code, void *data)
-{
-       if (code == SCU_DOWN) {
-               platform_device_unregister(&msic_device);
-               return 0;
-       }
-
-       return platform_device_register(&msic_device);
-}
-
-static int __init msic_init(void)
-{
-       static struct notifier_block msic_scu_notifier = {
-               .notifier_call  = msic_scu_status_change,
-       };
-
-       /*
-        * We need to be sure that the SCU IPC is ready before MSIC device
-        * can be registered.
-        */
-       if (mrst_has_msic())
-               intel_scu_notifier_add(&msic_scu_notifier);
-
-       return 0;
-}
-arch_initcall(msic_init);
-
-/*
- * msic_generic_platform_data - sets generic platform data for the block
- * @info: pointer to the SFI device table entry for this block
- * @block: MSIC block
- *
- * Function sets IRQ number from the SFI table entry for given device to
- * the MSIC platform data.
- */
-static void *msic_generic_platform_data(void *info, enum intel_msic_block block)
-{
-       struct sfi_device_table_entry *entry = info;
-
-       BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST);
-       msic_pdata.irq[block] = entry->irq;
-
-       return no_platform_data(info);
-}
-
-static void *msic_battery_platform_data(void *info)
-{
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY);
-}
-
-static void *msic_gpio_platform_data(void *info)
-{
-       static struct intel_msic_gpio_pdata pdata;
-       int gpio = get_gpio_by_name("msic_gpio_base");
-
-       if (gpio < 0)
-               return NULL;
-
-       pdata.gpio_base = gpio;
-       msic_pdata.gpio = &pdata;
-
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO);
-}
-
-static void *msic_audio_platform_data(void *info)
-{
-       struct platform_device *pdev;
-
-       pdev = platform_device_register_simple("sst-platform", -1, NULL, 0);
-       if (IS_ERR(pdev)) {
-               pr_err("failed to create audio platform device\n");
-               return NULL;
-       }
-
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO);
-}
-
-static void *msic_power_btn_platform_data(void *info)
-{
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN);
-}
-
-static void *msic_ocd_platform_data(void *info)
-{
-       static struct intel_msic_ocd_pdata pdata;
-       int gpio = get_gpio_by_name("ocd_gpio");
-
-       if (gpio < 0)
-               return NULL;
-
-       pdata.gpio = gpio;
-       msic_pdata.ocd = &pdata;
-
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
-}
-
-static void *msic_thermal_platform_data(void *info)
-{
-       return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
-}
-
-/* tc35876x DSI-LVDS bridge chip and panel platform data */
-static void *tc35876x_platform_data(void *data)
-{
-       static struct tc35876x_platform_data pdata;
-
-       /* gpio pins set to -1 will not be used by the driver */
-       pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
-       pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
-       pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
-
-       return &pdata;
-}
-
-static const struct devs_id __initconst device_ids[] = {
-       {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
-       {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
-       {"pmic_gpio", SFI_DEV_TYPE_IPC, 1, &pmic_gpio_platform_data},
-       {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
-       {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
-       {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
-       {"tca6416", SFI_DEV_TYPE_I2C, 1, &tca6416_platform_data},
-       {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
-       {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
-       {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
-       {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data},
-       {"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data},
-
-       /* MSIC subdevices */
-       {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data},
-       {"msic_gpio", SFI_DEV_TYPE_IPC, 1, &msic_gpio_platform_data},
-       {"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data},
-       {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data},
-       {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data},
-       {"msic_thermal", SFI_DEV_TYPE_IPC, 1, &msic_thermal_platform_data},
-
-       {},
-};
-
-#define MAX_IPCDEVS    24
-static struct platform_device *ipc_devs[MAX_IPCDEVS];
-static int ipc_next_dev;
-
-#define MAX_SCU_SPI    24
-static struct spi_board_info *spi_devs[MAX_SCU_SPI];
-static int spi_next_dev;
-
-#define MAX_SCU_I2C    24
-static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
-static int i2c_bus[MAX_SCU_I2C];
-static int i2c_next_dev;
-
-static void __init intel_scu_device_register(struct platform_device *pdev)
-{
-       if(ipc_next_dev == MAX_IPCDEVS)
-               pr_err("too many SCU IPC devices");
-       else
-               ipc_devs[ipc_next_dev++] = pdev;
-}
-
-static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
-{
-       struct spi_board_info *new_dev;
-
-       if (spi_next_dev == MAX_SCU_SPI) {
-               pr_err("too many SCU SPI devices");
-               return;
-       }
-
-       new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
-       if (!new_dev) {
-               pr_err("failed to alloc mem for delayed spi dev %s\n",
-                       sdev->modalias);
-               return;
-       }
-       memcpy(new_dev, sdev, sizeof(*sdev));
-
-       spi_devs[spi_next_dev++] = new_dev;
-}
-
-static void __init intel_scu_i2c_device_register(int bus,
-                                               struct i2c_board_info *idev)
-{
-       struct i2c_board_info *new_dev;
-
-       if (i2c_next_dev == MAX_SCU_I2C) {
-               pr_err("too many SCU I2C devices");
-               return;
-       }
-
-       new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
-       if (!new_dev) {
-               pr_err("failed to alloc mem for delayed i2c dev %s\n",
-                       idev->type);
-               return;
-       }
-       memcpy(new_dev, idev, sizeof(*idev));
-
-       i2c_bus[i2c_next_dev] = bus;
-       i2c_devs[i2c_next_dev++] = new_dev;
-}
-
-BLOCKING_NOTIFIER_HEAD(intel_scu_notifier);
-EXPORT_SYMBOL_GPL(intel_scu_notifier);
-
-/* Called by IPC driver */
-void intel_scu_devices_create(void)
-{
-       int i;
-
-       for (i = 0; i < ipc_next_dev; i++)
-               platform_device_add(ipc_devs[i]);
-
-       for (i = 0; i < spi_next_dev; i++)
-               spi_register_board_info(spi_devs[i], 1);
-
-       for (i = 0; i < i2c_next_dev; i++) {
-               struct i2c_adapter *adapter;
-               struct i2c_client *client;
-
-               adapter = i2c_get_adapter(i2c_bus[i]);
-               if (adapter) {
-                       client = i2c_new_device(adapter, i2c_devs[i]);
-                       if (!client)
-                               pr_err("can't create i2c device %s\n",
-                                       i2c_devs[i]->type);
-               } else
-                       i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
-       }
-       intel_scu_notifier_post(SCU_AVAILABLE, NULL);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_create);
-
-/* Called by IPC driver */
-void intel_scu_devices_destroy(void)
-{
-       int i;
-
-       intel_scu_notifier_post(SCU_DOWN, NULL);
-
-       for (i = 0; i < ipc_next_dev; i++)
-               platform_device_del(ipc_devs[i]);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
-
-static void __init install_irq_resource(struct platform_device *pdev, int irq)
-{
-       /* Single threaded */
-       static struct resource __initdata res = {
-               .name = "IRQ",
-               .flags = IORESOURCE_IRQ,
-       };
-       res.start = irq;
-       platform_device_add_resources(pdev, &res, 1);
-}
-
-static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
-{
-       const struct devs_id *dev = device_ids;
-       struct platform_device *pdev;
-       void *pdata = NULL;
-
-       while (dev->name[0]) {
-               if (dev->type == SFI_DEV_TYPE_IPC &&
-                       !strncmp(dev->name, entry->name, SFI_NAME_LEN)) {
-                       pdata = dev->get_platform_data(entry);
-                       break;
-               }
-               dev++;
-       }
-
-       /*
-        * On Medfield the platform device creation is handled by the MSIC
-        * MFD driver so we don't need to do it here.
-        */
-       if (mrst_has_msic())
-               return;
-
-       pdev = platform_device_alloc(entry->name, 0);
-       if (pdev == NULL) {
-               pr_err("out of memory for SFI platform device '%s'.\n",
-                       entry->name);
-               return;
-       }
-       install_irq_resource(pdev, entry->irq);
-
-       pdev->dev.platform_data = pdata;
-       intel_scu_device_register(pdev);
-}
-
-static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
-{
-       const struct devs_id *dev = device_ids;
-       void *pdata = NULL;
-
-       while (dev->name[0]) {
-               if (dev->type == SFI_DEV_TYPE_SPI &&
-                               !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
-                       pdata = dev->get_platform_data(spi_info);
-                       break;
-               }
-               dev++;
-       }
-       spi_info->platform_data = pdata;
-       if (dev->delay)
-               intel_scu_spi_device_register(spi_info);
-       else
-               spi_register_board_info(spi_info, 1);
-}
-
-static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
-{
-       const struct devs_id *dev = device_ids;
-       void *pdata = NULL;
-
-       while (dev->name[0]) {
-               if (dev->type == SFI_DEV_TYPE_I2C &&
-                       !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
-                       pdata = dev->get_platform_data(i2c_info);
-                       break;
-               }
-               dev++;
-       }
-       i2c_info->platform_data = pdata;
-
-       if (dev->delay)
-               intel_scu_i2c_device_register(bus, i2c_info);
-       else
-               i2c_register_board_info(bus, i2c_info, 1);
- }
-
-
-static int __init sfi_parse_devs(struct sfi_table_header *table)
-{
-       struct sfi_table_simple *sb;
-       struct sfi_device_table_entry *pentry;
-       struct spi_board_info spi_info;
-       struct i2c_board_info i2c_info;
-       int num, i, bus;
-       int ioapic;
-       struct io_apic_irq_attr irq_attr;
-
-       sb = (struct sfi_table_simple *)table;
-       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
-       pentry = (struct sfi_device_table_entry *)sb->pentry;
-
-       for (i = 0; i < num; i++, pentry++) {
-               int irq = pentry->irq;
-
-               if (irq != (u8)0xff) { /* native RTE case */
-                       /* these SPI2 devices are not exposed to system as PCI
-                        * devices, but they have separate RTE entry in IOAPIC
-                        * so we have to enable them one by one here
-                        */
-                       ioapic = mp_find_ioapic(irq);
-                       irq_attr.ioapic = ioapic;
-                       irq_attr.ioapic_pin = irq;
-                       irq_attr.trigger = 1;
-                       irq_attr.polarity = 1;
-                       io_apic_set_pci_routing(NULL, irq, &irq_attr);
-               } else
-                       irq = 0; /* No irq */
-
-               switch (pentry->type) {
-               case SFI_DEV_TYPE_IPC:
-                       pr_debug("info[%2d]: IPC bus, name = %16.16s, "
-                               "irq = 0x%2x\n", i, pentry->name, pentry->irq);
-                       sfi_handle_ipc_dev(pentry);
-                       break;
-               case SFI_DEV_TYPE_SPI:
-                       memset(&spi_info, 0, sizeof(spi_info));
-                       strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
-                       spi_info.irq = irq;
-                       spi_info.bus_num = pentry->host_num;
-                       spi_info.chip_select = pentry->addr;
-                       spi_info.max_speed_hz = pentry->max_freq;
-                       pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
-                               "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
-                               spi_info.bus_num,
-                               spi_info.modalias,
-                               spi_info.irq,
-                               spi_info.max_speed_hz,
-                               spi_info.chip_select);
-                       sfi_handle_spi_dev(&spi_info);
-                       break;
-               case SFI_DEV_TYPE_I2C:
-                       memset(&i2c_info, 0, sizeof(i2c_info));
-                       bus = pentry->host_num;
-                       strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
-                       i2c_info.irq = irq;
-                       i2c_info.addr = pentry->addr;
-                       pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
-                               "irq = 0x%2x, addr = 0x%x\n", i, bus,
-                               i2c_info.type,
-                               i2c_info.irq,
-                               i2c_info.addr);
-                       sfi_handle_i2c_dev(bus, &i2c_info);
-                       break;
-               case SFI_DEV_TYPE_UART:
-               case SFI_DEV_TYPE_HSI:
-               default:
-                       ;
-               }
-       }
-       return 0;
-}
-
-static int __init mrst_platform_init(void)
-{
-       sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
-       sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
-       return 0;
-}
-arch_initcall(mrst_platform_init);
-
-/*
- * we will search these buttons in SFI GPIO table (by name)
- * and register them dynamically. Please add all possible
- * buttons here, we will shrink them if no GPIO found.
- */
-static struct gpio_keys_button gpio_button[] = {
-       {KEY_POWER,             -1, 1, "power_btn",     EV_KEY, 0, 3000},
-       {KEY_PROG1,             -1, 1, "prog_btn1",     EV_KEY, 0, 20},
-       {KEY_PROG2,             -1, 1, "prog_btn2",     EV_KEY, 0, 20},
-       {SW_LID,                -1, 1, "lid_switch",    EV_SW,  0, 20},
-       {KEY_VOLUMEUP,          -1, 1, "vol_up",        EV_KEY, 0, 20},
-       {KEY_VOLUMEDOWN,        -1, 1, "vol_down",      EV_KEY, 0, 20},
-       {KEY_CAMERA,            -1, 1, "camera_full",   EV_KEY, 0, 20},
-       {KEY_CAMERA_FOCUS,      -1, 1, "camera_half",   EV_KEY, 0, 20},
-       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw1",        EV_SW,  0, 20},
-       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw2",        EV_SW,  0, 20},
-};
-
-static struct gpio_keys_platform_data mrst_gpio_keys = {
-       .buttons        = gpio_button,
-       .rep            = 1,
-       .nbuttons       = -1, /* will fill it after search */
-};
-
-static struct platform_device pb_device = {
-       .name           = "gpio-keys",
-       .id             = -1,
-       .dev            = {
-               .platform_data  = &mrst_gpio_keys,
-       },
-};
-
-/*
- * Shrink the non-existent buttons, register the gpio button
- * device if there is some
- */
-static int __init pb_keys_init(void)
-{
-       struct gpio_keys_button *gb = gpio_button;
-       int i, num, good = 0;
-
-       num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
-       for (i = 0; i < num; i++) {
-               gb[i].gpio = get_gpio_by_name(gb[i].desc);
-               pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
-               if (gb[i].gpio == -1)
-                       continue;
-
-               if (i != good)
-                       gb[good] = gb[i];
-               good++;
-       }
-
-       if (good) {
-               mrst_gpio_keys.nbuttons = good;
-               return platform_device_register(&pb_device);
-       }
-       return 0;
-}
-late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
index 6c40995fefb81041586d4cf9d0acc1d4f74e460c..52079bebd0147f247d8de819fc4619c32b45147f 100644 (file)
@@ -1 +1 @@
-obj-$(CONFIG_X86_UV)           += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
+obj-$(CONFIG_X86_UV)           += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
new file mode 100644 (file)
index 0000000..9b8ac60
--- /dev/null
@@ -0,0 +1,711 @@
+/*
+ * SGI NMI/TRACE support routines
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009-2013 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Mike Travis
+ */
+
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/kdb.h>
+#include <linux/kexec.h>
+#include <linux/kgdb.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/apic.h>
+#include <asm/current.h>
+#include <asm/kdebug.h>
+#include <asm/local64.h>
+#include <asm/nmi.h>
+#include <asm/traps.h>
+#include <asm/uv/uv.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+
+void (*uv_trace_func)(const char *f, const int l, const char *fmt, ...);
+EXPORT_SYMBOL(uv_trace_func);
+
+void (*uv_trace_nmi_func)(unsigned int reason, struct pt_regs *regs);
+EXPORT_SYMBOL(uv_trace_nmi_func);
+
+
+/*
+ * UV handler for NMI
+ *
+ * Handle system-wide NMI events generated by the global 'power nmi' command.
+ *
+ * Basic operation is to field the NMI interrupt on each cpu and wait
+ * until all cpus have arrived in the nmi handler.  If some cpus do not
+ * make it into the handler, try to force them in with the IPI(NMI) signal.
+ *
+ * We also have to minimize UV Hub MMR accesses as much as possible, as
+ * they disrupt the UV Hub's primary mission of directing NumaLink traffic
+ * and can cause system problems.
+ *
+ * To do this we register our primary NMI notifier on the NMI_UNKNOWN
+ * chain.  This reduces the number of false NMI calls when the perf
+ * tools are running, which generate an enormous number of NMIs per
+ * second (~4M/s for 1024 cpu threads).  Our secondary NMI handler is
+ * very short, as it only checks whether it has been "pinged" with the
+ * IPI(NMI) signal mentioned above, and does not read the UV Hub's MMR.
+ */
+
+static struct uv_hub_nmi_s **uv_hub_nmi_list;
+
+DEFINE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_nmi);
+
+static unsigned long nmi_mmr;
+static unsigned long nmi_mmr_clear;
+static unsigned long nmi_mmr_pending;
+
+static atomic_t        uv_in_nmi;
+static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1);
+static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
+static atomic_t uv_nmi_slave_continue;
+static atomic_t uv_nmi_kexec_failed;
+static cpumask_var_t uv_nmi_cpu_mask;
+
+/* Values for uv_nmi_slave_continue */
+#define SLAVE_CLEAR    0
+#define SLAVE_CONTINUE 1
+#define SLAVE_EXIT     2
+
+/*
+ * By default, all stack dumps go to both the console and the log buffer.
+ * Lower the level to send them to the log buffer only.
+ */
+static int uv_nmi_loglevel = 7;
+module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644);
+
+/*
+ * The following values show statistics on how perf events are affecting
+ * this system.
+ */
+static int param_get_local64(char *buffer, const struct kernel_param *kp)
+{
+       return sprintf(buffer, "%lu\n", local64_read((local64_t *)kp->arg));
+}
+
+static int param_set_local64(const char *val, const struct kernel_param *kp)
+{
+       /* clear on any write */
+       local64_set((local64_t *)kp->arg, 0);
+       return 0;
+}
+
+static struct kernel_param_ops param_ops_local64 = {
+       .get = param_get_local64,
+       .set = param_set_local64,
+};
+#define param_check_local64(name, p) __param_check(name, p, local64_t)
+
+static local64_t uv_nmi_count;
+module_param_named(nmi_count, uv_nmi_count, local64, 0644);
+
+static local64_t uv_nmi_misses;
+module_param_named(nmi_misses, uv_nmi_misses, local64, 0644);
+
+static local64_t uv_nmi_ping_count;
+module_param_named(ping_count, uv_nmi_ping_count, local64, 0644);
+
+static local64_t uv_nmi_ping_misses;
+module_param_named(ping_misses, uv_nmi_ping_misses, local64, 0644);
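
Because these counters are declared with module_param_named(..., 0644) and
this file is built into the kernel, they should appear under
/sys/module/uv_nmi/parameters/ (path assumed from the uv_nmi.o object
name). Reading nmi_count gives the total NMI events seen; per
param_set_local64() above, writing any value resets a counter to zero.
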
+
+/*
+ * The following values allow tuning for large systems under heavy load.
+ */
+static int uv_nmi_initial_delay = 100;
+module_param_named(initial_delay, uv_nmi_initial_delay, int, 0644);
+
+static int uv_nmi_slave_delay = 100;
+module_param_named(slave_delay, uv_nmi_slave_delay, int, 0644);
+
+static int uv_nmi_loop_delay = 100;
+module_param_named(loop_delay, uv_nmi_loop_delay, int, 0644);
+
+static int uv_nmi_trigger_delay = 10000;
+module_param_named(trigger_delay, uv_nmi_trigger_delay, int, 0644);
+
+static int uv_nmi_wait_count = 100;
+module_param_named(wait_count, uv_nmi_wait_count, int, 0644);
+
+static int uv_nmi_retry_count = 500;
+module_param_named(retry_count, uv_nmi_retry_count, int, 0644);
+
+/*
+ * Valid NMI Actions:
+ *  "dump"     - dump process stack for each cpu
+ *  "ips"      - dump IP info for each cpu
+ *  "kdump"    - do crash dump
+ *  "kdb"      - enter KDB/KGDB (default)
+ */
+static char uv_nmi_action[8] = "kdb";
+module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644);
+
+static inline bool uv_nmi_action_is(const char *action)
+{
+       return (strncmp(uv_nmi_action, action, strlen(action)) == 0);
+}
+
+/* Set up which NMI support is present in the system */
+static void uv_nmi_setup_mmrs(void)
+{
+       if (uv_read_local_mmr(UVH_NMI_MMRX_SUPPORTED)) {
+               uv_write_local_mmr(UVH_NMI_MMRX_REQ,
+                                       1UL << UVH_NMI_MMRX_REQ_SHIFT);
+               nmi_mmr = UVH_NMI_MMRX;
+               nmi_mmr_clear = UVH_NMI_MMRX_CLEAR;
+               nmi_mmr_pending = 1UL << UVH_NMI_MMRX_SHIFT;
+               pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMRX_TYPE);
+       } else {
+               nmi_mmr = UVH_NMI_MMR;
+               nmi_mmr_clear = UVH_NMI_MMR_CLEAR;
+               nmi_mmr_pending = 1UL << UVH_NMI_MMR_SHIFT;
+               pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMR_TYPE);
+       }
+}
+
+/* Read NMI MMR and check if NMI flag was set by BMC. */
+static inline int uv_nmi_test_mmr(struct uv_hub_nmi_s *hub_nmi)
+{
+       hub_nmi->nmi_value = uv_read_local_mmr(nmi_mmr);
+       atomic_inc(&hub_nmi->read_mmr_count);
+       return !!(hub_nmi->nmi_value & nmi_mmr_pending);
+}
+
+static inline void uv_local_mmr_clear_nmi(void)
+{
+       uv_write_local_mmr(nmi_mmr_clear, nmi_mmr_pending);
+}
+
+/*
+ * If this is the first cpu in on this hub, set the hub_nmi "in_nmi" and
+ * "owner" values and return true.  If it is also the first cpu in on the
+ * whole system, set the global "in_nmi" flag.
+ */
+static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi)
+{
+       int first = atomic_add_unless(&hub_nmi->in_nmi, 1, 1);
+
+       if (first) {
+               atomic_set(&hub_nmi->cpu_owner, cpu);
+               if (atomic_add_unless(&uv_in_nmi, 1, 1))
+                       atomic_set(&uv_nmi_cpu, cpu);
+
+               atomic_inc(&hub_nmi->nmi_count);
+       }
+       return first;
+}
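
The election above hinges on atomic_add_unless(v, 1, 1): it increments
only when the value is not already 1 and returns nonzero for the single
caller whose increment succeeded, so exactly one cpu per hub claims
ownership and one cpu system-wide becomes the NMI master. A minimal
sketch of the idiom, with invented names:

	static atomic_t example_flag = ATOMIC_INIT(0);

	/* Returns 1 for the single winning caller, 0 for all later ones. */
	static int example_try_become_master(void)
	{
		return atomic_add_unless(&example_flag, 1, 1);
	}
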
+
+/* Check if this is a system NMI event */
+static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi)
+{
+       int cpu = smp_processor_id();
+       int nmi = 0;
+
+       local64_inc(&uv_nmi_count);
+       uv_cpu_nmi.queries++;
+
+       do {
+               nmi = atomic_read(&hub_nmi->in_nmi);
+               if (nmi)
+                       break;
+
+               if (raw_spin_trylock(&hub_nmi->nmi_lock)) {
+
+                       /* check hub MMR NMI flag */
+                       if (uv_nmi_test_mmr(hub_nmi)) {
+                               uv_set_in_nmi(cpu, hub_nmi);
+                               nmi = 1;
+                               break;
+                       }
+
+                       /* MMR NMI flag is clear */
+                       raw_spin_unlock(&hub_nmi->nmi_lock);
+
+               } else {
+                       /* wait a moment for the hub nmi locker to set the flag */
+                       cpu_relax();
+                       udelay(uv_nmi_slave_delay);
+
+                       /* re-check hub in_nmi flag */
+                       nmi = atomic_read(&hub_nmi->in_nmi);
+                       if (nmi)
+                               break;
+               }
+
+               /* check if this BMC missed setting the MMR NMI flag */
+               if (!nmi) {
+                       nmi = atomic_read(&uv_in_nmi);
+                       if (nmi)
+                               uv_set_in_nmi(cpu, hub_nmi);
+               }
+
+       } while (0);
+
+       if (!nmi)
+               local64_inc(&uv_nmi_misses);
+
+       return nmi;
+}
+
+/* Need to reset the NMI MMR register, but only once per hub. */
+static inline void uv_clear_nmi(int cpu)
+{
+       struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi;
+
+       if (cpu == atomic_read(&hub_nmi->cpu_owner)) {
+               atomic_set(&hub_nmi->cpu_owner, -1);
+               atomic_set(&hub_nmi->in_nmi, 0);
+               uv_local_mmr_clear_nmi();
+               raw_spin_unlock(&hub_nmi->nmi_lock);
+       }
+}
+
+/* Print non-responding cpus */
+static void uv_nmi_nr_cpus_pr(char *fmt)
+{
+       static char cpu_list[1024];
+       int len = sizeof(cpu_list);
+       int c = cpumask_weight(uv_nmi_cpu_mask);
+       int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask);
+
+       if (n >= len-1)
+               strcpy(&cpu_list[len - 6], "...\n");
+
+       printk(fmt, c, cpu_list);
+}
+
+/* Ping non-responding cpus, attempting to force them into the NMI handler */
+static void uv_nmi_nr_cpus_ping(void)
+{
+       int cpu;
+
+       for_each_cpu(cpu, uv_nmi_cpu_mask)
+               atomic_set(&uv_cpu_nmi_per(cpu).pinging, 1);
+
+       apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI);
+}
+
+/* Clean up flags for cpus that ignored both NMI and ping */
+static void uv_nmi_cleanup_mask(void)
+{
+       int cpu;
+
+       for_each_cpu(cpu, uv_nmi_cpu_mask) {
+               atomic_set(&uv_cpu_nmi_per(cpu).pinging, 0);
+               atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_OUT);
+               cpumask_clear_cpu(cpu, uv_nmi_cpu_mask);
+       }
+}
+
+/* Loop waiting as cpus enter nmi handler */
+static int uv_nmi_wait_cpus(int first)
+{
+       int i, j, k, n = num_online_cpus();
+       int last_k = 0, waiting = 0;
+
+       if (first) {
+               cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask);
+               k = 0;
+       } else {
+               k = n - cpumask_weight(uv_nmi_cpu_mask);
+       }
+
+       udelay(uv_nmi_initial_delay);
+       for (i = 0; i < uv_nmi_retry_count; i++) {
+               int loop_delay = uv_nmi_loop_delay;
+
+               for_each_cpu(j, uv_nmi_cpu_mask) {
+                       if (atomic_read(&uv_cpu_nmi_per(j).state)) {
+                               cpumask_clear_cpu(j, uv_nmi_cpu_mask);
+                               if (++k >= n)
+                                       break;
+                       }
+               }
+               if (k >= n) {           /* all in? */
+                       k = n;
+                       break;
+               }
+               if (last_k != k) {      /* abort if no new cpus coming in */
+                       last_k = k;
+                       waiting = 0;
+               } else if (++waiting > uv_nmi_wait_count)
+                       break;
+
+               /* extend delay if waiting only for cpu 0 */
+               if (waiting && (n - k) == 1 &&
+                   cpumask_test_cpu(0, uv_nmi_cpu_mask))
+                       loop_delay *= 100;
+
+               udelay(loop_delay);
+       }
+       atomic_set(&uv_nmi_cpus_in_nmi, k);
+       return n - k;
+}
+
+/* Wait until all slave cpus have entered UV NMI handler */
+static void uv_nmi_wait(int master)
+{
+       /* indicate this cpu is in */
+       atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_IN);
+
+       /* if not the first cpu in (the master), then we are a slave cpu */
+       if (!master)
+               return;
+
+       do {
+               /* wait for all other cpus to gather here */
+               if (!uv_nmi_wait_cpus(1))
+                       break;
+
+               /* if not all made it in, send IPI NMI to them */
+               uv_nmi_nr_cpus_pr(KERN_ALERT
+                       "UV: Sending NMI IPI to %d non-responding CPUs: %s\n");
+               uv_nmi_nr_cpus_ping();
+
+               /* if all cpus are in, then done */
+               if (!uv_nmi_wait_cpus(0))
+                       break;
+
+               uv_nmi_nr_cpus_pr(KERN_ALERT
+                       "UV: %d CPUs not in NMI loop: %s\n");
+       } while (0);
+
+       pr_alert("UV: %d of %d CPUs in NMI\n",
+               atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus());
+}
+
+static void uv_nmi_dump_cpu_ip_hdr(void)
+{
+       printk(KERN_DEFAULT
+               "\nUV: %4s %6s %-32s %s   (Note: PID 0 not listed)\n",
+               "CPU", "PID", "COMMAND", "IP");
+}
+
+static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs)
+{
+       printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ",
+               cpu, current->pid, current->comm);
+
+       printk_address(regs->ip, 1);
+}
+
+/* Dump this cpu's state */
+static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
+{
+       const char *dots = " ................................. ";
+
+       if (uv_nmi_action_is("ips")) {
+               if (cpu == 0)
+                       uv_nmi_dump_cpu_ip_hdr();
+
+               if (current->pid != 0)
+                       uv_nmi_dump_cpu_ip(cpu, regs);
+
+       } else if (uv_nmi_action_is("dump")) {
+               printk(KERN_DEFAULT
+                       "UV:%sNMI process trace for CPU %d\n", dots, cpu);
+               show_regs(regs);
+       }
+       atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
+}
+
+/* Trigger a slave cpu to dump its state */
+static void uv_nmi_trigger_dump(int cpu)
+{
+       int retry = uv_nmi_trigger_delay;
+
+       if (atomic_read(&uv_cpu_nmi_per(cpu).state) != UV_NMI_STATE_IN)
+               return;
+
+       atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP);
+       do {
+               cpu_relax();
+               udelay(10);
+               if (atomic_read(&uv_cpu_nmi_per(cpu).state)
+                               != UV_NMI_STATE_DUMP)
+                       return;
+       } while (--retry > 0);
+
+       pr_crit("UV: CPU %d stuck in process dump function\n", cpu);
+       atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP_DONE);
+}
+
+/* Wait until all cpus are ready to exit */
+static void uv_nmi_sync_exit(int master)
+{
+       atomic_dec(&uv_nmi_cpus_in_nmi);
+       if (master) {
+               while (atomic_read(&uv_nmi_cpus_in_nmi) > 0)
+                       cpu_relax();
+               atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR);
+       } else {
+               while (atomic_read(&uv_nmi_slave_continue))
+                       cpu_relax();
+       }
+}
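+/*
+ * uv_nmi_sync_exit() acts as a countdown barrier: each cpu decrements
+ * uv_nmi_cpus_in_nmi, the master spins until the count drops to zero
+ * and then clears uv_nmi_slave_continue, while each slave spins until
+ * that clear becomes visible before leaving the handler.
+ */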
+
+/* Walk through cpu list and dump state of each */
+static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
+{
+       if (master) {
+               int tcpu;
+               int ignored = 0;
+               int saved_console_loglevel = console_loglevel;
+
+               pr_alert("UV: tracing %s for %d CPUs from CPU %d\n",
+                       uv_nmi_action_is("ips") ? "IPs" : "processes",
+                       atomic_read(&uv_nmi_cpus_in_nmi), cpu);
+
+               console_loglevel = uv_nmi_loglevel;
+               atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
+               for_each_online_cpu(tcpu) {
+                       if (cpumask_test_cpu(tcpu, uv_nmi_cpu_mask))
+                               ignored++;
+                       else if (tcpu == cpu)
+                               uv_nmi_dump_state_cpu(tcpu, regs);
+                       else
+                               uv_nmi_trigger_dump(tcpu);
+               }
+               if (ignored)
+                       printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n",
+                               ignored);
+
+               console_loglevel = saved_console_loglevel;
+               pr_alert("UV: process trace complete\n");
+       } else {
+               while (!atomic_read(&uv_nmi_slave_continue))
+                       cpu_relax();
+               while (atomic_read(&uv_cpu_nmi.state) != UV_NMI_STATE_DUMP)
+                       cpu_relax();
+               uv_nmi_dump_state_cpu(cpu, regs);
+       }
+       uv_nmi_sync_exit(master);
+}
+
+static void uv_nmi_touch_watchdogs(void)
+{
+       touch_softlockup_watchdog_sync();
+       clocksource_touch_watchdog();
+       rcu_cpu_stall_reset();
+       touch_nmi_watchdog();
+}
+
+#if defined(CONFIG_KEXEC)
+static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+{
+       /* Call crash to dump system state */
+       if (master) {
+               pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu);
+               crash_kexec(regs);
+
+               pr_emerg("UV: crash_kexec unexpectedly returned, ");
+               if (!kexec_crash_image) {
+                       pr_cont("crash kernel not loaded\n");
+                       atomic_set(&uv_nmi_kexec_failed, 1);
+                       uv_nmi_sync_exit(1);
+                       return;
+               }
+               pr_cont("kexec busy, stalling cpus while waiting\n");
+       }
+
+       /* If crash_kexec fails, the slaves should return; otherwise stall */
+       while (atomic_read(&uv_nmi_kexec_failed) == 0)
+               mdelay(10);
+
+       /* Crash kernel most likely not loaded, return in an orderly fashion */
+       uv_nmi_sync_exit(0);
+}
+
+#else /* !CONFIG_KEXEC */
+static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+{
+       if (master)
+               pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
+}
+#endif /* !CONFIG_KEXEC */
+
+#ifdef CONFIG_KGDB_KDB
+/* Call KDB from NMI handler */
+static void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+{
+       int ret;
+
+       if (master) {
+               /* call KGDB NMI handler as MASTER */
+               ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs,
+                                       &uv_nmi_slave_continue);
+               if (ret) {
+                       pr_alert("KDB returned error, is kgdboc set?\n");
+                       atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
+               }
+       } else {
+               /* wait for KGDB signal that it's ready for slaves to enter */
+               int sig;
+
+               do {
+                       cpu_relax();
+                       sig = atomic_read(&uv_nmi_slave_continue);
+               } while (!sig);
+
+               /* call KGDB as slave */
+               if (sig == SLAVE_CONTINUE)
+                       kgdb_nmicallback(cpu, regs);
+       }
+       uv_nmi_sync_exit(master);
+}
+
+#else /* !CONFIG_KGDB_KDB */
+static inline void uv_call_kdb(int cpu, struct pt_regs *regs, int master)
+{
+       pr_err("UV: NMI error: KGDB/KDB is not enabled in this kernel\n");
+}
+#endif /* !CONFIG_KGDB_KDB */
+
+/*
+ * UV NMI handler
+ */
+int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
+{
+       struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi;
+       int cpu = smp_processor_id();
+       int master = 0;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       /* If not a UV System NMI, ignore */
+       if (!atomic_read(&uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) {
+               local_irq_restore(flags);
+               return NMI_DONE;
+       }
+
+       /* Call possible NMI trace function */
+       if (unlikely(uv_trace_nmi_func))
+               (uv_trace_nmi_func)(reason, regs);
+
+       /* Indicate we are the first CPU into the NMI handler */
+       master = (atomic_read(&uv_nmi_cpu) == cpu);
+
+       /* If NMI action is "kdump", then attempt to do it */
+       if (uv_nmi_action_is("kdump"))
+               uv_nmi_kdump(cpu, master, regs);
+
+       /* Pause as all cpus enter the NMI handler */
+       uv_nmi_wait(master);
+
+       /* Dump state of each cpu */
+       if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump"))
+               uv_nmi_dump_state(cpu, regs, master);
+
+       /* Call KDB if enabled */
+       else if (uv_nmi_action_is("kdb"))
+               uv_call_kdb(cpu, regs, master);
+
+       /* Clear per_cpu "in nmi" flag */
+       atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT);
+
+       /* Clear MMR NMI flag on each hub */
+       uv_clear_nmi(cpu);
+
+       /* Clear global flags */
+       if (master) {
+               if (cpumask_weight(uv_nmi_cpu_mask))
+                       uv_nmi_cleanup_mask();
+               atomic_set(&uv_nmi_cpus_in_nmi, -1);
+               atomic_set(&uv_nmi_cpu, -1);
+               atomic_set(&uv_in_nmi, 0);
+       }
+
+       uv_nmi_touch_watchdogs();
+       local_irq_restore(flags);
+
+       return NMI_HANDLED;
+}
+
+/*
+ * NMI handler for pulling in CPUs when perf events are grabbing our NMI
+ */
+int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
+{
+       int ret;
+
+       uv_cpu_nmi.queries++;
+       if (!atomic_read(&uv_cpu_nmi.pinging)) {
+               local64_inc(&uv_nmi_ping_misses);
+               return NMI_DONE;
+       }
+
+       uv_cpu_nmi.pings++;
+       local64_inc(&uv_nmi_ping_count);
+       ret = uv_handle_nmi(reason, regs);
+       atomic_set(&uv_cpu_nmi.pinging, 0);
+       return ret;
+}
+
+void uv_register_nmi_notifier(void)
+{
+       if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
+               pr_warn("UV: NMI handler failed to register\n");
+
+       if (register_nmi_handler(NMI_LOCAL, uv_handle_nmi_ping, 0, "uvping"))
+               pr_warn("UV: PING NMI handler failed to register\n");
+}
+
+void uv_nmi_init(void)
+{
+       unsigned int value;
+
+       /*
+        * Unmask NMI on all cpus
+        */
+       value = apic_read(APIC_LVT1) | APIC_DM_NMI;
+       value &= ~APIC_LVT_MASKED;
+       apic_write(APIC_LVT1, value);
+}
+
+void uv_nmi_setup(void)
+{
+       int size = sizeof(void *) * (1 << NODES_SHIFT);
+       int cpu, nid;
+
+       /* Setup hub nmi info */
+       uv_nmi_setup_mmrs();
+       uv_hub_nmi_list = kzalloc(size, GFP_KERNEL);
+       pr_info("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size);
+       BUG_ON(!uv_hub_nmi_list);
+       size = sizeof(struct uv_hub_nmi_s);
+       for_each_present_cpu(cpu) {
+               nid = cpu_to_node(cpu);
+               if (uv_hub_nmi_list[nid] == NULL) {
+                       uv_hub_nmi_list[nid] = kzalloc_node(size,
+                                                           GFP_KERNEL, nid);
+                       BUG_ON(!uv_hub_nmi_list[nid]);
+                       raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock));
+                       atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1);
+               }
+               uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid];
+       }
+       BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL));
+}
+
+
index f7bab68a4b83e4094db9b70c50292fddd88098c1..11f9285a2ff66726b6c62c7eb608e03f516dcc27 100644 (file)
@@ -722,15 +722,25 @@ static void percpu_init(void)
 
 /*
  * Check to see if a symbol lies in the .data..percpu section.
- * For some as yet not understood reason the "__init_begin"
- * symbol which immediately preceeds the .data..percpu section
- * also shows up as it it were part of it so we do an explict
- * check for that symbol name and ignore it.
+ *
+ * The linker incorrectly associates some symbols with the
+ * .data..percpu section so we also need to check the symbol
+ * name to make sure that we classify the symbol correctly.
+ *
+ * The GNU linker incorrectly associates:
+ *     __init_begin
+ *     __per_cpu_load
+ *
+ * The "gold" linker incorrectly associates:
+ *     init_per_cpu__irq_stack_union
+ *     init_per_cpu__gdt_page
  */
 static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 {
        return (sym->st_shndx == per_cpu_shndx) &&
-               strcmp(symname, "__init_begin");
+               strcmp(symname, "__init_begin") &&
+               strcmp(symname, "__per_cpu_load") &&
+               strncmp(symname, "init_per_cpu_", 13);
 }
 
 
index 1b982641ec350e35ad0df918ff512da08393279b..228d6aee3a16528f0e5972371e5a1fd6feac9a79 100644 (file)
@@ -28,3 +28,4 @@ generic-y += termios.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += xor.h
+generic-y += preempt.h
index 35c8f2bbcc40b45510ea7d1b67ee294efc33471b..644516d9bde6cf18a824d29c3ae1730de30fd1a6 100644 (file)
@@ -119,17 +119,10 @@ static struct dmi_system_id processor_power_dmi_table[] = {
  */
 static void acpi_safe_halt(void)
 {
-       current_thread_info()->status &= ~TS_POLLING;
-       /*
-        * TS_POLLING-cleared state must be visible before we
-        * test NEED_RESCHED:
-        */
-       smp_mb();
-       if (!need_resched()) {
+       if (!tif_need_resched()) {
                safe_halt();
                local_irq_disable();
        }
-       current_thread_info()->status |= TS_POLLING;
 }
 
 #ifdef ARCH_APICTIMER_STOPS_ON_C3
@@ -734,6 +727,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
        if (unlikely(!pr))
                return -EINVAL;
 
+       if (cx->entry_method == ACPI_CSTATE_FFH) {
+               if (current_set_polling_and_test())
+                       return -EINVAL;
+       }
+
        lapic_timer_state_broadcast(pr, cx, 1);
        acpi_idle_do_entry(cx);
 
@@ -787,18 +785,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
        if (unlikely(!pr))
                return -EINVAL;
 
-       if (cx->entry_method != ACPI_CSTATE_FFH) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we test
-                * NEED_RESCHED:
-                */
-               smp_mb();
-
-               if (unlikely(need_resched())) {
-                       current_thread_info()->status |= TS_POLLING;
+       if (cx->entry_method == ACPI_CSTATE_FFH) {
+               if (current_set_polling_and_test())
                        return -EINVAL;
-               }
        }
 
        /*
@@ -816,9 +805,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
        sched_clock_idle_wakeup_event(0);
 
-       if (cx->entry_method != ACPI_CSTATE_FFH)
-               current_thread_info()->status |= TS_POLLING;
-
        lapic_timer_state_broadcast(pr, cx, 0);
        return index;
 }
@@ -855,18 +841,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
                }
        }
 
-       if (cx->entry_method != ACPI_CSTATE_FFH) {
-               current_thread_info()->status &= ~TS_POLLING;
-               /*
-                * TS_POLLING-cleared state must be visible before we test
-                * NEED_RESCHED:
-                */
-               smp_mb();
-
-               if (unlikely(need_resched())) {
-                       current_thread_info()->status |= TS_POLLING;
+       if (cx->entry_method == ACPI_CSTATE_FFH) {
+               if (current_set_polling_and_test())
                        return -EINVAL;
-               }
        }
 
        acpi_unlazy_tlb(smp_processor_id());
@@ -912,9 +889,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
        sched_clock_idle_wakeup_event(0);
 
-       if (cx->entry_method != ACPI_CSTATE_FFH)
-               current_thread_info()->status |= TS_POLLING;
-
        lapic_timer_state_broadcast(pr, cx, 0);
        return index;
 }
index 971d796e071d889c290029ada3b2dc862f10fcbb..5e940f839a2d5877fdc7db5ef758ef7591f36c62 100644 (file)
@@ -75,6 +75,21 @@ config ARM_ARCH_TIMER
        bool
        select CLKSRC_OF if OF
 
+config ARM_ARCH_TIMER_EVTSTREAM
+       bool "Support for ARM architected timer event stream generation"
+       default y if ARM_ARCH_TIMER
+       help
+         This option enables support for event stream generation based on
+         the ARM architected timer. It is used for waking up CPUs executing
+         the wfe instruction at a frequency represented as a power-of-2
+         divisor of the clock rate.
+         The main use of the event stream is wfe-based timeouts of userspace
+         locking implementations. It might also be useful for imposing timeout
+         on wfe to safeguard against any programming errors in case an expected
+         event is not generated.
+         This must be disabled for hardware validation purposes to detect any
+         hardware anomalies of missing events.
+
 config ARM_GLOBAL_TIMER
        bool
        select CLKSRC_OF if OF
index fbd9ccd5e114ccdf1eb2db91499315689f6ed407..95fb944e15ee0579a6f437f8440ba93fe8443524 100644 (file)
 #include <linux/device.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
+#include <linux/cpu_pm.h>
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/sched_clock.h>
 
 #include <asm/arch_timer.h>
 #include <asm/virt.h>
@@ -294,6 +296,19 @@ static void __arch_timer_setup(unsigned type,
        clockevents_config_and_register(clk, arch_timer_rate, 0xf, 0x7fffffff);
 }
 
+static void arch_timer_configure_evtstream(void)
+{
+       int evt_stream_div, pos;
+
+       /* Find the closest power of two to the divisor */
+       evt_stream_div = arch_timer_rate / ARCH_TIMER_EVT_STREAM_FREQ;
+       pos = fls(evt_stream_div);
+       if (pos > 1 && !(evt_stream_div & (1 << (pos - 2))))
+               pos--;
+       /* enable event stream */
+       arch_timer_evtstrm_enable(min(pos, 15));
+}
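+/*
+ * Worked example (rates assumed for illustration): with
+ * arch_timer_rate = 50 MHz and ARCH_TIMER_EVT_STREAM_FREQ = 10 kHz,
+ * evt_stream_div = 5000.  fls(5000) = 13, and bit 11 (2048) of 5000
+ * is clear, so pos is decremented to 12: 2^12 = 4096 is closer to
+ * 5000 than 2^13 = 8192, and min(12, 15) = 12 is programmed.
+ */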
+
 static int arch_timer_setup(struct clock_event_device *clk)
 {
        __arch_timer_setup(ARCH_CP15_TIMER, clk);
@@ -307,6 +322,8 @@ static int arch_timer_setup(struct clock_event_device *clk)
        }
 
        arch_counter_set_user_access();
+       if (IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM))
+               arch_timer_configure_evtstream();
 
        return 0;
 }
@@ -389,7 +406,7 @@ static struct clocksource clocksource_counter = {
        .rating = 400,
        .read   = arch_counter_read,
        .mask   = CLOCKSOURCE_MASK(56),
-       .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
+       .flags  = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_SUSPEND_NONSTOP,
 };
 
 static struct cyclecounter cyclecounter = {
@@ -419,6 +436,9 @@ static void __init arch_counter_register(unsigned type)
        cyclecounter.mult = clocksource_counter.mult;
        cyclecounter.shift = clocksource_counter.shift;
        timecounter_init(&timecounter, &cyclecounter, start_count);
+
+       /* 56 bits minimum, so we assume worst case rollover */
+       sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate);
 }
 
 static void arch_timer_stop(struct clock_event_device *clk)
@@ -460,6 +480,33 @@ static struct notifier_block arch_timer_cpu_nb = {
        .notifier_call = arch_timer_cpu_notify,
 };
 
+#ifdef CONFIG_CPU_PM
+static unsigned int saved_cntkctl;
+static int arch_timer_cpu_pm_notify(struct notifier_block *self,
+                                   unsigned long action, void *hcpu)
+{
+       if (action == CPU_PM_ENTER)
+               saved_cntkctl = arch_timer_get_cntkctl();
+       else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT)
+               arch_timer_set_cntkctl(saved_cntkctl);
+       return NOTIFY_OK;
+}
+
+static struct notifier_block arch_timer_cpu_pm_notifier = {
+       .notifier_call = arch_timer_cpu_pm_notify,
+};
+
+static int __init arch_timer_cpu_pm_init(void)
+{
+       return cpu_pm_register_notifier(&arch_timer_cpu_pm_notifier);
+}
+#else
+static int __init arch_timer_cpu_pm_init(void)
+{
+       return 0;
+}
+#endif
+
 static int __init arch_timer_register(void)
 {
        int err;
@@ -499,11 +546,17 @@ static int __init arch_timer_register(void)
        if (err)
                goto out_free_irq;
 
+       err = arch_timer_cpu_pm_init();
+       if (err)
+               goto out_unreg_notify;
+
        /* Immediately configure the timer on the boot CPU */
        arch_timer_setup(this_cpu_ptr(arch_timer_evt));
 
        return 0;
 
+out_unreg_notify:
+       unregister_cpu_notifier(&arch_timer_cpu_nb);
 out_free_irq:
        if (arch_timer_use_virtual)
                free_percpu_irq(arch_timer_ppi[VIRT_PPI], arch_timer_evt);
index b66c1f36066ced2f7bc8ae68677397fddfb3f4f0..c639b1a9e99686fb3333fc825c65d84e3cc52bae 100644 (file)
@@ -169,7 +169,8 @@ static int gt_clockevents_init(struct clock_event_device *clk)
        int cpu = smp_processor_id();
 
        clk->name = "arm_global_timer";
-       clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
+       clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
+               CLOCK_EVT_FEAT_PERCPU;
        clk->set_mode = gt_clockevent_set_mode;
        clk->set_next_event = gt_clockevent_set_next_event;
        clk->cpumask = cpumask_of(cpu);
index 07ea7ce900dc19a10491732927c162ae8212cda2..26ed331b1aad55be16f931f25f08f8a7e141cd25 100644 (file)
@@ -49,7 +49,7 @@ struct bcm2835_timer {
 
 static void __iomem *system_clock __read_mostly;
 
-static u32 notrace bcm2835_sched_read(void)
+static u64 notrace bcm2835_sched_read(void)
 {
        return readl_relaxed(system_clock);
 }
@@ -110,7 +110,7 @@ static void __init bcm2835_timer_init(struct device_node *node)
                panic("Can't read clock-frequency");
 
        system_clock = base + REG_COUNTER_LO;
-       setup_sched_clock(bcm2835_sched_read, 32, freq);
+       sched_clock_register(bcm2835_sched_read, 32, freq);
 
        clocksource_mmio_init(base + REG_COUNTER_LO, node->name,
                freq, 300, 32, clocksource_mmio_readl_up);
index a9fd4ad2567426228c34a5910bf457a6a4bb584a..b375106844d83e6bb2321e8768b02b92e2031efa 100644 (file)
@@ -53,7 +53,7 @@ static struct clocksource clocksource_dbx500_prcmu = {
 
 #ifdef CONFIG_CLKSRC_DBX500_PRCMU_SCHED_CLOCK
 
-static u32 notrace dbx500_prcmu_sched_clock_read(void)
+static u64 notrace dbx500_prcmu_sched_clock_read(void)
 {
        if (unlikely(!clksrc_dbx500_timer_base))
                return 0;
@@ -81,8 +81,7 @@ void __init clksrc_dbx500_prcmu_init(void __iomem *base)
                       clksrc_dbx500_timer_base + PRCMU_TIMER_REF);
        }
 #ifdef CONFIG_CLKSRC_DBX500_PRCMU_SCHED_CLOCK
-       setup_sched_clock(dbx500_prcmu_sched_clock_read,
-                        32, RATE_32K);
+       sched_clock_register(dbx500_prcmu_sched_clock_read, 32, RATE_32K);
 #endif
        clocksource_register_hz(&clocksource_dbx500_prcmu, RATE_32K);
 }
index b9ddd9e3a2f599e2cc7424c1eac18d4280b2f850..35639cf4e5a208af89c9b692eae32227bcefc5d3 100644 (file)
@@ -35,5 +35,6 @@ void __init clocksource_of_init(void)
 
                init_func = match->data;
                init_func(np);
+               of_node_put(np);
        }
 }
index 4cbae4f762b199db708a56d4814a8be578d14024..45ba8aecc7298428016dd6d5601482362cce0c4e 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/clk.h>
 #include <linux/sched_clock.h>
 
-static void timer_get_base_and_rate(struct device_node *np,
+static void __init timer_get_base_and_rate(struct device_node *np,
                                    void __iomem **base, u32 *rate)
 {
        struct clk *timer_clk;
@@ -55,11 +55,11 @@ static void timer_get_base_and_rate(struct device_node *np,
 
 try_clock_freq:
        if (of_property_read_u32(np, "clock-freq", rate) &&
-               of_property_read_u32(np, "clock-frequency", rate))
+           of_property_read_u32(np, "clock-frequency", rate))
                panic("No clock nor clock-frequency property for %s", np->name);
 }
 
-static void add_clockevent(struct device_node *event_timer)
+static void __init add_clockevent(struct device_node *event_timer)
 {
        void __iomem *iobase;
        struct dw_apb_clock_event_device *ced;
@@ -82,7 +82,7 @@ static void add_clockevent(struct device_node *event_timer)
 static void __iomem *sched_io_base;
 static u32 sched_rate;
 
-static void add_clocksource(struct device_node *source_timer)
+static void __init add_clocksource(struct device_node *source_timer)
 {
        void __iomem *iobase;
        struct dw_apb_clocksource *cs;
@@ -106,7 +106,7 @@ static void add_clocksource(struct device_node *source_timer)
        sched_rate = rate;
 }
 
-static u32 read_sched_clock(void)
+static u64 read_sched_clock(void)
 {
        return __raw_readl(sched_io_base);
 }
@@ -117,7 +117,7 @@ static const struct of_device_id sptimer_ids[] __initconst = {
        { /* Sentinel */ },
 };
 
-static void init_sched_clock(void)
+static void __init init_sched_clock(void)
 {
        struct device_node *sched_timer;
 
@@ -128,7 +128,7 @@ static void init_sched_clock(void)
                of_node_put(sched_timer);
        }
 
-       setup_sched_clock(read_sched_clock, 32, sched_rate);
+       sched_clock_register(read_sched_clock, 32, sched_rate);
 }
 
 static int num_called;
@@ -138,12 +138,10 @@ static void __init dw_apb_timer_init(struct device_node *timer)
        case 0:
                pr_debug("%s: found clockevent timer\n", __func__);
                add_clockevent(timer);
-               of_node_put(timer);
                break;
        case 1:
                pr_debug("%s: found clocksource timer\n", __func__);
                add_clocksource(timer);
-               of_node_put(timer);
                init_sched_clock();
                break;
        default:
index 0f5e65f74dc3ee5f9749c6c782628016bb884f97..445b68a01dc5b3d74d7fc5293e16363a8a9adda0 100644 (file)
@@ -222,7 +222,7 @@ static struct clocksource clocksource_mxs = {
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static u32 notrace mxs_read_sched_clock_v2(void)
+static u64 notrace mxs_read_sched_clock_v2(void)
 {
        return ~readl_relaxed(mxs_timrot_base + HW_TIMROT_RUNNING_COUNTn(1));
 }
@@ -236,7 +236,7 @@ static int __init mxs_clocksource_init(struct clk *timer_clk)
        else {
                clocksource_mmio_init(mxs_timrot_base + HW_TIMROT_RUNNING_COUNTn(1),
                        "mxs_timer", c, 200, 32, clocksource_mmio_readl_down);
-               setup_sched_clock(mxs_read_sched_clock_v2, 32, c);
+               sched_clock_register(mxs_read_sched_clock_v2, 32, c);
        }
 
        return 0;
index 1b74bea12385a20acd695f2d422665be1dfd90f3..ed7b73b508e096bb06a3e87a1843aa813f835430 100644 (file)
@@ -76,7 +76,7 @@ static struct delay_timer mtu_delay_timer;
  * local implementation which uses the clocksource to get some
  * better resolution when scheduling the kernel.
  */
-static u32 notrace nomadik_read_sched_clock(void)
+static u64 notrace nomadik_read_sched_clock(void)
 {
        if (unlikely(!mtu_base))
                return 0;
@@ -231,7 +231,7 @@ static void __init __nmdk_timer_init(void __iomem *base, int irq,
                       "mtu_0");
 
 #ifdef CONFIG_CLKSRC_NOMADIK_MTU_SCHED_CLOCK
-       setup_sched_clock(nomadik_read_sched_clock, 32, rate);
+       sched_clock_register(nomadik_read_sched_clock, 32, rate);
 #endif
 
        /* Timer 1 is used for events, register irq and clockevents */
index ab29476ee5f9e21dd85de872a45df747dec4a0e4..85082e8d305298ac41b085df4bdb493aca95c682 100644 (file)
@@ -331,7 +331,7 @@ static struct clocksource samsung_clocksource = {
  * this wraps around for now, since it is just a relative time
  * stamp. (Inspired by U300 implementation.)
  */
-static u32 notrace samsung_read_sched_clock(void)
+static u64 notrace samsung_read_sched_clock(void)
 {
        return samsung_clocksource_read(NULL);
 }
@@ -357,7 +357,7 @@ static void __init samsung_clocksource_init(void)
        else
                pwm.source_reg = pwm.base + pwm.source_id * 0x0c + 0x14;
 
-       setup_sched_clock(samsung_read_sched_clock,
+       sched_clock_register(samsung_read_sched_clock,
                                                pwm.variant.bits, clock_rate);
 
        samsung_clocksource.mask = CLOCKSOURCE_MASK(pwm.variant.bits);
index 8a6187225dd0e7faa9b25c14147be31f3b7a9539..00fdd11702849e042a2550dec0ac089075ada987 100644 (file)
@@ -100,7 +100,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d)
                        || tcd->clkevt.mode == CLOCK_EVT_MODE_ONESHOT) {
                __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
                __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
-               clk_disable(tcd->clk);
+               clk_disable_unprepare(tcd->clk);
        }
 
        switch (m) {
@@ -109,7 +109,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d)
         * of oneshot, we get lower overhead and improved accuracy.
         */
        case CLOCK_EVT_MODE_PERIODIC:
-               clk_enable(tcd->clk);
+               clk_prepare_enable(tcd->clk);
 
                /* slow clock, count up to RC, then irq and restart */
                __raw_writel(timer_clock
@@ -126,7 +126,7 @@ static void tc_mode(enum clock_event_mode m, struct clock_event_device *d)
                break;
 
        case CLOCK_EVT_MODE_ONESHOT:
-               clk_enable(tcd->clk);
+               clk_prepare_enable(tcd->clk);
 
                /* slow clock, count up to RC, then irq and stop */
                __raw_writel(timer_clock | ATMEL_TC_CPCSTOP
@@ -180,15 +180,22 @@ static irqreturn_t ch2_irq(int irq, void *handle)
 
 static struct irqaction tc_irqaction = {
        .name           = "tc_clkevt",
-       .flags          = IRQF_TIMER | IRQF_DISABLED,
+       .flags          = IRQF_TIMER,
        .handler        = ch2_irq,
 };
 
-static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
+static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
 {
+       int ret;
        struct clk *t2_clk = tc->clk[2];
        int irq = tc->irq[2];
 
+       /* try to enable t2 clk to avoid future errors in mode change */
+       ret = clk_prepare_enable(t2_clk);
+       if (ret)
+               return ret;
+       clk_disable_unprepare(t2_clk);
+
        clkevt.regs = tc->regs;
        clkevt.clk = t2_clk;
        tc_irqaction.dev_id = &clkevt;
@@ -197,16 +204,21 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
 
        clkevt.clkevt.cpumask = cpumask_of(0);
 
+       ret = setup_irq(irq, &tc_irqaction);
+       if (ret)
+               return ret;
+
        clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
 
-       setup_irq(irq, &tc_irqaction);
+       return ret;
 }
 
 #else /* !CONFIG_GENERIC_CLOCKEVENTS */
 
-static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
+static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
 {
        /* NOTHING */
+       return 0;
 }
 
 #endif
@@ -265,6 +277,7 @@ static int __init tcb_clksrc_init(void)
        int best_divisor_idx = -1;
        int clk32k_divisor_idx = -1;
        int i;
+       int ret;
 
        tc = atmel_tc_alloc(CONFIG_ATMEL_TCB_CLKSRC_BLOCK, clksrc.name);
        if (!tc) {
@@ -275,7 +288,11 @@ static int __init tcb_clksrc_init(void)
        pdev = tc->pdev;
 
        t0_clk = tc->clk[0];
-       clk_enable(t0_clk);
+       ret = clk_prepare_enable(t0_clk);
+       if (ret) {
+               pr_debug("can't enable T0 clk\n");
+               goto err_free_tc;
+       }
 
        /* How fast will we be counting?  Pick something over 5 MHz.  */
        rate = (u32) clk_get_rate(t0_clk);
@@ -313,17 +330,39 @@ static int __init tcb_clksrc_init(void)
                /* tclib will give us three clocks no matter what the
                 * underlying platform supports.
                 */
-               clk_enable(tc->clk[1]);
+               ret = clk_prepare_enable(tc->clk[1]);
+               if (ret) {
+                       pr_debug("can't enable T1 clk\n");
+                       goto err_disable_t0;
+               }
                /* setup both channel 0 & 1 */
                tcb_setup_dual_chan(tc, best_divisor_idx);
        }
 
        /* and away we go! */
-       clocksource_register_hz(&clksrc, divided_rate);
+       ret = clocksource_register_hz(&clksrc, divided_rate);
+       if (ret)
+               goto err_disable_t1;
 
        /* channel 2:  periodic and oneshot timer support */
-       setup_clkevents(tc, clk32k_divisor_idx);
+       ret = setup_clkevents(tc, clk32k_divisor_idx);
+       if (ret)
+               goto err_unregister_clksrc;
 
        return 0;
+
+err_unregister_clksrc:
+       clocksource_unregister(&clksrc);
+
+err_disable_t1:
+       if (!tc->tcb_config || tc->tcb_config->counter_width != 32)
+               clk_disable_unprepare(tc->clk[1]);
+
+err_disable_t0:
+       clk_disable_unprepare(t0_clk);
+
+err_free_tc:
+       atmel_tc_free(tc);
+       return ret;
 }
 arch_initcall(tcb_clksrc_init);
index 93961703b887e9a82ccde7d42bd981ce5b7a83aa..642849256d82ecabd0d7222d8a960280ce3f3e24 100644 (file)
@@ -98,7 +98,7 @@ static struct clock_event_device tegra_clockevent = {
        .set_mode       = tegra_timer_set_mode,
 };
 
-static u32 notrace tegra_read_sched_clock(void)
+static u64 notrace tegra_read_sched_clock(void)
 {
        return timer_readl(TIMERUS_CNTR_1US);
 }
@@ -181,8 +181,6 @@ static void __init tegra20_init_timer(struct device_node *np)
                rate = clk_get_rate(clk);
        }
 
-       of_node_put(np);
-
        switch (rate) {
        case 12000000:
                timer_writel(0x000b, TIMERUS_USEC_CFG);
@@ -200,7 +198,7 @@ static void __init tegra20_init_timer(struct device_node *np)
                WARN(1, "Unknown clock rate");
        }
 
-       setup_sched_clock(tegra_read_sched_clock, 32, 1000000);
+       sched_clock_register(tegra_read_sched_clock, 32, 1000000);
 
        if (clocksource_mmio_init(timer_reg_base + TIMERUS_CNTR_1US,
                "timer_us", 1000000, 300, 32, clocksource_mmio_readl_up)) {
@@ -241,8 +239,6 @@ static void __init tegra20_init_rtc(struct device_node *np)
        else
                clk_prepare_enable(clk);
 
-       of_node_put(np);
-
        register_persistent_clock(NULL, tegra_read_persistent_clock);
 }
 CLOCKSOURCE_OF_DECLARE(tegra20_rtc, "nvidia,tegra20-rtc", tegra20_init_rtc);
index 0198504ef6b02388c8847bc4897eca4e58328479..d8e47e5027858faf9f99c0b3ad99883bd7bc9cc2 100644 (file)
@@ -96,7 +96,7 @@ static void local_timer_ctrl_clrset(u32 clr, u32 set)
                local_base + TIMER_CTRL_OFF);
 }
 
-static u32 notrace armada_370_xp_read_sched_clock(void)
+static u64 notrace armada_370_xp_read_sched_clock(void)
 {
        return ~readl(timer_base + TIMER0_VAL_OFF);
 }
@@ -258,7 +258,7 @@ static void __init armada_370_xp_timer_common_init(struct device_node *np)
        /*
         * Set scale and timer for sched_clock.
         */
-       setup_sched_clock(armada_370_xp_read_sched_clock, 32, timer_clk);
+       sched_clock_register(armada_370_xp_read_sched_clock, 32, timer_clk);
 
        /*
         * Setup free-running clocksource timer (interrupts
index ef3cfb269d8bce34091e1fde228da32821d41925..8a492d34ff9f52813655b056bad5c17f9c305220 100644 (file)
@@ -165,9 +165,9 @@ static struct irqaction sirfsoc_timer_irq = {
 };
 
 /* Overwrite weak default sched_clock with more precise one */
-static u32 notrace sirfsoc_read_sched_clock(void)
+static u64 notrace sirfsoc_read_sched_clock(void)
 {
-       return (u32)(sirfsoc_timer_read(NULL) & 0xffffffff);
+       return sirfsoc_timer_read(NULL);
 }
 
 static void __init sirfsoc_clockevent_init(void)
@@ -206,7 +206,7 @@ static void __init sirfsoc_prima2_timer_init(struct device_node *np)
 
        BUG_ON(clocksource_register_hz(&sirfsoc_clocksource, CLOCK_TICK_RATE));
 
-       setup_sched_clock(sirfsoc_read_sched_clock, 32, CLOCK_TICK_RATE);
+       sched_clock_register(sirfsoc_read_sched_clock, 64, CLOCK_TICK_RATE);
 
        BUG_ON(setup_irq(sirfsoc_timer_irq.irq, &sirfsoc_timer_irq));
 
index 587e0202a70b263677abe66f2eff513fe491619a..02821b06a39e33be4cb403b202286deb530ff399 100644 (file)
@@ -52,7 +52,7 @@ static inline void pit_irq_acknowledge(void)
        __raw_writel(PITTFLG_TIF, clkevt_base + PITTFLG);
 }
 
-static unsigned int pit_read_sched_clock(void)
+static u64 pit_read_sched_clock(void)
 {
        return __raw_readl(clksrc_base + PITCVAL);
 }
@@ -64,7 +64,7 @@ static int __init pit_clocksource_init(unsigned long rate)
        __raw_writel(~0UL, clksrc_base + PITLDVAL);
        __raw_writel(PITTCTRL_TEN, clksrc_base + PITTCTRL);
 
-       setup_sched_clock(pit_read_sched_clock, 32, rate);
+       sched_clock_register(pit_read_sched_clock, 32, rate);
        return clocksource_mmio_init(clksrc_base + PITCVAL, "vf-pit", rate,
                        300, 32, clocksource_mmio_readl_down);
 }
index 64f553f04fa4b0d44f8218b2d98301ccb0c08c2b..ad3c0e83a77956431541315108b093f2ca282b52 100644 (file)
@@ -137,14 +137,12 @@ static void __init vt8500_timer_init(struct device_node *np)
        if (!regbase) {
                pr_err("%s: Missing iobase description in Device Tree\n",
                                                                __func__);
-               of_node_put(np);
                return;
        }
        timer_irq = irq_of_parse_and_map(np, 0);
        if (!timer_irq) {
                pr_err("%s: Missing irq description in Device Tree\n",
                                                                __func__);
-               of_node_put(np);
                return;
        }
 
diff --git a/drivers/firmware/efi/efi-stub-helper.c b/drivers/firmware/efi/efi-stub-helper.c
new file mode 100644 (file)
index 0000000..b6bffbf
--- /dev/null
@@ -0,0 +1,636 @@
+/*
+ * Helper functions used by the EFI stub on multiple
+ * architectures. This should be #included by the EFI stub
+ * implementation files.
+ *
+ * Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available
+ * under the terms of the GNU General Public License version 2.
+ *
+ */
+#define EFI_READ_CHUNK_SIZE    (1024 * 1024)
+
+struct file_info {
+       efi_file_handle_t *handle;
+       u64 size;
+};
+
+
+
+
+static void efi_char16_printk(efi_system_table_t *sys_table_arg,
+                             efi_char16_t *str)
+{
+       struct efi_simple_text_output_protocol *out;
+
+       out = (struct efi_simple_text_output_protocol *)sys_table_arg->con_out;
+       efi_call_phys2(out->output_string, out, str);
+}
+
+static void efi_printk(efi_system_table_t *sys_table_arg, char *str)
+{
+       char *s8;
+
+       for (s8 = str; *s8; s8++) {
+               efi_char16_t ch[2] = { 0 };
+
+               ch[0] = *s8;
+               if (*s8 == '\n') {
+                       efi_char16_t nl[2] = { '\r', 0 };
+                       efi_char16_printk(sys_table_arg, nl);
+               }
+
+               efi_char16_printk(sys_table_arg, ch);
+       }
+}
+
+
+static efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg,
+                                      efi_memory_desc_t **map,
+                                      unsigned long *map_size,
+                                      unsigned long *desc_size,
+                                      u32 *desc_ver,
+                                      unsigned long *key_ptr)
+{
+       efi_memory_desc_t *m = NULL;
+       efi_status_t status;
+       unsigned long key;
+       u32 desc_version;
+
+       *map_size = sizeof(*m) * 32;
+again:
+       /*
+        * Add an additional efi_memory_desc_t because we're doing an
+        * allocation which may be in a new descriptor region.
+        */
+       *map_size += sizeof(*m);
+       status = efi_call_phys3(sys_table_arg->boottime->allocate_pool,
+                               EFI_LOADER_DATA, *map_size, (void **)&m);
+       if (status != EFI_SUCCESS)
+               goto fail;
+
+       status = efi_call_phys5(sys_table_arg->boottime->get_memory_map,
+                               map_size, m, &key, desc_size, &desc_version);
+       if (status == EFI_BUFFER_TOO_SMALL) {
+               efi_call_phys1(sys_table_arg->boottime->free_pool, m);
+               goto again;
+       }
+
+       if (status != EFI_SUCCESS)
+               efi_call_phys1(sys_table_arg->boottime->free_pool, m);
+       if (key_ptr && status == EFI_SUCCESS)
+               *key_ptr = key;
+       if (desc_ver && status == EFI_SUCCESS)
+               *desc_ver = desc_version;
+
+fail:
+       *map = m;
+       return status;
+}
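+/*
+ * The allocate/retry loop above is the usual GetMemoryMap() pattern:
+ * start from a guess of 32 descriptors, and on EFI_BUFFER_TOO_SMALL
+ * free the buffer and retry with the size the firmware wrote back,
+ * plus one extra descriptor because the free/allocate pair can itself
+ * split a memory region.
+ */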
+
+/*
+ * Allocate at the highest possible address that is not above 'max'.
+ */
+static efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg,
+                              unsigned long size, unsigned long align,
+                              unsigned long *addr, unsigned long max)
+{
+       unsigned long map_size, desc_size;
+       efi_memory_desc_t *map;
+       efi_status_t status;
+       unsigned long nr_pages;
+       u64 max_addr = 0;
+       int i;
+
+       status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size,
+                                   NULL, NULL);
+       if (status != EFI_SUCCESS)
+               goto fail;
+
+       /*
+        * Enforce minimum alignment that EFI requires when requesting
+        * a specific address.  We are doing page-based allocations,
+        * so we must be aligned to a page.
+        */
+       if (align < EFI_PAGE_SIZE)
+               align = EFI_PAGE_SIZE;
+
+       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+again:
+       for (i = 0; i < map_size / desc_size; i++) {
+               efi_memory_desc_t *desc;
+               unsigned long m = (unsigned long)map;
+               u64 start, end;
+
+               desc = (efi_memory_desc_t *)(m + (i * desc_size));
+               if (desc->type != EFI_CONVENTIONAL_MEMORY)
+                       continue;
+
+               if (desc->num_pages < nr_pages)
+                       continue;
+
+               start = desc->phys_addr;
+               end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+               if ((start + size) > end || (start + size) > max)
+                       continue;
+
+               if (end - size > max)
+                       end = max;
+
+               if (round_down(end - size, align) < start)
+                       continue;
+
+               start = round_down(end - size, align);
+
+               /*
+                * Don't allocate at 0x0. It will confuse code that
+                * checks pointers against NULL.
+                */
+               if (start == 0x0)
+                       continue;
+
+               if (start > max_addr)
+                       max_addr = start;
+       }
+
+       if (!max_addr)
+               status = EFI_NOT_FOUND;
+       else {
+               status = efi_call_phys4(sys_table_arg->boottime->allocate_pages,
+                                       EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+                                       nr_pages, &max_addr);
+               if (status != EFI_SUCCESS) {
+                       max = max_addr;
+                       max_addr = 0;
+                       goto again;
+               }
+
+               *addr = max_addr;
+       }
+
+       efi_call_phys1(sys_table_arg->boottime->free_pool, map);
+
+fail:
+       return status;
+}
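+/*
+ * An illustrative walk of the loop above: each EFI_CONVENTIONAL_MEMORY
+ * region large enough for the request contributes one candidate,
+ * roughly the highest 'align'-aligned address in the region at which
+ * [addr, addr + size) still fits below 'max', and the highest
+ * candidate across all regions wins.  If the firmware then refuses
+ * that address, 'max' is lowered to the failed candidate and the scan
+ * repeats.
+ */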
+
+/*
+ * Allocate at the lowest possible address.
+ */
+static efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg,
+                             unsigned long size, unsigned long align,
+                             unsigned long *addr)
+{
+       unsigned long map_size, desc_size;
+       efi_memory_desc_t *map;
+       efi_status_t status;
+       unsigned long nr_pages;
+       int i;
+
+       status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size,
+                                   NULL, NULL);
+       if (status != EFI_SUCCESS)
+               goto fail;
+
+       /*
+        * Enforce minimum alignment that EFI requires when requesting
+        * a specific address.  We are doing page-based allocations,
+        * so we must be aligned to a page.
+        */
+       if (align < EFI_PAGE_SIZE)
+               align = EFI_PAGE_SIZE;
+
+       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+       for (i = 0; i < map_size / desc_size; i++) {
+               efi_memory_desc_t *desc;
+               unsigned long m = (unsigned long)map;
+               u64 start, end;
+
+               desc = (efi_memory_desc_t *)(m + (i * desc_size));
+
+               if (desc->type != EFI_CONVENTIONAL_MEMORY)
+                       continue;
+
+               if (desc->num_pages < nr_pages)
+                       continue;
+
+               start = desc->phys_addr;
+               end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+               /*
+                * Don't allocate at 0x0. It will confuse code that
+                * checks pointers against NULL. Skip the first 8
+                * bytes so we start at a nice even number.
+                */
+               if (start == 0x0)
+                       start += 8;
+
+               start = round_up(start, align);
+               if ((start + size) > end)
+                       continue;
+
+               status = efi_call_phys4(sys_table_arg->boottime->allocate_pages,
+                                       EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+                                       nr_pages, &start);
+               if (status == EFI_SUCCESS) {
+                       *addr = start;
+                       break;
+               }
+       }
+
+       if (i == map_size / desc_size)
+               status = EFI_NOT_FOUND;
+
+       efi_call_phys1(sys_table_arg->boottime->free_pool, map);
+fail:
+       return status;
+}
+
+static void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
+                    unsigned long addr)
+{
+       unsigned long nr_pages;
+
+       if (!size)
+               return;
+
+       nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+       efi_call_phys2(sys_table_arg->boottime->free_pages, addr, nr_pages);
+}
+
+
+/*
+ * Check the cmdline for LILO-style file= arguments.
+ *
+ * We only support loading a file from the same filesystem as
+ * the kernel image.
+ */
+static efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
+                                        efi_loaded_image_t *image,
+                                        char *cmd_line, char *option_string,
+                                        unsigned long max_addr,
+                                        unsigned long *load_addr,
+                                        unsigned long *load_size)
+{
+       struct file_info *files;
+       unsigned long file_addr;
+       efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+       u64 file_size_total;
+       efi_file_io_interface_t *io;
+       efi_file_handle_t *fh;
+       efi_status_t status;
+       int nr_files;
+       char *str;
+       int i, j, k;
+
+       file_addr = 0;
+       file_size_total = 0;
+
+       str = cmd_line;
+
+       j = 0;                  /* See close_handles */
+
+       if (!load_addr || !load_size)
+               return EFI_INVALID_PARAMETER;
+
+       *load_addr = 0;
+       *load_size = 0;
+
+       if (!str || !*str)
+               return EFI_SUCCESS;
+
+       for (nr_files = 0; *str; nr_files++) {
+               str = strstr(str, option_string);
+               if (!str)
+                       break;
+
+               str += strlen(option_string);
+
+               /* Skip any leading slashes */
+               while (*str == '/' || *str == '\\')
+                       str++;
+
+               while (*str && *str != ' ' && *str != '\n')
+                       str++;
+       }
+
+       if (!nr_files)
+               return EFI_SUCCESS;
+
+       status = efi_call_phys3(sys_table_arg->boottime->allocate_pool,
+                               EFI_LOADER_DATA,
+                               nr_files * sizeof(*files),
+                               (void **)&files);
+       if (status != EFI_SUCCESS) {
+               efi_printk(sys_table_arg, "Failed to alloc mem for file handle list\n");
+               goto fail;
+       }
+
+       str = cmd_line;
+       for (i = 0; i < nr_files; i++) {
+               struct file_info *file;
+               efi_file_handle_t *h;
+               efi_file_info_t *info;
+               efi_char16_t filename_16[256];
+               unsigned long info_sz;
+               efi_guid_t info_guid = EFI_FILE_INFO_ID;
+               efi_char16_t *p;
+               u64 file_sz;
+
+               str = strstr(str, option_string);
+               if (!str)
+                       break;
+
+               str += strlen(option_string);
+
+               file = &files[i];
+               p = filename_16;
+
+               /* Skip any leading slashes */
+               while (*str == '/' || *str == '\\')
+                       str++;
+
+               while (*str && *str != ' ' && *str != '\n') {
+                       if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16))
+                               break;
+
+                       if (*str == '/') {
+                               *p++ = '\\';
+                               str++;
+                       } else {
+                               *p++ = *str++;
+                       }
+               }
+
+               *p = '\0';
+
+               /* Only open the volume once. */
+               if (!i) {
+                       efi_boot_services_t *boottime;
+
+                       boottime = sys_table_arg->boottime;
+
+                       status = efi_call_phys3(boottime->handle_protocol,
+                                       image->device_handle, &fs_proto,
+                                               (void **)&io);
+                       if (status != EFI_SUCCESS) {
+                               efi_printk(sys_table_arg, "Failed to handle fs_proto\n");
+                               goto free_files;
+                       }
+
+                       status = efi_call_phys2(io->open_volume, io, &fh);
+                       if (status != EFI_SUCCESS) {
+                               efi_printk(sys_table_arg, "Failed to open volume\n");
+                               goto free_files;
+                       }
+               }
+
+               status = efi_call_phys5(fh->open, fh, &h, filename_16,
+                                       EFI_FILE_MODE_READ, (u64)0);
+               if (status != EFI_SUCCESS) {
+                       efi_printk(sys_table_arg, "Failed to open file: ");
+                       efi_char16_printk(sys_table_arg, filename_16);
+                       efi_printk(sys_table_arg, "\n");
+                       goto close_handles;
+               }
+
+               file->handle = h;
+
+               info_sz = 0;
+               status = efi_call_phys4(h->get_info, h, &info_guid,
+                                       &info_sz, NULL);
+               if (status != EFI_BUFFER_TOO_SMALL) {
+                       efi_printk(sys_table_arg, "Failed to get file info size\n");
+                       goto close_handles;
+               }
+
+grow:
+               status = efi_call_phys3(sys_table_arg->boottime->allocate_pool,
+                                       EFI_LOADER_DATA, info_sz,
+                                       (void **)&info);
+               if (status != EFI_SUCCESS) {
+                       efi_printk(sys_table_arg, "Failed to alloc mem for file info\n");
+                       goto close_handles;
+               }
+
+               status = efi_call_phys4(h->get_info, h, &info_guid,
+                                       &info_sz, info);
+               if (status == EFI_BUFFER_TOO_SMALL) {
+                       efi_call_phys1(sys_table_arg->boottime->free_pool,
+                                      info);
+                       goto grow;
+               }
+
+               file_sz = info->file_size;
+               efi_call_phys1(sys_table_arg->boottime->free_pool, info);
+
+               if (status != EFI_SUCCESS) {
+                       efi_printk(sys_table_arg, "Failed to get file info\n");
+                       goto close_handles;
+               }
+
+               file->size = file_sz;
+               file_size_total += file_sz;
+       }
+
+       if (file_size_total) {
+               unsigned long addr;
+
+               /*
+                * Multiple files need to be at consecutive addresses in
+                * memory, so allocate enough memory for all of them up front.
+                */
+               status = efi_high_alloc(sys_table_arg, file_size_total, 0x1000,
+                                   &file_addr, max_addr);
+               if (status != EFI_SUCCESS) {
+                       efi_printk(sys_table_arg, "Failed to alloc highmem for files\n");
+                       goto close_handles;
+               }
+
+               /* We've run out of free low memory. */
+               if (file_addr > max_addr) {
+                       efi_printk(sys_table_arg, "We've run out of free low memory\n");
+                       status = EFI_INVALID_PARAMETER;
+                       goto free_file_total;
+               }
+
+               addr = file_addr;
+               for (j = 0; j < nr_files; j++) {
+                       unsigned long size;
+
+                       size = files[j].size;
+                       while (size) {
+                               unsigned long chunksize;
+                               if (size > EFI_READ_CHUNK_SIZE)
+                                       chunksize = EFI_READ_CHUNK_SIZE;
+                               else
+                                       chunksize = size;
+                               status = efi_call_phys3(fh->read,
+                                                       files[j].handle,
+                                                       &chunksize,
+                                                       (void *)addr);
+                               if (status != EFI_SUCCESS) {
+                                       efi_printk(sys_table_arg, "Failed to read file\n");
+                                       goto free_file_total;
+                               }
+                               addr += chunksize;
+                               size -= chunksize;
+                       }
+
+                       efi_call_phys1(fh->close, files[j].handle);
+               }
+
+       }
+
+       efi_call_phys1(sys_table_arg->boottime->free_pool, files);
+
+       *load_addr = file_addr;
+       *load_size = file_size_total;
+
+       return status;
+
+free_file_total:
+       efi_free(sys_table_arg, file_size_total, file_addr);
+
+close_handles:
+       for (k = j; k < i; k++)
+               efi_call_phys1(fh->close, files[k].handle);
+free_files:
+       efi_call_phys1(sys_table_arg->boottime->free_pool, files);
+fail:
+       *load_addr = 0;
+       *load_size = 0;
+
+       return status;
+}
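+/*
+ * Example (command line and option string assumed for illustration):
+ * called with option_string "initrd=", a command line of
+ *
+ *     initrd=boot/initrd.img initrd=boot/ucode.img
+ *
+ * opens both files on the volume the kernel image was loaded from
+ * (with '/' converted to '\' for the EFI file protocol), loads them
+ * back to back into one allocation, and returns the base address and
+ * total size through *load_addr and *load_size.
+ */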
+/*
+ * Relocate a kernel image, either compressed or uncompressed.
+ * In the ARM64 case, all kernel images are currently
+ * uncompressed, and as such when we relocate it we need to
+ * allocate additional space for the BSS segment. Any low
+ * memory that this function should avoid needs to be marked
+ * unavailable in the EFI memory map, because if the preferred
+ * address is not available, the lowest available address will
+ * be used instead.
+ */
+static efi_status_t efi_relocate_kernel(efi_system_table_t *sys_table_arg,
+                                       unsigned long *image_addr,
+                                       unsigned long image_size,
+                                       unsigned long alloc_size,
+                                       unsigned long preferred_addr,
+                                       unsigned long alignment)
+{
+       unsigned long cur_image_addr;
+       unsigned long new_addr = 0;
+       efi_status_t status;
+       unsigned long nr_pages;
+       efi_physical_addr_t efi_addr = preferred_addr;
+
+       if (!image_addr || !image_size || !alloc_size)
+               return EFI_INVALID_PARAMETER;
+       if (alloc_size < image_size)
+               return EFI_INVALID_PARAMETER;
+
+       cur_image_addr = *image_addr;
+
+       /*
+        * The EFI firmware loader could have placed the kernel image
+        * anywhere in memory, but the kernel has restrictions on the
+        * max physical address it can run at.  Some architectures
+        * also have a preferred address, so first try to relocate
+        * to the preferred address.  If that fails, allocate as low
+        * as possible while respecting the required alignment.
+        */
+       nr_pages = round_up(alloc_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+       status = efi_call_phys4(sys_table_arg->boottime->allocate_pages,
+                               EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+                               nr_pages, &efi_addr);
+       new_addr = efi_addr;
+       /*
+        * If preferred address allocation failed allocate as low as
+        * possible.
+        */
+       if (status != EFI_SUCCESS) {
+               status = efi_low_alloc(sys_table_arg, alloc_size, alignment,
+                                      &new_addr);
+       }
+       if (status != EFI_SUCCESS) {
+               efi_printk(sys_table_arg, "ERROR: Failed to allocate usable memory for kernel.\n");
+               return status;
+       }
+
+       /*
+        * We know source/dest won't overlap since both memory ranges
+        * have been allocated by UEFI, so we can safely use memcpy.
+        */
+       memcpy((void *)new_addr, (void *)cur_image_addr, image_size);
+
+       /* Return the new address of the relocated image. */
+       *image_addr = new_addr;
+
+       return status;
+}
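+/*
+ * A sketch of a typical call (names and values assumed for
+ * illustration, not taken from this file):
+ *
+ *     status = efi_relocate_kernel(sys_table, &image_addr, image_size,
+ *                                  image_size + bss_size,
+ *                                  PREFERRED_ADDR, EFI_PAGE_SIZE);
+ *
+ * which first tries PREFERRED_ADDR and, failing that, takes the
+ * lowest suitably aligned region the firmware will grant.
+ */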
+
+/*
+ * Convert the Unicode UEFI command line to ASCII to pass to the kernel.
+ * The size of the allocated memory is returned in *cmd_line_len.
+ * Returns NULL on error.
+ */
+static char *efi_convert_cmdline_to_ascii(efi_system_table_t *sys_table_arg,
+                                     efi_loaded_image_t *image,
+                                     int *cmd_line_len)
+{
+       u16 *s2;
+       u8 *s1 = NULL;
+       unsigned long cmdline_addr = 0;
+       int load_options_size = image->load_options_size / 2; /* ASCII */
+       void *options = image->load_options;
+       int options_size = 0;
+       efi_status_t status;
+       int i;
+       u16 zero = 0;
+
+       if (options) {
+               s2 = options;
+               while (*s2 && *s2 != '\n' && options_size < load_options_size) {
+                       s2++;
+                       options_size++;
+               }
+       }
+
+       if (options_size == 0) {
+               /* No command line options, so return an empty string */
+               options_size = 1;
+               options = &zero;
+       }
+
+       options_size++;  /* NUL termination */
+#ifdef CONFIG_ARM
+       /*
+        * For ARM, allocate at a high address to avoid reserved
+        * regions at low addresses that we don't know the specifics of
+        * at the time we are processing the command line.
+        */
+       status = efi_high_alloc(sys_table_arg, options_size, 0,
+                           &cmdline_addr, 0xfffff000);
+#else
+       status = efi_low_alloc(sys_table_arg, options_size, 0,
+                           &cmdline_addr);
+#endif
+       if (status != EFI_SUCCESS)
+               return NULL;
+
+       s1 = (u8 *)cmdline_addr;
+       s2 = (u16 *)options;
+
+       for (i = 0; i < options_size - 1; i++)
+               *s1++ = *s2++;
+
+       *s1 = '\0';
+
+       *cmd_line_len = options_size;
+       return (char *)cmdline_addr;
+}
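
The copy loop above narrows each UTF-16 code unit to its low byte, which is lossless for the ASCII command lines UEFI loaders normally pass. A minimal userspace sketch of the same narrowing copy (the helper name is hypothetical, not the stub's API):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Narrow a NUL-terminated UTF-16 string to ASCII by keeping low bytes. */
static void utf16_to_ascii(const uint16_t *src, char *dst, size_t n)
{
	size_t i;

	for (i = 0; i + 1 < n && src[i]; i++)
		dst[i] = (char)src[i];	/* drop the high byte */
	dst[i] = '\0';
}

int main(void)
{
	const uint16_t opts[] = { 'r', 'o', 'o', 't', '=', '/', 'd', 'e', 'v', 0 };
	char buf[16];

	utf16_to_ascii(opts, buf, sizeof(buf));
	printf("%s\n", buf);		/* prints "root=/dev" */
	return 0;
}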
index 5145fa344ad53a110b5f1da2a4862dcbb004c3e8..2e2fbdec0845414df2f32a8c0f862b4149ce3198 100644 (file)
  * This file is released under the GPLv2.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kobject.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/efi.h>
+#include <linux/io.h>
+
+struct efi __read_mostly efi = {
+       .mps        = EFI_INVALID_TABLE_ADDR,
+       .acpi       = EFI_INVALID_TABLE_ADDR,
+       .acpi20     = EFI_INVALID_TABLE_ADDR,
+       .smbios     = EFI_INVALID_TABLE_ADDR,
+       .sal_systab = EFI_INVALID_TABLE_ADDR,
+       .boot_info  = EFI_INVALID_TABLE_ADDR,
+       .hcdp       = EFI_INVALID_TABLE_ADDR,
+       .uga        = EFI_INVALID_TABLE_ADDR,
+       .uv_systab  = EFI_INVALID_TABLE_ADDR,
+};
+EXPORT_SYMBOL(efi);
 
 static struct kobject *efi_kobj;
 static struct kobject *efivars_kobj;
@@ -132,3 +148,127 @@ err_put:
 }
 
 subsys_initcall(efisubsys_init);
+
+
+/*
+ * We can't ioremap data in EFI boot services RAM, because we've already mapped
+ * it as RAM.  So, look it up in the existing EFI memory map instead.  Only
+ * callable after efi_enter_virtual_mode and before efi_free_boot_services.
+ */
+void __iomem *efi_lookup_mapped_addr(u64 phys_addr)
+{
+       struct efi_memory_map *map;
+       void *p;
+       map = efi.memmap;
+       if (!map)
+               return NULL;
+       if (WARN_ON(!map->map))
+               return NULL;
+       for (p = map->map; p < map->map_end; p += map->desc_size) {
+               efi_memory_desc_t *md = p;
+               u64 size = md->num_pages << EFI_PAGE_SHIFT;
+               u64 end = md->phys_addr + size;
+               if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+                   md->type != EFI_BOOT_SERVICES_CODE &&
+                   md->type != EFI_BOOT_SERVICES_DATA)
+                       continue;
+               if (!md->virt_addr)
+                       continue;
+               if (phys_addr >= md->phys_addr && phys_addr < end) {
+                       phys_addr += md->virt_addr - md->phys_addr;
+                       return (__force void __iomem *)(unsigned long)phys_addr;
+               }
+       }
+       return NULL;
+}
+
+static __initdata efi_config_table_type_t common_tables[] = {
+       {ACPI_20_TABLE_GUID, "ACPI 2.0", &efi.acpi20},
+       {ACPI_TABLE_GUID, "ACPI", &efi.acpi},
+       {HCDP_TABLE_GUID, "HCDP", &efi.hcdp},
+       {MPS_TABLE_GUID, "MPS", &efi.mps},
+       {SAL_SYSTEM_TABLE_GUID, "SALsystab", &efi.sal_systab},
+       {SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
+       {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
+       {NULL_GUID, NULL, 0},
+};
+
+static __init int match_config_table(efi_guid_t *guid,
+                                    unsigned long table,
+                                    efi_config_table_type_t *table_types)
+{
+       u8 str[EFI_VARIABLE_GUID_LEN + 1];
+       int i;
+
+       if (table_types) {
+               efi_guid_unparse(guid, str);
+
+               for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) {
+                       efi_guid_unparse(&table_types[i].guid, str);
+
+                       if (!efi_guidcmp(*guid, table_types[i].guid)) {
+                               *(table_types[i].ptr) = table;
+                               pr_cont(" %s=0x%lx ",
+                                       table_types[i].name, table);
+                               return 1;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+int __init efi_config_init(efi_config_table_type_t *arch_tables)
+{
+       void *config_tables, *tablep;
+       int i, sz;
+
+       if (efi_enabled(EFI_64BIT))
+               sz = sizeof(efi_config_table_64_t);
+       else
+               sz = sizeof(efi_config_table_32_t);
+
+       /*
+        * Let's see what config tables the firmware passed to us.
+        */
+       config_tables = early_memremap(efi.systab->tables,
+                                      efi.systab->nr_tables * sz);
+       if (config_tables == NULL) {
+               pr_err("Could not map Configuration table!\n");
+               return -ENOMEM;
+       }
+
+       tablep = config_tables;
+       pr_info("");
+       for (i = 0; i < efi.systab->nr_tables; i++) {
+               efi_guid_t guid;
+               unsigned long table;
+
+               if (efi_enabled(EFI_64BIT)) {
+                       u64 table64;
+                       guid = ((efi_config_table_64_t *)tablep)->guid;
+                       table64 = ((efi_config_table_64_t *)tablep)->table;
+                       table = table64;
+#ifndef CONFIG_64BIT
+                       if (table64 >> 32) {
+                               pr_cont("\n");
+                               pr_err("Table located above 4GB, disabling EFI.\n");
+                               early_iounmap(config_tables,
+                                              efi.systab->nr_tables * sz);
+                               return -EINVAL;
+                       }
+#endif
+               } else {
+                       guid = ((efi_config_table_32_t *)tablep)->guid;
+                       table = ((efi_config_table_32_t *)tablep)->table;
+               }
+
+               if (!match_config_table(&guid, table, common_tables))
+                       match_config_table(&guid, table, arch_tables);
+
+               tablep += sz;
+       }
+       pr_cont("\n");
+       early_iounmap(config_tables, efi.systab->nr_tables * sz);
+       return 0;
+}
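
The efi_lookup_mapped_addr() added above translates a physical address by adding the per-descriptor offset (virt_addr - phys_addr) recorded when the runtime mappings were established. A standalone sketch of that range check and translation, assuming a simplified descriptor type:

#include <stdint.h>
#include <stddef.h>

/* Simplified stand-in for efi_memory_desc_t (illustrative only). */
struct desc {
	uint64_t phys_addr;
	uint64_t virt_addr;
	uint64_t num_pages;
};

#define PAGE_SHIFT 12

/* Return the mapped virtual address for phys, or 0 if nothing covers it. */
static uint64_t lookup(const struct desc *map, size_t n, uint64_t phys)
{
	size_t i;

	for (i = 0; i < n; i++) {
		uint64_t size = map[i].num_pages << PAGE_SHIFT;
		uint64_t end = map[i].phys_addr + size;

		if (phys >= map[i].phys_addr && phys < end)
			return phys + (map[i].virt_addr - map[i].phys_addr);
	}
	return 0;
}

int main(void)
{
	struct desc map[] = { { 0x1000, 0xffff0000, 4 } };

	/* 0x1234 lies in the descriptor, so it maps to 0xffff0234. */
	return lookup(map, 1, 0x1234) == 0xffff0234 ? 0 : 1;
}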
index 8a7432a4b4136807ad9a00015a00f5a249d6b112..933eb027d527d53aa8fb275b84c5d9b6e2ce3602 100644 (file)
@@ -564,7 +564,7 @@ static int efivar_sysfs_destroy(struct efivar_entry *entry, void *data)
        return 0;
 }
 
-void efivars_sysfs_exit(void)
+static void efivars_sysfs_exit(void)
 {
        /* Remove all entries and destroy */
        __efivar_entry_iter(efivar_sysfs_destroy, &efivar_sysfs_list, NULL, NULL);
index 45d5af0546bf374c11ba4bc737c74a5f13cc8e39..5b646c1f0c3eba47558bb52ab5cbf89314c350b2 100644 (file)
@@ -39,7 +39,7 @@
 #include "psb_intel_reg.h"
 #include "mdfld_output.h"
 
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 
 #define FLD_MASK(start, end)   (((1 << ((start) - (end) + 1)) - 1) << (end))
 #define FLD_VAL(val, start, end) (((val) << (end)) & FLD_MASK(start, end))
index 08747fd7105cdba2c489709afd097e4ce29f95c4..7a9ce000fd8678d31a7da6f6d275797ae3a36264 100644 (file)
@@ -26,7 +26,7 @@
 #include "psb_drv.h"
 #include "psb_reg.h"
 #include "psb_intel_reg.h"
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/intel_scu_ipc.h>
 #include "mid_bios.h"
 #include "intel_bios.h"
index e77d7214fca4fb0167b2305d25a36043d068d2ff..3ece553311fe9ae030cca381e59c3ea52194fb87 100644 (file)
@@ -22,7 +22,7 @@
 
 #include <linux/i2c.h>
 #include <drm/drmP.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 
 #include "intel_bios.h"
 #include "psb_drv.h"
index 3a449f65eb2d3beb889217893a0e9d677214c1d9..45935d9949d2f840f22a720df675a521aca78f61 100644 (file)
@@ -389,7 +389,7 @@ static int intel_idle(struct cpuidle_device *dev,
        if (!(lapic_timer_reliable_states & (1 << (cstate))))
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
-       if (!need_resched()) {
+       if (!current_set_polling_and_test()) {
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
index 9215ed72bece39361da1c14e568401d38008108d..d654f831410de9621df488aadbca2d6c6d174721 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/sfi.h>
 #include <linux/module.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 #include <asm/intel_scu_ipc.h>
 
 /* IPC defines the following message types */
@@ -579,7 +579,7 @@ static struct pci_driver ipc_driver = {
 
 static int __init intel_scu_ipc_init(void)
 {
-       platform = mrst_identify_cpu();
+       platform = intel_mid_identify_cpu();
        if (platform == 0)
                return -ENODEV;
        return  pci_register_driver(&ipc_driver);
index 72c5cdbe0791afd2fed9a4779e5769c82c6b6ba3..544be722937cb5676c604ee1f586d289df09611c 100644 (file)
@@ -72,6 +72,7 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm)
        } else
                err = -EINVAL;
 
+       pm_stay_awake(rtc->dev.parent);
        mutex_unlock(&rtc->ops_lock);
        /* A timer might have just expired */
        schedule_work(&rtc->irqwork);
@@ -113,6 +114,7 @@ int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
                err = -EINVAL;
        }
 
+       pm_stay_awake(rtc->dev.parent);
        mutex_unlock(&rtc->ops_lock);
        /* A timer might have just expired */
        schedule_work(&rtc->irqwork);
@@ -771,9 +773,10 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer)
                alarm.time = rtc_ktime_to_tm(timer->node.expires);
                alarm.enabled = 1;
                err = __rtc_set_alarm(rtc, &alarm);
-               if (err == -ETIME)
+               if (err == -ETIME) {
+                       pm_stay_awake(rtc->dev.parent);
                        schedule_work(&rtc->irqwork);
-               else if (err) {
+               } else if (err) {
                        timerqueue_del(&rtc->timerqueue, &timer->node);
                        timer->enabled = 0;
                        return err;
@@ -818,8 +821,10 @@ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer)
                alarm.time = rtc_ktime_to_tm(next->expires);
                alarm.enabled = 1;
                err = __rtc_set_alarm(rtc, &alarm);
-               if (err == -ETIME)
+               if (err == -ETIME) {
+                       pm_stay_awake(rtc->dev.parent);
                        schedule_work(&rtc->irqwork);
+               }
        }
 }
 
@@ -845,7 +850,6 @@ void rtc_timer_do_work(struct work_struct *work)
 
        mutex_lock(&rtc->ops_lock);
 again:
-       pm_relax(rtc->dev.parent);
        __rtc_read_time(rtc, &tm);
        now = rtc_tm_to_ktime(tm);
        while ((next = timerqueue_getnext(&rtc->timerqueue))) {
@@ -880,6 +884,7 @@ again:
        } else
                rtc_alarm_disable(rtc);
 
+       pm_relax(rtc->dev.parent);
        mutex_unlock(&rtc->ops_lock);
 }
 
index 578baf9d9725cf3bbfdb91669af5cda56ff9d7ef..315209d9b40780fe1012faed158da21d0cd58570 100644 (file)
@@ -38,8 +38,8 @@
 
 #include <asm-generic/rtc.h>
 #include <asm/intel_scu_ipc.h>
-#include <asm/mrst.h>
-#include <asm/mrst-vrtc.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
 
 struct mrst_rtc {
        struct rtc_device       *rtc;
index 0f0609b1aa2ccfead83ec98db660a58fac5736de..e3b25712b6591906aff9633e98d5c82f4feba351 100644 (file)
@@ -371,6 +371,7 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
                }
        }
 
+       device_init_wakeup(&adev->dev, 1);
        ldata->rtc = rtc_device_register("pl031", &adev->dev, ops,
                                        THIS_MODULE);
        if (IS_ERR(ldata->rtc)) {
@@ -384,8 +385,6 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
                goto out_no_irq;
        }
 
-       device_init_wakeup(&adev->dev, 1);
-
        return 0;
 
 out_no_irq:
index 9dda2d08af91a23c8ce8cb50d6f1e584212ec597..8ced2561395640cdeb16fcc679442011a230c97f 100644 (file)
@@ -48,7 +48,7 @@
 #include <linux/atomic.h>
 #include <asm/intel_scu_ipc.h>
 #include <asm/apb_timer.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
 
 #include "intel_scu_watchdog.h"
 
@@ -445,7 +445,7 @@ static int __init intel_scu_watchdog_init(void)
         *
         * If it isn't an intel MID device then it doesn't have this watchdog
         */
-       if (!mrst_identify_cpu())
+       if (!intel_mid_identify_cpu())
                return -ENODEV;
 
        /* Check boot parameters to verify that their initial values */
index 8875dd10ae7ac77444db95e33c9fde83dde67512..2ea437e5acf4d9b0064b48c900d3fa1597573e3b 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1547,6 +1547,7 @@ static int do_execve_common(const char *filename,
        current->fs->in_exec = 0;
        current->in_execve = 0;
        acct_update_integrals(current);
+       task_numa_free(current);
        free_bprm(bprm);
        if (displaced)
                put_files_struct(displaced);
index cbd0f1b324b972b96f036139fcd96bf53a03fb01..1bd2077187fd000a621e2f10453ce0661e908c02 100644 (file)
@@ -183,6 +183,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
        seq_printf(m,
                "State:\t%s\n"
                "Tgid:\t%d\n"
+               "Ngid:\t%d\n"
                "Pid:\t%d\n"
                "PPid:\t%d\n"
                "TracerPid:\t%d\n"
@@ -190,6 +191,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                "Gid:\t%d\t%d\t%d\t%d\n",
                get_task_state(p),
                task_tgid_nr_ns(p, ns),
+               task_numa_group_id(p),
                pid_nr_ns(pid, ns),
                ppid, tpid,
                from_kuid_munged(user_ns, cred->uid),
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
new file mode 100644 (file)
index 0000000..ddf2b42
--- /dev/null
@@ -0,0 +1,105 @@
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <linux/thread_info.h>
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
+static __always_inline int preempt_count(void)
+{
+       return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline int *preempt_count_ptr(void)
+{
+       return &current_thread_info()->preempt_count;
+}
+
+/*
+ * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
+ * alternative is losing a reschedule. Better to schedule too often -- also this
+ * should be a very rare operation.
+ */
+static __always_inline void preempt_count_set(int pc)
+{
+       *preempt_count_ptr() = pc;
+}
+
+/*
+ * must be macros to avoid header recursion hell
+ */
+#define task_preempt_count(p) \
+       (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED)
+
+#define init_task_preempt_count(p) do { \
+       task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+} while (0)
+
+#define init_idle_preempt_count(p, cpu) do { \
+       task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+} while (0)
+
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+       *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+       *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+       return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
+}
+
+/*
+ * The various preempt_count add/sub methods
+ */
+
+static __always_inline void __preempt_count_add(int val)
+{
+       *preempt_count_ptr() += val;
+}
+
+static __always_inline void __preempt_count_sub(int val)
+{
+       *preempt_count_ptr() -= val;
+}
+
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+       return !--*preempt_count_ptr();
+}
+
+/*
+ * Returns true when we need to resched and can (barring IRQ state).
+ */
+static __always_inline bool should_resched(void)
+{
+       return unlikely(!*preempt_count_ptr());
+}
+
+#ifdef CONFIG_PREEMPT
+extern asmlinkage void preempt_schedule(void);
+#define __preempt_schedule() preempt_schedule()
+
+#ifdef CONFIG_CONTEXT_TRACKING
+extern asmlinkage void preempt_schedule_context(void);
+#define __preempt_schedule_context() preempt_schedule_context()
+#endif
+#endif /* CONFIG_PREEMPT */
+
+#endif /* __ASM_PREEMPT_H */
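
Because the bit is stored inverted, a preempt count of exactly zero means both "preemption enabled" and "reschedule needed", so __preempt_count_dec_and_test() answers both questions with a single decrement. A userspace sketch of the folding, assuming the same 32-bit count and 0x80000000 flag:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define PREEMPT_NEED_RESCHED	0x80000000u	/* MSB, stored inverted */

/* PREEMPT_ENABLED: count 0, resched bit set (i.e. no resched needed). */
static uint32_t pc = PREEMPT_NEED_RESCHED;

static void set_need_resched(void)   { pc &= ~PREEMPT_NEED_RESCHED; }
static void clear_need_resched(void) { pc |=  PREEMPT_NEED_RESCHED; }

/* One decrement: reaches zero only if count was 1 AND resched is needed. */
static bool dec_and_test(void) { return --pc == 0; }

int main(void)
{
	pc += 1;			/* preempt_disable() */
	set_need_resched();		/* wakeup while preemption is off */
	if (dec_and_test())		/* preempt_enable(): a single test */
		printf("reschedule now\n");
	clear_need_resched();
	return 0;
}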
index 93b7f96f9c59f354be61ce529f90f8fda305f376..6d26b40cbf5d29c7d80c83a9fe22b4f9e85a0b70 100644 (file)
@@ -33,6 +33,16 @@ enum arch_timer_reg {
 #define ARCH_TIMER_MEM_PHYS_ACCESS     2
 #define ARCH_TIMER_MEM_VIRT_ACCESS     3
 
+#define ARCH_TIMER_USR_PCT_ACCESS_EN   (1 << 0) /* physical counter */
+#define ARCH_TIMER_USR_VCT_ACCESS_EN   (1 << 1) /* virtual counter */
+#define ARCH_TIMER_VIRT_EVT_EN         (1 << 2)
+#define ARCH_TIMER_EVT_TRIGGER_SHIFT   (4)
+#define ARCH_TIMER_EVT_TRIGGER_MASK    (0xF << ARCH_TIMER_EVT_TRIGGER_SHIFT)
+#define ARCH_TIMER_USR_VT_ACCESS_EN    (1 << 8) /* virtual timer registers */
+#define ARCH_TIMER_USR_PT_ACCESS_EN    (1 << 9) /* physical timer registers */
+
+#define ARCH_TIMER_EVT_STREAM_FREQ     10000   /* 100us */
+
 #ifdef CONFIG_ARM_ARCH_TIMER
 
 extern u32 arch_timer_get_rate(void);
index 0857922e8ad04fb465bc7ea4edadd968384f6271..493aa021c7a9429a892c8ccb6f19110d1b4c9065 100644 (file)
@@ -60,6 +60,7 @@ enum clock_event_mode {
  * Core shall set the interrupt affinity dynamically in broadcast mode
  */
 #define CLOCK_EVT_FEAT_DYNIRQ          0x000020
+#define CLOCK_EVT_FEAT_PERCPU          0x000040
 
 /**
  * struct clock_event_device - clock event device descriptor
index dbbf8aa7731bee4f8bb1f134b1e847b6af00b8cb..67301a40571268a9f29f5477cf1b2e29617c2cbf 100644 (file)
@@ -292,6 +292,8 @@ extern void clocksource_resume(void);
 extern struct clocksource * __init __weak clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
 
+extern u64
+clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask);
 extern void
 clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec);
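
clocks_calc_max_nsecs() is needed because the cyc-to-ns conversion, ns = (cycles * mult) >> shift, overflows 64 bits once cycles exceeds a mult-dependent bound, so timekeeping must cap how long a clocksource may run unread. A runnable sketch of the conversion itself (the 24 MHz rate and shift of 24 are illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t shift = 24;
	/* mult chosen so (cycles * mult) >> shift yields nanoseconds */
	uint32_t mult = (uint32_t)(((uint64_t)1000000000 << shift) / 24000000);
	uint64_t cycles = 24000000;	/* one second's worth of ticks */
	uint64_t ns = (cycles * mult) >> shift;

	printf("mult=%u ns=%llu\n", mult, (unsigned long long)ns);
	return 0;
}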
 
index 5f8f176154f7009b74157a009266b74d04fbb571..bc5687d0f3157c9d8d39342d4e69db1734baff16 100644 (file)
@@ -39,6 +39,8 @@
 typedef unsigned long efi_status_t;
 typedef u8 efi_bool_t;
 typedef u16 efi_char16_t;              /* UNICODE character */
+typedef u64 efi_physical_addr_t;
+typedef void *efi_handle_t;
 
 
 typedef struct {
@@ -96,6 +98,7 @@ typedef       struct {
 #define EFI_MEMORY_DESCRIPTOR_VERSION  1
 
 #define EFI_PAGE_SHIFT         12
+#define EFI_PAGE_SIZE          (1UL << EFI_PAGE_SHIFT)
 
 typedef struct {
        u32 type;
@@ -157,11 +160,13 @@ typedef struct {
        efi_table_hdr_t hdr;
        void *raise_tpl;
        void *restore_tpl;
-       void *allocate_pages;
-       void *free_pages;
-       void *get_memory_map;
-       void *allocate_pool;
-       void *free_pool;
+       efi_status_t (*allocate_pages)(int, int, unsigned long,
+                                      efi_physical_addr_t *);
+       efi_status_t (*free_pages)(efi_physical_addr_t, unsigned long);
+       efi_status_t (*get_memory_map)(unsigned long *, void *, unsigned long *,
+                                      unsigned long *, u32 *);
+       efi_status_t (*allocate_pool)(int, unsigned long, void **);
+       efi_status_t (*free_pool)(void *);
        void *create_event;
        void *set_timer;
        void *wait_for_event;
@@ -171,7 +176,7 @@ typedef struct {
        void *install_protocol_interface;
        void *reinstall_protocol_interface;
        void *uninstall_protocol_interface;
-       void *handle_protocol;
+       efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **);
        void *__reserved;
        void *register_protocol_notify;
        void *locate_handle;
@@ -181,7 +186,7 @@ typedef struct {
        void *start_image;
        void *exit;
        void *unload_image;
-       void *exit_boot_services;
+       efi_status_t (*exit_boot_services)(efi_handle_t, unsigned long);
        void *get_next_monotonic_count;
        void *stall;
        void *set_watchdog_timer;
@@ -404,6 +409,12 @@ typedef struct {
        unsigned long table;
 } efi_config_table_t;
 
+typedef struct {
+       efi_guid_t guid;
+       const char *name;
+       unsigned long *ptr;
+} efi_config_table_type_t;
+
 #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL)
 
 #define EFI_2_30_SYSTEM_TABLE_REVISION  ((2 << 16) | (30))
@@ -488,10 +499,6 @@ typedef struct {
        unsigned long unload;
 } efi_loaded_image_t;
 
-typedef struct {
-       u64 revision;
-       void *open_volume;
-} efi_file_io_interface_t;
 
 typedef struct {
        u64 size;
@@ -504,20 +511,30 @@ typedef struct {
        efi_char16_t filename[1];
 } efi_file_info_t;
 
-typedef struct {
+typedef struct _efi_file_handle {
        u64 revision;
-       void *open;
-       void *close;
+       efi_status_t (*open)(struct _efi_file_handle *,
+                            struct _efi_file_handle **,
+                            efi_char16_t *, u64, u64);
+       efi_status_t (*close)(struct _efi_file_handle *);
        void *delete;
-       void *read;
+       efi_status_t (*read)(struct _efi_file_handle *, unsigned long *,
+                            void *);
        void *write;
        void *get_position;
        void *set_position;
-       void *get_info;
+       efi_status_t (*get_info)(struct _efi_file_handle *, efi_guid_t *,
+                       unsigned long *, void *);
        void *set_info;
        void *flush;
 } efi_file_handle_t;
 
+typedef struct _efi_file_io_interface {
+       u64 revision;
+       int (*open_volume)(struct _efi_file_io_interface *,
+                          efi_file_handle_t **);
+} efi_file_io_interface_t;
+
 #define EFI_FILE_MODE_READ     0x0000000000000001
 #define EFI_FILE_MODE_WRITE    0x0000000000000002
 #define EFI_FILE_MODE_CREATE   0x8000000000000000
@@ -552,6 +569,7 @@ extern struct efi {
        efi_get_next_high_mono_count_t *get_next_high_mono_count;
        efi_reset_system_t *reset_system;
        efi_set_virtual_address_map_t *set_virtual_address_map;
+       struct efi_memory_map *memmap;
 } efi;
 
 static inline int
@@ -587,6 +605,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
 }
 #endif
 extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
+extern int efi_config_init(efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
 extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
@@ -784,6 +803,13 @@ struct efivar_entry {
        struct kobject kobj;
 };
 
+
+struct efi_simple_text_output_protocol {
+       void *reset;
+       efi_status_t (*output_string)(void *, void *);
+       void *test_string;
+};
+
 extern struct list_head efivar_sysfs_list;
 
 static inline void
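
The efi.h hunks above replace several void * boot-service slots with real prototypes, so the boot stub can call through the table with compile-time type checking instead of casting. A self-contained sketch of that shape, with a stub backend standing in for firmware (all names and values illustrative):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t efi_physical_addr_t;
typedef unsigned long efi_status_t;
#define EFI_SUCCESS 0

/* A typed service table, mirroring the converted members. */
struct boot_services {
	efi_status_t (*allocate_pages)(int type, int mem_type,
				       unsigned long nr_pages,
				       efi_physical_addr_t *addr);
};

static efi_status_t fake_allocate_pages(int type, int mem_type,
					unsigned long nr_pages,
					efi_physical_addr_t *addr)
{
	(void)type; (void)mem_type; (void)nr_pages;
	*addr = 0x100000;	/* pretend we allocated at 1 MiB */
	return EFI_SUCCESS;
}

int main(void)
{
	struct boot_services bt = { .allocate_pages = fake_allocate_pages };
	efi_physical_addr_t addr = 0;

	/* The compiler now checks argument and return types here. */
	if (bt.allocate_pages(1, 2, 16, &addr) == EFI_SUCCESS)
		printf("addr = 0x%llx\n", (unsigned long long)addr);
	return 0;
}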
index 1e041063b22654aa375630bf36282d10059d7bec..d9cf963ac832475ffeb04c3385d1576b054fbb79 100644 (file)
@@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void);
 #define __irq_enter()                                  \
        do {                                            \
                account_irq_enter_time(current);        \
-               add_preempt_count(HARDIRQ_OFFSET);      \
+               preempt_count_add(HARDIRQ_OFFSET);      \
                trace_hardirq_enter();                  \
        } while (0)
 
@@ -49,7 +49,7 @@ extern void irq_enter(void);
        do {                                            \
                trace_hardirq_exit();                   \
                account_irq_exit_time(current);         \
-               sub_preempt_count(HARDIRQ_OFFSET);      \
+               preempt_count_sub(HARDIRQ_OFFSET);      \
        } while (0)
 
 /*
@@ -62,7 +62,7 @@ extern void irq_exit(void);
                lockdep_off();                                  \
                ftrace_nmi_enter();                             \
                BUG_ON(in_nmi());                               \
-               add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
+               preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
                rcu_nmi_enter();                                \
                trace_hardirq_enter();                          \
        } while (0)
@@ -72,7 +72,7 @@ extern void irq_exit(void);
                trace_hardirq_exit();                           \
                rcu_nmi_exit();                                 \
                BUG_ON(!in_nmi());                              \
-               sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \
+               preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
                ftrace_nmi_exit();                              \
                lockdep_on();                                   \
        } while (0)
index 5e865b55494096898112234b1f2898cfc372d757..c9e831dc80bccb10efaec812787f75b57fbf5b71 100644 (file)
@@ -19,6 +19,7 @@
 
 #include <linux/atomic.h>
 #include <asm/ptrace.h>
+#include <asm/irq.h>
 
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
@@ -374,6 +375,16 @@ struct softirq_action
 
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
+
+#ifdef __ARCH_HAS_DO_SOFTIRQ
+void do_softirq_own_stack(void);
+#else
+static inline void do_softirq_own_stack(void)
+{
+       __do_softirq();
+}
+#endif
+
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
index 7f6fe6e015bc8ade40f04ee8e3e4ba750ec45202..290db1269c4c7970ab016165c62d93eec53059b4 100644 (file)
@@ -109,6 +109,7 @@ typedef enum {
        KDB_REASON_RECURSE,     /* Recursive entry to kdb;
                                 * regs probably valid */
        KDB_REASON_SSTEP,       /* Single Step trap. - regs valid */
+       KDB_REASON_SYSTEM_NMI,  /* In NMI due to SYSTEM cmd; regs valid */
 } kdb_reason_t;
 
 extern int kdb_trap_printk;
index c6e091bf39a52e73a0964036ef3e1888e7be61d9..dfb4f2ffdaa2e5eccddb6755f08305ff7a449891 100644 (file)
@@ -310,6 +310,7 @@ extern int
 kgdb_handle_exception(int ex_vector, int signo, int err_code,
                      struct pt_regs *regs);
 extern int kgdb_nmicallback(int cpu, void *regs);
+extern int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *snd_rdy);
 extern void gdbstub_exit(int status);
 
 extern int                     kgdb_single_step;
index da6716b9e3fea8148f45f57ac556367b547f839f..ea4d2495c6464bb2a5789f2566358461cca31330 100644 (file)
@@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 struct mempolicy *get_vma_policy(struct task_struct *tsk,
                struct vm_area_struct *vma, unsigned long addr);
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
index 8d3c57fdf2213cd21625ea2b5f871462093c0921..f5096b58b20d3ef64618e18ca37e1083b97af252 100644 (file)
@@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_NUMA_BALANCING
-extern int migrate_misplaced_page(struct page *page, int node);
-extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page,
+                                 struct vm_area_struct *vma, int node);
 extern bool migrate_ratelimited(int node);
 #else
-static inline int migrate_misplaced_page(struct page *page, int node)
+static inline int migrate_misplaced_page(struct page *page,
+                                        struct vm_area_struct *vma, int node)
 {
        return -EAGAIN; /* can't migrate now */
 }
index 1a0668e5a4eef0377b708b9aded0e11f8da67a80..8aa4006b9636e11dbcb4e32e988f43c30c865396 100644 (file)
@@ -595,11 +595,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
 #define SECTIONS_PGOFF         ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF            (SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF            (NODES_PGOFF - ZONES_WIDTH)
-#define LAST_NID_PGOFF         (ZONES_PGOFF - LAST_NID_WIDTH)
+#define LAST_CPUPID_PGOFF      (ZONES_PGOFF - LAST_CPUPID_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
@@ -609,7 +609,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT       (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT          (NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT          (ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_NID_PGSHIFT       (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
+#define LAST_CPUPID_PGSHIFT    (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -631,7 +631,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK             ((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK             ((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK          ((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_NID_MASK          ((1UL << LAST_NID_WIDTH) - 1)
+#define LAST_CPUPID_MASK       ((1UL << LAST_CPUPID_WIDTH) - 1)
 #define ZONEID_MASK            ((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -675,51 +675,117 @@ static inline int page_to_nid(const struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-static inline int page_nid_xchg_last(struct page *page, int nid)
+static inline int cpu_pid_to_cpupid(int cpu, int pid)
 {
-       return xchg(&page->_last_nid, nid);
+       return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
 }
 
-static inline int page_nid_last(struct page *page)
+static inline int cpupid_to_pid(int cpupid)
 {
-       return page->_last_nid;
+       return cpupid & LAST__PID_MASK;
 }
-static inline void page_nid_reset_last(struct page *page)
+
+static inline int cpupid_to_cpu(int cpupid)
 {
-       page->_last_nid = -1;
+       return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
 }
-#else
-static inline int page_nid_last(struct page *page)
+
+static inline int cpupid_to_nid(int cpupid)
 {
-       return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
+       return cpu_to_node(cpupid_to_cpu(cpupid));
 }
 
-extern int page_nid_xchg_last(struct page *page, int nid);
+static inline bool cpupid_pid_unset(int cpupid)
+{
+       return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
+}
 
-static inline void page_nid_reset_last(struct page *page)
+static inline bool cpupid_cpu_unset(int cpupid)
 {
-       int nid = (1 << LAST_NID_SHIFT) - 1;
+       return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
+}
 
-       page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-       page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
+{
+       return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
+}
+
+#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+{
+       return xchg(&page->_last_cpupid, cpupid);
+}
+
+static inline int page_cpupid_last(struct page *page)
+{
+       return page->_last_cpupid;
+}
+static inline void page_cpupid_reset_last(struct page *page)
+{
+       page->_last_cpupid = -1;
 }
-#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
 #else
-static inline int page_nid_xchg_last(struct page *page, int nid)
+static inline int page_cpupid_last(struct page *page)
 {
-       return page_to_nid(page);
+       return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
 
-static inline int page_nid_last(struct page *page)
+extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+
+static inline void page_cpupid_reset_last(struct page *page)
 {
-       return page_to_nid(page);
+       int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
+
+       page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+       page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+}
+#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
+#else /* !CONFIG_NUMA_BALANCING */
+static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+{
+       return page_to_nid(page); /* XXX */
 }
 
-static inline void page_nid_reset_last(struct page *page)
+static inline int page_cpupid_last(struct page *page)
 {
+       return page_to_nid(page); /* XXX */
 }
-#endif
+
+static inline int cpupid_to_nid(int cpupid)
+{
+       return -1;
+}
+
+static inline int cpupid_to_pid(int cpupid)
+{
+       return -1;
+}
+
+static inline int cpupid_to_cpu(int cpupid)
+{
+       return -1;
+}
+
+static inline int cpu_pid_to_cpupid(int nid, int pid)
+{
+       return -1;
+}
+
+static inline bool cpupid_pid_unset(int cpupid)
+{
+       return 1;
+}
+
+static inline void page_cpupid_reset_last(struct page *page)
+{
+}
+
+static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
+{
+       return false;
+}
+#endif /* CONFIG_NUMA_BALANCING */
 
 static inline struct zone *page_zone(const struct page *page)
 {
index d9851eeb6e1d183ca2c49e3dffcc4e1a1443510e..a3198e5aaf4e4dac55a70caae5669e8ab84906e3 100644 (file)
@@ -174,8 +174,8 @@ struct page {
        void *shadow;
 #endif
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-       int _last_nid;
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+       int _last_cpupid;
 #endif
 }
 /*
@@ -420,28 +420,15 @@ struct mm_struct {
         */
        unsigned long numa_next_scan;
 
-       /* numa_next_reset is when the PTE scanner period will be reset */
-       unsigned long numa_next_reset;
-
        /* Restart point for scanning and setting pte_numa */
        unsigned long numa_scan_offset;
 
        /* numa_scan_seq prevents two threads setting pte_numa */
        int numa_scan_seq;
-
-       /*
-        * The first node a task was scheduled on. If a task runs on
-        * a different node than Make PTE Scan Go Now.
-        */
-       int first_nid;
 #endif
        struct uprobes_state uprobes_state;
 };
 
-/* first nid will either be a valid NID or one of these values */
-#define NUMA_PTE_SCAN_INIT     -1
-#define NUMA_PTE_SCAN_ACTIVE   -2
-
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
index 93506a114034ba96394b37529e629a392ae834c9..da523661500ae65eeea227c3eb48fbffe8a2615f 100644 (file)
  * The last is when there is insufficient space in page->flags and a separate
  * lookup is necessary.
  *
- * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |          ... | FLAGS |
- *         " plus space for last_nid: |       NODE     | ZONE | LAST_NID ... | FLAGS |
- * classic sparse with space for node:| SECTION | NODE | ZONE |          ... | FLAGS |
- *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
+ * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |             ... | FLAGS |
+ *      " plus space for last_cpupid: |       NODE     | ZONE | LAST_CPUPID ... | FLAGS |
+ * classic sparse with space for node:| SECTION | NODE | ZONE |             ... | FLAGS |
+ *      " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
  * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
  */
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-#define LAST_NID_SHIFT NODES_SHIFT
+#define LAST__PID_SHIFT 8
+#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT)-1)
+
+#define LAST__CPU_SHIFT NR_CPUS_BITS
+#define LAST__CPU_MASK  ((1 << LAST__CPU_SHIFT)-1)
+
+#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT)
 #else
-#define LAST_NID_SHIFT 0
+#define LAST_CPUPID_SHIFT 0
 #endif
 
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define LAST_NID_WIDTH LAST_NID_SHIFT
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
-#define LAST_NID_WIDTH 0
+#define LAST_CPUPID_WIDTH 0
 #endif
 
 /*
@@ -81,8 +87,8 @@
 #define NODE_NOT_IN_PAGE_FLAGS
 #endif
 
-#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
-#define LAST_NID_NOT_IN_PAGE_FLAGS
+#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
+#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
index c8ba627c1d608733b8480bb929d9e81d97e57fa0..2e069d1288df5f04e6982243d7cac0f7455cd31d 100644 (file)
@@ -584,6 +584,10 @@ struct perf_sample_data {
        struct perf_regs_user           regs_user;
        u64                             stack_user_size;
        u64                             weight;
+       /*
+        * Transaction flags for abort events:
+        */
+       u64                             txn;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
@@ -599,6 +603,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
        data->stack_user_size = 0;
        data->weight = 0;
        data->data_src.val = 0;
+       data->txn = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
index f5d4723cdb3d3e6ab30d5af02d1eaadbdc7d0ee3..a3d9dc8c2c006a02cd7677a6e4a6ac10dac69910 100644 (file)
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/thread_info.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
-  extern void add_preempt_count(int val);
-  extern void sub_preempt_count(int val);
-#else
-# define add_preempt_count(val)        do { preempt_count() += (val); } while (0)
-# define sub_preempt_count(val)        do { preempt_count() -= (val); } while (0)
-#endif
-
-#define inc_preempt_count() add_preempt_count(1)
-#define dec_preempt_count() sub_preempt_count(1)
-
-#define preempt_count()        (current_thread_info()->preempt_count)
-
-#ifdef CONFIG_PREEMPT
-
-asmlinkage void preempt_schedule(void);
-
-#define preempt_check_resched() \
-do { \
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
-               preempt_schedule(); \
-} while (0)
-
-#ifdef CONFIG_CONTEXT_TRACKING
+/*
+ * We use the MSB mostly because it's available; see <linux/preempt_mask.h> for
+ * the other bits -- can't include that header due to inclusion hell.
+ */
+#define PREEMPT_NEED_RESCHED   0x80000000
 
-void preempt_schedule_context(void);
+#include <asm/preempt.h>
 
-#define preempt_check_resched_context() \
-do { \
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
-               preempt_schedule_context(); \
-} while (0)
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
+extern void preempt_count_add(int val);
+extern void preempt_count_sub(int val);
+#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
 #else
+#define preempt_count_add(val) __preempt_count_add(val)
+#define preempt_count_sub(val) __preempt_count_sub(val)
+#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
+#endif
 
-#define preempt_check_resched_context() preempt_check_resched()
-
-#endif /* CONFIG_CONTEXT_TRACKING */
-
-#else /* !CONFIG_PREEMPT */
-
-#define preempt_check_resched()                do { } while (0)
-#define preempt_check_resched_context()        do { } while (0)
-
-#endif /* CONFIG_PREEMPT */
+#define __preempt_count_inc() __preempt_count_add(1)
+#define __preempt_count_dec() __preempt_count_sub(1)
 
+#define preempt_count_inc() preempt_count_add(1)
+#define preempt_count_dec() preempt_count_sub(1)
 
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
 do { \
-       inc_preempt_count(); \
+       preempt_count_inc(); \
        barrier(); \
 } while (0)
 
 #define sched_preempt_enable_no_resched() \
 do { \
        barrier(); \
-       dec_preempt_count(); \
+       preempt_count_dec(); \
 } while (0)
 
-#define preempt_enable_no_resched()    sched_preempt_enable_no_resched()
+#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
 
+#ifdef CONFIG_PREEMPT
 #define preempt_enable() \
 do { \
-       preempt_enable_no_resched(); \
        barrier(); \
-       preempt_check_resched(); \
+       if (unlikely(preempt_count_dec_and_test())) \
+               __preempt_schedule(); \
+} while (0)
+
+#define preempt_check_resched() \
+do { \
+       if (should_resched()) \
+               __preempt_schedule(); \
 } while (0)
 
-/* For debugging and tracer internals only! */
-#define add_preempt_count_notrace(val)                 \
-       do { preempt_count() += (val); } while (0)
-#define sub_preempt_count_notrace(val)                 \
-       do { preempt_count() -= (val); } while (0)
-#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
-#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
+#else
+#define preempt_enable() preempt_enable_no_resched()
+#define preempt_check_resched() do { } while (0)
+#endif
 
 #define preempt_disable_notrace() \
 do { \
-       inc_preempt_count_notrace(); \
+       __preempt_count_inc(); \
        barrier(); \
 } while (0)
 
 #define preempt_enable_no_resched_notrace() \
 do { \
        barrier(); \
-       dec_preempt_count_notrace(); \
+       __preempt_count_dec(); \
 } while (0)
 
-/* preempt_check_resched is OK to trace */
+#ifdef CONFIG_PREEMPT
+
+#ifndef CONFIG_CONTEXT_TRACKING
+#define __preempt_schedule_context() __preempt_schedule()
+#endif
+
 #define preempt_enable_notrace() \
 do { \
-       preempt_enable_no_resched_notrace(); \
        barrier(); \
-       preempt_check_resched_context(); \
+       if (unlikely(__preempt_count_dec_and_test())) \
+               __preempt_schedule_context(); \
 } while (0)
+#else
+#define preempt_enable_notrace() preempt_enable_no_resched_notrace()
+#endif
 
 #else /* !CONFIG_PREEMPT_COUNT */
 
@@ -115,10 +104,11 @@ do { \
  * that can cause faults and scheduling migrate into our preempt-protected
  * region.
  */
-#define preempt_disable()              barrier()
+#define preempt_disable()                      barrier()
 #define sched_preempt_enable_no_resched()      barrier()
-#define preempt_enable_no_resched()    barrier()
-#define preempt_enable()               barrier()
+#define preempt_enable_no_resched()            barrier()
+#define preempt_enable()                       barrier()
+#define preempt_check_resched()                        do { } while (0)
 
 #define preempt_disable_notrace()              barrier()
 #define preempt_enable_no_resched_notrace()    barrier()
index 4106721c4e5e39d3d0a08d44f5d50909da6c9b86..45a0a9e81478c41187088d6ce3fb9762f070e335 100644 (file)
  * be used anywhere you would want to use a list_empty_rcu().
  */
 
+/*
+ * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
+ * @list: list to be initialized
+ *
+ * You should instead use INIT_LIST_HEAD() for normal initialization and
+ * cleanup tasks, when readers have no access to the list being initialized.
+ * However, if the list being initialized is visible to readers, you
+ * need to keep the compiler from being too mischievous.
+ */
+static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
+{
+       ACCESS_ONCE(list->next) = list;
+       ACCESS_ONCE(list->prev) = list;
+}
+
 /*
  * return the ->next pointer of a list_head in an rcu safe
  * way, we must not access it directly
@@ -191,9 +206,13 @@ static inline void list_splice_init_rcu(struct list_head *list,
        if (list_empty(list))
                return;
 
-       /* "first" and "last" tracking list, so initialize it. */
+       /*
+        * "first" and "last" tracking list, so initialize it.  RCU readers
+        * have access to this list, so we must use INIT_LIST_HEAD_RCU()
+        * instead of INIT_LIST_HEAD().
+        */
 
-       INIT_LIST_HEAD(list);
+       INIT_LIST_HEAD_RCU(list);
 
        /*
         * At this point, the list body still points to the source list.
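
The point of the ACCESS_ONCE() stores is to force each pointer update to be a single, un-split write, since concurrent RCU readers may still be walking the list while it is reinitialized. A userspace sketch of the same idiom (ACCESS_ONCE here is a local re-definition of the kernel macro, for illustration):

#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

/* Read/write through a volatile lvalue: one whole store per pointer. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static void init_list_head_rcu(struct list_head *list)
{
	ACCESS_ONCE(list->next) = list;
	ACCESS_ONCE(list->prev) = list;
}

int main(void)
{
	struct list_head h;

	init_list_head_rcu(&h);
	return (h.next == &h && h.prev == &h) ? 0 : 1;
}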
index f1f1bc39346b85c303c227e3137b78e0e87bdcc8..39cbb889e20da020c8936aae8124601b718e05a5 100644 (file)
@@ -261,6 +261,10 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
                rcu_irq_exit(); \
        } while (0)
 
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
+extern bool __rcu_is_watching(void);
+#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
+
 /*
  * Infrastructure to implement the synchronize_() primitives in
  * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
@@ -297,10 +301,6 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
 }
 #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP)
-extern int rcu_is_cpu_idle(void);
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) */
-
 #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
 bool rcu_lockdep_current_cpu_online(void);
 #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
@@ -351,7 +351,7 @@ static inline int rcu_read_lock_held(void)
 {
        if (!debug_lockdep_rcu_enabled())
                return 1;
-       if (rcu_is_cpu_idle())
+       if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
@@ -402,7 +402,7 @@ static inline int rcu_read_lock_sched_held(void)
 
        if (!debug_lockdep_rcu_enabled())
                return 1;
-       if (rcu_is_cpu_idle())
+       if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
@@ -771,7 +771,7 @@ static inline void rcu_read_lock(void)
        __rcu_read_lock();
        __acquire(RCU);
        rcu_lock_acquire(&rcu_lock_map);
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_lock() used illegally while idle");
 }
 
@@ -792,7 +792,7 @@ static inline void rcu_read_lock(void)
  */
 static inline void rcu_read_unlock(void)
 {
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_unlock() used illegally while idle");
        rcu_lock_release(&rcu_lock_map);
        __release(RCU);
@@ -821,7 +821,7 @@ static inline void rcu_read_lock_bh(void)
        local_bh_disable();
        __acquire(RCU_BH);
        rcu_lock_acquire(&rcu_bh_lock_map);
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_lock_bh() used illegally while idle");
 }
 
@@ -832,7 +832,7 @@ static inline void rcu_read_lock_bh(void)
  */
 static inline void rcu_read_unlock_bh(void)
 {
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_unlock_bh() used illegally while idle");
        rcu_lock_release(&rcu_bh_lock_map);
        __release(RCU_BH);
@@ -857,7 +857,7 @@ static inline void rcu_read_lock_sched(void)
        preempt_disable();
        __acquire(RCU_SCHED);
        rcu_lock_acquire(&rcu_sched_lock_map);
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_lock_sched() used illegally while idle");
 }
 
@@ -875,7 +875,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
  */
 static inline void rcu_read_unlock_sched(void)
 {
-       rcu_lockdep_assert(!rcu_is_cpu_idle(),
+       rcu_lockdep_assert(rcu_is_watching(),
                           "rcu_read_unlock_sched() used illegally while idle");
        rcu_lock_release(&rcu_sched_lock_map);
        __release(RCU_SCHED);
index e31005ee339ebde022bd7dabace245ee79474d9b..09ebcbe9fd780932e2dee223daaa8426395e77d8 100644 (file)
@@ -132,4 +132,21 @@ static inline void rcu_scheduler_starting(void)
 }
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
+
+static inline bool rcu_is_watching(void)
+{
+       return __rcu_is_watching();
+}
+
+#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
+
+static inline bool rcu_is_watching(void)
+{
+       return true;
+}
+
+
+#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
+
 #endif /* __LINUX_RCUTINY_H */
index 226169d1bd2bc07fa465d9c4b4b21cf8131de162..4b9c8154874269d66cc3fa1312bc6e53d2584669 100644 (file)
@@ -90,4 +90,6 @@ extern void exit_rcu(void);
 extern void rcu_scheduler_starting(void);
 extern int rcu_scheduler_active __read_mostly;
 
+extern bool rcu_is_watching(void);
+
 #endif /* __LINUX_RCUTREE_H */
index e27baeeda3f470ed99ae899d8de22da062856bf8..045b0d2278463583e90a57d7e80d4257ecea1b06 100644 (file)
@@ -22,6 +22,7 @@ struct sched_param {
 #include <linux/errno.h>
 #include <linux/nodemask.h>
 #include <linux/mm_types.h>
+#include <linux/preempt.h>
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -427,6 +428,14 @@ struct task_cputime {
                .sum_exec_runtime = 0,                          \
        }
 
+#define PREEMPT_ENABLED                (PREEMPT_NEED_RESCHED)
+
+#ifdef CONFIG_PREEMPT_COUNT
+#define PREEMPT_DISABLED       (1 + PREEMPT_ENABLED)
+#else
+#define PREEMPT_DISABLED       PREEMPT_ENABLED
+#endif
+
 /*
  * Disable preemption until the scheduler is running.
  * Reset by start_kernel()->sched_init()->init_idle().
@@ -434,7 +443,7 @@ struct task_cputime {
  * We include PREEMPT_ACTIVE to avoid cond_resched() from working
  * before the scheduler is active -- see should_resched().
  */
-#define INIT_PREEMPT_COUNT     (1 + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT     (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
@@ -768,6 +777,7 @@ enum cpu_idle_type {
 #define SD_ASYM_PACKING                0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
+#define SD_NUMA                        0x4000  /* cross-node balancing */
 
 extern int __weak arch_sd_sibiling_asym_packing(void);
 
@@ -811,6 +821,10 @@ struct sched_domain {
 
        u64 last_update;
 
+       /* idle_balance() stats */
+       u64 max_newidle_lb_cost;
+       unsigned long next_decay_max_lb_cost;
+
 #ifdef CONFIG_SCHEDSTATS
        /* load_balance() stats */
        unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1029,6 +1043,8 @@ struct task_struct {
        struct task_struct *last_wakee;
        unsigned long wakee_flips;
        unsigned long wakee_flip_decay_ts;
+
+       int wake_cpu;
 #endif
        int on_rq;
 
@@ -1324,10 +1340,41 @@ struct task_struct {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
        int numa_scan_seq;
-       int numa_migrate_seq;
        unsigned int numa_scan_period;
+       unsigned int numa_scan_period_max;
+       int numa_preferred_nid;
+       int numa_migrate_deferred;
+       unsigned long numa_migrate_retry;
        u64 node_stamp;                 /* migration stamp  */
        struct callback_head numa_work;
+
+       struct list_head numa_entry;
+       struct numa_group *numa_group;
+
+       /*
+        * Exponential decaying average of faults on a per-node basis.
+        * Scheduling placement decisions are made based on these counts.
+        * The values remain static for the duration of a PTE scan.
+        */
+       unsigned long *numa_faults;
+       unsigned long total_numa_faults;
+
+       /*
+        * numa_faults_buffer records faults per node during the current
+        * scan window. When the scan completes, the counts in numa_faults
+        * decay and these values are copied.
+        */
+       unsigned long *numa_faults_buffer;
+
+       /*
+        * numa_faults_locality tracks if faults recorded during the last
+        * scan window were remote/local. The task scan period is adapted
+        * based on the locality of the faults with different weights
+        * depending on whether they were shared or private faults
+        */
+       unsigned long numa_faults_locality[2];
+
+       unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
        struct rcu_head rcu;
@@ -1412,16 +1459,33 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#define TNF_MIGRATED   0x01
+#define TNF_NO_GROUP   0x02
+#define TNF_SHARED     0x04
+#define TNF_FAULT_LOCAL        0x08
+
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages, bool migrated);
+extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
+extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
-static inline void task_numa_fault(int node, int pages, bool migrated)
+static inline void task_numa_fault(int last_node, int node, int pages,
+                                  int flags)
 {
 }
+static inline pid_t task_numa_group_id(struct task_struct *p)
+{
+       return 0;
+}
 static inline void set_numabalancing_state(bool enabled)
 {
 }
+static inline void task_numa_free(struct task_struct *p)
+{
+}
 #endif
 
 static inline struct pid *task_pid(struct task_struct *task)
@@ -1974,7 +2038,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
 #else
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
-extern void sched_fork(struct task_struct *p);
+extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
@@ -2401,11 +2465,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
        return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
 }
 
-static inline int need_resched(void)
-{
-       return unlikely(test_thread_flag(TIF_NEED_RESCHED));
-}
-
 /*
  * cond_resched() and cond_resched_lock(): latency reduction via
  * explicit rescheduling in places that are safe. The return
@@ -2474,36 +2533,105 @@ static inline int tsk_is_polling(struct task_struct *p)
 {
        return task_thread_info(p)->status & TS_POLLING;
 }
-static inline void current_set_polling(void)
+static inline void __current_set_polling(void)
 {
        current_thread_info()->status |= TS_POLLING;
 }
 
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+       __current_set_polling();
+
+       /*
+        * Polling state must be visible before we test NEED_RESCHED,
+        * paired by resched_task()
+        */
+       smp_mb();
+
+       return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
 {
        current_thread_info()->status &= ~TS_POLLING;
-       smp_mb__after_clear_bit();
+}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+       __current_clr_polling();
+
+       /*
+        * Polling state must be visible before we test NEED_RESCHED,
+        * paired by resched_task()
+        */
+       smp_mb();
+
+       return unlikely(tif_need_resched());
 }
 #elif defined(TIF_POLLING_NRFLAG)
 static inline int tsk_is_polling(struct task_struct *p)
 {
        return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
 }
-static inline void current_set_polling(void)
+
+static inline void __current_set_polling(void)
 {
        set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+       __current_set_polling();
+
+       /*
+        * Polling state must be visible before we test NEED_RESCHED,
+        * paired by resched_task()
+        *
+        * XXX: assumes set/clear bit are identical barrier wise.
+        */
+       smp_mb__after_clear_bit();
+
+       return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
 {
        clear_thread_flag(TIF_POLLING_NRFLAG);
 }
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+       __current_clr_polling();
+
+       /*
+        * Polling state must be visible before we test NEED_RESCHED,
+        * paired with resched_task()
+        */
+       smp_mb__after_clear_bit();
+
+       return unlikely(tif_need_resched());
+}
+
 #else
 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void current_set_polling(void) { }
-static inline void current_clr_polling(void) { }
+static inline void __current_set_polling(void) { }
+static inline void __current_clr_polling(void) { }
+
+static inline bool __must_check current_set_polling_and_test(void)
+{
+       return unlikely(tif_need_resched());
+}
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+       return unlikely(tif_need_resched());
+}
 #endif
 
+static __always_inline bool need_resched(void)
+{
+       return unlikely(tif_need_resched());
+}
+
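
Folding the NEED_RESCHED test into the polling-state update closes the
window in which an idle CPU could flip TS_POLLING/TIF_POLLING_NRFLAG and
still miss a concurrent wakeup. A condensed sketch of the intended
idle-loop call pattern, mirroring the cpu_idle_loop() changes later in
this merge (arch_cpu_idle() is the architecture idle hook):

	static void idle_sketch(void)
	{
		__current_set_polling();
		while (1) {
			/*
			 * Clear polling and test NEED_RESCHED as one
			 * step: a concurrent resched_task() either saw
			 * us polling and skipped the IPI, or we observe
			 * its NEED_RESCHED here and skip idle entry.
			 */
			if (!current_clr_polling_and_test())
				arch_cpu_idle();  /* until an interrupt */
			else
				local_irq_enable();
			__current_set_polling();
			schedule_preempt_disabled();
		}
	}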
 /*
  * Thread group CPU time accounting.
  */
@@ -2545,6 +2673,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
        return task_thread_info(p)->cpu;
 }
 
+static inline int task_node(const struct task_struct *p)
+{
+       return cpu_to_node(task_cpu(p));
+}
+
 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
 
 #else
index bf8086b2506e807d9b3c97679f0917d8d5944a96..41467f8ff8ec8c7c5766021abe00e358f63e93cc 100644 (file)
@@ -2,8 +2,8 @@
 #define _SCHED_SYSCTL_H
 
 #ifdef CONFIG_DETECT_HUNG_TASK
+extern int          sysctl_hung_task_check_count;
 extern unsigned int  sysctl_hung_task_panic;
-extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
 extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
@@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
-extern unsigned int sysctl_numa_balancing_scan_period_reset;
 extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_settle_count;
 
index fa7922c80a4190afd88476d7af3d688b98762e41..cddf0c2940b6f2c3564f889326a2bbd08144ef1a 100644 (file)
@@ -15,7 +15,7 @@ static inline void sched_clock_postinit(void) { }
 #endif
 
 extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate);
-
-extern unsigned long long (*sched_clock_func)(void);
+extern void sched_clock_register(u64 (*read)(void), int bits,
+                                unsigned long rate);
 
 #endif
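
sched_clock_register() is the 64-bit successor to setup_sched_clock(),
taking a u64 read hook plus the counter width and rate. A hedged sketch
of a clocksource driver registering a counter (my_timer_base,
my_timer_read() and the 32-bit/1 MHz figures are illustrative):

	static void __iomem *my_timer_base;	/* hypothetical mapping */

	static u64 notrace my_timer_read(void)
	{
		/* Low 32 bits of a free-running up-counter. */
		return readl(my_timer_base);
	}

	static void __init my_timer_init(void)
	{
		/* 32 valid counter bits, ticking at 1 MHz. */
		sched_clock_register(my_timer_read, 32, 1000000);
	}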
index fe817918b30e49ff96648394d99d8fc85eebaac8..d9b436f099258c99b761745fdc35a29910b6dfe5 100644 (file)
@@ -59,6 +59,9 @@
 #ifndef _LINUX_SFI_H
 #define _LINUX_SFI_H
 
+#include <linux/init.h>
+#include <linux/types.h>
+
 /* Table signatures reserved by the SFI specification */
 #define SFI_SIG_SYST           "SYST"
 #define SFI_SIG_FREQ           "FREQ"
index 3b5e910d14ca47561ca8c2656b1f91cfb696ac45..d2abbdb8c6aaa336fdd5cb4f38fc53b43dc35268 100644 (file)
@@ -28,6 +28,7 @@ struct cpu_stop_work {
 };
 
 int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
 void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                         struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
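
stop_two_cpus() runs one function with both named CPUs stopped at the
same time; in this merge it underpins the NUMA task-swap migration path.
A sketch of a hypothetical caller (swap_fn and its empty body are
illustrative, not the scheduler's real callback):

	static int swap_fn(void *arg)
	{
		/*
		 * Runs with both CPUs stopped; per-CPU state is safe
		 * to exchange here.
		 */
		return 0;
	}

	static int try_pair_stop(unsigned int src_cpu, unsigned int dst_cpu)
	{
		/* The callback's return value is propagated back. */
		return stop_two_cpus(src_cpu, dst_cpu, swap_fn, NULL);
	}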
index e7e04736802f60eca7847b5241b8c217c4b7a732..fddbe2023a5d568717b3b90eabc692bb4612f9bd 100644 (file)
@@ -104,8 +104,21 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
 
-#define set_need_resched()     set_thread_flag(TIF_NEED_RESCHED)
-#define clear_need_resched()   clear_thread_flag(TIF_NEED_RESCHED)
+static inline __deprecated void set_need_resched(void)
+{
+       /*
+        * Use of this function is deprecated.
+        *
+        * As of this writing there are only a few users left in the DRM
+        * tree, all of which are wrong and can be removed without causing
+        * too much grief.
+        *
+        * The DRM people are aware and are working on removing the last few
+        * instances.
+        */
+}
+
+#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
 
 #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
 /*
index d3cf0d6e7712c115fb529559b59c2ae4f6f8cd9c..12ae6ce997d6be9f658fb2956a957966267028b5 100644 (file)
@@ -106,6 +106,8 @@ int arch_update_cpu_topology(void);
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
        .smt_gain               = 1178, /* 15% */                       \
+       .max_newidle_lb_cost    = 0,                                    \
+       .next_decay_max_lb_cost = jiffies,                              \
 }
 #endif
 #endif /* CONFIG_SCHED_SMT */
@@ -135,6 +137,8 @@ int arch_update_cpu_topology(void);
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
+       .max_newidle_lb_cost    = 0,                                    \
+       .next_decay_max_lb_cost = jiffies,                              \
 }
 #endif
 #endif /* CONFIG_SCHED_MC */
@@ -166,6 +170,8 @@ int arch_update_cpu_topology(void);
                                ,                                       \
        .last_balance           = jiffies,                              \
        .balance_interval       = 1,                                    \
+       .max_newidle_lb_cost    = 0,                                    \
+       .next_decay_max_lb_cost = jiffies,                              \
 }
 #endif
 
index 64f864651d8696aa5cf8a93b931c78c06a23bfdc..633cac77f9f97b56155bf5a005aa2899591425ac 100644 (file)
@@ -672,31 +672,17 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty,
 #define wait_event_interruptible_tty(tty, wq, condition)               \
 ({                                                                     \
        int __ret = 0;                                                  \
-       if (!(condition)) {                                             \
-               __wait_event_interruptible_tty(tty, wq, condition, __ret);      \
-       }                                                               \
+       if (!(condition))                                               \
+               __ret = __wait_event_interruptible_tty(tty, wq,         \
+                                                      condition);      \
        __ret;                                                          \
 })
 
-#define __wait_event_interruptible_tty(tty, wq, condition, ret)                \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
-               if (condition)                                          \
-                       break;                                          \
-               if (!signal_pending(current)) {                         \
-                       tty_unlock(tty);                                        \
+#define __wait_event_interruptible_tty(tty, wq, condition)             \
+       ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,          \
+                       tty_unlock(tty);                                \
                        schedule();                                     \
-                       tty_lock(tty);                                  \
-                       continue;                                       \
-               }                                                       \
-               ret = -ERESTARTSYS;                                     \
-               break;                                                  \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+                       tty_lock(tty))
 
 #ifdef CONFIG_PROC_FS
 extern void proc_tty_register_driver(struct tty_driver *);
index 5ca0951e1855b103ff65337cef6f94449f34d6c5..9d8cf056e661fab2c7389a78f0c5ac1248bcfe62 100644 (file)
@@ -15,7 +15,7 @@
  */
 static inline void pagefault_disable(void)
 {
-       inc_preempt_count();
+       preempt_count_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
@@ -30,11 +30,7 @@ static inline void pagefault_enable(void)
         * the pagefault handler again.
         */
        barrier();
-       dec_preempt_count();
-       /*
-        * make sure we do..
-        */
-       barrier();
+       preempt_count_dec();
        preempt_check_resched();
 }
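
pagefault_disable()/pagefault_enable() now use the common
preempt_count_inc()/preempt_count_dec() helpers; the second barrier
after the decrement was redundant and is dropped. The pattern these
calls enable, probing user memory from a context that must not sleep,
looks roughly like this (a sketch, not a particular in-tree user):

	static int probe_user_word(const void __user *uaddr,
				   unsigned long *val)
	{
		int ret;

		pagefault_disable();
		/* Faults fail fast instead of sleeping in the handler. */
		ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
		pagefault_enable();

		return ret ? -EFAULT : 0;
	}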
 
index d9eff54c69cfc1ba5696e205e8122e072019eb1c..7f8caa519128d6bc2432161f05c53eebaa548065 100644 (file)
@@ -1,7 +1,8 @@
 #ifndef _LINUX_WAIT_H
 #define _LINUX_WAIT_H
-
-
+/*
+ * Linux wait queue related types and methods
+ */
 #include <linux/list.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
@@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
 int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
 
 struct __wait_queue {
-       unsigned int flags;
+       unsigned int            flags;
 #define WQ_FLAG_EXCLUSIVE      0x01
-       void *private;
-       wait_queue_func_t func;
-       struct list_head task_list;
+       void                    *private;
+       wait_queue_func_t       func;
+       struct list_head        task_list;
 };
 
 struct wait_bit_key {
-       void *flags;
-       int bit_nr;
-#define WAIT_ATOMIC_T_BIT_NR -1
+       void                    *flags;
+       int                     bit_nr;
+#define WAIT_ATOMIC_T_BIT_NR   -1
 };
 
 struct wait_bit_queue {
-       struct wait_bit_key key;
-       wait_queue_t wait;
+       struct wait_bit_key     key;
+       wait_queue_t            wait;
 };
 
 struct __wait_queue_head {
-       spinlock_t lock;
-       struct list_head task_list;
+       spinlock_t              lock;
+       struct list_head        task_list;
 };
 typedef struct __wait_queue_head wait_queue_head_t;
 
@@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
 
 static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
 {
-       q->flags = 0;
-       q->private = p;
-       q->func = default_wake_function;
+       q->flags        = 0;
+       q->private      = p;
+       q->func         = default_wake_function;
 }
 
-static inline void init_waitqueue_func_entry(wait_queue_t *q,
-                                       wait_queue_func_t func)
+static inline void
+init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
 {
-       q->flags = 0;
-       q->private = NULL;
-       q->func = func;
+       q->flags        = 0;
+       q->private      = NULL;
+       q->func         = func;
 }
 
 static inline int waitqueue_active(wait_queue_head_t *q)
@@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 /*
  * Used for wake-one threads:
  */
-static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
-                                             wait_queue_t *wait)
+static inline void
+__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
 {
        wait->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(q, wait);
@@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head,
        list_add_tail(&new->task_list, &head->task_list);
 }
 
-static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
-                                             wait_queue_t *wait)
+static inline void
+__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
 {
        wait->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_tail(q, wait);
 }
 
-static inline void __remove_wait_queue(wait_queue_head_t *head,
-                                                       wait_queue_t *old)
+static inline void
+__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
 {
        list_del(&old->task_list);
 }
 
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
-                       void *key);
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_bit(wait_queue_head_t *, void *, int);
@@ -170,27 +170,64 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 /*
  * Wakeup macros to be used to report events to the targets.
  */
-#define wake_up_poll(x, m)                             \
+#define wake_up_poll(x, m)                                             \
        __wake_up(x, TASK_NORMAL, 1, (void *) (m))
-#define wake_up_locked_poll(x, m)                              \
+#define wake_up_locked_poll(x, m)                                      \
        __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
-#define wake_up_interruptible_poll(x, m)                       \
+#define wake_up_interruptible_poll(x, m)                               \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
 #define wake_up_interruptible_sync_poll(x, m)                          \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
 
-#define __wait_event(wq, condition)                                    \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
+#define ___wait_cond_timeout(condition)                                        \
+({                                                                     \
+       bool __cond = (condition);                                      \
+       if (__cond && !__ret)                                           \
+               __ret = 1;                                              \
+       __cond || !__ret;                                               \
+})
+
+#define ___wait_is_interruptible(state)                                        \
+       (!__builtin_constant_p(state) ||                                \
+               state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)
+
+#define ___wait_event(wq, condition, state, exclusive, ret, cmd)       \
+({                                                                     \
+       __label__ __out;                                                \
+       wait_queue_t __wait;                                            \
+       long __ret = ret;                                               \
+                                                                       \
+       INIT_LIST_HEAD(&__wait.task_list);                              \
+       if (exclusive)                                                  \
+               __wait.flags = WQ_FLAG_EXCLUSIVE;                       \
+       else                                                            \
+               __wait.flags = 0;                                       \
                                                                        \
        for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
+               long __int = prepare_to_wait_event(&wq, &__wait, state);\
+                                                                       \
                if (condition)                                          \
                        break;                                          \
-               schedule();                                             \
+                                                                       \
+               if (___wait_is_interruptible(state) && __int) {         \
+                       __ret = __int;                                  \
+                       if (exclusive) {                                \
+                               abort_exclusive_wait(&wq, &__wait,      \
+                                                    state, NULL);      \
+                               goto __out;                             \
+                       }                                               \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               cmd;                                                    \
        }                                                               \
        finish_wait(&wq, &__wait);                                      \
-} while (0)
+__out: __ret;                                                          \
+})
+
+#define __wait_event(wq, condition)                                    \
+       (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+                           schedule())
 
 /**
  * wait_event - sleep until a condition gets true
@@ -204,29 +241,17 @@ do {                                                                      \
  * wake_up() has to be called after changing any variable that could
  * change the result of the wait condition.
  */
-#define wait_event(wq, condition)                                      \
+#define wait_event(wq, condition)                                      \
 do {                                                                   \
-       if (condition)                                                  \
+       if (condition)                                                  \
                break;                                                  \
        __wait_event(wq, condition);                                    \
 } while (0)
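
With the common ___wait_event() core in place, each wait_event*()
flavor above reduces to a thin wrapper and the caller-side contract is
unchanged. A minimal producer/consumer sketch (my_wq and my_flag are
illustrative):

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
	static int my_flag;

	/* Consumer: sleeps uninterruptibly until my_flag is set. */
	static void consumer(void)
	{
		wait_event(my_wq, my_flag != 0);
	}

	/* Producer: update the condition before waking waiters. */
	static void producer(void)
	{
		my_flag = 1;
		wake_up(&my_wq);
	}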
 
-#define __wait_event_timeout(wq, condition, ret)                       \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
-               if (condition)                                          \
-                       break;                                          \
-               ret = schedule_timeout(ret);                            \
-               if (!ret)                                               \
-                       break;                                          \
-       }                                                               \
-       if (!ret && (condition))                                        \
-               ret = 1;                                                \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+#define __wait_event_timeout(wq, condition, timeout)                   \
+       ___wait_event(wq, ___wait_cond_timeout(condition),              \
+                     TASK_UNINTERRUPTIBLE, 0, timeout,                 \
+                     __ret = schedule_timeout(__ret))
 
 /**
  * wait_event_timeout - sleep until a condition gets true or a timeout elapses
@@ -248,8 +273,8 @@ do {                                                                        \
 #define wait_event_timeout(wq, condition, timeout)                     \
 ({                                                                     \
        long __ret = timeout;                                           \
-       if (!(condition))                                               \
-               __wait_event_timeout(wq, condition, __ret);             \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __wait_event_timeout(wq, condition, timeout);   \
        __ret;                                                          \
 })
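
___wait_cond_timeout() is what gives wait_event_timeout() its return
convention: 0 when the timeout elapsed with the condition still false,
at least 1 when the condition turned true (even right at expiry), and
the remaining jiffies otherwise. A sketch of a caller checking it,
reusing my_wq/my_flag from the sketch above:

	static int wait_for_flag(void)
	{
		long left = wait_event_timeout(my_wq, my_flag != 0, HZ);

		if (!left)
			return -ETIMEDOUT; /* still false after ~1s */
		return 0;		   /* condition true; left >= 1 */
	}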
 
@@ -289,23 +314,9 @@ do {                                                                       \
        __wait_event_cmd(wq, condition, cmd1, cmd2);                    \
 } while (0)
 
-#define __wait_event_interruptible(wq, condition, ret)                 \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
-               if (condition)                                          \
-                       break;                                          \
-               if (!signal_pending(current)) {                         \
-                       schedule();                                     \
-                       continue;                                       \
-               }                                                       \
-               ret = -ERESTARTSYS;                                     \
-               break;                                                  \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+#define __wait_event_interruptible(wq, condition)                      \
+       ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,          \
+                     schedule())
 
 /**
  * wait_event_interruptible - sleep until a condition gets true
@@ -326,31 +337,14 @@ do {                                                                      \
 ({                                                                     \
        int __ret = 0;                                                  \
        if (!(condition))                                               \
-               __wait_event_interruptible(wq, condition, __ret);       \
+               __ret = __wait_event_interruptible(wq, condition);      \
        __ret;                                                          \
 })
 
-#define __wait_event_interruptible_timeout(wq, condition, ret)         \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
-               if (condition)                                          \
-                       break;                                          \
-               if (!signal_pending(current)) {                         \
-                       ret = schedule_timeout(ret);                    \
-                       if (!ret)                                       \
-                               break;                                  \
-                       continue;                                       \
-               }                                                       \
-               ret = -ERESTARTSYS;                                     \
-               break;                                                  \
-       }                                                               \
-       if (!ret && (condition))                                        \
-               ret = 1;                                                \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+#define __wait_event_interruptible_timeout(wq, condition, timeout)     \
+       ___wait_event(wq, ___wait_cond_timeout(condition),              \
+                     TASK_INTERRUPTIBLE, 0, timeout,                   \
+                     __ret = schedule_timeout(__ret))
 
 /**
  * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
@@ -373,15 +367,15 @@ do {                                                                      \
 #define wait_event_interruptible_timeout(wq, condition, timeout)       \
 ({                                                                     \
        long __ret = timeout;                                           \
-       if (!(condition))                                               \
-               __wait_event_interruptible_timeout(wq, condition, __ret); \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __wait_event_interruptible_timeout(wq,          \
+                                               condition, timeout);    \
        __ret;                                                          \
 })
 
 #define __wait_event_hrtimeout(wq, condition, timeout, state)          \
 ({                                                                     \
        int __ret = 0;                                                  \
-       DEFINE_WAIT(__wait);                                            \
        struct hrtimer_sleeper __t;                                     \
                                                                        \
        hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC,              \
@@ -392,25 +386,15 @@ do {                                                                      \
                                       current->timer_slack_ns,         \
                                       HRTIMER_MODE_REL);               \
                                                                        \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, state);                   \
-               if (condition)                                          \
-                       break;                                          \
-               if (state == TASK_INTERRUPTIBLE &&                      \
-                   signal_pending(current)) {                          \
-                       __ret = -ERESTARTSYS;                           \
-                       break;                                          \
-               }                                                       \
+       __ret = ___wait_event(wq, condition, state, 0, 0,               \
                if (!__t.task) {                                        \
                        __ret = -ETIME;                                 \
                        break;                                          \
                }                                                       \
-               schedule();                                             \
-       }                                                               \
+               schedule());                                            \
                                                                        \
        hrtimer_cancel(&__t.timer);                                     \
        destroy_hrtimer_on_stack(&__t.timer);                           \
-       finish_wait(&wq, &__wait);                                      \
        __ret;                                                          \
 })
 
@@ -464,33 +448,15 @@ do {                                                                      \
        __ret;                                                          \
 })
 
-#define __wait_event_interruptible_exclusive(wq, condition, ret)       \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait_exclusive(&wq, &__wait,                 \
-                                       TASK_INTERRUPTIBLE);            \
-               if (condition) {                                        \
-                       finish_wait(&wq, &__wait);                      \
-                       break;                                          \
-               }                                                       \
-               if (!signal_pending(current)) {                         \
-                       schedule();                                     \
-                       continue;                                       \
-               }                                                       \
-               ret = -ERESTARTSYS;                                     \
-               abort_exclusive_wait(&wq, &__wait,                      \
-                               TASK_INTERRUPTIBLE, NULL);              \
-               break;                                                  \
-       }                                                               \
-} while (0)
+#define __wait_event_interruptible_exclusive(wq, condition)            \
+       ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,          \
+                     schedule())
 
 #define wait_event_interruptible_exclusive(wq, condition)              \
 ({                                                                     \
        int __ret = 0;                                                  \
        if (!(condition))                                               \
-               __wait_event_interruptible_exclusive(wq, condition, __ret);\
+               __ret = __wait_event_interruptible_exclusive(wq, condition);\
        __ret;                                                          \
 })
 
@@ -642,24 +608,8 @@ do {                                                                       \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1))
 
 
-
-#define __wait_event_killable(wq, condition, ret)                      \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_KILLABLE);           \
-               if (condition)                                          \
-                       break;                                          \
-               if (!fatal_signal_pending(current)) {                   \
-                       schedule();                                     \
-                       continue;                                       \
-               }                                                       \
-               ret = -ERESTARTSYS;                                     \
-               break;                                                  \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+#define __wait_event_killable(wq, condition)                           \
+       ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
 
 /**
  * wait_event_killable - sleep until a condition gets true
@@ -680,26 +630,17 @@ do {                                                                      \
 ({                                                                     \
        int __ret = 0;                                                  \
        if (!(condition))                                               \
-               __wait_event_killable(wq, condition, __ret);            \
+               __ret = __wait_event_killable(wq, condition);           \
        __ret;                                                          \
 })
 
 
 #define __wait_event_lock_irq(wq, condition, lock, cmd)                        \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
-               if (condition)                                          \
-                       break;                                          \
-               spin_unlock_irq(&lock);                                 \
-               cmd;                                                    \
-               schedule();                                             \
-               spin_lock_irq(&lock);                                   \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+       (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+                           spin_unlock_irq(&lock);                     \
+                           cmd;                                        \
+                           schedule();                                 \
+                           spin_lock_irq(&lock))
 
 /**
  * wait_event_lock_irq_cmd - sleep until a condition gets true. The
@@ -759,26 +700,12 @@ do {                                                                      \
 } while (0)
 
 
-#define __wait_event_interruptible_lock_irq(wq, condition,             \
-                                           lock, ret, cmd)             \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
-               if (condition)                                          \
-                       break;                                          \
-               if (signal_pending(current)) {                          \
-                       ret = -ERESTARTSYS;                             \
-                       break;                                          \
-               }                                                       \
-               spin_unlock_irq(&lock);                                 \
-               cmd;                                                    \
-               schedule();                                             \
-               spin_lock_irq(&lock);                                   \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd)  \
+       ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,          \
+                     spin_unlock_irq(&lock);                           \
+                     cmd;                                              \
+                     schedule();                                       \
+                     spin_lock_irq(&lock))
 
 /**
  * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
@@ -808,10 +735,9 @@ do {                                                                       \
 #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd)        \
 ({                                                                     \
        int __ret = 0;                                                  \
-                                                                       \
        if (!(condition))                                               \
-               __wait_event_interruptible_lock_irq(wq, condition,      \
-                                                   lock, __ret, cmd);  \
+               __ret = __wait_event_interruptible_lock_irq(wq,         \
+                                               condition, lock, cmd);  \
        __ret;                                                          \
 })
 
@@ -840,39 +766,24 @@ do {                                                                      \
 #define wait_event_interruptible_lock_irq(wq, condition, lock)         \
 ({                                                                     \
        int __ret = 0;                                                  \
-                                                                       \
        if (!(condition))                                               \
-               __wait_event_interruptible_lock_irq(wq, condition,      \
-                                                   lock, __ret, );     \
+               __ret = __wait_event_interruptible_lock_irq(wq,         \
+                                               condition, lock,);     \
        __ret;                                                          \
 })
 
 #define __wait_event_interruptible_lock_irq_timeout(wq, condition,     \
-                                                   lock, ret)          \
-do {                                                                   \
-       DEFINE_WAIT(__wait);                                            \
-                                                                       \
-       for (;;) {                                                      \
-               prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
-               if (condition)                                          \
-                       break;                                          \
-               if (signal_pending(current)) {                          \
-                       ret = -ERESTARTSYS;                             \
-                       break;                                          \
-               }                                                       \
-               spin_unlock_irq(&lock);                                 \
-               ret = schedule_timeout(ret);                            \
-               spin_lock_irq(&lock);                                   \
-               if (!ret)                                               \
-                       break;                                          \
-       }                                                               \
-       finish_wait(&wq, &__wait);                                      \
-} while (0)
+                                                   lock, timeout)      \
+       ___wait_event(wq, ___wait_cond_timeout(condition),              \
+                     TASK_INTERRUPTIBLE, 0, timeout,                   \
+                     spin_unlock_irq(&lock);                           \
+                     __ret = schedule_timeout(__ret);                  \
+                     spin_lock_irq(&lock));
 
 /**
- * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses.
- *             The condition is checked under the lock. This is expected
- *             to be called with the lock taken.
+ * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
+ *             true or a timeout elapses. The condition is checked under
+ *             the lock. This is expected to be called with the lock taken.
  * @wq: the waitqueue to wait on
  * @condition: a C expression for the event to wait for
  * @lock: a locked spinlock_t, which will be released before schedule()
@@ -896,11 +807,10 @@ do {                                                                      \
 #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \
                                                  timeout)              \
 ({                                                                     \
-       int __ret = timeout;                                            \
-                                                                       \
-       if (!(condition))                                               \
-               __wait_event_interruptible_lock_irq_timeout(            \
-                                       wq, condition, lock, __ret);    \
+       long __ret = timeout;                                           \
+       if (!___wait_cond_timeout(condition))                           \
+               __ret = __wait_event_interruptible_lock_irq_timeout(    \
+                                       wq, condition, lock, timeout);  \
        __ret;                                                          \
 })
 
@@ -911,20 +821,18 @@ do {                                                                      \
  * We plan to remove these interfaces.
  */
 extern void sleep_on(wait_queue_head_t *q);
-extern long sleep_on_timeout(wait_queue_head_t *q,
-                                     signed long timeout);
+extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
 extern void interruptible_sleep_on(wait_queue_head_t *q);
-extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
-                                          signed long timeout);
+extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout);
 
 /*
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
 void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
-                       unsigned int mode, void *key);
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
@@ -970,8 +878,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
  * One uses wait_on_bit() where one is waiting for the bit to clear,
  * but has no intention of setting it.
  */
-static inline int wait_on_bit(void *word, int bit,
-                               int (*action)(void *), unsigned mode)
+static inline int
+wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
 {
        if (!test_bit(bit, word))
                return 0;
@@ -994,8 +902,8 @@ static inline int wait_on_bit(void *word, int bit,
  * One uses wait_on_bit_lock() where one is waiting for the bit to
  * clear with the intention of setting it, and when done, clearing it.
  */
-static inline int wait_on_bit_lock(void *word, int bit,
-                               int (*action)(void *), unsigned mode)
+static inline int
+wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode)
 {
        if (!test_and_set_bit(bit, word))
                return 0;
@@ -1019,5 +927,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
                return 0;
        return out_of_line_wait_on_atomic_t(val, action, mode);
 }
-       
-#endif
+
+#endif /* _LINUX_WAIT_H */
index ee2376cfaab3a668d01374844c6757d07e7e1be8..aca382266411620f6bd7b052311cee6d6b04e8e7 100644 (file)
@@ -39,15 +39,26 @@ TRACE_EVENT(rcu_utilization,
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 
 /*
- * Tracepoint for grace-period events: starting and ending a grace
- * period ("start" and "end", respectively), a CPU noting the start
- * of a new grace period or the end of an old grace period ("cpustart"
- * and "cpuend", respectively), a CPU passing through a quiescent
- * state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
- * and "cpuofl", respectively), a CPU being kicked for being too
- * long in dyntick-idle mode ("kick"), a CPU accelerating its new
- * callbacks to RCU_NEXT_READY_TAIL ("AccReadyCB"), and a CPU
- * accelerating its new callbacks to RCU_WAIT_TAIL ("AccWaitCB").
+ * Tracepoint for grace-period events.  Takes a string identifying the
+ * RCU flavor, the grace-period number, and a string identifying the
+ * grace-period-related event as follows:
+ *
+ *     "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL.
+ *     "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL.
+ *     "newreq": Request a new grace period.
+ *     "start": Start a grace period.
+ *     "cpustart": CPU first notices a grace-period start.
+ *     "cpuqs": CPU passes through a quiescent state.
+ *     "cpuonl": CPU comes online.
+ *     "cpuofl": CPU goes offline.
+ *     "reqwait": GP kthread sleeps waiting for grace-period request.
+ *     "reqwaitsig": GP kthread awakened by signal from reqwait state.
+ *     "fqswait": GP kthread waiting until time to force quiescent states.
+ *     "fqsstart": GP kthread starts forcing quiescent states.
+ *     "fqsend": GP kthread done forcing quiescent states.
+ *     "fqswaitsig": GP kthread awakened by signal from fqswait state.
+ *     "end": End a grace period.
+ *     "cpuend": CPU first notices a grace-period end.
  */
 TRACE_EVENT(rcu_grace_period,
 
@@ -160,6 +171,46 @@ TRACE_EVENT(rcu_grace_period_init,
                  __entry->grplo, __entry->grphi, __entry->qsmask)
 );
 
+/*
+ * Tracepoint for RCU no-CBs CPU callback handoffs.  This event is intended
+ * to assist debugging of these handoffs.
+ *
+ * The first argument is the name of the RCU flavor, and the second is
+ * the number of the offloaded CPU.  The third and final argument is a
+ * string as follows:
+ *
+ *     "WakeEmpty": Wake rcuo kthread, first CB to empty list.
+ *     "WakeOvf": Wake rcuo kthread, CB list is huge.
+ *     "WakeNot": Don't wake rcuo kthread.
+ *     "WakeNotPoll": Don't wake rcuo kthread because it is polling.
+ *     "Poll": Start of new polling cycle for rcu_nocb_poll.
+ *     "Sleep": Sleep waiting for CBs for !rcu_nocb_poll.
+ *     "WokeEmpty": rcuo kthread woke to find empty list.
+ *     "WokeNonEmpty": rcuo kthread woke to find non-empty list.
+ *     "WaitQueue": Enqueue partially done, timed wait for it to complete.
+ *     "WokeQueue": Partial enqueue now complete.
+ */
+TRACE_EVENT(rcu_nocb_wake,
+
+       TP_PROTO(const char *rcuname, int cpu, const char *reason),
+
+       TP_ARGS(rcuname, cpu, reason),
+
+       TP_STRUCT__entry(
+               __field(const char *, rcuname)
+               __field(int, cpu)
+               __field(const char *, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->rcuname = rcuname;
+               __entry->cpu = cpu;
+               __entry->reason = reason;
+       ),
+
+       TP_printk("%s %d %s", __entry->rcuname, __entry->cpu, __entry->reason)
+);
+
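
Callers fire this tracepoint with one of the reason strings listed
above; in-tree call sites wrap the literal in TPS() so it survives
trace-buffer export. A sketch of the call shape, with rdp standing for
the per-CPU rcu_data:

	/* In the no-CBs enqueue path, after kicking the rcuo kthread: */
	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));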
 /*
  * Tracepoint for tasks blocking within preemptible-RCU read-side
  * critical sections.  Track the type of RCU (which one day might
@@ -540,17 +591,17 @@ TRACE_EVENT(rcu_invoke_kfree_callback,
 TRACE_EVENT(rcu_batch_end,
 
        TP_PROTO(const char *rcuname, int callbacks_invoked,
-                bool cb, bool nr, bool iit, bool risk),
+                char cb, char nr, char iit, char risk),
 
        TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk),
 
        TP_STRUCT__entry(
                __field(const char *, rcuname)
                __field(int, callbacks_invoked)
-               __field(bool, cb)
-               __field(bool, nr)
-               __field(bool, iit)
-               __field(bool, risk)
+               __field(char, cb)
+               __field(char, nr)
+               __field(char, iit)
+               __field(char, risk)
        ),
 
        TP_fast_assign(
@@ -656,6 +707,7 @@ TRACE_EVENT(rcu_barrier,
 #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
                                      level, grplo, grphi, event) \
                                      do { } while (0)
+#define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0)
 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
index 2e7d9947a10d3f62bfe12e450375625efae913df..613381bcde40a8c68a0917cd19d36d6a603c9e2a 100644 (file)
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
        /*
         * For all intents and purposes a preempted task is a running task.
         */
-       if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
+       if (task_preempt_count(p) & PREEMPT_ACTIVE)
                state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
index 009a655a5d354c51e20fb34cb12ca653e94f9b71..da48837d617d0301609b9eca0c08687c1b27b1e1 100644 (file)
@@ -136,8 +136,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_WEIGHT                      = 1U << 14,
        PERF_SAMPLE_DATA_SRC                    = 1U << 15,
        PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
+       PERF_SAMPLE_TRANSACTION                 = 1U << 17,
 
-       PERF_SAMPLE_MAX = 1U << 17,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 18,             /* non-ABI */
 };
 
 /*
@@ -180,6 +181,28 @@ enum perf_sample_regs_abi {
        PERF_SAMPLE_REGS_ABI_64         = 2,
 };
 
+/*
+ * Values for the memory transaction event qualifier, mostly for
+ * abort events. Multiple bits can be set.
+ */
+enum {
+       PERF_TXN_ELISION        = (1 << 0), /* From elision */
+       PERF_TXN_TRANSACTION    = (1 << 1), /* From transaction */
+       PERF_TXN_SYNC           = (1 << 2), /* Instruction is related */
+       PERF_TXN_ASYNC          = (1 << 3), /* Instruction not related */
+       PERF_TXN_RETRY          = (1 << 4), /* Retry possible */
+       PERF_TXN_CONFLICT       = (1 << 5), /* Conflict abort */
+       PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */
+       PERF_TXN_CAPACITY_READ  = (1 << 7), /* Capacity read abort */
+
+       PERF_TXN_MAX            = (1 << 8), /* non-ABI */
+
+       /* bits 32..63 are reserved for the abort code */
+
+       PERF_TXN_ABORT_MASK  = (0xffffffffULL << 32),
+       PERF_TXN_ABORT_SHIFT = 32,
+};
+
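
A tool consuming PERF_SAMPLE_TRANSACTION decodes the low bits as flags
and the high 32 bits as the architecture-specific abort code. A
user-space sketch of pulling both apart:

	#include <stdio.h>
	#include <linux/perf_event.h>

	static void decode_txn(unsigned long long txn)
	{
		unsigned int code =
			(txn & PERF_TXN_ABORT_MASK) >> PERF_TXN_ABORT_SHIFT;

		if (txn & PERF_TXN_ELISION)
			printf("abort came from lock elision\n");
		if (txn & PERF_TXN_RETRY)
			printf("retry possible\n");
		printf("raw abort code: %u\n", code);
	}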
 /*
  * The format of the data returned by read() on a perf event fd,
  * as specified by attr.read_format:
index 4d09f69eb76648a5f2495623acad64747ae9c023..6ba4329410a32bbafd54167a6965cdf19e7cba01 100644 (file)
@@ -354,7 +354,8 @@ config VIRT_CPU_ACCOUNTING_NATIVE
 
 config VIRT_CPU_ACCOUNTING_GEN
        bool "Full dynticks CPU time accounting"
-       depends on HAVE_CONTEXT_TRACKING && 64BIT
+       depends on HAVE_CONTEXT_TRACKING
+       depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
        select VIRT_CPU_ACCOUNTING
        select CONTEXT_TRACKING
        help
index edee99f735746f4bb3b6a8fb17794919469f4888..dad19cf42dbaa0f088516355e3a359d99821d249 100644 (file)
@@ -700,7 +700,7 @@ int __init_or_module do_one_initcall(initcall_t fn)
 
        if (preempt_count() != count) {
                sprintf(msgbuf, "preemption imbalance ");
-               preempt_count() = count;
+               preempt_count_set(count);
        }
        if (irqs_disabled()) {
                strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
index 63136989c132e6095320547098e157878fd1942a..9a52eb5bf68980b8860014655de49018f9d00c49 100644 (file)
@@ -6,9 +6,9 @@ obj-y     = fork.o exec_domain.o panic.o \
            cpu.o exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
-           rcupdate.o extable.o params.o posix-timers.o \
+           extable.o params.o posix-timers.o \
            kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
-           hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
+           hrtimer.o rwsem.o nsproxy.o semaphore.o \
            notifier.o ksysfs.o cred.o reboot.o \
            async.o range.o groups.o lglock.o smpboot.o
 
@@ -27,6 +27,7 @@ obj-y += power/
 obj-y += printk/
 obj-y += cpu/
 obj-y += irq/
+obj-y += rcu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -82,12 +83,6 @@ obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_TINY_RCU) += rcutiny.o
-obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
index 0c9b862292b26dc0494f771f96532fd8995c2b3b..e8ca97b5c386dc8a7e5374ab0aae48091fc30b6d 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
+#include <linux/log2.h>
 
 void foo(void)
 {
@@ -17,5 +18,8 @@ void foo(void)
        DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
        DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
        DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+       DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
        /* End of constants */
 }
index 859c8dfd78a1a5b296dd5c6c23d741e7a4c1e8cd..e5f3917aa05b724f3367e4fad474534eb34138c5 100644 (file)
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-void __sched notrace preempt_schedule_context(void)
+asmlinkage void __sched notrace preempt_schedule_context(void)
 {
        enum ctx_state prev_ctx;
 
index d7f07a2da5a6b6bcc682d39918b3bd97471d9543..63aa50d7ce1efa16f9c29faa97f734f8c5db5d34 100644 (file)
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        }
        smpboot_park_threads(cpu);
 
+       /*
+        * By now we've cleared cpu_active_mask; wait for all preempt-disabled
+        * and RCU users of this state to go away so that all new such users
+        * will observe it.
+        *
+        * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+        * not imply sync_sched(), so explicitly call both.
+        */
+#ifdef CONFIG_PREEMPT
+       synchronize_sched();
+#endif
+       synchronize_rcu();
+
+       /*
+        * So now all preempt/rcu users must observe !cpu_active().
+        */
+
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
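
The readers this synchronization protects are preempt-disabled (or RCU
read-side) sections that checked cpu_active(); once the
synchronize_sched()/synchronize_rcu() pair completes, no such section
that saw the CPU active can still be running. A sketch of such a reader
(the queue_work_on() user is illustrative):

	static void reader_sketch(int cpu, struct workqueue_struct *wq,
				  struct work_struct *work)
	{
		preempt_disable();
		/*
		 * If we observe cpu_active(cpu) here, _cpu_down() cannot
		 * get past its synchronize calls until preempt_enable().
		 */
		if (cpu_active(cpu))
			queue_work_on(cpu, wq, work);
		preempt_enable();
	}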
index e695c0a0bcb5c84e79ee93e80de57e4fbab69b33..988573a9a387665dddc402af2439996aa02fd1c0 100644 (file)
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
        rcu_idle_enter();
        trace_cpu_idle_rcuidle(0, smp_processor_id());
        local_irq_enable();
-       while (!need_resched())
+       while (!tif_need_resched())
                cpu_relax();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
                        if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
                                cpu_idle_poll();
                        } else {
-                               current_clr_polling();
-                               if (!need_resched()) {
+                               if (!current_clr_polling_and_test()) {
                                        stop_critical_timings();
                                        rcu_idle_enter();
                                        arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
                                } else {
                                        local_irq_enable();
                                }
-                               current_set_polling();
+                               __current_set_polling();
                        }
                        arch_cpu_idle_exit();
+                       /*
+                        * We need to test and propagate the TIF_NEED_RESCHED
+                        * bit here because we might not have sent the
+                        * reschedule IPI to idle tasks.
+                        */
+                       if (tif_need_resched())
+                               set_preempt_need_resched();
                }
                tick_nohz_idle_exit();
                schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
         */
        boot_init_stack_canary();
 #endif
-       current_set_polling();
+       __current_set_polling();
        arch_cpu_idle_prepare();
        cpu_idle_loop();
 }
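The __current_set_polling()/current_clr_polling_and_test() pairing and the final tif_need_resched() test close a wakeup race: a waker that observes TIF_POLLING_NRFLAG may legitimately skip the reschedule IPI, so the idle loop has to fold TIF_NEED_RESCHED itself. A condensed sketch of the contract, using only names from the hunks above:

	__current_set_polling();		/* wakers may now skip the IPI */
	if (!current_clr_polling_and_test())	/* atomically stop polling, re-test */
		arch_cpu_idle();		/* safe to sleep: no wakeup was missed */
	/* ... */
	if (tif_need_resched())			/* may have been set with no IPI */
		set_preempt_need_resched();	/* propagate into the preempt machinery */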
index 0506d447aed270867214c9da188b4ac8a22ebecf..7d2f35e5df2f91c1ad0ec7a37741582929c22fec 100644 (file)
@@ -575,8 +575,12 @@ return_normal:
                raw_spin_lock(&dbg_slave_lock);
 
 #ifdef CONFIG_SMP
+       /* If send_ready set, slaves are already waiting */
+       if (ks->send_ready)
+               atomic_set(ks->send_ready, 1);
+
        /* Signal the other CPUs to enter kgdb_wait() */
-       if ((!kgdb_single_step) && kgdb_do_roundup)
+       else if ((!kgdb_single_step) && kgdb_do_roundup)
                kgdb_roundup_cpus(flags);
 #endif
 
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
        if (arch_kgdb_ops.enable_nmi)
                arch_kgdb_ops.enable_nmi(0);
 
+       memset(ks, 0, sizeof(struct kgdb_state));
        ks->cpu                 = raw_smp_processor_id();
        ks->ex_vector           = evector;
        ks->signo               = signo;
        ks->err_code            = ecode;
-       ks->kgdb_usethreadid    = 0;
        ks->linux_regs          = regs;
 
        if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
        return 1;
 }
 
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+       if (!kgdb_io_ready(0) || !send_ready)
+               return 1;
+
+       if (kgdb_info[cpu].enter_kgdb == 0) {
+               struct kgdb_state kgdb_var;
+               struct kgdb_state *ks = &kgdb_var;
+
+               memset(ks, 0, sizeof(struct kgdb_state));
+               ks->cpu                 = cpu;
+               ks->ex_vector           = trapnr;
+               ks->signo               = SIGTRAP;
+               ks->err_code            = KGDB_KDB_REASON_SYSTEM_NMI;
+               ks->linux_regs          = regs;
+               ks->send_ready          = send_ready;
+               kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+               return 0;
+       }
+#endif
+       return 1;
+}
+
 static void kgdb_console_write(struct console *co, const char *s,
    unsigned count)
 {
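kgdb_nmicallin() gives an architecture's system-NMI handler a way to pull one CPU in as debug master while the remaining CPUs, already spinning in their own NMI handlers, are released through the send_ready atomic rather than a fresh kgdb_roundup_cpus(). A hypothetical caller sketch; every name below except kgdb_nmicallin() and kgdb_nmicallback() is invented for illustration:

	static atomic_t slaves_go;

	static void arch_system_nmi(struct pt_regs *regs)	/* hypothetical */
	{
		int cpu = raw_smp_processor_id();

		if (elected_master(cpu)) {	/* arch-specific election */
			atomic_set(&slaves_go, 0);
			kgdb_nmicallin(cpu, arch_nmi_vector(), regs, &slaves_go);
		} else {
			while (!atomic_read(&slaves_go))  /* set to 1 by the master */
				cpu_relax();
			kgdb_nmicallback(cpu, regs);
		}
	}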
index 2235967e78b0c8c6c181295269b2d7cc61e40232..572aa4f5677cb7c6034a72e040a6c0b9bd4fcb34 100644 (file)
@@ -26,6 +26,7 @@ struct kgdb_state {
        unsigned long           threadid;
        long                    kgdb_usethreadid;
        struct pt_regs          *linux_regs;
+       atomic_t                *send_ready;
 };
 
 /* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
 extern int kdb_parse(const char *cmdstr);
 extern int kdb_common_init_state(struct kgdb_state *ks);
 extern int kdb_common_deinit_state(void);
+#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
 #else /* ! CONFIG_KGDB_KDB */
 static inline int kdb_stub(struct kgdb_state *ks)
 {
        return DBG_PASS_EVENT;
 }
+#define KGDB_KDB_REASON_SYSTEM_NMI 0
 #endif /* CONFIG_KGDB_KDB */
 
 #endif /* _DEBUG_CORE_H_ */
index 328d18ef31e4b4f02e7cde7aff71504ad8a0fb5e..8859ca34dcfe0a58dbd530b8668d8d88760a3eb4 100644 (file)
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
        if (atomic_read(&kgdb_setting_breakpoint))
                reason = KDB_REASON_KEYBOARD;
 
-       if (in_nmi())
+       if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+               reason = KDB_REASON_SYSTEM_NMI;
+
+       else if (in_nmi())
                reason = KDB_REASON_NMI;
 
        for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
index 00eb8f7fbf41c49e1deb4b09e6858858064ae9e9..0b097c8a1e50e0b0a0a299f6c1df240f15468b3f 100644 (file)
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
                           instruction_pointer(regs));
                kdb_dumpregs(regs);
                break;
+       case KDB_REASON_SYSTEM_NMI:
+               kdb_printf("due to System NonMaskable Interrupt\n");
+               break;
        case KDB_REASON_NMI:
                kdb_printf("due to NonMaskable Interrupt @ "
                           kdb_machreg_fmt "\n",
index d49a9d29334cc4d67c24bad9814221a0371a6350..5bd7fe43a7a22197379991a589fc4185c50a2e0c 100644 (file)
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
@@ -1201,6 +1201,9 @@ static void perf_event__header_size(struct perf_event *event)
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                size += sizeof(data->data_src.val);
 
+       if (sample_type & PERF_SAMPLE_TRANSACTION)
+               size += sizeof(data->txn);
+
        event->header_size = size;
 }
 
@@ -4572,6 +4575,9 @@ void perf_output_sample(struct perf_output_handle *handle,
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);
 
+       if (sample_type & PERF_SAMPLE_TRANSACTION)
+               perf_output_put(handle, data->txn);
+
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
 
@@ -6767,6 +6773,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
        if (ret)
                return -EFAULT;
 
+       /* disabled for now */
+       if (attr->mmap2)
+               return -EINVAL;
+
        if (attr->__reserved_1)
                return -EINVAL;
 
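For the PERF_SAMPLE_TRANSACTION plumbing above, a hedged sketch of a user requesting the new sample field through perf_event_attr (the event choice is illustrative; note the same file rejects attr.mmap2 for now):

	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,	/* illustrative event */
		.size		= sizeof(attr),
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TRANSACTION,
	};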
index 086fe73ad6bde5b1ac1703f29f3d9f154878968b..c93be06dee874fae8366888840346ae58c012416 100644 (file)
@@ -816,9 +816,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
-#endif
-#ifdef CONFIG_NUMA_BALANCING
-       mm->first_nid = NUMA_PTE_SCAN_INIT;
 #endif
        if (!mm_init(mm, tsk))
                goto fail_nomem;
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 
        /* Perform scheduler related setup. Assign this task to a CPU. */
-       sched_fork(p);
+       sched_fork(clone_flags, p);
 
        retval = perf_event_init_task(p);
        if (retval)
index 3e97fb126e6b9255a852887923f39230cd52c513..042252383fd2bbff549aa2380cec3a9662adaa68 100644 (file)
@@ -20,7 +20,7 @@
 /*
  * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
 
 /*
  * Limit number of tasks checked in a batch.
index e16c45b9ee77054f80becd37a51da00776f796c1..4e8e14c34e428d6a75580ec862c4975b0425dfc0 100644 (file)
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
        printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
               !rcu_lockdep_current_cpu_online()
                        ? "RCU used illegally from offline CPU!\n"
-                       : rcu_is_cpu_idle()
+                       : !rcu_is_watching()
                                ? "RCU used illegally from idle CPU!\n"
                                : "",
               rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
         * So complain bitterly if someone does call rcu_read_lock(),
         * rcu_read_lock_bh() and so on from extended quiescent states.
         */
-       if (rcu_is_cpu_idle())
+       if (!rcu_is_watching())
                printk("RCU used illegally from extended quiescent state!\n");
 
        lockdep_print_held_locks(curr);
index b2c71c5873e441ae1d9a89d8caa7517eb5bcf976..09220656d888b64f9415368218a3323147c4967f 100644 (file)
@@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
        seq_time(m, lt->min);
        seq_time(m, lt->max);
        seq_time(m, lt->total);
+               seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
 }
 
 static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
@@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
        }
        if (i) {
                seq_puts(m, "\n");
-               seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
+               seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
                seq_puts(m, "\n");
        }
 }
 
 static void seq_header(struct seq_file *m)
 {
-       seq_printf(m, "lock_stat version 0.3\n");
+       seq_puts(m, "lock_stat version 0.4\n");
 
        if (unlikely(!debug_locks))
                seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
 
-       seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
-       seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
+       seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+       seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
                        "%14s %14s\n",
                        "class name",
                        "con-bounces",
@@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)
                        "waittime-min",
                        "waittime-max",
                        "waittime-total",
+                       "waittime-avg",
                        "acq-bounces",
                        "acquisitions",
                        "holdtime-min",
                        "holdtime-max",
-                       "holdtime-total");
-       seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
+                       "holdtime-total",
+                       "holdtime-avg");
+       seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
        seq_printf(m, "\n");
 }
 
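The two new columns are plain averages over existing counters: waittime-avg = waittime-total / contentions, holdtime-avg = holdtime-total / acquisitions, with the seq_line() rulers widened from 10 to 12 column slots to match. The division uses div_s64(); do_div() would divide in place and evaluate to the remainder, clobbering lt->total:

	/* average without clobbering lt->total (s64-safe division) */
	s64 avg = lt->nr ? div_s64(lt->total, lt->nr) : 0;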
index 6d647aedffea494dec6b11d20fa222a05fd49f7f..d24105b1b794e635e93509c04e0ac6edb85cc7f5 100644 (file)
@@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
 static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                    struct lockdep_map *nest_lock, unsigned long ip,
-                   struct ww_acquire_ctx *ww_ctx)
+                   struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
 {
        struct task_struct *task = current;
        struct mutex_waiter waiter;
@@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                struct task_struct *owner;
                struct mspin_node  node;
 
-               if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
+               if (use_ww_ctx && ww_ctx->acquired > 0) {
                        struct ww_mutex *ww;
 
                        ww = container_of(lock, struct ww_mutex, base);
@@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                if ((atomic_read(&lock->count) == 1) &&
                    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
                        lock_acquired(&lock->dep_map, ip);
-                       if (!__builtin_constant_p(ww_ctx == NULL)) {
+                       if (use_ww_ctx) {
                                struct ww_mutex *ww;
                                ww = container_of(lock, struct ww_mutex, base);
 
@@ -551,7 +551,7 @@ slowpath:
                        goto err;
                }
 
-               if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
+               if (use_ww_ctx && ww_ctx->acquired > 0) {
                        ret = __mutex_lock_check_stamp(lock, ww_ctx);
                        if (ret)
                                goto err;
@@ -575,7 +575,7 @@ skip_wait:
        lock_acquired(&lock->dep_map, ip);
        mutex_set_owner(lock);
 
-       if (!__builtin_constant_p(ww_ctx == NULL)) {
+       if (use_ww_ctx) {
                struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
                struct mutex_waiter *cur;
 
@@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 {
        might_sleep();
        __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
-                           subclass, NULL, _RET_IP_, NULL);
+                           subclass, NULL, _RET_IP_, NULL, 0);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
 {
        might_sleep();
        __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
-                           0, nest, _RET_IP_, NULL);
+                           0, nest, _RET_IP_, NULL, 0);
 }
 
 EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
 {
        might_sleep();
        return __mutex_lock_common(lock, TASK_KILLABLE,
-                                  subclass, NULL, _RET_IP_, NULL);
+                                  subclass, NULL, _RET_IP_, NULL, 0);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
 
@@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 {
        might_sleep();
        return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
-                                  subclass, NULL, _RET_IP_, NULL);
+                                  subclass, NULL, _RET_IP_, NULL, 0);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
        might_sleep();
        ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
-                                  0, &ctx->dep_map, _RET_IP_, ctx);
+                                  0, &ctx->dep_map, _RET_IP_, ctx, 1);
        if (!ret && ctx->acquired > 1)
                return ww_mutex_deadlock_injection(lock, ctx);
 
@@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 
        might_sleep();
        ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
-                                 0, &ctx->dep_map, _RET_IP_, ctx);
+                                 0, &ctx->dep_map, _RET_IP_, ctx, 1);
 
        if (!ret && ctx->acquired > 1)
                return ww_mutex_deadlock_injection(lock, ctx);
@@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
        struct mutex *lock = container_of(lock_count, struct mutex, count);
 
        __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
-                           NULL, _RET_IP_, NULL);
+                           NULL, _RET_IP_, NULL, 0);
 }
 
 static noinline int __sched
 __mutex_lock_killable_slowpath(struct mutex *lock)
 {
        return __mutex_lock_common(lock, TASK_KILLABLE, 0,
-                                  NULL, _RET_IP_, NULL);
+                                  NULL, _RET_IP_, NULL, 0);
 }
 
 static noinline int __sched
 __mutex_lock_interruptible_slowpath(struct mutex *lock)
 {
        return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
-                                  NULL, _RET_IP_, NULL);
+                                  NULL, _RET_IP_, NULL, 0);
 }
 
 static noinline int __sched
 __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
        return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
-                                  NULL, _RET_IP_, ctx);
+                                  NULL, _RET_IP_, ctx, 1);
 }
 
 static noinline int __sched
@@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
                                            struct ww_acquire_ctx *ctx)
 {
        return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
-                                  NULL, _RET_IP_, ctx);
+                                  NULL, _RET_IP_, ctx, 1);
 }
 
 #endif
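Replacing the __builtin_constant_p(ww_ctx == NULL) trick with an explicit const bool works because __mutex_lock_common() is __always_inline: every call site passes a literal 0 or 1, so the compiler still folds the ww_mutex branches away, without depending on how __builtin_constant_p() behaves across compiler versions. A toy illustration (names hypothetical):

	static __always_inline int pick(const bool use_ctx)
	{
		if (use_ctx)		/* a literal at every inlined call site */
			return 1;
		return 0;
	}

	int never_ctx(void)  { return pick(0); }	/* folds to: return 0 */
	int always_ctx(void) { return pick(1); }	/* folds to: return 1 */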
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644 (file)
index 0000000..01e9ec3
--- /dev/null
@@ -0,0 +1,6 @@
+obj-y += update.o srcu.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
+obj-$(CONFIG_TREE_RCU) += tree.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
+obj-$(CONFIG_TINY_RCU) += tiny.o
similarity index 94%
rename from kernel/rcu.h
rename to kernel/rcu/rcu.h
index 77131966c4adc5f1fe41424a75710d42a1cd9850..7859a0a3951eae09e50a11df267213af0f6bace4 100644 (file)
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
 
+/*
+ * Strings used in tracepoints need to be exported via the
+ * tracing system such that tools like perf and trace-cmd can
+ * translate the string address pointers to actual text.
+ */
+#define TPS(x)  tracepoint_string(x)
+
 #endif /* __LINUX_RCU_H */
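A representative call site, taken from the tree.c hunks later in this diff, shows the intended use; the string literal is registered with the tracing system so the address logged by the tracepoint can be resolved back to text:

	trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));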
similarity index 100%
rename from kernel/srcu.c
rename to kernel/rcu/srcu.c
similarity index 90%
rename from kernel/rcutiny.c
rename to kernel/rcu/tiny.c
index 9ed6075dc562443b775cdec4efb56ff58b51bcef..0c9a934cfec19d0859b57c2ca9eca86830401ab2 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
+#include <linux/ftrace_event.h>
 
 #ifdef CONFIG_RCU_TRACE
 #include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
 
 #include "rcu.h"
 
-/* Forward declarations for rcutiny_plugin.h. */
+/* Forward declarations for tiny_plugin.h. */
 struct rcu_ctrlblk;
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
 static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
 
 static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 
-#include "rcutiny_plugin.h"
+#include "tiny_plugin.h"
 
 /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
 static void rcu_idle_enter_common(long long newval)
 {
        if (newval) {
-               RCU_TRACE(trace_rcu_dyntick("--=",
+               RCU_TRACE(trace_rcu_dyntick(TPS("--="),
                                            rcu_dynticks_nesting, newval));
                rcu_dynticks_nesting = newval;
                return;
        }
-       RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
+       RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
+                                   rcu_dynticks_nesting, newval));
        if (!is_idle_task(current)) {
-               struct task_struct *idle = idle_task(smp_processor_id());
+               struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
 
-               RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
+               RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
                                            rcu_dynticks_nesting, newval));
                ftrace_dump(DUMP_ALL);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
 static void rcu_idle_exit_common(long long oldval)
 {
        if (oldval) {
-               RCU_TRACE(trace_rcu_dyntick("++=",
+               RCU_TRACE(trace_rcu_dyntick(TPS("++="),
                                            oldval, rcu_dynticks_nesting));
                return;
        }
-       RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
+       RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
        if (!is_idle_task(current)) {
-               struct task_struct *idle = idle_task(smp_processor_id());
+               struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
 
-               RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
+               RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
                          oldval, rcu_dynticks_nesting));
                ftrace_dump(DUMP_ALL);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
 }
 EXPORT_SYMBOL_GPL(rcu_irq_enter);
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
 
 /*
  * Test whether RCU thinks that the current CPU is idle.
  */
-int rcu_is_cpu_idle(void)
+bool __rcu_is_watching(void)
 {
-       return !rcu_dynticks_nesting;
+       return rcu_dynticks_nesting;
 }
-EXPORT_SYMBOL(rcu_is_cpu_idle);
+EXPORT_SYMBOL(__rcu_is_watching);
 
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
 
 /*
  * Test whether the current CPU was interrupted from idle.  Nested
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
        if (&rcp->rcucblist == rcp->donetail) {
                RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
                RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
-                                             ACCESS_ONCE(rcp->rcucblist),
+                                             !!ACCESS_ONCE(rcp->rcucblist),
                                              need_resched(),
                                              is_idle_task(current),
                                              false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
                RCU_TRACE(cb_count++);
        }
        RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
-       RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
+       RCU_TRACE(trace_rcu_batch_end(rcp->name,
+                                     cb_count, 0, need_resched(),
                                      is_idle_task(current),
                                      false));
 }
similarity index 99%
rename from kernel/rcutorture.c
rename to kernel/rcu/torture.c
index be63101c6175e5f10a121942a438694537094bd0..3929cd451511ef18b03c33d33a34d9b263151153 100644 (file)
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
 
+MODULE_ALIAS("rcutorture");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcutorture."
+
 static int fqs_duration;
 module_param(fqs_duration, int, 0444);
 MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
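The explicit prefix keeps user-visible knob names stable across the file move and rename: built-in module_param() names stay under "rcutorture." regardless of the object file's new location, so an existing boot line keeps working, e.g. (value illustrative):

	rcutorture.fqs_duration=100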
similarity index 94%
rename from kernel/rcutree.c
rename to kernel/rcu/tree.c
index 32618b3fe4e6aa375b3cd3bde1acd12bdd4237b3..4c06ddfea7cd2beffbf428a952e1efada1d1fa57 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/export.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
+#include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/ftrace_event.h>
 #include <linux/suspend.h>
 
-#include "rcutree.h"
+#include "tree.h"
 #include <trace/events/rcu.h>
 
 #include "rcu.h"
 
-/*
- * Strings used in tracepoints need to be exported via the
- * tracing system such that tools like perf and trace-cmd can
- * translate the string address pointers to actual text.
- */
-#define TPS(x) tracepoint_string(x)
+MODULE_ALIAS("rcutree");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcutree."
 
 /* Data structures. */
 
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
-DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
        .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
        .dynticks = ATOMIC_INIT(1),
 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
 {
        trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
        if (!user && !is_idle_task(current)) {
-               struct task_struct *idle = idle_task(smp_processor_id());
+               struct task_struct *idle __maybe_unused =
+                       idle_task(smp_processor_id());
 
                trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
                ftrace_dump(DUMP_ORIG);
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)
        long long oldval;
        struct rcu_dynticks *rdtp;
 
-       rdtp = &__get_cpu_var(rcu_dynticks);
+       rdtp = this_cpu_ptr(&rcu_dynticks);
        oldval = rdtp->dynticks_nesting;
        WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
        if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -435,7 +436,7 @@ void rcu_idle_enter(void)
 
        local_irq_save(flags);
        rcu_eqs_enter(false);
-       rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
+       rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -478,7 +479,7 @@ void rcu_irq_exit(void)
        struct rcu_dynticks *rdtp;
 
        local_irq_save(flags);
-       rdtp = &__get_cpu_var(rcu_dynticks);
+       rdtp = this_cpu_ptr(&rcu_dynticks);
        oldval = rdtp->dynticks_nesting;
        rdtp->dynticks_nesting--;
        WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
        rcu_cleanup_after_idle(smp_processor_id());
        trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
        if (!user && !is_idle_task(current)) {
-               struct task_struct *idle = idle_task(smp_processor_id());
+               struct task_struct *idle __maybe_unused =
+                       idle_task(smp_processor_id());
 
                trace_rcu_dyntick(TPS("Error on exit: not idle task"),
                                  oldval, rdtp->dynticks_nesting);
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)
        struct rcu_dynticks *rdtp;
        long long oldval;
 
-       rdtp = &__get_cpu_var(rcu_dynticks);
+       rdtp = this_cpu_ptr(&rcu_dynticks);
        oldval = rdtp->dynticks_nesting;
        WARN_ON_ONCE(oldval < 0);
        if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -555,7 +557,7 @@ void rcu_idle_exit(void)
 
        local_irq_save(flags);
        rcu_eqs_exit(false);
-       rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
+       rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -599,7 +601,7 @@ void rcu_irq_enter(void)
        long long oldval;
 
        local_irq_save(flags);
-       rdtp = &__get_cpu_var(rcu_dynticks);
+       rdtp = this_cpu_ptr(&rcu_dynticks);
        oldval = rdtp->dynticks_nesting;
        rdtp->dynticks_nesting++;
        WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -620,7 +622,7 @@ void rcu_irq_enter(void)
  */
 void rcu_nmi_enter(void)
 {
-       struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+       struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
        if (rdtp->dynticks_nmi_nesting == 0 &&
            (atomic_read(&rdtp->dynticks) & 0x1))
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void)
  */
 void rcu_nmi_exit(void)
 {
-       struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+       struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
        if (rdtp->dynticks_nmi_nesting == 0 ||
            --rdtp->dynticks_nmi_nesting != 0)
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void)
 }
 
 /**
- * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
+ * __rcu_is_watching - are RCU read-side critical sections safe?
+ *
+ * Return true if RCU is watching the running CPU, which means that
+ * this CPU can safely enter RCU read-side critical sections.  Unlike
+ * rcu_is_watching(), the caller of __rcu_is_watching() must have at
+ * least disabled preemption.
+ */
+bool __rcu_is_watching(void)
+{
+       return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+}
+
+/**
+ * rcu_is_watching - see if RCU thinks that the current CPU is not idle
  *
  * Return true if the current CPU is not idle from RCU's perspective,
  * i.e., if it is safe to enter an RCU read-side critical section.
  */
-int rcu_is_cpu_idle(void)
+bool rcu_is_watching(void)
 {
        int ret;
 
        preempt_disable();
-       ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
+       ret = __rcu_is_watching();
        preempt_enable();
        return ret;
 }
-EXPORT_SYMBOL(rcu_is_cpu_idle);
+EXPORT_SYMBOL_GPL(rcu_is_watching);
 
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 
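The polarity flip is the point of the rename: rcu_is_cpu_idle() answered "is RCU ignoring this CPU?", while rcu_is_watching() answers "may I use RCU here?". A hedged sketch of the debug-style check the new names support; holding preemption off is what licenses the double-underscore variant:

	preempt_disable();
	WARN_ON_ONCE(!__rcu_is_watching());	/* RCU must not consider us idle */
	rcu_read_lock();
	/* ... read-side critical section is safe here ... */
	rcu_read_unlock();
	preempt_enable();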
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
        if (in_nmi())
                return 1;
        preempt_disable();
-       rdp = &__get_cpu_var(rcu_sched_data);
+       rdp = this_cpu_ptr(&rcu_sched_data);
        rnp = rdp->mynode;
        ret = (rdp->grpmask & rnp->qsmaskinit) ||
              !rcu_scheduler_fully_active;
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
  */
 static int rcu_is_cpu_rrupt_from_idle(void)
 {
-       return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
+       return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
 }
 
 /*
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
-       rsp->gp_start = jiffies;
-       rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+       unsigned long j = ACCESS_ONCE(jiffies);
+
+       rsp->gp_start = j;
+       smp_wmb(); /* Record start time before stall time. */
+       rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
 }
 
 /*
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        force_quiescent_state(rsp);  /* Kick them all. */
 }
 
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
 static void print_cpu_stall(struct rcu_state *rsp)
 {
        int cpu;
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
                                     3 * rcu_jiffies_till_stall_check() + 3;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
-       set_need_resched();  /* kick ourselves to get things going. */
+       /*
+        * Attempt to revive the RCU machinery by forcing a context switch.
+        *
+        * A context switch would normally allow the RCU state machine to make
+        * progress; it could be that we are stuck in kernel space without
+        * context switches for an entirely unreasonable amount of time.
+        */
+       resched_cpu(smp_processor_id());
 }
 
 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
+       unsigned long completed;
+       unsigned long gpnum;
+       unsigned long gps;
        unsigned long j;
        unsigned long js;
        struct rcu_node *rnp;
 
-       if (rcu_cpu_stall_suppress)
+       if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
                return;
        j = ACCESS_ONCE(jiffies);
+
+       /*
+        * Lots of memory barriers to reject false positives.
+        *
+        * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
+        * then rsp->gp_start, and finally rsp->completed.  These values
+        * are updated in the opposite order with memory barriers (or
+        * equivalent) during grace-period initialization and cleanup.
+        * Now, a false positive can occur if we get an new value of
+        * Now, a false positive can occur if we get a new value of
+        * rsp->gp_start and an old value of rsp->jiffies_stall.  But given
+        * grace period ends and another starts between these two fetches.
+        * Detect this by comparing rsp->completed with the previous fetch
+        * from rsp->gpnum.
+        *
+        * Given this check, comparisons of jiffies, rsp->jiffies_stall,
+        * and rsp->gp_start suffice to forestall false positives.
+        */
+       gpnum = ACCESS_ONCE(rsp->gpnum);
+       smp_rmb(); /* Pick up ->gpnum first... */
        js = ACCESS_ONCE(rsp->jiffies_stall);
+       smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+       gps = ACCESS_ONCE(rsp->gp_start);
+       smp_rmb(); /* ...and finally ->gp_start before ->completed. */
+       completed = ACCESS_ONCE(rsp->completed);
+       if (ULONG_CMP_GE(completed, gpnum) ||
+           ULONG_CMP_LT(j, js) ||
+           ULONG_CMP_GE(gps, js))
+               return; /* No stall or GP completed since entering function. */
        rnp = rdp->mynode;
        if (rcu_gp_in_progress(rsp) &&
-           (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
+           (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
 
                /* We haven't checked in, so go dump stack. */
                print_cpu_stall(rsp);
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
- * Initialize a new grace period.
+ * Initialize a new grace period.  Return 0 if no grace period required.
  */
 static int rcu_gp_init(struct rcu_state *rsp)
 {
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
        rcu_bind_gp_kthread();
        raw_spin_lock_irq(&rnp->lock);
+       if (rsp->gp_flags == 0) {
+               /* Spurious wakeup, tell caller to go back to sleep.  */
+               raw_spin_unlock_irq(&rnp->lock);
+               return 0;
+       }
        rsp->gp_flags = 0; /* Clear all flags: New grace period. */
 
-       if (rcu_gp_in_progress(rsp)) {
-               /* Grace period already in progress, don't start another.  */
+       if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
+               /*
+                * Grace period already in progress, don't start another.
+                * Not supposed to be able to happen.
+                */
                raw_spin_unlock_irq(&rnp->lock);
                return 0;
        }
 
        /* Advance to a new grace period and initialize state. */
+       record_gp_stall_check_time(rsp);
+       smp_wmb(); /* Record GP times before starting GP. */
        rsp->gpnum++;
        trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
-       record_gp_stall_check_time(rsp);
        raw_spin_unlock_irq(&rnp->lock);
 
        /* Exclude any concurrent CPU-hotplug operations. */
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 /*
  * Do one round of quiescent-state forcing.
  */
-int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 {
        int fqs_state = fqs_state_in;
        bool isidle = false;
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
        rsp->fqs_state = RCU_GP_IDLE;
        rdp = this_cpu_ptr(rsp->rda);
        rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */
-       if (cpu_needs_another_gp(rsp, rdp))
-               rsp->gp_flags = 1;
+       if (cpu_needs_another_gp(rsp, rdp)) {
+               rsp->gp_flags = RCU_GP_FLAG_INIT;
+               trace_rcu_grace_period(rsp->name,
+                                      ACCESS_ONCE(rsp->gpnum),
+                                      TPS("newreq"));
+       }
        raw_spin_unlock_irq(&rnp->lock);
 }
 
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 static int __noreturn rcu_gp_kthread(void *arg)
 {
        int fqs_state;
+       int gf;
        unsigned long j;
        int ret;
        struct rcu_state *rsp = arg;
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
 
                /* Handle grace-period start. */
                for (;;) {
+                       trace_rcu_grace_period(rsp->name,
+                                              ACCESS_ONCE(rsp->gpnum),
+                                              TPS("reqwait"));
                        wait_event_interruptible(rsp->gp_wq,
-                                                rsp->gp_flags &
+                                                ACCESS_ONCE(rsp->gp_flags) &
                                                 RCU_GP_FLAG_INIT);
-                       if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
-                           rcu_gp_init(rsp))
+                       if (rcu_gp_init(rsp))
                                break;
                        cond_resched();
                        flush_signals(current);
+                       trace_rcu_grace_period(rsp->name,
+                                              ACCESS_ONCE(rsp->gpnum),
+                                              TPS("reqwaitsig"));
                }
 
                /* Handle quiescent-state forcing. */
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
                        j = HZ;
                        jiffies_till_first_fqs = HZ;
                }
+               ret = 0;
                for (;;) {
-                       rsp->jiffies_force_qs = jiffies + j;
+                       if (!ret)
+                               rsp->jiffies_force_qs = jiffies + j;
+                       trace_rcu_grace_period(rsp->name,
+                                              ACCESS_ONCE(rsp->gpnum),
+                                              TPS("fqswait"));
                        ret = wait_event_interruptible_timeout(rsp->gp_wq,
-                                       (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
+                                       ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+                                        RCU_GP_FLAG_FQS) ||
                                        (!ACCESS_ONCE(rnp->qsmask) &&
                                         !rcu_preempt_blocked_readers_cgp(rnp)),
                                        j);
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
                            !rcu_preempt_blocked_readers_cgp(rnp))
                                break;
                        /* If time for quiescent-state forcing, do it. */
-                       if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
+                       if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
+                           (gf & RCU_GP_FLAG_FQS)) {
+                               trace_rcu_grace_period(rsp->name,
+                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      TPS("fqsstart"));
                                fqs_state = rcu_gp_fqs(rsp, fqs_state);
+                               trace_rcu_grace_period(rsp->name,
+                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      TPS("fqsend"));
                                cond_resched();
                        } else {
                                /* Deal with stray signal. */
                                cond_resched();
                                flush_signals(current);
+                               trace_rcu_grace_period(rsp->name,
+                                                      ACCESS_ONCE(rsp->gpnum),
+                                                      TPS("fqswaitsig"));
                        }
                        j = jiffies_till_next_fqs;
                        if (j > HZ) {
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
                return;
        }
        rsp->gp_flags = RCU_GP_FLAG_INIT;
+       trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+                              TPS("newreq"));
 
        /*
         * We can't do wakeups while holding the rnp->lock, as that
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
         * If called from an extended quiescent state, invoke the RCU
         * core in order to force a re-evaluation of RCU's idleness.
         */
-       if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+       if (!rcu_is_watching() && cpu_online(smp_processor_id()))
                invoke_rcu_core();
 
        /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
 
        for_each_rcu_flavor(rsp) {
                rdp = per_cpu_ptr(rsp->rda, cpu);
-               if (rdp->qlen != rdp->qlen_lazy)
+               if (!rdp->nxtlist)
+                       continue;
+               hc = true;
+               if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
                        al = false;
-               if (rdp->nxtlist)
-                       hc = true;
+                       break;
+               }
        }
        if (all_lazy)
                *all_lazy = al;
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 /*
  * Compute the rcu_node tree geometry from kernel parameters.  This cannot
- * replace the definitions in rcutree.h because those are needed to size
+ * replace the definitions in tree.h because those are needed to size
  * the ->node array in the rcu_state structure.
  */
 static void __init rcu_init_geometry(void)
@@ -3295,8 +3397,8 @@ void __init rcu_init(void)
 
        rcu_bootup_announce();
        rcu_init_geometry();
-       rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
+       rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        __rcu_init_preempt();
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
@@ -3311,4 +3413,4 @@ void __init rcu_init(void)
                rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
 }
 
-#include "rcutree_plugin.h"
+#include "tree_plugin.h"
similarity index 99%
rename from kernel/rcutree.h
rename to kernel/rcu/tree.h
index 5f97eab602cd831dec54baaf7bafd970920d0c60..52be957c9fe22360a7a307842795c09353b42789 100644 (file)
@@ -104,6 +104,8 @@ struct rcu_dynticks {
                                    /* idle-period nonlazy_posted snapshot. */
        unsigned long last_accelerate;
                                    /* Last jiffy CBs were accelerated. */
+       unsigned long last_advance_all;
+                                   /* Last jiffy CBs were all advanced. */
        int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
similarity index 97%
rename from kernel/rcutree_plugin.h
rename to kernel/rcu/tree_plugin.h
index 130c97b027f2e1f244cf2c23edfca913238e6dfa..3822ac0c4b2732dfd39982b74138a1960fa47a21 100644 (file)
@@ -28,7 +28,7 @@
 #include <linux/gfp.h>
 #include <linux/oom.h>
 #include <linux/smpboot.h>
-#include "time/tick-internal.h"
+#include "../time/tick-internal.h"
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
 #ifdef CONFIG_RCU_NOCB_CPU_ALL
        pr_info("\tOffload RCU callbacks from all CPUs\n");
-       cpumask_setall(rcu_nocb_mask);
+       cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
        if (have_rcu_nocb_mask) {
+               if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+                       pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+                       cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+                                   rcu_nocb_mask);
+               }
                cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
                pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
                if (rcu_nocb_poll)
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
 
 static void rcu_preempt_do_callbacks(void)
 {
-       rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
+       rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
 }
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1128,7 +1133,7 @@ void exit_rcu(void)
 
 #ifdef CONFIG_RCU_BOOST
 
-#include "rtmutex_common.h"
+#include "../rtmutex_common.h"
 
 #ifdef CONFIG_RCU_TRACE
 
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
  */
 static bool rcu_is_callbacks_kthread(void)
 {
-       return __get_cpu_var(rcu_cpu_kthread_task) == current;
+       return __this_cpu_read(rcu_cpu_kthread_task) == current;
 }
 
 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 
 static void rcu_kthread_do_work(void)
 {
-       rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
-       rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
+       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
        rcu_preempt_do_callbacks();
 }
 
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
 
 static int rcu_cpu_kthread_should_run(unsigned int cpu)
 {
-       return __get_cpu_var(rcu_cpu_has_work);
+       return __this_cpu_read(rcu_cpu_has_work);
 }
 
 /*
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
  */
 static void rcu_cpu_kthread(unsigned int cpu)
 {
-       unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
-       char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
+       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
        int spincnt;
 
        for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1630,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);
 extern int tick_nohz_enabled;
 
 /*
- * Try to advance callbacks for all flavors of RCU on the current CPU.
- * Afterwards, if there are any callbacks ready for immediate invocation,
- * return true.
+ * Try to advance callbacks for all flavors of RCU on the current CPU, but
+ * only if it has been a while since the last time we did so.  Afterwards,
+ * if there are any callbacks ready for immediate invocation, return true.
  */
 static bool rcu_try_advance_all_cbs(void)
 {
        bool cbs_ready = false;
        struct rcu_data *rdp;
+       struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
        struct rcu_node *rnp;
        struct rcu_state *rsp;
 
+       /* Exit early if we advanced recently. */
+       if (jiffies == rdtp->last_advance_all)
+               return 0;
+       rdtp->last_advance_all = jiffies;
+
        for_each_rcu_flavor(rsp) {
                rdp = this_cpu_ptr(rsp->rda);
                rnp = rdp->mynode;
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
         */
        if (rdtp->all_lazy &&
            rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+               rdtp->all_lazy = false;
+               rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
                invoke_rcu_core();
                return;
        }
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
  */
 static void rcu_cleanup_after_idle(int cpu)
 {
-       struct rcu_data *rdp;
-       struct rcu_state *rsp;
 
        if (rcu_is_nocb_cpu(cpu))
                return;
-       rcu_try_advance_all_cbs();
-       for_each_rcu_flavor(rsp) {
-               rdp = per_cpu_ptr(rsp->rda, cpu);
-               if (cpu_has_callbacks_ready_to_invoke(rdp))
-                       invoke_rcu_core();
-       }
+       if (rcu_try_advance_all_cbs())
+               invoke_rcu_core();
 }
 
 /*
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 
        /* If we are not being polled and there is a kthread, awaken it ... */
        t = ACCESS_ONCE(rdp->nocb_kthread);
-       if (rcu_nocb_poll | !t)
+       if (rcu_nocb_poll || !t) {
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                   TPS("WakeNotPoll"));
                return;
+       }
        len = atomic_long_read(&rdp->nocb_q_count);
        if (old_rhpp == &rdp->nocb_head) {
                wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
                rdp->qlen_last_fqs_check = 0;
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
        } else if (len > rdp->qlen_last_fqs_check + qhimark) {
                wake_up_process(t); /* ... or if many callbacks queued. */
                rdp->qlen_last_fqs_check = LONG_MAX / 2;
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
+       } else {
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
        }
        return;
 }
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
        if (__is_kfree_rcu_offset((unsigned long)rhp->func))
                trace_rcu_kfree_callback(rdp->rsp->name, rhp,
                                         (unsigned long)rhp->func,
-                                        rdp->qlen_lazy, rdp->qlen);
+                                        -atomic_long_read(&rdp->nocb_q_count_lazy),
+                                        -atomic_long_read(&rdp->nocb_q_count));
        else
                trace_rcu_callback(rdp->rsp->name, rhp,
-                                  rdp->qlen_lazy, rdp->qlen);
+                                  -atomic_long_read(&rdp->nocb_q_count_lazy),
+                                  -atomic_long_read(&rdp->nocb_q_count));
        return 1;
 }
 
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 static int rcu_nocb_kthread(void *arg)
 {
        int c, cl;
+       bool firsttime = 1;
        struct rcu_head *list;
        struct rcu_head *next;
        struct rcu_head **tail;
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
        /* Each pass through this loop invokes one batch of callbacks */
        for (;;) {
                /* If not polling, wait for next batch of callbacks. */
-               if (!rcu_nocb_poll)
+               if (!rcu_nocb_poll) {
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           TPS("Sleep"));
                        wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
+               } else if (firsttime) {
+                       firsttime = 0;
+                       trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                           TPS("Poll"));
+               }
                list = ACCESS_ONCE(rdp->nocb_head);
                if (!list) {
+                       if (!rcu_nocb_poll)
+                               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                                   TPS("WokeEmpty"));
                        schedule_timeout_interruptible(1);
                        flush_signals(current);
                        continue;
                }
+               firsttime = 1;
+               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                   TPS("WokeNonEmpty"));
 
                /*
                 * Extract queued callbacks, update counts, and wait
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
                        next = list->next;
                        /* Wait for enqueuing to complete, if needed. */
                        while (next == NULL && &list->next != tail) {
+                               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                                   TPS("WaitQueue"));
                                schedule_timeout_interruptible(1);
+                               trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+                                                   TPS("WokeQueue"));
                                next = list->next;
                        }
                        debug_rcu_head_unqueue(list);
similarity index 99%
rename from kernel/rcutree_trace.c
rename to kernel/rcu/tree_trace.c
index cf6c174129321b71b5e9cffb60bf7c6c608f4c3d..3596797b7e462b2069f8ef21f1b669d777667689 100644 (file)
@@ -44,7 +44,7 @@
 #include <linux/seq_file.h>
 
 #define RCU_TREE_NONCORE
-#include "rcutree.h"
+#include "tree.h"
 
 static int r_open(struct inode *inode, struct file *file,
                                        const struct seq_operations *op)
similarity index 97%
rename from kernel/rcupdate.c
rename to kernel/rcu/update.c
index b02a339836b43d59dbd273efdb77d423b84ea90b..6cb3dff89e2b73d214c9fd67d70a2abd43b64f17 100644 (file)
 
 #include "rcu.h"
 
+MODULE_ALIAS("rcupdate");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcupdate."
+
 module_param(rcu_expedited, int, 0);
 
 #ifdef CONFIG_PREEMPT_RCU
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
 {
        if (!debug_lockdep_rcu_enabled())
                return 1;
-       if (rcu_is_cpu_idle())
+       if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 #endif
 
 int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
 
 module_param(rcu_cpu_stall_suppress, int, 0644);
 module_param(rcu_cpu_stall_timeout, int, 0644);
index 5ac63c9a995a3570e0ad73a20b28c23cb972a963..c06b8d345faef1bd3b796e309190250d75820fdb 100644 (file)
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-#ifdef CONFIG_SMP
 void resched_task(struct task_struct *p)
 {
        int cpu;
 
-       assert_raw_spin_locked(&task_rq(p)->lock);
+       lockdep_assert_held(&task_rq(p)->lock);
 
        if (test_tsk_need_resched(p))
                return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
        set_tsk_need_resched(p);
 
        cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
+       if (cpu == smp_processor_id()) {
+               set_preempt_need_resched();
                return;
+       }
 
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+#ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
        }
 }
 
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
-       assert_raw_spin_locked(&task_rq(p)->lock);
-       set_tsk_need_resched(p);
-}
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
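With resched_task() now built unconditionally (the !SMP copy below is deleted), the local-CPU case folds the flag immediately instead of waiting for a self-IPI. A condensed sketch of the resulting flow; the remote tail mirrors the pre-existing code:

	set_tsk_need_resched(p);			/* mark the task */
	if (task_cpu(p) == smp_processor_id()) {
		set_preempt_need_resched();		/* local: fold now, no IPI */
		return;
	}
	smp_mb();					/* NEED_RESCHED before polling test */
	if (!tsk_is_polling(p))
		smp_send_reschedule(task_cpu(p));	/* remote: kick unless polling idle */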
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(p);
+       sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(p);
+       sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * ttwu() will sort out the placement.
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+                       !(task_preempt_count(p) & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
 
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+       if (p->on_rq) {
+               struct rq *src_rq, *dst_rq;
+
+               src_rq = task_rq(p);
+               dst_rq = cpu_rq(cpu);
+
+               deactivate_task(src_rq, p, 0);
+               set_task_cpu(p, cpu);
+               activate_task(dst_rq, p, 0);
+               check_preempt_curr(dst_rq, p, 0);
+       } else {
+               /*
+                * Task isn't running anymore; make it appear like we migrated
+                * it before it went to sleep. This means on wakeup we make the
+                * previous cpu our target instead of where it really is.
+                */
+               p->wake_cpu = cpu;
+       }
+}
+
+struct migration_swap_arg {
+       struct task_struct *src_task, *dst_task;
+       int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+       struct migration_swap_arg *arg = data;
+       struct rq *src_rq, *dst_rq;
+       int ret = -EAGAIN;
+
+       src_rq = cpu_rq(arg->src_cpu);
+       dst_rq = cpu_rq(arg->dst_cpu);
+
+       double_raw_lock(&arg->src_task->pi_lock,
+                       &arg->dst_task->pi_lock);
+       double_rq_lock(src_rq, dst_rq);
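+       /* Re-validate the lockless checks from migrate_swap() under the locks. */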
+       if (task_cpu(arg->dst_task) != arg->dst_cpu)
+               goto unlock;
+
+       if (task_cpu(arg->src_task) != arg->src_cpu)
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+               goto unlock;
+
+       __migrate_swap_task(arg->src_task, arg->dst_cpu);
+       __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+       ret = 0;
+
+unlock:
+       double_rq_unlock(src_rq, dst_rq);
+       raw_spin_unlock(&arg->dst_task->pi_lock);
+       raw_spin_unlock(&arg->src_task->pi_lock);
+
+       return ret;
+}
+
+/*
+ * Cross-migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+       struct migration_swap_arg arg;
+       int ret = -EINVAL;
+
+       arg = (struct migration_swap_arg){
+               .src_task = cur,
+               .src_cpu = task_cpu(cur),
+               .dst_task = p,
+               .dst_cpu = task_cpu(p),
+       };
+
+       if (arg.src_cpu == arg.dst_cpu)
+               goto out;
+
+       /*
+        * These three tests are all lockless; this is OK since all of them
+        * will be re-checked with proper locks held further down the line.
+        */
+       if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+               goto out;
+
+       ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+       return ret;
+}
+
 struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
@@ -1236,9 +1333,9 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
        if (rq->idle_stamp) {
                u64 delta = rq_clock(rq) - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
+               u64 max = 2*rq->max_idle_balance_cost;
+
+               update_avg(&rq->avg_idle, delta);
 
-               if (delta > max)
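+               /* Clamp the average so one long idle period cannot skew it. */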
+               if (rq->avg_idle > max)
                        rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
+
                rq->idle_stamp = 0;
        }
 #endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       if (tif_need_resched())
+               set_preempt_need_resched();
+
        if (llist_empty(&this_rq()->wake_list)
                        && !tick_nohz_full_cpu(smp_processor_id())
                        && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->sched_class->task_waking)
                p->sched_class->task_waking(p);
 
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        p->on_rq                        = 0;
 
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
 
 #ifdef CONFIG_NUMA_BALANCING
        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-               p->mm->numa_next_scan = jiffies;
-               p->mm->numa_next_reset = jiffies;
+               p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
                p->mm->numa_scan_seq = 0;
        }
 
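+       /* Threads (CLONE_VM) share memory, so inherit the parent's preferred node. */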
+       if (clone_flags & CLONE_VM)
+               p->numa_preferred_nid = current->numa_preferred_nid;
+       else
+               p->numa_preferred_nid = -1;
+
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_work.next = &p->numa_work;
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(struct task_struct *p)
+void sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        unsigned long flags;
        int cpu = get_cpu();
 
-       __sched_fork(p);
+       __sched_fork(clone_flags, p);
        /*
         * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT_COUNT
-       /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
-#endif
+       init_task_preempt_count(p);
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
 #endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
        /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        trace_sched_switch(prev, next);
-       sched_info_switch(prev, next);
+       sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
+
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
        int dest_cpu;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
 
-void __kprobes add_preempt_count(int val)
+void __kprobes preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
 #endif
-       preempt_count() += val;
+       __preempt_count_add(val);
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
@@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
        if (preempt_count() == val)
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
 
-void __kprobes sub_preempt_count(int val)
+void __kprobes preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
 
        if (preempt_count() == val)
                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-       preempt_count() -= val;
+       __preempt_count_sub(val);
 }
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
 
 #endif
 
@@ -2430,6 +2543,7 @@ need_resched:
        put_prev_task(rq, prev);
        next = pick_next_task(rq);
        clear_tsk_need_resched(prev);
+       clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
@@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
                return;
 
        do {
-               add_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                __schedule();
-               sub_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_state;
 
        /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       BUG_ON(preempt_count() || !irqs_disabled());
 
        prev_state = exception_enter();
 
        do {
-               add_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                local_irq_enable();
                __schedule();
                local_irq_disable();
-               sub_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3598,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        struct task_struct *p;
        int retval;
 
-       get_online_cpus();
        rcu_read_lock();
 
        p = find_process_by_pid(pid);
@@ -3661,7 +3773,6 @@ out_free_cpus_allowed:
        free_cpumask_var(cpus_allowed);
 out_put_task:
        put_task_struct(p);
-       put_online_cpus();
        return retval;
 }
 
@@ -3706,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        unsigned long flags;
        int retval;
 
-       get_online_cpus();
        rcu_read_lock();
 
        retval = -ESRCH;
@@ -3719,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
                goto out_unlock;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+       cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
        rcu_read_unlock();
-       put_online_cpus();
 
        return retval;
 }
@@ -3794,16 +3903,11 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
-static inline int should_resched(void)
-{
-       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
 static void __cond_resched(void)
 {
-       add_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_add(PREEMPT_ACTIVE);
        __schedule();
-       sub_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_sub(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -4186,7 +4290,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
 
-       __sched_fork(idle);
+       __sched_fork(0, idle);
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
@@ -4212,7 +4316,7 @@ void init_idle(struct task_struct *idle, int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
+       init_idle_preempt_count(idle, cpu);
 
        /*
         * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4450,53 @@ fail:
        return ret;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+       struct migration_arg arg = { p, target_cpu };
+       int curr_cpu = task_cpu(p);
+
+       if (curr_cpu == target_cpu)
+               return 0;
+
+       if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+               return -EINVAL;
+
+       /* TODO: This is not properly updating schedstats */
+
+       return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool on_rq, running;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
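+       /* Dequeue/requeue around the update so the NUMA accounting stays consistent. */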
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+#endif
+
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5270,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5135,6 +5287,9 @@ static void update_top_cache_domain(int cpu)
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 }
 
 /*
@@ -5654,6 +5809,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -6335,14 +6491,17 @@ void __init sched_init_smp(void)
 
        sched_init_numa();
 
-       get_online_cpus();
+       /*
+        * There's no userspace yet to cause hotplug operations; hence all the
+        * cpu masks are stable and all blatant races in the below code cannot
+        * happen.
+        */
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
-       put_online_cpus();
 
        hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
        hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6664,7 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
index 196559994f7ce33e210f3a290043ed5862950ec6..e6ba5e31c7ca24e67bb3f56fbe8f422371c8fb74 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
+#include <linux/mempolicy.h>
 
 #include "sched.h"
 
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
                0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+       SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
+#endif
 #ifdef CONFIG_CGROUP_SCHED
        SEQ_printf(m, " %s", task_group_path(task_group(p)));
 #endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
        read_lock_irqsave(&tasklist_lock, flags);
 
        do_each_thread(g, p) {
-               if (!p->on_rq || task_cpu(p) != rq_cpu)
+               if (task_cpu(p) != rq_cpu)
                        continue;
 
                print_task(m, rq, p);
@@ -345,7 +349,7 @@ static void sched_debug_header(struct seq_file *m)
        cpu_clk = local_clock();
        local_irq_restore(flags);
 
-       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
+       SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
@@ -488,6 +492,56 @@ static int __init init_sched_debug_procfs(void)
 
 __initcall(init_sched_debug_procfs);
 
+#define __P(F) \
+       SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+       SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+       SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+       SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+       struct mempolicy *pol;
+       int node, i;
+
+       if (p->mm)
+               P(mm->numa_scan_seq);
+
+       task_lock(p);
+       pol = p->mempolicy;
+       if (pol && !(pol->flags & MPOL_F_MORON))
+               pol = NULL;
+       mpol_get(pol);
+       task_unlock(p);
+
+       SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
+
+       for_each_online_node(node) {
+               for (i = 0; i < 2; i++) {
+                       unsigned long nr_faults = -1;
+                       int cpu_current, home_node;
+
+                       if (p->numa_faults)
+                               nr_faults = p->numa_faults[2*node + i];
+
+                       cpu_current = !i ? (task_node(p) == node) :
+                               (pol && node_isset(node, pol->v.nodes));
+
+                       home_node = (p->numa_preferred_nid == node);
+
+                       SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+                               i, node, cpu_current, home_node, nr_faults);
+               }
+       }
+
+       mpol_put(pol);
+#endif
+}
+
 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
        unsigned long nr_switches;
@@ -591,6 +645,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
                SEQ_printf(m, "%-45s:%21Ld\n",
                           "clock-delta", (long long)(t1-t0));
        }
+
+       sched_show_numa(p, m);
 }
 
 void proc_sched_set_task(struct task_struct *p)
index 7c70201fbc61aef012d5b2536600d898602bab0a..813dd61a9b4336f4c4d48e171db54ed6e185ce43 100644 (file)
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-static void task_numa_placement(struct task_struct *p)
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+       unsigned long rss = 0;
+       unsigned long nr_scan_pages;
+
+       /*
+        * Calculations based on RSS as non-present and empty pages are skipped
+        * by the PTE scanner and NUMA hinting faults should be trapped based
+        * on resident pages
+        */
+       nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+       rss = get_mm_rss(p->mm);
+       if (!rss)
+               rss = nr_scan_pages;
+
+       rss = round_up(rss, nr_scan_pages);
+       return rss / nr_scan_pages;
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
 {
-       int seq;
+       unsigned int scan, floor;
+       unsigned int windows = 1;
+
+       if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+               windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+       floor = 1000 / windows;
+
+       scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+       return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+       unsigned int smin = task_scan_min(p);
+       unsigned int smax;
+
+       /* Watch for min being lower than max due to floor calculations */
+       smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+       return max(smin, smax);
+}
+
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the node's CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+       atomic_t refcount;
+
+       spinlock_t lock; /* nr_tasks, tasks */
+       int nr_tasks;
+       pid_t gid;
+       struct list_head task_list;
+
+       struct rcu_head rcu;
+       unsigned long total_faults;
+       unsigned long faults[0];
+};
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+       return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+       return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_faults)
+               return 0;
+
+       return p->numa_faults[task_faults_idx(nid, 0)] +
+               p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_group)
+               return 0;
+
+       return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node.  The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+       unsigned long total_faults;
+
+       if (!p->numa_faults)
+               return 0;
+
+       total_faults = p->total_numa_faults;
+
+       if (!total_faults)
+               return 0;
+
+       return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+       if (!p->numa_group || !p->numa_group->total_faults)
+               return 0;
+
+       return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+       unsigned long nr_running;
+       unsigned long load;
+
+       /* Total compute capacity of CPUs on a node */
+       unsigned long power;
+
+       /* Approximate capacity in terms of runnable tasks on a node */
+       unsigned long capacity;
+       int has_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+       int cpu;
+
+       memset(ns, 0, sizeof(*ns));
+       for_each_cpu(cpu, cpumask_of_node(nid)) {
+               struct rq *rq = cpu_rq(cpu);
+
+               ns->nr_running += rq->nr_running;
+               ns->load += weighted_cpuload(cpu);
+               ns->power += power_of(cpu);
+       }
+
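+       /* Normalize load by compute power; capacity is a whole number of tasks. */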
+       ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+       ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+       ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
+struct task_numa_env {
+       struct task_struct *p;
+
+       int src_cpu, src_nid;
+       int dst_cpu, dst_nid;
+
+       struct numa_stats src_stats, dst_stats;
+
+       int imbalance_pct, idx;
+
+       struct task_struct *best_task;
+       long best_imp;
+       int best_cpu;
+};
+
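+/* Record a new best swap candidate, holding a reference on the task. */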
+static void task_numa_assign(struct task_numa_env *env,
+                            struct task_struct *p, long imp)
+{
+       if (env->best_task)
+               put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
+
+       env->best_task = p;
+       env->best_imp = imp;
+       env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu, taking
+ * into account that it might be best if the task running on the dst_cpu is
+ * exchanged with the source task.
+ */
+static void task_numa_compare(struct task_numa_env *env,
+                             long taskimp, long groupimp)
+{
+       struct rq *src_rq = cpu_rq(env->src_cpu);
+       struct rq *dst_rq = cpu_rq(env->dst_cpu);
+       struct task_struct *cur;
+       long dst_load, src_load;
+       long load;
+       long imp = (groupimp > 0) ? groupimp : taskimp;
+
+       rcu_read_lock();
+       cur = ACCESS_ONCE(dst_rq->curr);
+       if (cur->pid == 0) /* idle */
+               cur = NULL;
+
+       /*
+        * "imp" is the fault differential for the source task between the
+        * source and destination node. Calculate the total differential for
+        * the source task and potential destination task. The more negative
+        * the value is, the more remote accesses would be expected to
+        * be incurred if the tasks were swapped.
+        */
+       if (cur) {
+               /* Skip this swap candidate if cannot move to the source cpu */
+               if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                       goto unlock;
+
+               /*
+                * If dst and source tasks are in the same NUMA group, or not
+                * in any group then look only at task weights.
+                */
+               if (cur->numa_group == env->p->numa_group) {
+                       imp = taskimp + task_weight(cur, env->src_nid) -
+                             task_weight(cur, env->dst_nid);
+                       /*
+                        * Add some hysteresis to prevent swapping the
+                        * tasks within a group over tiny differences.
+                        */
+                       if (cur->numa_group)
+                               imp -= imp/16;
+               } else {
+                       /*
+                        * Compare the group weights. If a task is all by
+                        * itself (not part of a group), use the task weight
+                        * instead.
+                        */
+                       if (env->p->numa_group)
+                               imp = groupimp;
+                       else
+                               imp = taskimp;
+
+                       if (cur->numa_group)
+                               imp += group_weight(cur, env->src_nid) -
+                                      group_weight(cur, env->dst_nid);
+                       else
+                               imp += task_weight(cur, env->src_nid) -
+                                      task_weight(cur, env->dst_nid);
+               }
+       }
+
+       if (imp < env->best_imp)
+               goto unlock;
+
+       if (!cur) {
+               /* Is there capacity at our destination? */
+               if (env->src_stats.has_capacity &&
+                   !env->dst_stats.has_capacity)
+                       goto unlock;
+
+               goto balance;
+       }
+
+       /* Balance doesn't matter much if we're running a task per cpu */
+       if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+               goto assign;
+
+       /*
+        * In the overloaded case, try and keep the load balanced.
+        */
+balance:
+       dst_load = env->dst_stats.load;
+       src_load = env->src_stats.load;
+
+       /* XXX missing power terms */
+       load = task_h_load(env->p);
+       dst_load += load;
+       src_load -= load;
+
+       if (cur) {
+               load = task_h_load(cur);
+               dst_load -= load;
+               src_load += load;
+       }
+
+       /* make src_load the smaller */
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
+
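+       /* Reject the move if it pushes the imbalance past imbalance_pct. */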
+       if (src_load * env->imbalance_pct < dst_load * 100)
+               goto unlock;
+
+assign:
+       task_numa_assign(env, cur, imp);
+unlock:
+       rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+                               long taskimp, long groupimp)
+{
+       int cpu;
+
+       for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+               /* Skip this CPU if the source task cannot migrate */
+               if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+                       continue;
+
+               env->dst_cpu = cpu;
+               task_numa_compare(env, taskimp, groupimp);
+       }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+       struct task_numa_env env = {
+               .p = p,
+
+               .src_cpu = task_cpu(p),
+               .src_nid = task_node(p),
+
+               .imbalance_pct = 112,
+
+               .best_task = NULL,
+               .best_imp = 0,
+               .best_cpu = -1
+       };
+       struct sched_domain *sd;
+       unsigned long taskweight, groupweight;
+       int nid, ret;
+       long taskimp, groupimp;
+
+       /*
+        * Pick the lowest SD_NUMA domain, as that would have the smallest
+        * imbalance and would be the first to start moving tasks about.
+        *
+        * And we want to avoid any moving of tasks about, as that would create
+        * random movement of tasks -- counter to the numa conditions we're trying
+        * to satisfy here.
+        */
+       rcu_read_lock();
+       sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       rcu_read_unlock();
+
+       taskweight = task_weight(p, env.src_nid);
+       groupweight = group_weight(p, env.src_nid);
+       update_numa_stats(&env.src_stats, env.src_nid);
+       env.dst_nid = p->numa_preferred_nid;
+       taskimp = task_weight(p, env.dst_nid) - taskweight;
+       groupimp = group_weight(p, env.dst_nid) - groupweight;
+       update_numa_stats(&env.dst_stats, env.dst_nid);
+
+       /* If the preferred nid has capacity, try to use it. */
+       if (env.dst_stats.has_capacity)
+               task_numa_find_cpu(&env, taskimp, groupimp);
+
+       /* No space available on the preferred nid. Look elsewhere. */
+       if (env.best_cpu == -1) {
+               for_each_online_node(nid) {
+                       if (nid == env.src_nid || nid == p->numa_preferred_nid)
+                               continue;
+
+                       /* Only consider nodes where both the task and the group benefit */
+                       taskimp = task_weight(p, nid) - taskweight;
+                       groupimp = group_weight(p, nid) - groupweight;
+                       if (taskimp < 0 && groupimp < 0)
+                               continue;
+
+                       env.dst_nid = nid;
+                       update_numa_stats(&env.dst_stats, env.dst_nid);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
+               }
+       }
+
+       /* No better CPU than the current one was found. */
+       if (env.best_cpu == -1)
+               return -EAGAIN;
+
+       sched_setnuma(p, env.dst_nid);
+
+       /*
+        * Reset the scan period if the task is being rescheduled on an
+        * alternative node to recheck if the task is now properly placed.
+        */
+       p->numa_scan_period = task_scan_min(p);
+
+       if (env.best_task == NULL) {
+               int ret = migrate_task_to(p, env.best_cpu);
+               return ret;
+       }
+
+       ret = migrate_swap(p, env.best_task);
+       put_task_struct(env.best_task);
+       return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+       /* This task has no NUMA fault statistics yet */
+       if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+               return;
+
+       /* Periodically retry migrating the task to the preferred node */
+       p->numa_migrate_retry = jiffies + HZ;
+
+       /* Success if task is already running on preferred CPU */
+       if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
+               return;
+
+       /* Otherwise, try migrate to a CPU on the preferred node */
+       task_numa_migrate(p);
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+                       unsigned long shared, unsigned long private)
+{
+       unsigned int period_slot;
+       int ratio;
+       int diff;
+
+       unsigned long remote = p->numa_faults_locality[0];
+       unsigned long local = p->numa_faults_locality[1];
+
+       /*
+        * If there were no recorded hinting faults then either the task is
+        * completely idle or all activity is in areas that are not of interest
+        * to automatic numa balancing. Scan slower.
+        */
+       if (local + shared == 0) {
+               p->numa_scan_period = min(p->numa_scan_period_max,
+                       p->numa_scan_period << 1);
+
+               p->mm->numa_next_scan = jiffies +
+                       msecs_to_jiffies(p->numa_scan_period);
 
-       if (!p->mm)     /* for example, ksmd faulting in a user's mm */
                return;
+       }
+
+       /*
+        * Prepare to scale scan period relative to the current period.
+        *       == NUMA_PERIOD_THRESHOLD scan period stays the same
+        *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+        *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+        */
+       period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+       ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+       if (ratio >= NUMA_PERIOD_THRESHOLD) {
+               int slot = ratio - NUMA_PERIOD_THRESHOLD;
+               if (!slot)
+                       slot = 1;
+               diff = slot * period_slot;
+       } else {
+               diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+               /*
+                * Scale scan rate increases based on sharing. There is an
+                * inverse relationship between the degree of sharing and
+                * the adjustment made to the scanning period. Broadly
+                * speaking the intent is that there is little point
+                * scanning faster if shared accesses dominate as it may
+                * simply bounce migrations uselessly
+                */
+               period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+               ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+               diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+       }
+
+       p->numa_scan_period = clamp(p->numa_scan_period + diff,
+                       task_scan_min(p), task_scan_max(p));
+       memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq, nid, max_nid = -1, max_group_nid = -1;
+       unsigned long max_faults = 0, max_group_faults = 0;
+       unsigned long fault_types[2] = { 0, 0 };
+       spinlock_t *group_lock = NULL;
+
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
+       p->numa_scan_period_max = task_scan_max(p);
+
+       /* If the task is part of a group, prevent parallel updates to group stats */
+       if (p->numa_group) {
+               group_lock = &p->numa_group->lock;
+               spin_lock(group_lock);
+       }
+
+       /* Find the node with the highest number of faults */
+       for_each_online_node(nid) {
+               unsigned long faults = 0, group_faults = 0;
+               int priv, i;
+
+               for (priv = 0; priv < 2; priv++) {
+                       long diff;
+
+                       i = task_faults_idx(nid, priv);
+                       diff = -p->numa_faults[i];
+
+                       /* Decay existing window, copy faults since last scan */
+                       p->numa_faults[i] >>= 1;
+                       p->numa_faults[i] += p->numa_faults_buffer[i];
+                       fault_types[priv] += p->numa_faults_buffer[i];
+                       p->numa_faults_buffer[i] = 0;
+
+                       faults += p->numa_faults[i];
+                       diff += p->numa_faults[i];
+                       p->total_numa_faults += diff;
+                       if (p->numa_group) {
+                               /* safe because we can only change our own group */
+                               p->numa_group->faults[i] += diff;
+                               p->numa_group->total_faults += diff;
+                               group_faults += p->numa_group->faults[i];
+                       }
+               }
+
+               if (faults > max_faults) {
+                       max_faults = faults;
+                       max_nid = nid;
+               }
+
+               if (group_faults > max_group_faults) {
+                       max_group_faults = group_faults;
+                       max_group_nid = nid;
+               }
+       }
+
+       update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+       if (p->numa_group) {
+               /*
+                * If the preferred task and group nids are different,
+                * iterate over the nodes again to find the best place.
+                */
+               if (max_nid != max_group_nid) {
+                       unsigned long weight, max_weight = 0;
+
+                       for_each_online_node(nid) {
+                               weight = task_weight(p, nid) + group_weight(p, nid);
+                               if (weight > max_weight) {
+                                       max_weight = weight;
+                                       max_nid = nid;
+                               }
+                       }
+               }
+
+               spin_unlock(group_lock);
+       }
+
+       /* Set the preferred node to the node with the most faults */
+       if (max_faults && max_nid != p->numa_preferred_nid) {
+               /* Update the preferred nid and migrate task if possible */
+               sched_setnuma(p, max_nid);
+               numa_migrate_preferred(p);
+       }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+       return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+       if (atomic_dec_and_test(&grp->refcount))
+               kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+                       int *priv)
+{
+       struct numa_group *grp, *my_grp;
+       struct task_struct *tsk;
+       bool join = false;
+       int cpu = cpupid_to_cpu(cpupid);
+       int i;
+
+       if (unlikely(!p->numa_group)) {
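+               /* No group yet: create one for this task, seeded with its fault counts. */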
+               unsigned int size = sizeof(struct numa_group) +
+                                   2*nr_node_ids*sizeof(unsigned long);
+
+               grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+               if (!grp)
+                       return;
+
+               atomic_set(&grp->refcount, 1);
+               spin_lock_init(&grp->lock);
+               INIT_LIST_HEAD(&grp->task_list);
+               grp->gid = p->pid;
+
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       grp->faults[i] = p->numa_faults[i];
+
+               grp->total_faults = p->total_numa_faults;
+
+               list_add(&p->numa_entry, &grp->task_list);
+               grp->nr_tasks++;
+               rcu_assign_pointer(p->numa_group, grp);
+       }
+
+       rcu_read_lock();
+       tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+       if (!cpupid_match_pid(tsk, cpupid))
+               goto no_join;
 
-       /* FIXME: Scheduling placement policy hints go here */
+       grp = rcu_dereference(tsk->numa_group);
+       if (!grp)
+               goto no_join;
+
+       my_grp = p->numa_group;
+       if (grp == my_grp)
+               goto no_join;
+
+       /*
+        * Only join the other group if it's bigger; if we're the bigger group,
+        * the other task will join us.
+        */
+       if (my_grp->nr_tasks > grp->nr_tasks)
+               goto no_join;
+
+       /*
+        * Tie-break on the grp address.
+        */
+       if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+               goto no_join;
+
+       /* Always join threads in the same process. */
+       if (tsk->mm == current->mm)
+               join = true;
+
+       /* Simple filter to avoid false positives due to PID collisions */
+       if (flags & TNF_SHARED)
+               join = true;
+
+       /* Update priv based on whether false sharing was detected */
+       *priv = !join;
+
+       if (join && !get_numa_group(grp))
+               goto no_join;
+
+       rcu_read_unlock();
+
+       if (!join)
+               return;
+
+       double_lock(&my_grp->lock, &grp->lock);
+
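+       /* Transfer this task's fault stats from the old group to the new one. */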
+       for (i = 0; i < 2*nr_node_ids; i++) {
+               my_grp->faults[i] -= p->numa_faults[i];
+               grp->faults[i] += p->numa_faults[i];
+       }
+       my_grp->total_faults -= p->total_numa_faults;
+       grp->total_faults += p->total_numa_faults;
+
+       list_move(&p->numa_entry, &grp->task_list);
+       my_grp->nr_tasks--;
+       grp->nr_tasks++;
+
+       spin_unlock(&my_grp->lock);
+       spin_unlock(&grp->lock);
+
+       rcu_assign_pointer(p->numa_group, grp);
+
+       put_numa_group(my_grp);
+       return;
+
+no_join:
+       rcu_read_unlock();
+       return;
+}
+
+void task_numa_free(struct task_struct *p)
+{
+       struct numa_group *grp = p->numa_group;
+       int i;
+       void *numa_faults = p->numa_faults;
+
+       if (grp) {
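+               /* Subtract this task's faults from the group before leaving it. */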
+               spin_lock(&grp->lock);
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       grp->faults[i] -= p->numa_faults[i];
+               grp->total_faults -= p->total_numa_faults;
+
+               list_del(&p->numa_entry);
+               grp->nr_tasks--;
+               spin_unlock(&grp->lock);
+               rcu_assign_pointer(p->numa_group, NULL);
+               put_numa_group(grp);
+       }
+
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+       kfree(numa_faults);
 }
 
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 {
        struct task_struct *p = current;
+       bool migrated = flags & TNF_MIGRATED;
+       int priv;
 
        if (!numabalancing_enabled)
                return;
 
-       /* FIXME: Allocate task-specific structure for placement policy here */
+       /* for example, ksmd faulting in a user's mm */
+       if (!p->mm)
+               return;
+
+       /* Do not worry about placement if exiting */
+       if (p->state == TASK_DEAD)
+               return;
+
+       /* Allocate buffer to track faults on a per-node basis */
+       if (unlikely(!p->numa_faults)) {
+               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+
+               /* numa_faults and numa_faults_buffer share the allocation */
+               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
+               if (!p->numa_faults)
+                       return;
+
+               BUG_ON(p->numa_faults_buffer);
+               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+               p->total_numa_faults = 0;
+               memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+       }
 
        /*
-        * If pages are properly placed (did not migrate) then scan slower.
-        * This is reset periodically in case of phase changes
+        * First accesses are treated as private; otherwise, consider accesses
+        * to be private if the accessing pid has not changed.
         */
-        if (!migrated)
-               p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-                       p->numa_scan_period + jiffies_to_msecs(10));
+       if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+               priv = 1;
+       } else {
+               priv = cpupid_match_pid(p, last_cpupid);
+               if (!priv && !(flags & TNF_NO_GROUP))
+                       task_numa_group(p, last_cpupid, flags, &priv);
+       }
 
        task_numa_placement(p);
+
+       /*
+        * Retry task to preferred node migration periodically, in case it
+        * previously failed, or the scheduler moved us.
+        */
+       if (time_after(jiffies, p->numa_migrate_retry))
+               numa_migrate_preferred(p);
+
+       if (migrated)
+               p->numa_pages_migrated += pages;
+
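+       /* Account the fault in the current scan window and its locality. */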
+       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+       p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
        struct mm_struct *mm = p->mm;
        struct vm_area_struct *vma;
        unsigned long start, end;
+       unsigned long nr_pte_updates = 0;
        long pages;
 
        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
        if (p->flags & PF_EXITING)
                return;
 
-       /*
-        * We do not care about task placement until a task runs on a node
-        * other than the first one used by the address space. This is
-        * largely because migrations are driven by what CPU the task
-        * is running on. If it's never scheduled on another node, it'll
-        * not migrate so why bother trapping the fault.
-        */
-       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
-               mm->first_nid = numa_node_id();
-       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
-               /* Are we running on a new node yet? */
-               if (numa_node_id() == mm->first_nid &&
-                   !sched_feat_numa(NUMA_FORCE))
-                       return;
-
-               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
-       }
-
-       /*
-        * Reset the scan period if enough time has gone by. Objective is that
-        * scanning will be reduced if pages are properly placed. As tasks
-        * can enter different phases this needs to be re-examined. Lacking
-        * proper tracking of reference behaviour, this blunt hammer is used.
-        */
-       migrate = mm->numa_next_reset;
-       if (time_after(now, migrate)) {
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
-               xchg(&mm->numa_next_reset, next_scan);
+       if (!mm->numa_next_scan) {
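+               /* First scan of this mm: honour the initial scan delay. */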
+               mm->numa_next_scan = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
        }
 
        /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
        if (time_before(now, migrate))
                return;
 
-       if (p->numa_scan_period == 0)
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       if (p->numa_scan_period == 0) {
+               p->numa_scan_period_max = task_scan_max(p);
+               p->numa_scan_period = task_scan_min(p);
+       }
 
        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                return;
 
        /*
-        * Do not set pte_numa if the current running node is rate-limited.
-        * This loses statistics on the fault but if we are unwilling to
-        * migrate to this node, it is less likely we can do useful work
+        * Delay this task enough that another task of this mm will likely win
+        * the next time around.
         */
-       if (migrate_ratelimited(numa_node_id()))
-               return;
+       p->node_stamp += 2 * TICK_NSEC;
 
        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
                        continue;
 
-               /* Skip small VMAs. They are not likely to be of relevance */
-               if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+               /*
+                * Shared library pages mapped by multiple processes are not
+                * migrated as it is expected they are cache replicated. Avoid
+                * hinting faults in read-only file-backed mappings or the vdso
+                * as migrating the pages will be of marginal benefit.
+                */
+               if (!vma->vm_mm ||
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;
 
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                        end = min(end, vma->vm_end);
-                       pages -= change_prot_numa(vma, start, end);
+                       nr_pte_updates += change_prot_numa(vma, start, end);
+
+                       /*
+                        * Scan sysctl_numa_balancing_scan_size but ensure that
+                        * at least one PTE is updated so that unused virtual
+                        * address space is quickly skipped.
+                        */
+                       if (nr_pte_updates)
+                               pages -= (end - start) >> PAGE_SHIFT;
 
                        start = end;
                        if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
 
 out:
        /*
-        * It is possible to reach the end of the VMA list but the last few VMAs are
-        * not guaranteed to the vma_migratable. If they are not, we would find the
-        * !migratable VMA on the next scan but not reset the scanner to the start
-        * so check it now.
+        * It is possible to reach the end of the VMA list but the last few
+        * VMAs are not guaranteed to be vma_migratable. If they are not, we
+        * would find the !migratable VMA on the next scan but not reset the
+        * scanner to the start so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
        if (now - curr->node_stamp > period) {
                if (!curr->node_stamp)
-                       curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               curr->node_stamp = now;
+                       curr->numa_scan_period = task_scan_min(curr);
+               curr->node_stamp += period;
 
                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (!parent_entity(se))
                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-       if (entity_is_task(se))
-               list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+       if (entity_is_task(se)) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               account_numa_enqueue(rq, task_of(se));
+               list_add(&se->group_node, &rq->cfs_tasks);
+       }
 #endif
        cfs_rq->nr_running++;
 }
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-       if (entity_is_task(se))
+       if (entity_is_task(se)) {
+               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                list_del_init(&se->group_node);
+       }
        cfs_rq->nr_running--;
 }
 
@@ -3113,7 +3888,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
-       if (!tg->parent)        /* the trivial, non-cgroup case */
+       if (!tg->parent || !wl) /* the trivial, non-cgroup case */
                return wl;
 
        for_each_sched_entity(se) {
@@ -3166,8 +3941,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        return wl;
 }
@@ -3420,11 +4194,10 @@ done:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
        int new_cpu = cpu;
        int want_affine = 0;
        int sync = wake_flags & WF_SYNC;
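
With prev_cpu now an explicit argument, a caller can hand in something other than task_cpu(p). A hypothetical call site (a sketch; the core-kernel plumbing around it is assumed, not shown in this hunk):

        /* sketch: core code picking the fair class's CPU for a wakeup,
         * feeding in p->wake_cpu (kept current in __set_task_cpu()) rather
         * than letting select_task_rq_fair() re-derive task_cpu(p) */
        cpu = p->sched_class->select_task_rq(p, p->wake_cpu,
                                             SD_BALANCE_WAKE, wake_flags);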
@@ -3904,9 +4677,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED  0x04
+#define LBF_SOME_PINNED        0x08
 
 struct lb_env {
        struct sched_domain     *sd;
@@ -3929,6 +4705,8 @@ struct lb_env {
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
+
+       enum fbq_type           fbq_type;
 };
 
 /*
@@ -3975,6 +4753,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+           !(env->sd->flags & SD_NUMA)) {
+               return false;
+       }
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid)
+               return false;
+
+       /* Always encourage migration to the preferred node. */
+       if (dst_nid == p->numa_preferred_nid)
+               return true;
+
+       /* If both task and group weight improve, this move is a winner. */
+       if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+           group_weight(p, dst_nid) > group_weight(p, src_nid))
+               return true;
+
+       return false;
+}
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+               return false;
+
+       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+               return false;
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid)
+               return false;
+
+       /* Migrating away from the preferred node is always bad. */
+       if (src_nid == p->numa_preferred_nid)
+               return true;
+
+       /* If either task or group weight get worse, don't do it. */
+       if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+           group_weight(p, dst_nid) < group_weight(p, src_nid))
+               return true;
+
+       return false;
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+#endif
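
A stand-alone sketch of the two predicates above, with made-up per-node fault weights (task_weight()/group_weight() are defined elsewhere in this series; the tables here are purely hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    static int task_w[2]  = { 30, 70 };  /* hypothetical task_weight() per node */
    static int group_w[2] = { 40, 60 };  /* hypothetical group_weight() per node */
    static int preferred_nid = 1;

    static bool improves(int src, int dst)
    {
            if (src == dst)
                    return false;
            if (dst == preferred_nid)       /* always encourage the preferred node */
                    return true;
            return task_w[dst] > task_w[src] && group_w[dst] > group_w[src];
    }

    int main(void)
    {
            printf("0->1 improves: %d\n", improves(0, 1)); /* 1: dst is preferred  */
            printf("1->0 improves: %d\n", improves(1, 0)); /* 0: both weights drop */
            return 0;
    }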
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -3997,6 +4847,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
+               env->flags |= LBF_SOME_PINNED;
+
                /*
                 * Remember if this task can be migrated to any other cpu in
                 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4857,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 * Also avoid computing new_dst_cpu if we have already computed
                 * one in current iteration.
                 */
-               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
                        return 0;
 
                /* Prevent to re-select dst_cpu via env's cpus */
                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
                        if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
-                               env->flags |= LBF_SOME_PINNED;
+                               env->flags |= LBF_DST_PINNED;
                                env->new_dst_cpu = cpu;
                                break;
                        }
@@ -4030,11 +4882,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
        /*
         * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
+        * 1) destination numa node is preferred
+        * 2) task is cache cold, or
+        * 3) too many balance attempts have failed.
         */
-
        tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+       if (!tsk_cache_hot)
+               tsk_cache_hot = migrate_degrades_locality(p, env);
+
+       if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 
@@ -4077,8 +4942,6 @@ static int move_one_task(struct lb_env *env)
        return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
@@ -4291,6 +5154,10 @@ struct sg_lb_stats {
        unsigned int group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -4330,7 +5197,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
  *
  * Return: The load index.
  */
@@ -4447,7 +5314,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long power;
+       unsigned long power, power_orig;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5326,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
                return;
        }
 
-       power = 0;
+       power_orig = power = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -4467,8 +5334,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
                 * span the current group.
                 */
 
-               for_each_cpu(cpu, sched_group_cpus(sdg))
-                       power += power_of(cpu);
+               for_each_cpu(cpu, sched_group_cpus(sdg)) {
+                       struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+
+                       power_orig += sg->sgp->power_orig;
+                       power += sg->sgp->power;
+               }
        } else  {
                /*
                 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5348,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
+                       power_orig += group->sgp->power_orig;
                        power += group->sgp->power;
                        group = group->next;
                } while (group != child->groups);
        }
 
-       sdg->sgp->power_orig = sdg->sgp->power = power;
+       sdg->sgp->power_orig = power_orig;
+       sdg->sgp->power = power;
 }
 
 /*
@@ -4526,13 +5399,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * cpu 3 and leave one of the cpus in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
  *
  * When this is so detected; this group becomes a candidate for busiest; see
- * update_sd_pick_busiest(). And calculcate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
  * to create an effective group imbalance.
  *
  * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5412,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * subtle and fragile situation.
  */
 
-struct sg_imb_stats {
-       unsigned long max_nr_running, min_nr_running;
-       unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
 {
-       sgi->max_cpu_load = sgi->max_nr_running = 0UL;
-       sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+       return group->sgp->imbalance;
 }
 
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
-                   unsigned long load, unsigned long nr_running)
+/*
+ * Compute the group capacity.
+ *
+ * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor to compute the actual number of cores,
+ * then limiting the power unit capacity to that.
+ */
+static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
 {
-       if (load > sgi->max_cpu_load)
-               sgi->max_cpu_load = load;
-       if (sgi->min_cpu_load > load)
-               sgi->min_cpu_load = load;
+       unsigned int capacity, smt, cpus;
+       unsigned int power, power_orig;
 
-       if (nr_running > sgi->max_nr_running)
-               sgi->max_nr_running = nr_running;
-       if (sgi->min_nr_running > nr_running)
-               sgi->min_nr_running = nr_running;
-}
+       power = group->sgp->power;
+       power_orig = group->sgp->power_orig;
+       cpus = group->group_weight;
 
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
-{
-       /*
-        * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of a task.
-        *
-        * APZ: with cgroup the avg task weight can vary wildly and
-        *      might not be a suitable number - should we keep a
-        *      normalized nr_running number somewhere that negates
-        *      the hierarchy?
-        */
-       if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
-           (sgi->max_nr_running - sgi->min_nr_running) > 1)
-               return 1;
+       /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
+       smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
+       capacity = cpus / smt; /* cores */
 
-       return 0;
+       capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
+       if (!capacity)
+               capacity = fix_small_capacity(env->sd, group);
+
+       return capacity;
 }
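
A worked example of the phantom-core case this fixes, using made-up numbers (SCHED_POWER_SCALE taken as 1024; an SMT-2 group of 8 threads with per-thread power 589, i.e. 4 physical cores):

    #include <stdio.h>

    int main(void)
    {
            unsigned int scale = 1024, cpus = 8;            /* SMT-2, 4 cores */
            unsigned int power = 4712, power_orig = 4712;   /* 8 * 589 */

            unsigned int smt = (scale * cpus + power_orig - 1) / power_orig;
            unsigned int capacity = cpus / smt;             /* 8 / 2 = 4 cores */
            unsigned int old = (power + scale / 2) / scale; /* rounds 4.6 up to 5 */

            if (old < capacity)
                    capacity = old;
            printf("sg_capacity: %u (old rounding said %u)\n", capacity, old);
            return 0;
    }

The old DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE) reports 5 "cores" for what is physically a 4-core part; dividing out the smt factor first yields 4.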
 
 /**
@@ -4597,12 +5456,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
                        int local_group, struct sg_lb_stats *sgs)
 {
-       struct sg_imb_stats sgi;
        unsigned long nr_running;
        unsigned long load;
        int i;
 
-       init_sg_imb_stats(&sgi);
+       memset(sgs, 0, sizeof(*sgs));
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
@@ -4610,24 +5468,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                nr_running = rq->nr_running;
 
                /* Bias balancing toward cpus of our domain */
-               if (local_group) {
+               if (local_group)
                        load = target_load(i, load_idx);
-               } else {
+               else
                        load = source_load(i, load_idx);
-                       update_sg_imb_stats(&sgi, load, nr_running);
-               }
 
                sgs->group_load += load;
                sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+               sgs->nr_numa_running += rq->nr_numa_running;
+               sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                sgs->sum_weighted_load += weighted_cpuload(i);
                if (idle_cpu(i))
                        sgs->idle_cpus++;
        }
 
-       if (local_group && (env->idle != CPU_NEWLY_IDLE ||
-                       time_after_eq(jiffies, group->sgp->next_update)))
-               update_group_power(env->sd, env->dst_cpu);
-
        /* Adjust by relative CPU power of the group */
        sgs->group_power = group->sgp->power;
        sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5491,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        if (sgs->sum_nr_running)
                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-       sgs->group_imb = sg_imbalanced(sgs, &sgi);
-
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
-
-       if (!sgs->group_capacity)
-               sgs->group_capacity = fix_small_capacity(env->sd, group);
-
        sgs->group_weight = group->group_weight;
 
+       sgs->group_imb = sg_imbalanced(group);
+       sgs->group_capacity = sg_capacity(env, group);
+
        if (sgs->group_capacity > sgs->sum_nr_running)
                sgs->group_has_capacity = 1;
 }
@@ -4693,14 +5544,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running > sgs->nr_numa_running)
+               return regular;
+       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+               return remote;
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       if (rq->nr_running > rq->nr_numa_running)
+               return regular;
+       if (rq->nr_running > rq->nr_preferred_running)
+               return remote;
+       return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
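
A quick sketch of the three-way classification with made-up counts (the stand-ins mirror rq->nr_running, rq->nr_numa_running and rq->nr_preferred_running; find_busiest_queue() below then skips any queue whose type exceeds env->fbq_type):

    #include <stdio.h>

    enum fbq_type { regular, remote, all };

    static enum fbq_type classify(unsigned nr, unsigned numa, unsigned pref)
    {
            if (nr > numa)
                    return regular;  /* some tasks have no NUMA placement at all */
            if (nr > pref)
                    return remote;   /* all NUMA, but some on the wrong node */
            return all;              /* everyone already on its preferred node */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   classify(4, 2, 1),   /* regular: 2 plain tasks present    */
                   classify(4, 4, 1),   /* remote:  3 tasks misplaced        */
                   classify(4, 4, 4));  /* all:     nothing worth disturbing */
            return 0;
    }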
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
- * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                       struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5599,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                if (local_group) {
                        sds->local = sg;
                        sgs = &sds->local_stat;
+
+                       if (env->idle != CPU_NEWLY_IDLE ||
+                           time_after_eq(jiffies, sg->sgp->next_update))
+                               update_group_power(env->sd, env->dst_cpu);
                }
 
-               memset(sgs, 0, sizeof(*sgs));
                update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
+               if (local_group)
+                       goto next_group;
+
                /*
                 * In case the child domain prefers tasks go to siblings
                 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5620,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                 * heaviest group when it is already under-utilized (possible
                 * with a large weight task outweighs the tasks on the system).
                 */
-               if (prefer_sibling && !local_group &&
-                               sds->local && sds->local_stat.group_has_capacity)
+               if (prefer_sibling && sds->local &&
+                   sds->local_stat.group_has_capacity)
                        sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-               /* Now, start updating sd_lb_stats */
-               sds->total_load += sgs->group_load;
-               sds->total_pwr += sgs->group_power;
-
-               if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+               if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
                        sds->busiest_stat = *sgs;
                }
 
+next_group:
+               /* Now, start updating sd_lb_stats */
+               sds->total_load += sgs->group_load;
+               sds->total_pwr += sgs->group_power;
+
                sg = sg->next;
        } while (sg != env->sd->groups);
+
+       if (env->sd->flags & SD_NUMA)
+               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5053,15 +5942,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        int i;
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
+               unsigned long power, capacity, wl;
+               enum fbq_type rt;
+
+               rq = cpu_rq(i);
+               rt = fbq_classify_rq(rq);
 
+               /*
+                * We classify groups/runqueues into three types:
+                *  - regular: there are !numa tasks
+                *  - remote:  there are numa tasks that run on the 'wrong' node
+                *  - all:     there is no distinction
+                *
+                * In order to avoid migrating ideally placed numa tasks,
+                * ignore them when there are better options.
+                *
+                * If we ignore the actual busiest queue to migrate another
+                * task, the next balance pass can still reduce the busiest
+                * queue by moving tasks around inside the node.
+                *
+                * If we cannot move enough load due to this classification
+                * the next pass will adjust the group classification and
+                * allow migration of more tasks.
+                *
+                * Both cases only affect the total convergence complexity.
+                */
+               if (rt > env->fbq_type)
+                       continue;
+
+               power = power_of(i);
+               capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                if (!capacity)
                        capacity = fix_small_capacity(env->sd, group);
 
-               rq = cpu_rq(i);
                wl = weighted_cpuload(i);
 
                /*
@@ -5164,6 +6077,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        int *continue_balancing)
 {
        int ld_moved, cur_ld_moved, active_balance = 0;
+       struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -5177,6 +6091,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
+               .fbq_type       = all,
        };
 
        /*
@@ -5268,17 +6183,17 @@ more_balance:
                 * moreover subsequent load balance cycles should correct the
                 * excess load moved.
                 */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+               if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+                       /* Prevent to re-select dst_cpu via env's cpus */
+                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
-                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.flags       &= ~LBF_DST_PINNED;
                        env.loop         = 0;
                        env.loop_break   = sched_nr_migrate_break;
 
-                       /* Prevent to re-select dst_cpu via env's cpus */
-                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
                        /*
                         * Go back to "more_balance" rather than "redo" since we
                         * need to continue with same src_cpu.
@@ -5286,6 +6201,18 @@ more_balance:
                        goto more_balance;
                }
 
+               /*
+                * We failed to reach balance because of affinity.
+                */
+               if (sd_parent) {
+                       int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                               *group_imbalance = 1;
+                       } else if (*group_imbalance)
+                               *group_imbalance = 0;
+               }
+
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6320,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
+       u64 curr_cost = 0;
 
        this_rq->idle_stamp = rq_clock(this_rq);
 
@@ -5409,15 +6337,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int continue_balancing = 1;
+               u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+                       break;
+
                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       t0 = sched_clock_cpu(this_cpu);
+
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
+
+                       domain_cost = sched_clock_cpu(this_cpu) - t0;
+                       if (domain_cost > sd->max_newidle_lb_cost)
+                               sd->max_newidle_lb_cost = domain_cost;
+
+                       curr_cost += domain_cost;
                }
 
                interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6379,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
 }
 
 /*
@@ -5662,15 +6605,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
-       int need_serialize;
+       int need_serialize, need_decay = 0;
+       u64 max_cost = 0;
 
        update_blocked_averages(cpu);
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
+               /*
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains. Decay ~1% per second.
+                */
+               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+                       sd->max_newidle_lb_cost =
+                               (sd->max_newidle_lb_cost * 253) / 256;
+                       sd->next_decay_max_lb_cost = jiffies + HZ;
+                       need_decay = 1;
+               }
+               max_cost += sd->max_newidle_lb_cost;
+
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!continue_balancing) {
+                       if (need_decay)
+                               continue;
+                       break;
+               }
+
                interval = sd->balance_interval;
                if (idle != CPU_IDLE)
                        interval *= sd->busy_factor;
@@ -5689,7 +6656,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
                        if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                                /*
-                                * The LBF_SOME_PINNED logic could have changed
+                                * The LBF_DST_PINNED logic could have changed
                                 * env->dst_cpu, so we can't know our idle
                                 * state even if we migrated tasks. Update it.
                                 */
@@ -5704,14 +6671,14 @@ out:
                        next_balance = sd->last_balance + interval;
                        update_next_balance = 1;
                }
-
+       }
+       if (need_decay) {
                /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
+                * Ensure the rq-wide value also decays but keep it at a
+                * reasonable floor to avoid funnies with rq->avg_idle.
                 */
-               if (!continue_balancing)
-                       break;
+               rq->max_idle_balance_cost =
+                       max((u64)sysctl_sched_migration_cost, max_cost);
        }
        rcu_read_unlock();
 
index 99399f8e4799b118e81486f9bc9b672947fd1ab7..5716929a2e3a4ad2d492ad2de0c3431291ea81a1 100644
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
 /*
  * Apply the automatic NUMA scheduling policy. Enabled automatically
  * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=. Allow PTE scanning to be forced on UMA machines
- * for debugging the core machinery.
+ * numa_balancing=
  */
 #ifdef CONFIG_NUMA_BALANCING
 SCHED_FEAT(NUMA,       false)
-SCHED_FEAT(NUMA_FORCE, false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+
+/*
+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
+ * lower number of hinting faults has been recorded. As this can
+ * prevent a task from ever migrating to a new node due to CPU
+ * overload, it is disabled by default.
+ */
+SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
index d8da01008d390a770acb43c22bfb1de38e389a76..516c3d9ceea1455cee6c5a7b96db7f92a141f9af 100644
@@ -9,7 +9,7 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
        return task_cpu(p); /* IDLE tasks are never migrated */
 }
index 01970c8e64df64def4585bf3bd517c3bdb8a9354..a848f526b941a1cbda606bd954b927d7e7a8eb76 100644
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
+        *
+        * Matched by the barrier in pull_rt_task().
         */
-       wmb();
+       smp_wmb();
        atomic_inc(&rq->rd->rto_count);
 }
 
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
        struct task_struct *curr;
        struct rq *rq;
-       int cpu;
-
-       cpu = task_cpu(p);
 
        if (p->nr_cpus_allowed == 1)
                goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
         */
        if (curr && unlikely(rt_task(curr)) &&
            (curr->nr_cpus_allowed < 2 ||
-            curr->prio <= p->prio) &&
-           (p->nr_cpus_allowed > 1)) {
+            curr->prio <= p->prio)) {
                int target = find_lowest_rq(p);
 
                if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
        if (likely(!rt_overloaded(this_rq)))
                return 0;
 
+       /*
+        * Match the barrier from rt_set_overload(); this guarantees that if we
+        * see overloaded we must also see the rto_mask bit.
+        */
+       smp_rmb();
+
        for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
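
The pairing can be read as the usual publish/subscribe pattern: the write barrier publishes the rto_mask bit before the count, and the read barrier orders the count read before the mask scan. A user-space analogue with C11 fences (a sketch; the mask and count are stand-ins for rd->rto_mask and rd->rto_count):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic unsigned long rto_mask;
    static atomic_int rto_count;

    static void writer(int cpu)                   /* rt_set_overload() shape */
    {
            atomic_fetch_or_explicit(&rto_mask, 1UL << cpu, memory_order_relaxed);
            atomic_thread_fence(memory_order_release);       /* smp_wmb() */
            atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
    }

    static bool reader(int cpu)                   /* pull_rt_task() shape */
    {
            if (!atomic_load_explicit(&rto_count, memory_order_relaxed))
                    return false;
            atomic_thread_fence(memory_order_acquire);       /* smp_rmb() */
            /* if the count was seen, the mask bit set before it is seen too */
            return atomic_load_explicit(&rto_mask, memory_order_relaxed)
                    & (1UL << cpu);
    }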
index b3c5653e1dca6ee3a965d83022e8e9b1905465b3..ffc708717b70d059daf1ad3d6f54a307b0e16eb9 100644
@@ -6,6 +6,7 @@
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
 #include <linux/tick.h>
+#include <linux/slab.h>
 
 #include "cpupri.h"
 #include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
        unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
        u64 age_stamp;
        u64 idle_stamp;
        u64 avg_idle;
+
+       /* This is used to determine avg_idle's max value */
+       u64 max_idle_balance_cost;
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
        return rq->clock_task;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *, struct task_struct *);
+#endif /* CONFIG_NUMA_BALANCING */
+
 #ifdef CONFIG_SMP
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
        return hsd;
 }
 
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd) {
+               if (sd->flags & flag)
+                       break;
+       }
+
+       return sd;
+}
+
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 
 struct sched_group_power {
        atomic_t ref;
@@ -605,6 +632,7 @@ struct sched_group_power {
         */
        unsigned int power, power_orig;
        unsigned long next_update;
+       int imbalance; /* XXX unrelated to power but shared group state */
        /*
         * Number of busy cpus in this group.
         */
@@ -719,6 +747,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
         */
        smp_wmb();
        task_thread_info(p)->cpu = cpu;
+       p->wake_cpu = cpu;
 #endif
 }
 
@@ -974,7 +1003,7 @@ struct sched_class {
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-       int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+       int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
        void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1249,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 }
 
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+       if (l1 > l2)
+               swap(l1, l2);
+
+       spin_lock(l1);
+       spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+       if (l1 > l2)
+               swap(l1, l2);
+
+       raw_spin_lock(l1);
+       raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
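
Ordering the two locks by address gives every caller the same global acquisition order, the classic ABBA-deadlock avoidance. A sketch of a hypothetical caller (rq1/rq2 are placeholders, not from this hunk):

        /* sketch: regardless of argument order, the lower address is taken
         * first, so concurrent callers locking (a, b) and (b, a) acquire in
         * the same global order and cannot ABBA-deadlock */
        double_raw_lock(&rq1->lock, &rq2->lock);
        /* ... e.g. move or swap state between the two runqueues ... */
        raw_spin_unlock(&rq2->lock);
        raw_spin_unlock(&rq1->lock);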
+
 /*
  * double_rq_lock - safely lock two runqueues
  *
index c7edee71bce8915f67a3b4427b7f305696834afb..4ab7043396569f201cd6dde0a997733a71a4bb7a 100644
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
  * from dequeue_task() to account for possible rq->clock skew across cpus. The
  * delta taken on each cpu would annul the skew.
  */
-static inline void sched_info_dequeued(struct task_struct *t)
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
 {
-       unsigned long long now = rq_clock(task_rq(t)), delta = 0;
+       unsigned long long now = rq_clock(rq), delta = 0;
 
        if (unlikely(sched_info_on()))
                if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
        sched_info_reset_dequeued(t);
        t->sched_info.run_delay += delta;
 
-       rq_sched_info_dequeued(task_rq(t), delta);
+       rq_sched_info_dequeued(rq, delta);
 }
 
 /*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
  * long it was waiting to run.  We also note when it began so that we
  * can keep stats on how long its timeslice is.
  */
-static void sched_info_arrive(struct task_struct *t)
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 {
-       unsigned long long now = rq_clock(task_rq(t)), delta = 0;
+       unsigned long long now = rq_clock(rq), delta = 0;
 
        if (t->sched_info.last_queued)
                delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
        t->sched_info.last_arrival = now;
        t->sched_info.pcount++;
 
-       rq_sched_info_arrive(task_rq(t), delta);
+       rq_sched_info_arrive(rq, delta);
 }
 
 /*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
  * the timestamp if it is not already set.  It's assumed that
  * sched_info_dequeued() will clear that stamp when appropriate.
  */
-static inline void sched_info_queued(struct task_struct *t)
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
 {
        if (unlikely(sched_info_on()))
                if (!t->sched_info.last_queued)
-                       t->sched_info.last_queued = rq_clock(task_rq(t));
+                       t->sched_info.last_queued = rq_clock(rq);
 }
 
 /*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
  * sched_info_queued() to mark that it has now again started waiting on
  * the runqueue.
  */
-static inline void sched_info_depart(struct task_struct *t)
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
 {
-       unsigned long long delta = rq_clock(task_rq(t)) -
+       unsigned long long delta = rq_clock(rq) -
                                        t->sched_info.last_arrival;
 
-       rq_sched_info_depart(task_rq(t), delta);
+       rq_sched_info_depart(rq, delta);
 
        if (t->state == TASK_RUNNING)
-               sched_info_queued(t);
+               sched_info_queued(rq, t);
 }
 
 /*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq,
+                   struct task_struct *prev, struct task_struct *next)
 {
-       struct rq *rq = task_rq(prev);
-
        /*
         * prev now departs the cpu.  It's not interesting to record
         * stats about how efficient we were at scheduling the idle
         * process, however.
         */
        if (prev != rq->idle)
-               sched_info_depart(prev);
+               sched_info_depart(rq, prev);
 
        if (next != rq->idle)
-               sched_info_arrive(next);
+               sched_info_arrive(rq, next);
 }
 static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq,
+                 struct task_struct *prev, struct task_struct *next)
 {
        if (unlikely(sched_info_on()))
-               __sched_info_switch(prev, next);
+               __sched_info_switch(rq, prev, next);
 }
 #else
-#define sched_info_queued(t)                   do { } while (0)
+#define sched_info_queued(rq, t)               do { } while (0)
 #define sched_info_reset_dequeued(t)   do { } while (0)
-#define sched_info_dequeued(t)                 do { } while (0)
-#define sched_info_switch(t, next)             do { } while (0)
+#define sched_info_dequeued(rq, t)             do { } while (0)
+#define sched_info_depart(rq, t)               do { } while (0)
+#define sched_info_arrive(rq, next)            do { } while (0)
+#define sched_info_switch(rq, t, next)         do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
 /*
index e08fbeeb54b9ffecf77bc8cdfc5b5ad7b63455f7..47197de8abd99af5dd7d734166721a5e2deb2ae8 100644
@@ -11,7 +11,7 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
        return task_cpu(p); /* stop tasks never migrate */
 }
index 0564571dcdf726fab6832bf91417dfe2eff372fb..f5768b0c816a6baf02148c535b33e40b373f72fb 100644
@@ -524,6 +524,11 @@ void __init setup_nr_cpu_ids(void)
        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
 }
 
+void __weak smp_announce(void)
+{
+       printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
+}
+
 /* Called by boot processor to activate the rest. */
 void __init smp_init(void)
 {
@@ -540,7 +545,7 @@ void __init smp_init(void)
        }
 
        /* Any cleanup work */
-       printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+       smp_announce();
        smp_cpus_done(setup_max_cpus);
 }
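
Since smp_announce() is declared __weak, an architecture can supply its own version and have it take precedence over the generic one-liner. A hypothetical override (a sketch, not from this merge):

        /* sketch: an arch that wants to report nodes as well as CPUs */
        void smp_announce(void)
        {
                printk(KERN_INFO "Brought up %d nodes, %d CPUs\n",
                       num_online_nodes(), num_online_cpus());
        }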
 
index d7d498d8cc4f8185954a220c66071cbc5761b917..b24988353458c8cad2fff6aed7853f8625f1cf36 100644
@@ -29,7 +29,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
-#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 
        raw_local_irq_save(flags);
        /*
-        * The preempt tracer hooks into add_preempt_count and will break
+        * The preempt tracer hooks into preempt_count_add and will break
         * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
         * is set and before current->softirq_enabled is cleared.
         * We must manually increment preempt_count here and manually
         * call the trace_preempt_off later.
         */
-       preempt_count() += cnt;
+       __preempt_count_add(cnt);
        /*
         * Were softirqs turned off above:
         */
@@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 #else /* !CONFIG_TRACE_IRQFLAGS */
 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
 {
-       add_preempt_count(cnt);
+       preempt_count_add(cnt);
        barrier();
 }
 #endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable);
 
 static void __local_bh_enable(unsigned int cnt)
 {
-       WARN_ON_ONCE(in_irq());
        WARN_ON_ONCE(!irqs_disabled());
 
        if (softirq_count() == cnt)
                trace_softirqs_on(_RET_IP_);
-       sub_preempt_count(cnt);
+       preempt_count_sub(cnt);
 }
 
 /*
@@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
  */
 void _local_bh_enable(void)
 {
+       WARN_ON_ONCE(in_irq());
        __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
 }
 
@@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
         * Keep preemption disabled until we are done with
         * softirq processing:
         */
-       sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
+       preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
 
-       if (unlikely(!in_interrupt() && local_softirq_pending()))
+       if (unlikely(!in_interrupt() && local_softirq_pending())) {
+               /*
+                * Run any pending softirqs, and do it on a separate stack,
+                * as we may already be deep in a task call stack here.
+                */
                do_softirq();
+       }
 
-       dec_preempt_count();
+       preempt_count_dec();
 #ifdef CONFIG_TRACE_IRQFLAGS
        local_irq_enable();
 #endif
@@ -256,7 +260,7 @@ restart:
                                       " exited with %08x?\n", vec_nr,
                                       softirq_to_name[vec_nr], h->action,
                                       prev_count, preempt_count());
-                               preempt_count() = prev_count;
+                               preempt_count_set(prev_count);
                        }
 
                        rcu_bh_qs(cpu);
@@ -280,10 +284,11 @@ restart:
 
        account_irq_exit_time(current);
        __local_bh_enable(SOFTIRQ_OFFSET);
+       WARN_ON_ONCE(in_interrupt());
        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
 
-#ifndef __ARCH_HAS_DO_SOFTIRQ
+
 asmlinkage void do_softirq(void)
 {
@@ -298,13 +303,11 @@ asmlinkage void do_softirq(void)
        pending = local_softirq_pending();
 
        if (pending)
-               __do_softirq();
+               do_softirq_own_stack();
 
        local_irq_restore(flags);
 }
 
-#endif
-
 /*
  * Enter an interrupt context.
  */
@@ -329,15 +332,21 @@ void irq_enter(void)
 static inline void invoke_softirq(void)
 {
        if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
                /*
                 * We can safely execute softirq on the current stack if
                 * it is the irq stack, because it should be near empty
-                * at this stage. But we have no way to know if the arch
-                * calls irq_exit() on the irq stack. So call softirq
-                * in its own stack to prevent from any overrun on top
-                * of a potentially deep task stack.
+                * at this stage.
                 */
-               do_softirq();
+               __do_softirq();
+#else
+               /*
+                * Otherwise, irq_exit() is called on a task stack that may
+                * already be deep, so run softirqs on their own stack to
+                * prevent any overrun.
+                */
+               do_softirq_own_stack();
+#endif
        } else {
                wakeup_softirqd();
        }
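
do_softirq_own_stack() is the new per-architecture hook here. For an architecture without a dedicated softirq stack, a plausible fallback is a plain inline wrapper (a sketch; the actual definition lives outside this hunk):

        /* sketch: no separate softirq stack -- run inline; architectures
         * with irq stacks switch to that stack here instead */
        static inline void do_softirq_own_stack(void)
        {
                __do_softirq();
        }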
@@ -369,7 +378,7 @@ void irq_exit(void)
 
        account_irq_exit_time(current);
        trace_hardirq_exit();
-       sub_preempt_count(HARDIRQ_OFFSET);
+       preempt_count_sub(HARDIRQ_OFFSET);
        if (!in_interrupt() && local_softirq_pending())
                invoke_softirq();
 
@@ -771,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu)
 {
        local_irq_disable();
        if (local_softirq_pending()) {
+               /*
+                * We can safely run softirqs on the current stack here, as
+                * we are not deep in the task stack.
+                */
                __do_softirq();
                rcu_note_context_switch(cpu);
                local_irq_enable();
index c09f2955ae3055b42f1edde601ee1eb431bfc18a..c530bc5be7cfa9e6be364f83848369d204d6d9f8 100644
@@ -115,6 +115,182 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
        return done.executed ? done.ret : -ENOENT;
 }
 
+/* This controls the threads on each CPU. */
+enum multi_stop_state {
+       /* Dummy starting state for thread. */
+       MULTI_STOP_NONE,
+       /* Awaiting everyone to be scheduled. */
+       MULTI_STOP_PREPARE,
+       /* Disable interrupts. */
+       MULTI_STOP_DISABLE_IRQ,
+       /* Run the function */
+       MULTI_STOP_RUN,
+       /* Exit */
+       MULTI_STOP_EXIT,
+};
+
+struct multi_stop_data {
+       int                     (*fn)(void *);
+       void                    *data;
+       /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+       unsigned int            num_threads;
+       const struct cpumask    *active_cpus;
+
+       enum multi_stop_state   state;
+       atomic_t                thread_ack;
+};
+
+static void set_state(struct multi_stop_data *msdata,
+                     enum multi_stop_state newstate)
+{
+       /* Reset ack counter. */
+       atomic_set(&msdata->thread_ack, msdata->num_threads);
+       smp_wmb();
+       msdata->state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(struct multi_stop_data *msdata)
+{
+       if (atomic_dec_and_test(&msdata->thread_ack))
+               set_state(msdata, msdata->state + 1);
+}
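
With num_threads = 2 the handshake above walks both CPUs through the states in lock-step; each state advances only after every thread has acked it. An illustrative trace (not code from the patch):

        /*
         *   state                    cpu A                 cpu B
         *   MULTI_STOP_PREPARE       ack (2 -> 1)          ack (1 -> 0), advance
         *   MULTI_STOP_DISABLE_IRQ   irqs off, ack         irqs off, ack, advance
         *   MULTI_STOP_RUN           fn() if active, ack   ack, advance
         *   MULTI_STOP_EXIT          loop exits            loop exits
         */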
+
+/* This is the cpu_stop function which stops the CPU. */
+static int multi_cpu_stop(void *data)
+{
+       struct multi_stop_data *msdata = data;
+       enum multi_stop_state curstate = MULTI_STOP_NONE;
+       int cpu = smp_processor_id(), err = 0;
+       unsigned long flags;
+       bool is_active;
+
+       /*
+        * When called from stop_machine_from_inactive_cpu(), irq might
+        * already be disabled.  Save the state and restore it on exit.
+        */
+       local_save_flags(flags);
+
+       if (!msdata->active_cpus)
+               is_active = cpu == cpumask_first(cpu_online_mask);
+       else
+               is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+
+       /* Simple state machine */
+       do {
+               /* Chill out and ensure we re-read multi_stop_state. */
+               cpu_relax();
+               if (msdata->state != curstate) {
+                       curstate = msdata->state;
+                       switch (curstate) {
+                       case MULTI_STOP_DISABLE_IRQ:
+                               local_irq_disable();
+                               hard_irq_disable();
+                               break;
+                       case MULTI_STOP_RUN:
+                               if (is_active)
+                                       err = msdata->fn(msdata->data);
+                               break;
+                       default:
+                               break;
+                       }
+                       ack_state(msdata);
+               }
+       } while (curstate != MULTI_STOP_EXIT);
+
+       local_irq_restore(flags);
+       return err;
+}
+
+struct irq_cpu_stop_queue_work_info {
+       int cpu1;
+       int cpu2;
+       struct cpu_stop_work *work1;
+       struct cpu_stop_work *work2;
+};
+
+/*
+ * This function is always run with irqs and preemption disabled.
+ * This guarantees that both work1 and work2 get queued before our
+ * local migrate thread gets the chance to preempt us.
+ */
+static void irq_cpu_stop_queue_work(void *arg)
+{
+       struct irq_cpu_stop_queue_work_info *info = arg;
+       cpu_stop_queue_work(info->cpu1, info->work1);
+       cpu_stop_queue_work(info->cpu2, info->work2);
+}
+
+/**
+ * stop_two_cpus - stops two cpus
+ * @cpu1: the cpu to stop
+ * @cpu2: the other cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Stops both specified CPUs and runs @fn on one of them (@cpu1).
+ *
+ * Returns when both have completed.
+ */
+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
+{
+       struct cpu_stop_done done;
+       struct cpu_stop_work work1, work2;
+       struct irq_cpu_stop_queue_work_info call_args;
+       struct multi_stop_data msdata;
+
+       preempt_disable();
+       msdata = (struct multi_stop_data){
+               .fn = fn,
+               .data = arg,
+               .num_threads = 2,
+               .active_cpus = cpumask_of(cpu1),
+       };
+
+       work1 = work2 = (struct cpu_stop_work){
+               .fn = multi_cpu_stop,
+               .arg = &msdata,
+               .done = &done
+       };
+
+       call_args = (struct irq_cpu_stop_queue_work_info){
+               .cpu1 = cpu1,
+               .cpu2 = cpu2,
+               .work1 = &work1,
+               .work2 = &work2,
+       };
+
+       cpu_stop_init_done(&done, 2);
+       set_state(&msdata, MULTI_STOP_PREPARE);
+
+       /*
+        * If we observe both CPUs active we know _cpu_down() cannot yet have
+        * queued its stop_machine works and therefore ours will get executed
+        * first. Or it's not either one of our CPUs that's getting unplugged,
+        * in which case we don't care.
+        *
+        * This relies on the stopper workqueues being FIFO.
+        */
+       if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+               preempt_enable();
+               return -ENOENT;
+       }
+
+       /*
+        * Queuing needs to be done by the lowest numbered CPU, to ensure
+        * that works are always queued in the same order on every CPU.
+        * This prevents deadlocks.
+        */
+       smp_call_function_single(min(cpu1, cpu2),
+                                &irq_cpu_stop_queue_work,
+                                &call_args, 0);
+       preempt_enable();
+
+       wait_for_completion(&done.completion);
+
+       return done.executed ? done.ret : -ENOENT;
+}
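
An illustrative trace of why one CPU must queue both works (a sketch, not code from the patch):

        /* If CPU0 queued the works for (0,1) while CPU1 concurrently queued
         * the works for (1,0), the two FIFO stopper threads could each run
         * their own first work and wait forever for the partner's ack:
         *
         *   stopper0: multi_cpu_stop for (0,1) ... stuck in MULTI_STOP_PREPARE
         *   stopper1: multi_cpu_stop for (1,0) ... stuck in MULTI_STOP_PREPARE
         *
         * Funnelling both cpu_stop_queue_work() calls through min(cpu1, cpu2)
         * makes every pair land in the same order on both queues. */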
+
 /**
  * stop_one_cpu_nowait - stop a cpu but don't wait for completion
  * @cpu: cpu to stop
@@ -359,98 +535,14 @@ early_initcall(cpu_stop_init);
 
 #ifdef CONFIG_STOP_MACHINE
 
-/* This controls the threads on each CPU. */
-enum stopmachine_state {
-       /* Dummy starting state for thread. */
-       STOPMACHINE_NONE,
-       /* Awaiting everyone to be scheduled. */
-       STOPMACHINE_PREPARE,
-       /* Disable interrupts. */
-       STOPMACHINE_DISABLE_IRQ,
-       /* Run the function */
-       STOPMACHINE_RUN,
-       /* Exit */
-       STOPMACHINE_EXIT,
-};
-
-struct stop_machine_data {
-       int                     (*fn)(void *);
-       void                    *data;
-       /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-       unsigned int            num_threads;
-       const struct cpumask    *active_cpus;
-
-       enum stopmachine_state  state;
-       atomic_t                thread_ack;
-};
-
-static void set_state(struct stop_machine_data *smdata,
-                     enum stopmachine_state newstate)
-{
-       /* Reset ack counter. */
-       atomic_set(&smdata->thread_ack, smdata->num_threads);
-       smp_wmb();
-       smdata->state = newstate;
-}
-
-/* Last one to ack a state moves to the next state. */
-static void ack_state(struct stop_machine_data *smdata)
-{
-       if (atomic_dec_and_test(&smdata->thread_ack))
-               set_state(smdata, smdata->state + 1);
-}
-
-/* This is the cpu_stop function which stops the CPU. */
-static int stop_machine_cpu_stop(void *data)
-{
-       struct stop_machine_data *smdata = data;
-       enum stopmachine_state curstate = STOPMACHINE_NONE;
-       int cpu = smp_processor_id(), err = 0;
-       unsigned long flags;
-       bool is_active;
-
-       /*
-        * When called from stop_machine_from_inactive_cpu(), irq might
-        * already be disabled.  Save the state and restore it on exit.
-        */
-       local_save_flags(flags);
-
-       if (!smdata->active_cpus)
-               is_active = cpu == cpumask_first(cpu_online_mask);
-       else
-               is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
-
-       /* Simple state machine */
-       do {
-               /* Chill out and ensure we re-read stopmachine_state. */
-               cpu_relax();
-               if (smdata->state != curstate) {
-                       curstate = smdata->state;
-                       switch (curstate) {
-                       case STOPMACHINE_DISABLE_IRQ:
-                               local_irq_disable();
-                               hard_irq_disable();
-                               break;
-                       case STOPMACHINE_RUN:
-                               if (is_active)
-                                       err = smdata->fn(smdata->data);
-                               break;
-                       default:
-                               break;
-                       }
-                       ack_state(smdata);
-               }
-       } while (curstate != STOPMACHINE_EXIT);
-
-       local_irq_restore(flags);
-       return err;
-}
-
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
-       struct stop_machine_data smdata = { .fn = fn, .data = data,
-                                           .num_threads = num_online_cpus(),
-                                           .active_cpus = cpus };
+       struct multi_stop_data msdata = {
+               .fn = fn,
+               .data = data,
+               .num_threads = num_online_cpus(),
+               .active_cpus = cpus,
+       };
 
        if (!stop_machine_initialized) {
                /*
@@ -461,7 +553,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
                unsigned long flags;
                int ret;
 
-               WARN_ON_ONCE(smdata.num_threads != 1);
+               WARN_ON_ONCE(msdata.num_threads != 1);
 
                local_irq_save(flags);
                hard_irq_disable();
@@ -472,8 +564,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
        }
 
        /* Set the initial state and stop all online cpus. */
-       set_state(&smdata, STOPMACHINE_PREPARE);
-       return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
+       set_state(&msdata, MULTI_STOP_PREPARE);
+       return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
 }
 
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +605,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
 int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
                                  const struct cpumask *cpus)
 {
-       struct stop_machine_data smdata = { .fn = fn, .data = data,
+       struct multi_stop_data msdata = { .fn = fn, .data = data,
                                            .active_cpus = cpus };
        struct cpu_stop_done done;
        int ret;
 
        /* Local CPU must be inactive and CPU hotplug in progress. */
        BUG_ON(cpu_active(raw_smp_processor_id()));
-       smdata.num_threads = num_active_cpus() + 1;     /* +1 for local */
+       msdata.num_threads = num_active_cpus() + 1;     /* +1 for local */
 
        /* No proper task established and can't sleep - busy wait for lock. */
        while (!mutex_trylock(&stop_cpus_mutex))
                cpu_relax();
 
        /* Schedule work on other CPUs and execute directly for local CPU */
-       set_state(&smdata, STOPMACHINE_PREPARE);
+       set_state(&msdata, MULTI_STOP_PREPARE);
        cpu_stop_init_done(&done, num_active_cpus());
-       queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
+       queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
                             &done);
-       ret = stop_machine_cpu_stop(&smdata);
+       ret = multi_cpu_stop(&msdata);
 
        /* Busy wait for completion. */
        while (!completion_done(&done.completion))
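
The multi_stop_data machinery above steps every participating CPU in lock step: set_state() resets the ack counter before publishing a new state, and the last thread to ack a state advances the whole group to the next one. Below is a minimal userspace model of that ack-and-advance pattern, using pthreads and C11 atomics in place of cpu_stop threads and kernel barriers; all names and the thread setup are illustrative, not kernel API, and every thread runs fn here where the kernel restricts it to the active CPUs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum multi_stop_state { MULTI_STOP_NONE, MULTI_STOP_PREPARE,
                        MULTI_STOP_DISABLE_IRQ, MULTI_STOP_RUN, MULTI_STOP_EXIT };

struct multi_stop_data {
        int (*fn)(void *);
        void *data;
        unsigned int num_threads;
        _Atomic int state;
        atomic_uint thread_ack;
};

/* Reset the ack counter, then publish the new state. */
static void set_state(struct multi_stop_data *msdata, int newstate)
{
        atomic_store(&msdata->thread_ack, msdata->num_threads);
        atomic_store(&msdata->state, newstate);
}

/* The last thread to ack a state moves everyone to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
        if (atomic_fetch_sub(&msdata->thread_ack, 1) == 1)
                set_state(msdata, atomic_load(&msdata->state) + 1);
}

static void *stopper_thread(void *arg)
{
        struct multi_stop_data *msdata = arg;
        int curstate = MULTI_STOP_NONE;

        do {
                int state = atomic_load(&msdata->state);
                if (state != curstate) {
                        curstate = state;
                        if (curstate == MULTI_STOP_RUN)
                                msdata->fn(msdata->data);
                        ack_state(msdata);
                }
        } while (curstate != MULTI_STOP_EXIT);
        return NULL;
}

static int hello(void *data) { puts("all threads reached MULTI_STOP_RUN"); return 0; }

int main(void)
{
        enum { NTHREADS = 4 };
        pthread_t tid[NTHREADS];
        struct multi_stop_data msdata = { .fn = hello, .num_threads = NTHREADS };

        set_state(&msdata, MULTI_STOP_PREPARE);
        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, stopper_thread, &msdata);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}

Because the counter only reaches zero after all num_threads threads have acked, no thread can miss a transition; advancing past MULTI_STOP_EXIT on the final ack is harmless, since everyone has already seen EXIT by then.
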
index b2f06f3c6a3ff32ec9f8f2b92bea0c4766ac6f7d..5c704dbf6d45c2196a460d640a508a643cb7f03a 100644 (file)
@@ -370,13 +370,6 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
-       {
-               .procname       = "numa_balancing_scan_period_reset",
-               .data           = &sysctl_numa_balancing_scan_period_reset,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
        {
                .procname       = "numa_balancing_scan_period_max_ms",
                .data           = &sysctl_numa_balancing_scan_period_max,
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "numa_balancing_settle_count",
+               .data           = &sysctl_numa_balancing_settle_count,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "numa_balancing_migrate_deferred",
+               .data           = &sysctl_numa_balancing_migrate_deferred,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
        {
@@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = {
        {
                .procname       = "hung_task_check_count",
                .data           = &sysctl_hung_task_check_count,
-               .maxlen         = sizeof(unsigned long),
+               .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
        },
        {
                .procname       = "hung_task_timeout_secs",
@@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = {
                .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                .mode           = 0644,
                .proc_handler   = perf_proc_update_handler,
+               .extra1         = &one,
        },
        {
                .procname       = "perf_cpu_time_max_percent",
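
Each entry above is a struct ctl_table describing one file under /proc/sys/kernel, and the hung_task_check_count hunk also switches to the minmax handler, with extra1 acting as a lower clamp (the perf entry gains &one for the same reason). A hedged, module-style sketch of the same pattern as this kernel era spells it, with every demo_* name hypothetical:

#include <linux/module.h>
#include <linux/sysctl.h>

static int demo_value = 10;
static int demo_min;            /* lower clamp, like &zero in the table above */
static int demo_max = 100;      /* upper clamp via extra2 */

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_value",
                .data           = &demo_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &demo_min,
                .extra2         = &demo_max,
        },
        { }
};

static struct ctl_table demo_dir[] = {
        { .procname = "demo", .mode = 0555, .child = demo_table },
        { }
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
        demo_header = register_sysctl_table(demo_dir);
        return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
        unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The maxlen/handler pair must match the C type behind .data; the hung_task change pairs sizeof(int) with proc_dointvec_minmax, presumably alongside a matching type change of the variable itself.
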
index 2b62fe86f9eccf828846511dbfca8f20a4109321..3ce6e8c5f3fca86b436b288194a661cfe7b80829 100644 (file)
@@ -100,7 +100,7 @@ config NO_HZ_FULL
        # RCU_USER_QS dependency
        depends on HAVE_CONTEXT_TRACKING
        # VIRT_CPU_ACCOUNTING_GEN dependency
-       depends on 64BIT
+       depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
        select NO_HZ_COMMON
        select RCU_USER_QS
        select RCU_NOCB_CPU
index 50a8736757f3b470c5d0d393be03cfbf05f5df03..c9317e14aae6cfab03d1c9566a9de3b5cc78a0fa 100644 (file)
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
 static inline int __clocksource_watchdog_kthread(void) { return 0; }
 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
+void clocksource_mark_unstable(struct clocksource *cs) { }
 
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
 }
 
 /**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs:         Pointer to clocksource
- *
+ * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
+ * @mult:      cycle to nanosecond multiplier
+ * @shift:     cycle to nanosecond divisor (power of two)
+ * @maxadj:    maximum adjustment value to mult (~11%)
+ * @mask:      bitmask for two's complement subtraction of non 64 bit counters
  */
-static u64 clocksource_max_deferment(struct clocksource *cs)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
 {
        u64 max_nsecs, max_cycles;
 
        /*
         * Calculate the maximum number of cycles that we can pass to the
         * cyc2ns function without overflowing a 64-bit signed result. The
-        * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+        * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
         * which is equivalent to the below.
-        * max_cycles < (2^63)/(cs->mult + cs->maxadj)
-        * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
-        * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
-        * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
-        * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
+        * max_cycles < (2^63)/(mult + maxadj)
+        * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
+        * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
+        * max_cycles < 2^(63 - log2(mult + maxadj))
+        * max_cycles < 1 << (63 - log2(mult + maxadj))
         * Please note that we add 1 to the result of the log2 to account for
         * any rounding errors, ensure the above inequality is satisfied and
         * no overflow will occur.
         */
-       max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
+       max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
 
        /*
         * The actual maximum number of cycles we can defer the clocksource is
-        * determined by the minimum of max_cycles and cs->mask.
+        * determined by the minimum of max_cycles and mask.
         * Note: Here we subtract the maxadj to make sure we don't sleep for
         * too long if there's a large negative adjustment.
         */
-       max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
-       max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
-                                       cs->shift);
+       max_cycles = min(max_cycles, mask);
+       max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+
+       return max_nsecs;
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+       u64 max_nsecs;
 
+       max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
+                                         cs->mask);
        /*
         * To ensure that the clocksource does not wrap whilst we are idle,
         * limit the time the clocksource can be deferred by 12.5%. Please
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
 {
-       size_t ret;
+       ssize_t ret;
 
        mutex_lock(&clocksource_mutex);
 
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
 {
        struct clocksource *cs;
        char name[CS_NAME_LEN];
-       size_t ret;
+       ssize_t ret;
 
        ret = sysfs_get_uname(buf, name, count);
        if (ret < 0)
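
The refactor above extracts the wrap math so sched_clock can share it: find the largest cycle count whose conversion cannot overflow a signed 64-bit nanosecond result, clamp it to the counter mask, then convert with the most pessimistic (smallest) multiplier. A standalone sketch of the same bound, with ilog2() emulated via a GCC builtin and the sample mult/shift/maxadj values chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

static unsigned int ilog2_u32(uint32_t v)
{
        return 31 - __builtin_clz(v);
}

static uint64_t clocks_calc_max_nsecs(uint32_t mult, uint32_t shift,
                                      uint32_t maxadj, uint64_t mask)
{
        /* max_cycles < 1 << (63 - log2(mult + maxadj)); the +1 inside
           absorbs rounding in the integer log2, as the comment above says. */
        uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

        if (max_cycles > mask)          /* can't defer past the counter width */
                max_cycles = mask;

        /* Convert with the smallest plausible multiplier. */
        return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
        /* A 32-bit counter with mult/shift giving ~1 ns per cycle; maxadj is
           12.5% here for round numbers (the kernel's headroom is ~11%). */
        uint32_t mult = 1 << 24, shift = 24, maxadj = mult / 8;
        uint64_t mask = 0xffffffffULL;

        printf("max deferment: %llu ns\n",
               (unsigned long long)clocks_calc_max_nsecs(mult, shift, maxadj, mask));
        return 0;
}

With these numbers the bound works out to about 3.76 seconds of safe deferment before the 32-bit counter risks wrapping.
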
index bb2215174f0577332cc8924eacb440703d1414d0..af8d1d4f3d55156eaae7936b1a34d8cd7141248f 100644 (file)
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
         * called as close as possible to 500 ms before the new second starts.
         * This code is run on a timer.  If the clock is set, that timer
         * may not expire at the correct time.  Thus, we adjust...
+        * We want the clock to be within a couple of ticks of the target.
         */
        if (!ntp_synced()) {
                /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
        }
 
        getnstimeofday(&now);
-       if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+       if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
                struct timespec adjust = now;
 
                fail = -ENODEV;
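
The one-line change above widens the CMOS-sync firing window from half a tick to five ticks around the 500 ms target, matching the new comment. A tiny userspace sketch of the window test; HZ, the labs() framing and the constants are assumptions for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L
#define HZ 100
#define TICK_NSEC (NSEC_PER_SEC / HZ)

/* Fire only when tv_nsec is within 5 ticks of the 500 ms mark. */
static int in_sync_window(const struct timespec *now)
{
        return labs(now->tv_nsec - (NSEC_PER_SEC / 2)) <= TICK_NSEC * 5;
}

int main(void)
{
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);
        printf("in window: %d\n", in_sync_window(&now));
        return 0;
}
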
index 0b479a6a22bb8fe30e2b9d6c76c2ddb1d5646ae1..68b79937598106918748832a396848f7fd22f870 100644 (file)
@@ -8,25 +8,28 @@
 #include <linux/clocksource.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
+#include <linux/ktime.h>
 #include <linux/kernel.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/syscore_ops.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
 #include <linux/sched_clock.h>
+#include <linux/seqlock.h>
+#include <linux/bitops.h>
 
 struct clock_data {
+       ktime_t wrap_kt;
        u64 epoch_ns;
-       u32 epoch_cyc;
-       u32 epoch_cyc_copy;
+       u64 epoch_cyc;
+       seqcount_t seq;
        unsigned long rate;
        u32 mult;
        u32 shift;
        bool suspended;
 };
 
-static void sched_clock_poll(unsigned long wrap_ticks);
-static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
+static struct hrtimer sched_clock_timer;
 static int irqtime = -1;
 
 core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
        .mult   = NSEC_PER_SEC / HZ,
 };
 
-static u32 __read_mostly sched_clock_mask = 0xffffffff;
+static u64 __read_mostly sched_clock_mask;
 
-static u32 notrace jiffy_sched_clock_read(void)
+static u64 notrace jiffy_sched_clock_read(void)
 {
-       return (u32)(jiffies - INITIAL_JIFFIES);
+       /*
+        * We don't need to use get_jiffies_64 on 32-bit arches here
+        * because we register with BITS_PER_LONG
+        */
+       return (u64)(jiffies - INITIAL_JIFFIES);
 }
 
-static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static u32 __read_mostly (*read_sched_clock_32)(void);
+
+static u64 notrace read_sched_clock_32_wrapper(void)
+{
+       return read_sched_clock_32();
+}
+
+static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
 
 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
        return (cyc * mult) >> shift;
 }
 
-static unsigned long long notrace sched_clock_32(void)
+unsigned long long notrace sched_clock(void)
 {
        u64 epoch_ns;
-       u32 epoch_cyc;
-       u32 cyc;
+       u64 epoch_cyc;
+       u64 cyc;
+       unsigned long seq;
 
        if (cd.suspended)
                return cd.epoch_ns;
 
-       /*
-        * Load the epoch_cyc and epoch_ns atomically.  We do this by
-        * ensuring that we always write epoch_cyc, epoch_ns and
-        * epoch_cyc_copy in strict order, and read them in strict order.
-        * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
-        * the middle of an update, and we should repeat the load.
-        */
        do {
+               seq = read_seqcount_begin(&cd.seq);
                epoch_cyc = cd.epoch_cyc;
-               smp_rmb();
                epoch_ns = cd.epoch_ns;
-               smp_rmb();
-       } while (epoch_cyc != cd.epoch_cyc_copy);
+       } while (read_seqcount_retry(&cd.seq, seq));
 
        cyc = read_sched_clock();
        cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
 static void notrace update_sched_clock(void)
 {
        unsigned long flags;
-       u32 cyc;
+       u64 cyc;
        u64 ns;
 
        cyc = read_sched_clock();
        ns = cd.epoch_ns +
                cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
                          cd.mult, cd.shift);
-       /*
-        * Write epoch_cyc and epoch_ns in a way that the update is
-        * detectable in cyc_to_fixed_sched_clock().
-        */
+
        raw_local_irq_save(flags);
-       cd.epoch_cyc_copy = cyc;
-       smp_wmb();
+       write_seqcount_begin(&cd.seq);
        cd.epoch_ns = ns;
-       smp_wmb();
        cd.epoch_cyc = cyc;
+       write_seqcount_end(&cd.seq);
        raw_local_irq_restore(flags);
 }
 
-static void sched_clock_poll(unsigned long wrap_ticks)
+static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
-       mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
        update_sched_clock();
+       hrtimer_forward_now(hrt, cd.wrap_kt);
+       return HRTIMER_RESTART;
 }
 
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
+void __init sched_clock_register(u64 (*read)(void), int bits,
+                                unsigned long rate)
 {
-       unsigned long r, w;
+       unsigned long r;
        u64 res, wrap;
        char r_unit;
 
        if (cd.rate > rate)
                return;
 
-       BUG_ON(bits > 32);
        WARN_ON(!irqs_disabled());
        read_sched_clock = read;
-       sched_clock_mask = (1ULL << bits) - 1;
+       sched_clock_mask = CLOCKSOURCE_MASK(bits);
        cd.rate = rate;
 
        /* calculate the mult/shift to convert counter ticks to ns. */
-       clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
+       clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
 
        r = rate;
        if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
                r_unit = ' ';
 
        /* calculate how many ns until we wrap */
-       wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
-       do_div(wrap, NSEC_PER_MSEC);
-       w = wrap;
+       wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
+       cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
 
        /* calculate the ns resolution of this counter */
        res = cyc_to_ns(1ULL, cd.mult, cd.shift);
-       pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
-               bits, r, r_unit, res, w);
+       pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
+               bits, r, r_unit, res, wrap);
 
-       /*
-        * Start the timer to keep sched_clock() properly updated and
-        * sets the initial epoch.
-        */
-       sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
        update_sched_clock();
 
        /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
        pr_debug("Registered %pF as sched_clock source\n", read);
 }
 
-unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
-
-unsigned long long notrace sched_clock(void)
+void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 {
-       return sched_clock_func();
+       read_sched_clock_32 = read;
+       sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
 }
 
 void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
         * make it the final one.
         */
        if (read_sched_clock == jiffy_sched_clock_read)
-               setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
+               sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
 
-       sched_clock_poll(sched_clock_timer.data);
+       update_sched_clock();
+
+       /*
+        * Start the timer to keep sched_clock() properly updated and
+        * sets the initial epoch.
+        */
+       hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       sched_clock_timer.function = sched_clock_poll;
+       hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 }
 
 static int sched_clock_suspend(void)
 {
-       sched_clock_poll(sched_clock_timer.data);
+       sched_clock_poll(&sched_clock_timer);
        cd.suspended = true;
        return 0;
 }
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
 static void sched_clock_resume(void)
 {
        cd.epoch_cyc = read_sched_clock();
-       cd.epoch_cyc_copy = cd.epoch_cyc;
        cd.suspended = false;
 }
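
The old code published epoch_cyc twice with explicit barriers and had readers compare the two copies; the new code uses a seqcount instead. A minimal userspace model of that reader/writer protocol using C11 atomics; the kernel's seqcount API handles the barriers and lockdep for you, this sketch only shows the retry logic:

#include <stdatomic.h>
#include <stdint.h>

struct clock_epoch {
        atomic_uint seq;                /* odd while an update is in flight */
        _Atomic uint64_t epoch_ns;
        _Atomic uint64_t epoch_cyc;
};

/* Writer: bump seq to odd, update the fields, bump back to even. */
static void epoch_write(struct clock_epoch *e, uint64_t ns, uint64_t cyc)
{
        atomic_fetch_add(&e->seq, 1);
        atomic_store(&e->epoch_ns, ns);
        atomic_store(&e->epoch_cyc, cyc);
        atomic_fetch_add(&e->seq, 1);
}

/* Reader: retry until a stable, even sequence brackets the snapshot. */
static void epoch_read(struct clock_epoch *e, uint64_t *ns, uint64_t *cyc)
{
        unsigned int seq;

        do {
                while ((seq = atomic_load(&e->seq)) & 1)
                        ;                       /* writer active, spin */
                *ns = atomic_load(&e->epoch_ns);
                *cyc = atomic_load(&e->epoch_cyc);
        } while (atomic_load(&e->seq) != seq);
}

sched_clock() above is exactly this reader, and update_sched_clock() the writer, with raw_local_irq_save() serving as the single-writer guarantee.
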
 
index 218bcb565fed0f6e8f867b9e3e2264812d9797bf..9532690daaa9edff1fedd01066215ee43884a8a4 100644 (file)
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
                                        struct clock_event_device *newdev)
 {
        if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+           (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
            (newdev->features & CLOCK_EVT_FEAT_C3STOP))
                return false;
 
index 0b537f27b5591d5300427fffa4948619c0d0f4d3..1fb08f21302ece707ac7ea3b5210863cae8b19c2 100644 (file)
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
        period = ktime_to_timespec(time);
        ms = period.tv_nsec / 1000000;
 
-       seq_puts(m, "Timer Stats Version: v0.2\n");
+       seq_puts(m, "Timer Stats Version: v0.3\n");
        seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
        if (atomic_read(&overflow_count))
-               seq_printf(m, "Overflow: %d entries\n",
-                       atomic_read(&overflow_count));
+               seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
+       seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
 
        for (i = 0; i < nr_entries; i++) {
                entry = entries + i;
-               if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+               if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
                        seq_printf(m, "%4luD, %5d %-16s ",
                                entry->count, entry->pid, entry->comm);
                } else {
index 4296d13db3d15876f320070caee26c9a6ff06e6d..6582b82fa966d6ba503a7361f8f89f699f4de04d 100644 (file)
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
                          unsigned long data)
 {
-       int preempt_count = preempt_count();
+       int count = preempt_count();
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 
        lock_map_release(&lockdep_map);
 
-       if (preempt_count != preempt_count()) {
+       if (count != preempt_count()) {
                WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
-                         fn, preempt_count, preempt_count());
+                         fn, count, preempt_count());
                /*
                 * Restore the preempt count. That gives us a decent
                 * chance to survive and extract information. If the
                 * callback kept a lock held, bad luck, but not worse
                 * than the BUG() we had.
                 */
-               preempt_count() = preempt_count;
+               preempt_count_set(count);
        }
 }
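
preempt_count() is no longer an lvalue in this series, so the direct assignment is replaced by preempt_count_set() and the shadowing local gets a new name. A userspace stand-in for the call_timer_fn() imbalance check; the_count and the two helpers are stand-ins, not kernel API:

#include <stdio.h>

static int the_count;                  /* stand-in for the per-CPU counter */

static int preempt_count_get(void) { return the_count; }
static void preempt_count_set(int v) { the_count = v; }

static void call_timer_fn(void (*fn)(void))
{
        int count = preempt_count_get();

        fn();

        if (count != preempt_count_get()) {
                fprintf(stderr, "timer callback leaked preempt count: %08x -> %08x\n",
                        count, preempt_count_get());
                preempt_count_set(count);  /* restore and limp along, as the patch does */
        }
}

static void leaky(void) { the_count++; }   /* a callback that forgets to undo */

int main(void)
{
        call_timer_fn(leaky);
        return 0;
}
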
 
index d550920e040c4515c7c865bb6ba2ab884f7ef7bd..de21c6305a44fe913f84bcbf8115d3ae4f1b2335 100644 (file)
@@ -92,6 +92,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+       unsigned long flags;
+
+       if (signal_pending_state(state, current))
+               return -ERESTARTSYS;
+
+       wait->private = current;
+       wait->func = autoremove_wake_function;
+
+       spin_lock_irqsave(&q->lock, flags);
+       if (list_empty(&wait->task_list)) {
+               if (wait->flags & WQ_FLAG_EXCLUSIVE)
+                       __add_wait_queue_tail(q, wait);
+               else
+                       __add_wait_queue(q, wait);
+       }
+       set_current_state(state);
+       spin_unlock_irqrestore(&q->lock, flags);
+
+       return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
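
prepare_to_wait_event() is the helper behind the reworked ___wait_event() macro: it folds the signal check into the queueing step, so a pending signal aborts before the task ever sleeps. A condensed, kernel-style sketch of the loop it serves; my_wait_for() and flag are hypothetical, the real macro generates this shape:

#include <linux/wait.h>

long my_wait_for(wait_queue_head_t *wq, int *flag)
{
        DEFINE_WAIT(wait);
        long ret = 0;

        for (;;) {
                long intr = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);

                if (*flag)              /* condition true: done, signal or not */
                        break;
                if (intr) {             /* -ERESTARTSYS from a pending signal */
                        ret = intr;
                        break;
                }
                schedule();             /* sleep until a waker calls wake_up(wq) */
        }
        finish_wait(wq, &wait);
        return ret;
}
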
+
 /**
  * finish_wait - clean up after waiting in a queue
  * @q: waitqueue waited on
index 6dc09d8f4c248d3765775c7ebf66869c34b4cff4..872a15a2a6374017af2bc0f8a446f86736f9b45e 100644 (file)
@@ -1002,7 +1002,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
         * Some tests (e.g. double-unlock) might corrupt the preemption
         * count, so restore it:
         */
-       preempt_count() = saved_preempt_count;
+       preempt_count_set(saved_preempt_count);
 #ifdef CONFIG_TRACE_IRQFLAGS
        if (softirq_count())
                current->softirqs_enabled = 0;
index 4c0d0e51d49e69e58673f013c83826545246daa0..04abe53f12a18bd639c598eac2045fa866482774 100644 (file)
@@ -9,10 +9,9 @@
 
 notrace unsigned int debug_smp_processor_id(void)
 {
-       unsigned long preempt_count = preempt_count();
        int this_cpu = raw_smp_processor_id();
 
-       if (likely(preempt_count))
+       if (likely(preempt_count()))
                goto out;
 
        if (irqs_disabled())
index 610e3df2768a6a5b2ec1e293da4c96dafbbe2d30..2612f60f53ee5a4a22dabc4c9fb132f7bbc537f3 100644 (file)
@@ -1278,64 +1278,105 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+       struct anon_vma *anon_vma = NULL;
        struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
-       int target_nid;
-       int current_nid = -1;
-       bool migrated;
+       int page_nid = -1, this_nid = numa_node_id();
+       int target_nid, last_cpupid = -1;
+       bool page_locked;
+       bool migrated = false;
+       int flags = 0;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
        page = pmd_page(pmd);
-       get_page(page);
-       current_nid = page_to_nid(page);
+       BUG_ON(is_huge_zero_page(page));
+       page_nid = page_to_nid(page);
+       last_cpupid = page_cpupid_last(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (current_nid == numa_node_id())
+       if (page_nid == this_nid) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+               flags |= TNF_FAULT_LOCAL;
+       }
+
+       /*
+        * Avoid grouping on DSO/COW pages specifically and on RO pages
+        * in general; RO pages shouldn't hurt as much anyway, since
+        * they can be in a shared cache state.
+        */
+       if (!pmd_write(pmd))
+               flags |= TNF_NO_GROUP;
 
+       /*
+        * Acquire the page lock to serialise THP migrations but avoid dropping
+        * page_table_lock if at all possible
+        */
+       page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
        if (target_nid == -1) {
-               put_page(page);
-               goto clear_pmdnuma;
+               /* If the page was locked, there are no parallel migrations */
+               if (page_locked)
+                       goto clear_pmdnuma;
+
+               /*
+                * Otherwise wait for potential migrations and retry. We do
+                * relock and check_same as the page may no longer be mapped.
+                * As the fault is being retried, do not account for it.
+                */
+               spin_unlock(&mm->page_table_lock);
+               wait_on_page_locked(page);
+               page_nid = -1;
+               goto out;
        }
 
-       /* Acquire the page lock to serialise THP migrations */
+       /* Page is misplaced, serialise migrations and parallel THP splits */
+       get_page(page);
        spin_unlock(&mm->page_table_lock);
-       lock_page(page);
+       if (!page_locked)
+               lock_page(page);
+       anon_vma = page_lock_anon_vma_read(page);
 
-       /* Confirm the PTE did not while locked */
+       /* Confirm the PMD did not change while page_table_lock was released */
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp))) {
                unlock_page(page);
                put_page(page);
+               page_nid = -1;
                goto out_unlock;
        }
-       spin_unlock(&mm->page_table_lock);
 
-       /* Migrate the THP to the requested node */
+       /*
+        * Migrate the THP to the requested node; the call returns with the
+        * page unlocked and pmd_numa cleared.
+        */
+       spin_unlock(&mm->page_table_lock);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
                                pmdp, pmd, addr, page, target_nid);
-       if (!migrated)
-               goto check_same;
-
-       task_numa_fault(target_nid, HPAGE_PMD_NR, true);
-       return 0;
+       if (migrated) {
+               flags |= TNF_MIGRATED;
+               page_nid = target_nid;
+       }
 
-check_same:
-       spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
-               goto out_unlock;
+       goto out;
 clear_pmdnuma:
+       BUG_ON(!PageLocked(page));
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
+       unlock_page(page);
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (current_nid != -1)
-               task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
+
+       if (page_nid != -1)
+               task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+
        return 0;
 }
 
@@ -1432,6 +1473,12 @@ out:
        return ret;
 }
 
+/*
+ * Returns
+ *  - 0 if PMD could not be locked
+ *  - 1 if PMD was locked but protections are unchanged and a TLB flush is unnecessary
+ *  - HPAGE_PMD_NR if protections were changed and a TLB flush is necessary
+ */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, pgprot_t newprot, int prot_numa)
 {
@@ -1440,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
                pmd_t entry;
-               entry = pmdp_get_and_clear(mm, addr, pmd);
+               ret = 1;
                if (!prot_numa) {
+                       entry = pmdp_get_and_clear(mm, addr, pmd);
                        entry = pmd_modify(entry, newprot);
+                       ret = HPAGE_PMD_NR;
                        BUG_ON(pmd_write(entry));
                } else {
                        struct page *page = pmd_page(*pmd);
 
-                       /* only check non-shared pages */
-                       if (page_mapcount(page) == 1 &&
+                       /*
+                        * Do not trap faults against the zero page. The
+                        * read-only data is likely to be read-cached on the
+                        * local CPU cache and it is less useful to know about
+                        * local vs remote hits on the zero page.
+                        */
+                       if (!is_huge_zero_page(page) &&
                            !pmd_numa(*pmd)) {
+                               entry = pmdp_get_and_clear(mm, addr, pmd);
                                entry = pmd_mknuma(entry);
+                               ret = HPAGE_PMD_NR;
                        }
                }
-               set_pmd_at(mm, addr, pmd, entry);
+
+               /* Set PMD if cleared earlier */
+               if (ret == HPAGE_PMD_NR)
+                       set_pmd_at(mm, addr, pmd, entry);
+
                spin_unlock(&vma->vm_mm->page_table_lock);
-               ret = 1;
        }
 
        return ret;
@@ -1636,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
                page_tail->mapping = page->mapping;
 
                page_tail->index = page->index + i;
-               page_nid_xchg_last(page_tail, page_nid_last(page));
+               page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
 
                BUG_ON(!PageAnon(page_tail));
                BUG_ON(!PageUptodate(page_tail));
index 1311f26497e6a0f776682ed8a7e23b8620a5b9ad..1f2287eaa88e94da2062f8aeef04e46a737f70eb 100644 (file)
@@ -69,8 +69,8 @@
 
 #include "internal.h"
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                get_page(dirty_page);
 
 reuse:
+               /*
+                * Clear the pages cpupid information as the existing
+                * information potentially belongs to a now completely
+                * unrelated process.
+                */
+               if (old_page)
+                       page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3521,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int current_nid)
+                               unsigned long addr, int page_nid,
+                               int *flags)
 {
        get_page(page);
 
        count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (current_nid == numa_node_id())
+       if (page_nid == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+               *flags |= TNF_FAULT_LOCAL;
+       }
 
        return mpol_misplaced(page, vma, addr);
 }
@@ -3537,9 +3548,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *page = NULL;
        spinlock_t *ptl;
-       int current_nid = -1;
+       int page_nid = -1;
+       int last_cpupid;
        int target_nid;
        bool migrated = false;
+       int flags = 0;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3566,123 +3579,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte_unmap_unlock(ptep, ptl);
                return 0;
        }
+       BUG_ON(is_zero_pfn(page_to_pfn(page)));
+
+       /*
+        * Avoid grouping on DSO/COW pages specifically and on RO pages
+        * in general; RO pages shouldn't hurt as much anyway, since
+        * they can be in a shared cache state.
+        */
+       if (!pte_write(pte))
+               flags |= TNF_NO_GROUP;
+
+       /*
+        * Flag if the page is shared between multiple address spaces. This
+        * is later used when determining whether to group tasks together.
+        */
+       if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+               flags |= TNF_SHARED;
 
-       current_nid = page_to_nid(page);
-       target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+       last_cpupid = page_cpupid_last(page);
+       page_nid = page_to_nid(page);
+       target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
        pte_unmap_unlock(ptep, ptl);
        if (target_nid == -1) {
-               /*
-                * Account for the fault against the current node if it not
-                * being replaced regardless of where the page is located.
-                */
-               current_nid = numa_node_id();
                put_page(page);
                goto out;
        }
 
        /* Migrate to the requested node */
-       migrated = migrate_misplaced_page(page, target_nid);
-       if (migrated)
-               current_nid = target_nid;
-
-out:
-       if (current_nid != -1)
-               task_numa_fault(current_nid, 1, migrated);
-       return 0;
-}
-
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                    unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t pmd;
-       pte_t *pte, *orig_pte;
-       unsigned long _addr = addr & PMD_MASK;
-       unsigned long offset;
-       spinlock_t *ptl;
-       bool numa = false;
-       int local_nid = numa_node_id();
-
-       spin_lock(&mm->page_table_lock);
-       pmd = *pmdp;
-       if (pmd_numa(pmd)) {
-               set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-               numa = true;
-       }
-       spin_unlock(&mm->page_table_lock);
-
-       if (!numa)
-               return 0;
-
-       /* we're in a page fault so some vma must be in the range */
-       BUG_ON(!vma);
-       BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-       offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-       VM_BUG_ON(offset >= PMD_SIZE);
-       orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-       pte += offset >> PAGE_SHIFT;
-       for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-               pte_t pteval = *pte;
-               struct page *page;
-               int curr_nid = local_nid;
-               int target_nid;
-               bool migrated;
-               if (!pte_present(pteval))
-                       continue;
-               if (!pte_numa(pteval))
-                       continue;
-               if (addr >= vma->vm_end) {
-                       vma = find_vma(mm, addr);
-                       /* there's a pte present so there must be a vma */
-                       BUG_ON(!vma);
-                       BUG_ON(addr < vma->vm_start);
-               }
-               if (pte_numa(pteval)) {
-                       pteval = pte_mknonnuma(pteval);
-                       set_pte_at(mm, addr, pte, pteval);
-               }
-               page = vm_normal_page(vma, addr, pteval);
-               if (unlikely(!page))
-                       continue;
-               /* only check non-shared pages */
-               if (unlikely(page_mapcount(page) != 1))
-                       continue;
-
-               /*
-                * Note that the NUMA fault is later accounted to either
-                * the node that is currently running or where the page is
-                * migrated to.
-                */
-               curr_nid = local_nid;
-               target_nid = numa_migrate_prep(page, vma, addr,
-                                              page_to_nid(page));
-               if (target_nid == -1) {
-                       put_page(page);
-                       continue;
-               }
-
-               /* Migrate to the requested node */
-               pte_unmap_unlock(pte, ptl);
-               migrated = migrate_misplaced_page(page, target_nid);
-               if (migrated)
-                       curr_nid = target_nid;
-               task_numa_fault(curr_nid, 1, migrated);
-
-               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+       migrated = migrate_misplaced_page(page, vma, target_nid);
+       if (migrated) {
+               page_nid = target_nid;
+               flags |= TNF_MIGRATED;
        }
-       pte_unmap_unlock(orig_pte, ptl);
 
+out:
+       if (page_nid != -1)
+               task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
 }
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                    unsigned long addr, pmd_t *pmdp)
-{
-       BUG();
-       return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * These routines also need to handle stuff like marking pages dirty
@@ -3822,8 +3756,8 @@ retry:
                }
        }
 
-       if (pmd_numa(*pmd))
-               return do_pmd_numa_page(mm, vma, address, pmd);
+       /* THP should already have been handled */
+       BUG_ON(pmd_numa(*pmd));
 
        /*
         * Use __pte_alloc instead of pte_alloc_map, because we can't
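
The reworked fault paths above stop passing a bare "migrated" boolean to task_numa_fault() and instead accumulate TNF_* flags (TNF_FAULT_LOCAL, TNF_NO_GROUP, TNF_SHARED, TNF_MIGRATED) that the accounting side can weigh. A small self-contained sketch of that flag-accumulation style, with the flag values and the stub invented for illustration:

#include <stdio.h>

/* Illustrative flag bits, mirroring the TNF_* style used above. */
#define TNF_MIGRATED    0x01
#define TNF_NO_GROUP    0x02
#define TNF_SHARED      0x04
#define TNF_FAULT_LOCAL 0x08

static void account_numa_fault(int node, int pages, int flags)
{
        printf("node %d, %d page(s):%s%s%s%s\n", node, pages,
               flags & TNF_MIGRATED    ? " migrated" : "",
               flags & TNF_NO_GROUP    ? " no-group" : "",
               flags & TNF_SHARED      ? " shared" : "",
               flags & TNF_FAULT_LOCAL ? " local" : "");
}

int main(void)
{
        int flags = 0;
        int writable = 0, mapcount = 2, shared_vma = 1;
        int page_node = 1, this_node = 1, target_node = -1;

        if (page_node == this_node)
                flags |= TNF_FAULT_LOCAL;      /* fault hit the local node */
        if (!writable)
                flags |= TNF_NO_GROUP;         /* don't group on RO pages */
        if (mapcount > 1 && shared_vma)
                flags |= TNF_SHARED;           /* shared between address spaces */
        if (target_node != -1)
                flags |= TNF_MIGRATED;         /* page actually moved */

        account_numa_fault(page_node, 1, flags);
        return 0;
}
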
index 04729647f359c7c1fa3a91058cc1044c0db2df8d..71cb253368cb72b1bf99d390ca9360ccc277c29b 100644 (file)
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
        return pol;
 }
 
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+       struct mempolicy *pol = get_task_policy(task);
+       if (vma) {
+               if (vma->vm_ops && vma->vm_ops->get_policy) {
+                       bool ret = false;
+
+                       pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+                       if (pol && (pol->flags & MPOL_F_MOF))
+                               ret = true;
+                       mpol_cond_put(pol);
+
+                       return ret;
+               } else if (vma->vm_policy) {
+                       pol = vma->vm_policy;
+               }
+       }
+
+       if (!pol)
+               return default_policy.flags & MPOL_F_MOF;
+
+       return pol->flags & MPOL_F_MOF;
+}
+
 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 {
        enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
        kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+       /* Never defer a private fault */
+       if (cpupid_match_pid(p, last_cpupid))
+               return false;
+
+       if (p->numa_migrate_deferred) {
+               p->numa_migrate_deferred--;
+               return true;
+       }
+       return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+       p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+       return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
        struct zone *zone;
        int curnid = page_to_nid(page);
        unsigned long pgoff;
+       int thiscpu = raw_smp_processor_id();
+       int thisnid = cpu_to_node(thiscpu);
        int polnid = -1;
        int ret = -1;
 
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 
        /* Migrate the page towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
-               int last_nid;
+               int last_cpupid;
+               int this_cpupid;
 
-               polnid = numa_node_id();
+               polnid = thisnid;
+               this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
 
                /*
                 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                 * it less likely we act on an unlikely task<->page
                 * relation.
                 */
-               last_nid = page_nid_xchg_last(page, polnid);
-               if (last_nid != polnid)
+               last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+               if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+                       /* See sysctl_numa_balancing_migrate_deferred comment */
+                       if (!cpupid_match_pid(current, last_cpupid))
+                               defer_numa_migrate(current);
+
+                       goto out;
+               }
+
+               /*
+                * The quadratic filter above reduces extraneous migration
+                * of shared pages somewhat. This check cuts it down further,
+                * lowering the overhead of migrating shared pages.
+                * This makes workloads with shared pages rely more on
+                * "move task near its memory", and less on "move memory
+                * towards its task", which is exactly what we want.
+                */
+               if (numa_migrate_deferred(current, last_cpupid))
                        goto out;
        }
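
The MPOL_F_MORON path now records a cpupid, a CPU and PID packed into the page flags, rather than just a node, so the two-stage filter can tell whether the same task on the same node touched the page last. A userspace sketch of one plausible packing; the field widths are chosen arbitrarily for illustration (the kernel derives them from NR_CPUS and the PID range), and the helper names only loosely mirror the ones used above:

#include <stdio.h>

#define CPUPID_CPU_BITS  8
#define CPUPID_PID_BITS  8
#define CPUPID_CPU_MASK  ((1 << CPUPID_CPU_BITS) - 1)
#define CPUPID_PID_MASK  ((1 << CPUPID_PID_BITS) - 1)

static int make_cpupid(int cpu, int pid)
{
        /* Only the low PID bits are kept: enough to match, cheap to store. */
        return ((cpu & CPUPID_CPU_MASK) << CPUPID_PID_BITS) |
               (pid & CPUPID_PID_MASK);
}

static int cpupid_to_cpu(int cpupid) { return (cpupid >> CPUPID_PID_BITS) & CPUPID_CPU_MASK; }
static int cpupid_to_pid(int cpupid) { return cpupid & CPUPID_PID_MASK; }

int main(void)
{
        int last = make_cpupid(3, 4242);
        int this = make_cpupid(7, 4242);

        /* Same task (low PID bits match) faulting from a different CPU:
           the filter above would treat this as a private, movable access. */
        printf("pid match: %d, cpu %d -> %d\n",
               cpupid_to_pid(last) == cpupid_to_pid(this),
               cpupid_to_cpu(last), cpupid_to_cpu(this));
        return 0;
}
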
 
index 7a7325ee1d089696a8073a84d2f748f326124805..dfc8300ecbb273fe3d7f5ef37e21e20a63b58c84 100644 (file)
@@ -445,6 +445,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
+       int cpupid;
+
        if (PageHuge(page) || PageTransHuge(page))
                copy_huge_page(newpage, page);
        else
@@ -481,6 +483,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
                        __set_page_dirty_nobuffers(newpage);
        }
 
+       /*
+        * Copy NUMA information to the new page, to prevent over-eager
+        * future migrations of this same page.
+        */
+       cpupid = page_cpupid_xchg_last(page, -1);
+       page_cpupid_xchg_last(newpage, cpupid);
+
        mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);
        /*
@@ -1500,7 +1509,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
                                          __GFP_NOWARN) &
                                         ~GFP_IOFS, 0);
        if (newpage)
-               page_nid_xchg_last(newpage, page_nid_last(page));
+               page_cpupid_xchg_last(newpage, page_cpupid_last(page));
 
        return newpage;
 }
@@ -1601,7 +1610,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, int node)
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+                          int node)
 {
        pg_data_t *pgdat = NODE_DATA(node);
        int isolated;
@@ -1609,10 +1619,11 @@ int migrate_misplaced_page(struct page *page, int node)
        LIST_HEAD(migratepages);
 
        /*
-        * Don't migrate pages that are mapped in multiple processes.
-        * TODO: Handle false sharing detection instead of this hammer
+        * Don't migrate file pages that are mapped in multiple processes
+        * with execute permissions as they are probably shared libraries.
         */
-       if (page_mapcount(page) != 1)
+       if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+           (vma->vm_flags & VM_EXEC))
                goto out;
 
        /*
@@ -1662,13 +1673,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        struct mem_cgroup *memcg = NULL;
        int page_lru = page_is_file_cache(page);
 
-       /*
-        * Don't migrate pages that are mapped in multiple processes.
-        * TODO: Handle false sharing detection instead of this hammer
-        */
-       if (page_mapcount(page) != 1)
-               goto out_dropref;
-
        /*
         * Rate-limit the amount of data that is being migrated to a node.
         * Optimal placement is no good if the memory bus is saturated and
@@ -1682,7 +1686,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        if (!new_page)
                goto out_fail;
 
-       page_nid_xchg_last(new_page, page_nid_last(page));
+       page_cpupid_xchg_last(new_page, page_cpupid_last(page));
 
        isolated = numamigrate_isolate_page(pgdat, page);
        if (!isolated) {
@@ -1715,12 +1719,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                unlock_page(new_page);
                put_page(new_page);             /* Free it */
 
-               unlock_page(page);
+               /* Retake the caller's reference and put the page back on the LRU */
+               get_page(page);
                putback_lru_page(page);
-
-               count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-               isolated = 0;
-               goto out;
+               mod_zone_page_state(page_zone(page),
+                        NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+               goto out_fail;
        }
 
        /*
@@ -1737,9 +1741,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
        entry = pmd_mkhuge(entry);
 
-       page_add_new_anon_rmap(new_page, vma, haddr);
-
+       pmdp_clear_flush(vma, haddr, pmd);
        set_pmd_at(mm, haddr, pmd, entry);
+       page_add_new_anon_rmap(new_page, vma, haddr);
        update_mmu_cache_pmd(vma, address, &entry);
        page_remove_rmap(page);
        /*
@@ -1758,7 +1762,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
        count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-out:
        mod_zone_page_state(page_zone(page),
                        NR_ISOLATED_ANON + page_lru,
                        -HPAGE_PMD_NR);
@@ -1767,6 +1770,10 @@ out:
 out_fail:
        count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+       entry = pmd_mknonnuma(entry);
+       set_pmd_at(mm, haddr, pmd, entry);
+       update_mmu_cache_pmd(vma, address, &entry);
+
        unlock_page(page);
        put_page(page);
        return 0;
index 633c08863fd8923ca393f6126bc61f24f38fa427..68562e92d50cba17ba52440f2ae809824a88e783 100644 (file)
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
        unsigned long or_mask, add_mask;
 
        shift = 8 * sizeof(unsigned long);
-       width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+       width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-               "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
+               "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
                SECTIONS_WIDTH,
                NODES_WIDTH,
                ZONES_WIDTH,
-               LAST_NID_WIDTH,
+               LAST_CPUPID_WIDTH,
                NR_PAGEFLAGS);
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-               "Section %d Node %d Zone %d Lastnid %d\n",
+               "Section %d Node %d Zone %d Lastcpupid %d\n",
                SECTIONS_SHIFT,
                NODES_SHIFT,
                ZONES_SHIFT,
-               LAST_NID_SHIFT);
+               LAST_CPUPID_SHIFT);
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
-               "Section %lu Node %lu Zone %lu Lastnid %lu\n",
+               "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
                (unsigned long)SECTIONS_PGSHIFT,
                (unsigned long)NODES_PGSHIFT,
                (unsigned long)ZONES_PGSHIFT,
-               (unsigned long)LAST_NID_PGSHIFT);
+               (unsigned long)LAST_CPUPID_PGSHIFT);
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
                "Node/Zone ID: %lu -> %lu\n",
                (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
                "Node not in page flags");
 #endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
-               "Last nid not in page flags");
+               "Last cpupid not in page flags");
 #endif
 
        if (SECTIONS_WIDTH) {
index 2ac0afbd68f356cfaaa3bf47b33703df6b475785..bf34fb8556db0a5b3805ec97137ae9ee1a3045f9 100644 (file)
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
 
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
        unsigned long old_flags, flags;
-       int last_nid;
+       int last_cpupid;
 
        do {
                old_flags = flags = page->flags;
-               last_nid = page_nid_last(page);
+               last_cpupid = page_cpupid_last(page);
 
-               flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
-               flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+               flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+               flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
        } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 
-       return last_nid;
+       return last_cpupid;
 }
 #endif
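
page_cpupid_xchg_last() above is a classic lock-free read-modify-write on a shared flags word: rebuild the word, then cmpxchg it in, retrying if another CPU changed the flags in the meantime. The same loop in portable C11, with the shift/mask layout invented for the sketch:

#include <stdatomic.h>
#include <stdio.h>

#define LAST_CPUPID_SHIFT 4                     /* illustrative layout */
#define LAST_CPUPID_MASK  0xffUL

static unsigned long cpupid_xchg_last(_Atomic unsigned long *flags, int cpupid)
{
        unsigned long old_flags, new_flags;

        do {
                old_flags = atomic_load(flags);
                new_flags = old_flags & ~(LAST_CPUPID_MASK << LAST_CPUPID_SHIFT);
                new_flags |= ((unsigned long)cpupid & LAST_CPUPID_MASK)
                                << LAST_CPUPID_SHIFT;
                /* Retry if another thread modified *flags since the load. */
        } while (!atomic_compare_exchange_weak(flags, &old_flags, new_flags));

        return (old_flags >> LAST_CPUPID_SHIFT) & LAST_CPUPID_MASK;
}

int main(void)
{
        _Atomic unsigned long flags = 0x5UL << LAST_CPUPID_SHIFT;

        printf("previous cpupid: %lu\n", cpupid_xchg_last(&flags, 9));
        printf("flags now: %#lx\n", (unsigned long)atomic_load(&flags));
        return 0;
}
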
index a3af058f68e4d9f434337d0dcd6127d5d8c6d039..a597f2ffcd6fc873a5f5b9779e4e79ac022bb2fe 100644 (file)
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
-               int dirty_accountable, int prot_numa, bool *ret_all_same_node)
+               int dirty_accountable, int prot_numa)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
-       bool all_same_node = true;
-       int last_nid = -1;
 
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
                                page = vm_normal_page(vma, addr, oldpte);
                                if (page) {
-                                       int this_nid = page_to_nid(page);
-                                       if (last_nid == -1)
-                                               last_nid = this_nid;
-                                       if (last_nid != this_nid)
-                                               all_same_node = false;
-
-                                       /* only check non-shared pages */
-                                       if (!pte_numa(oldpte) &&
-                                           page_mapcount(page) == 1) {
+                                       if (!pte_numa(oldpte)) {
                                                ptent = pte_mknuma(ptent);
                                                updated = true;
                                        }
@@ -104,33 +94,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
                                set_pte_at(mm, addr, pte, newpte);
+
+                               pages++;
                        }
-                       pages++;
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
 
-       *ret_all_same_node = all_same_node;
        return pages;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-                                      pmd_t *pmd)
-{
-       spin_lock(&mm->page_table_lock);
-       set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
-       spin_unlock(&mm->page_table_lock);
-}
-#else
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
-                                      pmd_t *pmd)
-{
-       BUG();
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                pud_t *pud, unsigned long addr, unsigned long end,
                pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -138,34 +112,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
        pmd_t *pmd;
        unsigned long next;
        unsigned long pages = 0;
-       bool all_same_node;
 
        pmd = pmd_offset(pud, addr);
        do {
+               unsigned long this_pages;
+
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                split_huge_page_pmd(vma, addr, pmd);
-                       else if (change_huge_pmd(vma, pmd, addr, newprot,
-                                                prot_numa)) {
-                               pages += HPAGE_PMD_NR;
-                               continue;
+                       else {
+                               int nr_ptes = change_huge_pmd(vma, pmd, addr,
+                                               newprot, prot_numa);
+
+                               if (nr_ptes) {
+                                       if (nr_ptes == HPAGE_PMD_NR)
+                                               pages++;
+
+                                       continue;
+                               }
                        }
                        /* fall through */
                }
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               pages += change_pte_range(vma, pmd, addr, next, newprot,
-                                dirty_accountable, prot_numa, &all_same_node);
-
-               /*
-                * If we are changing protections for NUMA hinting faults then
-                * set pmd_numa if the examined pages were all on the same
-                * node. This allows a regular PMD to be handled as one fault
-                * and effectively batches the taking of the PTL
-                */
-               if (prot_numa && all_same_node)
-                       change_pmd_protnuma(vma->vm_mm, addr, pmd);
+               this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+                                dirty_accountable, prot_numa);
+               pages += this_pages;
        } while (pmd++, addr = next, addr != end);
 
        return pages;
index dd886fac451ab6ab7d6a3132fd2587c10538f300..73d812f16dde6acbad004c8dddb22c140443fcc4 100644 (file)
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
                bad_page(page);
                return 1;
        }
-       page_nid_reset_last(page);
+       page_cpupid_reset_last(page);
        if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
                page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                mminit_verify_page_links(page, zone, nid, pfn);
                init_page_count(page);
                page_mapcount_reset(page);
-               page_nid_reset_last(page);
+               page_cpupid_reset_last(page);
                SetPageReserved(page);
                /*
                 * Mark the block movable so that blocks are reserved for
index 0578d4fa00a9cf1c274b2adb42d8277801dbdeb3..0f676908d15b6108225817108d1cdddf0e31dc16 100644 (file)
@@ -2563,9 +2563,8 @@ bed:
                                  jiffies + msecs_to_jiffies(val));
 
                        /* Wait for IR-LMP to call us back */
-                       __wait_event_interruptible(self->query_wait,
-                             (self->cachedaddr != 0 || self->errno == -ETIME),
-                                                  err);
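+                       /*
+                        * __wait_event_interruptible() now returns the error
+                        * code directly instead of filling in an 'err' output
+                        * argument as the removed three-argument form did.
+                        */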
+                       err = __wait_event_interruptible(self->query_wait,
+                             (self->cachedaddr != 0 || self->errno == -ETIME));
 
                        /* If watchdog is still activated, kill it! */
                        del_timer(&(self->watchdog));
index f4484719f3e638cc2c6cf10e4d0fa5535990ad6a..f63c2388f38d80f470d2ef0e5b38a7a20741696f 100644 (file)
@@ -1637,12 +1637,9 @@ static int sync_thread_master(void *data)
                        continue;
                }
                while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
-                       int ret = 0;
-
-                       __wait_event_interruptible(*sk_sleep(sk),
+                       int ret = __wait_event_interruptible(*sk_sleep(sk),
                                                   sock_writeable(sk) ||
-                                                  kthread_should_stop(),
-                                                  ret);
+                                                  kthread_should_stop());
                        if (unlikely(kthread_should_stop()))
                                goto done;
                }
index ca6cb779876a02de1bdd89f5ab0686e887bd2da0..fc1502098595b8b960f262c4f789c15c247d0d61 100644 (file)
@@ -134,14 +134,14 @@ ifeq ($(VERBOSE),1)
   print_install =
 else
   Q = @
-  print_compile =              echo '  CC                 '$(OBJ);
-  print_app_build =            echo '  BUILD              '$(OBJ);
-  print_fpic_compile =         echo '  CC FPIC            '$(OBJ);
-  print_shared_lib_compile =   echo '  BUILD SHARED LIB   '$(OBJ);
-  print_plugin_obj_compile =   echo '  CC PLUGIN OBJ      '$(OBJ);
-  print_plugin_build =         echo '  CC PLUGI           '$(OBJ);
-  print_static_lib_build =     echo '  BUILD STATIC LIB   '$(OBJ);
-  print_install =              echo '  INSTALL     '$1'        to      $(DESTDIR_SQ)$2';
+  print_compile =              echo '  CC       '$(OBJ);
+  print_app_build =            echo '  BUILD    '$(OBJ);
+  print_fpic_compile =         echo '  CC FPIC  '$(OBJ);
+  print_shared_lib_compile =   echo '  BUILD    SHARED LIB '$(OBJ);
+  print_plugin_obj_compile =   echo '  BUILD    PLUGIN OBJ '$(OBJ);
+  print_plugin_build =         echo '  BUILD    PLUGIN     '$(OBJ);
+  print_static_lib_build =     echo '  BUILD    STATIC LIB '$(OBJ);
+  print_install =              echo '  INSTALL  '$1'   to      $(DESTDIR_SQ)$2';
 endif
 
 do_fpic_compile =                                      \
@@ -268,7 +268,7 @@ TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE)
 TRACEEVENT-CFLAGS: force
        @FLAGS='$(TRACK_CFLAGS)'; \
            if test x"$$FLAGS" != x"`cat TRACEEVENT-CFLAGS 2>/dev/null`" ; then \
-               echo 1>&2 "    * new build flags or cross compiler"; \
+               echo 1>&2 "  FLAGS:   * new build flags or cross compiler"; \
                echo "$$FLAGS" >TRACEEVENT-CFLAGS; \
             fi
 
index 8f8fbc227a468477147629548ea983ee238e6ea6..782d86e961b9fb79195407e7634bceae9d55f279 100644 (file)
@@ -13,6 +13,7 @@ perf*.html
 common-cmds.h
 perf.data
 perf.data.old
+output.svg
 perf-archive
 tags
 TAGS
index 5a37a7c84e69891fc753f205c50b81d22238499f..3ba1c0b09908b0e66a8a87f5307d7e9066a003ce 100644 (file)
@@ -145,16 +145,17 @@ endif
 
 ifneq ($(findstring $(MAKEFLAGS),s),s)
 ifneq ($(V),1)
-       QUIET_ASCIIDOC  = @echo '   ' ASCIIDOC $@;
-       QUIET_XMLTO     = @echo '   ' XMLTO $@;
-       QUIET_DB2TEXI   = @echo '   ' DB2TEXI $@;
-       QUIET_MAKEINFO  = @echo '   ' MAKEINFO $@;
-       QUIET_DBLATEX   = @echo '   ' DBLATEX $@;
-       QUIET_XSLTPROC  = @echo '   ' XSLTPROC $@;
-       QUIET_GEN       = @echo '   ' GEN $@;
+       QUIET_ASCIIDOC  = @echo '  ASCIIDOC '$@;
+       QUIET_XMLTO     = @echo '  XMLTO    '$@;
+       QUIET_DB2TEXI   = @echo '  DB2TEXI  '$@;
+       QUIET_MAKEINFO  = @echo '  MAKEINFO '$@;
+       QUIET_DBLATEX   = @echo '  DBLATEX  '$@;
+       QUIET_XSLTPROC  = @echo '  XSLTPROC '$@;
+       QUIET_GEN       = @echo '  GEN      '$@;
        QUIET_STDERR    = 2> /dev/null
        QUIET_SUBDIR0   = +@subdir=
-       QUIET_SUBDIR1   = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+       QUIET_SUBDIR1   = ;$(NO_SUBDIR) \
+                          echo '  SUBDIR   ' $$subdir; \
                          $(MAKE) $(PRINT_DIR) -C $$subdir
        export V
 endif
@@ -183,47 +184,43 @@ ifdef missing_tools
 endif
 
 do-install-man: man
-       $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
-#      $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
-#      $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
-       $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
-#      $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
-#      $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+       $(call QUIET_INSTALL, Documentation-man) \
+               $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir); \
+               $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+# man5/man7 pages are not generated at the moment:
+#              $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#              $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+#              $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#              $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
 
 install-man: check-man-tools man
 
-try-install-man:
 ifdef missing_tools
-       $(warning Please install $(missing_tools) to have the man pages installed)
+  DO_INSTALL_MAN = $(warning Please install $(missing_tools) to have the man pages installed)
 else
-       $(MAKE) do-install-man
+  DO_INSTALL_MAN = do-install-man
 endif
 
+try-install-man: $(DO_INSTALL_MAN)
+
 install-info: info
-       $(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
-       $(INSTALL) -m 644 $(OUTPUT)perf.info $(OUTPUT)perfman.info $(DESTDIR)$(infodir)
+       $(call QUIET_INSTALL, Documentation-info) \
+               $(INSTALL) -d -m 755 $(DESTDIR)$(infodir); \
+               $(INSTALL) -m 644 $(OUTPUT)perf.info $(OUTPUT)perfman.info $(DESTDIR)$(infodir); \
        if test -r $(DESTDIR)$(infodir)/dir; then \
-         $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
-         $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+               $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+               $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
        else \
          echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
        fi
 
 install-pdf: pdf
-       $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
-       $(INSTALL) -m 644 $(OUTPUT)user-manual.pdf $(DESTDIR)$(pdfdir)
+       $(call QUIET_INSTALL, Documentation-pdf) \
+               $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir); \
+               $(INSTALL) -m 644 $(OUTPUT)user-manual.pdf $(DESTDIR)$(pdfdir)
 
 #install-html: html
 #      '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
 
-ifneq ($(MAKECMDGOALS),clean)
-ifneq ($(MAKECMDGOALS),tags)
-$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-       $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) $(OUTPUT)PERF-VERSION-FILE
-
--include $(OUTPUT)PERF-VERSION-FILE
-endif
-endif
 
 #
 # Determine "include::" file references in asciidoc files.
@@ -253,15 +250,17 @@ $(OUTPUT)cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
        $(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
        date >$@
 
+CLEAN_FILES =                                                                  \
+       $(MAN_XML) $(addsuffix +,$(MAN_XML))                                    \
+       $(MAN_HTML) $(addsuffix +,$(MAN_HTML))                                  \
+       $(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7)                         \
+       $(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++                      \
+       $(OUTPUT)perf.info $(OUTPUT)perfman.info                                \
+       $(OUTPUT)howto-index.txt $(OUTPUT)howto/*.html $(OUTPUT)doc.dep         \
+       $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt          \
+       $(cmds_txt) $(OUTPUT)*.made
 clean:
-       $(RM) $(MAN_XML) $(addsuffix +,$(MAN_XML))
-       $(RM) $(MAN_HTML) $(addsuffix +,$(MAN_HTML))
-       $(RM) $(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7)
-       $(RM) $(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++
-       $(RM) $(OUTPUT)perf.info $(OUTPUT)perfman.info
-       $(RM) $(OUTPUT)howto-index.txt $(OUTPUT)howto/*.html $(OUTPUT)doc.dep
-       $(RM) $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt
-       $(RM) $(cmds_txt) $(OUTPUT)*.made
+       $(call QUIET_CLEAN, Documentation) $(RM) $(CLEAN_FILES)
 
 $(MAN_HTML): $(OUTPUT)%.html : %.txt
        $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
@@ -342,5 +341,3 @@ $(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
 
 #quick-install-html:
 #      '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
-
-.PHONY: .FORCE-PERF-VERSION-FILE
index e9a8349a7172f7533fe970062c5936b4fd7df6fd..fd77d81ea748663284bc55db4db8fa5c20fc2a28 100644 (file)
@@ -21,6 +21,19 @@ OPTIONS
 -a::
 --add=::
         Add specified file to the cache.
+-k::
+--kcore::
+        Add specified kcore file to the cache. For the current host that is
+        /proc/kcore, which requires root permissions to read. Be aware that
+        running 'perf buildid-cache' as root may update root's build-id cache,
+        not the user's. Use the -v option to see where the file is created.
+        Note that the copied file contains only code sections, not the whole
+        core image, and that the "kallsyms" and "modules" files from the same
+        directory are copied along with it. All three files are created with
+        read permissions for root only. kcore will not be added if the cache
+        already holds a kcore (with the same build-id) that has the same
+        modules at the same addresses. Use the -v option to see whether a copy
+        of kcore is actually made.
 -r::
 --remove=::
         Remove specified file from the cache.
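
A usage sketch for the new --kcore option, based on the text above (run via
sudo since /proc/kcore is readable only by root; -v reports where the copy is
placed and whether one is made at all):

  $ sudo perf buildid-cache -v -k /proc/kcore
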
index ac84db2d23342aaa269255e34c9ab3f5d70a3e4f..6a06cefe96427453ad3b21e1caf64889f5f9ea55 100644 (file)
@@ -109,7 +109,9 @@ STAT LIVE OPTIONS
 
 -m::
 --mmap-pages=::
-    Number of mmap data pages. Must be a power of two.
+    Number of mmap data pages (must be a power of two) or a size
+    specification with an appended unit character - B/K/M/G. The
+    size is rounded up to the nearest power-of-two number of pages.
 
 -a::
 --all-cpus::
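
The two accepted forms of the extended --mmap-pages argument, as a sketch
(assuming this hunk patches the 'perf kvm' page, judging by the STAT LIVE
OPTIONS heading; the same syntax applies to the record, top and trace pages
updated below):

  $ perf kvm stat live -m 512   # plain pages, must be a power of two
  $ perf kvm stat live -m 16M   # size with unit (B/K/M/G), rounded up to a
                                # power-of-two number of pages
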
index c7f5f55634ac3fab974216404fdcb6348aec94d0..ab25be28c9dcaab31adb1822195afab571308910 100644 (file)
@@ -48,7 +48,7 @@ REPORT OPTIONS
 -k::
 --key=<value>::
         Sorting key. Possible values: acquired (default), contended,
-        wait_total, wait_max, wait_min.
+        avg_wait, wait_total, wait_max, wait_min.
 
 INFO OPTIONS
 ------------
index e297b74471b8a32e76912342f50f77534360ec87..f10ab63576d7550dbffade473f45332be969657c 100644 (file)
@@ -87,7 +87,9 @@ OPTIONS
 
 -m::
 --mmap-pages=::
-       Number of mmap data pages. Must be a power of two.
+       Number of mmap data pages (must be a power of two) or a size
+       specification with an appended unit character - B/K/M/G. The
+       size is rounded up to the nearest power-of-two number of pages.
 
 -g::
 --call-graph::
@@ -166,6 +168,9 @@ following filters are defined:
         - u:  only when the branch target is at the user level
         - k: only when the branch target is in the kernel
         - hv: only when the target is at the hypervisor level
+       - in_tx: only when the target is in a hardware transaction
+       - no_tx: only when the target is not in a hardware transaction
+       - abort_tx: only when the target is a hardware transaction abort
 
 +
 The option requires at least one branch type among any, any_call, any_ret, ind_call.
@@ -176,12 +181,14 @@ is enabled for all the sampling events. The sampled branch type is the same for
 The various filters must be specified as a comma-separated list: --branch-filter any_ret,u,k
 Note that this feature may not be available on all processors.
 
--W::
 --weight::
 Enable weighted sampling. An additional weight is recorded per sample and can be
 displayed with the weight and local_weight sort keys.  This currently works for TSX
 abort events and some memory events in precise mode on modern Intel CPUs.
 
+--transaction::
+Record transaction flags for transaction related events.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
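
Illustrative invocations of the options documented above ('./tsx-app' is a
hypothetical workload, and the combinations are sketches rather than
recommended configurations):

  $ perf record --branch-filter any_ret,u,k ./tsx-app     # form from the text
  $ perf record --branch-filter any_call,in_tx ./tsx-app  # new TSX filter
  $ perf record --weight ./tsx-app                        # weighted sampling
  $ perf record --transaction ./tsx-app                   # transaction flags
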
index 2b8097ee39d83c194fd8393a0a1a3e2b67e40b65..10a2798712511a26616ea618a91130a6ed467345 100644 (file)
@@ -71,7 +71,11 @@ OPTIONS
        entries are displayed as "[other]".
        - cpu: cpu number the task ran at the time of sample
        - srcline: filename and line number executed at the time of sample.  The
-       DWARF debuggin info must be provided.
+       DWARF debugging info must be provided.
+       - weight: Event specific weight, e.g. memory latency or transaction
+       abort cost. This is the global weight.
+       - local_weight: Local weight version of the weight above.
+       - transaction: Transaction abort flags.
 
        By default, comm, dso and symbol keys are used.
        (i.e. --sort comm,dso,symbol)
@@ -85,6 +89,8 @@ OPTIONS
        - symbol_from: name of function branched from
        - symbol_to: name of function branched to
        - mispredict: "N" for predicted branch, "Y" for mispredicted branch
+       - in_tx: branch in TSX transaction
+       - abort: TSX transaction abort.
 
        And default sort keys are changed to comm, dso_from, symbol_from, dso_to
        and symbol_to, see '--branch-stack'.
@@ -135,6 +141,14 @@ OPTIONS
 
        Default: fractal,0.5,callee,function.
 
+--max-stack::
+       Set the stack depth limit when parsing the callchain: anything
+       beyond the specified depth is ignored. This is a trade-off
+       between information loss and faster processing, especially for
+       workloads that can have very long callchain stacks.
+
+       Default: 127
+
 -G::
 --inverted::
         alias for inverted caller based call graph.
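
A sketch combining the new keys from this hunk (assumes a perf.data recorded
with matching options):

  $ perf report --sort comm,dso,transaction --max-stack 64
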
index 73c9759005a354eb8e052e7425f981bd16fb8c2a..80c7da6732f294782d86c51df8e8ba9225bf1d04 100644 (file)
@@ -137,6 +137,11 @@ core number and the number of online logical processors on that physical process
 After starting the program, wait msecs before measuring. This is useful to
 filter out the startup phase of the program, which is often very different.
 
+-T::
+--transaction::
+
+Print statistics of transactional execution if supported.
+
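An invocation sketch for the new -T option ('./tsx-app' stands for a
hypothetical transactional workload; the output depends on hardware support):

  $ perf stat -T -- ./tsx-app
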
 EXAMPLES
 --------
 
index 1632b0efc7577a730388154164be888c619ad934..3ff8bd4f0b4d94163d77231881b462e50bbd4e17 100644 (file)
@@ -8,7 +8,8 @@ perf-timechart - Tool to visualize total system behavior during a workload
 SYNOPSIS
 --------
 [verse]
-'perf timechart' {record}
+'perf timechart' record <command>
+'perf timechart' [<options>]
 
 DESCRIPTION
 -----------
@@ -41,6 +42,18 @@ OPTIONS
 --symfs=<directory>::
         Look for files with symbols relative to this directory.
 
+EXAMPLES
+--------
+
+$ perf timechart record git pull
+
+  [ perf record: Woken up 13 times to write data ]
+  [ perf record: Captured and wrote 4.253 MB perf.data (~185801 samples) ]
+
+$ perf timechart
+
+  Written 10.2 seconds of trace to output.svg.
+
 SEE ALSO
 --------
 linkperf:perf-record[1]
index 58d6598a968679fb4c020f9d932443a57f9feee8..c16a09e2f182333cfa710d7f11df76240ad4634f 100644 (file)
@@ -68,7 +68,9 @@ Default is to monitor all CPUS.
 
 -m <pages>::
 --mmap-pages=<pages>::
-       Number of mmapped data pages.
+       Number of mmap data pages (must be a power of two) or a size
+       specification with an appended unit character - B/K/M/G. The
+       size is rounded up to the nearest power-of-two number of pages.
 
 -p <pid>::
 --pid=<pid>::
@@ -112,7 +114,8 @@ Default is to monitor all CPUS.
 
 -s::
 --sort::
-       Sort by key(s): pid, comm, dso, symbol, parent, srcline, weight, local_weight.
+       Sort by key(s): pid, comm, dso, symbol, parent, srcline, weight,
+       local_weight, abort, in_tx, transaction.
 
 -n::
 --show-nr-samples::
@@ -155,6 +158,14 @@ Default is to monitor all CPUS.
 
        Default: fractal,0.5,callee.
 
+--max-stack::
+       Set the stack depth limit when parsing the callchain: anything
+       beyond the specified depth is ignored. This is a trade-off
+       between information loss and faster processing, especially for
+       workloads that can have very long callchain stacks.
+
+       Default: 127
+
 --ignore-callees=<regex>::
         Ignore callees of the function(s) matching the given regex.
         This has the effect of collecting the callers of each such
index daccd2c0a48fe9c933cf50c870f68aa9bfe253a6..7b0497f95a75f0dafdf079742cec136dcff3ffdc 100644 (file)
@@ -9,6 +9,7 @@ SYNOPSIS
 --------
 [verse]
 'perf trace'
+'perf trace record'
 
 DESCRIPTION
 -----------
@@ -16,9 +17,14 @@ This command will show the events associated with the target, initially
 syscalls, but other system events like pagefaults, task lifetime events,
 scheduling events, etc.
 
-Initially this is a live mode only tool, but eventually will work with
-perf.data files like the other tools, allowing a detached 'record' from
-analysis phases.
+This is a live mode tool in addition to working with perf.data files like
+the other perf tools. Files can be generated using the 'perf record' command,
+but the session needs to include the raw_syscalls events (-e 'raw_syscalls:*').
+Alternatively, 'perf trace record' can be used as a shortcut to
+automatically include the raw_syscalls events when writing events to a file.
+
+The following options apply to perf trace; options to perf trace record are
+found in the perf record man page.
 
 OPTIONS
 -------
@@ -59,7 +65,9 @@ OPTIONS
 
 -m::
 --mmap-pages=::
-       Number of mmap data pages. Must be a power of two.
+       Number of mmap data pages (must be a power of two) or a size
+       specification with an appended unit character - B/K/M/G. The
+       size is rounded up to the nearest power-of-two number of pages.
 
 -C::
 --cpu::
@@ -78,6 +86,21 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
 --input
        Process events from a given perf data file.
 
+-T::
+--time::
+       Print the full timestamp rather than the time relative to the first
+       sample.
+
+--comm::
+        Show process COMM right beside its ID, on by default, disable with --no-comm.
+
+--summary::
+       Show a summary of syscalls by thread with min, max, and average times (in
+       msec) and relative stddev.
+
+--tool_stats::
+       Show tool stats such as the number of times fd->pathname was discovered
+       through hooking the open syscall return + vfs_getname or via reading
+       /proc/pid/fd, etc.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script[1]
index 64c043b7a43848f17a023dcf9479dbd77f96d7be..4835618a5608892054de87d497e45e644fdeb0da 100644 (file)
-include ../scripts/Makefile.include
-
-# The default target of this Makefile is...
-all:
-
-include config/utilities.mak
-
-# Define V to have a more verbose compile.
-#
-# Define O to save output files in a separate directory.
-#
-# Define ARCH as name of target architecture if you want cross-builds.
-#
-# Define CROSS_COMPILE as prefix name of compiler if you want cross-builds.
-#
-# Define NO_LIBPERL to disable perl script extension.
-#
-# Define NO_LIBPYTHON to disable python script extension.
-#
-# Define PYTHON to point to the python binary if the default
-# `python' is not correct; for example: PYTHON=python2
-#
-# Define PYTHON_CONFIG to point to the python-config binary if
-# the default `$(PYTHON)-config' is not correct.
 #
-# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+# This is a simple wrapper Makefile that calls the main Makefile.perf
+# with a -j option to do parallel builds
 #
-# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+# If you want to invoke the perf build in some non-standard way, you can
+# invoke it directly via 'make -f Makefile.perf'.
 #
-# Define LDFLAGS=-static to build a static binary.
-#
-# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
-#
-# Define NO_DWARF if you do not want debug-info analysis feature at all.
-#
-# Define WERROR=0 to disable treating any warnings as errors.
-#
-# Define NO_NEWT if you do not want TUI support. (deprecated)
-#
-# Define NO_SLANG if you do not want TUI support.
-#
-# Define NO_GTK2 if you do not want GTK+ GUI support.
+
 #
-# Define NO_DEMANGLE if you do not want C++ symbol demangling.
+# Clear out the built-in rules GNU make defines by default (such as .o targets),
+# so that we pass through all targets to Makefile.perf:
 #
-# Define NO_LIBELF if you do not want libelf dependency (e.g. cross-builds)
+.SUFFIXES:
+
 #
-# Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf
-# backtrace post unwind.
+# We don't want to pass along options like -j:
 #
-# Define NO_BACKTRACE if you do not want stack backtrace debug feature
+unexport MAKEFLAGS
+
 #
-# Define NO_LIBNUMA if you do not want numa perf benchmark
+# Do a parallel build with multiple jobs, based on the number of CPUs online
+# in this system: 'make -j8' on an 8-CPU system, etc.
 #
-# Define NO_LIBAUDIT if you do not want libaudit support
+# (To override it, run 'make JOBS=1' and similar.)
 #
-# Define NO_LIBBIONIC if you do not want bionic support
-
-ifeq ($(srctree),)
-srctree := $(patsubst %/,%,$(dir $(shell pwd)))
-srctree := $(patsubst %/,%,$(dir $(srctree)))
-#$(info Determined 'srctree' to be $(srctree))
-endif
-
-ifneq ($(objtree),)
-#$(info Determined 'objtree' to be $(objtree))
-endif
-
-ifneq ($(OUTPUT),)
-#$(info Determined 'OUTPUT' to be $(OUTPUT))
-endif
-
-$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-       @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
-
-CC = $(CROSS_COMPILE)gcc
-AR = $(CROSS_COMPILE)ar
-
-RM      = rm -f
-MKDIR   = mkdir
-FIND    = find
-INSTALL = install
-FLEX    = flex
-BISON   = bison
-STRIP   = strip
-
-LK_DIR          = $(srctree)/tools/lib/lk/
-TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/
-
-# include config/Makefile by default and rule out
-# non-config cases
-config := 1
-
-NON_CONFIG_TARGETS := clean TAGS tags cscope help
-
-ifdef MAKECMDGOALS
-ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
-  config := 0
-endif
+ifeq ($(JOBS),)
+  JOBS := $(shell grep -c ^processor /proc/cpuinfo 2>/dev/null)
+  ifeq ($(JOBS),)
+    JOBS := 1
+  endif
 endif
 
-ifeq ($(config),1)
-include config/Makefile
+#
+# Only pass canonical directory names as the output directory:
+#
+ifneq ($(O),)
+  FULL_O := $(shell readlink -f $(O) || echo $(O))
 endif
 
-export prefix bindir sharedir sysconfdir
-
-# sparse is architecture-neutral, which means that we need to tell it
-# explicitly what architecture to check for. Fix this up for yours..
-SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
-
-# Guard against environment variables
-BUILTIN_OBJS =
-LIB_H =
-LIB_OBJS =
-PYRF_OBJS =
-SCRIPT_SH =
-
-SCRIPT_SH += perf-archive.sh
-
-grep-libs = $(filter -l%,$(1))
-strip-libs = $(filter-out -l%,$(1))
-
-ifneq ($(OUTPUT),)
-  TE_PATH=$(OUTPUT)
-ifneq ($(subdir),)
-  LK_PATH=$(OUTPUT)/../lib/lk/
-else
-  LK_PATH=$(OUTPUT)
-endif
+#
+# Only accept the 'DEBUG' variable from the command line:
+#
+ifeq ("$(origin DEBUG)", "command line")
+  ifeq ($(DEBUG),)
+    override DEBUG = 0
+  else
+    SET_DEBUG = "DEBUG=$(DEBUG)"
+  endif
 else
-  TE_PATH=$(TRACE_EVENT_DIR)
-  LK_PATH=$(LK_DIR)
+  override DEBUG = 0
 endif
 
-LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
-export LIBTRACEEVENT
-
-LIBLK = $(LK_PATH)liblk.a
-export LIBLK
-
-# python extension build directories
-PYTHON_EXTBUILD     := $(OUTPUT)python_ext_build/
-PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
-PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/
-export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
+define print_msg
+  @printf '  BUILD:   Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' parallel build\n'
+endef
 
-python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
+define make
+  @$(MAKE) -f Makefile.perf --no-print-directory -j$(JOBS) O=$(FULL_O) $(SET_DEBUG) $@
+endef
 
-PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
-PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBLK)
-
-$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
-       $(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \
-         --quiet build_ext; \
-       mkdir -p $(OUTPUT)python && \
-       cp $(PYTHON_EXTBUILD_LIB)perf.so $(OUTPUT)python/
 #
-# No Perl scripts right now:
+# Needed if no target specified:
 #
-
-SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
+all:
+       $(print_msg)
+       $(make)
 
 #
-# Single 'perf' binary right now:
+# The clean target is not really parallel, don't print the jobs info:
 #
-PROGRAMS += $(OUTPUT)perf
-
-# what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
-
-# what 'all' will build but not install in perfexecdir
-OTHER_PROGRAMS = $(OUTPUT)perf
-
-# Set paths to tools early so that they can be used for version tests.
-ifndef SHELL_PATH
-  SHELL_PATH = /bin/sh
-endif
-ifndef PERL_PATH
-  PERL_PATH = /usr/bin/perl
-endif
-
-export PERL_PATH
-
-$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
-       $(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) -t util/parse-events.l > $(OUTPUT)util/parse-events-flex.c
-
-$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
-       $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p parse_events_
-
-$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
-       $(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/pmu-flex.h -t util/pmu.l > $(OUTPUT)util/pmu-flex.c
-
-$(OUTPUT)util/pmu-bison.c: util/pmu.y
-       $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_
-
-$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
-$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
-
-LIB_FILE=$(OUTPUT)libperf.a
-
-LIB_H += ../../include/uapi/linux/perf_event.h
-LIB_H += ../../include/linux/rbtree.h
-LIB_H += ../../include/linux/list.h
-LIB_H += ../../include/uapi/linux/const.h
-LIB_H += ../../include/linux/hash.h
-LIB_H += ../../include/linux/stringify.h
-LIB_H += util/include/linux/bitmap.h
-LIB_H += util/include/linux/bitops.h
-LIB_H += util/include/linux/compiler.h
-LIB_H += util/include/linux/const.h
-LIB_H += util/include/linux/ctype.h
-LIB_H += util/include/linux/kernel.h
-LIB_H += util/include/linux/list.h
-LIB_H += util/include/linux/export.h
-LIB_H += util/include/linux/magic.h
-LIB_H += util/include/linux/poison.h
-LIB_H += util/include/linux/prefetch.h
-LIB_H += util/include/linux/rbtree.h
-LIB_H += util/include/linux/rbtree_augmented.h
-LIB_H += util/include/linux/string.h
-LIB_H += util/include/linux/types.h
-LIB_H += util/include/linux/linkage.h
-LIB_H += util/include/asm/asm-offsets.h
-LIB_H += util/include/asm/bug.h
-LIB_H += util/include/asm/byteorder.h
-LIB_H += util/include/asm/hweight.h
-LIB_H += util/include/asm/swab.h
-LIB_H += util/include/asm/system.h
-LIB_H += util/include/asm/uaccess.h
-LIB_H += util/include/dwarf-regs.h
-LIB_H += util/include/asm/dwarf2.h
-LIB_H += util/include/asm/cpufeature.h
-LIB_H += util/include/asm/unistd_32.h
-LIB_H += util/include/asm/unistd_64.h
-LIB_H += perf.h
-LIB_H += util/annotate.h
-LIB_H += util/cache.h
-LIB_H += util/callchain.h
-LIB_H += util/build-id.h
-LIB_H += util/debug.h
-LIB_H += util/sysfs.h
-LIB_H += util/pmu.h
-LIB_H += util/event.h
-LIB_H += util/evsel.h
-LIB_H += util/evlist.h
-LIB_H += util/exec_cmd.h
-LIB_H += util/types.h
-LIB_H += util/levenshtein.h
-LIB_H += util/machine.h
-LIB_H += util/map.h
-LIB_H += util/parse-options.h
-LIB_H += util/parse-events.h
-LIB_H += util/quote.h
-LIB_H += util/util.h
-LIB_H += util/xyarray.h
-LIB_H += util/header.h
-LIB_H += util/help.h
-LIB_H += util/session.h
-LIB_H += util/strbuf.h
-LIB_H += util/strlist.h
-LIB_H += util/strfilter.h
-LIB_H += util/svghelper.h
-LIB_H += util/tool.h
-LIB_H += util/run-command.h
-LIB_H += util/sigchain.h
-LIB_H += util/dso.h
-LIB_H += util/symbol.h
-LIB_H += util/color.h
-LIB_H += util/values.h
-LIB_H += util/sort.h
-LIB_H += util/hist.h
-LIB_H += util/thread.h
-LIB_H += util/thread_map.h
-LIB_H += util/trace-event.h
-LIB_H += util/probe-finder.h
-LIB_H += util/dwarf-aux.h
-LIB_H += util/probe-event.h
-LIB_H += util/pstack.h
-LIB_H += util/cpumap.h
-LIB_H += util/top.h
-LIB_H += $(ARCH_INCLUDE)
-LIB_H += util/cgroup.h
-LIB_H += $(LIB_INCLUDE)traceevent/event-parse.h
-LIB_H += util/target.h
-LIB_H += util/rblist.h
-LIB_H += util/intlist.h
-LIB_H += util/perf_regs.h
-LIB_H += util/unwind.h
-LIB_H += util/vdso.h
-LIB_H += ui/helpline.h
-LIB_H += ui/progress.h
-LIB_H += ui/util.h
-LIB_H += ui/ui.h
-
-LIB_OBJS += $(OUTPUT)util/abspath.o
-LIB_OBJS += $(OUTPUT)util/alias.o
-LIB_OBJS += $(OUTPUT)util/annotate.o
-LIB_OBJS += $(OUTPUT)util/build-id.o
-LIB_OBJS += $(OUTPUT)util/config.o
-LIB_OBJS += $(OUTPUT)util/ctype.o
-LIB_OBJS += $(OUTPUT)util/sysfs.o
-LIB_OBJS += $(OUTPUT)util/pmu.o
-LIB_OBJS += $(OUTPUT)util/environment.o
-LIB_OBJS += $(OUTPUT)util/event.o
-LIB_OBJS += $(OUTPUT)util/evlist.o
-LIB_OBJS += $(OUTPUT)util/evsel.o
-LIB_OBJS += $(OUTPUT)util/exec_cmd.o
-LIB_OBJS += $(OUTPUT)util/help.o
-LIB_OBJS += $(OUTPUT)util/levenshtein.o
-LIB_OBJS += $(OUTPUT)util/parse-options.o
-LIB_OBJS += $(OUTPUT)util/parse-events.o
-LIB_OBJS += $(OUTPUT)util/path.o
-LIB_OBJS += $(OUTPUT)util/rbtree.o
-LIB_OBJS += $(OUTPUT)util/bitmap.o
-LIB_OBJS += $(OUTPUT)util/hweight.o
-LIB_OBJS += $(OUTPUT)util/run-command.o
-LIB_OBJS += $(OUTPUT)util/quote.o
-LIB_OBJS += $(OUTPUT)util/strbuf.o
-LIB_OBJS += $(OUTPUT)util/string.o
-LIB_OBJS += $(OUTPUT)util/strlist.o
-LIB_OBJS += $(OUTPUT)util/strfilter.o
-LIB_OBJS += $(OUTPUT)util/top.o
-LIB_OBJS += $(OUTPUT)util/usage.o
-LIB_OBJS += $(OUTPUT)util/wrapper.o
-LIB_OBJS += $(OUTPUT)util/sigchain.o
-LIB_OBJS += $(OUTPUT)util/dso.o
-LIB_OBJS += $(OUTPUT)util/symbol.o
-LIB_OBJS += $(OUTPUT)util/symbol-elf.o
-LIB_OBJS += $(OUTPUT)util/color.o
-LIB_OBJS += $(OUTPUT)util/pager.o
-LIB_OBJS += $(OUTPUT)util/header.o
-LIB_OBJS += $(OUTPUT)util/callchain.o
-LIB_OBJS += $(OUTPUT)util/values.o
-LIB_OBJS += $(OUTPUT)util/debug.o
-LIB_OBJS += $(OUTPUT)util/machine.o
-LIB_OBJS += $(OUTPUT)util/map.o
-LIB_OBJS += $(OUTPUT)util/pstack.o
-LIB_OBJS += $(OUTPUT)util/session.o
-LIB_OBJS += $(OUTPUT)util/thread.o
-LIB_OBJS += $(OUTPUT)util/thread_map.o
-LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
-LIB_OBJS += $(OUTPUT)util/parse-events-flex.o
-LIB_OBJS += $(OUTPUT)util/parse-events-bison.o
-LIB_OBJS += $(OUTPUT)util/pmu-flex.o
-LIB_OBJS += $(OUTPUT)util/pmu-bison.o
-LIB_OBJS += $(OUTPUT)util/trace-event-read.o
-LIB_OBJS += $(OUTPUT)util/trace-event-info.o
-LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o
-LIB_OBJS += $(OUTPUT)util/svghelper.o
-LIB_OBJS += $(OUTPUT)util/sort.o
-LIB_OBJS += $(OUTPUT)util/hist.o
-LIB_OBJS += $(OUTPUT)util/probe-event.o
-LIB_OBJS += $(OUTPUT)util/util.o
-LIB_OBJS += $(OUTPUT)util/xyarray.o
-LIB_OBJS += $(OUTPUT)util/cpumap.o
-LIB_OBJS += $(OUTPUT)util/cgroup.o
-LIB_OBJS += $(OUTPUT)util/target.o
-LIB_OBJS += $(OUTPUT)util/rblist.o
-LIB_OBJS += $(OUTPUT)util/intlist.o
-LIB_OBJS += $(OUTPUT)util/vdso.o
-LIB_OBJS += $(OUTPUT)util/stat.o
-LIB_OBJS += $(OUTPUT)util/record.o
-
-LIB_OBJS += $(OUTPUT)ui/setup.o
-LIB_OBJS += $(OUTPUT)ui/helpline.o
-LIB_OBJS += $(OUTPUT)ui/progress.o
-LIB_OBJS += $(OUTPUT)ui/util.o
-LIB_OBJS += $(OUTPUT)ui/hist.o
-LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
-
-LIB_OBJS += $(OUTPUT)arch/common.o
-
-LIB_OBJS += $(OUTPUT)tests/parse-events.o
-LIB_OBJS += $(OUTPUT)tests/dso-data.o
-LIB_OBJS += $(OUTPUT)tests/attr.o
-LIB_OBJS += $(OUTPUT)tests/vmlinux-kallsyms.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall-all-cpus.o
-LIB_OBJS += $(OUTPUT)tests/open-syscall-tp-fields.o
-LIB_OBJS += $(OUTPUT)tests/mmap-basic.o
-LIB_OBJS += $(OUTPUT)tests/perf-record.o
-LIB_OBJS += $(OUTPUT)tests/rdpmc.o
-LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o
-LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o
-LIB_OBJS += $(OUTPUT)tests/pmu.o
-LIB_OBJS += $(OUTPUT)tests/hists_link.o
-LIB_OBJS += $(OUTPUT)tests/python-use.o
-LIB_OBJS += $(OUTPUT)tests/bp_signal.o
-LIB_OBJS += $(OUTPUT)tests/bp_signal_overflow.o
-LIB_OBJS += $(OUTPUT)tests/task-exit.o
-LIB_OBJS += $(OUTPUT)tests/sw-clock.o
-ifeq ($(ARCH),x86)
-LIB_OBJS += $(OUTPUT)tests/perf-time-to-tsc.o
-endif
-LIB_OBJS += $(OUTPUT)tests/code-reading.o
-LIB_OBJS += $(OUTPUT)tests/sample-parsing.o
-LIB_OBJS += $(OUTPUT)tests/parse-no-sample-id-all.o
-
-BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
-BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
-# Benchmark modules
-BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
-BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
-ifeq ($(RAW_ARCH),x86_64)
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
-endif
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
-BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
-
-BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
-BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
-BUILTIN_OBJS += $(OUTPUT)builtin-help.o
-BUILTIN_OBJS += $(OUTPUT)builtin-sched.o
-BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o
-BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o
-BUILTIN_OBJS += $(OUTPUT)builtin-list.o
-BUILTIN_OBJS += $(OUTPUT)builtin-record.o
-BUILTIN_OBJS += $(OUTPUT)builtin-report.o
-BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
-BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
-BUILTIN_OBJS += $(OUTPUT)builtin-top.o
-BUILTIN_OBJS += $(OUTPUT)builtin-script.o
-BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
-BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
-BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
-BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
-BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
-BUILTIN_OBJS += $(OUTPUT)tests/builtin-test.o
-BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
-
-PERFLIBS = $(LIB_FILE) $(LIBLK) $(LIBTRACEEVENT)
-
-# We choose to avoid "if .. else if .. else .. endif endif"
-# because maintaining the nesting to match is a pain.  If
-# we had "elif" things would have been much nicer...
-
--include arch/$(ARCH)/Makefile
-
-ifneq ($(OUTPUT),)
-  CFLAGS += -I$(OUTPUT)
-endif
-
-ifdef NO_LIBELF
-EXTLIBS := $(filter-out -lelf,$(EXTLIBS))
-
-# Remove ELF/DWARF dependent codes
-LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS))
-LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS))
-
-BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS))
-
-# Use minimal symbol handling
-LIB_OBJS += $(OUTPUT)util/symbol-minimal.o
-
-else # NO_LIBELF
-ifndef NO_DWARF
-  LIB_OBJS += $(OUTPUT)util/probe-finder.o
-  LIB_OBJS += $(OUTPUT)util/dwarf-aux.o
-endif # NO_DWARF
-endif # NO_LIBELF
-
-ifndef NO_LIBUNWIND
-  LIB_OBJS += $(OUTPUT)util/unwind.o
-endif
-LIB_OBJS += $(OUTPUT)tests/keep-tracking.o
-
-ifndef NO_LIBAUDIT
-  BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
-endif
-
-ifndef NO_SLANG
-  LIB_OBJS += $(OUTPUT)ui/browser.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/map.o
-  LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o
-  LIB_OBJS += $(OUTPUT)ui/tui/setup.o
-  LIB_OBJS += $(OUTPUT)ui/tui/util.o
-  LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
-  LIB_OBJS += $(OUTPUT)ui/tui/progress.o
-  LIB_H += ui/browser.h
-  LIB_H += ui/browsers/map.h
-  LIB_H += ui/keysyms.h
-  LIB_H += ui/libslang.h
-endif
-
-ifndef NO_GTK2
-  LIB_OBJS += $(OUTPUT)ui/gtk/browser.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/hists.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/setup.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/util.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/helpline.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/progress.o
-  LIB_OBJS += $(OUTPUT)ui/gtk/annotate.o
-endif
-
-ifndef NO_LIBPERL
-  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
-  LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
-endif
-
-ifndef NO_LIBPYTHON
-  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
-  LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
-endif
-
-ifeq ($(NO_PERF_REGS),0)
-  ifeq ($(ARCH),x86)
-    LIB_H += arch/x86/include/perf_regs.h
-  endif
-endif
-
-ifndef NO_LIBNUMA
-  BUILTIN_OBJS += $(OUTPUT)bench/numa.o
-endif
-
-ifdef ASCIIDOC8
-  export ASCIIDOC8
-endif
-
-LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive -Wl,--start-group $(EXTLIBS) -Wl,--end-group
-
-export INSTALL SHELL_PATH
-
-### Build rules
-
-SHELL = $(SHELL_PATH)
-
-all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
-
-please_set_SHELL_PATH_to_a_more_modern_shell:
-       @$$(:)
-
-shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
-
-strip: $(PROGRAMS) $(OUTPUT)perf
-       $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
-
-$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \
-               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-               $(CFLAGS) -c $(filter %.c,$^) -o $@
-
-$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OUTPUT)perf.o \
-               $(BUILTIN_OBJS) $(LIBS) -o $@
-
-$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-               '-DPERF_MAN_PATH="$(mandir_SQ)"' \
-               '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-               '-DPERF_MAN_PATH="$(mandir_SQ)"' \
-               '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
-
-$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
-       $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
-
-$(SCRIPTS) : % : %.sh
-       $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
-
-# These can record PERF_VERSION
-$(OUTPUT)perf.o perf.spec \
-       $(SCRIPTS) \
-       : $(OUTPUT)PERF-VERSION-FILE
-
-.SUFFIXES:
-.SUFFIXES: .o .c .S .s
-
-# These two need to be here so that when O= is not used they take precedence
-# over the general rule for .o
-
-$(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -w $<
-
-$(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $<
-
-$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
-$(OUTPUT)%.i: %.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
-$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -S $(CFLAGS) $<
-$(OUTPUT)%.o: %.S
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
-$(OUTPUT)%.s: %.S
-       $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
-
-$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-               '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
-               '-DPREFIX="$(prefix_SQ)"' \
-               $<
-
-$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-               '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \
-               $<
-
-$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
-               -DPYTHONPATH='"$(OUTPUT)python"' \
-               -DPYTHON='"$(PYTHON_WORD)"' \
-               $<
-
-$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)ui/browser.o: ui/browser.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/annotate.o: ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+clean:
+       $(make)
 
-$(OUTPUT)ui/browsers/hists.o: ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/map.o: ui/browsers/map.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)ui/browsers/scripts.o: ui/browsers/scripts.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
-$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/parse-events.o: util/parse-events.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-redundant-decls $<
-
-$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default $<
-
-$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default $<
-
-$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
-
-$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
-       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
-
-$(OUTPUT)perf-%: %.o $(PERFLIBS)
-       $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $(LDFLAGS) $(filter %.o,$^) $(LIBS)
-
-$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
-
-# we compile into subdirectories. if the target directory is not the source directory, they might not exists. So
-# we depend the various files onto their directories.
-DIRECTORY_DEPS = $(LIB_OBJS) $(BUILTIN_OBJS) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h
-$(DIRECTORY_DEPS): | $(sort $(dir $(DIRECTORY_DEPS)))
-# In the second step, we make a rule to actually create these directories
-$(sort $(dir $(DIRECTORY_DEPS))):
-       $(QUIET_MKDIR)$(MKDIR) -p $@ 2>/dev/null
-
-$(LIB_FILE): $(LIB_OBJS)
-       $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
-
-# libtraceevent.a
-$(LIBTRACEEVENT):
-       $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libtraceevent.a
-
-$(LIBTRACEEVENT)-clean:
-       $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) clean
-
-# if subdir is set, we've been called from above so target has been built
-# already
-$(LIBLK):
-ifeq ($(subdir),)
-       $(QUIET_SUBDIR0)$(LK_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) liblk.a
-endif
-
-$(LIBLK)-clean:
-ifeq ($(subdir),)
-       $(QUIET_SUBDIR0)$(LK_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) clean
-endif
-
-help:
-       @echo 'Perf make targets:'
-       @echo '  doc            - make *all* documentation (see below)'
-       @echo '  man            - make manpage documentation (access with man <foo>)'
-       @echo '  html           - make html documentation'
-       @echo '  info           - make GNU info documentation (access with info <foo>)'
-       @echo '  pdf            - make pdf documentation'
-       @echo '  TAGS           - use etags to make tag information for source browsing'
-       @echo '  tags           - use ctags to make tag information for source browsing'
-       @echo '  cscope - use cscope to make interactive browsing database'
-       @echo ''
-       @echo 'Perf install targets:'
-       @echo '  NOTE: documentation build requires asciidoc, xmlto packages to be installed'
-       @echo '  HINT: use "make prefix=<path> <install target>" to install to a particular'
-       @echo '        path like make prefix=/usr/local install install-doc'
-       @echo '  install        - install compiled binaries'
-       @echo '  install-doc    - install *all* documentation'
-       @echo '  install-man    - install manpage documentation'
-       @echo '  install-html   - install html documentation'
-       @echo '  install-info   - install GNU info documentation'
-       @echo '  install-pdf    - install pdf documentation'
-       @echo ''
-       @echo '  quick-install-doc      - alias for quick-install-man'
-       @echo '  quick-install-man      - install the documentation quickly'
-       @echo '  quick-install-html     - install the html documentation quickly'
-       @echo ''
-       @echo 'Perf maintainer targets:'
-       @echo '  clean                  - clean all binary objects and build output'
-
-
-DOC_TARGETS := doc man html info pdf
-
-INSTALL_DOC_TARGETS := $(patsubst %,install-%,$(DOC_TARGETS)) try-install-man
-INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html
-
-# 'make doc' should call 'make -C Documentation all'
-$(DOC_TARGETS):
-       $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:doc=all)
-
-TAGS:
-       $(RM) TAGS
-       $(FIND) . -name '*.[hcS]' -print | xargs etags -a
-
-tags:
-       $(RM) tags
-       $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
-
-cscope:
-       $(RM) cscope*
-       $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
-
-### Detect prefix changes
-TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):\
-             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
-
-$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
-       @FLAGS='$(TRACK_CFLAGS)'; \
-           if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \
-               echo 1>&2 "    * new build flags or prefix"; \
-               echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
-            fi
-
-### Testing rules
-
-# GNU make supports exporting all variables by "export" without parameters.
-# However, the environment gets quite big, and some programs have problems
-# with that.
-
-check: $(OUTPUT)common-cmds.h
-       if sparse; \
-       then \
-               for i in *.c */*.c; \
-               do \
-                       sparse $(CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
-               done; \
-       else \
-               exit 1; \
-       fi
-
-### Installation rules
-
-install-bin: all
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-       $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-       $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-ifndef NO_LIBPERL
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
-       $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
-       $(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'
-       $(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
-endif
-ifndef NO_LIBPYTHON
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
-       $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'
-       $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'
-       $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
-endif
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'
-       $(INSTALL) bash_completion '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'
-       $(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'
-       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
-       $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
-
-install: install-bin try-install-man
-
-install-python_ext:
-       $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
-
-# 'make install-doc' should call 'make -C Documentation install'
-$(INSTALL_DOC_TARGETS):
-       $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:-doc=)
-
-### Cleaning rules
-
-clean: $(LIBTRACEEVENT)-clean $(LIBLK)-clean
-       $(RM) $(LIB_OBJS) $(BUILTIN_OBJS) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf.o $(LANG_BINDINGS)
-       $(RM) $(ALL_PROGRAMS) perf
-       $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
-       $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
-       $(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS
-       $(RM) $(OUTPUT)util/*-bison*
-       $(RM) $(OUTPUT)util/*-flex*
-       $(python-clean)
-
-.PHONY: all install clean strip $(LIBTRACEEVENT) $(LIBLK)
-.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+#
+# All other targets get passed through:
+#
+%:
+       $(print_msg)
+       $(make)
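
Typical invocations of the new wrapper, per the comments above (V=1 is one of
the knobs listed in Makefile.perf's header):

  $ make                        # parallel build, -jN detected from /proc/cpuinfo
  $ make JOBS=2                 # override the detected job count
  $ make -f Makefile.perf V=1   # bypass the wrapper for a non-standard build
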
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
new file mode 100644 (file)
index 0000000..326a26e
--- /dev/null
@@ -0,0 +1,888 @@
+include ../scripts/Makefile.include
+
+# The default target of this Makefile is...
+all:
+
+include config/utilities.mak
+
+# Define V to have a more verbose compile.
+#
+# Define O to save output files in a separate directory.
+#
+# Define ARCH as name of target architecture if you want cross-builds.
+#
+# Define CROSS_COMPILE as prefix name of compiler if you want cross-builds.
+#
+# Define NO_LIBPERL to disable perl script extension.
+#
+# Define NO_LIBPYTHON to disable python script extension.
+#
+# Define PYTHON to point to the python binary if the default
+# `python' is not correct; for example: PYTHON=python2
+#
+# Define PYTHON_CONFIG to point to the python-config binary if
+# the default `$(PYTHON)-config' is not correct.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define LDFLAGS=-static to build a static binary.
+#
+# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
+#
+# Define NO_DWARF if you do not want debug-info analysis feature at all.
+#
+# Define WERROR=0 to disable treating any warnings as errors.
+#
+# Define NO_NEWT if you do not want TUI support. (deprecated)
+#
+# Define NO_SLANG if you do not want TUI support.
+#
+# Define NO_GTK2 if you do not want GTK+ GUI support.
+#
+# Define NO_DEMANGLE if you do not want C++ symbol demangling.
+#
+# Define NO_LIBELF if you do not want libelf dependency (e.g. cross-builds)
+#
+# Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf
+# backtrace post unwind.
+#
+# Define NO_BACKTRACE if you do not want stack backtrace debug feature
+#
+# Define NO_LIBNUMA if you do not want numa perf benchmark
+#
+# Define NO_LIBAUDIT if you do not want libaudit support
+#
+# Define NO_LIBBIONIC if you do not want bionic support
+
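+# An illustrative combination of the switches above (values are examples,
+# not recommendations):
+#
+#   make -f Makefile.perf NO_GTK2=1 NO_LIBPERL=1 WERROR=0
+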
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
+ifneq ($(objtree),)
+#$(info Determined 'objtree' to be $(objtree))
+endif
+
+ifneq ($(OUTPUT),)
+#$(info Determined 'OUTPUT' to be $(OUTPUT))
+endif
+
+$(OUTPUT)PERF-VERSION-FILE: ../../.git/HEAD
+       @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
+       @touch $(OUTPUT)PERF-VERSION-FILE
+
+CC = $(CROSS_COMPILE)gcc
+AR = $(CROSS_COMPILE)ar
+
+RM      = rm -f
+LN      = ln -f
+MKDIR   = mkdir
+FIND    = find
+INSTALL = install
+FLEX    = flex
+BISON   = bison
+STRIP   = strip
+
+LK_DIR          = $(srctree)/tools/lib/lk/
+TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/
+
+# include config/Makefile by default and rule out
+# non-config cases
+config := 1
+
+NON_CONFIG_TARGETS := clean TAGS tags cscope help
+
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
+  config := 0
+endif
+endif
+
+ifeq ($(config),1)
+include config/Makefile
+endif
+
+export prefix bindir sharedir sysconfdir
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours.
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+# Guard against environment variables
+BUILTIN_OBJS =
+LIB_H =
+LIB_OBJS =
+GTK_OBJS =
+PYRF_OBJS =
+SCRIPT_SH =
+
+SCRIPT_SH += perf-archive.sh
+
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
+ifneq ($(OUTPUT),)
+  TE_PATH=$(OUTPUT)
+ifneq ($(subdir),)
+  LK_PATH=$(OUTPUT)/../lib/lk/
+else
+  LK_PATH=$(OUTPUT)
+endif
+else
+  TE_PATH=$(TRACE_EVENT_DIR)
+  LK_PATH=$(LK_DIR)
+endif
+
+LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
+export LIBTRACEEVENT
+
+LIBLK = $(LK_PATH)liblk.a
+export LIBLK
+
+# python extension build directories
+PYTHON_EXTBUILD     := $(OUTPUT)python_ext_build/
+PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
+PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/
+export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
+
+python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
+
+PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
+PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBLK)
+
+$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
+       $(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \
+         --quiet build_ext; \
+       mkdir -p $(OUTPUT)python && \
+       cp $(PYTHON_EXTBUILD_LIB)perf.so $(OUTPUT)python/
+#
+# No Perl scripts right now:
+#
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
+
+#
+# Single 'perf' binary right now:
+#
+PROGRAMS += $(OUTPUT)perf
+
+# what 'all' will build and 'install' will install, in perfexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = $(OUTPUT)perf
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+  SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+  PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
+       $(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) -t util/parse-events.l > $(OUTPUT)util/parse-events-flex.c
+
+$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
+       $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p parse_events_
+
+$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
+       $(QUIET_FLEX)$(FLEX) --header-file=$(OUTPUT)util/pmu-flex.h -t util/pmu.l > $(OUTPUT)util/pmu-flex.c
+
+$(OUTPUT)util/pmu-bison.c: util/pmu.y
+       $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_
+
+$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
+$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
+
+LIB_FILE=$(OUTPUT)libperf.a
+
+LIB_H += ../../include/uapi/linux/perf_event.h
+LIB_H += ../../include/linux/rbtree.h
+LIB_H += ../../include/linux/list.h
+LIB_H += ../../include/uapi/linux/const.h
+LIB_H += ../../include/linux/hash.h
+LIB_H += ../../include/linux/stringify.h
+LIB_H += util/include/linux/bitmap.h
+LIB_H += util/include/linux/bitops.h
+LIB_H += util/include/linux/compiler.h
+LIB_H += util/include/linux/const.h
+LIB_H += util/include/linux/ctype.h
+LIB_H += util/include/linux/kernel.h
+LIB_H += util/include/linux/list.h
+LIB_H += util/include/linux/export.h
+LIB_H += util/include/linux/magic.h
+LIB_H += util/include/linux/poison.h
+LIB_H += util/include/linux/prefetch.h
+LIB_H += util/include/linux/rbtree.h
+LIB_H += util/include/linux/rbtree_augmented.h
+LIB_H += util/include/linux/string.h
+LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
+LIB_H += util/include/asm/asm-offsets.h
+LIB_H += util/include/asm/bug.h
+LIB_H += util/include/asm/byteorder.h
+LIB_H += util/include/asm/hweight.h
+LIB_H += util/include/asm/swab.h
+LIB_H += util/include/asm/system.h
+LIB_H += util/include/asm/uaccess.h
+LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
+LIB_H += util/include/asm/unistd_32.h
+LIB_H += util/include/asm/unistd_64.h
+LIB_H += perf.h
+LIB_H += util/annotate.h
+LIB_H += util/cache.h
+LIB_H += util/callchain.h
+LIB_H += util/build-id.h
+LIB_H += util/debug.h
+LIB_H += util/sysfs.h
+LIB_H += util/pmu.h
+LIB_H += util/event.h
+LIB_H += util/evsel.h
+LIB_H += util/evlist.h
+LIB_H += util/exec_cmd.h
+LIB_H += util/types.h
+LIB_H += util/levenshtein.h
+LIB_H += util/machine.h
+LIB_H += util/map.h
+LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/xyarray.h
+LIB_H += util/header.h
+LIB_H += util/help.h
+LIB_H += util/session.h
+LIB_H += util/strbuf.h
+LIB_H += util/strlist.h
+LIB_H += util/strfilter.h
+LIB_H += util/svghelper.h
+LIB_H += util/tool.h
+LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
+LIB_H += util/dso.h
+LIB_H += util/symbol.h
+LIB_H += util/color.h
+LIB_H += util/values.h
+LIB_H += util/sort.h
+LIB_H += util/hist.h
+LIB_H += util/thread.h
+LIB_H += util/thread_map.h
+LIB_H += util/trace-event.h
+LIB_H += util/probe-finder.h
+LIB_H += util/dwarf-aux.h
+LIB_H += util/probe-event.h
+LIB_H += util/pstack.h
+LIB_H += util/cpumap.h
+LIB_H += util/top.h
+LIB_H += $(ARCH_INCLUDE)
+LIB_H += util/cgroup.h
+LIB_H += $(LIB_INCLUDE)traceevent/event-parse.h
+LIB_H += util/target.h
+LIB_H += util/rblist.h
+LIB_H += util/intlist.h
+LIB_H += util/perf_regs.h
+LIB_H += util/unwind.h
+LIB_H += util/vdso.h
+LIB_H += ui/helpline.h
+LIB_H += ui/progress.h
+LIB_H += ui/util.h
+LIB_H += ui/ui.h
+
+LIB_OBJS += $(OUTPUT)util/abspath.o
+LIB_OBJS += $(OUTPUT)util/alias.o
+LIB_OBJS += $(OUTPUT)util/annotate.o
+LIB_OBJS += $(OUTPUT)util/build-id.o
+LIB_OBJS += $(OUTPUT)util/config.o
+LIB_OBJS += $(OUTPUT)util/ctype.o
+LIB_OBJS += $(OUTPUT)util/sysfs.o
+LIB_OBJS += $(OUTPUT)util/pmu.o
+LIB_OBJS += $(OUTPUT)util/environment.o
+LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evlist.o
+LIB_OBJS += $(OUTPUT)util/evsel.o
+LIB_OBJS += $(OUTPUT)util/exec_cmd.o
+LIB_OBJS += $(OUTPUT)util/help.o
+LIB_OBJS += $(OUTPUT)util/levenshtein.o
+LIB_OBJS += $(OUTPUT)util/parse-options.o
+LIB_OBJS += $(OUTPUT)util/parse-events.o
+LIB_OBJS += $(OUTPUT)util/path.o
+LIB_OBJS += $(OUTPUT)util/rbtree.o
+LIB_OBJS += $(OUTPUT)util/bitmap.o
+LIB_OBJS += $(OUTPUT)util/hweight.o
+LIB_OBJS += $(OUTPUT)util/run-command.o
+LIB_OBJS += $(OUTPUT)util/quote.o
+LIB_OBJS += $(OUTPUT)util/strbuf.o
+LIB_OBJS += $(OUTPUT)util/string.o
+LIB_OBJS += $(OUTPUT)util/strlist.o
+LIB_OBJS += $(OUTPUT)util/strfilter.o
+LIB_OBJS += $(OUTPUT)util/top.o
+LIB_OBJS += $(OUTPUT)util/usage.o
+LIB_OBJS += $(OUTPUT)util/wrapper.o
+LIB_OBJS += $(OUTPUT)util/sigchain.o
+LIB_OBJS += $(OUTPUT)util/dso.o
+LIB_OBJS += $(OUTPUT)util/symbol.o
+LIB_OBJS += $(OUTPUT)util/symbol-elf.o
+LIB_OBJS += $(OUTPUT)util/color.o
+LIB_OBJS += $(OUTPUT)util/pager.o
+LIB_OBJS += $(OUTPUT)util/header.o
+LIB_OBJS += $(OUTPUT)util/callchain.o
+LIB_OBJS += $(OUTPUT)util/values.o
+LIB_OBJS += $(OUTPUT)util/debug.o
+LIB_OBJS += $(OUTPUT)util/machine.o
+LIB_OBJS += $(OUTPUT)util/map.o
+LIB_OBJS += $(OUTPUT)util/pstack.o
+LIB_OBJS += $(OUTPUT)util/session.o
+LIB_OBJS += $(OUTPUT)util/thread.o
+LIB_OBJS += $(OUTPUT)util/thread_map.o
+LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
+LIB_OBJS += $(OUTPUT)util/parse-events-flex.o
+LIB_OBJS += $(OUTPUT)util/parse-events-bison.o
+LIB_OBJS += $(OUTPUT)util/pmu-flex.o
+LIB_OBJS += $(OUTPUT)util/pmu-bison.o
+LIB_OBJS += $(OUTPUT)util/trace-event-read.o
+LIB_OBJS += $(OUTPUT)util/trace-event-info.o
+LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o
+LIB_OBJS += $(OUTPUT)util/svghelper.o
+LIB_OBJS += $(OUTPUT)util/sort.o
+LIB_OBJS += $(OUTPUT)util/hist.o
+LIB_OBJS += $(OUTPUT)util/probe-event.o
+LIB_OBJS += $(OUTPUT)util/util.o
+LIB_OBJS += $(OUTPUT)util/xyarray.o
+LIB_OBJS += $(OUTPUT)util/cpumap.o
+LIB_OBJS += $(OUTPUT)util/cgroup.o
+LIB_OBJS += $(OUTPUT)util/target.o
+LIB_OBJS += $(OUTPUT)util/rblist.o
+LIB_OBJS += $(OUTPUT)util/intlist.o
+LIB_OBJS += $(OUTPUT)util/vdso.o
+LIB_OBJS += $(OUTPUT)util/stat.o
+LIB_OBJS += $(OUTPUT)util/record.o
+LIB_OBJS += $(OUTPUT)util/srcline.o
+LIB_OBJS += $(OUTPUT)util/data.o
+
+LIB_OBJS += $(OUTPUT)ui/setup.o
+LIB_OBJS += $(OUTPUT)ui/helpline.o
+LIB_OBJS += $(OUTPUT)ui/progress.o
+LIB_OBJS += $(OUTPUT)ui/util.o
+LIB_OBJS += $(OUTPUT)ui/hist.o
+LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
+
+LIB_OBJS += $(OUTPUT)arch/common.o
+
+LIB_OBJS += $(OUTPUT)tests/parse-events.o
+LIB_OBJS += $(OUTPUT)tests/dso-data.o
+LIB_OBJS += $(OUTPUT)tests/attr.o
+LIB_OBJS += $(OUTPUT)tests/vmlinux-kallsyms.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall-all-cpus.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall-tp-fields.o
+LIB_OBJS += $(OUTPUT)tests/mmap-basic.o
+LIB_OBJS += $(OUTPUT)tests/perf-record.o
+LIB_OBJS += $(OUTPUT)tests/rdpmc.o
+LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o
+LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o
+LIB_OBJS += $(OUTPUT)tests/pmu.o
+LIB_OBJS += $(OUTPUT)tests/hists_link.o
+LIB_OBJS += $(OUTPUT)tests/python-use.o
+LIB_OBJS += $(OUTPUT)tests/bp_signal.o
+LIB_OBJS += $(OUTPUT)tests/bp_signal_overflow.o
+LIB_OBJS += $(OUTPUT)tests/task-exit.o
+LIB_OBJS += $(OUTPUT)tests/sw-clock.o
+ifeq ($(ARCH),x86)
+LIB_OBJS += $(OUTPUT)tests/perf-time-to-tsc.o
+endif
+LIB_OBJS += $(OUTPUT)tests/code-reading.o
+LIB_OBJS += $(OUTPUT)tests/sample-parsing.o
+LIB_OBJS += $(OUTPUT)tests/parse-no-sample-id-all.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
+BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
+# Benchmark modules
+BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
+BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
+endif
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
+BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
+BUILTIN_OBJS += $(OUTPUT)builtin-help.o
+BUILTIN_OBJS += $(OUTPUT)builtin-sched.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o
+BUILTIN_OBJS += $(OUTPUT)builtin-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-record.o
+BUILTIN_OBJS += $(OUTPUT)builtin-report.o
+BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
+BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
+BUILTIN_OBJS += $(OUTPUT)builtin-top.o
+BUILTIN_OBJS += $(OUTPUT)builtin-script.o
+BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
+BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
+BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
+BUILTIN_OBJS += $(OUTPUT)tests/builtin-test.o
+BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
+
+PERFLIBS = $(LIB_FILE) $(LIBLK) $(LIBTRACEEVENT)
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+-include arch/$(ARCH)/Makefile
+
+ifneq ($(OUTPUT),)
+  CFLAGS += -I$(OUTPUT)
+endif
+
+ifdef NO_LIBELF
+EXTLIBS := $(filter-out -lelf,$(EXTLIBS))
+
+# Remove ELF/DWARF dependent codes
+LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS))
+
+BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS))
+
+# Use minimal symbol handling
+LIB_OBJS += $(OUTPUT)util/symbol-minimal.o
+
+else # NO_LIBELF
+ifndef NO_DWARF
+  LIB_OBJS += $(OUTPUT)util/probe-finder.o
+  LIB_OBJS += $(OUTPUT)util/dwarf-aux.o
+endif # NO_DWARF
+endif # NO_LIBELF
+
+ifndef NO_LIBUNWIND
+  LIB_OBJS += $(OUTPUT)util/unwind.o
+endif
+LIB_OBJS += $(OUTPUT)tests/keep-tracking.o
+
+ifndef NO_LIBAUDIT
+  BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+endif
+
+ifndef NO_SLANG
+  LIB_OBJS += $(OUTPUT)ui/browser.o
+  LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o
+  LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
+  LIB_OBJS += $(OUTPUT)ui/browsers/map.o
+  LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o
+  LIB_OBJS += $(OUTPUT)ui/tui/setup.o
+  LIB_OBJS += $(OUTPUT)ui/tui/util.o
+  LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
+  LIB_OBJS += $(OUTPUT)ui/tui/progress.o
+  LIB_H += ui/browser.h
+  LIB_H += ui/browsers/map.h
+  LIB_H += ui/keysyms.h
+  LIB_H += ui/libslang.h
+endif
+
+ifndef NO_GTK2
+  ALL_PROGRAMS += $(OUTPUT)libperf-gtk.so
+
+  GTK_OBJS += $(OUTPUT)ui/gtk/browser.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/hists.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/setup.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/util.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/helpline.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/progress.o
+  GTK_OBJS += $(OUTPUT)ui/gtk/annotate.o
+
+install-gtk: $(OUTPUT)libperf-gtk.so
+       $(call QUIET_INSTALL, 'GTK UI') \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(libdir_SQ)'; \
+               $(INSTALL) $(OUTPUT)libperf-gtk.so '$(DESTDIR_SQ)$(libdir_SQ)'
+endif
+
+ifndef NO_LIBPERL
+  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
+  LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
+endif
+
+ifndef NO_LIBPYTHON
+  LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
+  LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
+endif
+
+ifeq ($(NO_PERF_REGS),0)
+  ifeq ($(ARCH),x86)
+    LIB_H += arch/x86/include/perf_regs.h
+  endif
+endif
+
+ifndef NO_LIBNUMA
+  BUILTIN_OBJS += $(OUTPUT)bench/numa.o
+endif
+
+ifdef ASCIIDOC8
+  export ASCIIDOC8
+endif
+
+LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive -Wl,--start-group $(EXTLIBS) -Wl,--end-group
+
+export INSTALL SHELL_PATH
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+       @$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) $(OUTPUT)perf
+       $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
+
+$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \
+               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+               $(CFLAGS) -c $(filter %.c,$^) -o $@
+
+$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OUTPUT)perf.o \
+               $(BUILTIN_OBJS) $(LIBS) -o $@
+
+$(GTK_OBJS): $(OUTPUT)%.o: %.c $(LIB_H)
+       $(QUIET_CC)$(CC) -o $@ -c -fPIC $(CFLAGS) $(GTK_CFLAGS) $<
+
+$(OUTPUT)libperf-gtk.so: $(GTK_OBJS) $(PERFLIBS)
+       $(QUIET_LINK)$(CC) -o $@ -shared $(ALL_LDFLAGS) $(filter %.o,$^) $(GTK_LIBS)
+
+$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+               '-DPERF_MAN_PATH="$(mandir_SQ)"' \
+               '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+               '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+               '-DPERF_MAN_PATH="$(mandir_SQ)"' \
+               '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
+       $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(SCRIPTS) : % : %.sh
+       $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
+
+# These can record PERF_VERSION
+$(OUTPUT)perf.o perf.spec \
+       $(SCRIPTS) \
+       : $(OUTPUT)PERF-VERSION-FILE
+
+.SUFFIXES:
+
+#
+# If a target does not match any of the later rules, then prefix it with $(OUTPUT).
+# This makes targets like 'make O=/tmp/perf perf.o' work in a natural way.
+#
+ifneq ($(OUTPUT),)
+%.o: $(OUTPUT)%.o
+       @echo "    # Redirected target $@ => $(OUTPUT)$@"
+util/%.o: $(OUTPUT)util/%.o
+       @echo "    # Redirected target $@ => $(OUTPUT)$@"
+bench/%.o: $(OUTPUT)bench/%.o
+       @echo "    # Redirected target $@ => $(OUTPUT)$@"
+tests/%.o: $(OUTPUT)tests/%.o
+       @echo "    # Redirected target $@ => $(OUTPUT)$@"
+endif
+
+# These two need to be here so that when O= is not used they take precedence
+# over the general rule for .o
+
+$(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -w $<
+
+$(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $<
+
+$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
+$(OUTPUT)%.i: %.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
+$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -S $(CFLAGS) $<
+$(OUTPUT)%.o: %.S
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
+$(OUTPUT)%.s: %.S
+       $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
+
+$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+               '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
+               '-DPREFIX="$(prefix_SQ)"' \
+               $<
+
+$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+               '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \
+               $<
+
+$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+               -DPYTHONPATH='"$(OUTPUT)python"' \
+               -DPYTHON='"$(PYTHON_WORD)"' \
+               $<
+
+$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+$(OUTPUT)ui/setup.o: ui/setup.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DLIBDIR='"$(libdir_SQ)"' $<
+
+$(OUTPUT)ui/browser.o: ui/browser.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/annotate.o: ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/hists.o: ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/map.o: ui/browsers/map.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/scripts.o: ui/browsers/scripts.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+$(OUTPUT)util/parse-events.o: util/parse-events.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-redundant-decls $<
+
+$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default $<
+
+$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default $<
+
+$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
+
+$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+
+$(OUTPUT)perf-%: %.o $(PERFLIBS)
+       $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $(LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+
+# We compile into subdirectories. If the target directory is not the source
+# directory, the subdirectories might not exist, so we make the various files
+# depend on their directories.
+DIRECTORY_DEPS = $(LIB_OBJS) $(BUILTIN_OBJS) $(GTK_OBJS)
+DIRECTORY_DEPS += $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h
+$(DIRECTORY_DEPS): | $(sort $(dir $(DIRECTORY_DEPS)))
+# In the second step, we make a rule to actually create these directories
+$(sort $(dir $(DIRECTORY_DEPS))):
+       $(QUIET_MKDIR)$(MKDIR) -p $@ 2>/dev/null
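
(Editorial aside.) The "|" in the rule above makes the output directories order-only prerequisites: make creates them before building the objects, but a directory's ever-changing timestamp never makes the objects stale. The pattern in isolation (paths hypothetical; recipe lines must be tab-indented in a real Makefile):

    build/foo.o: foo.c | build
            $(CC) -c -o $@ $<
    build:
            mkdir -p $@
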
+
+$(LIB_FILE): $(LIB_OBJS)
+       $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+# libtraceevent.a
+TE_SOURCES = $(wildcard $(TRACE_EVENT_DIR)*.[ch])
+
+$(LIBTRACEEVENT): $(TE_SOURCES)
+       $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libtraceevent.a
+
+$(LIBTRACEEVENT)-clean:
+       $(call QUIET_CLEAN, libtraceevent)
+       @$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
+
+LIBLK_SOURCES = $(wildcard $(LK_PATH)*.[ch])
+
+# If subdir is set, we have been called from the parent Makefile, so the
+# target has been built already.
+$(LIBLK): $(LIBLK_SOURCES)
+ifeq ($(subdir),)
+       $(QUIET_SUBDIR0)$(LK_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) liblk.a
+endif
+
+$(LIBLK)-clean:
+ifeq ($(subdir),)
+       $(call QUIET_CLEAN, liblk)
+       @$(MAKE) -C $(LK_DIR) O=$(OUTPUT) clean >/dev/null
+endif
+
+help:
+       @echo 'Perf make targets:'
+       @echo '  doc            - make *all* documentation (see below)'
+       @echo '  man            - make manpage documentation (access with man <foo>)'
+       @echo '  html           - make html documentation'
+       @echo '  info           - make GNU info documentation (access with info <foo>)'
+       @echo '  pdf            - make pdf documentation'
+       @echo '  TAGS           - use etags to make tag information for source browsing'
+       @echo '  tags           - use ctags to make tag information for source browsing'
+       @echo '  cscope         - use cscope to make interactive browsing database'
+       @echo ''
+       @echo 'Perf install targets:'
+       @echo '  NOTE: building the documentation requires the asciidoc and xmlto packages'
+       @echo '  HINT: use "make prefix=<path> <install target>" to install to a particular'
+       @echo '        path like make prefix=/usr/local install install-doc'
+       @echo '  install        - install compiled binaries'
+       @echo '  install-doc    - install *all* documentation'
+       @echo '  install-man    - install manpage documentation'
+       @echo '  install-html   - install html documentation'
+       @echo '  install-info   - install GNU info documentation'
+       @echo '  install-pdf    - install pdf documentation'
+       @echo ''
+       @echo '  quick-install-doc      - alias for quick-install-man'
+       @echo '  quick-install-man      - install the documentation quickly'
+       @echo '  quick-install-html     - install the html documentation quickly'
+       @echo ''
+       @echo 'Perf maintainer targets:'
+       @echo '  clean                  - clean all binary objects and build output'
+
+
+DOC_TARGETS := doc man html info pdf
+
+INSTALL_DOC_TARGETS := $(patsubst %,install-%,$(DOC_TARGETS)) try-install-man
+INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html
+
+# 'make doc' should call 'make -C Documentation all'
+$(DOC_TARGETS):
+       $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:doc=all)
+
+TAGS:
+       $(RM) TAGS
+       $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+       $(RM) tags
+       $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+       $(RM) cscope*
+       $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):\
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
+       @FLAGS='$(TRACK_CFLAGS)'; \
+           if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \
+               echo 1>&2 "  FLAGS:   * new build flags or prefix"; \
+               echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
+            fi
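
(Editorial aside.) PERF-CFLAGS acts as a flag stamp: .FORCE-PERF-CFLAGS is phony (see the .PHONY line at the end of the file), so the recipe runs on every invocation, but the stamp file is rewritten only when TRACK_CFLAGS differs from its recorded contents — and only then do objects depending on $(OUTPUT)PERF-CFLAGS rebuild. The pattern reduced to its core (FLAGS-STAMP is hypothetical, and unlike the real rule this sketch does not escape quotes inside the flags; recipes tab-indented):

    FLAGS-STAMP: .FORCE
            @if test x"$(CFLAGS)" != x"`cat FLAGS-STAMP 2>/dev/null`"; then \
                    echo "$(CFLAGS)" > FLAGS-STAMP; \
            fi
    %.o: %.c FLAGS-STAMP
            $(CC) $(CFLAGS) -c -o $@ $<
    .PHONY: .FORCE
    .FORCE:
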
+
+### Testing rules
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
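
(Editorial aside.) In other words, a parameterless export directive would copy every make variable — including the large LIB_H and LIB_OBJS lists above — into the environment of each spawned compiler and shell, so the Makefile exports only what child processes actually need:

    #export                      # would export *all* variables to children
    export INSTALL SHELL_PATH    # selective: only what children need
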
+
+check: $(OUTPUT)common-cmds.h
+       if sparse; \
+       then \
+               for i in *.c */*.c; \
+               do \
+                       sparse $(CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+               done; \
+       else \
+               exit 1; \
+       fi
+
+### Installation rules
+
+install-gtk:
+
+install-bin: all install-gtk
+       $(call QUIET_INSTALL, binaries) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'; \
+               $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'; \
+               $(LN) '$(DESTDIR_SQ)$(bindir_SQ)/perf' '$(DESTDIR_SQ)$(bindir_SQ)/trace'
+       $(call QUIET_INSTALL, libexec) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+       $(call QUIET_INSTALL, perf-archive) \
+               $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifndef NO_LIBPERL
+       $(call QUIET_INSTALL, perl-scripts) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \
+               $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \
+               $(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'; \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'; \
+               $(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
+endif
+ifndef NO_LIBPYTHON
+       $(call QUIET_INSTALL, python-scripts) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'; \
+               $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \
+               $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'; \
+               $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
+endif
+       $(call QUIET_INSTALL, bash_completion-script) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \
+               $(INSTALL) bash_completion '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
+       $(call QUIET_INSTALL, tests) \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
+               $(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
+               $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
+               $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
+
+install: install-bin try-install-man
+
+install-python_ext:
+       $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
+
+# 'make install-doc' should call 'make -C Documentation install'
+$(INSTALL_DOC_TARGETS):
+       $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:-doc=)
+
+### Cleaning rules
+
+#
+# This is here, not in config/Makefile, because config/Makefile does
+# not get included for the clean target:
+#
+config-clean:
+       $(call QUIET_CLEAN, config)
+       @$(MAKE) -C config/feature-checks clean >/dev/null
+
+clean: $(LIBTRACEEVENT)-clean $(LIBLK)-clean config-clean
+       $(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_OBJS) $(BUILTIN_OBJS) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf.o $(LANG_BINDINGS) $(GTK_OBJS)
+       $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf
+       $(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex*
+       $(call QUIET_CLEAN, Documentation)
+       @$(MAKE) -C Documentation O=$(OUTPUT) clean >/dev/null
+       $(python-clean)
+
+#
+# Trick: if ../../.git does not exist (for example, when building out of tree),
+# then force version regeneration:
+#
+ifeq ($(wildcard ../../.git/HEAD),)
+    GIT-HEAD-PHONY = ../../.git/HEAD
+else
+    GIT-HEAD-PHONY =
+endif
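
(Editorial aside.) A .PHONY prerequisite is always considered out of date, so when ../../.git/HEAD is missing (an out-of-tree build), naming that nonexistent path in .PHONY (see the .PHONY line below) forces any rule elsewhere in the Makefile that depends on ../../.git/HEAD to rerun, regenerating the version file unconditionally. The trick in isolation (paths hypothetical):

    ifeq ($(wildcard .git/HEAD),)
        GIT-HEAD-PHONY = .git/HEAD     # missing file: treat as always stale
    else
        GIT-HEAD-PHONY =               # real file: normal timestamp tracking
    endif
    .PHONY: $(GIT-HEAD-PHONY)
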
+
+.PHONY: all install clean config-clean strip install-gtk
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope .FORCE-PERF-CFLAGS
+
index 7fcdcdbee917a8473acf25fe7ac72c6c5a9732b0..e84ca76aae779ef484df1633b7cff3e35b327ba8 100644 (file)
@@ -5,7 +5,7 @@
 #include "../../util/types.h"
 #include <asm/perf_regs.h>
 
-#ifndef ARCH_X86_64
+#ifndef HAVE_ARCH_X86_64_SUPPORT
 #define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1)
 #else
 #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
@@ -52,7 +52,7 @@ static inline const char *perf_reg_name(int id)
                return "FS";
        case PERF_REG_X86_GS:
                return "GS";
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
        case PERF_REG_X86_R8:
                return "R8";
        case PERF_REG_X86_R9:
@@ -69,7 +69,7 @@ static inline const char *perf_reg_name(int id)
                return "R14";
        case PERF_REG_X86_R15:
                return "R15";
-#endif /* ARCH_X86_64 */
+#endif /* HAVE_ARCH_X86_64_SUPPORT */
        default:
                return NULL;
        }
index 78d956eff96fcd8de79a5afd42041c6c72fb02ec..456a88cf5b374bf3319ea2310e88f2cd179c9b28 100644 (file)
@@ -4,7 +4,7 @@
 #include "perf_regs.h"
 #include "../../util/unwind.h"
 
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 int unwind__arch_reg_id(int regnum)
 {
        int id;
@@ -108,4 +108,4 @@ int unwind__arch_reg_id(int regnum)
 
        return id;
 }
-#endif /* ARCH_X86_64 */
+#endif /* HAVE_ARCH_X86_64_SUPPORT */
index 56e6a12aab59b42d4cffbc67858824af818f5fff..62e157db2e2b01e9400f9348d671fd9ba7267104 100644 (file)
@@ -1,17 +1,87 @@
 # perf completion
 
-function_exists()
+# Taken from git.git's completion script.
+__my_reassemble_comp_words_by_ref()
 {
-       declare -F $1 > /dev/null
-       return $?
+       local exclude i j first
+       # Which word separators to exclude?
+       exclude="${1//[^$COMP_WORDBREAKS]}"
+       cword_=$COMP_CWORD
+       if [ -z "$exclude" ]; then
+               words_=("${COMP_WORDS[@]}")
+               return
+       fi
+       # List of word completion separators has shrunk;
+       # re-assemble words to complete.
+       for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do
+               # Append each nonempty word consisting of just
+               # word separator characters to the current word.
+               first=t
+               while
+                       [ $i -gt 0 ] &&
+                       [ -n "${COMP_WORDS[$i]}" ] &&
+                       # word consists of excluded word separators
+                       [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ]
+               do
+                       # Attach to the previous token,
+                       # unless the previous token is the command name.
+                       if [ $j -ge 2 ] && [ -n "$first" ]; then
+                               ((j--))
+                       fi
+                       first=
+                       words_[$j]=${words_[j]}${COMP_WORDS[i]}
+                       if [ $i = $COMP_CWORD ]; then
+                               cword_=$j
+                       fi
+                       if (($i < ${#COMP_WORDS[@]} - 1)); then
+                               ((i++))
+                       else
+                               # Done.
+                               return
+                       fi
+               done
+               words_[$j]=${words_[j]}${COMP_WORDS[i]}
+               if [ $i = $COMP_CWORD ]; then
+                       cword_=$j
+               fi
+       done
 }
 
-function_exists __ltrim_colon_completions ||
+type _get_comp_words_by_ref &>/dev/null ||
+_get_comp_words_by_ref()
+{
+       local exclude cur_ words_ cword_
+       if [ "$1" = "-n" ]; then
+               exclude=$2
+               shift 2
+       fi
+       __my_reassemble_comp_words_by_ref "$exclude"
+       cur_=${words_[cword_]}
+       while [ $# -gt 0 ]; do
+               case "$1" in
+               cur)
+                       cur=$cur_
+                       ;;
+               prev)
+                       prev=${words_[$cword_-1]}
+                       ;;
+               words)
+                       words=("${words_[@]}")
+                       ;;
+               cword)
+                       cword=$cword_
+                       ;;
+               esac
+               shift
+       done
+}
+
+type __ltrim_colon_completions &>/dev/null ||
 __ltrim_colon_completions()
 {
        if [[ "$1" == *:* && "$COMP_WORDBREAKS" == *:* ]]; then
                # Remove colon-word prefix from COMPREPLY items
-               local colon_word=${1%${1##*:}}
+               local colon_word=${1%"${1##*:}"}
                local i=${#COMPREPLY[*]}
                while [[ $((--i)) -ge 0 ]]; do
                        COMPREPLY[$i]=${COMPREPLY[$i]#"$colon_word"}
@@ -19,23 +89,18 @@ __ltrim_colon_completions()
        fi
 }
 
-have perf &&
+type perf &>/dev/null &&
 _perf()
 {
-       local cur prev cmd
+       local cur words cword prev cmd
 
        COMPREPLY=()
-       if function_exists _get_comp_words_by_ref; then
-               _get_comp_words_by_ref -n : cur prev
-       else
-               cur=$(_get_cword :)
-               prev=${COMP_WORDS[COMP_CWORD-1]}
-       fi
+       _get_comp_words_by_ref -n =: cur words cword prev
 
-       cmd=${COMP_WORDS[0]}
+       cmd=${words[0]}
 
        # List perf subcommands or long options
-       if [ $COMP_CWORD -eq 1 ]; then
+       if [ $cword -eq 1 ]; then
                if [[ $cur == --* ]]; then
                        COMPREPLY=( $( compgen -W '--help --version \
                        --exec-path --html-path --paginate --no-pager \
@@ -45,18 +110,17 @@ _perf()
                        COMPREPLY=( $( compgen -W '$cmds' -- "$cur" ) )
                fi
        # List possible events for -e option
-       elif [[ $prev == "-e" && "${COMP_WORDS[1]}" == @(record|stat|top) ]]; then
+       elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then
                evts=$($cmd list --raw-dump)
                COMPREPLY=( $( compgen -W '$evts' -- "$cur" ) )
                __ltrim_colon_completions $cur
        # List long option names
        elif [[ $cur == --* ]];  then
-               subcmd=${COMP_WORDS[1]}
+               subcmd=${words[1]}
                opts=$($cmd $subcmd --list-opts)
                COMPREPLY=( $( compgen -W '$opts' -- "$cur" ) )
-       # Fall down to list regular files
-       else
-               _filedir
        fi
 } &&
-complete -F _perf perf
+
+complete -o bashdefault -o default -o nospace -F _perf perf 2>/dev/null \
+       || complete -o default -o nospace -F _perf perf
index a72e36cb53943c1a9689b8ce6ee35abc467ccb1d..57b4ed871459034b57159bb4eb21dc27e88e87ba 100644 (file)
@@ -1,5 +1,5 @@
 
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
 #define MEMCPY_FN(fn, name, desc)              \
        extern void *fn(void *, const void *, size_t);
index 8cdca43016b250109d8bdd3ea7568eb13229a125..5ce71d3b72cf9bc16251ba5e7ac81ad0c61560b6 100644 (file)
@@ -58,7 +58,7 @@ struct routine routines[] = {
        { "default",
          "Default memcpy() provided by glibc",
          memcpy },
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
 #define MEMCPY_FN(fn, name, desc) { name, desc, fn },
 #include "mem-memcpy-x86-64-asm-def.h"
index a040fa77665b0ef90eb66ba3901074f274536750..633800cb0dcb0ad60309c1a12003f42d38e64b08 100644 (file)
@@ -1,5 +1,5 @@
 
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
 #define MEMSET_FN(fn, name, desc)              \
        extern void *fn(void *, int, size_t);
index 4a2f12081964163c3f93bede8934c723806d16f2..9af79d2b18e5a178c97adfe6ddcaecda236dcfc6 100644 (file)
@@ -58,7 +58,7 @@ static const struct routine routines[] = {
        { "default",
          "Default memset() provided by glibc",
          memset },
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
 #define MEMSET_FN(fn, name, desc) { name, desc, fn },
 #include "mem-memset-x86-64-asm-def.h"
index 30d1c3225b46222d52f3d73b0cdb1fa1995e8efa..d4c83c60b9b29fa7d6249de70e5f0673d2066287 100644 (file)
@@ -429,14 +429,14 @@ static int parse_cpu_list(const char *arg)
        return 0;
 }
 
-static void parse_setup_cpu_list(void)
+static int parse_setup_cpu_list(void)
 {
        struct thread_data *td;
        char *str0, *str;
        int t;
 
        if (!g->p.cpu_list_str)
-               return;
+               return 0;
 
        dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
 
@@ -500,8 +500,12 @@ static void parse_setup_cpu_list(void)
 
                dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
 
-               BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus);
-               BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus);
+               if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
+                       printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
+                       return -1;
+               }
+
+               BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
                BUG_ON(bind_cpu_0 > bind_cpu_1);
 
                for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
@@ -541,6 +545,7 @@ out:
                printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
 
        free(str0);
+       return 0;
 }
 
 static int parse_cpus_opt(const struct option *opt __maybe_unused,
@@ -561,14 +566,14 @@ static int parse_node_list(const char *arg)
        return 0;
 }
 
-static void parse_setup_node_list(void)
+static int parse_setup_node_list(void)
 {
        struct thread_data *td;
        char *str0, *str;
        int t;
 
        if (!g->p.node_list_str)
-               return;
+               return 0;
 
        dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
 
@@ -619,8 +624,12 @@ static void parse_setup_node_list(void)
 
                dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
 
-               BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes);
-               BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes);
+               if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
+                       printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
+                       return -1;
+               }
+
+               BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
                BUG_ON(bind_node_0 > bind_node_1);
 
                for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
@@ -651,6 +660,7 @@ out:
                printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
 
        free(str0);
+       return 0;
 }
 
 static int parse_nodes_opt(const struct option *opt __maybe_unused,
@@ -1110,7 +1120,7 @@ static void *worker_thread(void *__tdata)
                /* Check whether our max runtime timed out: */
                if (g->p.nr_secs) {
                        timersub(&stop, &start0, &diff);
-                       if (diff.tv_sec >= g->p.nr_secs) {
+                       if ((u32)diff.tv_sec >= g->p.nr_secs) {
                                g->stop_work = true;
                                break;
                        }
@@ -1157,7 +1167,7 @@ static void *worker_thread(void *__tdata)
                        runtime_ns_max += diff.tv_usec * 1000;
 
                        if (details >= 0) {
-                               printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n",
+                               printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
                                        process_nr, thread_nr, runtime_ns_max / bytes_done, val);
                        }
                        fflush(stdout);
@@ -1356,8 +1366,8 @@ static int init(void)
        init_thread_data();
 
        tprintf("#\n");
-       parse_setup_cpu_list();
-       parse_setup_node_list();
+       if (parse_setup_cpu_list() || parse_setup_node_list())
+               return -1;
        tprintf("#\n");
 
        print_summary();
@@ -1600,7 +1610,6 @@ static int run_bench_numa(const char *name, const char **argv)
        return 0;
 
 err:
-       usage_with_options(numa_usage, options);
        return -1;
 }
 
@@ -1701,8 +1710,7 @@ static int bench_all(void)
        BUG_ON(ret < 0);
 
        for (i = 0; i < nr; i++) {
-               if (run_bench_numa(tests[i][0], tests[i] + 1))
-                       return -1;
+               run_bench_numa(tests[i][0], tests[i] + 1);
        }
 
        printf("\n");
index 69cfba8d4c6cfb5a0bd402735d3eaa5bfdd7a467..07a8d7646a1549c61699f7311285238ef5ef93cf 100644 (file)
@@ -7,9 +7,7 @@
  * Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com>
  *  http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c
  * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
- *
  */
-
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/parse-options.h"
 #include <sys/time.h>
 #include <sys/types.h>
 
+#include <pthread.h>
+
+struct thread_data {
+       int                     nr;
+       int                     pipe_read;
+       int                     pipe_write;
+       pthread_t               pthread;
+};
+
 #define LOOPS_DEFAULT 1000000
-static int loops = LOOPS_DEFAULT;
+static int                     loops = LOOPS_DEFAULT;
+
+/* Use processes by default: */
+static bool                    threaded;
 
 static const struct option options[] = {
-       OPT_INTEGER('l', "loop", &loops,
-                   "Specify number of loops"),
+       OPT_INTEGER('l', "loop",        &loops,         "Specify number of loops"),
+       OPT_BOOLEAN('T', "threaded",    &threaded,      "Specify threads/process based task setup"),
        OPT_END()
 };
 
@@ -42,13 +52,37 @@ static const char * const bench_sched_pipe_usage[] = {
        NULL
 };
 
-int bench_sched_pipe(int argc, const char **argv,
-                    const char *prefix __maybe_unused)
+static void *worker_thread(void *__tdata)
 {
-       int pipe_1[2], pipe_2[2];
+       struct thread_data *td = __tdata;
        int m = 0, i;
+       int ret;
+
+       for (i = 0; i < loops; i++) {
+               if (!td->nr) {
+                       ret = read(td->pipe_read, &m, sizeof(int));
+                       BUG_ON(ret != sizeof(int));
+                       ret = write(td->pipe_write, &m, sizeof(int));
+                       BUG_ON(ret != sizeof(int));
+               } else {
+                       ret = write(td->pipe_write, &m, sizeof(int));
+                       BUG_ON(ret != sizeof(int));
+                       ret = read(td->pipe_read, &m, sizeof(int));
+                       BUG_ON(ret != sizeof(int));
+               }
+       }
+
+       return NULL;
+}
+
+int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+       struct thread_data threads[2], *td;
+       int pipe_1[2], pipe_2[2];
        struct timeval start, stop, diff;
        unsigned long long result_usec = 0;
+       int nr_threads = 2;
+       int t;
 
        /*
         * why does "ret" exist?
@@ -58,43 +92,66 @@ int bench_sched_pipe(int argc, const char **argv,
        int __maybe_unused ret, wait_stat;
        pid_t pid, retpid __maybe_unused;
 
-       argc = parse_options(argc, argv, options,
-                            bench_sched_pipe_usage, 0);
+       argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0);
 
        BUG_ON(pipe(pipe_1));
        BUG_ON(pipe(pipe_2));
 
-       pid = fork();
-       assert(pid >= 0);
-
        gettimeofday(&start, NULL);
 
-       if (!pid) {
-               for (i = 0; i < loops; i++) {
-                       ret = read(pipe_1[0], &m, sizeof(int));
-                       ret = write(pipe_2[1], &m, sizeof(int));
-               }
-       } else {
-               for (i = 0; i < loops; i++) {
-                       ret = write(pipe_1[1], &m, sizeof(int));
-                       ret = read(pipe_2[0], &m, sizeof(int));
+       for (t = 0; t < nr_threads; t++) {
+               td = threads + t;
+
+               td->nr = t;
+
+               if (t == 0) {
+                       td->pipe_read = pipe_1[0];
+                       td->pipe_write = pipe_2[1];
+               } else {
+                       td->pipe_write = pipe_1[1];
+                       td->pipe_read = pipe_2[0];
                }
        }
 
-       gettimeofday(&stop, NULL);
-       timersub(&stop, &start, &diff);
 
-       if (pid) {
+       if (threaded) {
+
+               for (t = 0; t < nr_threads; t++) {
+                       td = threads + t;
+
+                       ret = pthread_create(&td->pthread, NULL, worker_thread, td);
+                       BUG_ON(ret);
+               }
+
+               for (t = 0; t < nr_threads; t++) {
+                       td = threads + t;
+
+                       ret = pthread_join(td->pthread, NULL);
+                       BUG_ON(ret);
+               }
+
+       } else {
+               pid = fork();
+               assert(pid >= 0);
+
+               if (!pid) {
+                       worker_thread(threads + 0);
+                       exit(0);
+               } else {
+                       worker_thread(threads + 1);
+               }
+
                retpid = waitpid(pid, &wait_stat, 0);
                assert((retpid == pid) && WIFEXITED(wait_stat));
-       } else {
-               exit(0);
        }
 
+       gettimeofday(&stop, NULL);
+       timersub(&stop, &start, &diff);
+
        switch (bench_format) {
        case BENCH_FORMAT_DEFAULT:
-               printf("# Executed %d pipe operations between two tasks\n\n",
-                       loops);
+               printf("# Executed %d pipe operations between two %s\n\n",
+                       loops, threaded ? "threads" : "processes");
 
                result_usec = diff.tv_sec * 1000000;
                result_usec += diff.tv_usec;
index 5ebd0c3b71b6aa45d80aa63b47661996577cf6a4..03cfa592071f003e13ab0a541bf8d646c0df15c4 100644 (file)
 #include "util/hist.h"
 #include "util/session.h"
 #include "util/tool.h"
+#include "util/data.h"
 #include "arch/common.h"
 
+#include <dlfcn.h>
 #include <linux/bitmap.h>
 
 struct perf_annotate {
@@ -63,7 +65,7 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
                return 0;
        }
 
-       he = __hists__add_entry(&evsel->hists, al, NULL, 1, 1);
+       he = __hists__add_entry(&evsel->hists, al, NULL, 1, 1, 0);
        if (he == NULL)
                return -ENOMEM;
 
@@ -142,8 +144,18 @@ find_next:
 
                if (use_browser == 2) {
                        int ret;
+                       int (*annotate)(struct hist_entry *he,
+                                       struct perf_evsel *evsel,
+                                       struct hist_browser_timer *hbt);
+
+                       annotate = dlsym(perf_gtk_handle,
+                                        "hist_entry__gtk_annotate");
+                       if (annotate == NULL) {
+                               ui__error("GTK browser not found!\n");
+                               return;
+                       }
 
-                       ret = hist_entry__gtk_annotate(he, evsel, NULL);
+                       ret = annotate(he, evsel, NULL);
                        if (!ret || !ann->skip_missing)
                                return;
 
@@ -188,9 +200,13 @@ static int __cmd_annotate(struct perf_annotate *ann)
        struct perf_session *session;
        struct perf_evsel *pos;
        u64 total_nr_samples;
+       struct perf_data_file file = {
+               .path  = input_name,
+               .mode  = PERF_DATA_MODE_READ,
+               .force = ann->force,
+       };
 
-       session = perf_session__new(input_name, O_RDONLY,
-                                   ann->force, false, &ann->tool);
+       session = perf_session__new(&file, false, &ann->tool);
        if (session == NULL)
                return -ENOMEM;
 
@@ -243,12 +259,21 @@ static int __cmd_annotate(struct perf_annotate *ann)
        }
 
        if (total_nr_samples == 0) {
-               ui__error("The %s file has no samples!\n", session->filename);
+               ui__error("The %s file has no samples!\n", file.path);
                goto out_delete;
        }
 
-       if (use_browser == 2)
-               perf_gtk__show_annotations();
+       if (use_browser == 2) {
+               void (*show_annotations)(void);
+
+               show_annotations = dlsym(perf_gtk_handle,
+                                        "perf_gtk__show_annotations");
+               if (show_annotations == NULL) {
+                       ui__error("GTK browser not found!\n");
+                       goto out_delete;
+               }
+               show_annotations();
+       }
 
 out_delete:
        /*
index 77298bf892b85c68d3c662cfe62e050518630a49..33af80fa49cff70ff779c77c2be3f901587f8329 100644 (file)
@@ -35,7 +35,7 @@ struct bench_suite {
 /* sentinel: easy for help */
 #define suite_all { "all", "Test all benchmark suites", NULL }
 
-#ifdef LIBNUMA_SUPPORT
+#ifdef HAVE_LIBNUMA_SUPPORT
 static struct bench_suite numa_suites[] = {
        { "mem",
          "Benchmark for NUMA workloads",
@@ -80,7 +80,7 @@ struct bench_subsys {
 };
 
 static struct bench_subsys subsystems[] = {
-#ifdef LIBNUMA_SUPPORT
+#ifdef HAVE_LIBNUMA_SUPPORT
        { "numa",
          "NUMA scheduling and MM behavior",
          numa_suites },
index c96c8fa3824353f1c5401cd2a6bacb9701a2762e..cfede86161d8ae9410c123745bb60def09785331 100644 (file)
@@ -6,6 +6,11 @@
  * Copyright (C) 2010, Red Hat Inc.
  * Copyright (C) 2010, Arnaldo Carvalho de Melo <acme@redhat.com>
  */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <dirent.h>
+#include <unistd.h>
 #include "builtin.h"
 #include "perf.h"
 #include "util/cache.h"
 #include "util/session.h"
 #include "util/symbol.h"
 
+static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
+{
+       char root_dir[PATH_MAX];
+       char notes[PATH_MAX];
+       u8 build_id[BUILD_ID_SIZE];
+       char *p;
+
+       strlcpy(root_dir, proc_dir, sizeof(root_dir));
+
+       p = strrchr(root_dir, '/');
+       if (!p)
+               return -1;
+       *p = '\0';
+
+       scnprintf(notes, sizeof(notes), "%s/sys/kernel/notes", root_dir);
+
+       if (sysfs__read_build_id(notes, build_id, sizeof(build_id)))
+               return -1;
+
+       build_id__sprintf(build_id, sizeof(build_id), sbuildid);
+
+       return 0;
+}
+
+static int build_id_cache__kcore_dir(char *dir, size_t sz)
+{
+       struct timeval tv;
+       struct tm tm;
+       char dt[32];
+
+       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
+               return -1;
+
+       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
+               return -1;
+
+       scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
+
+       return 0;
+}
+
+static int build_id_cache__kcore_existing(const char *from_dir, char *to_dir,
+                                         size_t to_dir_sz)
+{
+       char from[PATH_MAX];
+       char to[PATH_MAX];
+       struct dirent *dent;
+       int ret = -1;
+       DIR *d;
+
+       d = opendir(to_dir);
+       if (!d)
+               return -1;
+
+       scnprintf(from, sizeof(from), "%s/modules", from_dir);
+
+       while (1) {
+               dent = readdir(d);
+               if (!dent)
+                       break;
+               if (dent->d_type != DT_DIR)
+                       continue;
+               scnprintf(to, sizeof(to), "%s/%s/modules", to_dir,
+                         dent->d_name);
+               if (!compare_proc_modules(from, to)) {
+                       scnprintf(to, sizeof(to), "%s/%s", to_dir,
+                                 dent->d_name);
+                       strlcpy(to_dir, to, to_dir_sz);
+                       ret = 0;
+                       break;
+               }
+       }
+
+       closedir(d);
+
+       return ret;
+}
+
+static int build_id_cache__add_kcore(const char *filename, const char *debugdir)
+{
+       char dir[32], sbuildid[BUILD_ID_SIZE * 2 + 1];
+       char from_dir[PATH_MAX], to_dir[PATH_MAX];
+       char *p;
+
+       strlcpy(from_dir, filename, sizeof(from_dir));
+
+       p = strrchr(from_dir, '/');
+       if (!p || strcmp(p + 1, "kcore"))
+               return -1;
+       *p = '\0';
+
+       if (build_id_cache__kcore_buildid(from_dir, sbuildid))
+               return -1;
+
+       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s",
+                 debugdir, sbuildid);
+
+       if (!build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) {
+               pr_debug("same kcore found in %s\n", to_dir);
+               return 0;
+       }
+
+       if (build_id_cache__kcore_dir(dir, sizeof(dir)))
+               return -1;
+
+       scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s",
+                 debugdir, sbuildid, dir);
+
+       if (mkdir_p(to_dir, 0755))
+               return -1;
+
+       if (kcore_copy(from_dir, to_dir)) {
+               /* Remove YYYYmmddHHMMSShh directory */
+               if (!rmdir(to_dir)) {
+                       p = strrchr(to_dir, '/');
+                       if (p)
+                               *p = '\0';
+                       /* Try to remove buildid directory */
+                       if (!rmdir(to_dir)) {
+                               p = strrchr(to_dir, '/');
+                               if (p)
+                                       *p = '\0';
+                               /* Try to remove [kernel.kcore] directory */
+                               rmdir(to_dir);
+                       }
+               }
+               return -1;
+       }
+
+       pr_debug("kcore added to build-id cache directory %s\n", to_dir);
+
+       return 0;
+}
+
 static int build_id_cache__add_file(const char *filename, const char *debugdir)
 {
        char sbuild_id[BUILD_ID_SIZE * 2 + 1];
@@ -82,8 +221,12 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
 
 static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp)
 {
-       struct perf_session *session = perf_session__new(filename, O_RDONLY,
-                                                        force, false, NULL);
+       struct perf_data_file file = {
+               .path  = filename,
+               .mode  = PERF_DATA_MODE_READ,
+               .force = force,
+       };
+       struct perf_session *session = perf_session__new(&file, false, NULL);
        if (session == NULL)
                return -1;
 
@@ -130,11 +273,14 @@ int cmd_buildid_cache(int argc, const char **argv,
        char const *add_name_list_str = NULL,
                   *remove_name_list_str = NULL,
                   *missing_filename = NULL,
-                  *update_name_list_str = NULL;
+                  *update_name_list_str = NULL,
+                  *kcore_filename;
 
        const struct option buildid_cache_options[] = {
        OPT_STRING('a', "add", &add_name_list_str,
                   "file list", "file(s) to add"),
+       OPT_STRING('k', "kcore", &kcore_filename,
+                  "file", "kcore file to add"),
        OPT_STRING('r', "remove", &remove_name_list_str, "file list",
                    "file(s) to remove"),
        OPT_STRING('M', "missing", &missing_filename, "file",
@@ -217,5 +363,9 @@ int cmd_buildid_cache(int argc, const char **argv,
                }
        }
 
+       if (kcore_filename &&
+           build_id_cache__add_kcore(kcore_filename, debugdir))
+               pr_warning("Couldn't add %s\n", kcore_filename);
+
        return ret;
 }
index e74366a13218dbc5da8ef576b298d8bcc21c4ed6..ed3873b3e23873ded61f12486d50bb118c9471b2 100644 (file)
@@ -15,6 +15,7 @@
 #include "util/parse-options.h"
 #include "util/session.h"
 #include "util/symbol.h"
+#include "util/data.h"
 
 static int sysfs__fprintf_build_id(FILE *fp)
 {
@@ -52,6 +53,11 @@ static bool dso__skip_buildid(struct dso *dso, int with_hits)
 static int perf_session__list_build_ids(bool force, bool with_hits)
 {
        struct perf_session *session;
+       struct perf_data_file file = {
+               .path  = input_name,
+               .mode  = PERF_DATA_MODE_READ,
+               .force = force,
+       };
 
        symbol__elf_init();
        /*
@@ -60,15 +66,14 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
        if (filename__fprintf_build_id(input_name, stdout))
                goto out;
 
-       session = perf_session__new(input_name, O_RDONLY, force, false,
-                                   &build_id__mark_dso_hit_ops);
+       session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops);
        if (session == NULL)
                return -1;
        /*
         * in pipe-mode, the only way to get the buildids is to parse
         * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
         */
-       if (with_hits || session->fd_pipe)
+       if (with_hits || perf_data_file__is_pipe(&file))
                perf_session__process_events(session, &build_id__mark_dso_hit_ops);
 
        perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits);
index f28799e94f2a4bbbb1226da3cd78b908c7b1f610..419d27dd708b682ba1418dd59347a5955df649a9 100644 (file)
@@ -16,6 +16,7 @@
 #include "util/sort.h"
 #include "util/symbol.h"
 #include "util/util.h"
+#include "util/data.h"
 
 #include <stdlib.h>
 #include <math.h>
@@ -42,7 +43,7 @@ struct diff_hpp_fmt {
 
 struct data__file {
        struct perf_session     *session;
-       const char              *file;
+       struct perf_data_file   file;
        int                      idx;
        struct hists            *hists;
        struct diff_hpp_fmt      fmt[PERF_HPP_DIFF__MAX_INDEX];
@@ -304,9 +305,10 @@ static int formula_fprintf(struct hist_entry *he, struct hist_entry *pair,
 
 static int hists__add_entry(struct hists *self,
                            struct addr_location *al, u64 period,
-                           u64 weight)
+                           u64 weight, u64 transaction)
 {
-       if (__hists__add_entry(self, al, NULL, period, weight) != NULL)
+       if (__hists__add_entry(self, al, NULL, period, weight, transaction)
+           != NULL)
                return 0;
        return -ENOMEM;
 }
@@ -328,7 +330,8 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
        if (al.filtered)
                return 0;
 
-       if (hists__add_entry(&evsel->hists, &al, sample->period, sample->weight)) {
+       if (hists__add_entry(&evsel->hists, &al, sample->period,
+                            sample->weight, sample->transaction)) {
                pr_warning("problem incrementing symbol period, skipping event\n");
                return -1;
        }
@@ -599,7 +602,7 @@ static void data__fprintf(void)
 
        data__for_each_file(i, d)
                fprintf(stdout, "#  [%d] %s %s\n",
-                       d->idx, d->file,
+                       d->idx, d->file.path,
                        !d->idx ? "(Baseline)" : "");
 
        fprintf(stdout, "#\n");
@@ -661,17 +664,16 @@ static int __cmd_diff(void)
        int ret = -EINVAL, i;
 
        data__for_each_file(i, d) {
-               d->session = perf_session__new(d->file, O_RDONLY, force,
-                                              false, &tool);
+               d->session = perf_session__new(&d->file, false, &tool);
                if (!d->session) {
-                       pr_err("Failed to open %s\n", d->file);
+                       pr_err("Failed to open %s\n", d->file.path);
                        ret = -ENOMEM;
                        goto out_delete;
                }
 
                ret = perf_session__process_events(d->session, &tool);
                if (ret) {
-                       pr_err("Failed to process %s\n", d->file);
+                       pr_err("Failed to process %s\n", d->file.path);
                        goto out_delete;
                }
 
@@ -1014,7 +1016,12 @@ static int data_init(int argc, const char **argv)
                return -ENOMEM;
 
        data__for_each_file(i, d) {
-               d->file = use_default ? defaults[i] : argv[i];
+               struct perf_data_file *file = &d->file;
+
+               file->path  = use_default ? defaults[i] : argv[i];
+               file->mode  = PERF_DATA_MODE_READ;
+               file->force = force;
+
                d->idx  = i;
        }
 
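
The data_init() hunk above fills in each perf_data_file with plain member assignments. One hazard of such assignment blocks in C: if a comma slips in where a semicolon belongs, the comma operator fuses the assignments into a single expression statement and the code still compiles, typically without a diagnostic. A tiny demonstration:

    #include <stdio.h>

    int main(void)
    {
            int mode, force, idx;

            /* Commas instead of semicolons: this is ONE expression
             * statement chained by the comma operator, not three
             * statements, yet it compiles cleanly. */
            mode = 1,
            force = 0,

            idx = 7;

            printf("%d %d %d\n", mode, force, idx);    /* prints 1 0 7 */
            return 0;
    }
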
index 05bd9dfe875cb95aeb6d2008b3319c2e8ca8f5eb..20b0f12763b091fd392366865ab113f9aa66a3c7 100644 (file)
 #include "util/parse-events.h"
 #include "util/parse-options.h"
 #include "util/session.h"
+#include "util/data.h"
 
 static int __cmd_evlist(const char *file_name, struct perf_attr_details *details)
 {
        struct perf_session *session;
        struct perf_evsel *pos;
+       struct perf_data_file file = {
+               .path = file_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
 
-       session = perf_session__new(file_name, O_RDONLY, 0, false, NULL);
+       session = perf_session__new(&file, 0, NULL);
        if (session == NULL)
                return -ENOMEM;
 
index afe377b2884f740b0c469a367ab3938d99f7df0e..4aa6d7850bcc1db8804571f10ee5625cbdd4b3d8 100644 (file)
@@ -15,6 +15,7 @@
 #include "util/tool.h"
 #include "util/debug.h"
 #include "util/build-id.h"
+#include "util/data.h"
 
 #include "util/parse-options.h"
 
@@ -231,7 +232,7 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
                                 * account this as unresolved.
                                 */
                        } else {
-#ifdef LIBELF_SUPPORT
+#ifdef HAVE_LIBELF_SUPPORT
                                pr_warning("no symbols found in %s, maybe "
                                           "install a debug package?\n",
                                           al.map->dso->long_name);
@@ -345,6 +346,10 @@ static int __cmd_inject(struct perf_inject *inject)
 {
        struct perf_session *session;
        int ret = -EINVAL;
+       struct perf_data_file file = {
+               .path = inject->input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
 
        signal(SIGINT, sig_handler);
 
@@ -355,7 +360,7 @@ static int __cmd_inject(struct perf_inject *inject)
                inject->tool.tracing_data = perf_event__repipe_tracing_data;
        }
 
-       session = perf_session__new(inject->input_name, O_RDONLY, false, true, &inject->tool);
+       session = perf_session__new(&file, true, &inject->tool);
        if (session == NULL)
                return -ENOMEM;
 
index 9b5f077fee5b1b65a4e51b48ef1af0727b8c134b..1126382659a95fd62e0e1e9a6624e691872fc7b4 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
+#include "util/data.h"
 
 #include "util/debug.h"
 
@@ -486,8 +487,12 @@ static int __cmd_kmem(void)
                { "kmem:kfree",                 perf_evsel__process_free_event, },
                { "kmem:kmem_cache_free",       perf_evsel__process_free_event, },
        };
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_kmem);
+       session = perf_session__new(&file, false, &perf_kmem);
        if (session == NULL)
                return -ENOMEM;
 
index 935d52216c899eb42774ae9f6e63767ff77fc8ba..188bb29b373f9ee7280f25f28c127a00e83d195a 100644 (file)
@@ -17,6 +17,7 @@
 #include "util/tool.h"
 #include "util/stat.h"
 #include "util/top.h"
+#include "util/data.h"
 
 #include <sys/prctl.h>
 #include <sys/timerfd.h>
@@ -1215,10 +1216,13 @@ static int read_events(struct perf_kvm_stat *kvm)
                .comm                   = perf_event__process_comm,
                .ordered_samples        = true,
        };
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
 
        kvm->tool = eops;
-       kvm->session = perf_session__new(kvm->file_name, O_RDONLY, 0, false,
-                                        &kvm->tool);
+       kvm->session = perf_session__new(&file, false, &kvm->tool);
        if (!kvm->session) {
                pr_err("Initializing perf session failed\n");
                return -EINVAL;
@@ -1426,8 +1430,9 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
        const struct option live_options[] = {
                OPT_STRING('p', "pid", &kvm->opts.target.pid, "pid",
                        "record events on existing process id"),
-               OPT_UINTEGER('m', "mmap-pages", &kvm->opts.mmap_pages,
-                       "number of mmap data pages"),
+               OPT_CALLBACK('m', "mmap-pages", &kvm->opts.mmap_pages, "pages",
+                       "number of mmap data pages",
+                       perf_evlist__parse_mmap_pages),
                OPT_INCR('v', "verbose", &verbose,
                        "be more verbose (show counter open errors, etc)"),
                OPT_BOOLEAN('a', "all-cpus", &kvm->opts.target.system_wide,
@@ -1449,6 +1454,9 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
                "perf kvm stat live [<options>]",
                NULL
        };
+       struct perf_data_file file = {
+               .mode = PERF_DATA_MODE_WRITE,
+       };
 
 
        /* event handling */
@@ -1513,7 +1521,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
        /*
         * perf session
         */
-       kvm->session = perf_session__new(NULL, O_WRONLY, false, false, &kvm->tool);
+       kvm->session = perf_session__new(&file, false, &kvm->tool);
        if (kvm->session == NULL) {
                err = -ENOMEM;
                goto out;
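
Here, and again in the perf record hunks later in this diff, -m/--mmap-pages moves from OPT_UINTEGER to an OPT_CALLBACK on perf_evlist__parse_mmap_pages, so the value is parsed and validated in one place (the record hunks correspondingly drop their local power-of-two check). A sketch of such a parser, assuming it accepts a raw page count or a B/K/M-suffixed byte size and insists on a power of two; the authoritative rules live in perf's evlist code, not in this diff:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Parse "128", "512K", "4M", ... into a number of mmap pages.
     * Returns -1 on malformed input or a non-power-of-two result.
     * Illustrative only; perf_evlist__parse_mmap_pages() is the
     * authoritative implementation. */
    static long parse_mmap_pages(const char *str)
    {
            long page_size = sysconf(_SC_PAGE_SIZE);
            char *endptr;
            long val = strtol(str, &endptr, 10);

            if (endptr == str || val <= 0)
                    return -1;

            if (*endptr) {                          /* size suffix => bytes */
                    switch (*endptr) {
                    case 'K': case 'k': val <<= 10; break;
                    case 'M': case 'm': val <<= 20; break;
                    case 'B': case 'b':             break;
                    default: return -1;
                    }
                    if (endptr[1] || val % page_size)
                            return -1;
                    val /= page_size;               /* bytes -> pages */
            }

            return (val & (val - 1)) == 0 ? val : -1;
    }

    int main(void)
    {
            printf("%ld\n", parse_mmap_pages("1024"));  /* 1024 */
            printf("%ld\n", parse_mmap_pages("4M"));    /* 1024 on 4K pages */
            printf("%ld\n", parse_mmap_pages("100"));   /* -1: not 2^n */
            return 0;
    }
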
index ee33ba2f05dd8b2f1f9c90f1820e0eab93b86f8b..33c7253295b966b8a6c4a20753fd5cfdead12048 100644 (file)
@@ -15,6 +15,7 @@
 #include "util/debug.h"
 #include "util/session.h"
 #include "util/tool.h"
+#include "util/data.h"
 
 #include <sys/types.h>
 #include <sys/prctl.h>
@@ -56,7 +57,9 @@ struct lock_stat {
 
        unsigned int            nr_readlock;
        unsigned int            nr_trylock;
+
        /* these times are in nano sec. */
+       u64                     avg_wait_time;
        u64                     wait_time_total;
        u64                     wait_time_min;
        u64                     wait_time_max;
@@ -208,6 +211,7 @@ static struct thread_stat *thread_stat_findnew_first(u32 tid)
 
 SINGLE_KEY(nr_acquired)
 SINGLE_KEY(nr_contended)
+SINGLE_KEY(avg_wait_time)
 SINGLE_KEY(wait_time_total)
 SINGLE_KEY(wait_time_max)
 
@@ -244,6 +248,7 @@ static struct rb_root               result; /* place to store sorted data */
 struct lock_key keys[] = {
        DEF_KEY_LOCK(acquired, nr_acquired),
        DEF_KEY_LOCK(contended, nr_contended),
+       DEF_KEY_LOCK(avg_wait, avg_wait_time),
        DEF_KEY_LOCK(wait_total, wait_time_total),
        DEF_KEY_LOCK(wait_min, wait_time_min),
        DEF_KEY_LOCK(wait_max, wait_time_max),
@@ -321,10 +326,12 @@ static struct lock_stat *lock_stat_findnew(void *addr, const char *name)
 
        new->addr = addr;
        new->name = zalloc(sizeof(char) * strlen(name) + 1);
-       if (!new->name)
+       if (!new->name) {
+               free(new);
                goto alloc_failed;
-       strcpy(new->name, name);
+       }
 
+       strcpy(new->name, name);
        new->wait_time_min = ULLONG_MAX;
 
        list_add(&new->hash_entry, entry);
@@ -400,17 +407,17 @@ static int report_lock_acquire_event(struct perf_evsel *evsel,
 
        ls = lock_stat_findnew(addr, name);
        if (!ls)
-               return -1;
+               return -ENOMEM;
        if (ls->discard)
                return 0;
 
        ts = thread_stat_findnew(sample->tid);
        if (!ts)
-               return -1;
+               return -ENOMEM;
 
        seq = get_seq(ts, addr);
        if (!seq)
-               return -1;
+               return -ENOMEM;
 
        switch (seq->state) {
        case SEQ_STATE_UNINITIALIZED:
@@ -446,7 +453,6 @@ broken:
                list_del(&seq->list);
                free(seq);
                goto end;
-               break;
        default:
                BUG_ON("Unknown state of lock sequence found!\n");
                break;
@@ -473,17 +479,17 @@ static int report_lock_acquired_event(struct perf_evsel *evsel,
 
        ls = lock_stat_findnew(addr, name);
        if (!ls)
-               return -1;
+               return -ENOMEM;
        if (ls->discard)
                return 0;
 
        ts = thread_stat_findnew(sample->tid);
        if (!ts)
-               return -1;
+               return -ENOMEM;
 
        seq = get_seq(ts, addr);
        if (!seq)
-               return -1;
+               return -ENOMEM;
 
        switch (seq->state) {
        case SEQ_STATE_UNINITIALIZED:
@@ -508,8 +514,6 @@ static int report_lock_acquired_event(struct perf_evsel *evsel,
                list_del(&seq->list);
                free(seq);
                goto end;
-               break;
-
        default:
                BUG_ON("Unknown state of lock sequence found!\n");
                break;
@@ -517,6 +521,7 @@ static int report_lock_acquired_event(struct perf_evsel *evsel,
 
        seq->state = SEQ_STATE_ACQUIRED;
        ls->nr_acquired++;
+       ls->avg_wait_time = ls->nr_contended ? ls->wait_time_total / ls->nr_contended : 0;
        seq->prev_event_time = sample->time;
 end:
        return 0;
@@ -536,17 +541,17 @@ static int report_lock_contended_event(struct perf_evsel *evsel,
 
        ls = lock_stat_findnew(addr, name);
        if (!ls)
-               return -1;
+               return -ENOMEM;
        if (ls->discard)
                return 0;
 
        ts = thread_stat_findnew(sample->tid);
        if (!ts)
-               return -1;
+               return -ENOMEM;
 
        seq = get_seq(ts, addr);
        if (!seq)
-               return -1;
+               return -ENOMEM;
 
        switch (seq->state) {
        case SEQ_STATE_UNINITIALIZED:
@@ -564,7 +569,6 @@ static int report_lock_contended_event(struct perf_evsel *evsel,
                list_del(&seq->list);
                free(seq);
                goto end;
-               break;
        default:
                BUG_ON("Unknown state of lock sequence found!\n");
                break;
@@ -572,6 +576,7 @@ static int report_lock_contended_event(struct perf_evsel *evsel,
 
        seq->state = SEQ_STATE_CONTENDED;
        ls->nr_contended++;
+       ls->avg_wait_time = ls->wait_time_total / ls->nr_contended;
        seq->prev_event_time = sample->time;
 end:
        return 0;
@@ -591,22 +596,21 @@ static int report_lock_release_event(struct perf_evsel *evsel,
 
        ls = lock_stat_findnew(addr, name);
        if (!ls)
-               return -1;
+               return -ENOMEM;
        if (ls->discard)
                return 0;
 
        ts = thread_stat_findnew(sample->tid);
        if (!ts)
-               return -1;
+               return -ENOMEM;
 
        seq = get_seq(ts, addr);
        if (!seq)
-               return -1;
+               return -ENOMEM;
 
        switch (seq->state) {
        case SEQ_STATE_UNINITIALIZED:
                goto end;
-               break;
        case SEQ_STATE_ACQUIRED:
                break;
        case SEQ_STATE_READ_ACQUIRED:
@@ -624,7 +628,6 @@ static int report_lock_release_event(struct perf_evsel *evsel,
                ls->discard = 1;
                bad_hist[BROKEN_RELEASE]++;
                goto free_seq;
-               break;
        default:
                BUG_ON("Unknown state of lock sequence found!\n");
                break;
@@ -690,7 +693,7 @@ static void print_bad_events(int bad, int total)
 
        pr_info("\n=== output for debug===\n\n");
        pr_info("bad: %d, total: %d\n", bad, total);
-       pr_info("bad rate: %f %%\n", (double)bad / (double)total * 100);
+       pr_info("bad rate: %.2f %%\n", (double)bad / (double)total * 100);
        pr_info("histogram of events caused bad sequence\n");
        for (i = 0; i < BROKEN_MAX; i++)
                pr_info(" %10s: %d\n", name[i], bad_hist[i]);
@@ -707,6 +710,7 @@ static void print_result(void)
        pr_info("%10s ", "acquired");
        pr_info("%10s ", "contended");
 
+       pr_info("%15s ", "avg wait (ns)");
        pr_info("%15s ", "total wait (ns)");
        pr_info("%15s ", "max wait (ns)");
        pr_info("%15s ", "min wait (ns)");
@@ -738,6 +742,7 @@ static void print_result(void)
                pr_info("%10u ", st->nr_acquired);
                pr_info("%10u ", st->nr_contended);
 
+               pr_info("%15" PRIu64 " ", st->avg_wait_time);
                pr_info("%15" PRIu64 " ", st->wait_time_total);
                pr_info("%15" PRIu64 " ", st->wait_time_max);
                pr_info("%15" PRIu64 " ", st->wait_time_min == ULLONG_MAX ?
@@ -822,6 +827,18 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
        return 0;
 }
 
+static void sort_result(void)
+{
+       unsigned int i;
+       struct lock_stat *st;
+
+       for (i = 0; i < LOCKHASH_SIZE; i++) {
+               list_for_each_entry(st, &lockhash_table[i], hash_entry) {
+                       insert_to_result(st, compare);
+               }
+       }
+}
+
 static const struct perf_evsel_str_handler lock_tracepoints[] = {
        { "lock:lock_acquire",   perf_evsel__process_lock_acquire,   }, /* CONFIG_LOCKDEP */
        { "lock:lock_acquired",  perf_evsel__process_lock_acquired,  }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
@@ -829,51 +846,51 @@ static const struct perf_evsel_str_handler lock_tracepoints[] = {
        { "lock:lock_release",   perf_evsel__process_lock_release,   }, /* CONFIG_LOCKDEP */
 };
 
-static int read_events(void)
+static int __cmd_report(bool display_info)
 {
+       int err = -EINVAL;
        struct perf_tool eops = {
                .sample          = process_sample_event,
                .comm            = perf_event__process_comm,
                .ordered_samples = true,
        };
-       session = perf_session__new(input_name, O_RDONLY, 0, false, &eops);
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
+
+       session = perf_session__new(&file, false, &eops);
        if (!session) {
                pr_err("Initializing perf session failed\n");
-               return -1;
+               return -ENOMEM;
        }
 
+       if (!perf_session__has_traces(session, "lock record"))
+               goto out_delete;
+
        if (perf_session__set_tracepoints_handlers(session, lock_tracepoints)) {
                pr_err("Initializing perf session tracepoint handlers failed\n");
-               return -1;
+               goto out_delete;
        }
 
-       return perf_session__process_events(session, &eops);
-}
-
-static void sort_result(void)
-{
-       unsigned int i;
-       struct lock_stat *st;
+       if (select_key())
+               goto out_delete;
 
-       for (i = 0; i < LOCKHASH_SIZE; i++) {
-               list_for_each_entry(st, &lockhash_table[i], hash_entry) {
-                       insert_to_result(st, compare);
-               }
-       }
-}
+       err = perf_session__process_events(session, &eops);
+       if (err)
+               goto out_delete;
 
-static int __cmd_report(void)
-{
        setup_pager();
+       if (display_info) /* used for info subcommand */
+               err = dump_info();
+       else {
+               sort_result();
+               print_result();
+       }
 
-       if ((select_key() != 0) ||
-           (read_events() != 0))
-               return -1;
-
-       sort_result();
-       print_result();
-
-       return 0;
+out_delete:
+       perf_session__delete(session);
+       return err;
 }
 
 static int __cmd_record(int argc, const char **argv)
@@ -881,7 +898,8 @@ static int __cmd_record(int argc, const char **argv)
        const char *record_args[] = {
                "record", "-R", "-m", "1024", "-c", "1",
        };
-       unsigned int rec_argc, i, j;
+       unsigned int rec_argc, i, j;
+       int ret;
        const char **rec_argv;
 
        for (i = 0; i < ARRAY_SIZE(lock_tracepoints); i++) {
@@ -898,7 +915,7 @@ static int __cmd_record(int argc, const char **argv)
        rec_argc += 2 * ARRAY_SIZE(lock_tracepoints);
 
        rec_argv = calloc(rec_argc + 1, sizeof(char *));
-       if (rec_argv == NULL)
+       if (!rec_argv)
                return -ENOMEM;
 
        for (i = 0; i < ARRAY_SIZE(record_args); i++)
@@ -914,7 +931,9 @@ static int __cmd_record(int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       return cmd_record(i, rec_argv, NULL);
+       ret = cmd_record(i, rec_argv, NULL);
+       free(rec_argv);
+       return ret;
 }
 
 int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
@@ -934,7 +953,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
        };
        const struct option report_options[] = {
        OPT_STRING('k', "key", &sort_key, "acquired",
-                   "key for sorting (acquired / contended / wait_total / wait_max / wait_min)"),
+                   "key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"),
        /* TODO: type */
        OPT_END()
        };
@@ -972,7 +991,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
                        if (argc)
                                usage_with_options(report_usage, report_options);
                }
-               __cmd_report();
+               rc = __cmd_report(false);
        } else if (!strcmp(argv[0], "script")) {
                /* Aliased to 'perf script' */
                return cmd_script(argc, argv, prefix);
@@ -985,11 +1004,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
                }
                /* recycling report_lock_ops */
                trace_handler = &report_lock_ops;
-               setup_pager();
-               if (read_events() != 0)
-                       rc = -1;
-               else
-                       rc = dump_info();
+               rc = __cmd_report(true);
        } else {
                usage_with_options(lock_usage, lock_options);
        }
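
The perf lock rewrite above folds read_events() into __cmd_report() so that every failure path runs through one out_delete label that frees the session, and the bare -1 returns become proper -ENOMEM/-EINVAL codes. The same goto-based cleanup idiom, distilled into a standalone program (struct session here is a hypothetical stand-in):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct session { char *buf; };

    static struct session *session_new(void)
    {
            struct session *s = calloc(1, sizeof(*s));

            if (s)
                    s->buf = malloc(4096);
            if (!s || !s->buf) {
                    free(s);
                    return NULL;
            }
            return s;
    }

    static void session_delete(struct session *s)
    {
            free(s->buf);
            free(s);
    }

    static int process(struct session *s) { (void)s; return 0; }

    static int cmd_report(void)
    {
            int err = -EINVAL;
            struct session *session = session_new();

            if (!session)
                    return -ENOMEM;         /* nothing to clean up yet */

            err = process(session);
            if (err)
                    goto out_delete;        /* every later failure exits here */

            printf("report done\n");
    out_delete:
            session_delete(session);        /* single cleanup point */
            return err;
    }

    int main(void) { return cmd_report() ? EXIT_FAILURE : EXIT_SUCCESS; }
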
index 253133a6251d3c3d657671b7a8a603ef4cb03643..31c00f186da127fa6fca4157d839426886008c07 100644 (file)
@@ -5,6 +5,7 @@
 #include "util/trace-event.h"
 #include "util/tool.h"
 #include "util/session.h"
+#include "util/data.h"
 
 #define MEM_OPERATION_LOAD     "load"
 #define MEM_OPERATION_STORE    "store"
@@ -119,10 +120,14 @@ static int process_sample_event(struct perf_tool *tool,
 
 static int report_raw_events(struct perf_mem *mem)
 {
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
        int err = -EINVAL;
        int ret;
-       struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-                                                        0, false, &mem->tool);
+       struct perf_session *session = perf_session__new(&file, false,
+                                                        &mem->tool);
 
        if (session == NULL)
                return -ENOMEM;
index e8a66f9a67153c6819422839a55bd1ae5f7d4fce..89acc17cf2a02e4c7d8760a7b89e1300007cc5a3 100644 (file)
@@ -173,7 +173,7 @@ static int opt_set_target(const struct option *opt, const char *str,
        if  (str && !params.target) {
                if (!strcmp(opt->long_name, "exec"))
                        params.uprobes = true;
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
                else if (!strcmp(opt->long_name, "module"))
                        params.uprobes = false;
 #endif
@@ -187,7 +187,7 @@ static int opt_set_target(const struct option *opt, const char *str,
        return ret;
 }
 
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
 static int opt_show_lines(const struct option *opt __maybe_unused,
                          const char *str, int unset __maybe_unused)
 {
@@ -257,7 +257,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
                "perf probe [<options>] --del '[GROUP:]EVENT' ...",
                "perf probe --list",
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
                "perf probe [<options>] --line 'LINEDESC'",
                "perf probe [<options>] --vars 'PROBEPOINT'",
 #endif
@@ -271,7 +271,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
                opt_del_probe_event),
        OPT_CALLBACK('a', "add", NULL,
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
                "[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
                " [[NAME=]ARG ...]",
 #else
@@ -283,7 +283,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                "\t\tFUNC:\tFunction name\n"
                "\t\tOFF:\tOffset from function entry (in byte)\n"
                "\t\t%return:\tPut the probe at function return\n"
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
                "\t\tSRC:\tSource code path\n"
                "\t\tRL:\tRelative line number from function entry.\n"
                "\t\tAL:\tAbsolute line number in file.\n"
@@ -296,7 +296,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                opt_add_probe_event),
        OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
                    " with existing name"),
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
        OPT_CALLBACK('L', "line", NULL,
                     "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]",
                     "Show source code lines.", opt_show_lines),
@@ -408,7 +408,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
                return ret;
        }
 
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
        if (params.show_lines && !params.uprobes) {
                if (params.mod_events) {
                        pr_err("  Error: Don't use --line with"
index a41ac41546c962df3e0801b230f06b4aa9730c7a..ab8d15e6e8cca108489a8c7fd620250763c4dc5d 100644 (file)
 #include "util/symbol.h"
 #include "util/cpumap.h"
 #include "util/thread_map.h"
+#include "util/data.h"
 
 #include <unistd.h>
 #include <sched.h>
 #include <sys/mman.h>
 
-#ifndef HAVE_ON_EXIT
+#ifndef HAVE_ON_EXIT_SUPPORT
 #ifndef ATEXIT_MAX
 #define ATEXIT_MAX 32
 #endif
@@ -65,12 +66,10 @@ struct perf_record {
        struct perf_tool        tool;
        struct perf_record_opts opts;
        u64                     bytes_written;
-       const char              *output_name;
+       struct perf_data_file   file;
        struct perf_evlist      *evlist;
        struct perf_session     *session;
        const char              *progname;
-       int                     output;
-       unsigned int            page_size;
        int                     realtime_prio;
        bool                    no_buildid;
        bool                    no_buildid_cache;
@@ -85,11 +84,13 @@ static void advance_output(struct perf_record *rec, size_t size)
 
 static int write_output(struct perf_record *rec, void *buf, size_t size)
 {
+       struct perf_data_file *file = &rec->file;
+
        while (size) {
-               int ret = write(rec->output, buf, size);
+               int ret = write(file->fd, buf, size);
 
                if (ret < 0) {
-                       pr_err("failed to write\n");
+                       pr_err("failed to write perf data, error: %m\n");
                        return -1;
                }
 
@@ -119,7 +120,7 @@ static int perf_record__mmap_read(struct perf_record *rec,
 {
        unsigned int head = perf_mmap__read_head(md);
        unsigned int old = md->prev;
-       unsigned char *data = md->base + rec->page_size;
+       unsigned char *data = md->base + page_size;
        unsigned long size;
        void *buf;
        int rc = 0;
@@ -234,10 +235,6 @@ try_again:
                               "or try again with a smaller value of -m/--mmap_pages.\n"
                               "(current value: %d)\n", opts->mmap_pages);
                        rc = -errno;
-               } else if (!is_power_of_2(opts->mmap_pages) &&
-                          (opts->mmap_pages != UINT_MAX)) {
-                       pr_err("--mmap_pages/-m value must be a power of two.");
-                       rc = -EINVAL;
                } else {
                        pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
                        rc = -errno;
@@ -253,13 +250,14 @@ out:
 
 static int process_buildids(struct perf_record *rec)
 {
-       u64 size = lseek(rec->output, 0, SEEK_CUR);
+       struct perf_data_file *file  = &rec->file;
+       struct perf_session *session = rec->session;
 
+       u64 size = lseek(file->fd, 0, SEEK_CUR);
        if (size == 0)
                return 0;
 
-       rec->session->fd = rec->output;
-       return __perf_session__process_events(rec->session, rec->post_processing_offset,
+       return __perf_session__process_events(session, rec->post_processing_offset,
                                              size - rec->post_processing_offset,
                                              size, &build_id__mark_dso_hit_ops);
 }
@@ -267,17 +265,18 @@ static int process_buildids(struct perf_record *rec)
 static void perf_record__exit(int status, void *arg)
 {
        struct perf_record *rec = arg;
+       struct perf_data_file *file = &rec->file;
 
        if (status != 0)
                return;
 
-       if (!rec->opts.pipe_output) {
+       if (!file->is_pipe) {
                rec->session->header.data_size += rec->bytes_written;
 
                if (!rec->no_buildid)
                        process_buildids(rec);
                perf_session__write_header(rec->session, rec->evlist,
-                                          rec->output, true);
+                                          file->fd, true);
                perf_session__delete(rec->session);
                perf_evlist__delete(rec->evlist);
                symbol__exit();
@@ -345,62 +344,26 @@ out:
 
 static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 {
-       struct stat st;
-       int flags;
-       int err, output, feat;
+       int err, feat;
        unsigned long waking = 0;
        const bool forks = argc > 0;
        struct machine *machine;
        struct perf_tool *tool = &rec->tool;
        struct perf_record_opts *opts = &rec->opts;
        struct perf_evlist *evsel_list = rec->evlist;
-       const char *output_name = rec->output_name;
+       struct perf_data_file *file = &rec->file;
        struct perf_session *session;
        bool disabled = false;
 
        rec->progname = argv[0];
 
-       rec->page_size = sysconf(_SC_PAGE_SIZE);
-
        on_exit(perf_record__sig_exit, rec);
        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);
        signal(SIGUSR1, sig_handler);
        signal(SIGTERM, sig_handler);
 
-       if (!output_name) {
-               if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
-                       opts->pipe_output = true;
-               else
-                       rec->output_name = output_name = "perf.data";
-       }
-       if (output_name) {
-               if (!strcmp(output_name, "-"))
-                       opts->pipe_output = true;
-               else if (!stat(output_name, &st) && st.st_size) {
-                       char oldname[PATH_MAX];
-                       snprintf(oldname, sizeof(oldname), "%s.old",
-                                output_name);
-                       unlink(oldname);
-                       rename(output_name, oldname);
-               }
-       }
-
-       flags = O_CREAT|O_RDWR|O_TRUNC;
-
-       if (opts->pipe_output)
-               output = STDOUT_FILENO;
-       else
-               output = open(output_name, flags, S_IRUSR | S_IWUSR);
-       if (output < 0) {
-               perror("failed to create output file");
-               return -1;
-       }
-
-       rec->output = output;
-
-       session = perf_session__new(output_name, O_WRONLY,
-                                   true, false, NULL);
+       session = perf_session__new(file, false, NULL);
        if (session == NULL) {
                pr_err("Not enough memory for reading perf file header\n");
                return -1;
@@ -422,7 +385,7 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 
        if (forks) {
                err = perf_evlist__prepare_workload(evsel_list, &opts->target,
-                                                   argv, opts->pipe_output,
+                                                   argv, file->is_pipe,
                                                    true);
                if (err < 0) {
                        pr_err("Couldn't run the workload!\n");
@@ -443,13 +406,13 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
         */
        on_exit(perf_record__exit, rec);
 
-       if (opts->pipe_output) {
-               err = perf_header__write_pipe(output);
+       if (file->is_pipe) {
+               err = perf_header__write_pipe(file->fd);
                if (err < 0)
                        goto out_delete_session;
        } else {
                err = perf_session__write_header(session, evsel_list,
-                                                output, false);
+                                                file->fd, false);
                if (err < 0)
                        goto out_delete_session;
        }
@@ -462,11 +425,11 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
                goto out_delete_session;
        }
 
-       rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
+       rec->post_processing_offset = lseek(file->fd, 0, SEEK_CUR);
 
        machine = &session->machines.host;
 
-       if (opts->pipe_output) {
+       if (file->is_pipe) {
                err = perf_event__synthesize_attrs(tool, session,
                                                   process_synthesized_event);
                if (err < 0) {
@@ -483,7 +446,7 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
                         * return this more properly and also
                         * propagate errors that now are calling die()
                         */
-                       err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
+                       err = perf_event__synthesize_tracing_data(tool, file->fd, evsel_list,
                                                                  process_synthesized_event);
                        if (err <= 0) {
                                pr_err("Couldn't record tracing data.\n");
@@ -590,7 +553,7 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
        fprintf(stderr,
                "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
                (double)rec->bytes_written / 1024.0 / 1024.0,
-               output_name,
+               file->path,
                rec->bytes_written / 24);
 
        return 0;
@@ -618,6 +581,9 @@ static const struct branch_mode branch_modes[] = {
        BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
        BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
        BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
+       BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
+       BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
+       BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
        BRANCH_END
 };
 
@@ -684,7 +650,7 @@ error:
        return ret;
 }
 
-#ifdef LIBUNWIND_SUPPORT
+#ifdef HAVE_LIBUNWIND_SUPPORT
 static int get_stack_size(char *str, unsigned long *_size)
 {
        char *endptr;
@@ -710,7 +676,7 @@ static int get_stack_size(char *str, unsigned long *_size)
               max_size, str);
        return -1;
 }
-#endif /* LIBUNWIND_SUPPORT */
+#endif /* HAVE_LIBUNWIND_SUPPORT */
 
 int record_parse_callchain_opt(const struct option *opt,
                               const char *arg, int unset)
@@ -748,7 +714,7 @@ int record_parse_callchain_opt(const struct option *opt,
                                       "needed for -g fp\n");
                        break;
 
-#ifdef LIBUNWIND_SUPPORT
+#ifdef HAVE_LIBUNWIND_SUPPORT
                /* Dwarf style */
                } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
                        const unsigned long default_stack_dump_size = 8192;
@@ -768,7 +734,7 @@ int record_parse_callchain_opt(const struct option *opt,
                        if (!ret)
                                pr_debug("callchain: stack dump size %d\n",
                                         opts->stack_dump_size);
-#endif /* LIBUNWIND_SUPPORT */
+#endif /* HAVE_LIBUNWIND_SUPPORT */
                } else {
                        pr_err("callchain: Unknown -g option "
                               "value: %s\n", arg);
@@ -815,7 +781,7 @@ static struct perf_record record = {
 
 #define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "
 
-#ifdef LIBUNWIND_SUPPORT
+#ifdef HAVE_LIBUNWIND_SUPPORT
 const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
 #else
 const char record_callchain_help[] = CALLCHAIN_HELP "[fp]";
@@ -849,13 +815,14 @@ const struct option record_options[] = {
        OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
                    "list of cpus to monitor"),
        OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
-       OPT_STRING('o', "output", &record.output_name, "file",
+       OPT_STRING('o', "output", &record.file.path, "file",
                    "output file name"),
        OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
                    "child tasks do not inherit counters"),
        OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
-       OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
-                    "number of mmap data pages"),
+       OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
+                    "number of mmap data pages",
+                    perf_evlist__parse_mmap_pages),
        OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
        OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts,
@@ -891,6 +858,8 @@ const struct option record_options[] = {
                     parse_branch_stack),
        OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
                    "sample by weight (on special events only)"),
+       OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
+                   "sample transaction flags (special events only)"),
        OPT_END()
 };
 
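
write_output() above loops because write(2) may accept fewer bytes than requested; the new error message also uses %m, a glibc printf extension that expands to strerror(errno). The same loop in isolation, with the customary EINTR retry added:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Write the whole buffer, tolerating short writes and EINTR.
     * Minimal sketch of the pattern used by write_output() above. */
    static int full_write(int fd, const void *buf, size_t size)
    {
            const char *p = buf;

            while (size) {
                    ssize_t ret = write(fd, p, size);

                    if (ret < 0) {
                            if (errno == EINTR)
                                    continue;       /* interrupted: retry */
                            /* %m: glibc shorthand for strerror(errno) */
                            fprintf(stderr, "failed to write: %m\n");
                            return -1;
                    }
                    p    += ret;                    /* advance past what landed */
                    size -= ret;
            }
            return 0;
    }

    int main(void)
    {
            static const char msg[] = "perf data stream\n";

            return full_write(STDOUT_FILENO, msg, sizeof(msg) - 1);
    }
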
index 72eae7498c09419c8f1f77a7a005c6dcbbb5eb4b..81addcabb356800780320c71e7bd69af52b4f2b9 100644 (file)
 #include "util/thread.h"
 #include "util/sort.h"
 #include "util/hist.h"
+#include "util/data.h"
 #include "arch/common.h"
 
+#include <dlfcn.h>
 #include <linux/bitmap.h>
 
 struct perf_report {
@@ -47,6 +49,7 @@ struct perf_report {
        bool                    show_threads;
        bool                    inverted_callchain;
        bool                    mem_mode;
+       int                     max_stack;
        struct perf_read_values show_threads_values;
        const char              *pretty_printing_style;
        const char              *cpu_list;
@@ -88,7 +91,8 @@ static int perf_report__add_mem_hist_entry(struct perf_tool *tool,
        if ((sort__has_parent || symbol_conf.use_callchain) &&
            sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
@@ -179,7 +183,8 @@ static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
        if ((sort__has_parent || symbol_conf.use_callchain)
            && sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
@@ -242,24 +247,27 @@ out:
        return err;
 }
 
-static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+static int perf_evsel__add_hist_entry(struct perf_tool *tool,
+                                     struct perf_evsel *evsel,
                                      struct addr_location *al,
                                      struct perf_sample *sample,
                                      struct machine *machine)
 {
+       struct perf_report *rep = container_of(tool, struct perf_report, tool);
        struct symbol *parent = NULL;
        int err = 0;
        struct hist_entry *he;
 
        if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
                err = machine__resolve_callchain(machine, evsel, al->thread,
-                                                sample, &parent, al);
+                                                sample, &parent, al,
+                                                rep->max_stack);
                if (err)
                        return err;
        }
 
        he = __hists__add_entry(&evsel->hists, al, parent, sample->period,
-                                       sample->weight);
+                               sample->weight, sample->transaction);
        if (he == NULL)
                return -ENOMEM;
 
@@ -330,7 +338,8 @@ static int process_sample_event(struct perf_tool *tool,
                if (al.map != NULL)
                        al.map->dso->hit = 1;
 
-               ret = perf_evsel__add_hist_entry(evsel, &al, sample, machine);
+               ret = perf_evsel__add_hist_entry(tool, evsel, &al, sample,
+                                                machine);
                if (ret < 0)
                        pr_debug("problem incrementing symbol period, skipping event\n");
        }
@@ -366,8 +375,9 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
 {
        struct perf_session *self = rep->session;
        u64 sample_type = perf_evlist__combined_sample_type(self->evlist);
+       bool is_pipe = perf_data_file__is_pipe(self->file);
 
-       if (!self->fd_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) {
+       if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) {
                if (sort__has_parent) {
                        ui__error("Selected --sort parent, but no "
                                    "callchain data. Did you call "
@@ -390,7 +400,7 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
        }
 
        if (sort__mode == SORT_MODE__BRANCH) {
-               if (!self->fd_pipe &&
+               if (!is_pipe &&
                    !(sample_type & PERF_SAMPLE_BRANCH_STACK)) {
                        ui__error("Selected -b but no branch data. "
                                  "Did you call perf record without -b?\n");
@@ -486,6 +496,7 @@ static int __cmd_report(struct perf_report *rep)
        struct map *kernel_map;
        struct kmap *kernel_kmap;
        const char *help = "For a higher level overview, try: perf report --sort comm,dso";
+       struct perf_data_file *file = session->file;
 
        signal(SIGINT, sig_handler);
 
@@ -570,7 +581,7 @@ static int __cmd_report(struct perf_report *rep)
                return 0;
 
        if (nr_samples == 0) {
-               ui__error("The %s file has no samples!\n", session->filename);
+               ui__error("The %s file has no samples!\n", file->path);
                return 0;
        }
 
@@ -591,8 +602,19 @@ static int __cmd_report(struct perf_report *rep)
                                ret = 0;
 
                } else if (use_browser == 2) {
-                       perf_evlist__gtk_browse_hists(session->evlist, help,
-                                                     NULL, rep->min_percent);
+                       int (*hist_browser)(struct perf_evlist *,
+                                           const char *,
+                                           struct hist_browser_timer *,
+                                           float min_pcnt);
+
+                       hist_browser = dlsym(perf_gtk_handle,
+                                            "perf_evlist__gtk_browse_hists");
+                       if (hist_browser == NULL) {
+                               ui__error("GTK browser not found!\n");
+                               return ret;
+                       }
+                       hist_browser(session->evlist, help, NULL,
+                                    rep->min_percent);
                }
        } else
                perf_evlist__tty_browse_hists(session->evlist, rep, help);
@@ -757,6 +779,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                        .ordered_samples = true,
                        .ordering_requires_timestamps = true,
                },
+               .max_stack               = PERF_MAX_STACK_DEPTH,
                .pretty_printing_style   = "normal",
        };
        const struct option options[] = {
@@ -787,7 +810,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                   "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline,"
                   " dso_to, dso_from, symbol_to, symbol_from, mispredict,"
                   " weight, local_weight, mem, symbol_daddr, dso_daddr, tlb, "
-                  "snoop, locked"),
+                  "snoop, locked, abort, in_tx, transaction"),
        OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
                    "Show sample percentage for different cpu modes"),
        OPT_STRING('p', "parent", &parent_pattern, "regex",
@@ -797,6 +820,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
                     "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit, callchain order, key (function or address). "
                     "Default: fractal,0.5,callee,function", &parse_callchain_opt, callchain_default_opt),
+       OPT_INTEGER(0, "max-stack", &report.max_stack,
+                   "Set the maximum stack depth when parsing the callchain, "
+                   "anything beyond the specified depth will be ignored. "
+                   "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
        OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
                    "alias for inverted call graph"),
        OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
@@ -845,6 +872,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                     "Don't show entries under that percent", parse_percent_limit),
        OPT_END()
        };
+       struct perf_data_file file = {
+               .mode  = PERF_DATA_MODE_READ,
+       };
 
        perf_config(perf_report_config, &report);
 
@@ -874,9 +904,11 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                perf_hpp__init();
        }
 
+       file.path  = input_name;
+       file.force = report.force;
+
 repeat:
-       session = perf_session__new(input_name, O_RDONLY,
-                                   report.force, false, &report.tool);
+       session = perf_session__new(&file, false, &report.tool);
        if (session == NULL)
                return -ENOMEM;
 
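
The use_browser == 2 path now resolves perf_evlist__gtk_browse_hists at run time via dlsym() instead of linking it in directly, so the GTK front end can live in a separately loaded module and its absence degrades to an error message (perf_gtk_handle is presumably the dlopen() handle, set up elsewhere in the tree). The bare dlopen/dlsym shape, with a hypothetical library and symbol; link with -ldl:

    #include <dlfcn.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical optional UI module. */
            void *handle = dlopen("libperf-gtk.so", RTLD_LAZY);
            void (*browse)(const char *);

            if (!handle) {
                    fprintf(stderr, "GTK browser not found: %s\n", dlerror());
                    return 1;               /* fall back, don't crash */
            }

            /* Cast through the object pointer returned by dlsym(). */
            browse = (void (*)(const char *))dlsym(handle, "browse_hists");
            if (!browse) {
                    fprintf(stderr, "symbol missing: %s\n", dlerror());
                    dlclose(handle);
                    return 1;
            }

            browse("perf.data");
            dlclose(handle);
            return 0;
    }
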
index d8c51b2f263f7475da99a28df87cad3d1f6bb4d2..5a46b102eb081416d8c9af35f855a7b1b09e2e5a 100644 (file)
@@ -1446,8 +1446,12 @@ static int perf_sched__read_events(struct perf_sched *sched,
                { "sched:sched_migrate_task", process_sched_migrate_task_event, },
        };
        struct perf_session *session;
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
+       session = perf_session__new(&file, false, &sched->tool);
        if (session == NULL) {
                pr_debug("No Memory for session\n");
                return -1;
index 9c333ff3dfeb3de716eac13d48d7364e998bb50d..27de6068049d053426f0a1d918361321b6b6cbec 100644 (file)
@@ -15,6 +15,7 @@
 #include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/sort.h"
+#include "util/data.h"
 #include <linux/bitmap.h>
 
 static char const              *script_name;
@@ -409,7 +410,9 @@ static void print_sample_bts(union perf_event *event,
        printf(" => ");
 
        /* print branch_to information */
-       if (PRINT_FIELD(ADDR))
+       if (PRINT_FIELD(ADDR) ||
+           ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
+            !output[attr->type].user_set))
                print_sample_addr(event, sample, machine, thread, attr);
 
        printf("\n");
@@ -1113,10 +1116,14 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
        char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN];
        DIR *scripts_dir, *lang_dir;
        struct perf_session *session;
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
        char *temp;
        int i = 0;
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false, NULL);
+       session = perf_session__new(&file, false, NULL);
        if (!session)
                return -1;
 
@@ -1317,12 +1324,17 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                "perf script [<options>] <top-script> [script-args]",
                NULL
        };
+       struct perf_data_file file = {
+               .mode = PERF_DATA_MODE_READ,
+       };
 
        setup_scripting();
 
        argc = parse_options(argc, argv, options, script_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
 
+       file.path = input_name;
+
        if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
                rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
                if (!rec_script_path)
@@ -1486,8 +1498,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
        if (!script_name)
                setup_pager();
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false,
-                                   &perf_script);
+       session = perf_session__new(&file, false, &perf_script);
        if (session == NULL)
                return -ENOMEM;
 
@@ -1514,7 +1525,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                        return -1;
                }
 
-               input = open(session->filename, O_RDONLY);      /* input_name */
+               input = open(file.path, O_RDONLY);      /* input_name */
                if (input < 0) {
                        perror("failed to open file");
                        return -1;
index 5098f144b92defd53e94f9f24184e3ade0672928..1a9c95d270aa2326f45ee0d2c036df10601f48a9 100644 (file)
@@ -46,6 +46,7 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/pmu.h"
 #include "util/event.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
@@ -70,6 +71,41 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
 static void print_counter(struct perf_evsel *counter, char *prefix);
 static void print_aggr(char *prefix);
 
+/* Default events used for perf stat -T */
+static const char * const transaction_attrs[] = {
+       "task-clock",
+       "{"
+       "instructions,"
+       "cycles,"
+       "cpu/cycles-t/,"
+       "cpu/tx-start/,"
+       "cpu/el-start/,"
+       "cpu/cycles-ct/"
+       "}"
+};
+
+/* More limited version when the CPU does not have all events. */
+static const char * const transaction_limited_attrs[] = {
+       "task-clock",
+       "{"
+       "instructions,"
+       "cycles,"
+       "cpu/cycles-t/,"
+       "cpu/tx-start/"
+       "}"
+};
+
+/* must match transaction_attrs and the beginning limited_attrs */
+enum {
+       T_TASK_CLOCK,
+       T_INSTRUCTIONS,
+       T_CYCLES,
+       T_CYCLES_IN_TX,
+       T_TRANSACTION_START,
+       T_ELISION_START,
+       T_CYCLES_IN_TX_CP,
+};
+
 static struct perf_evlist      *evsel_list;
 
 static struct perf_target      target = {
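
perf stat's new transaction mode carries two candidate event lists, a full set including the elision events and a limited one for CPUs without them, and the enum just above must stay in step with the order in which those events are parsed into evsel_list. Later in this diff, add_default_attributes() chooses between the lists by probing the PMU with pmu_have_event("cpu", "cycles-ct") and pmu_have_event("cpu", "el-start"). That capability-based fallback, condensed (probe stubbed here; the real helper consults the PMU's exported event aliases):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Stub: pretend the PMU lacks the elision events. */
    static bool pmu_have_event(const char *pmu, const char *name)
    {
            (void)pmu;
            return strcmp(name, "el-start") != 0;
    }

    static const char * const full_attrs[]    = { "task-clock",
            "{instructions,cycles,cpu/cycles-t/,cpu/tx-start/,"
            "cpu/el-start/,cpu/cycles-ct/}" };
    static const char * const limited_attrs[] = { "task-clock",
            "{instructions,cycles,cpu/cycles-t/,cpu/tx-start/}" };

    int main(void)
    {
            const char * const *attrs;
            size_t i, n;

            if (pmu_have_event("cpu", "cycles-ct") &&
                pmu_have_event("cpu", "el-start")) {
                    attrs = full_attrs;
                    n = sizeof(full_attrs) / sizeof(full_attrs[0]);
            } else {
                    attrs = limited_attrs;  /* CPU lacks some events */
                    n = sizeof(limited_attrs) / sizeof(limited_attrs[0]);
            }

            for (i = 0; i < n; i++)
                    printf("parse_events: %s\n", attrs[i]);
            return 0;
    }
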
@@ -90,6 +126,7 @@ static enum aggr_mode                aggr_mode                       = AGGR_GLOBAL;
 static volatile pid_t          child_pid                       = -1;
 static bool                    null_run                        =  false;
 static int                     detailed_run                    =  0;
+static bool                    transaction_run;
 static bool                    big_num                         =  true;
 static int                     big_num_opt                     =  -1;
 static const char              *csv_sep                        = NULL;
@@ -214,7 +251,10 @@ static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
 static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
 static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
 static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
 static struct stats walltime_nsecs_stats;
+static struct stats runtime_transaction_stats[MAX_NR_CPUS];
+static struct stats runtime_elision_stats[MAX_NR_CPUS];
 
 static void perf_stat__reset_stats(struct perf_evlist *evlist)
 {
@@ -236,6 +276,11 @@ static void perf_stat__reset_stats(struct perf_evlist *evlist)
        memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
        memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
        memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
+       memset(runtime_cycles_in_tx_stats, 0,
+                       sizeof(runtime_cycles_in_tx_stats));
+       memset(runtime_transaction_stats, 0,
+               sizeof(runtime_transaction_stats));
+       memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
        memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
 }
 
@@ -274,6 +319,29 @@ static inline int nsec_counter(struct perf_evsel *evsel)
        return 0;
 }
 
+static struct perf_evsel *nth_evsel(int n)
+{
+       static struct perf_evsel **array;
+       static int array_len;
+       struct perf_evsel *ev;
+       int j;
+
+       /* Assumes this only called when evsel_list does not change anymore. */
+       if (!array) {
+               list_for_each_entry(ev, &evsel_list->entries, node)
+                       array_len++;
+               array = malloc(array_len * sizeof(void *));
+               if (!array)
+                       exit(ENOMEM);
+               j = 0;
+               list_for_each_entry(ev, &evsel_list->entries, node)
+                       array[j++] = ev;
+       }
+       if (n < array_len)
+               return array[n];
+       return NULL;
+}
+
 /*
  * Update various tracking values we maintain to print
  * more semantic information such as miss/hit ratios,
@@ -285,6 +353,15 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
                update_stats(&runtime_nsecs_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
                update_stats(&runtime_cycles_stats[0], count[0]);
+       else if (transaction_run &&
+                perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
+               update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
+       else if (transaction_run &&
+                perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
+               update_stats(&runtime_transaction_stats[0], count[0]);
+       else if (transaction_run &&
+                perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
+               update_stats(&runtime_elision_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
                update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -629,10 +706,13 @@ static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 {
        double msecs = avg / 1e6;
        const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s";
+       char name[25];
 
        aggr_printout(evsel, cpu, nr);
 
-       fprintf(output, fmt, msecs, csv_sep, perf_evsel__name(evsel));
+       scnprintf(name, sizeof(name), "%s%s",
+                 perf_evsel__name(evsel), csv_output ? "" : " (msec)");
+       fprintf(output, fmt, msecs, csv_sep, name);
 
        if (evsel->cgrp)
                fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
@@ -828,7 +908,7 @@ static void print_ll_cache_misses(int cpu,
 
 static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 {
-       double total, ratio = 0.0;
+       double total, ratio = 0.0, total2;
        const char *fmt;
 
        if (csv_output)
@@ -853,11 +933,10 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 
        if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
                total = avg_stats(&runtime_cycles_stats[cpu]);
-               if (total)
+               if (total) {
                        ratio = avg / total;
-
-               fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
-
+                       fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
+               }
                total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
                total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
 
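
This hunk moves the "insns per cycle" print inside the if (total) guard: with a zero cycle count the old code still printed the line, showing a meaningless 0.00. The transaction metrics added below follow the same guard-before-divide discipline. The pattern in isolation:

    #include <stdio.h>

    /* Print a derived metric only when its denominator is meaningful.
     * Mirrors the guarded ratio printing in perf stat's abs_printout(). */
    static void print_insns_per_cycle(FILE *out, double insns, double cycles)
    {
            if (cycles) {
                    double ratio = insns / cycles;

                    fprintf(out, " #   %5.2f  insns per cycle\n", ratio);
            }
            /* cycles == 0: print nothing rather than a bogus 0.00 */
    }

    int main(void)
    {
            print_insns_per_cycle(stdout, 2.1e9, 1.0e9);    /* ~2.10 */
            print_insns_per_cycle(stdout, 2.1e9, 0.0);      /* silent */
            return 0;
    }
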
@@ -920,10 +999,47 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
+               if (total) {
+                       ratio = avg / total;
+                       fprintf(output, " # %8.3f GHz                    ", ratio);
+               }
+       } else if (transaction_run &&
+                  perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
+               total = avg_stats(&runtime_cycles_stats[cpu]);
+               if (total)
+                       fprintf(output,
+                               " #   %5.2f%% transactional cycles   ",
+                               100.0 * (avg / total));
+       } else if (transaction_run &&
+                  perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
+               total = avg_stats(&runtime_cycles_stats[cpu]);
+               total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+               if (total2 < avg)
+                       total2 = avg;
+               if (total)
+                       fprintf(output,
+                               " #   %5.2f%% aborted cycles         ",
+                               100.0 * ((total2-avg) / total));
+       } else if (transaction_run &&
+                  perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
+                  avg > 0 &&
+                  runtime_cycles_in_tx_stats[cpu].n != 0) {
+               total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+               if (total)
+                       ratio = total / avg;
+
+               fprintf(output, " # %8.0f cycles / transaction   ", ratio);
+       } else if (transaction_run &&
+                  perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
+                  avg > 0 &&
+                  runtime_cycles_in_tx_stats[cpu].n != 0) {
+               total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
                if (total)
-                       ratio = 1.0 * avg / total;
+                       ratio = total / avg;
 
-               fprintf(output, " # %8.3f GHz                    ", ratio);
+               fprintf(output, " # %8.0f cycles / elision       ", ratio);
        } else if (runtime_nsecs_stats[cpu].n != 0) {
                char unit = 'M';
 
@@ -1116,7 +1232,11 @@ static void print_stat(int argc, const char **argv)
        if (!csv_output) {
                fprintf(output, "\n");
                fprintf(output, " Performance counter stats for ");
-               if (!perf_target__has_task(&target)) {
+               if (target.system_wide)
+                       fprintf(output, "\'system wide");
+               else if (target.cpu_list)
+                       fprintf(output, "\'CPU(s) %s", target.cpu_list);
+               else if (!perf_target__has_task(&target)) {
                        fprintf(output, "\'%s", argv[0]);
                        for (i = 1; i < argc; i++)
                                fprintf(output, " %s", argv[i]);
@@ -1237,6 +1357,16 @@ static int perf_stat_init_aggr_mode(void)
        return 0;
 }
 
+static int setup_events(const char * const *attrs, unsigned len)
+{
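+       /* Parse each attribute string, adding the resulting events to evsel_list. */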
+       unsigned i;
+
+       for (i = 0; i < len; i++) {
+               if (parse_events(evsel_list, attrs[i]))
+                       return -1;
+       }
+       return 0;
+}
 
 /*
  * Add default attributes, if there were no attributes specified or
@@ -1355,6 +1485,22 @@ static int add_default_attributes(void)
        if (null_run)
                return 0;
 
+       if (transaction_run) {
+               int err;
+               if (pmu_have_event("cpu", "cycles-ct") &&
+                   pmu_have_event("cpu", "el-start"))
+                       err = setup_events(transaction_attrs,
+                                       ARRAY_SIZE(transaction_attrs));
+               else
+                       err = setup_events(transaction_limited_attrs,
+                                ARRAY_SIZE(transaction_limited_attrs));
+               if (err < 0) {
+                       fprintf(stderr, "Cannot set up transaction events\n");
+                       return -1;
+               }
+               return 0;
+       }
+
        if (!evsel_list->nr_entries) {
                if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
                        return -1;
@@ -1389,6 +1535,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        int output_fd = 0;
        const char *output_name = NULL;
        const struct option options[] = {
+       OPT_BOOLEAN('T', "transaction", &transaction_run,
+                   "hardware transaction statistics"),
        OPT_CALLBACK('e', "event", &evsel_list, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
@@ -1514,8 +1662,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        } else if (big_num_opt == 0) /* User passed --no-big-num */
                big_num = false;
 
-       if (!argc && !perf_target__has_task(&target))
+       if (!argc && perf_target__none(&target))
                usage_with_options(stat_usage, options);
+
        if (run_count < 0) {
                usage_with_options(stat_usage, options);
        } else if (run_count == 0) {
index c2e02319347abd86c74f6ab0b44c37106c0fb238..e11c61d9bda4dee188719f96b2fdbb00e5d6140f 100644 (file)
@@ -36,6 +36,7 @@
 #include "util/session.h"
 #include "util/svghelper.h"
 #include "util/tool.h"
+#include "util/data.h"
 
 #define SUPPORT_OLD_POWER_EVENTS 1
 #define PWR_EVENT_EXIT -1
@@ -990,8 +991,13 @@ static int __cmd_timechart(const char *output_name)
                { "power:power_frequency",      process_sample_power_frequency },
 #endif
        };
-       struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-                                                        0, false, &perf_timechart);
+       struct perf_data_file file = {
+               .path = input_name,
+               .mode = PERF_DATA_MODE_READ,
+       };
+
+       struct perf_session *session = perf_session__new(&file, false,
+                                                        &perf_timechart);
        int ret = -EINVAL;
 
        if (session == NULL)
index 212214162bb2820286694417035e470e1d465e76..386d83324a8d0d519800c8b3cba7a6b5defa95f5 100644 (file)
@@ -247,9 +247,8 @@ static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel,
 
        pthread_mutex_lock(&evsel->hists.lock);
        he = __hists__add_entry(&evsel->hists, al, NULL, sample->period,
-                               sample->weight);
+                               sample->weight, sample->transaction);
        pthread_mutex_unlock(&evsel->hists.lock);
-
        if (he == NULL)
                return NULL;
 
@@ -771,7 +770,8 @@ static void perf_event__process_sample(struct perf_tool *tool,
                    sample->callchain) {
                        err = machine__resolve_callchain(machine, evsel,
                                                         al.thread, sample,
-                                                        &parent, &al);
+                                                        &parent, &al,
+                                                        top->max_stack);
                        if (err)
                                return;
                }
@@ -930,11 +930,8 @@ static int __cmd_top(struct perf_top *top)
        struct perf_record_opts *opts = &top->record_opts;
        pthread_t thread;
        int ret;
-       /*
-        * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
-        * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
-        */
-       top->session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
+
+       top->session = perf_session__new(NULL, false, NULL);
        if (top->session == NULL)
                return -ENOMEM;
 
@@ -1051,10 +1048,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                        .user_freq      = UINT_MAX,
                        .user_interval  = ULLONG_MAX,
                        .freq           = 4000, /* 4 KHz */
-                       .target              = {
+                       .target         = {
                                .uses_mmap   = true,
                        },
                },
+               .max_stack           = PERF_MAX_STACK_DEPTH,
                .sym_pcnt_filter     = 5,
        };
        struct perf_record_opts *opts = &top.record_opts;
@@ -1074,10 +1072,13 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
                    "list of cpus to monitor"),
        OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                   "file", "vmlinux pathname"),
+       OPT_BOOLEAN(0, "ignore-vmlinux", &symbol_conf.ignore_vmlinux,
+                   "don't load vmlinux even if found"),
        OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
                    "hide kernel symbols"),
-       OPT_UINTEGER('m', "mmap-pages", &opts->mmap_pages,
-                    "number of mmap data pages"),
+       OPT_CALLBACK('m', "mmap-pages", &opts->mmap_pages, "pages",
+                    "number of mmap data pages",
+                    perf_evlist__parse_mmap_pages),
        OPT_INTEGER('r', "realtime", &top.realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
        OPT_INTEGER('d', "delay", &top.delay_secs,
@@ -1103,12 +1104,16 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-                  "sort by key(s): pid, comm, dso, symbol, parent, weight, local_weight"),
+                  "sort by key(s): pid, comm, dso, symbol, parent, weight, local_weight,"
+                  " abort, in_tx, transaction"),
        OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
                    "Show a column with the number of samples"),
        OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts,
                             "mode[,dump_size]", record_callchain_help,
                             &parse_callchain_opt, "fp"),
+       OPT_INTEGER(0, "max-stack", &top.max_stack,
+                   "Set the maximum stack depth when parsing the callchain. "
+                   "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
        OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
                   "ignore callees of these functions in call graphs",
                   report_parse_ignore_callees_opt),
index 71aa3e35406bd064e87044d2ae71597a5f117092..fa620bc1db69bf9bd9c4ee059e1e48a8d07a01a0 100644 (file)
 #include "util/strlist.h"
 #include "util/intlist.h"
 #include "util/thread_map.h"
+#include "util/stat.h"
 
 #include <libaudit.h>
 #include <stdlib.h>
+#include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <linux/futex.h>
 
 # define MADV_UNMERGEABLE      13
 #endif
 
-static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
-                                        unsigned long arg,
-                                        u8 arg_idx __maybe_unused,
-                                        u8 *arg_mask __maybe_unused)
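+/*
+ * All state a syscall argument formatter may need: the raw value, the
+ * thread/trace it belongs to, an optional parameter (e.g. a strarray),
+ * the argument index and a mask of arguments to suppress.
+ */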
+struct syscall_arg {
+       unsigned long val;
+       struct thread *thread;
+       struct trace  *trace;
+       void          *parm;
+       u8            idx;
+       u8            mask;
+};
+
+struct strarray {
+       int         offset;
+       int         nr_entries;
+       const char **entries;
+};
+
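+/*
+ * DEFINE_STRARRAY declares a value -> name table; the _OFFSET variant
+ * handles arrays whose first entry maps to a non-zero value.
+ */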
+#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
+       .nr_entries = ARRAY_SIZE(array), \
+       .entries = array, \
+}
+
+#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
+       .offset     = off, \
+       .nr_entries = ARRAY_SIZE(array), \
+       .entries = array, \
+}
+
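+/* Translate an integer argument to its strarray name, or fall back to intfmt. */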
+static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
+                                               const char *intfmt,
+                                               struct syscall_arg *arg)
 {
-       return scnprintf(bf, size, "%#lx", arg);
+       struct strarray *sa = arg->parm;
+       int idx = arg->val - sa->offset;
+
+       if (idx < 0 || idx >= sa->nr_entries)
+               return scnprintf(bf, size, intfmt, arg->val);
+
+       return scnprintf(bf, size, "%s", sa->entries[idx]);
 }
 
-#define SCA_HEX syscall_arg__scnprintf_hex
+static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
+                                             struct syscall_arg *arg)
+{
+       return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
+}
 
-static size_t syscall_arg__scnprintf_whence(char *bf, size_t size,
-                                           unsigned long arg,
-                                           u8 arg_idx __maybe_unused,
-                                           u8 *arg_mask __maybe_unused)
+#define SCA_STRARRAY syscall_arg__scnprintf_strarray
+
+static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
+                                                struct syscall_arg *arg)
 {
-       int whence = arg;
+       return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
+}
 
-       switch (whence) {
-#define P_WHENCE(n) case SEEK_##n: return scnprintf(bf, size, #n)
-       P_WHENCE(SET);
-       P_WHENCE(CUR);
-       P_WHENCE(END);
-#ifdef SEEK_DATA
-       P_WHENCE(DATA);
-#endif
-#ifdef SEEK_HOLE
-       P_WHENCE(HOLE);
-#endif
-#undef P_WHENCE
-       default: break;
-       }
+#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
+
+static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
+                                       struct syscall_arg *arg);
+
+#define SCA_FD syscall_arg__scnprintf_fd
+
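+/* Like SCA_FD, but prints "CWD" for the special AT_FDCWD value. */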
+static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
+                                          struct syscall_arg *arg)
+{
+       int fd = arg->val;
+
+       if (fd == AT_FDCWD)
+               return scnprintf(bf, size, "CWD");
+
+       return syscall_arg__scnprintf_fd(bf, size, arg);
+}
 
-       return scnprintf(bf, size, "%#x", whence);
+#define SCA_FDAT syscall_arg__scnprintf_fd_at
+
+static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
+                                             struct syscall_arg *arg);
+
+#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
+
+static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
+                                        struct syscall_arg *arg)
+{
+       return scnprintf(bf, size, "%#lx", arg->val);
 }
 
-#define SCA_WHENCE syscall_arg__scnprintf_whence
+#define SCA_HEX syscall_arg__scnprintf_hex
 
 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
-                                              unsigned long arg,
-                                              u8 arg_idx __maybe_unused,
-                                              u8 *arg_mask __maybe_unused)
+                                              struct syscall_arg *arg)
 {
-       int printed = 0, prot = arg;
+       int printed = 0, prot = arg->val;
 
        if (prot == PROT_NONE)
                return scnprintf(bf, size, "NONE");
@@ -104,10 +153,9 @@ static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
 
 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
-                                               unsigned long arg, u8 arg_idx __maybe_unused,
-                                               u8 *arg_mask __maybe_unused)
+                                               struct syscall_arg *arg)
 {
-       int printed = 0, flags = arg;
+       int printed = 0, flags = arg->val;
 
 #define        P_MMAP_FLAG(n) \
        if (flags & MAP_##n) { \
@@ -148,10 +196,9 @@ static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
 
 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
-                                                     unsigned long arg, u8 arg_idx __maybe_unused,
-                                                     u8 *arg_mask __maybe_unused)
+                                                     struct syscall_arg *arg)
 {
-       int behavior = arg;
+       int behavior = arg->val;
 
        switch (behavior) {
 #define        P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
@@ -190,8 +237,38 @@ static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
 
 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
 
-static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, unsigned long arg,
-                                             u8 arg_idx __maybe_unused, u8 *arg_mask)
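+/* Decode flock() operation bits (LOCK_SH, LOCK_EX, ...), leftovers in hex. */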
+static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
+                                          struct syscall_arg *arg)
+{
+       int printed = 0, op = arg->val;
+
+       if (op == 0)
+               return scnprintf(bf, size, "NONE");
+#define        P_CMD(cmd) \
+       if ((op & LOCK_##cmd) == LOCK_##cmd) { \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
+               op &= ~LOCK_##cmd; \
+       }
+
+       P_CMD(SH);
+       P_CMD(EX);
+       P_CMD(NB);
+       P_CMD(UN);
+       P_CMD(MAND);
+       P_CMD(RW);
+       P_CMD(READ);
+       P_CMD(WRITE);
+#undef P_CMD
+
+       if (op)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
+
+       return printed;
+}
+
+#define SCA_FLOCK syscall_arg__scnprintf_flock
+
+static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
 {
        enum syscall_futex_args {
                SCF_UADDR   = (1 << 0),
@@ -201,24 +278,24 @@ static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, unsigned lo
                SCF_UADDR2  = (1 << 4),
                SCF_VAL3    = (1 << 5),
        };
-       int op = arg;
+       int op = arg->val;
        int cmd = op & FUTEX_CMD_MASK;
        size_t printed = 0;
 
        switch (cmd) {
 #define        P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
-       P_FUTEX_OP(WAIT);           *arg_mask |= SCF_VAL3|SCF_UADDR2;             break;
-       P_FUTEX_OP(WAKE);           *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
-       P_FUTEX_OP(FD);             *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
-       P_FUTEX_OP(REQUEUE);        *arg_mask |= SCF_VAL3|SCF_TIMEOUT;            break;
-       P_FUTEX_OP(CMP_REQUEUE);    *arg_mask |= SCF_TIMEOUT;                     break;
-       P_FUTEX_OP(CMP_REQUEUE_PI); *arg_mask |= SCF_TIMEOUT;                     break;
+       P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
+       P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+       P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+       P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
+       P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
+       P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
        P_FUTEX_OP(WAKE_OP);                                                      break;
-       P_FUTEX_OP(LOCK_PI);        *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
-       P_FUTEX_OP(UNLOCK_PI);      *arg_mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
-       P_FUTEX_OP(TRYLOCK_PI);     *arg_mask |= SCF_VAL3|SCF_UADDR2;             break;
-       P_FUTEX_OP(WAIT_BITSET);    *arg_mask |= SCF_UADDR2;                      break;
-       P_FUTEX_OP(WAKE_BITSET);    *arg_mask |= SCF_UADDR2;                      break;
+       P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+       P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+       P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
+       P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
+       P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
        P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
        default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
        }
@@ -234,14 +311,194 @@ static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, unsigned lo
 
 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
 
+static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
+static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
+
+static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
+static DEFINE_STRARRAY(itimers);
+
+static const char *whences[] = { "SET", "CUR", "END",
+#ifdef SEEK_DATA
+"DATA",
+#endif
+#ifdef SEEK_HOLE
+"HOLE",
+#endif
+};
+static DEFINE_STRARRAY(whences);
+
+static const char *fcntl_cmds[] = {
+       "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
+       "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
+       "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
+       "F_GETOWNER_UIDS",
+};
+static DEFINE_STRARRAY(fcntl_cmds);
+
+static const char *rlimit_resources[] = {
+       "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
+       "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
+       "RTTIME",
+};
+static DEFINE_STRARRAY(rlimit_resources);
+
+static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
+static DEFINE_STRARRAY(sighow);
+
+static const char *clockid[] = {
+       "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
+       "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
+};
+static DEFINE_STRARRAY(clockid);
+
+static const char *socket_families[] = {
+       "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
+       "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
+       "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
+       "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
+       "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
+       "ALG", "NFC", "VSOCK",
+};
+static DEFINE_STRARRAY(socket_families);
+
+#ifndef SOCK_TYPE_MASK
+#define SOCK_TYPE_MASK 0xf
+#endif
+
+static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
+                                                     struct syscall_arg *arg)
+{
+       size_t printed;
+       int type = arg->val,
+           flags = type & ~SOCK_TYPE_MASK;
+
+       type &= SOCK_TYPE_MASK;
+       /*
+        * Can't use a strarray, MIPS may override for ABI reasons.
+        */
+       switch (type) {
+#define        P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
+       P_SK_TYPE(STREAM);
+       P_SK_TYPE(DGRAM);
+       P_SK_TYPE(RAW);
+       P_SK_TYPE(RDM);
+       P_SK_TYPE(SEQPACKET);
+       P_SK_TYPE(DCCP);
+       P_SK_TYPE(PACKET);
+#undef P_SK_TYPE
+       default:
+               printed = scnprintf(bf, size, "%#x", type);
+       }
+
+#define        P_SK_FLAG(n) \
+       if (flags & SOCK_##n) { \
+               printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
+               flags &= ~SOCK_##n; \
+       }
+
+       P_SK_FLAG(CLOEXEC);
+       P_SK_FLAG(NONBLOCK);
+#undef P_SK_FLAG
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
+
+       return printed;
+}
+
+#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
+
+#ifndef MSG_PROBE
+#define MSG_PROBE           0x10
+#endif
+#ifndef MSG_WAITFORONE
+#define MSG_WAITFORONE 0x10000
+#endif
+#ifndef MSG_SENDPAGE_NOTLAST
+#define MSG_SENDPAGE_NOTLAST 0x20000
+#endif
+#ifndef MSG_FASTOPEN
+#define MSG_FASTOPEN        0x20000000
+#endif
+
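+/* Decode send/recv MSG_* flag bits, printing unknown leftovers in hex. */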
+static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
+                                              struct syscall_arg *arg)
+{
+       int printed = 0, flags = arg->val;
+
+       if (flags == 0)
+               return scnprintf(bf, size, "NONE");
+#define        P_MSG_FLAG(n) \
+       if (flags & MSG_##n) { \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               flags &= ~MSG_##n; \
+       }
+
+       P_MSG_FLAG(OOB);
+       P_MSG_FLAG(PEEK);
+       P_MSG_FLAG(DONTROUTE);
+       P_MSG_FLAG(TRYHARD);
+       P_MSG_FLAG(CTRUNC);
+       P_MSG_FLAG(PROBE);
+       P_MSG_FLAG(TRUNC);
+       P_MSG_FLAG(DONTWAIT);
+       P_MSG_FLAG(EOR);
+       P_MSG_FLAG(WAITALL);
+       P_MSG_FLAG(FIN);
+       P_MSG_FLAG(SYN);
+       P_MSG_FLAG(CONFIRM);
+       P_MSG_FLAG(RST);
+       P_MSG_FLAG(ERRQUEUE);
+       P_MSG_FLAG(NOSIGNAL);
+       P_MSG_FLAG(MORE);
+       P_MSG_FLAG(WAITFORONE);
+       P_MSG_FLAG(SENDPAGE_NOTLAST);
+       P_MSG_FLAG(FASTOPEN);
+       P_MSG_FLAG(CMSG_CLOEXEC);
+#undef P_MSG_FLAG
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+       return printed;
+}
+
+#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
+
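+/* Render access() mode as R/W/X, or "F" when it is F_OK. */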
+static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
+                                                struct syscall_arg *arg)
+{
+       size_t printed = 0;
+       int mode = arg->val;
+
+       if (mode == F_OK) /* 0 */
+               return scnprintf(bf, size, "F");
+#define        P_MODE(n) \
+       if (mode & n##_OK) { \
+               printed += scnprintf(bf + printed, size - printed, "%s", #n); \
+               mode &= ~n##_OK; \
+       }
+
+       P_MODE(R);
+       P_MODE(W);
+       P_MODE(X);
+#undef P_MODE
+
+       if (mode)
+               printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
+
+       return printed;
+}
+
+#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
+
 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
-                                              unsigned long arg,
-                                              u8 arg_idx, u8 *arg_mask)
+                                              struct syscall_arg *arg)
 {
-       int printed = 0, flags = arg;
+       int printed = 0, flags = arg->val;
 
        if (!(flags & O_CREAT))
-               *arg_mask |= 1 << (arg_idx + 1); /* Mask the mode parm */
+               arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
 
        if (flags == 0)
                return scnprintf(bf, size, "RDONLY");
@@ -291,32 +548,225 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 
 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
 
+static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
+                                                  struct syscall_arg *arg)
+{
+       int printed = 0, flags = arg->val;
+
+       if (flags == 0)
+               return scnprintf(bf, size, "NONE");
+#define        P_FLAG(n) \
+       if (flags & EFD_##n) { \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               flags &= ~EFD_##n; \
+       }
+
+       P_FLAG(SEMAPHORE);
+       P_FLAG(CLOEXEC);
+       P_FLAG(NONBLOCK);
+#undef P_FLAG
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+       return printed;
+}
+
+#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
+
+static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
+                                               struct syscall_arg *arg)
+{
+       int printed = 0, flags = arg->val;
+
+#define        P_FLAG(n) \
+       if (flags & O_##n) { \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               flags &= ~O_##n; \
+       }
+
+       P_FLAG(CLOEXEC);
+       P_FLAG(NONBLOCK);
+#undef P_FLAG
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+       return printed;
+}
+
+#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
+
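+/* Map a signal number to its SIG* name, falling back to hex. */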
+static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
+{
+       int sig = arg->val;
+
+       switch (sig) {
+#define        P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
+       P_SIGNUM(HUP);
+       P_SIGNUM(INT);
+       P_SIGNUM(QUIT);
+       P_SIGNUM(ILL);
+       P_SIGNUM(TRAP);
+       P_SIGNUM(ABRT);
+       P_SIGNUM(BUS);
+       P_SIGNUM(FPE);
+       P_SIGNUM(KILL);
+       P_SIGNUM(USR1);
+       P_SIGNUM(SEGV);
+       P_SIGNUM(USR2);
+       P_SIGNUM(PIPE);
+       P_SIGNUM(ALRM);
+       P_SIGNUM(TERM);
+       P_SIGNUM(STKFLT);
+       P_SIGNUM(CHLD);
+       P_SIGNUM(CONT);
+       P_SIGNUM(STOP);
+       P_SIGNUM(TSTP);
+       P_SIGNUM(TTIN);
+       P_SIGNUM(TTOU);
+       P_SIGNUM(URG);
+       P_SIGNUM(XCPU);
+       P_SIGNUM(XFSZ);
+       P_SIGNUM(VTALRM);
+       P_SIGNUM(PROF);
+       P_SIGNUM(WINCH);
+       P_SIGNUM(IO);
+       P_SIGNUM(PWR);
+       P_SIGNUM(SYS);
+       default: break;
+       }
+
+       return scnprintf(bf, size, "%#x", sig);
+}
+
+#define SCA_SIGNUM syscall_arg__scnprintf_signum
+
+#define TCGETS         0x5401
+
+static const char *tioctls[] = {
+       "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
+       "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
+       "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
+       "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
+       "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
+       "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
+       "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
+       "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
+       "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
+       "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
+       "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
+       [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
+       "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
+       "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
+       "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
+};
+
+static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
+
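+/* Hook strarray 'array' up to syscall_fmt argument slot 'arg' ('name' is documentation only). */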
+#define STRARRAY(arg, name, array) \
+         .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
+         .arg_parm      = { [arg] = &strarray__##array, }
+
 static struct syscall_fmt {
        const char *name;
        const char *alias;
-       size_t     (*arg_scnprintf[6])(char *bf, size_t size, unsigned long arg, u8 arg_idx, u8 *arg_mask);
+       size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
+       void       *arg_parm[6];
        bool       errmsg;
        bool       timeout;
        bool       hexret;
 } syscall_fmts[] = {
-       { .name     = "access",     .errmsg = true, },
+       { .name     = "access",     .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
        { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
        { .name     = "brk",        .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
-       { .name     = "mmap",       .hexret = true, },
+       { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
+       { .name     = "close",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, 
        { .name     = "connect",    .errmsg = true, },
-       { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
-       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
+       { .name     = "dup",        .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "dup2",       .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "dup3",       .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
+       { .name     = "eventfd2",   .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
+       { .name     = "faccessat",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "fadvise64",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fallocate",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fchdir",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fchmod",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fchmodat",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
+       { .name     = "fchown",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fchownat",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
+       { .name     = "fcntl",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */
+                            [1] = SCA_STRARRAY, /* cmd */ },
+         .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
+       { .name     = "fdatasync",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "flock",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */
+                            [1] = SCA_FLOCK, /* cmd */ }, },
+       { .name     = "fsetxattr",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
+       { .name     = "fstatfs",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "fsync",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "ftruncate", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
        { .name     = "futex",      .errmsg = true,
          .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
+       { .name     = "futimesat", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
+       { .name     = "getdents",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "getdents64", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
+       { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
        { .name     = "ioctl",      .errmsg = true,
-         .arg_scnprintf = { [2] = SCA_HEX, /* arg */ }, },
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ 
+                            [1] = SCA_STRHEXARRAY, /* cmd */
+                            [2] = SCA_HEX, /* arg */ },
+         .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
+       { .name     = "kill",       .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
+       { .name     = "linkat",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
        { .name     = "lseek",      .errmsg = true,
-         .arg_scnprintf = { [2] = SCA_WHENCE, /* whence */ }, },
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */
+                            [2] = SCA_STRARRAY, /* whence */ },
+         .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
        { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
        { .name     = "madvise",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX,      /* start */
                             [2] = SCA_MADV_BHV, /* behavior */ }, },
+       { .name     = "mkdirat",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
+       { .name     = "mknodat",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 
+       { .name     = "mlock",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+       { .name     = "mlockall",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "mmap",       .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
                             [2] = SCA_MMAP_PROT, /* prot */
@@ -327,24 +777,91 @@ static struct syscall_fmt {
        { .name     = "mremap",     .hexret = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */
                             [4] = SCA_HEX, /* new_addr */ }, },
+       { .name     = "munlock",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
        { .name     = "munmap",     .errmsg = true,
          .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+       { .name     = "name_to_handle_at", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
+       { .name     = "newfstatat", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
        { .name     = "open",       .errmsg = true,
          .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "open_by_handle_at", .errmsg = true,
-         .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
+                            [2] = SCA_OPEN_FLAGS, /* flags */ }, },
        { .name     = "openat",     .errmsg = true,
-         .arg_scnprintf = { [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
+                            [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+       { .name     = "pipe2",      .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
        { .name     = "poll",       .errmsg = true, .timeout = true, },
        { .name     = "ppoll",      .errmsg = true, .timeout = true, },
-       { .name     = "pread",      .errmsg = true, .alias = "pread64", },
-       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
-       { .name     = "read",       .errmsg = true, },
-       { .name     = "recvfrom",   .errmsg = true, },
+       { .name     = "pread",      .errmsg = true, .alias = "pread64",
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "preadv",     .errmsg = true, .alias = "pread",
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
+       { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "pwritev",    .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "read",       .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "readlinkat", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
+       { .name     = "readv",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "recvfrom",   .errmsg = true,
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "recvmmsg",   .errmsg = true,
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "recvmsg",    .errmsg = true,
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "renameat",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
+       { .name     = "rt_sigaction", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
+       { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
+       { .name     = "rt_sigqueueinfo", .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
+       { .name     = "rt_tgsigqueueinfo", .errmsg = true,
+         .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "select",     .errmsg = true, .timeout = true, },
-       { .name     = "socket",     .errmsg = true, },
+       { .name     = "sendmmsg",    .errmsg = true,
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "sendmsg",    .errmsg = true,
+         .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "sendto",     .errmsg = true,
+         .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+       { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
+       { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
+       { .name     = "shutdown",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "socket",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
+                            [1] = SCA_SK_TYPE, /* type */ },
+         .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
+       { .name     = "socketpair", .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
+                            [1] = SCA_SK_TYPE, /* type */ },
+         .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
+       { .name     = "symlinkat",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 
+       { .name     = "tgkill",     .errmsg = true,
+         .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
+       { .name     = "tkill",      .errmsg = true,
+         .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
        { .name     = "uname",      .errmsg = true, .alias = "newuname", },
+       { .name     = "unlinkat",   .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+       { .name     = "utimensat",  .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
+       { .name     = "write",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
+       { .name     = "writev",     .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, 
 };
 
 static int syscall_fmt__cmp(const void *name, const void *fmtp)
@@ -364,8 +881,8 @@ struct syscall {
        const char          *name;
        bool                filtered;
        struct syscall_fmt  *fmt;
-       size_t              (**arg_scnprintf)(char *bf, size_t size,
-                                             unsigned long arg, u8 arg_idx, u8 *args_mask);
+       size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
+       void                **arg_parm;
 };
 
 static size_t fprintf_duration(unsigned long t, FILE *fp)
@@ -389,11 +906,24 @@ struct thread_trace {
        unsigned long     nr_events;
        char              *entry_str;
        double            runtime_ms;
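+       /* per-fd pathname cache; paths.max is the highest fd stored */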
+       struct {
+               int       max;
+               char      **table;
+       } paths;
+
+       struct intlist *syscall_stats;
 };
 
 static struct thread_trace *thread_trace__new(void)
 {
-       return zalloc(sizeof(struct thread_trace));
+       struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
+       struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
+
+       if (ttrace) {
+               ttrace->paths.max = -1;
+               ttrace->syscall_stats = intlist__new(NULL);
+       }
+       return ttrace;
 }
 
 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
@@ -421,26 +951,140 @@ fail:
 
 struct trace {
        struct perf_tool        tool;
-       int                     audit_machine;
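+       /* open_id caches the id of the "open" syscall for vfs_getname pairing */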
+       struct {
+               int             machine;
+               int             open_id;
+       }                       audit;
        struct {
                int             max;
                struct syscall  *table;
        } syscalls;
        struct perf_record_opts opts;
-       struct machine          host;
+       struct machine          *host;
        u64                     base_time;
+       bool                    full_time;
        FILE                    *output;
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;
        bool                    not_ev_qualifier;
+       bool                    live;
+       const char              *last_vfs_getname;
        struct intlist          *tid_list;
        struct intlist          *pid_list;
        bool                    sched;
        bool                    multiple_threads;
+       bool                    summary;
+       bool                    show_comm;
+       bool                    show_tool_stats;
        double                  duration_filter;
        double                  runtime_ms;
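+       /* how often fd paths were resolved via vfs_getname vs. /proc */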
+       struct {
+               u64             vfs_getname, proc_getname;
+       } stats;
 };
 
+static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
+{
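+       /* Grow the per-thread fd -> path table as needed, then cache the path. */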
+       struct thread_trace *ttrace = thread->priv;
+
+       if (fd > ttrace->paths.max) {
+               char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
+
+               if (npath == NULL)
+                       return -1;
+
+               if (ttrace->paths.max != -1) {
+                       memset(npath + ttrace->paths.max + 1, 0,
+                              (fd - ttrace->paths.max) * sizeof(char *));
+               } else {
+                       memset(npath, 0, (fd + 1) * sizeof(char *));
+               }
+
+               ttrace->paths.table = npath;
+               ttrace->paths.max   = fd;
+       }
+
+       ttrace->paths.table[fd] = strdup(pathname);
+
+       return ttrace->paths.table[fd] != NULL ? 0 : -1;
+}
+
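+/* Resolve an fd to its path by reading the /proc/<pid>/fd/<fd> symlink. */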
+static int thread__read_fd_path(struct thread *thread, int fd)
+{
+       char linkname[PATH_MAX], pathname[PATH_MAX];
+       struct stat st;
+       int ret;
+
+       if (thread->pid_ == thread->tid) {
+               scnprintf(linkname, sizeof(linkname),
+                         "/proc/%d/fd/%d", thread->pid_, fd);
+       } else {
+               scnprintf(linkname, sizeof(linkname),
+                         "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
+       }
+
+       if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
+               return -1;
+
+       ret = readlink(linkname, pathname, sizeof(pathname));
+
+       if (ret < 0 || ret > st.st_size)
+               return -1;
+
+       pathname[ret] = '\0';
+       return trace__set_fd_pathname(thread, fd, pathname);
+}
+
+static const char *thread__fd_path(struct thread *thread, int fd,
+                                  struct trace *trace)
+{
+       struct thread_trace *ttrace = thread->priv;
+
+       if (ttrace == NULL)
+               return NULL;
+
+       if (fd < 0)
+               return NULL;
+
+       if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
+               if (!trace->live)
+                       return NULL;
+               ++trace->stats.proc_getname;
+               if (thread__read_fd_path(thread, fd))
+                       return NULL;
+       }
+
+       return ttrace->paths.table[fd];
+}
+
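+/* Print the fd number followed by the path it refers to, when known. */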
+static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
+                                       struct syscall_arg *arg)
+{
+       int fd = arg->val;
+       size_t printed = scnprintf(bf, size, "%d", fd);
+       const char *path = thread__fd_path(arg->thread, fd, arg->trace);
+
+       if (path)
+               printed += scnprintf(bf + printed, size - printed, "<%s>", path);
+
+       return printed;
+}
+
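+/*
+ * Print like SCA_FD, then drop the cached path: after close() the fd
+ * number may be reused for a different file.
+ */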
+static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
+                                             struct syscall_arg *arg)
+{
+       int fd = arg->val;
+       size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
+       struct thread_trace *ttrace = arg->thread->priv;
+
+       if (ttrace && fd >= 0 && fd <= ttrace->paths.max) {
+               free(ttrace->paths.table[fd]);
+               ttrace->paths.table[fd] = NULL;
+       }
+
+       return printed;
+}
+
 static bool trace__filter_duration(struct trace *trace, double t)
 {
        return t < (trace->duration_filter * NSEC_PER_MSEC);
@@ -454,10 +1098,12 @@ static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
 }
 
 static bool done = false;
+static bool interrupted = false;
 
-static void sig_handler(int sig __maybe_unused)
+static void sig_handler(int sig)
 {
        done = true;
+       interrupted = sig == SIGINT;
 }
 
 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
@@ -466,8 +1112,11 @@ static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thre
        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
        printed += fprintf_duration(duration, fp);
 
-       if (trace->multiple_threads)
+       if (trace->multiple_threads) {
+               if (trace->show_comm)
+                       printed += fprintf(fp, "%.14s/", thread->comm);
                printed += fprintf(fp, "%d ", thread->tid);
+       }
 
        return printed;
 }
@@ -506,16 +1155,17 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
        if (err)
                return err;
 
-       machine__init(&trace->host, "", HOST_KERNEL_ID);
-       machine__create_kernel_maps(&trace->host);
+       trace->host = machine__new_host();
+       if (trace->host == NULL)
+               return -ENOMEM;
 
        if (perf_target__has_task(&trace->opts.target)) {
                err = perf_event__synthesize_thread_map(&trace->tool, evlist->threads,
                                                        trace__tool_process,
-                                                       &trace->host);
+                                                       trace->host);
        } else {
                err = perf_event__synthesize_threads(&trace->tool, trace__tool_process,
-                                                    &trace->host);
+                                                    trace->host);
        }
 
        if (err)
@@ -533,6 +1183,9 @@ static int syscall__set_arg_fmts(struct syscall *sc)
        if (sc->arg_scnprintf == NULL)
                return -1;
 
+       if (sc->fmt)
+               sc->arg_parm = sc->fmt->arg_parm;
+
        for (field = sc->tp_format->format.fields->next; field; field = field->next) {
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
@@ -548,7 +1201,7 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 {
        char tp_name[128];
        struct syscall *sc;
-       const char *name = audit_syscall_to_name(id, trace->audit_machine);
+       const char *name = audit_syscall_to_name(id, trace->audit.machine);
 
        if (name == NULL)
                return -1;
@@ -603,32 +1256,52 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 }
 
 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
-                                     unsigned long *args)
+                                     unsigned long *args, struct trace *trace,
+                                     struct thread *thread)
 {
-       int i = 0;
        size_t printed = 0;
 
        if (sc->tp_format != NULL) {
                struct format_field *field;
-               u8 mask = 0, bit = 1;
+               u8 bit = 1;
+               struct syscall_arg arg = {
+                       .idx    = 0,
+                       .mask   = 0,
+                       .trace  = trace,
+                       .thread = thread,
+               };
 
                for (field = sc->tp_format->format.fields->next; field;
-                    field = field->next, ++i, bit <<= 1) {
-                       if (mask & bit)
+                    field = field->next, ++arg.idx, bit <<= 1) {
+                       if (arg.mask & bit)
+                               continue;
+                       /*
+                        * Suppress this argument if its value is zero and
+                        * we don't have a string associated with it in a
+                        * strarray.
+                        */
+                       if (args[arg.idx] == 0 &&
+                           !(sc->arg_scnprintf &&
+                             sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
+                             sc->arg_parm[arg.idx]))
                                continue;
 
                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
-
-                       if (sc->arg_scnprintf && sc->arg_scnprintf[i]) {
-                               printed += sc->arg_scnprintf[i](bf + printed, size - printed,
-                                                               args[i], i, &mask);
+                       if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
+                               arg.val = args[arg.idx];
+                               if (sc->arg_parm)
+                                       arg.parm = sc->arg_parm[arg.idx];
+                               printed += sc->arg_scnprintf[arg.idx](bf + printed,
+                                                                     size - printed, &arg);
                        } else {
                                printed += scnprintf(bf + printed, size - printed,
-                                                    "%ld", args[i]);
+                                                    "%ld", args[arg.idx]);
                        }
                }
        } else {
+               int i = 0;
+
                while (i < 6) {
                        printed += scnprintf(bf + printed, size - printed,
                                             "%sarg%d: %ld",
@@ -644,10 +1317,8 @@ typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  struct perf_sample *sample);
 
 static struct syscall *trace__syscall_info(struct trace *trace,
-                                          struct perf_evsel *evsel,
-                                          struct perf_sample *sample)
+                                          struct perf_evsel *evsel, int id)
 {
-       int id = perf_evsel__intval(evsel, sample, "id");
 
        if (id < 0) {
 
@@ -688,6 +1359,32 @@ out_cant_read:
        return NULL;
 }
 
+static void thread__update_stats(struct thread_trace *ttrace,
+                                int id, struct perf_sample *sample)
+{
+       struct int_node *inode;
+       struct stats *stats;
+       u64 duration = 0;
+
+       inode = intlist__findnew(ttrace->syscall_stats, id);
+       if (inode == NULL)
+               return;
+
+       stats = inode->priv;
+       if (stats == NULL) {
+               stats = malloc(sizeof(struct stats));
+               if (stats == NULL)
+                       return;
+               init_stats(stats);
+               inode->priv = stats;
+       }
+
+       if (ttrace->entry_time && sample->time > ttrace->entry_time)
+               duration = sample->time - ttrace->entry_time;
+
+       update_stats(stats, duration);
+}
+
 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
                            struct perf_sample *sample)
 {
@@ -695,7 +1392,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
        void *args;
        size_t printed = 0;
        struct thread *thread;
-       struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+       int id = perf_evsel__intval(evsel, sample, "id");
+       struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;
 
        if (sc == NULL)
@@ -704,8 +1402,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
        if (sc->filtered)
                return 0;
 
-       thread = machine__findnew_thread(&trace->host, sample->pid,
-                                        sample->tid);
+       thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                return -1;
@@ -728,7 +1425,8 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
        msg = ttrace->entry_str;
        printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
 
-       printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,  args);
+       printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
+                                          args, trace, thread);
 
        if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
                if (!trace->duration_filter) {
@@ -747,7 +1445,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
        int ret;
        u64 duration = 0;
        struct thread *thread;
-       struct syscall *sc = trace__syscall_info(trace, evsel, sample);
+       int id = perf_evsel__intval(evsel, sample, "id");
+       struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;
 
        if (sc == NULL)
@@ -756,14 +1455,22 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
        if (sc->filtered)
                return 0;
 
-       thread = machine__findnew_thread(&trace->host, sample->pid,
-                                        sample->tid);
+       thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                return -1;
 
+       if (trace->summary)
+               thread__update_stats(ttrace, id, sample);
+
        ret = perf_evsel__intval(evsel, sample, "ret");
 
+       if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
+               trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
+               trace->last_vfs_getname = NULL;
+               ++trace->stats.vfs_getname;
+       }
+
        ttrace = thread->priv;
 
        ttrace->exit_time = sample->time;
@@ -808,12 +1515,19 @@ out:
        return 0;
 }
 
+static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
+                             struct perf_sample *sample)
+{
+       trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
+       return 0;
+}
+
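
Note the division of labour here: trace__vfs_getname() only stashes the pathname pointer, and the hunk in trace__sys_exit() above consumes it, pairing the name with the fd that a successful "open" returns. All of this depends on a dynamic probe:vfs_getname event existing on the running kernel (perf_evlist__add_vfs_getname() quietly skips it otherwise); one plausible way to create it is something like `perf probe 'vfs_getname=getname_flags:72 pathname=result->name:string'`, but the probe point and line number are kernel-version specific, so treat that incantation as illustrative only.
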
 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
                                     struct perf_sample *sample)
 {
         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
-       struct thread *thread = machine__findnew_thread(&trace->host,
+       struct thread *thread = machine__findnew_thread(trace->host,
                                                        sample->pid,
                                                        sample->tid);
        struct thread_trace *ttrace = thread__trace(thread, trace->output);
@@ -861,7 +1575,7 @@ static int trace__process_sample(struct perf_tool *tool,
        if (skip_sample(trace, sample))
                return 0;
 
-       if (trace->base_time == 0)
+       if (!trace->full_time && trace->base_time == 0)
                trace->base_time = sample->time;
 
        if (handler)
@@ -901,6 +1615,51 @@ static int parse_target_str(struct trace *trace)
        return 0;
 }
 
+static int trace__record(int argc, const char **argv)
+{
+       unsigned int rec_argc, i, j;
+       const char **rec_argv;
+       const char * const record_args[] = {
+               "record",
+               "-R",
+               "-m", "1024",
+               "-c", "1",
+               "-e", "raw_syscalls:sys_enter,raw_syscalls:sys_exit",
+       };
+
+       rec_argc = ARRAY_SIZE(record_args) + argc;
+       rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+       if (rec_argv == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ARRAY_SIZE(record_args); i++)
+               rec_argv[i] = record_args[i];
+
+       for (j = 0; j < (unsigned int)argc; j++, i++)
+               rec_argv[i] = argv[j];
+
+       return cmd_record(i, rec_argv, NULL);
+}
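
A note on the dispatch: trace__record() just prepends a fixed argv and tail-calls cmd_record(), so under this patch `perf trace record -- sleep 1` expands to roughly:

    perf record -R -m 1024 -c 1 -e raw_syscalls:sys_enter,raw_syscalls:sys_exit -- sleep 1

with the user's arguments appended verbatim after the fixed ones.
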
+
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
+
+static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname",
+                                                    evlist->nr_entries);
+       if (evsel == NULL)
+               return;
+
+       if (perf_evsel__field(evsel, "pathname") == NULL) {
+               perf_evsel__delete(evsel);
+               return;
+       }
+
+       evsel->handler.func = trace__vfs_getname;
+       perf_evlist__add(evlist, evsel);
+}
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
        struct perf_evlist *evlist = perf_evlist__new();
@@ -909,23 +1668,23 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
        unsigned long before;
        const bool forks = argc > 0;
 
+       trace->live = true;
+
        if (evlist == NULL) {
                fprintf(trace->output, "Not enough memory to run!\n");
                goto out;
        }
 
        if (perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_enter", trace__sys_enter) ||
-           perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_exit", trace__sys_exit)) {
-               fprintf(trace->output, "Couldn't read the raw_syscalls tracepoints information!\n");
-               goto out_delete_evlist;
-       }
+               perf_evlist__add_newtp(evlist, "raw_syscalls", "sys_exit", trace__sys_exit))
+               goto out_error_tp;
+
+       perf_evlist__add_vfs_getname(evlist);
 
        if (trace->sched &&
-           perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
-                                  trace__sched_stat_runtime)) {
-               fprintf(trace->output, "Couldn't read the sched_stat_runtime tracepoint information!\n");
-               goto out_delete_evlist;
-       }
+               perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
+                               trace__sched_stat_runtime))
+               goto out_error_tp;
 
        err = perf_evlist__create_maps(evlist, &trace->opts.target);
        if (err < 0) {
@@ -954,10 +1713,8 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
        }
 
        err = perf_evlist__open(evlist);
-       if (err < 0) {
-               fprintf(trace->output, "Couldn't create the events: %s\n", strerror(errno));
-               goto out_delete_maps;
-       }
+       if (err < 0)
+               goto out_error_open;
 
        err = perf_evlist__mmap(evlist, UINT_MAX, false);
        if (err < 0) {
@@ -990,11 +1747,11 @@ again:
                                continue;
                        }
 
-                       if (trace->base_time == 0)
+                       if (!trace->full_time && trace->base_time == 0)
                                trace->base_time = sample.time;
 
                        if (type != PERF_RECORD_SAMPLE) {
-                               trace__process_event(trace, &trace->host, event);
+                               trace__process_event(trace, trace->host, event);
                                continue;
                        }
 
@@ -1014,24 +1771,36 @@ again:
                        handler = evsel->handler.func;
                        handler(trace, evsel, &sample);
 
-                       if (done)
-                               goto out_unmap_evlist;
+                       if (interrupted)
+                               goto out_disable;
                }
        }
 
        if (trace->nr_events == before) {
-               if (done)
-                       goto out_unmap_evlist;
+               int timeout = done ? 100 : -1;
 
-               poll(evlist->pollfd, evlist->nr_fds, -1);
+               if (poll(evlist->pollfd, evlist->nr_fds, timeout) > 0)
+                       goto again;
+       } else {
+               goto again;
        }
 
-       if (done)
-               perf_evlist__disable(evlist);
+out_disable:
+       perf_evlist__disable(evlist);
 
-       goto again;
+       if (!err) {
+               if (trace->summary)
+                       trace__fprintf_thread_summary(trace, trace->output);
+
+               if (trace->show_tool_stats) {
+                       fprintf(trace->output, "Stats:\n "
+                                              " vfs_getname : %" PRIu64 "\n"
+                                              " proc_getname: %" PRIu64 "\n",
+                               trace->stats.vfs_getname,
+                               trace->stats.proc_getname);
+               }
+       }
 
-out_unmap_evlist:
        perf_evlist__munmap(evlist);
 out_close_evlist:
        perf_evlist__close(evlist);
@@ -1040,7 +1809,22 @@ out_delete_maps:
 out_delete_evlist:
        perf_evlist__delete(evlist);
 out:
+       trace->live = false;
        return err;
+{
+       char errbuf[BUFSIZ];
+
+out_error_tp:
+       perf_evlist__strerror_tp(evlist, errno, errbuf, sizeof(errbuf));
+       goto out_error;
+
+out_error_open:
+       perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
+
+out_error:
+       fprintf(trace->output, "%s\n", errbuf);
+       goto out_delete_evlist;
+}
 }
 
 static int trace__replay(struct trace *trace)
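
One construct worth calling out in trace__run() above: the braced block after `return err;` is dead code except through its labels, which gives the out_error_tp/out_error_open paths a shared errbuf scoped to that block. Reduced to a standalone example (legal C, if unusual; no variable-length arrays are skipped by the jump):

    #include <stdio.h>
    #include <string.h>

    static int f(int fail)
    {
            if (fail)
                    goto out_error;
            return 0;
    {
            char msg[32];
    out_error:
            strcpy(msg, "something failed");
            fprintf(stderr, "%s\n", msg);
            return -1;
    }
    }

    int main(void)
    {
            return f(1) ? 1 : 0;
    }
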
@@ -1048,8 +1832,12 @@ static int trace__replay(struct trace *trace)
        const struct perf_evsel_str_handler handlers[] = {
                { "raw_syscalls:sys_enter",  trace__sys_enter, },
                { "raw_syscalls:sys_exit",   trace__sys_exit, },
+               { "probe:vfs_getname",       trace__vfs_getname, },
+       };
+       struct perf_data_file file = {
+               .path  = input_name,
+               .mode  = PERF_DATA_MODE_READ,
        };
-
        struct perf_session *session;
        int err = -1;
 
@@ -1072,11 +1860,12 @@ static int trace__replay(struct trace *trace)
        if (symbol__init() < 0)
                return -1;
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false,
-                                   &trace->tool);
+       session = perf_session__new(&file, false, &trace->tool);
        if (session == NULL)
                return -ENOMEM;
 
+       trace->host = &session->machines.host;
+
        err = perf_session__set_tracepoints_handlers(session, handlers);
        if (err)
                goto out;
@@ -1101,6 +1890,9 @@ static int trace__replay(struct trace *trace)
        if (err)
                pr_err("Failed to process events, error %d", err);
 
+       else if (trace->summary)
+               trace__fprintf_thread_summary(trace, trace->output);
+
 out:
        perf_session__delete(session);
 
@@ -1111,47 +1903,111 @@ static size_t trace__fprintf_threads_header(FILE *fp)
 {
        size_t printed;
 
-       printed  = fprintf(fp, "\n _____________________________________________________________________\n");
-       printed += fprintf(fp," __)    Summary of events    (__\n\n");
-       printed += fprintf(fp,"              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
-       printed += fprintf(fp," _____________________________________________________________________\n\n");
+       printed  = fprintf(fp, "\n _____________________________________________________________________________\n");
+       printed += fprintf(fp, " __)    Summary of events    (__\n\n");
+       printed += fprintf(fp, "              [ task - pid ]     [ events ] [ ratio ]  [ runtime ]\n");
+       printed += fprintf(fp, "                                  syscall  count    min     max    avg  stddev\n");
+       printed += fprintf(fp, "                                                   msec    msec   msec     %%\n");
+       printed += fprintf(fp, " _____________________________________________________________________________\n\n");
 
        return printed;
 }
 
-static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
+static size_t thread__dump_stats(struct thread_trace *ttrace,
+                                struct trace *trace, FILE *fp)
 {
-       size_t printed = trace__fprintf_threads_header(fp);
-       struct rb_node *nd;
-
-       for (nd = rb_first(&trace->host.threads); nd; nd = rb_next(nd)) {
-               struct thread *thread = rb_entry(nd, struct thread, rb_node);
-               struct thread_trace *ttrace = thread->priv;
-               const char *color;
-               double ratio;
-
-               if (ttrace == NULL)
-                       continue;
-
-               ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
-
-               color = PERF_COLOR_NORMAL;
-               if (ratio > 50.0)
-                       color = PERF_COLOR_RED;
-               else if (ratio > 25.0)
-                       color = PERF_COLOR_GREEN;
-               else if (ratio > 5.0)
-                       color = PERF_COLOR_YELLOW;
-
-               printed += color_fprintf(fp, color, "%20s", thread->comm);
-               printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
-               printed += color_fprintf(fp, color, "%5.1f%%", ratio);
-               printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
+       struct stats *stats;
+       size_t printed = 0;
+       struct syscall *sc;
+       struct int_node *inode = intlist__first(ttrace->syscall_stats);
+
+       if (inode == NULL)
+               return 0;
+
+       printed += fprintf(fp, "\n");
+
+       /* each int_node is a syscall */
+       while (inode) {
+               stats = inode->priv;
+               if (stats) {
+                       double min = (double)(stats->min) / NSEC_PER_MSEC;
+                       double max = (double)(stats->max) / NSEC_PER_MSEC;
+                       double avg = avg_stats(stats);
+                       double pct;
+                       u64 n = (u64) stats->n;
+
+                       pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
+                       avg /= NSEC_PER_MSEC;
+
+                       sc = &trace->syscalls.table[inode->i];
+                       printed += fprintf(fp, "%24s  %14s : ", "", sc->name);
+                       printed += fprintf(fp, "%5" PRIu64 "  %8.3f  %8.3f",
+                                          n, min, max);
+                       printed += fprintf(fp, "  %8.3f  %6.2f\n", avg, pct);
+               }
+
+               inode = intlist__next(inode);
        }
 
+       printed += fprintf(fp, "\n\n");
+
        return printed;
 }
 
+/* struct used to pass data to per-thread function */
+struct summary_data {
+       FILE *fp;
+       struct trace *trace;
+       size_t printed;
+};
+
+static int trace__fprintf_one_thread(struct thread *thread, void *priv)
+{
+       struct summary_data *data = priv;
+       FILE *fp = data->fp;
+       size_t printed = data->printed;
+       struct trace *trace = data->trace;
+       struct thread_trace *ttrace = thread->priv;
+       const char *color;
+       double ratio;
+
+       if (ttrace == NULL)
+               return 0;
+
+       ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
+
+       color = PERF_COLOR_NORMAL;
+       if (ratio > 50.0)
+               color = PERF_COLOR_RED;
+       else if (ratio > 25.0)
+               color = PERF_COLOR_GREEN;
+       else if (ratio > 5.0)
+               color = PERF_COLOR_YELLOW;
+
+       printed += color_fprintf(fp, color, "%20s", thread->comm);
+       printed += fprintf(fp, " - %-5d :%11lu   [", thread->tid, ttrace->nr_events);
+       printed += color_fprintf(fp, color, "%5.1f%%", ratio);
+       printed += fprintf(fp, " ] %10.3f ms\n", ttrace->runtime_ms);
+       printed += thread__dump_stats(ttrace, trace, fp);
+
+       data->printed += printed;
+
+       return 0;
+}
+
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
+{
+       struct summary_data data = {
+               .fp = fp,
+               .trace = trace
+       };
+       data.printed = trace__fprintf_threads_header(fp);
+
+       machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
+
+       return data.printed;
+}
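
The summary printer now iterates threads through machine__for_each_thread() instead of walking trace->host's rbtree by hand, threading its state through the struct summary_data closure via the void *priv argument. The pattern in miniature (illustrative, not perf code):

    #include <stdio.h>

    struct closure { int sum; };

    static int visit(int value, void *priv)
    {
            struct closure *c = priv;

            c->sum += value;
            return 0; /* non-zero would stop the walk */
    }

    static void for_each_item(const int *v, int n,
                              int (*fn)(int, void *), void *priv)
    {
            for (int i = 0; i < n; i++)
                    if (fn(v[i], priv))
                            break;
    }

    int main(void)
    {
            int vals[] = { 1, 2, 3 };
            struct closure c = { 0 };

            for_each_item(vals, 3, visit, &c);
            printf("%d\n", c.sum); /* 6 */
            return 0;
    }
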
+
 static int trace__set_duration(const struct option *opt, const char *str,
                               int unset __maybe_unused)
 {
@@ -1183,10 +2039,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
        const char * const trace_usage[] = {
                "perf trace [<options>] [<command>]",
                "perf trace [<options>] -- <command> [<options>]",
+               "perf trace record [<options>] [<command>]",
+               "perf trace record [<options>] -- <command> [<options>]",
                NULL
        };
        struct trace trace = {
-               .audit_machine = audit_detect_machine(),
+               .audit = {
+                       .machine = audit_detect_machine(),
+                       .open_id = audit_name_to_syscall("open", trace.audit.machine),
+               },
                .syscalls = {
                        . max = -1,
                },
@@ -1201,10 +2062,14 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                        .mmap_pages    = 1024,
                },
                .output = stdout,
+               .show_comm = true,
        };
        const char *output_name = NULL;
        const char *ev_qualifier_str = NULL;
        const struct option trace_options[] = {
+       OPT_BOOLEAN(0, "comm", &trace.show_comm,
+                   "show the thread COMM next to its id"),
+       OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
        OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
                    "list of events to trace"),
        OPT_STRING('o', "output", &output_name, "file", "output file name"),
@@ -1219,8 +2084,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                    "list of cpus to monitor"),
        OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
                    "child tasks do not inherit counters"),
-       OPT_UINTEGER('m', "mmap-pages", &trace.opts.mmap_pages,
-                    "number of mmap data pages"),
+       OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
+                    "number of mmap data pages",
+                    perf_evlist__parse_mmap_pages),
        OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
                   "user to profile"),
        OPT_CALLBACK(0, "duration", &trace, "float",
@@ -1228,11 +2094,18 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
                     trace__set_duration),
        OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
        OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+       OPT_BOOLEAN('T', "time", &trace.full_time,
+                   "Show full timestamp, not time relative to first start"),
+       OPT_BOOLEAN(0, "summary", &trace.summary,
+                   "Show syscall summary with statistics"),
        OPT_END()
        };
        int err;
        char bf[BUFSIZ];
 
+       if ((argc > 1) && (strcmp(argv[1], "record") == 0))
+               return trace__record(argc-2, &argv[2]);
+
        argc = parse_options(argc, argv, trace_options, trace_usage, 0);
 
        if (output_name != NULL) {
@@ -1280,9 +2153,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
        else
                err = trace__run(&trace, argc, argv);
 
-       if (trace.sched && !err)
-               trace__fprintf_thread_summary(&trace, trace.output);
-
 out_close:
        if (output_name != NULL)
                fclose(trace.output);
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 75b93d7f786010000368e299d214717131c67fa1..e4d06b2080d77d32925b5806176ff6c735d7bb1f 100644 (file)
@@ -23,7 +23,7 @@ ifeq ($(ARCH),x86_64)
   endif
   ifeq (${IS_X86_64}, 1)
     RAW_ARCH := x86_64
-    CFLAGS += -DARCH_X86_64
+    CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT
     ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
   endif
   NO_PERF_REGS := 0
@@ -35,7 +35,7 @@ ifeq ($(ARCH),arm)
 endif
 
 ifeq ($(NO_PERF_REGS),0)
-  CFLAGS += -DHAVE_PERF_REGS
+  CFLAGS += -DHAVE_PERF_REGS_SUPPORT
 endif
 
 ifeq ($(src-perf),)
@@ -55,7 +55,6 @@ LIB_INCLUDE := $(srctree)/tools/lib/
 # include ARCH specific config
 -include $(src-perf)/arch/$(ARCH)/Makefile
 
-include $(src-perf)/config/feature-tests.mak
 include $(src-perf)/config/utilities.mak
 
 ifeq ($(call get-executable,$(FLEX)),)
@@ -71,10 +70,7 @@ ifneq ($(WERROR),0)
   CFLAGS += -Werror
 endif
 
-ifeq ("$(origin DEBUG)", "command line")
-  PERF_DEBUG = $(DEBUG)
-endif
-ifndef PERF_DEBUG
+ifeq ($(DEBUG),0)
   CFLAGS += -O6
 endif
 
@@ -93,20 +89,126 @@ CFLAGS += -std=gnu99
 
 EXTLIBS = -lelf -lpthread -lrt -lm -ldl
 
-ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -fstack-protector-all,-fstack-protector-all),y)
-  CFLAGS += -fstack-protector-all
+ifneq ($(OUTPUT),)
+  OUTPUT_FEATURES = $(OUTPUT)config/feature-checks/
+  $(shell mkdir -p $(OUTPUT_FEATURES))
 endif
 
-ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -Wstack-protector,-Wstack-protector),y)
-  CFLAGS += -Wstack-protector
+feature_check = $(eval $(feature_check_code))
+define feature_check_code
+  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) LDFLAGS=$(LDFLAGS) -C config/feature-checks test-$1 >/dev/null 2>/dev/null && echo 1 || echo 0)
+endef
+
+feature_set = $(eval $(feature_set_code))
+define feature_set_code
+  feature-$(1) := 1
+endef
+
+#
+# Build the feature check binaries in parallel, ignore errors, ignore return value and suppress output:
+#
+
+#
+# Note that this is not a complete list of all feature tests, just
+# those that are typically built on a fully configured system.
+#
+# [ Feature tests not mentioned here have to be built explicitly in
+#   the rule that uses them - an example for that is the 'bionic'
+#   feature check. ]
+#
+CORE_FEATURE_TESTS =                   \
+       backtrace                       \
+       dwarf                           \
+       fortify-source                  \
+       glibc                           \
+       gtk2                            \
+       gtk2-infobar                    \
+       libaudit                        \
+       libbfd                          \
+       libelf                          \
+       libelf-getphdrnum               \
+       libelf-mmap                     \
+       libnuma                         \
+       libperl                         \
+       libpython                       \
+       libpython-version               \
+       libslang                        \
+       libunwind                       \
+       libunwind-debug-frame           \
+       on-exit                         \
+       stackprotector                  \
+       stackprotector-all
+
+#
+# So here we detect whether test-all was rebuilt, to be able
+# to skip the print-out of the long features list if the file
+# existed before and after it was built:
+#
+ifeq ($(wildcard $(OUTPUT)config/feature-checks/test-all),)
+  test-all-failed := 1
+else
+  test-all-failed := 0
+endif
+
+#
+# Special fast-path for the 'all features are available' case:
+#
+$(call feature_check,all,$(MSG))
+
+#
+# Just in case the build freshly failed, make sure we print the
+# feature matrix:
+#
+ifeq ($(feature-all), 0)
+  test-all-failed := 1
+endif
+
+ifeq ($(test-all-failed),1)
+  $(info )
+  $(info Auto-detecting system features:)
+endif
+
+ifeq ($(feature-all), 1)
+  #
+  # test-all.c passed - just set all the core feature flags to 1:
+  #
+  $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_set,$(feat)))
+else
+  $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) LDFLAGS=$(LDFLAGS) -i -j -C config/feature-checks $(CORE_FEATURE_TESTS) >/dev/null 2>&1)
+  $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_check,$(feat)))
+endif
+
+#
+# Print the result of the feature test:
+#
+feature_print = $(eval $(feature_print_code)) $(info $(MSG))
+
+define feature_print_code
+  ifeq ($(feature-$(1)), 1)
+    MSG = $(shell printf '...%30s: [ \033[32mon\033[m  ]' $(1))
+  else
+    MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1))
+  endif
+endef
+
+#
+# Only print out our features if we rebuilt the testcases or if a test failed:
+#
+ifeq ($(test-all-failed), 1)
+  $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_print,$(feat)))
+  $(info )
 endif
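
For reference, the feature_print format string above right-aligns each feature name into a fixed-width matrix, so on a box where test-all failed the output looks along these lines (illustrative values):

    Auto-detecting system features:
    ...                      backtrace: [ on  ]
    ...                          dwarf: [ on  ]
    ...                       libaudit: [ OFF ]
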
 
-ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -Wvolatile-register-var,-Wvolatile-register-var),y)
-  CFLAGS += -Wvolatile-register-var
+ifeq ($(feature-stackprotector-all), 1)
+  CFLAGS += -fstack-protector-all
+endif
+
+ifeq ($(feature-stackprotector), 1)
+  CFLAGS += -Wstack-protector
 endif
 
-ifndef PERF_DEBUG
-  ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -D_FORTIFY_SOURCE=2,-D_FORTIFY_SOURCE=2),y)
+ifeq ($(DEBUG),0)
+  ifeq ($(feature-fortify-source), 1)
     CFLAGS += -D_FORTIFY_SOURCE=2
   endif
 endif
@@ -132,84 +234,74 @@ CFLAGS += -I$(LIB_INCLUDE)
 CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 
 ifndef NO_BIONIC
-ifeq ($(call try-cc,$(SOURCE_BIONIC),$(CFLAGS),bionic),y)
-  BIONIC := 1
-  EXTLIBS := $(filter-out -lrt,$(EXTLIBS))
-  EXTLIBS := $(filter-out -lpthread,$(EXTLIBS))
+  $(feature_check,bionic)
+  ifeq ($(feature-bionic), 1)
+    BIONIC := 1
+    EXTLIBS := $(filter-out -lrt,$(EXTLIBS))
+    EXTLIBS := $(filter-out -lpthread,$(EXTLIBS))
+  endif
 endif
-endif # NO_BIONIC
 
 ifdef NO_LIBELF
   NO_DWARF := 1
   NO_DEMANGLE := 1
   NO_LIBUNWIND := 1
 else
-FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
-ifneq ($(call try-cc,$(SOURCE_LIBELF),$(FLAGS_LIBELF),libelf),y)
-  FLAGS_GLIBC=$(CFLAGS) $(LDFLAGS)
-  ifeq ($(call try-cc,$(SOURCE_GLIBC),$(FLAGS_GLIBC),glibc),y)
-    LIBC_SUPPORT := 1
-  endif
-  ifeq ($(BIONIC),1)
-    LIBC_SUPPORT := 1
-  endif
-  ifeq ($(LIBC_SUPPORT),1)
-    msg := $(warning No libelf found, disables 'probe' tool, please install elfutils-libelf-devel/libelf-dev);
+  ifeq ($(feature-libelf), 0)
+    ifeq ($(feature-glibc), 1)
+      LIBC_SUPPORT := 1
+    endif
+    ifeq ($(BIONIC),1)
+      LIBC_SUPPORT := 1
+    endif
+    ifeq ($(LIBC_SUPPORT),1)
+      msg := $(warning No libelf found, disables 'probe' tool, please install elfutils-libelf-devel/libelf-dev);
 
-    NO_LIBELF := 1
-    NO_DWARF := 1
-    NO_DEMANGLE := 1
+      NO_LIBELF := 1
+      NO_DWARF := 1
+      NO_DEMANGLE := 1
+    else
+      msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
+    endif
   else
-    msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
-  endif
-else
-  # for linking with debug library, run like:
-  # make DEBUG=1 LIBDW_DIR=/opt/libdw/
-  ifdef LIBDW_DIR
-    LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
-    LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
-  endif
+    # for linking with debug library, run like:
+    # make DEBUG=1 LIBDW_DIR=/opt/libdw/
+    ifdef LIBDW_DIR
+      LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
+      LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
+    endif
 
-  FLAGS_DWARF=$(CFLAGS) $(LIBDW_CFLAGS) -ldw -lz -lelf $(LIBDW_LDFLAGS) $(LDFLAGS) $(EXTLIBS)
-  ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF),libdw),y)
-    msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
-    NO_DWARF := 1
-  endif # Dwarf support
-endif # SOURCE_LIBELF
+    ifneq ($(feature-dwarf), 1)
+      msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
+      NO_DWARF := 1
+    endif # Dwarf support
+  endif # libelf support
 endif # NO_LIBELF
 
 ifndef NO_LIBELF
-CFLAGS += -DLIBELF_SUPPORT
-FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
-ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
-  CFLAGS += -DLIBELF_MMAP
-endif
-ifeq ($(call try-cc,$(SOURCE_ELF_GETPHDRNUM),$(FLAGS_LIBELF),-DHAVE_ELF_GETPHDRNUM),y)
-  CFLAGS += -DHAVE_ELF_GETPHDRNUM
-endif
+  CFLAGS += -DHAVE_LIBELF_SUPPORT
 
-# include ARCH specific config
--include $(src-perf)/arch/$(ARCH)/Makefile
+  ifeq ($(feature-libelf-mmap), 1)
+    CFLAGS += -DHAVE_LIBELF_MMAP_SUPPORT
+  endif
 
-ifndef NO_DWARF
-ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
-  msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
-  NO_DWARF := 1
-else
-  CFLAGS += -DDWARF_SUPPORT $(LIBDW_CFLAGS)
-  LDFLAGS += $(LIBDW_LDFLAGS)
-  EXTLIBS += -lelf -ldw
-endif # PERF_HAVE_DWARF_REGS
-endif # NO_DWARF
+  ifeq ($(feature-libelf-getphdrnum), 1)
+    CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT
+  endif
 
-endif # NO_LIBELF
+  # include ARCH specific config
+  -include $(src-perf)/arch/$(ARCH)/Makefile
 
-ifndef NO_LIBELF
-CFLAGS += -DLIBELF_SUPPORT
-FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
-ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
-  CFLAGS += -DLIBELF_MMAP
-endif # try-cc
+  ifndef NO_DWARF
+    ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
+      msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
+      NO_DWARF := 1
+    else
+      CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
+      LDFLAGS += $(LIBDW_LDFLAGS)
+      EXTLIBS += -lelf -ldw
+    endif # PERF_HAVE_DWARF_REGS
+  endif # NO_DWARF
 endif # NO_LIBELF
 
 ifeq ($(LIBUNWIND_LIBS),)
@@ -217,38 +309,40 @@ ifeq ($(LIBUNWIND_LIBS),)
 endif
 
 ifndef NO_LIBUNWIND
-# for linking with debug library, run like:
-# make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
-ifdef LIBUNWIND_DIR
-  LIBUNWIND_CFLAGS  := -I$(LIBUNWIND_DIR)/include
-  LIBUNWIND_LDFLAGS := -L$(LIBUNWIND_DIR)/lib
-endif
+  #
+  # For linking with debug library, run like:
+  #
+  #   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+  #
+  ifdef LIBUNWIND_DIR
+    LIBUNWIND_CFLAGS  := -I$(LIBUNWIND_DIR)/include
+    LIBUNWIND_LDFLAGS := -L$(LIBUNWIND_DIR)/lib
+  endif
 
-FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS)
-ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y)
-  msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
-  NO_LIBUNWIND := 1
-endif # Libunwind support
-ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y)
-  msg := $(warning No debug_frame support found in libunwind);
-CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
-endif # debug_frame support in libunwind
-endif # NO_LIBUNWIND
+  ifneq ($(feature-libunwind), 1)
+    msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1);
+    NO_LIBUNWIND := 1
+  else
+    ifneq ($(feature-libunwind-debug-frame), 1)
+      msg := $(warning No debug_frame support found in libunwind);
+      CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+    endif
+  endif
+endif
 
 ifndef NO_LIBUNWIND
-  CFLAGS += -DLIBUNWIND_SUPPORT
+  CFLAGS += -DHAVE_LIBUNWIND_SUPPORT
   EXTLIBS += $(LIBUNWIND_LIBS)
   CFLAGS += $(LIBUNWIND_CFLAGS)
   LDFLAGS += $(LIBUNWIND_LDFLAGS)
-endif # NO_LIBUNWIND
+endif
 
 ifndef NO_LIBAUDIT
-  FLAGS_LIBAUDIT = $(CFLAGS) $(LDFLAGS) -laudit
-  ifneq ($(call try-cc,$(SOURCE_LIBAUDIT),$(FLAGS_LIBAUDIT),libaudit),y)
+  ifneq ($(feature-libaudit), 1)
     msg := $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev);
     NO_LIBAUDIT := 1
   else
-    CFLAGS += -DLIBAUDIT_SUPPORT
+    CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
     EXTLIBS += -laudit
   endif
 endif
@@ -258,30 +352,30 @@ ifdef NO_NEWT
 endif
 
 ifndef NO_SLANG
-  FLAGS_SLANG=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) -I/usr/include/slang -lslang
-  ifneq ($(call try-cc,$(SOURCE_SLANG),$(FLAGS_SLANG),libslang),y)
+  ifneq ($(feature-libslang), 1)
     msg := $(warning slang not found, disables TUI support. Please install slang-devel or libslang-dev);
     NO_SLANG := 1
   else
     # Fedora has /usr/include/slang/slang.h, but ubuntu /usr/include/slang.h
     CFLAGS += -I/usr/include/slang
-    CFLAGS += -DSLANG_SUPPORT
+    CFLAGS += -DHAVE_SLANG_SUPPORT
     EXTLIBS += -lslang
   endif
 endif
 
 ifndef NO_GTK2
   FLAGS_GTK2=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) $(shell pkg-config --libs --cflags gtk+-2.0 2>/dev/null)
-  ifneq ($(call try-cc,$(SOURCE_GTK2),$(FLAGS_GTK2),gtk2),y)
+  ifneq ($(feature-gtk2), 1)
     msg := $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev);
     NO_GTK2 := 1
   else
-    ifeq ($(call try-cc,$(SOURCE_GTK2_INFOBAR),$(FLAGS_GTK2),-DHAVE_GTK_INFO_BAR),y)
-      CFLAGS += -DHAVE_GTK_INFO_BAR
+    ifeq ($(feature-gtk2-infobar), 1)
+      GTK_CFLAGS := -DHAVE_GTK_INFO_BAR_SUPPORT
     endif
-    CFLAGS += -DGTK2_SUPPORT
-    CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null)
-    EXTLIBS += $(shell pkg-config --libs gtk+-2.0 2>/dev/null)
+    CFLAGS += -DHAVE_GTK2_SUPPORT
+    GTK_CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null)
+    GTK_LIBS := $(shell pkg-config --libs gtk+-2.0 2>/dev/null)
+    EXTLIBS += -ldl
   endif
 endif
 
@@ -297,7 +391,7 @@ else
   PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
   FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
-  ifneq ($(call try-cc,$(SOURCE_PERL_EMBED),$(FLAGS_PERL_EMBED),perl),y)
+  ifneq ($(feature-libperl), 1)
     CFLAGS += -DNO_LIBPERL
     NO_LIBPERL := 1
   else
@@ -342,11 +436,11 @@ else
       PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
       FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 
-      ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED),python),y)
+      ifneq ($(feature-libpython), 1)
         $(call disable-python,Python.h (for Python 2.x))
       else
 
-        ifneq ($(call try-cc,$(SOURCE_PYTHON_VERSION),$(FLAGS_PYTHON_EMBED),python version),y)
+        ifneq ($(feature-libpython-version), 1)
           $(warning Python 3 is not yet supported; please set)
           $(warning PYTHON and/or PYTHON_CONFIG appropriately.)
           $(warning If you also have Python 2 installed, then)
@@ -369,33 +463,30 @@ else
   endif
 endif
 
+ifeq ($(feature-libbfd), 1)
+  EXTLIBS += -lbfd
+endif
+
 ifdef NO_DEMANGLE
   CFLAGS += -DNO_DEMANGLE
 else
-  ifdef HAVE_CPLUS_DEMANGLE
+  ifdef HAVE_CPLUS_DEMANGLE_SUPPORT
     EXTLIBS += -liberty
-    CFLAGS += -DHAVE_CPLUS_DEMANGLE
+    CFLAGS += -DHAVE_CPLUS_DEMANGLE_SUPPORT
   else
-    FLAGS_BFD=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) -DPACKAGE='perf' -lbfd
-    has_bfd := $(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD),libbfd)
-    ifeq ($(has_bfd),y)
-      EXTLIBS += -lbfd
-    else
-      FLAGS_BFD_IBERTY=$(FLAGS_BFD) -liberty
-      has_bfd_iberty := $(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD_IBERTY),liberty)
-      ifeq ($(has_bfd_iberty),y)
+    ifneq ($(feature-libbfd), 1)
+      $(feature_check,liberty)
+      ifeq ($(feature-liberty), 1)
         EXTLIBS += -lbfd -liberty
       else
-        FLAGS_BFD_IBERTY_Z=$(FLAGS_BFD_IBERTY) -lz
-        has_bfd_iberty_z := $(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD_IBERTY_Z),libz)
-        ifeq ($(has_bfd_iberty_z),y)
+        $(feature_check,liberty-z)
+        ifeq ($(feature-liberty-z), 1)
           EXTLIBS += -lbfd -liberty -lz
         else
-          FLAGS_CPLUS_DEMANGLE=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) -liberty
-          has_cplus_demangle := $(call try-cc,$(SOURCE_CPLUS_DEMANGLE),$(FLAGS_CPLUS_DEMANGLE),demangle)
-          ifeq ($(has_cplus_demangle),y)
+          $(feature_check,cplus-demangle)
+          ifeq ($(feature-cplus-demangle), 1)
             EXTLIBS += -liberty
-            CFLAGS += -DHAVE_CPLUS_DEMANGLE
+            CFLAGS += -DHAVE_CPLUS_DEMANGLE_SUPPORT
           else
             msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
             CFLAGS += -DNO_DEMANGLE
@@ -406,31 +497,28 @@ else
   endif
 endif
 
-ifndef NO_STRLCPY
-  ifeq ($(call try-cc,$(SOURCE_STRLCPY),,-DHAVE_STRLCPY),y)
-    CFLAGS += -DHAVE_STRLCPY
-  endif
+ifneq ($(filter -lbfd,$(EXTLIBS)),)
+  CFLAGS += -DHAVE_LIBBFD_SUPPORT
 endif
 
 ifndef NO_ON_EXIT
-  ifeq ($(call try-cc,$(SOURCE_ON_EXIT),,-DHAVE_ON_EXIT),y)
-    CFLAGS += -DHAVE_ON_EXIT
+  ifeq ($(feature-on-exit), 1)
+    CFLAGS += -DHAVE_ON_EXIT_SUPPORT
   endif
 endif
 
 ifndef NO_BACKTRACE
-  ifeq ($(call try-cc,$(SOURCE_BACKTRACE),,-DBACKTRACE_SUPPORT),y)
-    CFLAGS += -DBACKTRACE_SUPPORT
+  ifeq ($(feature-backtrace), 1)
+    CFLAGS += -DHAVE_BACKTRACE_SUPPORT
   endif
 endif
 
 ifndef NO_LIBNUMA
-  FLAGS_LIBNUMA = $(CFLAGS) $(LDFLAGS) -lnuma
-  ifneq ($(call try-cc,$(SOURCE_LIBNUMA),$(FLAGS_LIBNUMA),libnuma),y)
+  ifeq ($(feature-libnuma), 0)
     msg := $(warning No numa.h found, disables 'perf bench numa mem' benchmark, please install numa-libs-devel or libnuma-dev);
     NO_LIBNUMA := 1
   else
-    CFLAGS += -DLIBNUMA_SUPPORT
+    CFLAGS += -DHAVE_LIBNUMA_SUPPORT
     EXTLIBS += -lnuma
   endif
 endif
@@ -466,7 +554,12 @@ else
 sysconfdir = $(prefix)/etc
 ETC_PERFCONFIG = etc/perfconfig
 endif
+ifeq ($(IS_X86_64),1)
+lib = lib64
+else
 lib = lib
+endif
+libdir = $(prefix)/$(lib)
 
 # Shell quote (do not use $(call) to accommodate ancient setups);
 ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
@@ -479,6 +572,7 @@ template_dir_SQ = $(subst ','\'',$(template_dir))
 htmldir_SQ = $(subst ','\'',$(htmldir))
 prefix_SQ = $(subst ','\'',$(prefix))
 sysconfdir_SQ = $(subst ','\'',$(sysconfdir))
+libdir_SQ = $(subst ','\'',$(libdir))
 
 ifneq ($(filter /%,$(firstword $(perfexecdir))),)
 perfexec_instdir = $(perfexecdir)
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
new file mode 100644 (file)
index 0000000..abaf8f4
--- /dev/null
@@ -0,0 +1,148 @@
+
+FILES=                                 \
+       test-all                        \
+       test-backtrace                  \
+       test-bionic                     \
+       test-dwarf                      \
+       test-fortify-source             \
+       test-glibc                      \
+       test-gtk2                       \
+       test-gtk2-infobar               \
+       test-hello                      \
+       test-libaudit                   \
+       test-libbfd                     \
+       test-liberty                    \
+       test-liberty-z                  \
+       test-cplus-demangle             \
+       test-libelf                     \
+       test-libelf-getphdrnum          \
+       test-libelf-mmap                \
+       test-libnuma                    \
+       test-libperl                    \
+       test-libpython                  \
+       test-libpython-version          \
+       test-libslang                   \
+       test-libunwind                  \
+       test-libunwind-debug-frame      \
+       test-on-exit                    \
+       test-stackprotector-all         \
+       test-stackprotector
+
+CC := $(CC) -MD
+
+all: $(FILES)
+
+BUILD = $(CC) $(LDFLAGS) -o $(OUTPUT)$@ $@.c
+
+###############################
+
+test-all:
+       $(BUILD) -Werror -fstack-protector -fstack-protector-all -O2 -Werror -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lunwind -lunwind-x86_64 -lelf -laudit -I/usr/include/slang -lslang $(shell pkg-config --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl
+
+test-hello:
+       $(BUILD)
+
+test-stackprotector-all:
+       $(BUILD) -Werror -fstack-protector-all
+
+test-stackprotector:
+       $(BUILD) -Werror -fstack-protector -Wstack-protector
+
+test-fortify-source:
+       $(BUILD) -O2 -Werror -D_FORTIFY_SOURCE=2
+
+test-bionic:
+       $(BUILD)
+
+test-libelf:
+       $(BUILD) -lelf
+
+test-glibc:
+       $(BUILD)
+
+test-dwarf:
+       $(BUILD) -ldw
+
+test-libelf-mmap:
+       $(BUILD) -lelf
+
+test-libelf-getphdrnum:
+       $(BUILD) -lelf
+
+test-libnuma:
+       $(BUILD) -lnuma
+
+test-libunwind:
+       $(BUILD) -lunwind -lunwind-x86_64 -lelf
+
+test-libunwind-debug-frame:
+       $(BUILD) -lunwind -lunwind-x86_64 -lelf
+
+test-libaudit:
+       $(BUILD) -laudit
+
+test-libslang:
+       $(BUILD) -I/usr/include/slang -lslang
+
+test-gtk2:
+       $(BUILD) $(shell pkg-config --libs --cflags gtk+-2.0 2>/dev/null)
+
+test-gtk2-infobar:
+       $(BUILD) $(shell pkg-config --libs --cflags gtk+-2.0 2>/dev/null)
+
+grep-libs  = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
+PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
+PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
+
+test-libperl:
+       $(BUILD) $(FLAGS_PERL_EMBED)
+
+override PYTHON := python
+override PYTHON_CONFIG := python-config
+
+escape-for-shell-sq =  $(subst ','\'',$(1))
+shell-sq = '$(escape-for-shell-sq)'
+
+PYTHON_CONFIG_SQ = $(call shell-sq,$(PYTHON_CONFIG))
+
+PYTHON_EMBED_LDOPTS = $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
+PYTHON_EMBED_CCOPTS = $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+FLAGS_PYTHON_EMBED = $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
+
+test-libpython:
+       $(BUILD) $(FLAGS_PYTHON_EMBED)
+
+test-libpython-version:
+       $(BUILD) $(FLAGS_PYTHON_EMBED)
+
+test-libbfd:
+       $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
+
+test-liberty:
+       $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty
+
+test-liberty-z:
+       $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz
+
+test-cplus-demangle:
+       $(BUILD) -liberty
+
+test-on-exit:
+       $(BUILD)
+
+test-backtrace:
+       $(BUILD)
+
+-include *.d
+
+###############################
+
+clean:
+       rm -f $(FILES) *.d
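
Each of these probes boils down to one compile-and-link of the matching test-*.c; the libelf rule, for instance, is effectively `$(CC) $(LDFLAGS) -o test-libelf test-libelf.c -lelf`, and the parent Makefile's feature_check maps the exit status to feature-libelf being 1 or 0. The `CC := $(CC) -MD` / `-include *.d` pairing keeps the checks rebuilt when the headers they probe change.
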
diff --git a/tools/perf/config/feature-checks/test-all.c b/tools/perf/config/feature-checks/test-all.c
new file mode 100644 (file)
index 0000000..ed8aa7b
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * test-all.c: Try to build all the main testcases at once.
+ *
+ * A well-configured system will have all the prereqs installed, so we can speed
+ * up auto-detection on such systems.
+ */
+
+/*
+ * Quirk: Python and Perl headers cannot be in arbitrary places, so keep
+ * these 3 testcases at the top:
+ */
+#define main main_test_libpython
+# include "test-libpython.c"
+#undef main
+
+#define main main_test_libpython_version
+# include "test-libpython-version.c"
+#undef main
+
+#define main main_test_libperl
+# include "test-libperl.c"
+#undef main
+
+#define main main_test_hello
+# include "test-hello.c"
+#undef main
+
+#define main main_test_libelf
+# include "test-libelf.c"
+#undef main
+
+#define main main_test_libelf_mmap
+# include "test-libelf-mmap.c"
+#undef main
+
+#define main main_test_glibc
+# include "test-glibc.c"
+#undef main
+
+#define main main_test_dwarf
+# include "test-dwarf.c"
+#undef main
+
+#define main main_test_libelf_getphdrnum
+# include "test-libelf-getphdrnum.c"
+#undef main
+
+#define main main_test_libunwind
+# include "test-libunwind.c"
+#undef main
+
+#define main main_test_libunwind_debug_frame
+# include "test-libunwind-debug-frame.c"
+#undef main
+
+#define main main_test_libaudit
+# include "test-libaudit.c"
+#undef main
+
+#define main main_test_libslang
+# include "test-libslang.c"
+#undef main
+
+#define main main_test_gtk2
+# include "test-gtk2.c"
+#undef main
+
+#define main main_test_gtk2_infobar
+# include "test-gtk2-infobar.c"
+#undef main
+
+#define main main_test_libbfd
+# include "test-libbfd.c"
+#undef main
+
+#define main main_test_on_exit
+# include "test-on-exit.c"
+#undef main
+
+#define main main_test_backtrace
+# include "test-backtrace.c"
+#undef main
+
+#define main main_test_libnuma
+# include "test-libnuma.c"
+#undef main
+
+int main(int argc, char *argv[])
+{
+       main_test_libpython();
+       main_test_libpython_version();
+       main_test_libperl();
+       main_test_hello();
+       main_test_libelf();
+       main_test_libelf_mmap();
+       main_test_glibc();
+       main_test_dwarf();
+       main_test_libelf_getphdrnum();
+       main_test_libunwind();
+       main_test_libunwind_debug_frame();
+       main_test_libaudit();
+       main_test_libslang();
+       main_test_gtk2(argc, argv);
+       main_test_gtk2_infobar(argc, argv);
+       main_test_libbfd();
+       main_test_on_exit();
+       main_test_backtrace();
+       main_test_libnuma();
+
+       return 0;
+}
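
The `#define main main_test_*` / `#include` / `#undef` dance above lets every testcase remain an independently buildable program while also linking into this one aggregate binary. The trick in isolation (a self-contained stand-in, not one of the real testcases; in test-all.c the function body arrives via the #include between the #define and the #undef):

    #include <stdio.h>

    #define main main_test_hello
    int main(void) /* preprocesses to: int main_test_hello(void) */
    {
            return puts("hi") < 0;
    }
    #undef main

    int main(void)
    {
            return main_test_hello();
    }
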
diff --git a/tools/perf/config/feature-checks/test-backtrace.c b/tools/perf/config/feature-checks/test-backtrace.c
new file mode 100644 (file)
index 0000000..7124aa1
--- /dev/null
@@ -0,0 +1,13 @@
+#include <execinfo.h>
+#include <stdio.h>
+
+int main(void)
+{
+       void *backtrace_fns[10];
+       size_t entries;
+
+       entries = backtrace(backtrace_fns, 10);
+       backtrace_symbols_fd(backtrace_fns, entries, 1);
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-bionic.c b/tools/perf/config/feature-checks/test-bionic.c
new file mode 100644 (file)
index 0000000..eac24e9
--- /dev/null
@@ -0,0 +1,6 @@
+#include <android/api-level.h>
+
+int main(void)
+{
+       return __ANDROID_API__;
+}
diff --git a/tools/perf/config/feature-checks/test-cplus-demangle.c b/tools/perf/config/feature-checks/test-cplus-demangle.c
new file mode 100644 (file)
index 0000000..610c686
--- /dev/null
@@ -0,0 +1,14 @@
+extern int printf(const char *format, ...);
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+       char symbol[4096] = "FieldName__9ClassNameFd";
+       char *tmp;
+
+       tmp = cplus_demangle(symbol, 0);
+
+       printf("demangled symbol: {%s}\n", tmp);
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-dwarf.c b/tools/perf/config/feature-checks/test-dwarf.c
new file mode 100644 (file)
index 0000000..3fc1801
--- /dev/null
@@ -0,0 +1,10 @@
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+
+int main(void)
+{
+       Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+
+       return (long)dbg;
+}
diff --git a/tools/perf/config/feature-checks/test-fortify-source.c b/tools/perf/config/feature-checks/test-fortify-source.c
new file mode 100644 (file)
index 0000000..c9f398d
--- /dev/null
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+       return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-glibc.c b/tools/perf/config/feature-checks/test-glibc.c
new file mode 100644 (file)
index 0000000..b082034
--- /dev/null
@@ -0,0 +1,8 @@
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+       const char *version = gnu_get_libc_version();
+
+       return (long)version;
+}
diff --git a/tools/perf/config/feature-checks/test-gtk2-infobar.c b/tools/perf/config/feature-checks/test-gtk2-infobar.c
new file mode 100644 (file)
index 0000000..397b464
--- /dev/null
@@ -0,0 +1,11 @@
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#include <gtk/gtk.h>
+#pragma GCC diagnostic error "-Wstrict-prototypes"
+
+int main(int argc, char *argv[])
+{
+       gtk_init(&argc, &argv);
+       gtk_info_bar_new();
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-gtk2.c b/tools/perf/config/feature-checks/test-gtk2.c
new file mode 100644 (file)
index 0000000..6bd80e5
--- /dev/null
@@ -0,0 +1,10 @@
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#include <gtk/gtk.h>
+#pragma GCC diagnostic error "-Wstrict-prototypes"
+
+int main(int argc, char *argv[])
+{
+       gtk_init(&argc, &argv);
+
+        return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-hello.c b/tools/perf/config/feature-checks/test-hello.c
new file mode 100644 (file)
index 0000000..c9f398d
--- /dev/null
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+       return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-libaudit.c b/tools/perf/config/feature-checks/test-libaudit.c
new file mode 100644 (file)
index 0000000..afc019f
--- /dev/null
@@ -0,0 +1,10 @@
+#include <libaudit.h>
+
+extern int printf(const char *format, ...);
+
+int main(void)
+{
+       printf("error message: %s\n", audit_errno_to_name(0));
+
+       return audit_open();
+}
diff --git a/tools/perf/config/feature-checks/test-libbfd.c b/tools/perf/config/feature-checks/test-libbfd.c
new file mode 100644 (file)
index 0000000..2405990
--- /dev/null
@@ -0,0 +1,15 @@
+#include <bfd.h>
+
+extern int printf(const char *format, ...);
+
+int main(void)
+{
+       char symbol[4096] = "FieldName__9ClassNameFd";
+       char *tmp;
+
+       tmp = bfd_demangle(0, symbol, 0);
+
+       printf("demangled symbol: {%s}\n", tmp);
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libelf-getphdrnum.c b/tools/perf/config/feature-checks/test-libelf-getphdrnum.c
new file mode 100644 (file)
index 0000000..d710459
--- /dev/null
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+       size_t dst;
+
+       return elf_getphdrnum(0, &dst);
+}
diff --git a/tools/perf/config/feature-checks/test-libelf-mmap.c b/tools/perf/config/feature-checks/test-libelf-mmap.c
new file mode 100644 (file)
index 0000000..564427d
--- /dev/null
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+
+       return (long)elf;
+}
diff --git a/tools/perf/config/feature-checks/test-libelf.c b/tools/perf/config/feature-checks/test-libelf.c
new file mode 100644 (file)
index 0000000..08db322
--- /dev/null
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+       Elf *elf = elf_begin(0, ELF_C_READ, 0);
+
+       return (long)elf;
+}
diff --git a/tools/perf/config/feature-checks/test-libnuma.c b/tools/perf/config/feature-checks/test-libnuma.c
new file mode 100644 (file)
index 0000000..4763d9c
--- /dev/null
@@ -0,0 +1,9 @@
+#include <numa.h>
+#include <numaif.h>
+
+int main(void)
+{
+       numa_available();
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libperl.c b/tools/perf/config/feature-checks/test-libperl.c
new file mode 100644 (file)
index 0000000..8871f6a
--- /dev/null
@@ -0,0 +1,9 @@
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+       perl_alloc();
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libpython-version.c b/tools/perf/config/feature-checks/test-libpython-version.c
new file mode 100644 (file)
index 0000000..facea12
--- /dev/null
@@ -0,0 +1,10 @@
+#include <Python.h>
+
+#if PY_VERSION_HEX >= 0x03000000
+       #error
+#endif
+
+int main(void)
+{
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libpython.c b/tools/perf/config/feature-checks/test-libpython.c
new file mode 100644 (file)
index 0000000..b24b28a
--- /dev/null
@@ -0,0 +1,8 @@
+#include <Python.h>
+
+int main(void)
+{
+       Py_Initialize();
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libslang.c b/tools/perf/config/feature-checks/test-libslang.c
new file mode 100644 (file)
index 0000000..22ff22e
--- /dev/null
@@ -0,0 +1,6 @@
+#include <slang.h>
+
+int main(void)
+{
+       return SLsmg_init_smg();
+}
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
new file mode 100644 (file)
index 0000000..0ef8087
--- /dev/null
@@ -0,0 +1,16 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+                                unw_word_t ip, unw_word_t segbase,
+                                const char *obj_name, unw_word_t start,
+                                unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+       dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libunwind.c b/tools/perf/config/feature-checks/test-libunwind.c
new file mode 100644 (file)
index 0000000..43b9369
--- /dev/null
@@ -0,0 +1,27 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+                                      unw_word_t ip,
+                                      unw_dyn_info_t *di,
+                                      unw_proc_info_t *pi,
+                                      int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+
+int main(void)
+{
+       unw_addr_space_t addr_space;
+
+       addr_space = unw_create_addr_space(&accessors, 0);
+       if (addr_space)
+               return 0;
+
+       unw_init_remote(NULL, addr_space, NULL);
+       dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+       return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-on-exit.c b/tools/perf/config/feature-checks/test-on-exit.c
new file mode 100644 (file)
index 0000000..8e88b16
--- /dev/null
@@ -0,0 +1,16 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+static void exit_fn(int status, void *__data)
+{
+       printf("exit status: %d, data: %d\n", status, *(int *)__data);
+}
+
+static int data = 123;
+
+int main(void)
+{
+       on_exit(exit_fn, &data);
+
+       return 321;
+}
diff --git a/tools/perf/config/feature-checks/test-stackprotector-all.c b/tools/perf/config/feature-checks/test-stackprotector-all.c
new file mode 100644 (file)
index 0000000..c9f398d
--- /dev/null
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+       return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-stackprotector.c b/tools/perf/config/feature-checks/test-stackprotector.c
new file mode 100644 (file)
index 0000000..c9f398d
--- /dev/null
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+       return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-volatile-register-var.c b/tools/perf/config/feature-checks/test-volatile-register-var.c
new file mode 100644 (file)
index 0000000..c9f398d
--- /dev/null
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+       return puts("hi");
+}
diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak
deleted file mode 100644 (file)
index 028fe99..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-define SOURCE_HELLO
-#include <stdio.h>
-int main(void)
-{
-       return puts(\"hi\");
-}
-endef
-
-ifndef NO_DWARF
-define SOURCE_DWARF
-#include <dwarf.h>
-#include <elfutils/libdw.h>
-#include <elfutils/version.h>
-#ifndef _ELFUTILS_PREREQ
-#error
-#endif
-
-int main(void)
-{
-       Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
-       return (long)dbg;
-}
-endef
-endif
-
-define SOURCE_LIBELF
-#include <libelf.h>
-
-int main(void)
-{
-       Elf *elf = elf_begin(0, ELF_C_READ, 0);
-       return (long)elf;
-}
-endef
-
-define SOURCE_GLIBC
-#include <gnu/libc-version.h>
-
-int main(void)
-{
-       const char *version = gnu_get_libc_version();
-       return (long)version;
-}
-endef
-
-define SOURCE_BIONIC
-#include <android/api-level.h>
-
-int main(void)
-{
-       return __ANDROID_API__;
-}
-endef
-
-define SOURCE_ELF_MMAP
-#include <libelf.h>
-int main(void)
-{
-       Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
-       return (long)elf;
-}
-endef
-
-define SOURCE_ELF_GETPHDRNUM
-#include <libelf.h>
-int main(void)
-{
-       size_t dst;
-       return elf_getphdrnum(0, &dst);
-}
-endef
-
-ifndef NO_SLANG
-define SOURCE_SLANG
-#include <slang.h>
-
-int main(void)
-{
-       return SLsmg_init_smg();
-}
-endef
-endif
-
-ifndef NO_GTK2
-define SOURCE_GTK2
-#pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
-#include <gtk/gtk.h>
-#pragma GCC diagnostic error \"-Wstrict-prototypes\"
-
-int main(int argc, char *argv[])
-{
-        gtk_init(&argc, &argv);
-
-        return 0;
-}
-endef
-
-define SOURCE_GTK2_INFOBAR
-#pragma GCC diagnostic ignored \"-Wstrict-prototypes\"
-#include <gtk/gtk.h>
-#pragma GCC diagnostic error \"-Wstrict-prototypes\"
-
-int main(void)
-{
-       gtk_info_bar_new();
-
-       return 0;
-}
-endef
-endif
-
-ifndef NO_LIBPERL
-define SOURCE_PERL_EMBED
-#include <EXTERN.h>
-#include <perl.h>
-
-int main(void)
-{
-perl_alloc();
-return 0;
-}
-endef
-endif
-
-ifndef NO_LIBPYTHON
-define SOURCE_PYTHON_VERSION
-#include <Python.h>
-#if PY_VERSION_HEX >= 0x03000000
-       #error
-#endif
-int main(void)
-{
-       return 0;
-}
-endef
-define SOURCE_PYTHON_EMBED
-#include <Python.h>
-int main(void)
-{
-       Py_Initialize();
-       return 0;
-}
-endef
-endif
-
-define SOURCE_BFD
-#include <bfd.h>
-
-int main(void)
-{
-       bfd_demangle(0, 0, 0);
-       return 0;
-}
-endef
-
-define SOURCE_CPLUS_DEMANGLE
-extern char *cplus_demangle(const char *, int);
-
-int main(void)
-{
-       cplus_demangle(0, 0);
-       return 0;
-}
-endef
-
-define SOURCE_STRLCPY
-#include <stdlib.h>
-extern size_t strlcpy(char *dest, const char *src, size_t size);
-
-int main(void)
-{
-       strlcpy(NULL, NULL, 0);
-       return 0;
-}
-endef
-
-ifndef NO_LIBUNWIND
-define SOURCE_LIBUNWIND
-#include <libunwind.h>
-#include <stdlib.h>
-
-extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
-                                      unw_word_t ip,
-                                      unw_dyn_info_t *di,
-                                      unw_proc_info_t *pi,
-                                      int need_unwind_info, void *arg);
-
-#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
-
-int main(void)
-{
-       unw_addr_space_t addr_space;
-       addr_space = unw_create_addr_space(NULL, 0);
-       unw_init_remote(NULL, addr_space, NULL);
-       dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
-       return 0;
-}
-endef
-
-define SOURCE_LIBUNWIND_DEBUG_FRAME
-#include <libunwind.h>
-#include <stdlib.h>
-
-extern int
-UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
-                                unw_word_t ip, unw_word_t segbase,
-                                const char *obj_name, unw_word_t start,
-                                unw_word_t end);
-
-#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
-
-int main(void)
-{
-       dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
-       return 0;
-}
-endef
-
-endif
-
-ifndef NO_BACKTRACE
-define SOURCE_BACKTRACE
-#include <execinfo.h>
-#include <stdio.h>
-
-int main(void)
-{
-       backtrace(NULL, 0);
-       backtrace_symbols(NULL, 0);
-       return 0;
-}
-endef
-endif
-
-ifndef NO_LIBAUDIT
-define SOURCE_LIBAUDIT
-#include <libaudit.h>
-
-int main(void)
-{
-       printf(\"error message: %s\", audit_errno_to_name(0));
-       return audit_open();
-}
-endef
-endif
-
-define SOURCE_ON_EXIT
-#include <stdio.h>
-
-int main(void)
-{
-       return on_exit(NULL, NULL);
-}
-endef
-
-define SOURCE_LIBNUMA
-#include <numa.h>
-#include <numaif.h>
-
-int main(void)
-{
-       numa_available();
-       return 0;
-}
-endef
index 94d2d4f9c35d0e39ed0b570b033aca5294611736..f168debc5be268a75456a3639ccbf120ded594a2 100644 (file)
@@ -179,16 +179,9 @@ _ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_e
 _gea_warn = $(warning The path '$(1)' is not executable.)
 _gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
 
-# try-cc
-# Usage: option = $(call try-cc, source-to-build, cc-options, msg)
-ifneq ($(V),1)
-TRY_CC_OUTPUT= > /dev/null 2>&1
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+  ifneq ($(V),1)
+    QUIET_CLEAN                = @printf '  CLEAN    %s\n' $1;
+    QUIET_INSTALL      = @printf '  INSTALL  %s\n' $1;
+  endif
 endif
-TRY_CC_MSG=echo "    CHK $(3)" 1>&2;
-
-try-cc = $(shell sh -c                                           \
-       'TMP="$(OUTPUT)$(TMPOUT).$$$$";                           \
-        $(TRY_CC_MSG)                                            \
-        echo "$(1)" |                                            \
-        $(CC) -x c - $(2) -o "$$TMP" $(TRY_CC_OUTPUT) && echo y; \
-        rm -f "$$TMP"')
index 85e1aed95204c21de59b2b80721735670839b814..8b38b4e80ec2c6f6de581022143c989ed6a0e955 100644 (file)
@@ -49,14 +49,14 @@ static struct cmd_struct commands[] = {
        { "version",    cmd_version,    0 },
        { "script",     cmd_script,     0 },
        { "sched",      cmd_sched,      0 },
-#ifdef LIBELF_SUPPORT
+#ifdef HAVE_LIBELF_SUPPORT
        { "probe",      cmd_probe,      0 },
 #endif
        { "kmem",       cmd_kmem,       0 },
        { "lock",       cmd_lock,       0 },
        { "kvm",        cmd_kvm,        0 },
        { "test",       cmd_test,       0 },
-#ifdef LIBAUDIT_SUPPORT
+#ifdef HAVE_LIBAUDIT_SUPPORT
        { "trace",      cmd_trace,      0 },
 #endif
        { "inject",     cmd_inject,     0 },
@@ -456,6 +456,7 @@ int main(int argc, const char **argv)
 {
        const char *cmd;
 
+       /* The page_size is placed in the util object. */
        page_size = sysconf(_SC_PAGE_SIZE);
 
        cmd = perf_extract_argv0_path(argv[0]);
@@ -480,7 +481,14 @@ int main(int argc, const char **argv)
                fprintf(stderr, "cannot handle %s internally", cmd);
                goto out;
        }
-
+#ifdef HAVE_LIBAUDIT_SUPPORT
+       if (!prefixcmp(cmd, "trace")) {
+               set_buildid_dir();
+               setup_path();
+               argv[0] = "trace";
+               return cmd_trace(argc, argv, NULL);
+       }
+#endif
        /* Look for flags.. */
        argv++;
        argc--;
index cf20187eee0a7345373dfc053c580bc1e77c4bb8..f61c230beec47f909d4c78af578346a0e96af184 100644 (file)
@@ -182,7 +182,9 @@ struct ip_callchain {
 struct branch_flags {
        u64 mispred:1;
        u64 predicted:1;
-       u64 reserved:62;
+       u64 in_tx:1;
+       u64 abort:1;
+       u64 reserved:60;
 };
 
 struct branch_entry {
@@ -218,7 +220,6 @@ struct perf_record_opts {
        bool         no_delay;
        bool         no_inherit;
        bool         no_samples;
-       bool         pipe_output;
        bool         raw_samples;
        bool         sample_address;
        bool         sample_weight;
@@ -231,6 +232,7 @@ struct perf_record_opts {
        u64          default_interval;
        u64          user_interval;
        u16          stack_dump_size;
+       bool         sample_transaction;
 };
 
 #endif
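
The two new transaction bits above come out of the reserved field, so the flags still pack into a single u64. A minimal standalone sketch of that invariant (the u64 typedef here is a stand-in added for illustration):

	#include <stdint.h>

	typedef uint64_t u64;	/* stand-in for perf's u64 */

	struct branch_flags {
		u64 mispred:1;
		u64 predicted:1;
		u64 in_tx:1;	/* branch inside a transaction */
		u64 abort:1;	/* transaction abort */
		u64 reserved:60;
	};

	/* 1 + 1 + 1 + 1 + 60 bits == 64, so the struct stays one word. */
	_Static_assert(sizeof(struct branch_flags) == sizeof(u64),
		       "branch_flags must remain a single 64-bit word");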
index dffe0551acaa717add54e96bfb864c8fbf6cf2d6..9cc81a3eb9b456e6d2693bce799fedb0caba9862 100644 (file)
@@ -35,6 +35,7 @@ static char *test_file(int size)
        if (size != write(fd, buf, size))
                templ = NULL;
 
+       free(buf);
        close(fd);
        return templ;
 }
index 4228ffc0d9681d7acda97ecacccf8bea9b76f1e4..025503a22ff722386bbd7e9a13fe8758dee66969 100644 (file)
@@ -222,7 +222,8 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
                                                          &sample) < 0)
                                goto out;
 
-                       he = __hists__add_entry(&evsel->hists, &al, NULL, 1, 1);
+                       he = __hists__add_entry(&evsel->hists, &al, NULL,
+                                               1, 1, 0);
                        if (he == NULL)
                                goto out;
 
@@ -244,7 +245,8 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
                                                          &sample) < 0)
                                goto out;
 
-                       he = __hists__add_entry(&evsel->hists, &al, NULL, 1, 1);
+                       he = __hists__add_entry(&evsel->hists, &al, NULL, 1, 1,
+                                               0);
                        if (he == NULL)
                                goto out;
 
index b8a7056519ac7c29b430a87d73be85314b4d8082..82ac71550091c4b8e5940c6eb122d88299a0fca6 100644 (file)
@@ -45,7 +45,7 @@ int test__PERF_RECORD(void)
        };
        cpu_set_t cpu_mask;
        size_t cpu_mask_size = sizeof(cpu_mask);
-       struct perf_evlist *evlist = perf_evlist__new();
+       struct perf_evlist *evlist = perf_evlist__new_default();
        struct perf_evsel *evsel;
        struct perf_sample sample;
        const char *cmd = "sleep";
@@ -65,16 +65,6 @@ int test__PERF_RECORD(void)
                goto out;
        }
 
-       /*
-        * We need at least one evsel in the evlist, use the default
-        * one: "cycles".
-        */
-       err = perf_evlist__add_default(evlist);
-       if (err < 0) {
-               pr_debug("Not enough memory to create evsel\n");
-               goto out_delete_evlist;
-       }
-
        /*
         * Create maps of threads and cpus to monitor. In this case
         * we start with all threads and cpus (-1, -1) but then in
index 28fe5894b0618901493100ea0995c8dda093b9ec..b07f8a14e15df58f22de8db3a22cd689dc5c647e 100644 (file)
@@ -37,20 +37,11 @@ int test__task_exit(void)
        signal(SIGCHLD, sig_handler);
        signal(SIGUSR1, sig_handler);
 
-       evlist = perf_evlist__new();
+       evlist = perf_evlist__new_default();
        if (evlist == NULL) {
-               pr_debug("perf_evlist__new\n");
+               pr_debug("perf_evlist__new_default\n");
                return -1;
        }
-       /*
-        * We need at least one evsel in the evlist, use the default
-        * one: "cycles".
-        */
-       err = perf_evlist__add_default(evlist);
-       if (err < 0) {
-               pr_debug("Not enough memory to create evsel\n");
-               goto out_free_evlist;
-       }
 
        /*
         * Create maps of threads and cpus to monitor. In this case
@@ -117,7 +108,6 @@ out_close_evlist:
        perf_evlist__close(evlist);
 out_delete_maps:
        perf_evlist__delete_maps(evlist);
-out_free_evlist:
        perf_evlist__delete(evlist);
        return err;
 }
index 08545ae46992cd5b284e279bf5ac131cf9c84e62..f0697a3aede04cb19f91d459cab62a6c08c5326f 100644 (file)
@@ -442,35 +442,37 @@ static bool annotate_browser__callq(struct annotate_browser *browser,
 {
        struct map_symbol *ms = browser->b.priv;
        struct disasm_line *dl = browser->selection;
-       struct symbol *sym = ms->sym;
        struct annotation *notes;
-       struct symbol *target;
-       u64 ip;
+       struct addr_map_symbol target = {
+               .map = ms->map,
+               .addr = map__objdump_2mem(ms->map, dl->ops.target.addr),
+       };
        char title[SYM_TITLE_MAX_SIZE];
 
        if (!ins__is_call(dl->ins))
                return false;
 
-       ip = ms->map->map_ip(ms->map, dl->ops.target.addr);
-       target = map__find_symbol(ms->map, ip, NULL);
-       if (target == NULL) {
+       if (map_groups__find_ams(&target, NULL) ||
+           map__rip_2objdump(target.map, target.map->map_ip(target.map,
+                                                            target.addr)) !=
+           dl->ops.target.addr) {
                ui_helpline__puts("The called function was not found.");
                return true;
        }
 
-       notes = symbol__annotation(target);
+       notes = symbol__annotation(target.sym);
        pthread_mutex_lock(&notes->lock);
 
-       if (notes->src == NULL && symbol__alloc_hist(target) < 0) {
+       if (notes->src == NULL && symbol__alloc_hist(target.sym) < 0) {
                pthread_mutex_unlock(&notes->lock);
                ui__warning("Not enough memory for annotating '%s' symbol!\n",
-                           target->name);
+                           target.sym->name);
                return true;
        }
 
        pthread_mutex_unlock(&notes->lock);
-       symbol__tui_annotate(target, ms->map, evsel, hbt);
-       sym_title(sym, ms->map, title, sizeof(title));
+       symbol__tui_annotate(target.sym, target.map, evsel, hbt);
+       sym_title(ms->sym, ms->map, title, sizeof(title));
        ui_browser__show_title(&browser->b, title);
        return true;
 }
index f538794615dbb7c399371d93bd1b96cef77ea234..9c7ff8d31b274e32d22147664cb622437db514f1 100644 (file)
@@ -154,9 +154,9 @@ static int perf_gtk__annotate_symbol(GtkWidget *window, struct symbol *sym,
        return 0;
 }
 
-int symbol__gtk_annotate(struct symbol *sym, struct map *map,
-                        struct perf_evsel *evsel,
-                        struct hist_browser_timer *hbt)
+static int symbol__gtk_annotate(struct symbol *sym, struct map *map,
+                               struct perf_evsel *evsel,
+                               struct hist_browser_timer *hbt)
 {
        GtkWidget *window;
        GtkWidget *notebook;
@@ -226,6 +226,13 @@ int symbol__gtk_annotate(struct symbol *sym, struct map *map,
        return 0;
 }
 
+int hist_entry__gtk_annotate(struct hist_entry *he,
+                            struct perf_evsel *evsel,
+                            struct hist_browser_timer *hbt)
+{
+       return symbol__gtk_annotate(he->ms.sym, he->ms.map, evsel, hbt);
+}
+
 void perf_gtk__show_annotations(void)
 {
        GtkWidget *window;
index c95012cdb4387f95a4125f7da3f58cf224f33dc8..c24d91221290e0049b409de582dfcf6ce7994ef5 100644 (file)
@@ -43,7 +43,7 @@ const char *perf_gtk__get_percent_color(double percent)
        return NULL;
 }
 
-#ifdef HAVE_GTK_INFO_BAR
+#ifdef HAVE_GTK_INFO_BAR_SUPPORT
 GtkWidget *perf_gtk__setup_info_bar(void)
 {
        GtkWidget *info_bar;
index 3d96785ef155044d006f4438bf35b17ee3717148..8576cf194872da1065e90eb63a40482ebb63479e 100644 (file)
@@ -12,7 +12,7 @@ struct perf_gtk_context {
        GtkWidget *main_window;
        GtkWidget *notebook;
 
-#ifdef HAVE_GTK_INFO_BAR
+#ifdef HAVE_GTK_INFO_BAR_SUPPORT
        GtkWidget *info_bar;
        GtkWidget *message_label;
 #endif
@@ -20,6 +20,9 @@ struct perf_gtk_context {
        guint statbar_ctx_id;
 };
 
+int perf_gtk__init(void);
+void perf_gtk__exit(bool wait_for_ok);
+
 extern struct perf_gtk_context *pgctx;
 
 static inline bool perf_gtk__is_active_context(struct perf_gtk_context *ctx)
@@ -39,7 +42,7 @@ void perf_gtk__resize_window(GtkWidget *window);
 const char *perf_gtk__get_percent_color(double percent);
 GtkWidget *perf_gtk__setup_statusbar(void);
 
-#ifdef HAVE_GTK_INFO_BAR
+#ifdef HAVE_GTK_INFO_BAR_SUPPORT
 GtkWidget *perf_gtk__setup_info_bar(void);
 #else
 static inline GtkWidget *perf_gtk__setup_info_bar(void)
@@ -48,4 +51,17 @@ static inline GtkWidget *perf_gtk__setup_info_bar(void)
 }
 #endif
 
+struct perf_evsel;
+struct perf_evlist;
+struct hist_entry;
+struct hist_browser_timer;
+
+int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, const char *help,
+                                 struct hist_browser_timer *hbt,
+                                 float min_pcnt);
+int hist_entry__gtk_annotate(struct hist_entry *he,
+                            struct perf_evsel *evsel,
+                            struct hist_browser_timer *hbt);
+void perf_gtk__show_annotations(void);
+
 #endif /* _PERF_GTK_H_ */
index c06942a41c782e0ac029dba3719c638a4563b772..696c1fbe42482db03b4fb460b488860435a4edf5 100644 (file)
@@ -53,7 +53,7 @@ static int perf_gtk__error(const char *format, va_list args)
        return 0;
 }
 
-#ifdef HAVE_GTK_INFO_BAR
+#ifdef HAVE_GTK_INFO_BAR_SUPPORT
 static int perf_gtk__warning_info_bar(const char *format, va_list args)
 {
        char *msg;
@@ -105,7 +105,7 @@ static int perf_gtk__warning_statusbar(const char *format, va_list args)
 
 struct perf_error_ops perf_gtk_eops = {
        .error          = perf_gtk__error,
-#ifdef HAVE_GTK_INFO_BAR
+#ifdef HAVE_GTK_INFO_BAR_SUPPORT
        .warning        = perf_gtk__warning_info_bar,
 #else
        .warning        = perf_gtk__warning_statusbar,
index 47d9a571f261da9c65e36aa82f8dbdb70a838acc..5df5140a9f29e466dd055b8873711e722875f646 100644 (file)
@@ -1,10 +1,64 @@
 #include <pthread.h>
+#include <dlfcn.h>
 
 #include "../util/cache.h"
 #include "../util/debug.h"
 #include "../util/hist.h"
 
 pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
+void *perf_gtk_handle;
+
+#ifdef HAVE_GTK2_SUPPORT
+static int setup_gtk_browser(void)
+{
+       int (*perf_ui_init)(void);
+
+       if (perf_gtk_handle)
+               return 0;
+
+       perf_gtk_handle = dlopen(PERF_GTK_DSO, RTLD_LAZY);
+       if (perf_gtk_handle == NULL) {
+               char buf[PATH_MAX];
+               scnprintf(buf, sizeof(buf), "%s/%s", LIBDIR, PERF_GTK_DSO);
+               perf_gtk_handle = dlopen(buf, RTLD_LAZY);
+       }
+       if (perf_gtk_handle == NULL)
+               return -1;
+
+       perf_ui_init = dlsym(perf_gtk_handle, "perf_gtk__init");
+       if (perf_ui_init == NULL)
+               goto out_close;
+
+       if (perf_ui_init() == 0)
+               return 0;
+
+out_close:
+       dlclose(perf_gtk_handle);
+       return -1;
+}
+
+static void exit_gtk_browser(bool wait_for_ok)
+{
+       void (*perf_ui_exit)(bool);
+
+       if (perf_gtk_handle == NULL)
+               return;
+
+       perf_ui_exit = dlsym(perf_gtk_handle, "perf_gtk__exit");
+       if (perf_ui_exit == NULL)
+               goto out_close;
+
+       perf_ui_exit(wait_for_ok);
+
+out_close:
+       dlclose(perf_gtk_handle);
+
+       perf_gtk_handle = NULL;
+}
+#else
+static inline int setup_gtk_browser(void) { return -1; }
+static inline void exit_gtk_browser(bool wait_for_ok __maybe_unused) {}
+#endif
 
 void setup_browser(bool fallback_to_pager)
 {
@@ -17,8 +71,11 @@ void setup_browser(bool fallback_to_pager)
 
        switch (use_browser) {
        case 2:
-               if (perf_gtk__init() == 0)
+               if (setup_gtk_browser() == 0)
                        break;
+               printf("GTK browser requested but could not find %s\n",
+                      PERF_GTK_DSO);
+               sleep(1);
                /* fall through */
        case 1:
                use_browser = 1;
@@ -39,7 +96,7 @@ void exit_browser(bool wait_for_ok)
 {
        switch (use_browser) {
        case 2:
-               perf_gtk__exit(wait_for_ok);
+               exit_gtk_browser(wait_for_ok);
                break;
 
        case 1:
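
setup_gtk_browser() above moves the GTK UI behind dlopen() so the perf binary no longer links against GTK at load time; the browser becomes an optional DSO resolved at run time. The pattern in isolation, with hypothetical plugin and symbol names (link with -ldl), might look like:

	#include <dlfcn.h>
	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical plugin DSO and entry point, for illustration. */
		void *handle = dlopen("libexample-ui.so", RTLD_LAZY);
		int (*ui_init)(void);

		if (handle == NULL) {
			fprintf(stderr, "UI not available: %s\n", dlerror());
			return 1;		/* fall back to another UI */
		}

		ui_init = (int (*)(void))dlsym(handle, "ui_init");
		if (ui_init == NULL || ui_init() != 0) {
			dlclose(handle);
			return 1;
		}

		/* ... run the UI, dlclose(handle) on exit ... */
		return 0;
	}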
index 70cb0d4eb8aa390c008a4d84c575662c2515280d..ab88383f8be85d5278dfaba5b6b6b07b11a6b2d2 100644 (file)
@@ -6,13 +6,14 @@
 #include <linux/compiler.h>
 
 extern pthread_mutex_t ui__lock;
+extern void *perf_gtk_handle;
 
 extern int use_browser;
 
 void setup_browser(bool fallback_to_pager);
 void exit_browser(bool wait_for_ok);
 
-#ifdef SLANG_SUPPORT
+#ifdef HAVE_SLANG_SUPPORT
 int ui__init(void);
 void ui__exit(bool wait_for_ok);
 #else
@@ -23,17 +24,6 @@ static inline int ui__init(void)
 static inline void ui__exit(bool wait_for_ok __maybe_unused) {}
 #endif
 
-#ifdef GTK2_SUPPORT
-int perf_gtk__init(void);
-void perf_gtk__exit(bool wait_for_ok);
-#else
-static inline int perf_gtk__init(void)
-{
-       return -1;
-}
-static inline void perf_gtk__exit(bool wait_for_ok __maybe_unused) {}
-#endif
-
 void ui__refresh_dimensions(bool force);
 
 #endif /* _PERF_UI_H_ */
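
The HAVE_SLANG_SUPPORT block above follows the usual compile-out pattern: when a feature is disabled, the header supplies inline no-op stubs so call sites need no #ifdefs of their own. Reduced to its essentials (names hypothetical):

	#ifdef HAVE_FEATURE_X_SUPPORT
	int feature_x_init(void);
	void feature_x_exit(void);
	#else
	static inline int feature_x_init(void) { return -1; }	/* "unavailable" */
	static inline void feature_x_exit(void) {}
	#endif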
index 15a77b7c0e36f155c968cbf59780aec089c53f3a..ce7a804b2951656b4cfb1946d5541f392123e7ea 100755 (executable)
@@ -40,7 +40,7 @@ else
        VC=unset
 fi
 test "$VN" = "$VC" || {
-       echo >&2 "PERF_VERSION = $VN"
+       echo >&2 "  PERF_VERSION = $VN"
        echo "#define PERF_VERSION \"$VN\"" >$GVF
 }
 
index 7eae5488ecea47344cac10677104cb9e6cb2cc44..cf6242c92ee244ab45498993cab477af7950853f 100644 (file)
@@ -825,20 +825,16 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
                dl->ops.target.offset = dl->ops.target.addr -
                                        map__rip_2objdump(map, sym->start);
 
-       /*
-        * kcore has no symbols, so add the call target name if it is on the
-        * same map.
-        */
+       /* kcore has no symbols, so add the call target name */
        if (dl->ins && ins__is_call(dl->ins) && !dl->ops.target.name) {
-               struct symbol *s;
-               u64 ip = dl->ops.target.addr;
-
-               if (ip >= map->start && ip <= map->end) {
-                       ip = map->map_ip(map, ip);
-                       s = map__find_symbol(map, ip, NULL);
-                       if (s && s->start == ip)
-                               dl->ops.target.name = strdup(s->name);
-               }
+               struct addr_map_symbol target = {
+                       .map = map,
+                       .addr = dl->ops.target.addr,
+               };
+
+               if (!map_groups__find_ams(&target, NULL) &&
+                   target.sym->start == target.al_addr)
+                       dl->ops.target.name = strdup(target.sym->name);
        }
 
        disasm__add(&notes->src->source, dl);
@@ -879,6 +875,8 @@ int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize)
        FILE *file;
        int err = 0;
        char symfs_filename[PATH_MAX];
+       struct kcore_extract kce;
+       bool delete_extract = false;
 
        if (filename) {
                snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
@@ -940,6 +938,23 @@ fallback:
        pr_debug("annotating [%p] %30s : [%p] %30s\n",
                 dso, dso->long_name, sym, sym->name);
 
+       if (dso__is_kcore(dso)) {
+               kce.kcore_filename = symfs_filename;
+               kce.addr = map__rip_2objdump(map, sym->start);
+               kce.offs = sym->start;
+               kce.len = sym->end + 1 - sym->start;
+               if (!kcore_extract__create(&kce)) {
+                       delete_extract = true;
+                       strlcpy(symfs_filename, kce.extract_filename,
+                               sizeof(symfs_filename));
+                       if (free_filename) {
+                               free(filename);
+                               free_filename = false;
+                       }
+                       filename = symfs_filename;
+               }
+       }
+
        snprintf(command, sizeof(command),
                 "%s %s%s --start-address=0x%016" PRIx64
                 " --stop-address=0x%016" PRIx64
@@ -972,6 +987,8 @@ fallback:
 
        pclose(file);
 out_free_filename:
+       if (delete_extract)
+               kcore_extract__delete(&kce);
        if (free_filename)
                free(filename);
        return err;
@@ -1070,7 +1087,7 @@ static void symbol__free_source_line(struct symbol *sym, int len)
                          (sizeof(src_line->p) * (src_line->nr_pcnt - 1));
 
        for (i = 0; i < len; i++) {
-               free(src_line->path);
+               free_srcline(src_line->path);
                src_line = (void *)src_line + sizeof_src_line;
        }
 
@@ -1081,13 +1098,11 @@ static void symbol__free_source_line(struct symbol *sym, int len)
 /* Get the filename:line for the colored entries */
 static int symbol__get_source_line(struct symbol *sym, struct map *map,
                                   struct perf_evsel *evsel,
-                                  struct rb_root *root, int len,
-                                  const char *filename)
+                                  struct rb_root *root, int len)
 {
        u64 start;
        int i, k;
        int evidx = evsel->idx;
-       char cmd[PATH_MAX * 2];
        struct source_line *src_line;
        struct annotation *notes = symbol__annotation(sym);
        struct sym_hist *h = annotation__histogram(notes, evidx);
@@ -1115,10 +1130,7 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
        start = map__rip_2objdump(map, sym->start);
 
        for (i = 0; i < len; i++) {
-               char *path = NULL;
-               size_t line_len;
                u64 offset;
-               FILE *fp;
                double percent_max = 0.0;
 
                src_line->nr_pcnt = nr_pcnt;
@@ -1135,23 +1147,9 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
                        goto next;
 
                offset = start + i;
-               sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset);
-               fp = popen(cmd, "r");
-               if (!fp)
-                       goto next;
-
-               if (getline(&path, &line_len, fp) < 0 || !line_len)
-                       goto next_close;
-
-               src_line->path = malloc(sizeof(char) * line_len + 1);
-               if (!src_line->path)
-                       goto next_close;
-
-               strcpy(src_line->path, path);
+               src_line->path = get_srcline(map->dso, offset);
                insert_source_line(&tmp_root, src_line);
 
-       next_close:
-               pclose(fp);
        next:
                src_line = (void *)src_line + sizeof_src_line;
        }
@@ -1192,7 +1190,7 @@ static void print_summary(struct rb_root *root, const char *filename)
 
                path = src_line->path;
                color = get_percent_color(percent_max);
-               color_fprintf(stdout, color, " %s", path);
+               color_fprintf(stdout, color, " %s\n", path);
 
                node = rb_next(node);
        }
@@ -1356,7 +1354,6 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map,
                         bool full_paths, int min_pcnt, int max_lines)
 {
        struct dso *dso = map->dso;
-       const char *filename = dso->long_name;
        struct rb_root source_line = RB_ROOT;
        u64 len;
 
@@ -1366,9 +1363,8 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map,
        len = symbol__size(sym);
 
        if (print_lines) {
-               symbol__get_source_line(sym, map, evsel, &source_line,
-                                       len, filename);
-               print_summary(&source_line, filename);
+               symbol__get_source_line(sym, map, evsel, &source_line, len);
+               print_summary(&source_line, dso->long_name);
        }
 
        symbol__annotate_printf(sym, map, evsel, full_paths,
index af755156d2785b7963d834adcd5e1ee3cc07b4be..834b7b57b7884f7b833291caa6bcaad2a1eff1d1 100644 (file)
@@ -150,7 +150,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map,
                         struct perf_evsel *evsel, bool print_lines,
                         bool full_paths, int min_pcnt, int max_lines);
 
-#ifdef SLANG_SUPPORT
+#ifdef HAVE_SLANG_SUPPORT
 int symbol__tui_annotate(struct symbol *sym, struct map *map,
                         struct perf_evsel *evsel,
                         struct hist_browser_timer *hbt);
@@ -165,30 +165,6 @@ static inline int symbol__tui_annotate(struct symbol *sym __maybe_unused,
 }
 #endif
 
-#ifdef GTK2_SUPPORT
-int symbol__gtk_annotate(struct symbol *sym, struct map *map,
-                        struct perf_evsel *evsel,
-                        struct hist_browser_timer *hbt);
-
-static inline int hist_entry__gtk_annotate(struct hist_entry *he,
-                                          struct perf_evsel *evsel,
-                                          struct hist_browser_timer *hbt)
-{
-       return symbol__gtk_annotate(he->ms.sym, he->ms.map, evsel, hbt);
-}
-
-void perf_gtk__show_annotations(void);
-#else
-static inline int hist_entry__gtk_annotate(struct hist_entry *he __maybe_unused,
-                               struct perf_evsel *evsel __maybe_unused,
-                               struct hist_browser_timer *hbt __maybe_unused)
-{
-       return 0;
-}
-
-static inline void perf_gtk__show_annotations(void) {}
-#endif
-
 extern const char      *disassembler_style;
 
 #endif /* __PERF_ANNOTATE_H */
index 26e367239873fadbdcde27610d6ee2577c0e9f79..7b176dd02e1ae94e6f5479b78ae4c2e84bb7836c 100644 (file)
@@ -70,8 +70,7 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2
 extern char *perf_pathdup(const char *fmt, ...)
        __attribute__((format (printf, 1, 2)));
 
-#ifndef HAVE_STRLCPY
+/* Matches the libc/libbsd prototype, so we declare this unconditionally: */
 extern size_t strlcpy(char *dest, const char *src, size_t size);
-#endif
 
 #endif /* __PERF_CACHE_H */
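
For readers unfamiliar with the BSD API declared above: strlcpy() copies at most size-1 bytes, always NUL-terminates when size is non-zero, and returns strlen(src) so callers can detect truncation. A minimal reference sketch, not necessarily perf's implementation:

	#include <string.h>

	size_t strlcpy_sketch(char *dest, const char *src, size_t size)
	{
		size_t len = strlen(src);

		if (size) {
			size_t copy = len >= size ? size - 1 : len;

			memcpy(dest, src, copy);
			dest[copy] = '\0';	/* always terminate */
		}
		return len;			/* >= size means truncated */
	}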
index 482f68081cd8d50a7c84fd25845841c18fcfb870..e3970e3eaacf45bc7a10db4a86314c2fdd79fbb5 100644 (file)
 
 __thread struct callchain_cursor callchain_cursor;
 
-#define chain_for_each_child(child, parent)    \
-       list_for_each_entry(child, &parent->children, siblings)
-
-#define chain_for_each_child_safe(child, next, parent) \
-       list_for_each_entry_safe(child, next, &parent->children, siblings)
-
 static void
 rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
                    enum chain_mode mode)
@@ -71,10 +65,16 @@ static void
 __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
                  u64 min_hit)
 {
+       struct rb_node *n;
        struct callchain_node *child;
 
-       chain_for_each_child(child, node)
+       n = rb_first(&node->rb_root_in);
+       while (n) {
+               child = rb_entry(n, struct callchain_node, rb_node_in);
+               n = rb_next(n);
+
                __sort_chain_flat(rb_root, child, min_hit);
+       }
 
        if (node->hit && node->hit >= min_hit)
                rb_insert_callchain(rb_root, node, CHAIN_FLAT);
@@ -94,11 +94,16 @@ sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root,
 static void __sort_chain_graph_abs(struct callchain_node *node,
                                   u64 min_hit)
 {
+       struct rb_node *n;
        struct callchain_node *child;
 
        node->rb_root = RB_ROOT;
+       n = rb_first(&node->rb_root_in);
+
+       while (n) {
+               child = rb_entry(n, struct callchain_node, rb_node_in);
+               n = rb_next(n);
 
-       chain_for_each_child(child, node) {
                __sort_chain_graph_abs(child, min_hit);
                if (callchain_cumul_hits(child) >= min_hit)
                        rb_insert_callchain(&node->rb_root, child,
@@ -117,13 +122,18 @@ sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root,
 static void __sort_chain_graph_rel(struct callchain_node *node,
                                   double min_percent)
 {
+       struct rb_node *n;
        struct callchain_node *child;
        u64 min_hit;
 
        node->rb_root = RB_ROOT;
        min_hit = ceil(node->children_hit * min_percent);
 
-       chain_for_each_child(child, node) {
+       n = rb_first(&node->rb_root_in);
+       while (n) {
+               child = rb_entry(n, struct callchain_node, rb_node_in);
+               n = rb_next(n);
+
                __sort_chain_graph_rel(child, min_percent);
                if (callchain_cumul_hits(child) >= min_hit)
                        rb_insert_callchain(&node->rb_root, child,
@@ -173,19 +183,26 @@ create_child(struct callchain_node *parent, bool inherit_children)
                return NULL;
        }
        new->parent = parent;
-       INIT_LIST_HEAD(&new->children);
        INIT_LIST_HEAD(&new->val);
 
        if (inherit_children) {
-               struct callchain_node *next;
+               struct rb_node *n;
+               struct callchain_node *child;
+
+               new->rb_root_in = parent->rb_root_in;
+               parent->rb_root_in = RB_ROOT;
 
-               list_splice(&parent->children, &new->children);
-               INIT_LIST_HEAD(&parent->children);
+               n = rb_first(&new->rb_root_in);
+               while (n) {
+                       child = rb_entry(n, struct callchain_node, rb_node_in);
+                       child->parent = new;
+                       n = rb_next(n);
+               }
 
-               chain_for_each_child(next, new)
-                       next->parent = new;
+               /* make it the first child */
+               rb_link_node(&new->rb_node_in, NULL, &parent->rb_root_in.rb_node);
+               rb_insert_color(&new->rb_node_in, &parent->rb_root_in);
        }
-       list_add_tail(&new->siblings, &parent->children);
 
        return new;
 }
@@ -223,7 +240,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
        }
 }
 
-static void
+static struct callchain_node *
 add_child(struct callchain_node *parent,
          struct callchain_cursor *cursor,
          u64 period)
@@ -235,6 +252,19 @@ add_child(struct callchain_node *parent,
 
        new->children_hit = 0;
        new->hit = period;
+       return new;
+}
+
+static s64 match_chain(struct callchain_cursor_node *node,
+                     struct callchain_list *cnode)
+{
+       struct symbol *sym = node->sym;
+
+       if (cnode->ms.sym && sym &&
+           callchain_param.key == CCKEY_FUNCTION)
+               return cnode->ms.sym->start - sym->start;
+       else
+               return cnode->ip - node->ip;
 }
 
 /*
@@ -272,9 +302,33 @@ split_add_child(struct callchain_node *parent,
 
        /* create a new child for the new branch if any */
        if (idx_total < cursor->nr) {
+               struct callchain_node *first;
+               struct callchain_list *cnode;
+               struct callchain_cursor_node *node;
+               struct rb_node *p, **pp;
+
                parent->hit = 0;
-               add_child(parent, cursor, period);
                parent->children_hit += period;
+
+               node = callchain_cursor_current(cursor);
+               new = add_child(parent, cursor, period);
+
+               /*
+                * This is second child since we moved parent's children
+                * to new (first) child above.
+                */
+               p = parent->rb_root_in.rb_node;
+               first = rb_entry(p, struct callchain_node, rb_node_in);
+               cnode = list_first_entry(&first->val, struct callchain_list,
+                                        list);
+
+               if (match_chain(node, cnode) < 0)
+                       pp = &p->rb_left;
+               else
+                       pp = &p->rb_right;
+
+               rb_link_node(&new->rb_node_in, p, pp);
+               rb_insert_color(&new->rb_node_in, &parent->rb_root_in);
        } else {
                parent->hit = period;
        }
@@ -291,16 +345,40 @@ append_chain_children(struct callchain_node *root,
                      u64 period)
 {
        struct callchain_node *rnode;
+       struct callchain_cursor_node *node;
+       struct rb_node **p = &root->rb_root_in.rb_node;
+       struct rb_node *parent = NULL;
+
+       node = callchain_cursor_current(cursor);
+       if (!node)
+               return;
 
        /* lookup in childrens */
-       chain_for_each_child(rnode, root) {
-               unsigned int ret = append_chain(rnode, cursor, period);
+       while (*p) {
+               s64 ret;
+               struct callchain_list *cnode;
 
-               if (!ret)
+               parent = *p;
+               rnode = rb_entry(parent, struct callchain_node, rb_node_in);
+               cnode = list_first_entry(&rnode->val, struct callchain_list,
+                                        list);
+
+               /* just check first entry */
+               ret = match_chain(node, cnode);
+               if (ret == 0) {
+                       append_chain(rnode, cursor, period);
                        goto inc_children_hit;
+               }
+
+               if (ret < 0)
+                       p = &parent->rb_left;
+               else
+                       p = &parent->rb_right;
        }
        /* nothing in children, add to the current node */
-       add_child(root, cursor, period);
+       rnode = add_child(root, cursor, period);
+       rb_link_node(&rnode->rb_node_in, parent, p);
+       rb_insert_color(&rnode->rb_node_in, &root->rb_root_in);
 
 inc_children_hit:
        root->children_hit += period;
@@ -325,28 +403,20 @@ append_chain(struct callchain_node *root,
         */
        list_for_each_entry(cnode, &root->val, list) {
                struct callchain_cursor_node *node;
-               struct symbol *sym;
 
                node = callchain_cursor_current(cursor);
                if (!node)
                        break;
 
-               sym = node->sym;
-
-               if (cnode->ms.sym && sym &&
-                   callchain_param.key == CCKEY_FUNCTION) {
-                       if (cnode->ms.sym->start != sym->start)
-                               break;
-               } else if (cnode->ip != node->ip)
+               if (match_chain(node, cnode) != 0)
                        break;
 
-               if (!found)
-                       found = true;
+               found = true;
 
                callchain_cursor_advance(cursor);
        }
 
-       /* matches not, relay on the parent */
+       /* no match, rely on the parent */
        if (!found) {
                cursor->curr = curr_snap;
                cursor->pos = start;
@@ -395,8 +465,9 @@ merge_chain_branch(struct callchain_cursor *cursor,
                   struct callchain_node *dst, struct callchain_node *src)
 {
        struct callchain_cursor_node **old_last = cursor->last;
-       struct callchain_node *child, *next_child;
+       struct callchain_node *child;
        struct callchain_list *list, *next_list;
+       struct rb_node *n;
        int old_pos = cursor->nr;
        int err = 0;
 
@@ -412,12 +483,16 @@ merge_chain_branch(struct callchain_cursor *cursor,
                append_chain_children(dst, cursor, src->hit);
        }
 
-       chain_for_each_child_safe(child, next_child, src) {
+       n = rb_first(&src->rb_root_in);
+       while (n) {
+               child = container_of(n, struct callchain_node, rb_node_in);
+               n = rb_next(n);
+               rb_erase(&child->rb_node_in, &src->rb_root_in);
+
                err = merge_chain_branch(cursor, dst, child);
                if (err)
                        break;
 
-               list_del(&child->siblings);
                free(child);
        }
 
index 2b585bc308cff6f5f7cdfa1685c469007b0db3b5..7bb36022377f7f3c663095c15b8bb8007b87d839 100644 (file)
@@ -21,11 +21,11 @@ enum chain_order {
 
 struct callchain_node {
        struct callchain_node   *parent;
-       struct list_head        siblings;
-       struct list_head        children;
        struct list_head        val;
-       struct rb_node          rb_node; /* to sort nodes in an rbtree */
-       struct rb_root          rb_root; /* sorted tree of children */
+       struct rb_node          rb_node_in; /* to insert nodes in an rbtree */
+       struct rb_node          rb_node;    /* to sort nodes in an output tree */
+       struct rb_root          rb_root_in; /* input tree of children */
+       struct rb_root          rb_root;    /* sorted output tree of children */
        unsigned int            val_nr;
        u64                     hit;
        u64                     children_hit;
@@ -86,13 +86,12 @@ extern __thread struct callchain_cursor callchain_cursor;
 
 static inline void callchain_init(struct callchain_root *root)
 {
-       INIT_LIST_HEAD(&root->node.siblings);
-       INIT_LIST_HEAD(&root->node.children);
        INIT_LIST_HEAD(&root->node.val);
 
        root->node.parent = NULL;
        root->node.hit = 0;
        root->node.children_hit = 0;
+       root->node.rb_root_in = RB_ROOT;
        root->max_depth = 0;
 }
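
The conversion above replaces the sibling list with rb_root_in/rb_node_in so children stay sorted by match_chain() and the lookup in append_chain_children() becomes O(log n) instead of a linear scan. The insert half of the pattern, stripped of callchain details (a sketch assuming the kernel rbtree API already used in this file; 'cmp' plays the role of match_chain()):

	static void insert_child(struct rb_root *root, struct callchain_node *new,
				 s64 (*cmp)(struct callchain_node *a,
					    struct callchain_node *b))
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		while (*p) {
			struct callchain_node *cur;

			parent = *p;
			cur = rb_entry(parent, struct callchain_node, rb_node_in);
			if (cmp(new, cur) < 0)
				p = &parent->rb_left;
			else
				p = &parent->rb_right;
		}
		rb_link_node(&new->rb_node_in, parent, p);	/* attach as leaf */
		rb_insert_color(&new->rb_node_in, root);	/* rebalance */
	}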
 
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
new file mode 100644 (file)
index 0000000..7d09faf
--- /dev/null
@@ -0,0 +1,120 @@
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "data.h"
+#include "util.h"
+
+static bool check_pipe(struct perf_data_file *file)
+{
+       struct stat st;
+       bool is_pipe = false;
+       int fd = perf_data_file__is_read(file) ?
+                STDIN_FILENO : STDOUT_FILENO;
+
+       if (!file->path) {
+               if (!fstat(fd, &st) && S_ISFIFO(st.st_mode))
+                       is_pipe = true;
+       } else {
+               if (!strcmp(file->path, "-"))
+                       is_pipe = true;
+       }
+
+       if (is_pipe)
+               file->fd = fd;
+
+       return file->is_pipe = is_pipe;
+}
+
+static int check_backup(struct perf_data_file *file)
+{
+       struct stat st;
+
+       if (!stat(file->path, &st) && st.st_size) {
+               /* TODO check errors properly */
+               char oldname[PATH_MAX];
+               snprintf(oldname, sizeof(oldname), "%s.old",
+                        file->path);
+               unlink(oldname);
+               rename(file->path, oldname);
+       }
+
+       return 0;
+}
+
+static int open_file_read(struct perf_data_file *file)
+{
+       struct stat st;
+       int fd;
+
+       fd = open(file->path, O_RDONLY);
+       if (fd < 0) {
+               int err = errno;
+
+               pr_err("failed to open %s: %s", file->path, strerror(err));
+               if (err == ENOENT && !strcmp(file->path, "perf.data"))
+                       pr_err("  (try 'perf record' first)");
+               pr_err("\n");
+               return -err;
+       }
+
+       if (fstat(fd, &st) < 0)
+               goto out_close;
+
+       if (!file->force && st.st_uid && (st.st_uid != geteuid())) {
+               pr_err("file %s not owned by current user or root\n",
+                      file->path);
+               goto out_close;
+       }
+
+       if (!st.st_size) {
+               pr_info("zero-sized file (%s), nothing to do!\n",
+                       file->path);
+               goto out_close;
+       }
+
+       file->size = st.st_size;
+       return fd;
+
+ out_close:
+       close(fd);
+       return -1;
+}
+
+static int open_file_write(struct perf_data_file *file)
+{
+       if (check_backup(file))
+               return -1;
+
+       return open(file->path, O_CREAT|O_RDWR|O_TRUNC, S_IRUSR|S_IWUSR);
+}
+
+static int open_file(struct perf_data_file *file)
+{
+       int fd;
+
+       fd = perf_data_file__is_read(file) ?
+            open_file_read(file) : open_file_write(file);
+
+       file->fd = fd;
+       return fd < 0 ? -1 : 0;
+}
+
+int perf_data_file__open(struct perf_data_file *file)
+{
+       if (check_pipe(file))
+               return 0;
+
+       if (!file->path)
+               file->path = "perf.data";
+
+       return open_file(file);
+}
+
+void perf_data_file__close(struct perf_data_file *file)
+{
+       close(file->fd);
+}
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
new file mode 100644 (file)
index 0000000..8c2df80
--- /dev/null
@@ -0,0 +1,48 @@
+#ifndef __PERF_DATA_H
+#define __PERF_DATA_H
+
+#include <stdbool.h>
+
+enum perf_data_mode {
+       PERF_DATA_MODE_WRITE,
+       PERF_DATA_MODE_READ,
+};
+
+struct perf_data_file {
+       const char *path;
+       int fd;
+       bool is_pipe;
+       bool force;
+       unsigned long size;
+       enum perf_data_mode mode;
+};
+
+static inline bool perf_data_file__is_read(struct perf_data_file *file)
+{
+       return file->mode == PERF_DATA_MODE_READ;
+}
+
+static inline bool perf_data_file__is_write(struct perf_data_file *file)
+{
+       return file->mode == PERF_DATA_MODE_WRITE;
+}
+
+static inline int perf_data_file__is_pipe(struct perf_data_file *file)
+{
+       return file->is_pipe;
+}
+
+static inline int perf_data_file__fd(struct perf_data_file *file)
+{
+       return file->fd;
+}
+
+static inline unsigned long perf_data_file__size(struct perf_data_file *file)
+{
+       return file->size;
+}
+
+int perf_data_file__open(struct perf_data_file *file);
+void perf_data_file__close(struct perf_data_file *file);
+
+#endif /* __PERF_DATA_H */
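
A hedged usage sketch for the new abstraction: zero-initialize a perf_data_file, pick a mode, and let perf_data_file__open() handle the pipe, backup and permission details shown in data.c above. A NULL path defaults to "perf.data":

	#include "data.h"	/* the header added above */

	static int read_default_session(void)
	{
		struct perf_data_file file = {
			.path = NULL,			/* NULL => "perf.data" */
			.mode = PERF_DATA_MODE_READ,
		};

		if (perf_data_file__open(&file))
			return -1;

		/* ... consume perf_data_file__fd(&file) ... */

		perf_data_file__close(&file);
		return 0;
	}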
index e3c1ff8512c827330d4afde013e19b7c5eeb58f2..af4c687cc49b54279c95c10ed48fe12843700658 100644 (file)
@@ -7,19 +7,20 @@
 char dso__symtab_origin(const struct dso *dso)
 {
        static const char origin[] = {
-               [DSO_BINARY_TYPE__KALLSYMS]             = 'k',
-               [DSO_BINARY_TYPE__VMLINUX]              = 'v',
-               [DSO_BINARY_TYPE__JAVA_JIT]             = 'j',
-               [DSO_BINARY_TYPE__DEBUGLINK]            = 'l',
-               [DSO_BINARY_TYPE__BUILD_ID_CACHE]       = 'B',
-               [DSO_BINARY_TYPE__FEDORA_DEBUGINFO]     = 'f',
-               [DSO_BINARY_TYPE__UBUNTU_DEBUGINFO]     = 'u',
-               [DSO_BINARY_TYPE__BUILDID_DEBUGINFO]    = 'b',
-               [DSO_BINARY_TYPE__SYSTEM_PATH_DSO]      = 'd',
-               [DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE]  = 'K',
-               [DSO_BINARY_TYPE__GUEST_KALLSYMS]       = 'g',
-               [DSO_BINARY_TYPE__GUEST_KMODULE]        = 'G',
-               [DSO_BINARY_TYPE__GUEST_VMLINUX]        = 'V',
+               [DSO_BINARY_TYPE__KALLSYMS]                     = 'k',
+               [DSO_BINARY_TYPE__VMLINUX]                      = 'v',
+               [DSO_BINARY_TYPE__JAVA_JIT]                     = 'j',
+               [DSO_BINARY_TYPE__DEBUGLINK]                    = 'l',
+               [DSO_BINARY_TYPE__BUILD_ID_CACHE]               = 'B',
+               [DSO_BINARY_TYPE__FEDORA_DEBUGINFO]             = 'f',
+               [DSO_BINARY_TYPE__UBUNTU_DEBUGINFO]             = 'u',
+               [DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO]       = 'o',
+               [DSO_BINARY_TYPE__BUILDID_DEBUGINFO]            = 'b',
+               [DSO_BINARY_TYPE__SYSTEM_PATH_DSO]              = 'd',
+               [DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE]          = 'K',
+               [DSO_BINARY_TYPE__GUEST_KALLSYMS]               = 'g',
+               [DSO_BINARY_TYPE__GUEST_KMODULE]                = 'G',
+               [DSO_BINARY_TYPE__GUEST_VMLINUX]                = 'V',
        };
 
        if (dso == NULL || dso->symtab_type == DSO_BINARY_TYPE__NOT_FOUND)
@@ -64,6 +65,28 @@ int dso__binary_type_file(struct dso *dso, enum dso_binary_type type,
                         symbol_conf.symfs, dso->long_name);
                break;
 
+       case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO:
+       {
+               char *last_slash;
+               size_t len;
+               size_t dir_size;
+
+               last_slash = dso->long_name + dso->long_name_len;
+               while (last_slash != dso->long_name && *last_slash != '/')
+                       last_slash--;
+
+               len = scnprintf(file, size, "%s", symbol_conf.symfs);
+               dir_size = last_slash - dso->long_name + 2;
+               if (dir_size > (size - len)) {
+                       ret = -1;
+                       break;
+               }
+               len += scnprintf(file + len, dir_size, "%s",  dso->long_name);
+               len += scnprintf(file + len , size - len, ".debug%s",
+                                                               last_slash);
+               break;
+       }
+
        case DSO_BINARY_TYPE__BUILDID_DEBUGINFO:
                if (!dso->has_build_id) {
                        ret = -1;
@@ -427,6 +450,7 @@ struct dso *dso__new(const char *name)
                dso->rel = 0;
                dso->sorted_by_name = 0;
                dso->has_build_id = 0;
+               dso->has_srcline = 1;
                dso->kernel = DSO_TYPE_USER;
                dso->needs_swap = DSO_SWAP__UNSET;
                INIT_LIST_HEAD(&dso->node);
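
Worked example for the new DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO case above: OpenEmbedded installs debug info next to the binary under a .debug/ subdirectory, so (with an empty symfs) a DSO at /usr/bin/perf is looked up as /usr/bin/.debug/perf. A standalone sketch of the path construction:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *long_name = "/usr/bin/perf";
		const char *last_slash = strrchr(long_name, '/');
		char file[4096];

		/* directory part including the trailing '/', then ".debug",
		 * then "/<basename>" */
		snprintf(file, sizeof(file), "%.*s.debug%s",
			 (int)(last_slash - long_name + 1), long_name, last_slash);
		puts(file);	/* prints /usr/bin/.debug/perf */
		return 0;
	}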
index b793053335d60fca3302fd34b27543cd6151b0fb..9ac666abbe7ef988f30630c053f9c4e709eb3edb 100644 (file)
@@ -6,6 +6,7 @@
 #include <stdbool.h>
 #include "types.h"
 #include "map.h"
+#include "build-id.h"
 
 enum dso_binary_type {
        DSO_BINARY_TYPE__KALLSYMS = 0,
@@ -23,6 +24,7 @@ enum dso_binary_type {
        DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE,
        DSO_BINARY_TYPE__KCORE,
        DSO_BINARY_TYPE__GUEST_KCORE,
+       DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
        DSO_BINARY_TYPE__NOT_FOUND,
 };
 
@@ -81,6 +83,7 @@ struct dso {
        enum dso_binary_type    data_type;
        u8               adjust_symbols:1;
        u8               has_build_id:1;
+       u8               has_srcline:1;
        u8               hit:1;
        u8               annotate_warned:1;
        u8               sname_alloc:1;
index 9b393e7dca6fe849037d5a1314a8cf6b22e1f596..63df031fc9c73868160305b7875d6e20eded09df 100644 (file)
@@ -187,7 +187,7 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                return -1;
        }
 
-       event->header.type = PERF_RECORD_MMAP2;
+       event->header.type = PERF_RECORD_MMAP;
        /*
         * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
         */
@@ -198,7 +198,6 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                char prot[5];
                char execname[PATH_MAX];
                char anonstr[] = "//anon";
-               unsigned int ino;
                size_t size;
                ssize_t n;
 
@@ -209,13 +208,10 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                strcpy(execname, "");
 
                /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n",
-                      &event->mmap2.start, &event->mmap2.len, prot,
-                      &event->mmap2.pgoff, &event->mmap2.maj,
-                      &event->mmap2.min,
-                      &ino, execname);
-
-               event->mmap2.ino = (u64)ino;
+               n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %*x:%*x %*u %s\n",
+                      &event->mmap.start, &event->mmap.len, prot,
+                      &event->mmap.pgoff,
+                      execname);
 
                if (n != 8)
                        continue;
@@ -227,15 +223,15 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                        strcpy(execname, anonstr);
 
                size = strlen(execname) + 1;
-               memcpy(event->mmap2.filename, execname, size);
+               memcpy(event->mmap.filename, execname, size);
                size = PERF_ALIGN(size, sizeof(u64));
-               event->mmap2.len -= event->mmap.start;
-               event->mmap2.header.size = (sizeof(event->mmap2) -
-                                       (sizeof(event->mmap2.filename) - size));
-               memset(event->mmap2.filename + size, 0, machine->id_hdr_size);
-               event->mmap2.header.size += machine->id_hdr_size;
-               event->mmap2.pid = tgid;
-               event->mmap2.tid = pid;
+               event->mmap.len -= event->mmap.start;
+               event->mmap.header.size = (sizeof(event->mmap) -
+                                       (sizeof(event->mmap.filename) - size));
+               memset(event->mmap.filename + size, 0, machine->id_hdr_size);
+               event->mmap.header.size += machine->id_hdr_size;
+               event->mmap.pid = tgid;
+               event->mmap.tid = pid;
 
                if (process(tool, event, &synth_sample, machine) != 0) {
                        rc = -1;
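
The reverted parser above relies on sscanf() assignment suppression: the %*x and %*u conversions consume the device and inode fields without storing them, and suppressed conversions do not count toward the return value, so a fully matched maps line now yields 5, not 8. A standalone sketch:

	#include <stdio.h>
	#include <inttypes.h>

	int main(void)
	{
		const char *bf =
			"00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat";
		uint64_t start, len, pgoff;
		char prot[5], execname[4096];

		/* %*x / %*u match input but assign nothing and are not
		 * counted in the return value. */
		int n = sscanf(bf, "%" SCNx64 "-%" SCNx64 " %s %" SCNx64
			       " %*x:%*x %*u %s",
			       &start, &len, prot, &pgoff, execname);

		printf("n = %d, file = %s\n", n, execname);	/* n = 5 */
		return 0;
	}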
index c67ecc457d295d029a2307c1b0ad7e1531b944d1..752709ccfb00e4c0f424dec263960168db0fe2f8 100644 (file)
@@ -61,6 +61,12 @@ struct read_event {
        u64 id;
 };
 
+struct throttle_event {
+       struct perf_event_header header;
+       u64 time;
+       u64 id;
+       u64 stream_id;
+};
 
 #define PERF_SAMPLE_MASK                               \
        (PERF_SAMPLE_IP | PERF_SAMPLE_TID |             \
@@ -69,6 +75,9 @@ struct read_event {
         PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD |         \
         PERF_SAMPLE_IDENTIFIER)
 
+/* perf sample has 16 bits size limit */
+#define PERF_SAMPLE_MAX_SIZE (1 << 16)
+
 struct sample_event {
        struct perf_event_header        header;
        u64 array[];
@@ -111,6 +120,7 @@ struct perf_sample {
        u64 stream_id;
        u64 period;
        u64 weight;
+       u64 transaction;
        u32 cpu;
        u32 raw_size;
        u64 data_src;
@@ -177,6 +187,7 @@ union perf_event {
        struct fork_event               fork;
        struct lost_event               lost;
        struct read_event               read;
+       struct throttle_event           throttle;
        struct sample_event             sample;
        struct attr_event               attr;
        struct event_type_event         event_type;
index f9f77bee0b1b416414fa3561fb2eee4785e502c4..85c4c80bcac8c68e3fe179dd2d73cee2f928923f 100644 (file)
@@ -18,6 +18,7 @@
 #include <unistd.h>
 
 #include "parse-events.h"
+#include "parse-options.h"
 
 #include <sys/mman.h>
 
@@ -49,6 +50,18 @@ struct perf_evlist *perf_evlist__new(void)
        return evlist;
 }
 
+struct perf_evlist *perf_evlist__new_default(void)
+{
+       struct perf_evlist *evlist = perf_evlist__new();
+
+       if (evlist && perf_evlist__add_default(evlist)) {
+               perf_evlist__delete(evlist);
+               evlist = NULL;
+       }
+
+       return evlist;
+}
+
 /**
  * perf_evlist__set_id_pos - set the positions of event ids.
  * @evlist: selected event list
@@ -527,7 +540,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
                if ((old & md->mask) + size != ((old + size) & md->mask)) {
                        unsigned int offset = old;
                        unsigned int len = min(sizeof(*event), size), cpy;
-                       void *dst = &md->event_copy;
+                       void *dst = md->event_copy;
 
                        do {
                                cpy = min(md->mask + 1 - (offset & md->mask), len);
@@ -537,7 +550,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
                                len -= cpy;
                        } while (len);
 
-                       event = &md->event_copy;
+                       event = (union perf_event *) md->event_copy;
                }
 
                old += size;
@@ -595,9 +608,36 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist,
        return 0;
 }
 
-static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int mask)
+static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
+                                      int prot, int mask, int cpu, int thread,
+                                      int *output)
 {
        struct perf_evsel *evsel;
+
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               int fd = FD(evsel, cpu, thread);
+
+               if (*output == -1) {
+                       *output = fd;
+                       if (__perf_evlist__mmap(evlist, idx, prot, mask,
+                                               *output) < 0)
+                               return -1;
+               } else {
+                       if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0)
+                               return -1;
+               }
+
+               if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
+                   perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
+static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot,
+                                    int mask)
+{
        int cpu, thread;
        int nr_cpus = cpu_map__nr(evlist->cpus);
        int nr_threads = thread_map__nr(evlist->threads);
@@ -607,23 +647,9 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int m
                int output = -1;
 
                for (thread = 0; thread < nr_threads; thread++) {
-                       list_for_each_entry(evsel, &evlist->entries, node) {
-                               int fd = FD(evsel, cpu, thread);
-
-                               if (output == -1) {
-                                       output = fd;
-                                       if (__perf_evlist__mmap(evlist, cpu,
-                                                               prot, mask, output) < 0)
-                                               goto out_unmap;
-                               } else {
-                                       if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0)
-                                               goto out_unmap;
-                               }
-
-                               if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-                                   perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0)
-                                       goto out_unmap;
-                       }
+                       if (perf_evlist__mmap_per_evsel(evlist, cpu, prot, mask,
+                                                       cpu, thread, &output))
+                               goto out_unmap;
                }
        }
 
@@ -635,9 +661,9 @@ out_unmap:
        return -1;
 }
 
-static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, int mask)
+static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot,
+                                       int mask)
 {
-       struct perf_evsel *evsel;
        int thread;
        int nr_threads = thread_map__nr(evlist->threads);
 
@@ -645,23 +671,9 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, in
        for (thread = 0; thread < nr_threads; thread++) {
                int output = -1;
 
-               list_for_each_entry(evsel, &evlist->entries, node) {
-                       int fd = FD(evsel, 0, thread);
-
-                       if (output == -1) {
-                               output = fd;
-                               if (__perf_evlist__mmap(evlist, thread,
-                                                       prot, mask, output) < 0)
-                                       goto out_unmap;
-                       } else {
-                               if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0)
-                                       goto out_unmap;
-                       }
-
-                       if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
-                           perf_evlist__id_add_fd(evlist, evsel, 0, thread, fd) < 0)
-                               goto out_unmap;
-               }
+               if (perf_evlist__mmap_per_evsel(evlist, thread, prot, mask, 0,
+                                               thread, &output))
+                       goto out_unmap;
        }
 
        return 0;
@@ -672,20 +684,70 @@ out_unmap:
        return -1;
 }
 
-/** perf_evlist__mmap - Create per cpu maps to receive events
- *
- * @evlist - list of events
- * @pages - map length in pages
- * @overwrite - overwrite older events?
- *
- * If overwrite is false the user needs to signal event consuption using:
- *
- *     struct perf_mmap *m = &evlist->mmap[cpu];
- *     unsigned int head = perf_mmap__read_head(m);
+static size_t perf_evlist__mmap_size(unsigned long pages)
+{
+       /* 512 KiB: default amount of unprivileged mlocked memory */
+       if (pages == UINT_MAX)
+               pages = (512 * 1024) / page_size;
+       else if (!is_power_of_2(pages))
+               return 0;
+
+       return (pages + 1) * page_size;
+}
+
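perf_evlist__mmap_size() now centralizes the sizing rule that used to be open-coded in perf_evlist__mmap(): the data area must span a power-of-two number of pages (so the ring index can be masked rather than divided), plus one extra page for the struct perf_event_mmap_page header. A worked example of the arithmetic, assuming 4 KiB pages:

    #include <assert.h>

    int main(void)
    {
            unsigned long page_size = 4096;
            unsigned long pages = (512 * 1024) / page_size; /* 128 data pages */
            unsigned long len = (pages + 1) * page_size;    /* + header page */

            assert(len == 129 * 4096);                /* 528 KiB mapping */
            assert(len - page_size - 1 == 0x7ffff);   /* ring index mask */
            return 0;
    }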
+int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
+                                 int unset __maybe_unused)
+{
+       unsigned int pages, val, *mmap_pages = opt->value;
+       size_t size;
+       static struct parse_tag tags[] = {
+               { .tag  = 'B', .mult = 1       },
+               { .tag  = 'K', .mult = 1 << 10 },
+               { .tag  = 'M', .mult = 1 << 20 },
+               { .tag  = 'G', .mult = 1 << 30 },
+               { .tag  = 0 },
+       };
+
+       val = parse_tag_value(str, tags);
+       if (val != (unsigned int) -1) {
+               /* we got a file size value */
+               pages = PERF_ALIGN(val, page_size) / page_size;
+               if (!is_power_of_2(pages)) {
+                       pages = next_pow2(pages);
+                       pr_info("rounding mmap pages size to %u bytes (%u pages)\n",
+                               pages * page_size, pages);
+               }
+       } else {
+               /* we got a page count value */
+               char *eptr;
+               pages = strtoul(str, &eptr, 10);
+               if (*eptr != '\0') {
+                       pr_err("failed to parse --mmap_pages/-m value\n");
+                       return -1;
+               }
+       }
+
+       size = perf_evlist__mmap_size(pages);
+       if (!size) {
+               pr_err("--mmap_pages/-m value must be a power of two.");
+               return -1;
+       }
+
+       *mmap_pages = pages;
+       return 0;
+}
+
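The parser accepts either a size with a B/K/M/G suffix (rounded up to a power-of-two page count) or a bare page count. A sketch of how a builtin might hook it up through its option table; the surrounding variable name is an assumption for illustration:

    #include <limits.h>
    #include "util/parse-options.h"

    static unsigned int mmap_pages = UINT_MAX;  /* UINT_MAX = use default */

    static const struct option options[] = {
            OPT_CALLBACK('m', "mmap-pages", &mmap_pages, "pages",
                         "number of mmap data pages",
                         perf_evlist__parse_mmap_pages),
            OPT_END()
    };

With this in place, '-m 512K' is converted to bytes and rounded up to a power-of-two page count, while '-m 16' is taken as a page count directly.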
+/**
+ * perf_evlist__mmap - Create mmaps to receive events.
+ * @evlist: list of events
+ * @pages: map length in pages
+ * @overwrite: overwrite older events?
  *
- *     perf_mmap__write_tail(m, head)
+ * If @overwrite is %false the user needs to signal event consumption using
+ * perf_mmap__write_tail().  Using perf_evlist__mmap_read() does this
+ * automatically.
  *
- * Using perf_evlist__read_on_cpu does this automatically.
+ * Return: %0 on success, negative error code otherwise.
  */
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
                      bool overwrite)
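A sketch of the consumption contract described in the comment above, assuming the evlist was mmapped with overwrite == false so that perf_evlist__mmap_read() advances the tail on the caller's behalf:

    static void drain_events(struct perf_evlist *evlist)
    {
            union perf_event *event;
            int i;

            for (i = 0; i < evlist->nr_mmaps; i++) {
                    while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
                            /* ... process the event here ... */
                    }
            }
    }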
@@ -695,14 +757,6 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
        const struct thread_map *threads = evlist->threads;
        int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE), mask;
 
-        /* 512 kiB: default amount of unprivileged mlocked memory */
-        if (pages == UINT_MAX)
-                pages = (512 * 1024) / page_size;
-       else if (!is_power_of_2(pages))
-               return -EINVAL;
-
-       mask = pages * page_size - 1;
-
        if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
                return -ENOMEM;
 
@@ -710,7 +764,9 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
                return -ENOMEM;
 
        evlist->overwrite = overwrite;
-       evlist->mmap_len = (pages + 1) * page_size;
+       evlist->mmap_len = perf_evlist__mmap_size(pages);
+       pr_debug("mmap size %zuB\n", evlist->mmap_len);
+       mask = evlist->mmap_len - page_size - 1;
 
        list_for_each_entry(evsel, &evlist->entries, node) {
                if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
@@ -1066,3 +1122,66 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp)
 
        return printed + fprintf(fp, "\n");
 }
+
+int perf_evlist__strerror_tp(struct perf_evlist *evlist __maybe_unused,
+                            int err, char *buf, size_t size)
+{
+       char sbuf[128];
+
+       switch (err) {
+       case ENOENT:
+               scnprintf(buf, size, "%s",
+                         "Error:\tUnable to find debugfs\n"
+                         "Hint:\tWas your kernel was compiled with debugfs support?\n"
+                         "Hint:\tIs the debugfs filesystem mounted?\n"
+                         "Hint:\tTry 'sudo mount -t debugfs nodev /sys/kernel/debug'");
+               break;
+       case EACCES:
+               scnprintf(buf, size,
+                         "Error:\tNo permissions to read %s/tracing/events/raw_syscalls\n"
+                         "Hint:\tTry 'sudo mount -o remount,mode=755 %s'\n",
+                         debugfs_mountpoint, debugfs_mountpoint);
+               break;
+       default:
+               scnprintf(buf, size, "%s", strerror_r(err, sbuf, sizeof(sbuf)));
+               break;
+       }
+
+       return 0;
+}
+
+int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused,
+                              int err, char *buf, size_t size)
+{
+       int printed, value;
+       char sbuf[128], *emsg = strerror_r(err, sbuf, sizeof(sbuf));
+
+       switch (err) {
+       case EACCES:
+       case EPERM:
+               printed = scnprintf(buf, size,
+                                   "Error:\t%s.\n"
+                                   "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.", emsg);
+
+               if (filename__read_int("/proc/sys/kernel/perf_event_paranoid", &value))
+                       break;
+
+               printed += scnprintf(buf + printed, size - printed, "\nHint:\t");
+
+               if (value >= 2) {
+                       printed += scnprintf(buf + printed, size - printed,
+                                            "For your workloads it needs to be <= 1\nHint:\t");
+               }
+               printed += scnprintf(buf + printed, size - printed,
+                                    "For system wide tracing it needs to be set to -1");
+
+               printed += scnprintf(buf + printed, size - printed,
+                                   ".\nHint:\tThe current value is %d.", value);
+               break;
+       default:
+               scnprintf(buf, size, "%s", emsg);
+               break;
+       }
+
+       return 0;
+}
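A sketch of how a builtin can use these helpers to turn an open failure into an actionable message (wrapper name and buffer size are illustrative):

    #include <errno.h>
    #include <stdio.h>

    static int try_open(struct perf_evlist *evlist)
    {
            char errbuf[BUFSIZ];

            if (perf_evlist__open(evlist) < 0) {
                    perf_evlist__strerror_open(evlist, errno, errbuf,
                                               sizeof(errbuf));
                    ui__error("%s\n", errbuf);
                    return -1;
            }
            return 0;
    }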
index 880d7139d2fb3fce78d75b8e701251827045588e..7f8f1aeb9cfe66a03985c0977688be636990cf3b 100644 (file)
@@ -21,7 +21,7 @@ struct perf_mmap {
        void             *base;
        int              mask;
        unsigned int     prev;
-       union perf_event event_copy;
+       char             event_copy[PERF_SAMPLE_MAX_SIZE];
 };
 
 struct perf_evlist {
@@ -31,7 +31,7 @@ struct perf_evlist {
        int              nr_groups;
        int              nr_fds;
        int              nr_mmaps;
-       int              mmap_len;
+       size_t           mmap_len;
        int              id_pos;
        int              is_pos;
        u64              combined_sample_type;
@@ -53,6 +53,7 @@ struct perf_evsel_str_handler {
 };
 
 struct perf_evlist *perf_evlist__new(void);
+struct perf_evlist *perf_evlist__new_default(void);
 void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
                       struct thread_map *threads);
 void perf_evlist__exit(struct perf_evlist *evlist);
@@ -103,6 +104,10 @@ int perf_evlist__prepare_workload(struct perf_evlist *evlist,
                                  bool want_signal);
 int perf_evlist__start_workload(struct perf_evlist *evlist);
 
+int perf_evlist__parse_mmap_pages(const struct option *opt,
+                                 const char *str,
+                                 int unset);
+
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
                      bool overwrite);
 void perf_evlist__munmap(struct perf_evlist *evlist);
@@ -163,6 +168,9 @@ static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist)
 
 size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp);
 
+int perf_evlist__strerror_tp(struct perf_evlist *evlist, int err, char *buf, size_t size);
+int perf_evlist__strerror_open(struct perf_evlist *evlist, int err, char *buf, size_t size);
+
 static inline unsigned int perf_mmap__read_head(struct perf_mmap *mm)
 {
        struct perf_event_mmap_page *pc = mm->base;
index 0ce9febf1ba0c8c1a691c74a0c55fa1c4a8dfd14..3a334f0019979e88ed8fbaa152f2183470c973f5 100644 (file)
@@ -678,9 +678,11 @@ void perf_evsel__config(struct perf_evsel *evsel,
                attr->sample_type       |= PERF_SAMPLE_WEIGHT;
 
        attr->mmap  = track;
-       attr->mmap2 = track && !perf_missing_features.mmap2;
        attr->comm  = track;
 
+       if (opts->sample_transaction)
+               attr->sample_type       |= PERF_SAMPLE_TRANSACTION;
+
        /*
         * XXX see the function comment above
         *
@@ -983,6 +985,7 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp)
        ret += PRINT_ATTR2(exclude_host, exclude_guest);
        ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel,
                            "excl.callchain_user", exclude_callchain_user);
+       ret += PRINT_ATTR_U32(mmap2);
 
        ret += PRINT_ATTR_U32(wakeup_events);
        ret += PRINT_ATTR_U32(wakeup_watermark);
@@ -1214,6 +1217,7 @@ static int perf_evsel__parse_id_sample(const struct perf_evsel *evsel,
 
                sample->pid = u.val32[0];
                sample->tid = u.val32[1];
+               array--;
        }
 
        return 0;
@@ -1453,6 +1457,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                        array = (void *)array + sz;
                        OVERFLOW_CHECK_u64(array);
                        data->user_stack.size = *array++;
+                       if (WARN_ONCE(data->user_stack.size > sz,
+                                     "user stack dump failure\n"))
+                               return -EFAULT;
                }
        }
 
@@ -1470,6 +1477,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
                array++;
        }
 
+       data->transaction = 0;
+       if (type & PERF_SAMPLE_TRANSACTION) {
+               data->transaction = *array;
+               array++;
+       }
+
        return 0;
 }
 
index 4a7bdc713bab2fa4266069388ed79d530ba64b94..5aa68cddc7d9fff14bf6fdff9be38dcf2655a483 100644 (file)
@@ -197,6 +197,12 @@ static inline bool perf_evsel__match2(struct perf_evsel *e1,
               (e1->attr.config == e2->attr.config);
 }
 
+#define perf_evsel__cmp(a, b)                  \
+       ((a) &&                                 \
+        (b) &&                                 \
+        (a)->attr.type == (b)->attr.type &&    \
+        (a)->attr.config == (b)->attr.config)
+
 int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
                              int cpu, int thread, bool scale);
 
index 3ac38031d53400337815560562a52093aaeee9e2..36a885d2cd22d12ed004177f375bcaf4aac86a28 100755 (executable)
@@ -22,7 +22,7 @@ do
      }' "Documentation/perf-$cmd.txt"
 done
 
-echo "#ifdef LIBELF_SUPPORT"
+echo "#ifdef HAVE_LIBELF_SUPPORT"
 sed -n -e 's/^perf-\([^        ]*\)[   ].* full.*/\1/p' command-list.txt |
 sort |
 while read cmd
@@ -35,5 +35,5 @@ do
            p
      }' "Documentation/perf-$cmd.txt"
 done
-echo "#endif /* LIBELF_SUPPORT */"
+echo "#endif /* HAVE_LIBELF_SUPPORT */"
 echo "};"
index c3e5a3b817ab714497dc7f6f39520945dc886b1a..26d9520a0c1b6ebf550bfd001bcd278fa8736d9c 100644 (file)
@@ -22,6 +22,7 @@
 #include "vdso.h"
 #include "strbuf.h"
 #include "build-id.h"
+#include "data.h"
 
 static bool no_buildid_cache = false;
 
@@ -2189,7 +2190,7 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
 {
        struct header_print_data hd;
        struct perf_header *header = &session->header;
-       int fd = session->fd;
+       int fd = perf_data_file__fd(session->file);
        hd.fp = fp;
        hd.full = full;
 
@@ -2650,7 +2651,8 @@ static int perf_header__read_pipe(struct perf_session *session)
        struct perf_header *header = &session->header;
        struct perf_pipe_file_header f_header;
 
-       if (perf_file_header__read_pipe(&f_header, header, session->fd,
+       if (perf_file_header__read_pipe(&f_header, header,
+                                       perf_data_file__fd(session->file),
                                        session->repipe) < 0) {
                pr_debug("incompatible file format\n");
                return -EINVAL;
@@ -2751,18 +2753,19 @@ static int perf_evlist__prepare_tracepoint_events(struct perf_evlist *evlist,
 
 int perf_session__read_header(struct perf_session *session)
 {
+       struct perf_data_file *file = session->file;
        struct perf_header *header = &session->header;
        struct perf_file_header f_header;
        struct perf_file_attr   f_attr;
        u64                     f_id;
        int nr_attrs, nr_ids, i, j;
-       int fd = session->fd;
+       int fd = perf_data_file__fd(file);
 
        session->evlist = perf_evlist__new();
        if (session->evlist == NULL)
                return -ENOMEM;
 
-       if (session->fd_pipe)
+       if (perf_data_file__is_pipe(file))
                return perf_header__read_pipe(session);
 
        if (perf_file_header__read(&f_header, header, fd) < 0)
@@ -2777,7 +2780,7 @@ int perf_session__read_header(struct perf_session *session)
        if (f_header.data.size == 0) {
                pr_warning("WARNING: The %s file's data size field is 0 which is unexpected.\n"
                           "Was the 'perf record' command properly terminated?\n",
-                          session->filename);
+                          file->path);
        }
 
        nr_attrs = f_header.attrs.size / f_header.attr_size;
@@ -2990,18 +2993,19 @@ int perf_event__process_tracing_data(struct perf_tool *tool __maybe_unused,
                                     struct perf_session *session)
 {
        ssize_t size_read, padding, size = event->tracing_data.size;
-       off_t offset = lseek(session->fd, 0, SEEK_CUR);
+       int fd = perf_data_file__fd(session->file);
+       off_t offset = lseek(fd, 0, SEEK_CUR);
        char buf[BUFSIZ];
 
        /* setup for reading amidst mmap */
-       lseek(session->fd, offset + sizeof(struct tracing_data_event),
+       lseek(fd, offset + sizeof(struct tracing_data_event),
              SEEK_SET);
 
-       size_read = trace_report(session->fd, &session->pevent,
+       size_read = trace_report(fd, &session->pevent,
                                 session->repipe);
        padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read;
 
-       if (readn(session->fd, buf, padding) < 0) {
+       if (readn(fd, buf, padding) < 0) {
                pr_err("%s: reading input file", __func__);
                return -1;
        }
index 9ff6cf3e9a99f69596b37372f60203c11bec88a3..cca03831f41abcc88557572220fa0d2e9b6a14c8 100644 (file)
@@ -160,6 +160,10 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
        hists__new_col_len(hists, HISTC_MEM_LVL, 21 + 3);
        hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12);
        hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12);
+
+       if (h->transaction)
+               hists__new_col_len(hists, HISTC_TRANSACTION,
+                                  hist_entry__transaction_len());
 }
 
 void hists__output_recalc_col_len(struct hists *hists, int max_rows)
@@ -346,7 +350,7 @@ static struct hist_entry *add_hist_entry(struct hists *hists,
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct hist_entry *he;
-       int cmp;
+       int64_t cmp;
 
        p = &hists->entries_in->rb_node;
 
@@ -466,7 +470,7 @@ struct hist_entry *__hists__add_branch_entry(struct hists *self,
 struct hist_entry *__hists__add_entry(struct hists *self,
                                      struct addr_location *al,
                                      struct symbol *sym_parent, u64 period,
-                                     u64 weight)
+                                     u64 weight, u64 transaction)
 {
        struct hist_entry entry = {
                .thread = al->thread,
@@ -487,6 +491,7 @@ struct hist_entry *__hists__add_entry(struct hists *self,
                .hists  = self,
                .branch_info = NULL,
                .mem_info = NULL,
+               .transaction = transaction,
        };
 
        return add_hist_entry(self, &entry, al, period, weight);
@@ -530,6 +535,7 @@ void hist_entry__free(struct hist_entry *he)
 {
        free(he->branch_info);
        free(he->mem_info);
+       free_srcline(he->srcline);
        free(he);
 }
 
@@ -884,7 +890,7 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct hist_entry *he;
-       int cmp;
+       int64_t cmp;
 
        if (sort__need_collapse)
                root = &hists->entries_collapsed;
index 1329b6b6ffe61b0f5fb97d8582a837f554b3c752..20b175808cd372c031f3e2cf2cc3ad0d49d8df70 100644 (file)
@@ -45,6 +45,8 @@ enum hist_column {
        HISTC_CPU,
        HISTC_SRCLINE,
        HISTC_MISPREDICT,
+       HISTC_IN_TX,
+       HISTC_ABORT,
        HISTC_SYMBOL_FROM,
        HISTC_SYMBOL_TO,
        HISTC_DSO_FROM,
@@ -57,6 +59,7 @@ enum hist_column {
        HISTC_MEM_TLB,
        HISTC_MEM_LVL,
        HISTC_MEM_SNOOP,
+       HISTC_TRANSACTION,
        HISTC_NR_COLS, /* Last entry */
 };
 
@@ -82,9 +85,10 @@ struct hists {
 struct hist_entry *__hists__add_entry(struct hists *self,
                                      struct addr_location *al,
                                      struct symbol *parent, u64 period,
-                                     u64 weight);
+                                     u64 weight, u64 transaction);
 int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right);
+int hist_entry__transaction_len(void);
 int hist_entry__sort_snprintf(struct hist_entry *self, char *bf, size_t size,
                              struct hists *hists);
 void hist_entry__free(struct hist_entry *);
@@ -183,7 +187,7 @@ struct hist_browser_timer {
        int refresh;
 };
 
-#ifdef SLANG_SUPPORT
+#ifdef HAVE_SLANG_SUPPORT
 #include "../ui/keysyms.h"
 int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
                             struct hist_browser_timer *hbt);
@@ -224,20 +228,5 @@ static inline int script_browse(const char *script_opt __maybe_unused)
 #define K_SWITCH_INPUT_DATA -3000
 #endif
 
-#ifdef GTK2_SUPPORT
-int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, const char *help,
-                                 struct hist_browser_timer *hbt __maybe_unused,
-                                 float min_pcnt);
-#else
-static inline
-int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist __maybe_unused,
-                                 const char *help __maybe_unused,
-                                 struct hist_browser_timer *hbt __maybe_unused,
-                                 float min_pcnt __maybe_unused)
-{
-       return 0;
-}
-#endif
-
 unsigned int hists__sort_list_width(struct hists *self);
 #endif /* __PERF_HIST_H */
index cf6727e99c440657b3cd6f29ac05c4f6f38bd89b..8f149655f497302f39b5352cf099fbd71d4d02d7 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _PERF_DWARF_REGS_H_
 #define _PERF_DWARF_REGS_H_
 
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
 const char *get_arch_regstr(unsigned int n);
 #endif
 
index 96b919dae11c27bbbae7e14f1c6eb7cb0cd91a89..b003ad7200b230702b133071ab3d402395ab57e6 100644 (file)
@@ -2,20 +2,29 @@
 #define _PERF_LINUX_COMPILER_H_
 
 #ifndef __always_inline
-#define __always_inline        inline
+# define __always_inline       inline __attribute__((always_inline))
 #endif
+
 #define __user
+
 #ifndef __attribute_const__
-#define __attribute_const__
+# define __attribute_const__
 #endif
 
 #ifndef __maybe_unused
-#define __maybe_unused         __attribute__((unused))
+# define __maybe_unused                __attribute__((unused))
+#endif
+
+#ifndef __packed
+# define __packed              __attribute__((__packed__))
 #endif
-#define __packed       __attribute__((__packed__))
 
 #ifndef __force
-#define __force
+# define __force
+#endif
+
+#ifndef __weak
+# define __weak                        __attribute__((weak))
 #endif
 
 #endif
index 11a8d86f7fea3bf4ec5fb8cb05f27dc4c491d948..89715b64a31578e2f26fc98a4e9c778bb72710d5 100644 (file)
@@ -20,6 +20,7 @@ static struct rb_node *intlist__node_new(struct rblist *rblist __maybe_unused,
 
        if (node != NULL) {
                node->i = i;
+               node->priv = NULL;
                rc = &node->rb_node;
        }
 
@@ -57,22 +58,36 @@ void intlist__remove(struct intlist *ilist, struct int_node *node)
        rblist__remove_node(&ilist->rblist, &node->rb_node);
 }
 
-struct int_node *intlist__find(struct intlist *ilist, int i)
+static struct int_node *__intlist__findnew(struct intlist *ilist,
+                                          int i, bool create)
 {
-       struct int_node *node;
+       struct int_node *node = NULL;
        struct rb_node *rb_node;
 
        if (ilist == NULL)
                return NULL;
 
-       node = NULL;
-       rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+       if (create)
+               rb_node = rblist__findnew(&ilist->rblist, (void *)((long)i));
+       else
+               rb_node = rblist__find(&ilist->rblist, (void *)((long)i));
+
        if (rb_node)
                node = container_of(rb_node, struct int_node, rb_node);
 
        return node;
 }
 
+struct int_node *intlist__find(struct intlist *ilist, int i)
+{
+       return __intlist__findnew(ilist, i, false);
+}
+
+struct int_node *intlist__findnew(struct intlist *ilist, int i)
+{
+       return __intlist__findnew(ilist, i, true);
+}
+
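intlist__findnew() brings the usual findnew idiom to intlist: a single tree walk either returns the existing node or inserts a fresh one, which the caller can then decorate through the new ->priv field. A hedged usage sketch (the payload struct is made up for illustration):

    struct hit_count { int hits; };

    static void count_hit(struct intlist *ilist, int id)
    {
            struct int_node *node = intlist__findnew(ilist, id);

            if (node == NULL)
                    return;
            if (node->priv == NULL)
                    node->priv = zalloc(sizeof(struct hit_count));
            if (node->priv)
                    ((struct hit_count *)node->priv)->hits++;
    }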
 static int intlist__parse_list(struct intlist *ilist, const char *s)
 {
        char *sep;
index 62351dad848f76c478cced833c315cc71ff1ed63..aa6877d36858288c9027d83ab2390b82a2370fc2 100644 (file)
@@ -9,6 +9,7 @@
 struct int_node {
        struct rb_node rb_node;
        int i;
+       void *priv;
 };
 
 struct intlist {
@@ -23,6 +24,7 @@ int intlist__add(struct intlist *ilist, int i);
 
 struct int_node *intlist__entry(const struct intlist *ilist, unsigned int idx);
 struct int_node *intlist__find(struct intlist *ilist, int i);
+struct int_node *intlist__findnew(struct intlist *ilist, int i);
 
 static inline bool intlist__has_entry(struct intlist *ilist, int i)
 {
index 6188d2876a7128aaa68e426c3334dfcae099dc41..ea93425cce95bbf782e59a661c2e09be78294ed0 100644 (file)
@@ -46,6 +46,23 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
        return 0;
 }
 
+struct machine *machine__new_host(void)
+{
+       struct machine *machine = malloc(sizeof(*machine));
+
+       if (machine != NULL) {
+               machine__init(machine, "", HOST_KERNEL_ID);
+
+               if (machine__create_kernel_maps(machine) < 0)
+                       goto out_delete;
+       }
+
+       return machine;
+out_delete:
+       free(machine);
+       return NULL;
+}
+
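machine__new_host() wraps the init-plus-create_kernel_maps dance that callers previously open-coded. A sketch of resolving a kernel address on the running host with it (assumes symbol__init() has already run; teardown elided):

    static const char *host_kernel_symbol(u64 addr)
    {
            struct machine *host = machine__new_host();
            struct symbol *sym;

            if (host == NULL)
                    return NULL;

            sym = machine__find_kernel_function(host, addr, NULL, NULL);
            return sym ? sym->name : NULL;
    }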
 static void dsos__delete(struct list_head *dsos)
 {
        struct dso *pos, *n;
@@ -776,75 +793,44 @@ static int machine__set_modules_path(struct machine *machine)
        return map_groups__set_modules_path_dir(&machine->kmaps, modules_path);
 }
 
-static int machine__create_modules(struct machine *machine)
+static int machine__create_module(void *arg, const char *name, u64 start)
 {
-       char *line = NULL;
-       size_t n;
-       FILE *file;
+       struct machine *machine = arg;
        struct map *map;
+
+       map = machine__new_module(machine, start, name);
+       if (map == NULL)
+               return -1;
+
+       dso__kernel_module_get_build_id(map->dso, machine->root_dir);
+
+       return 0;
+}
+
+static int machine__create_modules(struct machine *machine)
+{
        const char *modules;
        char path[PATH_MAX];
 
-       if (machine__is_default_guest(machine))
+       if (machine__is_default_guest(machine)) {
                modules = symbol_conf.default_guest_modules;
-       else {
-               sprintf(path, "%s/proc/modules", machine->root_dir);
+       } else {
+               snprintf(path, PATH_MAX, "%s/proc/modules", machine->root_dir);
                modules = path;
        }
 
        if (symbol__restricted_filename(modules, "/proc/modules"))
                return -1;
 
-       file = fopen(modules, "r");
-       if (file == NULL)
+       if (modules__parse(modules, machine, machine__create_module))
                return -1;
 
-       while (!feof(file)) {
-               char name[PATH_MAX];
-               u64 start;
-               char *sep;
-               int line_len;
-
-               line_len = getline(&line, &n, file);
-               if (line_len < 0)
-                       break;
-
-               if (!line)
-                       goto out_failure;
-
-               line[--line_len] = '\0'; /* \n */
-
-               sep = strrchr(line, 'x');
-               if (sep == NULL)
-                       continue;
-
-               hex2u64(sep + 1, &start);
-
-               sep = strchr(line, ' ');
-               if (sep == NULL)
-                       continue;
-
-               *sep = '\0';
-
-               snprintf(name, sizeof(name), "[%s]", line);
-               map = machine__new_module(machine, start, name);
-               if (map == NULL)
-                       goto out_delete_line;
-               dso__kernel_module_get_build_id(map->dso, machine->root_dir);
-       }
+       if (!machine__set_modules_path(machine))
+               return 0;
 
-       free(line);
-       fclose(file);
+       pr_debug("Problems setting modules path maps, continuing anyway...\n");
 
-       if (machine__set_modules_path(machine) < 0) {
-               pr_debug("Problems setting modules path maps, continuing anyway...\n");
-       }
        return 0;
-
-out_delete_line:
-       free(line);
-out_failure:
-       return -1;
 }
 
 int machine__create_kernel_maps(struct machine *machine)
@@ -1267,10 +1253,12 @@ static int machine__resolve_callchain_sample(struct machine *machine,
                                             struct thread *thread,
                                             struct ip_callchain *chain,
                                             struct symbol **parent,
-                                            struct addr_location *root_al)
+                                            struct addr_location *root_al,
+                                            int max_stack)
 {
        u8 cpumode = PERF_RECORD_MISC_USER;
-       unsigned int i;
+       int chain_nr = min(max_stack, (int)chain->nr);
+       int i;
        int err;
 
        callchain_cursor_reset(&callchain_cursor);
@@ -1280,7 +1268,7 @@ static int machine__resolve_callchain_sample(struct machine *machine,
                return 0;
        }
 
-       for (i = 0; i < chain->nr; i++) {
+       for (i = 0; i < chain_nr; i++) {
                u64 ip;
                struct addr_location al;
 
@@ -1352,12 +1340,14 @@ int machine__resolve_callchain(struct machine *machine,
                               struct thread *thread,
                               struct perf_sample *sample,
                               struct symbol **parent,
-                              struct addr_location *root_al)
+                              struct addr_location *root_al,
+                              int max_stack)
 {
        int ret;
 
        ret = machine__resolve_callchain_sample(machine, thread,
-                                               sample->callchain, parent, root_al);
+                                               sample->callchain, parent,
+                                               root_al, max_stack);
        if (ret)
                return ret;
 
@@ -1376,3 +1366,26 @@ int machine__resolve_callchain(struct machine *machine,
                                   sample);
 
 }
+
+int machine__for_each_thread(struct machine *machine,
+                            int (*fn)(struct thread *thread, void *p),
+                            void *priv)
+{
+       struct rb_node *nd;
+       struct thread *thread;
+       int rc = 0;
+
+       for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
+               thread = rb_entry(nd, struct thread, rb_node);
+               rc = fn(thread, priv);
+               if (rc != 0)
+                       return rc;
+       }
+
+       list_for_each_entry(thread, &machine->dead_threads, node) {
+               rc = fn(thread, priv);
+               if (rc != 0)
+                       return rc;
+       }
+       return rc;
+}
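machine__for_each_thread() visits the live thread rbtree and then the dead_threads list, stopping at the first callback that returns non-zero. A sketch of a counting callback:

    static int count_thread(struct thread *thread __maybe_unused, void *priv)
    {
            int *count = priv;

            (*count)++;
            return 0;       /* keep walking */
    }

    static int machine__thread_count(struct machine *machine)
    {
            int count = 0;

            machine__for_each_thread(machine, count_thread, &count);
            return count;
    }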
index 58a6be1fc739ba8e543ebd7d1bb6ea5185991703..4c1f5d567f542d7b65e22541a965c5e76d2b3ad4 100644 (file)
@@ -74,6 +74,7 @@ char *machine__mmap_name(struct machine *machine, char *bf, size_t size);
 void machines__set_symbol_filter(struct machines *machines,
                                 symbol_filter_t symbol_filter);
 
+struct machine *machine__new_host(void);
 int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *machine);
 void machine__delete_dead_threads(struct machine *machine);
@@ -91,7 +92,8 @@ int machine__resolve_callchain(struct machine *machine,
                               struct thread *thread,
                               struct perf_sample *sample,
                               struct symbol **parent,
-                              struct addr_location *root_al);
+                              struct addr_location *root_al,
+                              int max_stack);
 
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
@@ -165,4 +167,8 @@ void machines__destroy_kernel_maps(struct machines *machines);
 
 size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp);
 
+int machine__for_each_thread(struct machine *machine,
+                            int (*fn)(struct thread *thread, void *p),
+                            void *priv);
+
 #endif /* __PERF_MACHINE_H */
index 4f6680d2043b1e68ef7b9dfbc63a37326fcc6f22..ef5bc913ca7a9fdda7946401e55f9b0295dddfb6 100644 (file)
@@ -172,7 +172,7 @@ int map__load(struct map *map, symbol_filter_t filter)
                pr_warning(", continuing without symbols\n");
                return -1;
        } else if (nr == 0) {
-#ifdef LIBELF_SUPPORT
+#ifdef HAVE_LIBELF_SUPPORT
                const size_t len = strlen(name);
                const size_t real_len = len - sizeof(DSO__DELETED);
 
@@ -252,10 +252,16 @@ size_t map__fprintf_dsoname(struct map *map, FILE *fp)
        return fprintf(fp, "%s", dsoname);
 }
 
-/*
+/**
+ * map__rip_2objdump - convert symbol start address to objdump address.
+ * @map: memory map
+ * @rip: symbol start address
+ *
  * objdump wants/reports absolute IPs for ET_EXEC, and RIPs for ET_DYN.
  * map->dso->adjust_symbols==1 for ET_EXEC-like cases except ET_REL which is
  * relative to section start.
+ *
+ * Return: Address suitable for passing to "objdump --start-address="
  */
 u64 map__rip_2objdump(struct map *map, u64 rip)
 {
@@ -268,6 +274,29 @@ u64 map__rip_2objdump(struct map *map, u64 rip)
        return map->unmap_ip(map, rip);
 }
 
+/**
+ * map__objdump_2mem - convert objdump address to a memory address.
+ * @map: memory map
+ * @ip: objdump address
+ *
+ * Closely related to map__rip_2objdump(), this function takes an address from
+ * objdump and converts it to a memory address.  Note this assumes that @map
+ * contains the address.  To be sure the result is valid, check it forwards
+ * e.g. map__rip_2objdump(map->map_ip(map, map__objdump_2mem(map, ip))) == ip
+ *
+ * Return: Memory address.
+ */
+u64 map__objdump_2mem(struct map *map, u64 ip)
+{
+       if (!map->dso->adjust_symbols)
+               return map->unmap_ip(map, ip);
+
+       if (map->dso->rel)
+               return map->unmap_ip(map, ip + map->pgoff);
+
+       return ip;
+}
+
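The round-trip check suggested in the comment above, written out as a helper (the function name is hypothetical):

    static bool map__objdump_addr_ok(struct map *map, u64 objdump_addr)
    {
            u64 mem = map__objdump_2mem(map, objdump_addr);

            return map__rip_2objdump(map, map->map_ip(map, mem)) == objdump_addr;
    }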
 void map_groups__init(struct map_groups *mg)
 {
        int i;
@@ -371,6 +400,23 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
        return NULL;
 }
 
+int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
+{
+       if (ams->addr < ams->map->start || ams->addr > ams->map->end) {
+               if (ams->map->groups == NULL)
+                       return -1;
+               ams->map = map_groups__find(ams->map->groups, ams->map->type,
+                                           ams->addr);
+               if (ams->map == NULL)
+                       return -1;
+       }
+
+       ams->al_addr = ams->map->map_ip(ams->map, ams->addr);
+       ams->sym = map__find_symbol(ams->map, ams->al_addr, filter);
+
+       return ams->sym ? 0 : -1;
+}
+
 size_t __map_groups__fprintf_maps(struct map_groups *mg,
                                  enum map_type type, int verbose, FILE *fp)
 {
index 4886ca2805361df87a57c9f4406ed5c38fe418b5..e4e259c3ba167d77836e67d85c63e6d110f61550 100644 (file)
@@ -84,6 +84,9 @@ static inline u64 identity__map_ip(struct map *map __maybe_unused, u64 ip)
 /* rip/ip <-> addr suitable for passing to `objdump --start-address=` */
 u64 map__rip_2objdump(struct map *map, u64 rip);
 
+/* objdump address -> memory address */
+u64 map__objdump_2mem(struct map *map, u64 ip);
+
 struct symbol;
 
 typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
@@ -167,6 +170,10 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
                                               struct map **mapp,
                                               symbol_filter_t filter);
 
+struct addr_map_symbol;
+
+int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter);
+
 static inline
 struct symbol *map_groups__find_function_by_name(struct map_groups *mg,
                                                 const char *name, struct map **mapp,
index 98125319b158b7e6e588ba4adf861494b69e18d6..c90e55cf7e82ebc134fb827ca68f45385138f161 100644 (file)
@@ -998,8 +998,10 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (debugfs_valid_mountpoint(tracing_events_path))
+       if (debugfs_valid_mountpoint(tracing_events_path)) {
+               printf("  [ Tracepoints not available: %s ]\n", strerror(errno));
                return;
+       }
 
        sys_dir = opendir(tracing_events_path);
        if (!sys_dir)
index 91346b7539607fb9e3b6dcb8f944af36f9e88572..343299575b303fb5443b99288fd487aab26e3915 100644 (file)
@@ -126,6 +126,37 @@ modifier_bp        [rwx]{1,3}
 
 }
 
+<config>{
+config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
+config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
+config2                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
+name                   { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); }
+period                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); }
+branch_type            { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); }
+,                      { return ','; }
+"/"                    { BEGIN(INITIAL); return '/'; }
+{name_minus}           { return str(yyscanner, PE_NAME); }
+}
+
+<mem>{
+{modifier_bp}          { return str(yyscanner, PE_MODIFIER_BP); }
+:                      { return ':'; }
+{num_dec}              { return value(yyscanner, 10); }
+{num_hex}              { return value(yyscanner, 16); }
+       /*
+        * We need a separate 'mem:' scanner state in order to get the
+        * specific modifier bits parsed out. Otherwise we would need to
+        * handle PE_NAME and parse it manually. When escaping from the
+        * <mem> state we put the escaping char back, so we don't miss it.
+        */
+.                      { unput(*yytext); BEGIN(INITIAL); }
+       /*
+        * We destroy the scanner after reaching EOF, but just to be
+        * sure, drop back to the INITIAL state anyway.
+        */
+<<EOF>>                        { BEGIN(INITIAL); }
+}
+
 cpu-cycles|cycles                              { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); }
 stalled-cycles-frontend|idle-cycles-frontend   { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); }
 stalled-cycles-backend|idle-cycles-backend     { return sym(yyscanner, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); }
@@ -162,18 +193,6 @@ speculative-read|speculative-load  |
 refs|Reference|ops|access              |
 misses|miss                            { return str(yyscanner, PE_NAME_CACHE_OP_RESULT); }
 
-<config>{
-config                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
-config1                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
-config2                        { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
-name                   { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NAME); }
-period                 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD); }
-branch_type            { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE); }
-,                      { return ','; }
-"/"                    { BEGIN(INITIAL); return '/'; }
-{name_minus}           { return str(yyscanner, PE_NAME); }
-}
-
 mem:                   { BEGIN(mem); return PE_PREFIX_MEM; }
 r{num_raw_hex}         { return raw(yyscanner); }
 {num_dec}              { return value(yyscanner, 10); }
@@ -189,25 +208,7 @@ r{num_raw_hex}             { return raw(yyscanner); }
 "}"                    { return '}'; }
 =                      { return '='; }
 \n                     { }
-
-<mem>{
-{modifier_bp}          { return str(yyscanner, PE_MODIFIER_BP); }
-:                      { return ':'; }
-{num_dec}              { return value(yyscanner, 10); }
-{num_hex}              { return value(yyscanner, 16); }
-       /*
-        * We need to separate 'mem:' scanner part, in order to get specific
-        * modifier bits parsed out. Otherwise we would need to handle PE_NAME
-        * and we'd need to parse it manually. During the escape from <mem>
-        * state we need to put the escaping char back, so we dont miss it.
-        */
-.                      { unput(*yytext); BEGIN(INITIAL); }
-       /*
-        * We destroy the scanner after reaching EOF,
-        * but anyway just to be sure get back to INIT state.
-        */
-<<EOF>>                        { BEGIN(INITIAL); }
-}
+.                      { }
 
 %%
 
index a8c49548ca48f5bc4f7ae14134ee57d7779b46e8..5d13cb45b3171a98a171b0f7228d1fff470bf8c0 100644 (file)
@@ -22,19 +22,23 @@ static const char *get_perf_dir(void)
        return ".";
 }
 
-#ifndef HAVE_STRLCPY
-size_t strlcpy(char *dest, const char *src, size_t size)
+/*
+ * If libc has strlcpy() then that version will override this
+ * implementation:
+ */
+size_t __weak strlcpy(char *dest, const char *src, size_t size)
 {
        size_t ret = strlen(src);
 
        if (size) {
                size_t len = (ret >= size) ? size - 1 : ret;
+
                memcpy(dest, src, len);
                dest[len] = '\0';
        }
+
        return ret;
 }
-#endif
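The HAVE_STRLCPY ifdef is replaced by the __weak attribute added to compiler.h above: if the C library ships a strong strlcpy(), the linker prefers it and this fallback is discarded; otherwise the weak definition is used. A standalone sketch of the mechanism with a made-up symbol:

    /* weak.c: fallback, used only when no strong definition is linked in */
    int __attribute__((weak)) answer(void) { return 0; }

    /* strong.c: when present, this definition silently wins at link time */
    int answer(void) { return 42; }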
 
 static char *get_pathname(void)
 {
index 5a4f2b6f3738c1966b180484dcfb71f70d0e0afd..a3d42cd749196b83c4c8226bfd069a68cd3a6917 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef __PERF_REGS_H
 #define __PERF_REGS_H
 
-#ifdef HAVE_PERF_REGS
+#ifdef HAVE_PERF_REGS_SUPPORT
 #include <perf_regs.h>
 #else
 #define PERF_REGS_MASK 0
@@ -10,5 +10,5 @@ static inline const char *perf_reg_name(int id __maybe_unused)
 {
        return NULL;
 }
-#endif /* HAVE_PERF_REGS */
+#endif /* HAVE_PERF_REGS_SUPPORT */
 #endif /* __PERF_REGS_H */
index bc9d8069d37637835758d02e13ec3bc751d0f5e3..64362fe45b71f562d73c40594bc050c90ec01c3d 100644 (file)
@@ -637,3 +637,19 @@ void print_pmu_events(const char *event_glob, bool name_only)
                printf("\n");
        free(aliases);
 }
+
+bool pmu_have_event(const char *pname, const char *name)
+{
+       struct perf_pmu *pmu;
+       struct perf_pmu_alias *alias;
+
+       pmu = NULL;
+       while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+               if (strcmp(pname, pmu->name))
+                       continue;
+               list_for_each_entry(alias, &pmu->aliases, list)
+                       if (!strcmp(alias->name, name))
+                               return true;
+       }
+       return false;
+}
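pmu_have_event() lets callers probe for a sysfs alias before relying on it; for example, a tool could gate transactional-memory support on the relevant 'cpu' PMU aliases (the alias names here are illustrative):

    static bool have_tx_events(void)
    {
            return pmu_have_event("cpu", "cycles-ct") &&
                   pmu_have_event("cpu", "el-start");
    }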
index 6b2cbe2d4cc3a6d59a3d692770793c19ea984930..1179b26f244a31280e5454787a51a8f01ff9ac2e 100644 (file)
@@ -42,6 +42,7 @@ int perf_pmu__format_parse(char *dir, struct list_head *head);
 struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
 
 void print_pmu_events(const char *event_glob, bool name_only);
+bool pmu_have_event(const char *pname, const char *name);
 
 int perf_pmu__test(void);
 #endif /* __PMU_H */
index aa04bf9c9ad791a00faf7546cf75163a0dabd60a..779b2dacd43fbe51854cb2e36e09621a82875df9 100644 (file)
@@ -201,7 +201,7 @@ static int convert_to_perf_probe_point(struct probe_trace_point *tp,
        return 0;
 }
 
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
 /* Open new debuginfo of given module */
 static struct debuginfo *open_debuginfo(const char *module)
 {
@@ -630,7 +630,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
        return ret;
 }
 
-#else  /* !DWARF_SUPPORT */
+#else  /* !HAVE_DWARF_SUPPORT */
 
 static int kprobe_convert_to_perf_probe(struct probe_trace_point *tp,
                                        struct perf_probe_point *pp)
index c09e0a9fdf4cda118be077fbee1fb382dcc3d5ee..f0692737ebf1237373a140c0ec76a7a4a7a96f9d 100644 (file)
@@ -1357,10 +1357,10 @@ int debuginfo__find_probe_point(struct debuginfo *self, unsigned long addr,
                        goto post;
                }
 
+               fname = dwarf_decl_file(&spdie);
                if (addr == (unsigned long)baseaddr) {
                        /* Function entry - Relative line number is 0 */
                        lineno = baseline;
-                       fname = dwarf_decl_file(&spdie);
                        goto post;
                }
 
index 3b7d63018960d7ff6a6808ce81eff2d34b40bd09..3f0c29dd6ac518507348a10f02b003791aa27fab 100644 (file)
@@ -14,7 +14,7 @@ static inline int is_c_varname(const char *name)
        return isalpha(name[0]) || name[0] == '_';
 }
 
-#ifdef DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
 
 #include "dwarf-aux.h"
 
@@ -105,6 +105,6 @@ struct line_finder {
        int                     found;
 };
 
-#endif /* DWARF_SUPPORT */
+#endif /* HAVE_DWARF_SUPPORT */
 
 #endif /*_PROBE_FINDER_H */
index 71b5412bbbb9d1197dc5ae999c94704211108238..06efd027b1db8649b997393bb0105d6dfddb2f4b 100644 (file)
@@ -33,13 +33,6 @@ int eprintf(int level, const char *fmt, ...)
 # define PyVarObject_HEAD_INIT(type, size) PyObject_HEAD_INIT(type) size,
 #endif
 
-struct throttle_event {
-       struct perf_event_header header;
-       u64                      time;
-       u64                      id;
-       u64                      stream_id;
-};
-
 PyMODINIT_FUNC initperf(void);
 
 #define member_def(type, member, ptype, help) \
@@ -1036,6 +1029,7 @@ PyMODINIT_FUNC initperf(void)
            pyrf_cpu_map__setup_types() < 0)
                return;
 
+       /* The page_size global lives in the util object. */
        page_size = sysconf(_SC_PAGE_SIZE);
 
        Py_INCREF(&pyrf_evlist__type);
index a16cdd2625ad2b4a0ff8ff39d6b91b331f3a3a1e..0dfe27d99458c845991e490283c067d0d0706221 100644 (file)
@@ -48,10 +48,12 @@ void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node)
        rblist->node_delete(rblist, rb_node);
 }
 
-struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+static struct rb_node *__rblist__findnew(struct rblist *rblist,
+                                        const void *entry,
+                                        bool create)
 {
        struct rb_node **p = &rblist->entries.rb_node;
-       struct rb_node *parent = NULL;
+       struct rb_node *parent = NULL, *new_node = NULL;
 
        while (*p != NULL) {
                int rc;
@@ -67,7 +69,26 @@ struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
                        return parent;
        }
 
-       return NULL;
+       if (create) {
+               new_node = rblist->node_new(rblist, entry);
+               if (new_node) {
+                       rb_link_node(new_node, parent, p);
+                       rb_insert_color(new_node, &rblist->entries);
+                       ++rblist->nr_entries;
+               }
+       }
+
+       return new_node;
+}
+
+struct rb_node *rblist__find(struct rblist *rblist, const void *entry)
+{
+       return __rblist__findnew(rblist, entry, false);
+}
+
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry)
+{
+       return __rblist__findnew(rblist, entry, true);
 }
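__rblist__findnew() is the generic engine behind intlist__findnew(): a single descent finds either the node or its insertion point, and on a miss the node_new() callback plus rb_link_node()/rb_insert_color() splice the new node in. A minimal client keyed by int, showing the callbacks the findnew path relies on (illustrative code):

    struct int_entry {
            struct rb_node rb_node;
            int val;
    };

    static int int_entry__cmp(struct rb_node *rb_node, const void *entry)
    {
            struct int_entry *ie = container_of(rb_node, struct int_entry,
                                                rb_node);

            return ie->val - (int)(long)entry;
    }

    static struct rb_node *int_entry__new(struct rblist *rblist __maybe_unused,
                                          const void *entry)
    {
            struct int_entry *ie = zalloc(sizeof(*ie));

            if (ie == NULL)
                    return NULL;
            ie->val = (int)(long)entry;
            return &ie->rb_node;
    }

    /* setup: rblist__init(&rl); rl.node_cmp = int_entry__cmp;
     * rl.node_new = int_entry__new; then a single call
     * rblist__findnew(&rl, (void *)(long)42) finds or inserts 42. */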
 
 void rblist__init(struct rblist *rblist)
index 6d0cae5ae83d5e848b56a9518575ad3c7a114444..ff9913b994c26126203e303f694021f9d24e739a 100644 (file)
@@ -32,6 +32,7 @@ void rblist__delete(struct rblist *rblist);
 int rblist__add_node(struct rblist *rblist, const void *new_entry);
 void rblist__remove_node(struct rblist *rblist, struct rb_node *rb_node);
 struct rb_node *rblist__find(struct rblist *rblist, const void *entry);
+struct rb_node *rblist__findnew(struct rblist *rblist, const void *entry);
 struct rb_node *rblist__entry(const struct rblist *rblist, unsigned int idx);
 
 static inline bool rblist__empty(const struct rblist *rblist)
index a85e4ae5f3ac582381740f8b29460e6f83ae6bd6..c0c9795c4f0235b51848e01db33a7950069b182a 100644 (file)
@@ -282,7 +282,7 @@ static void perl_process_tracepoint(union perf_event *perf_event __maybe_unused,
 
        event = find_cache_event(evsel);
        if (!event)
-               die("ug! no event found for type %" PRIu64, evsel->attr.config);
+               die("ug! no event found for type %" PRIu64, (u64)evsel->attr.config);
 
        pid = raw_field_value(event, "common_pid", data);
 
index 568b750c01f60b3d81cda29966ed40534a7bc8e1..854c5aa4db0d2277386756c42306e3d7b0c80cfe 100644 (file)
 #include "perf_regs.h"
 #include "vdso.h"
 
-static int perf_session__open(struct perf_session *self, bool force)
+static int perf_session__open(struct perf_session *self)
 {
-       struct stat input_stat;
-
-       if (!strcmp(self->filename, "-")) {
-               self->fd_pipe = true;
-               self->fd = STDIN_FILENO;
-
-               if (perf_session__read_header(self) < 0)
-                       pr_err("incompatible file format (rerun with -v to learn more)");
-
-               return 0;
-       }
-
-       self->fd = open(self->filename, O_RDONLY);
-       if (self->fd < 0) {
-               int err = errno;
-
-               pr_err("failed to open %s: %s", self->filename, strerror(err));
-               if (err == ENOENT && !strcmp(self->filename, "perf.data"))
-                       pr_err("  (try 'perf record' first)");
-               pr_err("\n");
-               return -errno;
-       }
-
-       if (fstat(self->fd, &input_stat) < 0)
-               goto out_close;
-
-       if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
-               pr_err("file %s not owned by current user or root\n",
-                      self->filename);
-               goto out_close;
-       }
-
-       if (!input_stat.st_size) {
-               pr_info("zero-sized file (%s), nothing to do!\n",
-                       self->filename);
-               goto out_close;
-       }
+       struct perf_data_file *file = self->file;
 
        if (perf_session__read_header(self) < 0) {
                pr_err("incompatible file format (rerun with -v to learn more)");
-               goto out_close;
+               return -1;
        }
 
+       if (perf_data_file__is_pipe(file))
+               return 0;
+
        if (!perf_evlist__valid_sample_type(self->evlist)) {
                pr_err("non matching sample_type");
-               goto out_close;
+               return -1;
        }
 
        if (!perf_evlist__valid_sample_id_all(self->evlist)) {
                pr_err("non matching sample_id_all");
-               goto out_close;
+               return -1;
        }
 
        if (!perf_evlist__valid_read_format(self->evlist)) {
                pr_err("non matching read_format");
-               goto out_close;
+               return -1;
        }
 
-       self->size = input_stat.st_size;
        return 0;
-
-out_close:
-       close(self->fd);
-       self->fd = -1;
-       return -1;
 }
 
 void perf_session__set_id_hdr_size(struct perf_session *session)
@@ -106,39 +67,36 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self)
        machines__destroy_kernel_maps(&self->machines);
 }
 
-struct perf_session *perf_session__new(const char *filename, int mode,
-                                      bool force, bool repipe,
-                                      struct perf_tool *tool)
+struct perf_session *perf_session__new(struct perf_data_file *file,
+                                      bool repipe, struct perf_tool *tool)
 {
        struct perf_session *self;
-       struct stat st;
-       size_t len;
-
-       if (!filename || !strlen(filename)) {
-               if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
-                       filename = "-";
-               else
-                       filename = "perf.data";
-       }
-
-       len = strlen(filename);
-       self = zalloc(sizeof(*self) + len);
 
-       if (self == NULL)
+       self = zalloc(sizeof(*self));
+       if (!self)
                goto out;
 
-       memcpy(self->filename, filename, len);
        self->repipe = repipe;
        INIT_LIST_HEAD(&self->ordered_samples.samples);
        INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
        INIT_LIST_HEAD(&self->ordered_samples.to_free);
        machines__init(&self->machines);
 
-       if (mode == O_RDONLY) {
-               if (perf_session__open(self, force) < 0)
+       if (file) {
+               if (perf_data_file__open(file))
                        goto out_delete;
-               perf_session__set_id_hdr_size(self);
-       } else if (mode == O_WRONLY) {
+
+               self->file = file;
+
+               if (perf_data_file__is_read(file)) {
+                       if (perf_session__open(self) < 0)
+                               goto out_close;
+
+                       perf_session__set_id_hdr_size(self);
+               }
+       }
+
+       if (!file || perf_data_file__is_write(file)) {
                /*
                 * In O_RDONLY mode this will be performed when reading the
                 * kernel MMAP event, in perf_event__process_mmap().
@@ -153,10 +111,13 @@ struct perf_session *perf_session__new(const char *filename, int mode,
                tool->ordered_samples = false;
        }
 
-out:
        return self;
-out_delete:
+
+ out_close:
+       perf_data_file__close(file);
+ out_delete:
        perf_session__delete(self);
+ out:
        return NULL;
 }
 
@@ -193,7 +154,8 @@ void perf_session__delete(struct perf_session *self)
        perf_session__delete_threads(self);
        perf_session_env__delete(&self->header.env);
        machines__exit(&self->machines);
-       close(self->fd);
+       if (self->file)
+               perf_data_file__close(self->file);
        free(self);
        vdso__exit();
 }
@@ -397,6 +359,17 @@ static void perf_event__read_swap(union perf_event *event, bool sample_id_all)
                swap_sample_id_all(event, &event->read + 1);
 }
 
+static void perf_event__throttle_swap(union perf_event *event,
+                                     bool sample_id_all)
+{
+       event->throttle.time      = bswap_64(event->throttle.time);
+       event->throttle.id        = bswap_64(event->throttle.id);
+       event->throttle.stream_id = bswap_64(event->throttle.stream_id);
+
+       if (sample_id_all)
+               swap_sample_id_all(event, &event->throttle + 1);
+}
+
 static u8 revbyte(u8 b)
 {
        int rev = (b >> 4) | ((b & 0xf) << 4);
@@ -442,6 +415,9 @@ void perf_event__attr_swap(struct perf_event_attr *attr)
        attr->bp_type           = bswap_32(attr->bp_type);
        attr->bp_addr           = bswap_64(attr->bp_addr);
        attr->bp_len            = bswap_64(attr->bp_len);
+       attr->branch_sample_type = bswap_64(attr->branch_sample_type);
+       attr->sample_regs_user   = bswap_64(attr->sample_regs_user);
+       attr->sample_stack_user  = bswap_32(attr->sample_stack_user);
 
        swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64));
 }
@@ -482,6 +458,8 @@ static perf_event__swap_op perf_event__swap_ops[] = {
        [PERF_RECORD_EXIT]                = perf_event__task_swap,
        [PERF_RECORD_LOST]                = perf_event__all64_swap,
        [PERF_RECORD_READ]                = perf_event__read_swap,
+       [PERF_RECORD_THROTTLE]            = perf_event__throttle_swap,
+       [PERF_RECORD_UNTHROTTLE]          = perf_event__throttle_swap,
        [PERF_RECORD_SAMPLE]              = perf_event__all64_swap,
        [PERF_RECORD_HEADER_ATTR]         = perf_event__hdr_attr_swap,
        [PERF_RECORD_HEADER_EVENT_TYPE]   = perf_event__event_type_swap,
@@ -860,6 +838,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
 
+       if (sample_type & PERF_SAMPLE_TRANSACTION)
+               printf("... transaction: %" PRIx64 "\n", sample->transaction);
+
        if (sample_type & PERF_SAMPLE_READ)
                sample_read__printf(sample, evsel->attr.read_format);
 }
@@ -1031,6 +1012,7 @@ static int perf_session_deliver_event(struct perf_session *session,
 static int perf_session__process_user_event(struct perf_session *session, union perf_event *event,
                                            struct perf_tool *tool, u64 file_offset)
 {
+       int fd = perf_data_file__fd(session->file);
        int err;
 
        dump_event(session, event, file_offset, NULL);
@@ -1044,7 +1026,7 @@ static int perf_session__process_user_event(struct perf_session *session, union
                return err;
        case PERF_RECORD_HEADER_TRACING_DATA:
                /* setup for reading amidst mmap */
-               lseek(session->fd, file_offset, SEEK_SET);
+               lseek(fd, file_offset, SEEK_SET);
                return tool->tracing_data(tool, event, session);
        case PERF_RECORD_HEADER_BUILD_ID:
                return tool->build_id(tool, event, session);
@@ -1170,6 +1152,7 @@ volatile int session_done;
 static int __perf_session__process_pipe_events(struct perf_session *self,
                                               struct perf_tool *tool)
 {
+       int fd = perf_data_file__fd(self->file);
        union perf_event *event;
        uint32_t size, cur_size = 0;
        void *buf = NULL;
@@ -1188,7 +1171,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self,
                return -errno;
 more:
        event = buf;
-       err = readn(self->fd, event, sizeof(struct perf_event_header));
+       err = readn(fd, event, sizeof(struct perf_event_header));
        if (err <= 0) {
                if (err == 0)
                        goto done;
@@ -1220,7 +1203,7 @@ more:
        p += sizeof(struct perf_event_header);
 
        if (size - sizeof(struct perf_event_header)) {
-               err = readn(self->fd, p, size - sizeof(struct perf_event_header));
+               err = readn(fd, p, size - sizeof(struct perf_event_header));
                if (err <= 0) {
                        if (err == 0) {
                                pr_err("unexpected end of event stream\n");
@@ -1247,7 +1230,9 @@ more:
        if (!session_done())
                goto more;
 done:
-       err = 0;
+       /* do the final flush for ordered samples */
+       self->ordered_samples.next_flush = ULLONG_MAX;
+       err = flush_sample_queue(self, tool);
 out_err:
        free(buf);
        perf_session__warn_about_errors(self, tool);
@@ -1299,6 +1284,7 @@ int __perf_session__process_events(struct perf_session *session,
                                   u64 data_offset, u64 data_size,
                                   u64 file_size, struct perf_tool *tool)
 {
+       int fd = perf_data_file__fd(session->file);
        u64 head, page_offset, file_offset, file_pos, progress_next;
        int err, mmap_prot, mmap_flags, map_idx = 0;
        size_t  mmap_size;
@@ -1331,7 +1317,7 @@ int __perf_session__process_events(struct perf_session *session,
                mmap_flags = MAP_PRIVATE;
        }
 remap:
-       buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd,
+       buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, fd,
                   file_offset);
        if (buf == MAP_FAILED) {
                pr_err("failed to mmap file\n");
@@ -1376,13 +1362,13 @@ more:
                                    "Processing events...");
        }
 
-       err = 0;
        if (session_done())
-               goto out_err;
+               goto out;
 
        if (file_pos < file_size)
                goto more;
 
+out:
        /* do the final flush for ordered samples */
        session->ordered_samples.next_flush = ULLONG_MAX;
        err = flush_sample_queue(session, tool);
@@ -1396,16 +1382,17 @@ out_err:
 int perf_session__process_events(struct perf_session *self,
                                 struct perf_tool *tool)
 {
+       u64 size = perf_data_file__size(self->file);
        int err;
 
        if (perf_session__register_idle_thread(self) == NULL)
                return -ENOMEM;
 
-       if (!self->fd_pipe)
+       if (!perf_data_file__is_pipe(self->file))
                err = __perf_session__process_events(self,
                                                     self->header.data_offset,
                                                     self->header.data_size,
-                                                    self->size, tool);
+                                                    size, tool);
        else
                err = __perf_session__process_pipe_events(self, tool);
 
@@ -1525,7 +1512,8 @@ void perf_evsel__print_ip(struct perf_evsel *evsel, union perf_event *event,
        if (symbol_conf.use_callchain && sample->callchain) {
 
                if (machine__resolve_callchain(machine, evsel, al.thread,
-                                              sample, NULL, NULL) != 0) {
+                                              sample, NULL, NULL,
+                                              PERF_MAX_STACK_DEPTH) != 0) {
                        if (verbose)
                                error("Failed to resolve callchain. Skipping\n");
                        return;
@@ -1629,13 +1617,14 @@ int perf_session__cpu_bitmap(struct perf_session *session,
 void perf_session__fprintf_info(struct perf_session *session, FILE *fp,
                                bool full)
 {
+       int fd = perf_data_file__fd(session->file);
        struct stat st;
        int ret;
 
        if (session == NULL || fp == NULL)
                return;
 
-       ret = fstat(session->fd, &st);
+       ret = fstat(fd, &st);
        if (ret == -1)
                return;
 
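The session.c hunks above all make the same substitution: direct accesses to
session->fd, session->fd_pipe and session->size are routed through the new
struct perf_data_file attached to the session. A minimal sketch of the
accessors this implies follows; the authoritative definitions live in
tools/perf/util/data.h, so the exact field names here are assumptions
inferred from the call sites.

    #include <stdbool.h>

    /* Sketch only: field names are assumptions inferred from the callers
     * in this diff; see tools/perf/util/data.h for the real definitions. */
    struct perf_data_file {
            const char      *path;
            int              fd;
            bool             is_pipe;
            unsigned long    size;
    };

    static inline int perf_data_file__fd(struct perf_data_file *file)
    {
            return file->fd;
    }

    static inline bool perf_data_file__is_pipe(struct perf_data_file *file)
    {
            return file->is_pipe;
    }

    static inline unsigned long perf_data_file__size(struct perf_data_file *file)
    {
            return file->size;
    }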
index 04bf7373a7e5fb04222b1a9cdb5c295045c0626f..27c74d38b868052345609f0427d3a74cb5d9051a 100644 (file)
@@ -7,6 +7,7 @@
 #include "machine.h"
 #include "symbol.h"
 #include "thread.h"
+#include "data.h"
 #include <linux/rbtree.h>
 #include <linux/perf_event.h>
 
@@ -29,16 +30,13 @@ struct ordered_samples {
 
 struct perf_session {
        struct perf_header      header;
-       unsigned long           size;
        struct machines         machines;
        struct perf_evlist      *evlist;
        struct pevent           *pevent;
        struct events_stats     stats;
-       int                     fd;
-       bool                    fd_pipe;
        bool                    repipe;
        struct ordered_samples  ordered_samples;
-       char                    filename[1];
+       struct perf_data_file   *file;
 };
 
 #define PRINT_IP_OPT_IP                (1<<0)
@@ -49,9 +47,8 @@ struct perf_session {
 
 struct perf_tool;
 
-struct perf_session *perf_session__new(const char *filename, int mode,
-                                      bool force, bool repipe,
-                                      struct perf_tool *tool);
+struct perf_session *perf_session__new(struct perf_data_file *file,
+                                      bool repipe, struct perf_tool *tool);
 void perf_session__delete(struct perf_session *session);
 
 void perf_event_header__bswap(struct perf_event_header *self);
index 5f118a089519a46bb6b370536660476bc6880eda..1f9821db9e7719750d3d0d3304a42a262d3fda17 100644 (file)
@@ -182,9 +182,19 @@ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r)
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
+       int64_t ret;
+
        if (!left->ms.sym && !right->ms.sym)
                return right->level - left->level;
 
+       /*
+        * Comparing symbol addresses alone is not enough since they are
+        * relative addresses within a dso, so compare the dsos first.
+        */
+       ret = sort__dso_cmp(left, right);
+       if (ret != 0)
+               return ret;
+
        return _sort__sym_cmp(left->ms.sym, right->ms.sym);
 }
 
@@ -243,50 +253,32 @@ struct sort_entry sort_sym = {
 static int64_t
 sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-       return (int64_t)(right->ip - left->ip);
+       if (!left->srcline) {
+               if (!left->ms.map)
+                       left->srcline = SRCLINE_UNKNOWN;
+               else {
+                       struct map *map = left->ms.map;
+                       left->srcline = get_srcline(map->dso,
+                                           map__rip_2objdump(map, left->ip));
+               }
+       }
+       if (!right->srcline) {
+               if (!right->ms.map)
+                       right->srcline = SRCLINE_UNKNOWN;
+               else {
+                       struct map *map = right->ms.map;
+                       right->srcline = get_srcline(map->dso,
+                                           map__rip_2objdump(map, right->ip));
+               }
+       }
+       return strcmp(left->srcline, right->srcline);
 }
 
 static int hist_entry__srcline_snprintf(struct hist_entry *self, char *bf,
                                        size_t size,
                                        unsigned int width __maybe_unused)
 {
-       FILE *fp = NULL;
-       char cmd[PATH_MAX + 2], *path = self->srcline, *nl;
-       size_t line_len;
-
-       if (path != NULL)
-               goto out_path;
-
-       if (!self->ms.map)
-               goto out_ip;
-
-       if (!strncmp(self->ms.map->dso->long_name, "/tmp/perf-", 10))
-               goto out_ip;
-
-       snprintf(cmd, sizeof(cmd), "addr2line -e %s %016" PRIx64,
-                self->ms.map->dso->long_name, self->ip);
-       fp = popen(cmd, "r");
-       if (!fp)
-               goto out_ip;
-
-       if (getline(&path, &line_len, fp) < 0 || !line_len)
-               goto out_ip;
-       self->srcline = strdup(path);
-       if (self->srcline == NULL)
-               goto out_ip;
-
-       nl = strchr(self->srcline, '\n');
-       if (nl != NULL)
-               *nl = '\0';
-       path = self->srcline;
-out_path:
-       if (fp)
-               pclose(fp);
-       return repsep_snprintf(bf, size, "%s", path);
-out_ip:
-       if (fp)
-               pclose(fp);
-       return repsep_snprintf(bf, size, "%-#*llx", BITS_PER_LONG / 4, self->ip);
+       return repsep_snprintf(bf, size, "%s", self->srcline);
 }
 
 struct sort_entry sort_srcline = {
@@ -858,6 +850,127 @@ struct sort_entry sort_mem_snoop = {
        .se_width_idx   = HISTC_MEM_SNOOP,
 };
 
+static int64_t
+sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return left->branch_info->flags.abort !=
+               right->branch_info->flags.abort;
+}
+
+static int hist_entry__abort_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width)
+{
+       const char *out = ".";
+
+       if (self->branch_info->flags.abort)
+               out = "A";
+       return repsep_snprintf(bf, size, "%-*s", width, out);
+}
+
+struct sort_entry sort_abort = {
+       .se_header      = "Transaction abort",
+       .se_cmp         = sort__abort_cmp,
+       .se_snprintf    = hist_entry__abort_snprintf,
+       .se_width_idx   = HISTC_ABORT,
+};
+
+static int64_t
+sort__in_tx_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return left->branch_info->flags.in_tx !=
+               right->branch_info->flags.in_tx;
+}
+
+static int hist_entry__in_tx_snprintf(struct hist_entry *self, char *bf,
+                                   size_t size, unsigned int width)
+{
+       const char *out = ".";
+
+       if (self->branch_info->flags.in_tx)
+               out = "T";
+
+       return repsep_snprintf(bf, size, "%-*s", width, out);
+}
+
+struct sort_entry sort_in_tx = {
+       .se_header      = "Branch in transaction",
+       .se_cmp         = sort__in_tx_cmp,
+       .se_snprintf    = hist_entry__in_tx_snprintf,
+       .se_width_idx   = HISTC_IN_TX,
+};
+
+static int64_t
+sort__transaction_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return left->transaction - right->transaction;
+}
+
+static inline char *add_str(char *p, const char *str)
+{
+       strcpy(p, str);
+       return p + strlen(str);
+}
+
+static struct txbit {
+       unsigned flag;
+       const char *name;
+       int skip_for_len;
+} txbits[] = {
+       { PERF_TXN_ELISION,        "EL ",        0 },
+       { PERF_TXN_TRANSACTION,    "TX ",        1 },
+       { PERF_TXN_SYNC,           "SYNC ",      1 },
+       { PERF_TXN_ASYNC,          "ASYNC ",     0 },
+       { PERF_TXN_RETRY,          "RETRY ",     0 },
+       { PERF_TXN_CONFLICT,       "CON ",       0 },
+       { PERF_TXN_CAPACITY_WRITE, "CAP-WRITE ", 1 },
+       { PERF_TXN_CAPACITY_READ,  "CAP-READ ",  0 },
+       { 0, NULL, 0 }
+};
+
+int hist_entry__transaction_len(void)
+{
+       int i;
+       int len = 0;
+
+       for (i = 0; txbits[i].name; i++) {
+               if (!txbits[i].skip_for_len)
+                       len += strlen(txbits[i].name);
+       }
+       len += 4; /* :XX<space> */
+       return len;
+}
+
+static int hist_entry__transaction_snprintf(struct hist_entry *self, char *bf,
+                                           size_t size, unsigned int width)
+{
+       u64 t = self->transaction;
+       char buf[128];
+       char *p = buf;
+       int i;
+
+       buf[0] = 0;
+       for (i = 0; txbits[i].name; i++)
+               if (txbits[i].flag & t)
+                       p = add_str(p, txbits[i].name);
+       if (t && !(t & (PERF_TXN_SYNC|PERF_TXN_ASYNC)))
+               p = add_str(p, "NEITHER ");
+       if (t & PERF_TXN_ABORT_MASK) {
+               sprintf(p, ":%" PRIx64,
+                       (t & PERF_TXN_ABORT_MASK) >>
+                       PERF_TXN_ABORT_SHIFT);
+               p += strlen(p);
+       }
+
+       return repsep_snprintf(bf, size, "%-*s", width, buf);
+}
+
+struct sort_entry sort_transaction = {
+       .se_header      = "Transaction                ",
+       .se_cmp         = sort__transaction_cmp,
+       .se_snprintf    = hist_entry__transaction_snprintf,
+       .se_width_idx   = HISTC_TRANSACTION,
+};
+
 struct sort_dimension {
        const char              *name;
        struct sort_entry       *entry;
@@ -876,6 +989,7 @@ static struct sort_dimension common_sort_dimensions[] = {
        DIM(SORT_SRCLINE, "srcline", sort_srcline),
        DIM(SORT_LOCAL_WEIGHT, "local_weight", sort_local_weight),
        DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight),
+       DIM(SORT_TRANSACTION, "transaction", sort_transaction),
 };
 
 #undef DIM
@@ -888,6 +1002,8 @@ static struct sort_dimension bstack_sort_dimensions[] = {
        DIM(SORT_SYM_FROM, "symbol_from", sort_sym_from),
        DIM(SORT_SYM_TO, "symbol_to", sort_sym_to),
        DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
+       DIM(SORT_IN_TX, "in_tx", sort_in_tx),
+       DIM(SORT_ABORT, "abort", sort_abort),
 };
 
 #undef DIM
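To make the new transaction column concrete, here is a standalone sketch of
the flag decoding that hist_entry__transaction_snprintf() performs above.
The PERF_TXN_* constants below are assumptions matching the apparent bit
layout (flag bits low, abort code in the high 32 bits); the authoritative
values are in the kernel's include/uapi/linux/perf_event.h.

    #include <stdio.h>
    #include <string.h>
    #include <inttypes.h>

    /* Assumed values; see include/uapi/linux/perf_event.h for the real ones. */
    #define PERF_TXN_TRANSACTION    (1ULL << 1)
    #define PERF_TXN_SYNC           (1ULL << 2)
    #define PERF_TXN_ABORT_SHIFT    32
    #define PERF_TXN_ABORT_MASK     (0xffffffffULL << PERF_TXN_ABORT_SHIFT)

    int main(void)
    {
            /* A synchronous abort carrying abort code 4. */
            uint64_t t = PERF_TXN_TRANSACTION | PERF_TXN_SYNC |
                         (4ULL << PERF_TXN_ABORT_SHIFT);
            char buf[64] = "";

            if (t & PERF_TXN_TRANSACTION)
                    strcat(buf, "TX ");
            if (t & PERF_TXN_SYNC)
                    strcat(buf, "SYNC ");
            if (t & PERF_TXN_ABORT_MASK)
                    sprintf(buf + strlen(buf), ":%" PRIx64,
                            (t & PERF_TXN_ABORT_MASK) >> PERF_TXN_ABORT_SHIFT);

            printf("%s\n", buf);    /* prints "TX SYNC :4" */
            return 0;
    }

With the dimensions registered above, the column should be selectable via
perf report's -s/--sort option (-s transaction, or -s in_tx,abort when
sorting branch stacks).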
index 4e80dbd271e77e245d41f2f36afdffdf8ae7ab4e..bf4333694d3ab108acd7ca0e873f26cf3cff1ca1 100644 (file)
@@ -85,6 +85,7 @@ struct hist_entry {
        struct map_symbol       ms;
        struct thread           *thread;
        u64                     ip;
+       u64                     transaction;
        s32                     cpu;
 
        struct hist_entry_diff  diff;
@@ -145,6 +146,7 @@ enum sort_type {
        SORT_SRCLINE,
        SORT_LOCAL_WEIGHT,
        SORT_GLOBAL_WEIGHT,
+       SORT_TRANSACTION,
 
        /* branch stack specific sort keys */
        __SORT_BRANCH_STACK,
@@ -153,6 +155,8 @@ enum sort_type {
        SORT_SYM_FROM,
        SORT_SYM_TO,
        SORT_MISPREDICT,
+       SORT_ABORT,
+       SORT_IN_TX,
 
        /* memory mode specific sort keys */
        __SORT_MEMORY_MODE,
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
new file mode 100644 (file)
index 0000000..d11aefb
--- /dev/null
@@ -0,0 +1,268 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <linux/kernel.h>
+
+#include "util/dso.h"
+#include "util/util.h"
+#include "util/debug.h"
+
+#ifdef HAVE_LIBBFD_SUPPORT
+
+/*
+ * Implement addr2line using libbfd.
+ */
+#define PACKAGE "perf"
+#include <bfd.h>
+
+struct a2l_data {
+       const char      *input;
+       unsigned long   addr;
+
+       bool            found;
+       const char      *filename;
+       const char      *funcname;
+       unsigned        line;
+
+       bfd             *abfd;
+       asymbol         **syms;
+};
+
+static int bfd_error(const char *string)
+{
+       const char *errmsg;
+
+       errmsg = bfd_errmsg(bfd_get_error());
+       fflush(stdout);
+
+       if (string)
+               pr_debug("%s: %s\n", string, errmsg);
+       else
+               pr_debug("%s\n", errmsg);
+
+       return -1;
+}
+
+static int slurp_symtab(bfd *abfd, struct a2l_data *a2l)
+{
+       long storage;
+       long symcount;
+       asymbol **syms;
+       bfd_boolean dynamic = FALSE;
+
+       if ((bfd_get_file_flags(abfd) & HAS_SYMS) == 0)
+               return bfd_error(bfd_get_filename(abfd));
+
+       storage = bfd_get_symtab_upper_bound(abfd);
+       if (storage == 0L) {
+               storage = bfd_get_dynamic_symtab_upper_bound(abfd);
+               dynamic = TRUE;
+       }
+       if (storage < 0L)
+               return bfd_error(bfd_get_filename(abfd));
+
+       syms = malloc(storage);
+       if (syms == NULL)
+               return -1;
+
+       if (dynamic)
+               symcount = bfd_canonicalize_dynamic_symtab(abfd, syms);
+       else
+               symcount = bfd_canonicalize_symtab(abfd, syms);
+
+       if (symcount < 0) {
+               free(syms);
+               return bfd_error(bfd_get_filename(abfd));
+       }
+
+       a2l->syms = syms;
+       return 0;
+}
+
+static void find_address_in_section(bfd *abfd, asection *section, void *data)
+{
+       bfd_vma pc, vma;
+       bfd_size_type size;
+       struct a2l_data *a2l = data;
+
+       if (a2l->found)
+               return;
+
+       if ((bfd_get_section_flags(abfd, section) & SEC_ALLOC) == 0)
+               return;
+
+       pc = a2l->addr;
+       vma = bfd_get_section_vma(abfd, section);
+       size = bfd_get_section_size(section);
+
+       if (pc < vma || pc >= vma + size)
+               return;
+
+       a2l->found = bfd_find_nearest_line(abfd, section, a2l->syms, pc - vma,
+                                          &a2l->filename, &a2l->funcname,
+                                          &a2l->line);
+}
+
+static struct a2l_data *addr2line_init(const char *path)
+{
+       bfd *abfd;
+       struct a2l_data *a2l = NULL;
+
+       abfd = bfd_openr(path, NULL);
+       if (abfd == NULL)
+               return NULL;
+
+       if (!bfd_check_format(abfd, bfd_object))
+               goto out;
+
+       a2l = zalloc(sizeof(*a2l));
+       if (a2l == NULL)
+               goto out;
+
+       a2l->abfd = abfd;
+       a2l->input = strdup(path);
+       if (a2l->input == NULL)
+               goto out;
+
+       if (slurp_symtab(abfd, a2l))
+               goto out;
+
+       return a2l;
+
+out:
+       if (a2l) {
+               free((void *)a2l->input);
+               free(a2l);
+       }
+       bfd_close(abfd);
+       return NULL;
+}
+
+static void addr2line_cleanup(struct a2l_data *a2l)
+{
+       if (a2l->abfd)
+               bfd_close(a2l->abfd);
+       free((void *)a2l->input);
+       free(a2l->syms);
+       free(a2l);
+}
+
+static int addr2line(const char *dso_name, unsigned long addr,
+                    char **file, unsigned int *line)
+{
+       int ret = 0;
+       struct a2l_data *a2l;
+
+       a2l = addr2line_init(dso_name);
+       if (a2l == NULL) {
+               pr_warning("addr2line_init failed for %s\n", dso_name);
+               return 0;
+       }
+
+       a2l->addr = addr;
+       bfd_map_over_sections(a2l->abfd, find_address_in_section, a2l);
+
+       if (a2l->found && a2l->filename) {
+               *file = strdup(a2l->filename);
+               *line = a2l->line;
+
+               if (*file)
+                       ret = 1;
+       }
+
+       addr2line_cleanup(a2l);
+       return ret;
+}
+
+#else /* HAVE_LIBBFD_SUPPORT */
+
+static int addr2line(const char *dso_name, unsigned long addr,
+                    char **file, unsigned int *line_nr)
+{
+       FILE *fp;
+       char cmd[PATH_MAX];
+       char *filename = NULL;
+       size_t len = 0;
+       char *sep;
+       int ret = 0;
+
+       scnprintf(cmd, sizeof(cmd), "addr2line -e %s %016lx",
+                 dso_name, addr);
+
+       fp = popen(cmd, "r");
+       if (fp == NULL) {
+               pr_warning("popen failed for %s\n", dso_name);
+               return 0;
+       }
+
+       if (getline(&filename, &len, fp) < 0 || !len) {
+               pr_warning("addr2line has no output for %s\n", dso_name);
+               goto out;
+       }
+
+       sep = strchr(filename, '\n');
+       if (sep)
+               *sep = '\0';
+
+       if (!strcmp(filename, "??:0")) {
+               pr_debug("no debugging info in %s\n", dso_name);
+               free(filename);
+               goto out;
+       }
+
+       sep = strchr(filename, ':');
+       if (sep) {
+               *sep++ = '\0';
+               *file = filename;
+               *line_nr = strtoul(sep, NULL, 0);
+               ret = 1;
+       }
+out:
+       pclose(fp);
+       return ret;
+}
+#endif /* HAVE_LIBBFD_SUPPORT */
+
+char *get_srcline(struct dso *dso, unsigned long addr)
+{
+       char *file = NULL;
+       unsigned line = 0;
+       char *srcline;
+       char *dso_name = dso->long_name;
+       size_t size;
+
+       if (!dso->has_srcline)
+               return SRCLINE_UNKNOWN;
+
+       if (dso_name[0] == '[')
+               goto out;
+
+       if (!strncmp(dso_name, "/tmp/perf-", 10))
+               goto out;
+
+       if (!addr2line(dso_name, addr, &file, &line))
+               goto out;
+
+       /* just calculate actual length */
+       size = snprintf(NULL, 0, "%s:%u", file, line) + 1;
+
+       srcline = malloc(size);
+       if (srcline)
+               snprintf(srcline, size, "%s:%u", file, line);
+       else
+               srcline = SRCLINE_UNKNOWN;
+
+       free(file);
+       return srcline;
+
+out:
+       dso->has_srcline = 0;
+       return SRCLINE_UNKNOWN;
+}
+
+void free_srcline(char *srcline)
+{
+       if (srcline && strcmp(srcline, SRCLINE_UNKNOWN) != 0)
+               free(srcline);
+}
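As a usage sketch: get_srcline() returns either a freshly allocated
"file:line" string or the shared SRCLINE_UNKNOWN sentinel, which is why
free_srcline() compares before freeing. print_location() below is a
hypothetical caller.

    #include <stdio.h>

    /* Hypothetical caller; struct dso, get_srcline() and free_srcline()
     * come from the perf util headers included above. */
    static void print_location(struct dso *dso, unsigned long addr)
    {
            char *srcline = get_srcline(dso, addr);

            /* Prints "file.c:123", or the SRCLINE_UNKNOWN placeholder when
             * the dso has no usable debug info. */
            printf("%#lx -> %s\n", addr, srcline);

            free_srcline(srcline);  /* no-op for the shared sentinel */
    }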
index a9c829be52169eac5b9f00d9382fd9281152b9e6..eed0b96302af189fa4b7c75f73ef538b8493336e 100644 (file)
@@ -8,7 +8,7 @@
 #include "symbol.h"
 #include "debug.h"
 
-#ifndef HAVE_ELF_GETPHDRNUM
+#ifndef HAVE_ELF_GETPHDRNUM_SUPPORT
 static int elf_getphdrnum(Elf *elf, size_t *dst)
 {
        GElf_Ehdr gehdr;
@@ -487,27 +487,27 @@ int filename__read_debuglink(const char *filename, char *debuglink,
 
        ek = elf_kind(elf);
        if (ek != ELF_K_ELF)
-               goto out_close;
+               goto out_elf_end;
 
        if (gelf_getehdr(elf, &ehdr) == NULL) {
                pr_err("%s: cannot get elf header.\n", __func__);
-               goto out_close;
+               goto out_elf_end;
        }
 
        sec = elf_section_by_name(elf, &ehdr, &shdr,
                                  ".gnu_debuglink", NULL);
        if (sec == NULL)
-               goto out_close;
+               goto out_elf_end;
 
        data = elf_getdata(sec, NULL);
        if (data == NULL)
-               goto out_close;
+               goto out_elf_end;
 
        /* the start of this section is a zero-terminated string */
        strncpy(debuglink, data->d_buf, size);
 
+out_elf_end:
        elf_end(elf);
-
 out_close:
        close(fd);
 out:
@@ -1018,6 +1018,601 @@ int file__read_maps(int fd, bool exe, mapfn_t mapfn, void *data,
        return err;
 }
 
+static int copy_bytes(int from, off_t from_offs, int to, off_t to_offs, u64 len)
+{
+       ssize_t r;
+       size_t n;
+       int err = -1;
+       char *buf = malloc(page_size);
+
+       if (buf == NULL)
+               return -1;
+
+       if (lseek(to, to_offs, SEEK_SET) != to_offs)
+               goto out;
+
+       if (lseek(from, from_offs, SEEK_SET) != from_offs)
+               goto out;
+
+       while (len) {
+               n = page_size;
+               if (len < n)
+                       n = len;
+               /* Use read because mmap won't work on proc files */
+               r = read(from, buf, n);
+               if (r < 0)
+                       goto out;
+               if (!r)
+                       break;
+               n = r;
+               r = write(to, buf, n);
+               if (r < 0)
+                       goto out;
+               if ((size_t)r != n)
+                       goto out;
+               len -= n;
+       }
+
+       err = 0;
+out:
+       free(buf);
+       return err;
+}
+
+struct kcore {
+       int fd;
+       int elfclass;
+       Elf *elf;
+       GElf_Ehdr ehdr;
+};
+
+static int kcore__open(struct kcore *kcore, const char *filename)
+{
+       GElf_Ehdr *ehdr;
+
+       kcore->fd = open(filename, O_RDONLY);
+       if (kcore->fd == -1)
+               return -1;
+
+       kcore->elf = elf_begin(kcore->fd, ELF_C_READ, NULL);
+       if (!kcore->elf)
+               goto out_close;
+
+       kcore->elfclass = gelf_getclass(kcore->elf);
+       if (kcore->elfclass == ELFCLASSNONE)
+               goto out_end;
+
+       ehdr = gelf_getehdr(kcore->elf, &kcore->ehdr);
+       if (!ehdr)
+               goto out_end;
+
+       return 0;
+
+out_end:
+       elf_end(kcore->elf);
+out_close:
+       close(kcore->fd);
+       return -1;
+}
+
+static int kcore__init(struct kcore *kcore, char *filename, int elfclass,
+                      bool temp)
+{
+       GElf_Ehdr *ehdr;
+
+       kcore->elfclass = elfclass;
+
+       if (temp)
+               kcore->fd = mkstemp(filename);
+       else
+               kcore->fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0400);
+       if (kcore->fd == -1)
+               return -1;
+
+       kcore->elf = elf_begin(kcore->fd, ELF_C_WRITE, NULL);
+       if (!kcore->elf)
+               goto out_close;
+
+       if (!gelf_newehdr(kcore->elf, elfclass))
+               goto out_end;
+
+       ehdr = gelf_getehdr(kcore->elf, &kcore->ehdr);
+       if (!ehdr)
+               goto out_end;
+
+       return 0;
+
+out_end:
+       elf_end(kcore->elf);
+out_close:
+       close(kcore->fd);
+       unlink(filename);
+       return -1;
+}
+
+static void kcore__close(struct kcore *kcore)
+{
+       elf_end(kcore->elf);
+       close(kcore->fd);
+}
+
+static int kcore__copy_hdr(struct kcore *from, struct kcore *to, size_t count)
+{
+       GElf_Ehdr *ehdr = &to->ehdr;
+       GElf_Ehdr *kehdr = &from->ehdr;
+
+       memcpy(ehdr->e_ident, kehdr->e_ident, EI_NIDENT);
+       ehdr->e_type      = kehdr->e_type;
+       ehdr->e_machine   = kehdr->e_machine;
+       ehdr->e_version   = kehdr->e_version;
+       ehdr->e_entry     = 0;
+       ehdr->e_shoff     = 0;
+       ehdr->e_flags     = kehdr->e_flags;
+       ehdr->e_phnum     = count;
+       ehdr->e_shentsize = 0;
+       ehdr->e_shnum     = 0;
+       ehdr->e_shstrndx  = 0;
+
+       if (from->elfclass == ELFCLASS32) {
+               ehdr->e_phoff     = sizeof(Elf32_Ehdr);
+               ehdr->e_ehsize    = sizeof(Elf32_Ehdr);
+               ehdr->e_phentsize = sizeof(Elf32_Phdr);
+       } else {
+               ehdr->e_phoff     = sizeof(Elf64_Ehdr);
+               ehdr->e_ehsize    = sizeof(Elf64_Ehdr);
+               ehdr->e_phentsize = sizeof(Elf64_Phdr);
+       }
+
+       if (!gelf_update_ehdr(to->elf, ehdr))
+               return -1;
+
+       if (!gelf_newphdr(to->elf, count))
+               return -1;
+
+       return 0;
+}
+
+static int kcore__add_phdr(struct kcore *kcore, int idx, off_t offset,
+                          u64 addr, u64 len)
+{
+       GElf_Phdr gphdr;
+       GElf_Phdr *phdr;
+
+       phdr = gelf_getphdr(kcore->elf, idx, &gphdr);
+       if (!phdr)
+               return -1;
+
+       phdr->p_type    = PT_LOAD;
+       phdr->p_flags   = PF_R | PF_W | PF_X;
+       phdr->p_offset  = offset;
+       phdr->p_vaddr   = addr;
+       phdr->p_paddr   = 0;
+       phdr->p_filesz  = len;
+       phdr->p_memsz   = len;
+       phdr->p_align   = page_size;
+
+       if (!gelf_update_phdr(kcore->elf, idx, phdr))
+               return -1;
+
+       return 0;
+}
+
+static off_t kcore__write(struct kcore *kcore)
+{
+       return elf_update(kcore->elf, ELF_C_WRITE);
+}
+
+struct phdr_data {
+       off_t offset;
+       u64 addr;
+       u64 len;
+};
+
+struct kcore_copy_info {
+       u64 stext;
+       u64 etext;
+       u64 first_symbol;
+       u64 last_symbol;
+       u64 first_module;
+       u64 last_module_symbol;
+       struct phdr_data kernel_map;
+       struct phdr_data modules_map;
+};
+
+static int kcore_copy__process_kallsyms(void *arg, const char *name, char type,
+                                       u64 start)
+{
+       struct kcore_copy_info *kci = arg;
+
+       if (!symbol_type__is_a(type, MAP__FUNCTION))
+               return 0;
+
+       if (strchr(name, '[')) {
+               if (start > kci->last_module_symbol)
+                       kci->last_module_symbol = start;
+               return 0;
+       }
+
+       if (!kci->first_symbol || start < kci->first_symbol)
+               kci->first_symbol = start;
+
+       if (!kci->last_symbol || start > kci->last_symbol)
+               kci->last_symbol = start;
+
+       if (!strcmp(name, "_stext")) {
+               kci->stext = start;
+               return 0;
+       }
+
+       if (!strcmp(name, "_etext")) {
+               kci->etext = start;
+               return 0;
+       }
+
+       return 0;
+}
+
+static int kcore_copy__parse_kallsyms(struct kcore_copy_info *kci,
+                                     const char *dir)
+{
+       char kallsyms_filename[PATH_MAX];
+
+       scnprintf(kallsyms_filename, PATH_MAX, "%s/kallsyms", dir);
+
+       if (symbol__restricted_filename(kallsyms_filename, "/proc/kallsyms"))
+               return -1;
+
+       if (kallsyms__parse(kallsyms_filename, kci,
+                           kcore_copy__process_kallsyms) < 0)
+               return -1;
+
+       return 0;
+}
+
+static int kcore_copy__process_modules(void *arg,
+                                      const char *name __maybe_unused,
+                                      u64 start)
+{
+       struct kcore_copy_info *kci = arg;
+
+       if (!kci->first_module || start < kci->first_module)
+               kci->first_module = start;
+
+       return 0;
+}
+
+static int kcore_copy__parse_modules(struct kcore_copy_info *kci,
+                                    const char *dir)
+{
+       char modules_filename[PATH_MAX];
+
+       scnprintf(modules_filename, PATH_MAX, "%s/modules", dir);
+
+       if (symbol__restricted_filename(modules_filename, "/proc/modules"))
+               return -1;
+
+       if (modules__parse(modules_filename, kci,
+                          kcore_copy__process_modules) < 0)
+               return -1;
+
+       return 0;
+}
+
+static void kcore_copy__map(struct phdr_data *p, u64 start, u64 end, u64 pgoff,
+                           u64 s, u64 e)
+{
+       if (p->addr || s < start || s >= end)
+               return;
+
+       p->addr = s;
+       p->offset = (s - start) + pgoff;
+       p->len = e < end ? e - s : end - s;
+}
+
+static int kcore_copy__read_map(u64 start, u64 len, u64 pgoff, void *data)
+{
+       struct kcore_copy_info *kci = data;
+       u64 end = start + len;
+
+       kcore_copy__map(&kci->kernel_map, start, end, pgoff, kci->stext,
+                       kci->etext);
+
+       kcore_copy__map(&kci->modules_map, start, end, pgoff, kci->first_module,
+                       kci->last_module_symbol);
+
+       return 0;
+}
+
+static int kcore_copy__read_maps(struct kcore_copy_info *kci, Elf *elf)
+{
+       if (elf_read_maps(elf, true, kcore_copy__read_map, kci) < 0)
+               return -1;
+
+       return 0;
+}
+
+static int kcore_copy__calc_maps(struct kcore_copy_info *kci, const char *dir,
+                                Elf *elf)
+{
+       if (kcore_copy__parse_kallsyms(kci, dir))
+               return -1;
+
+       if (kcore_copy__parse_modules(kci, dir))
+               return -1;
+
+       if (kci->stext)
+               kci->stext = round_down(kci->stext, page_size);
+       else
+               kci->stext = round_down(kci->first_symbol, page_size);
+
+       if (kci->etext) {
+               kci->etext = round_up(kci->etext, page_size);
+       } else if (kci->last_symbol) {
+               kci->etext = round_up(kci->last_symbol, page_size);
+               kci->etext += page_size;
+       }
+
+       kci->first_module = round_down(kci->first_module, page_size);
+
+       if (kci->last_module_symbol) {
+               kci->last_module_symbol = round_up(kci->last_module_symbol,
+                                                  page_size);
+               kci->last_module_symbol += page_size;
+       }
+
+       if (!kci->stext || !kci->etext)
+               return -1;
+
+       if (kci->first_module && !kci->last_module_symbol)
+               return -1;
+
+       return kcore_copy__read_maps(kci, elf);
+}
+
+static int kcore_copy__copy_file(const char *from_dir, const char *to_dir,
+                                const char *name)
+{
+       char from_filename[PATH_MAX];
+       char to_filename[PATH_MAX];
+
+       scnprintf(from_filename, PATH_MAX, "%s/%s", from_dir, name);
+       scnprintf(to_filename, PATH_MAX, "%s/%s", to_dir, name);
+
+       return copyfile_mode(from_filename, to_filename, 0400);
+}
+
+static int kcore_copy__unlink(const char *dir, const char *name)
+{
+       char filename[PATH_MAX];
+
+       scnprintf(filename, PATH_MAX, "%s/%s", dir, name);
+
+       return unlink(filename);
+}
+
+static int kcore_copy__compare_fds(int from, int to)
+{
+       char *buf_from;
+       char *buf_to;
+       ssize_t ret;
+       size_t len;
+       int err = -1;
+
+       buf_from = malloc(page_size);
+       buf_to = malloc(page_size);
+       if (!buf_from || !buf_to)
+               goto out;
+
+       while (1) {
+               /* Use read because mmap won't work on proc files */
+               ret = read(from, buf_from, page_size);
+               if (ret < 0)
+                       goto out;
+
+               if (!ret)
+                       break;
+
+               len = ret;
+
+               if (readn(to, buf_to, len) != (int)len)
+                       goto out;
+
+               if (memcmp(buf_from, buf_to, len))
+                       goto out;
+       }
+
+       err = 0;
+out:
+       free(buf_to);
+       free(buf_from);
+       return err;
+}
+
+static int kcore_copy__compare_files(const char *from_filename,
+                                    const char *to_filename)
+{
+       int from, to, err = -1;
+
+       from = open(from_filename, O_RDONLY);
+       if (from < 0)
+               return -1;
+
+       to = open(to_filename, O_RDONLY);
+       if (to < 0)
+               goto out_close_from;
+
+       err = kcore_copy__compare_fds(from, to);
+
+       close(to);
+out_close_from:
+       close(from);
+       return err;
+}
+
+static int kcore_copy__compare_file(const char *from_dir, const char *to_dir,
+                                   const char *name)
+{
+       char from_filename[PATH_MAX];
+       char to_filename[PATH_MAX];
+
+       scnprintf(from_filename, PATH_MAX, "%s/%s", from_dir, name);
+       scnprintf(to_filename, PATH_MAX, "%s/%s", to_dir, name);
+
+       return kcore_copy__compare_files(from_filename, to_filename);
+}
+
+/**
+ * kcore_copy - copy kallsyms, modules and kcore from one directory to another.
+ * @from_dir: from directory
+ * @to_dir: to directory
+ *
+ * This function copies kallsyms, modules and kcore files from one directory to
+ * another.  kallsyms and modules are copied entirely.  Only code segments are
+ * copied from kcore.  It is assumed that two segments suffice: one for the
+ * kernel proper and one for all the modules.  The code segments are determined
+ * from kallsyms and modules files.  The kernel map starts at _stext or the
+ * lowest function symbol, and ends at _etext or the highest function symbol.
+ * The module map starts at the lowest module address and ends at the highest
+ * module symbol.  Start addresses are rounded down to the nearest page.  End
+ * addresses are rounded up to the nearest page.  An extra page is added to the
+ * highest kernel symbol and highest module symbol to, hopefully, encompass that
+ * symbol too.  Because it contains only code sections, the resulting kcore is
+ * unusual.  One significant peculiarity is that the mapping (start -> pgoff)
+ * is not the same for the kernel map and the modules map.  That happens because
+ * the data is copied adjacently whereas the original kcore has gaps.  Finally,
+ * kallsyms and modules files are compared with their copies to check that
+ * modules have not been loaded or unloaded while the copies were taking place.
+ *
+ * Return: %0 on success, %-1 on failure.
+ */
+int kcore_copy(const char *from_dir, const char *to_dir)
+{
+       struct kcore kcore;
+       struct kcore extract;
+       size_t count = 2;
+       int idx = 0, err = -1;
+       off_t offset = page_size, sz, modules_offset = 0;
+       struct kcore_copy_info kci = { .stext = 0, };
+       char kcore_filename[PATH_MAX];
+       char extract_filename[PATH_MAX];
+
+       if (kcore_copy__copy_file(from_dir, to_dir, "kallsyms"))
+               return -1;
+
+       if (kcore_copy__copy_file(from_dir, to_dir, "modules"))
+               goto out_unlink_kallsyms;
+
+       scnprintf(kcore_filename, PATH_MAX, "%s/kcore", from_dir);
+       scnprintf(extract_filename, PATH_MAX, "%s/kcore", to_dir);
+
+       if (kcore__open(&kcore, kcore_filename))
+               goto out_unlink_modules;
+
+       if (kcore_copy__calc_maps(&kci, from_dir, kcore.elf))
+               goto out_kcore_close;
+
+       if (kcore__init(&extract, extract_filename, kcore.elfclass, false))
+               goto out_kcore_close;
+
+       if (!kci.modules_map.addr)
+               count -= 1;
+
+       if (kcore__copy_hdr(&kcore, &extract, count))
+               goto out_extract_close;
+
+       if (kcore__add_phdr(&extract, idx++, offset, kci.kernel_map.addr,
+                           kci.kernel_map.len))
+               goto out_extract_close;
+
+       if (kci.modules_map.addr) {
+               modules_offset = offset + kci.kernel_map.len;
+               if (kcore__add_phdr(&extract, idx, modules_offset,
+                                   kci.modules_map.addr, kci.modules_map.len))
+                       goto out_extract_close;
+       }
+
+       sz = kcore__write(&extract);
+       if (sz < 0 || sz > offset)
+               goto out_extract_close;
+
+       if (copy_bytes(kcore.fd, kci.kernel_map.offset, extract.fd, offset,
+                      kci.kernel_map.len))
+               goto out_extract_close;
+
+       if (modules_offset && copy_bytes(kcore.fd, kci.modules_map.offset,
+                                        extract.fd, modules_offset,
+                                        kci.modules_map.len))
+               goto out_extract_close;
+
+       if (kcore_copy__compare_file(from_dir, to_dir, "modules"))
+               goto out_extract_close;
+
+       if (kcore_copy__compare_file(from_dir, to_dir, "kallsyms"))
+               goto out_extract_close;
+
+       err = 0;
+
+out_extract_close:
+       kcore__close(&extract);
+       if (err)
+               unlink(extract_filename);
+out_kcore_close:
+       kcore__close(&kcore);
+out_unlink_modules:
+       if (err)
+               kcore_copy__unlink(to_dir, "modules");
+out_unlink_kallsyms:
+       if (err)
+               kcore_copy__unlink(to_dir, "kallsyms");
+
+       return err;
+}
+
+int kcore_extract__create(struct kcore_extract *kce)
+{
+       struct kcore kcore;
+       struct kcore extract;
+       size_t count = 1;
+       int idx = 0, err = -1;
+       off_t offset = page_size, sz;
+
+       if (kcore__open(&kcore, kce->kcore_filename))
+               return -1;
+
+       strcpy(kce->extract_filename, PERF_KCORE_EXTRACT);
+       if (kcore__init(&extract, kce->extract_filename, kcore.elfclass, true))
+               goto out_kcore_close;
+
+       if (kcore__copy_hdr(&kcore, &extract, count))
+               goto out_extract_close;
+
+       if (kcore__add_phdr(&extract, idx, offset, kce->addr, kce->len))
+               goto out_extract_close;
+
+       sz = kcore__write(&extract);
+       if (sz < 0 || sz > offset)
+               goto out_extract_close;
+
+       if (copy_bytes(kcore.fd, kce->offs, extract.fd, offset, kce->len))
+               goto out_extract_close;
+
+       err = 0;
+
+out_extract_close:
+       kcore__close(&extract);
+       if (err)
+               unlink(kce->extract_filename);
+out_kcore_close:
+       kcore__close(&kcore);
+
+       return err;
+}
+
+void kcore_extract__delete(struct kcore_extract *kce)
+{
+       unlink(kce->extract_filename);
+}
+
 void symbol__elf_init(void)
 {
        elf_version(EV_CURRENT);
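Before the minimal (non-libelf) stubs in the next file, a hedged sketch of
how the extraction API above might be driven, e.g. to disassemble one kernel
symbol straight out of /proc/kcore; the sym_* values and the annotate()
consumer are hypothetical.

    /* Hypothetical driver for kcore_extract__create()/__delete(). */
    struct kcore_extract kce = {
            .kcore_filename = (char *)"/proc/kcore",
            .addr           = sym_start, /* symbol's virtual address (hypothetical) */
            .offs           = sym_offs,  /* its file offset inside kcore (hypothetical) */
            .len            = sym_len,   /* number of bytes to copy (hypothetical) */
    };

    if (kcore_extract__create(&kce) == 0) {
            /* kce.extract_filename now names a small valid ELF whose single
             * PT_LOAD segment maps the requested bytes at the right address. */
            annotate(kce.extract_filename); /* hypothetical consumer */
            kcore_extract__delete(&kce);    /* unlinks the temporary file */
    }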
index 3a802c300fc564adce713aebbd848a1b936e78c1..2d2dd0532b5a971e588086c0605ae0d6ed4080df 100644 (file)
@@ -308,6 +308,21 @@ int file__read_maps(int fd __maybe_unused, bool exe __maybe_unused,
        return -1;
 }
 
+int kcore_extract__create(struct kcore_extract *kce __maybe_unused)
+{
+       return -1;
+}
+
+void kcore_extract__delete(struct kcore_extract *kce __maybe_unused)
+{
+}
+
+int kcore_copy(const char *from_dir __maybe_unused,
+              const char *to_dir __maybe_unused)
+{
+       return -1;
+}
+
 void symbol__elf_init(void)
 {
 }
index 7eb0362f4ffd2468c81f6ae1a93686d00dfefe50..c0c36965fff0f0060b2f2a077edc17c569648bf2 100644 (file)
@@ -51,6 +51,7 @@ static enum dso_binary_type binary_type_symtab[] = {
        DSO_BINARY_TYPE__SYSTEM_PATH_DSO,
        DSO_BINARY_TYPE__GUEST_KMODULE,
        DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE,
+       DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
        DSO_BINARY_TYPE__NOT_FOUND,
 };
 
@@ -159,10 +160,12 @@ again:
 
                if (choose_best_symbol(curr, next) == SYMBOL_A) {
                        rb_erase(&next->rb_node, symbols);
+                       symbol__delete(next);
                        goto again;
                } else {
                        nd = rb_next(&curr->rb_node);
                        rb_erase(&curr->rb_node, symbols);
+                       symbol__delete(curr);
                }
        }
 }
@@ -499,6 +502,64 @@ out_failure:
        return -1;
 }
 
+int modules__parse(const char *filename, void *arg,
+                  int (*process_module)(void *arg, const char *name,
+                                        u64 start))
+{
+       char *line = NULL;
+       size_t n;
+       FILE *file;
+       int err = 0;
+
+       file = fopen(filename, "r");
+       if (file == NULL)
+               return -1;
+
+       while (1) {
+               char name[PATH_MAX];
+               u64 start;
+               char *sep;
+               ssize_t line_len;
+
+               line_len = getline(&line, &n, file);
+               if (line_len < 0) {
+                       if (feof(file))
+                               break;
+                       err = -1;
+                       goto out;
+               }
+
+               if (!line) {
+                       err = -1;
+                       goto out;
+               }
+
+               line[--line_len] = '\0'; /* \n */
+
+               sep = strrchr(line, 'x');
+               if (sep == NULL)
+                       continue;
+
+               hex2u64(sep + 1, &start);
+
+               sep = strchr(line, ' ');
+               if (sep == NULL)
+                       continue;
+
+               *sep = '\0';
+
+               scnprintf(name, sizeof(name), "[%s]", line);
+
+               err = process_module(arg, name, start);
+               if (err)
+                       break;
+       }
+out:
+       free(line);
+       fclose(file);
+       return err;
+}
+
 struct process_kallsyms_args {
        struct map *map;
        struct dso *dso;
@@ -739,51 +800,242 @@ bool symbol__restricted_filename(const char *filename,
        return restricted;
 }
 
-struct kcore_mapfn_data {
-       struct dso *dso;
-       enum map_type type;
-       struct list_head maps;
+struct module_info {
+       struct rb_node rb_node;
+       char *name;
+       u64 start;
 };
 
-static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data)
+static void add_module(struct module_info *mi, struct rb_root *modules)
 {
-       struct kcore_mapfn_data *md = data;
-       struct map *map;
+       struct rb_node **p = &modules->rb_node;
+       struct rb_node *parent = NULL;
+       struct module_info *m;
 
-       map = map__new2(start, md->dso, md->type);
-       if (map == NULL)
+       while (*p != NULL) {
+               parent = *p;
+               m = rb_entry(parent, struct module_info, rb_node);
+               if (strcmp(mi->name, m->name) < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&mi->rb_node, parent, p);
+       rb_insert_color(&mi->rb_node, modules);
+}
+
+static void delete_modules(struct rb_root *modules)
+{
+       struct module_info *mi;
+       struct rb_node *next = rb_first(modules);
+
+       while (next) {
+               mi = rb_entry(next, struct module_info, rb_node);
+               next = rb_next(&mi->rb_node);
+               rb_erase(&mi->rb_node, modules);
+               free(mi->name);
+               free(mi);
+       }
+}
+
+static struct module_info *find_module(const char *name,
+                                      struct rb_root *modules)
+{
+       struct rb_node *n = modules->rb_node;
+
+       while (n) {
+               struct module_info *m;
+               int cmp;
+
+               m = rb_entry(n, struct module_info, rb_node);
+               cmp = strcmp(name, m->name);
+               if (cmp < 0)
+                       n = n->rb_left;
+               else if (cmp > 0)
+                       n = n->rb_right;
+               else
+                       return m;
+       }
+
+       return NULL;
+}
+
+static int __read_proc_modules(void *arg, const char *name, u64 start)
+{
+       struct rb_root *modules = arg;
+       struct module_info *mi;
+
+       mi = zalloc(sizeof(struct module_info));
+       if (!mi)
                return -ENOMEM;
 
-       map->end = map->start + len;
-       map->pgoff = pgoff;
+       mi->name = strdup(name);
+       mi->start = start;
 
-       list_add(&map->node, &md->maps);
+       if (!mi->name) {
+               free(mi);
+               return -ENOMEM;
+       }
+
+       add_module(mi, modules);
+
+       return 0;
+}
+
+static int read_proc_modules(const char *filename, struct rb_root *modules)
+{
+       if (symbol__restricted_filename(filename, "/proc/modules"))
+               return -1;
+
+       if (modules__parse(filename, modules, __read_proc_modules)) {
+               delete_modules(modules);
+               return -1;
+       }
 
        return 0;
 }
 
+int compare_proc_modules(const char *from, const char *to)
+{
+       struct rb_root from_modules = RB_ROOT;
+       struct rb_root to_modules = RB_ROOT;
+       struct rb_node *from_node, *to_node;
+       struct module_info *from_m, *to_m;
+       int ret = -1;
+
+       if (read_proc_modules(from, &from_modules))
+               return -1;
+
+       if (read_proc_modules(to, &to_modules))
+               goto out_delete_from;
+
+       from_node = rb_first(&from_modules);
+       to_node = rb_first(&to_modules);
+       while (from_node) {
+               if (!to_node)
+                       break;
+
+               from_m = rb_entry(from_node, struct module_info, rb_node);
+               to_m = rb_entry(to_node, struct module_info, rb_node);
+
+               if (from_m->start != to_m->start ||
+                   strcmp(from_m->name, to_m->name))
+                       break;
+
+               from_node = rb_next(from_node);
+               to_node = rb_next(to_node);
+       }
+
+       if (!from_node && !to_node)
+               ret = 0;
+
+       delete_modules(&to_modules);
+out_delete_from:
+       delete_modules(&from_modules);
+
+       return ret;
+}
+
+static int do_validate_kcore_modules(const char *filename, struct map *map,
+                                 struct map_groups *kmaps)
+{
+       struct rb_root modules = RB_ROOT;
+       struct map *old_map;
+       int err;
+
+       err = read_proc_modules(filename, &modules);
+       if (err)
+               return err;
+
+       old_map = map_groups__first(kmaps, map->type);
+       while (old_map) {
+               struct map *next = map_groups__next(old_map);
+               struct module_info *mi;
+
+               if (old_map == map || old_map->start == map->start) {
+                       /* The kernel map */
+                       old_map = next;
+                       continue;
+               }
+
+               /* Module must be in memory at the same address */
+               mi = find_module(old_map->dso->short_name, &modules);
+               if (!mi || mi->start != old_map->start) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               old_map = next;
+       }
+out:
+       delete_modules(&modules);
+       return err;
+}
+
 /*
- * If kallsyms is referenced by name then we look for kcore in the same
+ * If kallsyms is referenced by name then we look for filename in the same
  * directory.
  */
-static bool kcore_filename_from_kallsyms_filename(char *kcore_filename,
-                                                 const char *kallsyms_filename)
+static bool filename_from_kallsyms_filename(char *filename,
+                                           const char *base_name,
+                                           const char *kallsyms_filename)
 {
        char *name;
 
-       strcpy(kcore_filename, kallsyms_filename);
-       name = strrchr(kcore_filename, '/');
+       strcpy(filename, kallsyms_filename);
+       name = strrchr(filename, '/');
        if (!name)
                return false;
 
-       if (!strcmp(name, "/kallsyms")) {
-               strcpy(name, "/kcore");
+       name += 1;
+
+       if (!strcmp(name, "kallsyms")) {
+               strcpy(name, base_name);
                return true;
        }
 
        return false;
 }
 
+static int validate_kcore_modules(const char *kallsyms_filename,
+                                 struct map *map)
+{
+       struct map_groups *kmaps = map__kmap(map)->kmaps;
+       char modules_filename[PATH_MAX];
+
+       if (!filename_from_kallsyms_filename(modules_filename, "modules",
+                                            kallsyms_filename))
+               return -EINVAL;
+
+       if (do_validate_kcore_modules(modules_filename, map, kmaps))
+               return -EINVAL;
+
+       return 0;
+}
+
+struct kcore_mapfn_data {
+       struct dso *dso;
+       enum map_type type;
+       struct list_head maps;
+};
+
+static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data)
+{
+       struct kcore_mapfn_data *md = data;
+       struct map *map;
+
+       map = map__new2(start, md->dso, md->type);
+       if (map == NULL)
+               return -ENOMEM;
+
+       map->end = map->start + len;
+       map->pgoff = pgoff;
+
+       list_add(&map->node, &md->maps);
+
+       return 0;
+}
+
 static int dso__load_kcore(struct dso *dso, struct map *map,
                           const char *kallsyms_filename)
 {
@@ -800,8 +1052,12 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
        if (map != machine->vmlinux_maps[map->type])
                return -EINVAL;
 
-       if (!kcore_filename_from_kallsyms_filename(kcore_filename,
-                                                  kallsyms_filename))
+       if (!filename_from_kallsyms_filename(kcore_filename, "kcore",
+                                            kallsyms_filename))
+               return -EINVAL;
+
+       /* All modules must be present at their original addresses */
+       if (validate_kcore_modules(kallsyms_filename, map))
                return -EINVAL;
 
        md.dso = dso;
@@ -1188,6 +1444,105 @@ out:
        return err;
 }
 
+static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz)
+{
+       char kallsyms_filename[PATH_MAX];
+       struct dirent *dent;
+       int ret = -1;
+       DIR *d;
+
+       d = opendir(dir);
+       if (!d)
+               return -1;
+
+       while (1) {
+               dent = readdir(d);
+               if (!dent)
+                       break;
+               if (dent->d_type != DT_DIR)
+                       continue;
+               scnprintf(kallsyms_filename, sizeof(kallsyms_filename),
+                         "%s/%s/kallsyms", dir, dent->d_name);
+               if (!validate_kcore_modules(kallsyms_filename, map)) {
+                       strlcpy(dir, kallsyms_filename, dir_sz);
+                       ret = 0;
+                       break;
+               }
+       }
+
+       closedir(d);
+
+       return ret;
+}
+
+static char *dso__find_kallsyms(struct dso *dso, struct map *map)
+{
+       u8 host_build_id[BUILD_ID_SIZE];
+       char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+       bool is_host = false;
+       char path[PATH_MAX];
+
+       if (!dso->has_build_id) {
+               /*
+                * Last resort, if we don't have a build-id and couldn't find
+                * any vmlinux file, try the running kernel kallsyms table.
+                */
+               goto proc_kallsyms;
+       }
+
+       if (sysfs__read_build_id("/sys/kernel/notes", host_build_id,
+                                sizeof(host_build_id)) == 0)
+               is_host = dso__build_id_equal(dso, host_build_id);
+
+       build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
+
+       /* Use /proc/kallsyms if possible */
+       if (is_host) {
+               DIR *d;
+               int fd;
+
+               /* If no cached kcore go with /proc/kallsyms */
+               scnprintf(path, sizeof(path), "%s/[kernel.kcore]/%s",
+                         buildid_dir, sbuild_id);
+               d = opendir(path);
+               if (!d)
+                       goto proc_kallsyms;
+               closedir(d);
+
+               /*
+                * Do not check the build-id cache, until we know we cannot use
+                * /proc/kcore.
+                */
+               fd = open("/proc/kcore", O_RDONLY);
+               if (fd != -1) {
+                       close(fd);
+                       /* If module maps match go with /proc/kallsyms */
+                       if (!validate_kcore_modules("/proc/kallsyms", map))
+                               goto proc_kallsyms;
+               }
+
+               /* Find kallsyms in build-id cache with kcore */
+               if (!find_matching_kcore(map, path, sizeof(path)))
+                       return strdup(path);
+
+               goto proc_kallsyms;
+       }
+
+       scnprintf(path, sizeof(path), "%s/[kernel.kallsyms]/%s",
+                 buildid_dir, sbuild_id);
+
+       if (access(path, F_OK)) {
+               pr_err("No kallsyms or vmlinux with build-id %s was found\n",
+                      sbuild_id);
+               return NULL;
+       }
+
+       return strdup(path);
+
+proc_kallsyms:
+       return strdup("/proc/kallsyms");
+}
+
 static int dso__load_kernel_sym(struct dso *dso, struct map *map,
                                symbol_filter_t filter)
 {
@@ -1214,7 +1569,7 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map,
                goto do_kallsyms;
        }
 
-       if (symbol_conf.vmlinux_name != NULL) {
+       if (!symbol_conf.ignore_vmlinux && symbol_conf.vmlinux_name != NULL) {
                err = dso__load_vmlinux(dso, map,
                                        symbol_conf.vmlinux_name, filter);
                if (err > 0) {
@@ -1226,7 +1581,7 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map,
                return err;
        }
 
-       if (vmlinux_path != NULL) {
+       if (!symbol_conf.ignore_vmlinux && vmlinux_path != NULL) {
                err = dso__load_vmlinux_path(dso, map, filter);
                if (err > 0)
                        return err;
@@ -1236,51 +1591,11 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map,
        if (symbol_conf.symfs[0] != 0)
                return -1;
 
-       /*
-        * Say the kernel DSO was created when processing the build-id header table,
-        * we have a build-id, so check if it is the same as the running kernel,
-        * using it if it is.
-        */
-       if (dso->has_build_id) {
-               u8 kallsyms_build_id[BUILD_ID_SIZE];
-               char sbuild_id[BUILD_ID_SIZE * 2 + 1];
-
-               if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id,
-                                        sizeof(kallsyms_build_id)) == 0) {
-                       if (dso__build_id_equal(dso, kallsyms_build_id)) {
-                               kallsyms_filename = "/proc/kallsyms";
-                               goto do_kallsyms;
-                       }
-               }
-               /*
-                * Now look if we have it on the build-id cache in
-                * $HOME/.debug/[kernel.kallsyms].
-                */
-               build_id__sprintf(dso->build_id, sizeof(dso->build_id),
-                                 sbuild_id);
-
-               if (asprintf(&kallsyms_allocated_filename,
-                            "%s/.debug/[kernel.kallsyms]/%s",
-                            getenv("HOME"), sbuild_id) == -1) {
-                       pr_err("Not enough memory for kallsyms file lookup\n");
-                       return -1;
-               }
-
-               kallsyms_filename = kallsyms_allocated_filename;
+       kallsyms_allocated_filename = dso__find_kallsyms(dso, map);
+       if (!kallsyms_allocated_filename)
+               return -1;
 
-               if (access(kallsyms_filename, F_OK)) {
-                       pr_err("No kallsyms or vmlinux with build-id %s "
-                              "was found\n", sbuild_id);
-                       free(kallsyms_allocated_filename);
-                       return -1;
-               }
-       } else {
-               /*
-                * Last resort, if we don't have a build-id and couldn't find
-                * any vmlinux file, try the running kernel kallsyms table.
-                */
-               kallsyms_filename = "/proc/kallsyms";
-       }
+       kallsyms_filename = kallsyms_allocated_filename;
 
 do_kallsyms:
        err = dso__load_kallsyms(dso, kallsyms_filename, map, filter);
index fd5b70ea29812334572fc7a6f5fad2c3ec7058e7..07de8fea2f48dbc346124539e7393307b839051a 100644 (file)
@@ -13,7 +13,7 @@
 #include <libgen.h>
 #include "build-id.h"
 
-#ifdef LIBELF_SUPPORT
+#ifdef HAVE_LIBELF_SUPPORT
 #include <libelf.h>
 #include <gelf.h>
 #endif
@@ -21,7 +21,7 @@
 
 #include "dso.h"
 
-#ifdef HAVE_CPLUS_DEMANGLE
+#ifdef HAVE_CPLUS_DEMANGLE_SUPPORT
 extern char *cplus_demangle(const char *, int);
 
 static inline char *bfd_demangle(void __maybe_unused *v, const char *c, int i)
@@ -46,7 +46,7 @@ static inline char *bfd_demangle(void __maybe_unused *v,
  * libelf 0.8.x and earlier do not support ELF_C_READ_MMAP;
  * for newer versions we can use mmap to reduce memory usage:
  */
-#ifdef LIBELF_MMAP
+#ifdef HAVE_LIBELF_MMAP_SUPPORT
 # define PERF_ELF_C_READ_MMAP ELF_C_READ_MMAP
 #else
 # define PERF_ELF_C_READ_MMAP ELF_C_READ
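
The renamed HAVE_LIBELF_MMAP_SUPPORT guard still just selects between
ELF_C_READ_MMAP and plain ELF_C_READ. A self-contained sketch of how such a
descriptor is opened with stock libelf calls (error reporting elided):

#include <fcntl.h>
#include <libelf.h>
#include <stdio.h>
#include <unistd.h>

#ifdef HAVE_LIBELF_MMAP_SUPPORT
# define PERF_ELF_C_READ_MMAP ELF_C_READ_MMAP
#else
# define PERF_ELF_C_READ_MMAP ELF_C_READ
#endif

int main(int argc, char **argv)
{
        Elf *elf;
        int fd;

        /* libelf requires the version handshake before any other call. */
        if (argc < 2 || elf_version(EV_CURRENT) == EV_NONE)
                return 1;

        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;

        elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
        if (elf) {
                printf("elf_kind: %d\n", (int)elf_kind(elf));
                elf_end(elf);
        }
        close(fd);
        return 0;
}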
@@ -85,6 +85,7 @@ struct symbol_conf {
        unsigned short  priv_size;
        unsigned short  nr_events;
        bool            try_vmlinux_path,
+                       ignore_vmlinux,
                        show_kernel_path,
                        use_modules,
                        sort_by_name,
@@ -178,7 +179,7 @@ struct symsrc {
        int fd;
        enum dso_binary_type type;
 
-#ifdef LIBELF_SUPPORT
+#ifdef HAVE_LIBELF_SUPPORT
        Elf *elf;
        GElf_Ehdr ehdr;
 
@@ -222,6 +223,9 @@ int sysfs__read_build_id(const char *filename, void *bf, size_t size);
 int kallsyms__parse(const char *filename, void *arg,
                    int (*process_symbol)(void *arg, const char *name,
                                          char type, u64 start));
+int modules__parse(const char *filename, void *arg,
+                  int (*process_module)(void *arg, const char *name,
+                                        u64 start));
 int filename__read_debuglink(const char *filename, char *debuglink,
                             size_t size);
 
@@ -252,4 +256,21 @@ typedef int (*mapfn_t)(u64 start, u64 len, u64 pgoff, void *data);
 int file__read_maps(int fd, bool exe, mapfn_t mapfn, void *data,
                    bool *is_64_bit);
 
+#define PERF_KCORE_EXTRACT "/tmp/perf-kcore-XXXXXX"
+
+struct kcore_extract {
+       char *kcore_filename;
+       u64 addr;
+       u64 offs;
+       u64 len;
+       char extract_filename[sizeof(PERF_KCORE_EXTRACT)];
+       int fd;
+};
+
+int kcore_extract__create(struct kcore_extract *kce);
+void kcore_extract__delete(struct kcore_extract *kce);
+
+int kcore_copy(const char *from_dir, const char *to_dir);
+int compare_proc_modules(const char *from, const char *to);
+
 #endif /* __PERF_SYMBOL */
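
The kcore_extract API added here copies one address range out of a kcore
image into a temporary file stamped from the PERF_KCORE_EXTRACT template.
A hedged usage sketch, assuming the caller fills kcore_filename, addr, offs
and len before the create call; the exact semantics may differ from this
outline:

static int extract_kernel_text(void)
{
        struct kcore_extract kce = {
                .kcore_filename = (char *)"/proc/kcore",
                .addr           = 0xffffffff81000000ULL, /* illustrative */
                .offs           = 0,
                .len            = 0x10000,
        };

        if (kcore_extract__create(&kce))
                return -1;

        /* kce.extract_filename now names a /tmp/perf-kcore-XXXXXX file and
         * kce.fd is presumably open on it; consume the bytes here. */

        kcore_extract__delete(&kce);    /* drop the temporary again */
        return 0;
}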
index b554ffc462b653e73b7e8de84cfa53e2609be989..88cfeaff600b334390842e725b9a7f792b0fd9af 100644 (file)
@@ -24,6 +24,7 @@ struct perf_top {
        u64                exact_samples;
        u64                guest_us_samples, guest_kernel_samples;
        int                print_entries, count_filter, delay_secs;
+       int                max_stack;
        bool               hide_kernel_symbols, hide_user_symbols, zero;
        bool               use_tui, use_stdio;
        bool               kptr_restrict_warned;
index e9e1c03f927d27bce1e041a9cc61ca47dae2b987..6681f71f2f95e329ddc5eba636d05ce67d3591ee 100644 (file)
@@ -120,42 +120,6 @@ raw_field_value(struct event_format *event, const char *name, void *data)
        return val;
 }
 
-void *raw_field_ptr(struct event_format *event, const char *name, void *data)
-{
-       struct format_field *field;
-
-       field = pevent_find_any_field(event, name);
-       if (!field)
-               return NULL;
-
-       if (field->flags & FIELD_IS_DYNAMIC) {
-               int offset;
-
-               offset = *(int *)(data + field->offset);
-               offset &= 0xffff;
-
-               return data + offset;
-       }
-
-       return data + field->offset;
-}
-
-int trace_parse_common_type(struct pevent *pevent, void *data)
-{
-       struct pevent_record record;
-
-       record.data = data;
-       return pevent_data_type(pevent, &record);
-}
-
-int trace_parse_common_pid(struct pevent *pevent, void *data)
-{
-       struct pevent_record record;
-
-       record.data = data;
-       return pevent_data_pid(pevent, &record);
-}
-
 unsigned long long read_size(struct event_format *event, void *ptr, int size)
 {
        return pevent_read_number(event->pevent, ptr, size);
index fafe1a40444a2b0785e4d41b048ee0926786fcf9..04df63114109604403f2f57a84ed03ec0b765707 100644 (file)
@@ -11,8 +11,6 @@ union perf_event;
 struct perf_tool;
 struct thread;
 
-extern struct pevent *perf_pevent;
-
 int bigendian(void);
 
 struct pevent *read_trace_init(int file_bigendian, int host_bigendian);
@@ -23,26 +21,19 @@ int parse_ftrace_file(struct pevent *pevent, char *buf, unsigned long size);
 int parse_event_file(struct pevent *pevent,
                     char *buf, unsigned long size, char *sys);
 
-struct pevent_record *trace_peek_data(struct pevent *pevent, int cpu);
-
 unsigned long long
 raw_field_value(struct event_format *event, const char *name, void *data);
-void *raw_field_ptr(struct event_format *event, const char *name, void *data);
 
 void parse_proc_kallsyms(struct pevent *pevent, char *file, unsigned int size);
 void parse_ftrace_printk(struct pevent *pevent, char *file, unsigned int size);
 
 ssize_t trace_report(int fd, struct pevent **pevent, bool repipe);
 
-int trace_parse_common_type(struct pevent *pevent, void *data);
-int trace_parse_common_pid(struct pevent *pevent, void *data);
-
 struct event_format *trace_find_next_event(struct pevent *pevent,
                                           struct event_format *event);
 unsigned long long read_size(struct event_format *event, void *ptr, int size);
 unsigned long long eval_flag(const char *flag);
 
-struct pevent_record *trace_read_data(struct pevent *pevent, int cpu);
 int read_tracing_data(int fd, struct list_head *pattrs);
 
 struct tracing_data {
index cb6bc503a792413dd701fec8dafbf60d9e921ed6..ec0c71a2ca2e22f2c78ef90726d0e9afe623b43c 100644 (file)
@@ -13,7 +13,7 @@ struct unwind_entry {
 
 typedef int (*unwind_entry_cb_t)(struct unwind_entry *entry, void *arg);
 
-#ifdef LIBUNWIND_SUPPORT
+#ifdef HAVE_LIBUNWIND_SUPPORT
 int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
                        struct machine *machine,
                        struct thread *thread,
@@ -31,5 +31,5 @@ unwind__get_entries(unwind_entry_cb_t cb __maybe_unused,
 {
        return 0;
 }
-#endif /* LIBUNWIND_SUPPORT */
+#endif /* HAVE_LIBUNWIND_SUPPORT */
 #endif /* __UNWIND_H */
index 6d17b18e915d56b47e02d16c4c27cae2a85764db..c25e57b3acb2994d7d53b8089f1897030b71efb7 100644 (file)
@@ -1,7 +1,7 @@
 #include "../perf.h"
 #include "util.h"
 #include <sys/mman.h>
-#ifdef BACKTRACE_SUPPORT
+#ifdef HAVE_BACKTRACE_SUPPORT
 #include <execinfo.h>
 #endif
 #include <stdio.h>
@@ -55,17 +55,20 @@ int mkdir_p(char *path, mode_t mode)
        return (stat(path, &st) && mkdir(path, mode)) ? -1 : 0;
 }
 
-static int slow_copyfile(const char *from, const char *to)
+static int slow_copyfile(const char *from, const char *to, mode_t mode)
 {
-       int err = 0;
+       int err = -1;
        char *line = NULL;
        size_t n;
        FILE *from_fp = fopen(from, "r"), *to_fp;
+       mode_t old_umask;
 
        if (from_fp == NULL)
                goto out;
 
+       old_umask = umask(mode ^ 0777);
        to_fp = fopen(to, "w");
+       umask(old_umask);
        if (to_fp == NULL)
                goto out_fclose_from;
 
@@ -82,7 +85,7 @@ out:
        return err;
 }
 
-int copyfile(const char *from, const char *to)
+int copyfile_mode(const char *from, const char *to, mode_t mode)
 {
        int fromfd, tofd;
        struct stat st;
@@ -93,13 +96,13 @@ int copyfile(const char *from, const char *to)
                goto out;
 
        if (st.st_size == 0) /* /proc? do it slowly... */
-               return slow_copyfile(from, to);
+               return slow_copyfile(from, to, mode);
 
        fromfd = open(from, O_RDONLY);
        if (fromfd < 0)
                goto out;
 
-       tofd = creat(to, 0755);
+       tofd = creat(to, mode);
        if (tofd < 0)
                goto out_close_from;
 
@@ -121,6 +124,11 @@ out:
        return err;
 }
 
+int copyfile(const char *from, const char *to)
+{
+       return copyfile_mode(from, to, 0755);
+}
+
 unsigned long convert_unit(unsigned long value, char *unit)
 {
        *unit = ' ';
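
slow_copyfile() gains a mode argument above, but fopen(3) has no way to
accept one; the hunk therefore flips the process umask to the complement of
the requested mode around the fopen() call. The trick in isolation: fopen
creates files with 0666 & ~umask, so umask(mode ^ 0777) yields exactly the
bits of mode that fit in 0666 (execute bits can never be set this way, which
is fine for the /proc-style files copied here):

#include <stdio.h>
#include <sys/stat.h>

static FILE *fopen_with_mode(const char *path, mode_t mode)
{
        mode_t old_umask = umask(mode ^ 0777);  /* ~mode, within 0777 */
        FILE *fp = fopen(path, "w");

        umask(old_umask);               /* restore the caller's umask */
        return fp;
}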
@@ -204,7 +212,7 @@ int hex2u64(const char *ptr, u64 *long_val)
 }
 
 /* Obtain a backtrace and print it to stdout. */
-#ifdef BACKTRACE_SUPPORT
+#ifdef HAVE_BACKTRACE_SUPPORT
 void dump_stack(void)
 {
        void *array[16];
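
dump_stack() is only built when HAVE_BACKTRACE_SUPPORT is set; its body is
truncated by the hunk, but it follows the usual glibc backtrace(3) pattern.
A self-contained sketch of that pattern (the formatting differs from perf's
actual output):

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_stack_sketch(void)
{
        void *array[16];
        int i, size = backtrace(array, 16);
        char **strings = backtrace_symbols(array, size);

        if (!strings)
                return;
        for (i = 0; i < size; i++)
                printf("%s\n", strings[i]);
        free(strings);                  /* one free covers all entries */
}

int main(void)
{
        dump_stack_sketch();            /* link with -rdynamic for names */
        return 0;
}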
@@ -361,3 +369,45 @@ int parse_nsec_time(const char *str, u64 *ptime)
        *ptime = time_sec * NSEC_PER_SEC + time_nsec;
        return 0;
 }
+
+unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
+{
+       struct parse_tag *i = tags;
+
+       while (i->tag) {
+               char *s;
+
+               s = strchr(str, i->tag);
+               if (s) {
+                       unsigned long int value;
+                       char *endptr;
+
+                       value = strtoul(str, &endptr, 10);
+                       if (s != endptr)
+                               break;
+
+                       value *= i->mult;
+                       return value;
+               }
+               i++;
+       }
+
+       return (unsigned long) -1;
+}
+
+int filename__read_int(const char *filename, int *value)
+{
+       char line[64];
+       int fd = open(filename, O_RDONLY), err = -1;
+
+       if (fd < 0)
+               return -1;
+
+       if (read(fd, line, sizeof(line)) > 0) {
+               *value = atoi(line);
+               err = 0;
+       }
+
+       close(fd);
+       return err;
+}
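
parse_tag_value() scans str for any tag character from the table, requires
the number to end exactly at that tag, and scales the result by the tag's
multiplier; no match returns (unsigned long)-1. A sketch with a size table of
the kind perf's option parsing uses (the exact caller tables may differ;
parse_tag_value() is declared extern here and linked from the definition
above):

#include <stdio.h>

struct parse_tag {
        char tag;
        int  mult;
};

unsigned long parse_tag_value(const char *str, struct parse_tag *tags);

int main(void)
{
        struct parse_tag size_tags[] = {
                { .tag = 'B', .mult = 1 },
                { .tag = 'K', .mult = 1 << 10 },
                { .tag = 'M', .mult = 1 << 20 },
                { .tag = 'G', .mult = 1 << 30 },
                { .tag = 0 },           /* terminator */
        };

        /* "129M" -> 129 << 20 == 135266304; "129X" -> (unsigned long)-1 */
        printf("%lu\n", parse_tag_value("129M", size_tags));
        return 0;
}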
index a53535949043f32654a181c3728a421c52baf276..c8f362daba8755f5b054d70c90f52211f6d8ccc8 100644 (file)
@@ -128,6 +128,8 @@ void put_tracing_file(char *file);
 #endif
 #endif
 
+#define PERF_GTK_DSO  "libperf-gtk.so"
+
 /* General helper functions */
 extern void usage(const char *err) NORETURN;
 extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
@@ -241,6 +243,7 @@ static inline int sane_case(int x, int high)
 
 int mkdir_p(char *path, mode_t mode);
 int copyfile(const char *from, const char *to);
+int copyfile_mode(const char *from, const char *to, mode_t mode);
 
 s64 perf_atoll(const char *str);
 char **argv_split(const char *str, int *argcp);
@@ -270,6 +273,13 @@ bool is_power_of_2(unsigned long n)
        return (n != 0 && ((n & (n - 1)) == 0));
 }
 
+static inline unsigned next_pow2(unsigned x)
+{
+       if (!x)
+               return 1;
+       return 1ULL << (32 - __builtin_clz(x - 1));
+}
+
 size_t hex_width(u64 v);
 int hex2u64(const char *ptr, u64 *val);
 
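
next_pow2() rounds x up to the nearest power of two by counting the leading
zeros of x - 1. One caveat worth noting: x == 1 evaluates __builtin_clz(0),
which GCC documents as undefined, so callers presumably stick to x == 0 or
x >= 2. A few example values:

#include <stdio.h>

static inline unsigned next_pow2(unsigned x)
{
        if (!x)
                return 1;
        return 1ULL << (32 - __builtin_clz(x - 1));
}

int main(void)
{
        /* prints "1 64 64": an exact power of two maps to itself */
        printf("%u %u %u\n", next_pow2(0), next_pow2(33), next_pow2(64));
        return 0;
}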
@@ -281,4 +291,20 @@ void dump_stack(void);
 extern unsigned int page_size;
 
 void get_term_dimensions(struct winsize *ws);
+
+struct parse_tag {
+       char tag;
+       int mult;
+};
+
+unsigned long parse_tag_value(const char *str, struct parse_tag *tags);
+
+#define SRCLINE_UNKNOWN  ((char *) "??:0")
+
+struct dso;
+
+char *get_srcline(struct dso *dso, unsigned long addr);
+void free_srcline(char *srcline);
+
+int filename__read_int(const char *filename, int *value);
 #endif /* GIT_COMPAT_UTIL_H */
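
The new srcline helpers map a dso-relative address to a "file:line" string,
falling back to the shared SRCLINE_UNKNOWN ("??:0") literal; results are
released through free_srcline() rather than plain free(3) so that fallback is
never freed. A usage fragment (not standalone: struct dso and the address are
assumed to come from an already-resolved sample):

#include <stdio.h>

static void print_source_location(struct dso *dso, unsigned long addr)
{
        char *srcline = get_srcline(dso, addr);

        /* e.g. "kernel/fork.c:1402", or "??:0" without debug info */
        printf("%s\n", srcline);
        free_srcline(srcline);          /* never plain free(3) */
}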
index 0d0506d55c71348a3ddce72fcec03824ed987bf3..ee76544deecb4138c7fdd2fee2ef4b410d39e5cc 100644 (file)
@@ -59,21 +59,22 @@ QUIET_SUBDIR0  = +$(MAKE) $(COMMAND_O) -C # space to separate -C and subdir
 QUIET_SUBDIR1  =
 
 ifneq ($(findstring $(MAKEFLAGS),s),s)
-ifneq ($(V),1)
-       QUIET_CC       = @echo '   ' CC $@;
-       QUIET_AR       = @echo '   ' AR $@;
-       QUIET_LINK     = @echo '   ' LINK $@;
-       QUIET_MKDIR    = @echo '   ' MKDIR $@;
-       QUIET_GEN      = @echo '   ' GEN $@;
+  ifneq ($(V),1)
+       QUIET_CC       = @echo '  CC       '$@;
+       QUIET_AR       = @echo '  AR       '$@;
+       QUIET_LINK     = @echo '  LINK     '$@;
+       QUIET_MKDIR    = @echo '  MKDIR    '$@;
+       QUIET_GEN      = @echo '  GEN      '$@;
        QUIET_SUBDIR0  = +@subdir=
-       QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+       QUIET_SUBDIR1  = ;$(NO_SUBDIR) \
+                         echo '  SUBDIR   '$$subdir; \
                         $(MAKE) $(PRINT_DIR) -C $$subdir
-       QUIET_FLEX     = @echo '   ' FLEX $@;
-       QUIET_BISON    = @echo '   ' BISON $@;
+       QUIET_FLEX     = @echo '  FLEX     '$@;
+       QUIET_BISON    = @echo '  BISON    '$@;
 
        descend = \
-               +@echo '   ' DESCEND $(1); \
+               +@echo         '  DESCEND  '$(1); \
                mkdir -p $(OUTPUT)$(1) && \
                $(MAKE) $(COMMAND_O) subdir=$(if $(subdir),$(subdir)/$(1),$(1)) $(PRINT_DIR) -C $(1) $(2)
-endif
+  endif
 endif